dayhoff-tools 1.1.19__py3-none-any.whl → 1.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools/__init__.py +10 -0
- dayhoff_tools/cli/utility_commands.py +21 -11
- dayhoff_tools/deployment/processors.py +136 -24
- {dayhoff_tools-1.1.19.dist-info → dayhoff_tools-1.1.21.dist-info}/METADATA +1 -1
- {dayhoff_tools-1.1.19.dist-info → dayhoff_tools-1.1.21.dist-info}/RECORD +7 -7
- {dayhoff_tools-1.1.19.dist-info → dayhoff_tools-1.1.21.dist-info}/WHEEL +0 -0
- {dayhoff_tools-1.1.19.dist-info → dayhoff_tools-1.1.21.dist-info}/entry_points.txt +0 -0
dayhoff_tools/__init__.py
CHANGED
@@ -0,0 +1,10 @@
+import importlib.metadata
+
+try:
+    # The package name here should match the 'name' field in your pyproject.toml
+    __version__ = importlib.metadata.version("dayhoff-tools")
+except importlib.metadata.PackageNotFoundError:
+    # This is a fallback for when the package might not be installed (e.g., running from source
+    # without installation, or during development). You can set it to None, "unknown",
+    # or handle it as you see fit.
+    __version__ = "unknown"
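This adds runtime version detection to the package root: `importlib.metadata.version()` reads the installed distribution's metadata, so `__version__` tracks the version declared in pyproject.toml without a hard-coded string. A minimal usage sketch (nothing here beyond the stdlib and the package itself):

import dayhoff_tools

# Prints the installed version, e.g. "1.1.21"; falls back to "unknown"
# when dayhoff-tools is imported from source without being installed.
print(dayhoff_tools.__version__)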
dayhoff_tools/cli/utility_commands.py
CHANGED
@@ -437,14 +437,13 @@ def install_dependencies(
         False,
         "--install-project",
         "-p",
-        help="Install the local project package itself into the environment.",
+        help="Install the local project package itself (with 'full' extras) into the environment.",
     ),
 ):
     """Install dependencies based on pyproject.toml.

     Ensures uv.lock matches pyproject.toml and syncs the environment.
-
-    or cloning/pulling a repository.
+    When -p is used, installs the local project with its [full] optional dependencies.
     """
     # ANSI color codes
     BLUE = "\033[94m"
@@ -457,15 +456,26 @@ def install_dependencies(
         print(f"Running command: {BLUE}{' '.join(lock_cmd)}{RESET}")
         subprocess.run(lock_cmd, check=True, capture_output=True)

-
-
-
-
-
-
-
+        if install_project:
+            # Step 2a: Install the project with 'full' extras
+            print("Installing the local project with 'full' extras...")
+            # The .[full] syntax tells pip to install the current project ('.')
+            # with its 'full' optional dependencies.
+            pip_install_cmd = ["uv", "pip", "install", "-e", ".[full]"]
+            print(f"Running command: {BLUE}{' '.join(pip_install_cmd)}{RESET}")
+            subprocess.run(pip_install_cmd, check=True)

-
+            print("Project installed with 'full' extras successfully.")
+        else:
+            # Original behavior: Sync environment without installing the project
+            print(
+                "Syncing environment with lock file (project itself will not be installed)..."
+            )
+            # --all-groups ensures all non-project dependencies (like dev) are installed
+            sync_cmd = ["uv", "sync", "--all-groups", "--no-install-project"]
+            print(f"Running command: {BLUE}{' '.join(sync_cmd)}{RESET}")
+            subprocess.run(sync_cmd, check=True)
+            print("Dependencies synced successfully (project not installed).")

     except subprocess.CalledProcessError as e:
         stderr_output = e.stderr.decode() if e.stderr else "No stderr output."
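The rewritten body splits into two `uv` invocations: an editable install of the project with its `full` extra when `-p` is passed, and a lock-file sync that skips the project otherwise. A condensed sketch of the same branch outside the Typer command (assumes `uv` is on PATH and that pyproject.toml declares a `full` entry under [project.optional-dependencies]; the function name is hypothetical):

import subprocess

def install(install_project: bool) -> None:
    if install_project:
        # Editable-install the current project plus its 'full' extra.
        cmd = ["uv", "pip", "install", "-e", ".[full]"]
    else:
        # Sync every dependency group from uv.lock, but leave the
        # project package itself uninstalled.
        cmd = ["uv", "sync", "--all-groups", "--no-install-project"]
    subprocess.run(cmd, check=True)

Note that `uv pip install` resolves against pyproject.toml rather than uv.lock, which is presumably acceptable here because step 1 has just refreshed the lock file.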
dayhoff_tools/deployment/processors.py
CHANGED
@@ -1,3 +1,4 @@
+import csv
 import logging
 import os
 import shlex
@@ -6,6 +7,8 @@ import subprocess
 from abc import ABC, abstractmethod
 from pathlib import Path

+from dayhoff_tools.fasta import subset_fasta
+
 logger = logging.getLogger(__name__)

@@ -267,10 +270,12 @@ class MMSeqsProfileProcessor(Processor):
         # Define INTERMEDIATE output file paths within mmseqs_temp_dir
         intermediate_results_m8_file = mmseqs_temp_dir / "results.m8"
         intermediate_hits_fasta_file = mmseqs_temp_dir / "results.fasta"
+        intermediate_results_as_csv_file = mmseqs_temp_dir / "results_as.csv"

         # Define FINAL output file paths within run_base_dir, using target stem
-
-        final_hits_fasta_file = run_base_dir / f"{target_fasta_stem}
+        final_results_csv_file = run_base_dir / f"{target_fasta_stem}.csv"
+        final_hits_fasta_file = run_base_dir / f"{target_fasta_stem}.fasta"
+        final_hits_txt_file = run_base_dir / f"{target_fasta_stem}.txt"

         # --- MMseqs2 Workflow Paths (intermediate files in mmseqs_temp_dir) ---
         query_db = mmseqs_temp_dir / "queryDB"
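For illustration: if the target FASTA is targets.fasta (stem "targets"), the run directory now receives targets.csv (tabular hits with headers), targets.fasta (hit sequences), and targets.txt (hit IDs), where the removed lines suggest 1.1.19 surfaced the raw M8 alignment file instead.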
@@ -405,40 +410,147 @@ class MMSeqsProfileProcessor(Processor):
             run_base_dir,
         )

-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        "
-
+        # 8.5 Convert M8 to CSV with headers
+        logger.info(
+            f"Converting M8 results {intermediate_results_m8_file} to CSV {intermediate_results_as_csv_file}"
+        )
+        csv_headers = [
+            "query_id",
+            "target_id",
+            "percent_identity",
+            "alignment_length",
+            "mismatches",
+            "gap_openings",
+            "query_start",
+            "query_end",
+            "target_start",
+            "target_end",
+            "e_value",
+            "bit_score",
+        ]
+        try:
+            if intermediate_results_m8_file.exists():
+                with open(intermediate_results_m8_file, "r") as m8_in, open(
+                    intermediate_results_as_csv_file, "w", newline=""
+                ) as csv_out:
+                    writer = csv.writer(csv_out)
+                    writer.writerow(csv_headers)
+                    for line in m8_in:
+                        if line.strip():  # Ensure line is not empty
+                            columns = line.strip().split("\t")
+                            writer.writerow(columns)
+                logger.info(
+                    f"Successfully converted M8 to CSV: {intermediate_results_as_csv_file}"
+                )
+            else:
+                logger.warning(
+                    f"Intermediate M8 file {intermediate_results_m8_file} not found for CSV conversion. CSV will be empty or not created."
+                )
+                # Ensure the CSV file is touched if M8 was missing, so downstream move doesn't fail
+                intermediate_results_as_csv_file.touch()
+        except Exception as e:
+            logger.error(f"Failed to convert M8 to CSV: {e}")
+            # Touch the CSV file in case of error to prevent downstream errors if it's expected
+            if not intermediate_results_as_csv_file.exists():
+                intermediate_results_as_csv_file.touch()
+            # We might still want to raise e here, depending on desired error handling for CSV conversion failure
+
+        # 9. Extract hit sequences from M8 results using subset_fasta
+        logger.info(f"Parsing M8 results from: {intermediate_results_m8_file}")
+        hit_sequence_ids = set()
+        try:
+            if not intermediate_results_m8_file.exists():
+                logger.warning(
+                    f"M8 results file {intermediate_results_m8_file} not found. Hits FASTA will be empty."
+                )
+                intermediate_hits_fasta_file.touch()  # Create empty hits file
+            else:
+                with open(intermediate_results_m8_file, "r") as m8_file:
+                    for line in m8_file:
+                        if line.strip():  # Ensure line is not empty
+                            columns = line.strip().split("\t")
+                            if len(columns) >= 2:
+                                hit_sequence_ids.add(
+                                    columns[1]
+                                )  # Target ID is the second column
+                logger.info(
+                    f"Found {len(hit_sequence_ids)} unique target IDs in M8 results."
+                )
+
+                if not hit_sequence_ids:
+                    logger.warning(
+                        f"No target IDs found in {intermediate_results_m8_file} after parsing. Output hits FASTA will be empty."
+                    )
+                    intermediate_hits_fasta_file.touch()  # Create empty file
+                else:
+                    logger.info(
+                        f"Extracting {len(hit_sequence_ids)} hit sequences from {local_target_file} to {intermediate_hits_fasta_file} using subset_fasta."
+                    )
+                    try:
+                        subset_fasta(
+                            fasta_file=str(local_target_file),
+                            output_path=str(intermediate_hits_fasta_file),
+                            target_ids=hit_sequence_ids,
+                            exclude=False,
+                            return_written_ids=False,
+                        )
+                        logger.info(
+                            f"Successfully created hits FASTA: {intermediate_hits_fasta_file} using subset_fasta."
+                        )
+                    except FileNotFoundError as e:
+                        logger.error(
+                            f"subset_fasta FileNotFoundError: {e}. Ensuring {intermediate_hits_fasta_file} exists as empty."
+                        )
+                        if not intermediate_hits_fasta_file.exists():
+                            intermediate_hits_fasta_file.touch()
+                        raise
+                    except Exception as e:
+                        logger.error(
+                            f"subset_fasta failed to create {intermediate_hits_fasta_file}: {e}"
+                        )
+                        if not intermediate_hits_fasta_file.exists():
+                            intermediate_hits_fasta_file.touch()
+                        raise
+        except Exception as e:
+            logger.error(
+                f"Error processing M8 file {intermediate_results_m8_file} or its hits extraction: {e}"
+            )
+            if not intermediate_hits_fasta_file.exists():
+                intermediate_hits_fasta_file.touch()
+            raise
+
+        # 10. Write the set of hit sequence IDs to a .txt file
+        logger.info(
+            f"Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
         )
+        try:
+            with open(final_hits_txt_file, "w") as txt_out:
+                for seq_id in sorted(
+                    list(hit_sequence_ids)
+                ):  # Sort for consistent output
+                    txt_out.write(f"{seq_id}\n")
+            logger.info(f"Successfully wrote hit IDs to {final_hits_txt_file}")
+        except Exception as e:
+            logger.error(f"Failed to write hit IDs to {final_hits_txt_file}: {e}")
+            # The main workflow should still proceed even if this supplementary file fails

         logger.info(
             f"MMseqs2 workflow completed successfully. Intermediate outputs in {mmseqs_temp_dir}"
         )

         # Move and rename final output files from mmseqs_temp_dir to run_base_dir
-        if
+        if intermediate_results_as_csv_file.exists():
             shutil.move(
-                str(
+                str(intermediate_results_as_csv_file), str(final_results_csv_file)
+            )
+            logger.info(
+                f"Moved and renamed M8 results to CSV: {final_results_csv_file}"
             )
-            logger.info(f"Moved and renamed M8 results to {final_results_m8_file}")
         else:
             logger.warning(
-                f"Intermediate
+                f"Intermediate CSV file {intermediate_results_as_csv_file} not found. Creating empty target CSV file."
             )
-
+            final_results_csv_file.touch()  # Create empty file in run_base_dir if not found

         if intermediate_hits_fasta_file.exists():
             shutil.move(
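The three new post-processing steps share one parsing convention: each M8 row is tab-separated, with the target ID in the second column. A condensed sketch of the conversion and ID collection folded into a single pass (standalone and hypothetical; the real code keeps them as separate steps 8.5 and 9 with per-step error handling and empty-file fallbacks):

import csv
from pathlib import Path

# Mirrors csv_headers in the diff: the standard 12 BLAST-tab columns.
M8_HEADERS = [
    "query_id", "target_id", "percent_identity", "alignment_length",
    "mismatches", "gap_openings", "query_start", "query_end",
    "target_start", "target_end", "e_value", "bit_score",
]

def m8_to_csv_and_ids(m8_path: Path, csv_path: Path) -> set[str]:
    """Write a headered CSV copy of an M8 file; return the unique target IDs."""
    hit_ids: set[str] = set()
    with open(m8_path) as m8_in, open(csv_path, "w", newline="") as csv_out:
        writer = csv.writer(csv_out)
        writer.writerow(M8_HEADERS)
        for line in m8_in:
            if not line.strip():  # skip blank lines, as the diff does
                continue
            columns = line.strip().split("\t")
            writer.writerow(columns)
            if len(columns) >= 2:
                hit_ids.add(columns[1])  # target ID is the second column
    return hit_ids

The returned set is what step 9 hands to subset_fasta to pull the matching sequences out of the target FASTA, and what step 10 sorts and writes, one ID per line, to the final .txt file.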
{dayhoff_tools-1.1.19.dist-info → dayhoff_tools-1.1.21.dist-info}/RECORD
CHANGED
@@ -1,17 +1,17 @@
-dayhoff_tools/__init__.py,sha256=
+dayhoff_tools/__init__.py,sha256=M5zThPyEBRYa5CfwlzKhcqTevWn3OKu62cjV6Zqie2A,469
 dayhoff_tools/chemistry/standardizer.py,sha256=uMn7VwHnx02nc404eO6fRuS4rsl4dvSPf2ElfZDXEpY,11188
 dayhoff_tools/chemistry/utils.py,sha256=jt-7JgF-GeeVC421acX-bobKbLU_X94KNOW24p_P-_M,2257
 dayhoff_tools/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dayhoff_tools/cli/cloud_commands.py,sha256=NGux28-cjDyCADF-L1tjdEMzkCMYX8V4xNvpK6EWcZA,40802
 dayhoff_tools/cli/main.py,sha256=47EGb28ALaYFc7oAUGlY1D66AIDmc4RZiXxN-gPVrpQ,4519
 dayhoff_tools/cli/swarm_commands.py,sha256=5EyKj8yietvT5lfoz8Zx0iQvVaNgc3SJX1z2zQR6o6M,5614
-dayhoff_tools/cli/utility_commands.py,sha256=
+dayhoff_tools/cli/utility_commands.py,sha256=08MMLlQZEjJ_r7Cd6b92aYYPjH7gMvMQvCOcpoNnFVo,24352
 dayhoff_tools/deployment/base.py,sha256=8tXwsPYvRo-zV-aNhHw1c7Rji-KWg8S5xoCCznFnVVI,17412
 dayhoff_tools/deployment/deploy_aws.py,sha256=O0gQxHioSU_sNU8T8MD4wSOPvWc--V8eRRZzlRu035I,16446
 dayhoff_tools/deployment/deploy_gcp.py,sha256=DxBM4sUzwPK9RWLP9bSfr38n1HHl-TVrp4TsbdN8pUA,5795
 dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
 dayhoff_tools/deployment/job_runner.py,sha256=4tmdplpvqSE9bVxRWHo2U5kwkYrYod0Uwzpg2Q7qG5o,4850
-dayhoff_tools/deployment/processors.py,sha256=
+dayhoff_tools/deployment/processors.py,sha256=0D2r2-NgVcFfxF0QSqLZy86cxEWSn78jLTKYf60fLBQ,25519
 dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
 dayhoff_tools/embedders.py,sha256=CRgcb2z7KeeFrRQawyUZuJ4Yi0-J5jSr0hwuRhjG_FI,36513
 dayhoff_tools/fasta.py,sha256=e7xw3pInoupqCGE0-fJTOzmW_earL1M7qPyoqIPfUT4,46269
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
 dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
 dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
 dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
-dayhoff_tools-1.1.
-dayhoff_tools-1.1.
-dayhoff_tools-1.1.
-dayhoff_tools-1.1.
+dayhoff_tools-1.1.21.dist-info/METADATA,sha256=R6Q1CWHxPpXkuYs0OwJ-WSSb9pnPIS8V4xbZHKjugZE,2225
+dayhoff_tools-1.1.21.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+dayhoff_tools-1.1.21.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
+dayhoff_tools-1.1.21.dist-info/RECORD,,
{dayhoff_tools-1.1.19.dist-info → dayhoff_tools-1.1.21.dist-info}/WHEEL
File without changes

{dayhoff_tools-1.1.19.dist-info → dayhoff_tools-1.1.21.dist-info}/entry_points.txt
File without changes