dayhoff-tools 1.1.25__py3-none-any.whl → 1.1.27__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registry.
- dayhoff_tools/deployment/processors.py +67 -134
- dayhoff_tools/deployment/swarm.py +0 -5
- dayhoff_tools/embedders.py +0 -1
- {dayhoff_tools-1.1.25.dist-info → dayhoff_tools-1.1.27.dist-info}/METADATA +11 -1
- {dayhoff_tools-1.1.25.dist-info → dayhoff_tools-1.1.27.dist-info}/RECORD +7 -7
- {dayhoff_tools-1.1.25.dist-info → dayhoff_tools-1.1.27.dist-info}/WHEEL +0 -0
- {dayhoff_tools-1.1.25.dist-info → dayhoff_tools-1.1.27.dist-info}/entry_points.txt +0 -0
dayhoff_tools/deployment/processors.py
CHANGED
@@ -273,12 +273,10 @@ class MMSeqsProfileProcessor(Processor):
 
         # Define INTERMEDIATE output file paths within mmseqs_temp_dir
         intermediate_results_m8_file = mmseqs_temp_dir / "results.m8"
-        intermediate_hits_fasta_file = mmseqs_temp_dir / "results.fasta"
         intermediate_results_as_csv_file = mmseqs_temp_dir / "results_as.csv"
 
         # Define FINAL output file paths within run_base_dir, using target stem
         final_results_csv_file = run_base_dir / f"{target_fasta_stem}.csv"
-        final_hits_fasta_file = run_base_dir / f"{target_fasta_stem}.fasta"
         final_hits_txt_file = run_base_dir / f"{target_fasta_stem}.txt"
 
         # --- MMseqs2 Workflow Paths (intermediate files in mmseqs_temp_dir) ---
@@ -416,7 +414,7 @@ class MMSeqsProfileProcessor(Processor):
 
         # 8.5 Convert M8 to CSV with headers
         logger.info(
-            f"Converting M8 results {intermediate_results_m8_file}
+            f"Converting M8 results to CSV: {intermediate_results_m8_file} -> {intermediate_results_as_csv_file}"
         )
         csv_headers = [
             "query_id",
@@ -433,130 +431,81 @@ class MMSeqsProfileProcessor(Processor):
             "bit_score",
         ]
         try:
-            if intermediate_results_m8_file.exists():
-                with open(intermediate_results_m8_file, "r") as m8_in, open(
-                    intermediate_results_as_csv_file, "w", newline=""
-                ) as csv_out:
-                    writer = csv.writer(csv_out)
-                    writer.writerow(csv_headers)
-                    for line in m8_in:
-                        if line.strip():  # Ensure line is not empty
-                            columns = line.strip().split("\t")
-                            writer.writerow(columns)
-                logger.info(
-                    f"Successfully converted M8 to CSV: {intermediate_results_as_csv_file}"
-                )
-            else:
+            if not intermediate_results_m8_file.exists():
                 logger.warning(
-                    f"
+                    f"M8 results file {intermediate_results_m8_file} not found. CSV will be empty."
                 )
-                #
-
+                # Create an empty CSV with headers if M8 is missing
+                with open(
+                    intermediate_results_as_csv_file, "w", newline=""
+                ) as csvfile:
+                    writer = csv.writer(csvfile)
+                    writer.writerow(m8_columns)
+            else:
+                with open(intermediate_results_m8_file, "r") as m8file, open(
+                    intermediate_results_as_csv_file, "w", newline=""
+                ) as csvfile:
+                    writer = csv.writer(csvfile)
+                    writer.writerow(m8_columns)
+                    for line in m8file:
+                        writer.writerow(line.strip().split("\t"))
         except Exception as e:
-            logger.error(f"
-            #
+            logger.error(f"Error converting M8 to CSV: {e}", exc_info=True)
+            # Ensure an empty csv is created on error to prevent downstream issues
             if not intermediate_results_as_csv_file.exists():
-
-
+                with open(
+                    intermediate_results_as_csv_file, "w", newline=""
+                ) as csvfile:
+                    writer = csv.writer(csvfile)
+                    writer.writerow(m8_columns)  # write headers even on error
 
-        # 9. Extract hit
+        # 9. Extract hit sequence IDs from M8 results for the TXT file
+        hit_sequence_ids = set()
         logger.info(
-            f"
+            f"Extracting hit IDs from {intermediate_results_m8_file} for TXT output."
         )
-        hit_sequence_ids = set()
         try:
-            if
-                logger.warning(
-                    f"PROCESSOR: M8 results file {intermediate_results_m8_file} not found. Hits FASTA will be empty.",
-                    exc_info=True,
-                )
-                intermediate_hits_fasta_file.touch()  # Create empty hits file
-            else:
+            if intermediate_results_m8_file.exists():
                 with open(intermediate_results_m8_file, "r") as m8_file:
                     for line in m8_file:
-                        if line.strip():
+                        if line.strip():  # Check if line is not empty
                             columns = line.strip().split("\t")
                             if len(columns) >= 2:
-                                hit_sequence_ids.add(
+                                hit_sequence_ids.add(
+                                    columns[1]
+                                )  # Add target_accession
                 logger.info(
-                    f"
+                    f"Found {len(hit_sequence_ids)} unique hit IDs in M8 file."
                 )
-
-
-
-
-
-                )
-                intermediate_hits_fasta_file.touch()
-            else:
-                logger.info(f"PROCESSOR: === CALLING subset_fasta ===")
-                logger.info(
-                    f"PROCESSOR: Input FASTA for subset_fasta: {local_target_file}"
-                )
-                logger.info(
-                    f"PROCESSOR: Output FASTA for subset_fasta: {intermediate_hits_fasta_file}"
-                )
-                logger.info(
-                    f"PROCESSOR: Number of target IDs for subset_fasta: {len(hit_sequence_ids)}"
-                )
-                try:
-                    subset_fasta(
-                        fasta_file=str(local_target_file),
-                        output_path=str(intermediate_hits_fasta_file),
-                        target_ids=hit_sequence_ids,
-                        exclude=False,
-                        return_written_ids=False,
-                    )
-                    logger.info(
-                        f"PROCESSOR: === RETURNED from subset_fasta ==="
-                    )
-                    logger.info(
-                        f"PROCESSOR: Successfully created hits FASTA: {intermediate_hits_fasta_file} using subset_fasta."
-                    )
-                    # More specific error catching can be added if subset_fasta raises custom exceptions
-                except FileNotFoundError as e_fnf:
-                    logger.error(
-                        f"PROCESSOR: subset_fasta FileNotFoundError: {e_fnf}. Ensuring {intermediate_hits_fasta_file} exists as empty.",
-                        exc_info=True,
-                    )
-                    if not intermediate_hits_fasta_file.exists():
-                        intermediate_hits_fasta_file.touch()
-                    raise
-                except (
-                    Exception
-                ) as e_sub:  # Catch any other exception from subset_fasta
-                    logger.error(
-                        f"PROCESSOR: subset_fasta failed to create {intermediate_hits_fasta_file}: {e_sub}",
-                        exc_info=True,
-                    )
-                    if not intermediate_hits_fasta_file.exists():
-                        intermediate_hits_fasta_file.touch()
-                    raise
-        except Exception as e_m8_proc:
+            else:
+                logger.warning(
+                    f"Intermediate M8 file {intermediate_results_m8_file} not found. Hit TXT file will be empty."
+                )
+        except Exception as e:
             logger.error(
-                f"
+                f"Error reading M8 file {intermediate_results_m8_file} for hit ID extraction: {e}",
                 exc_info=True,
             )
-            if
-                intermediate_hits_fasta_file.touch()
-            raise
+            # Proceed even if M8 reading fails, TXT will be empty
 
-        # 10. Write the set of hit sequence IDs to
+        # 10. Write the set of hit sequence IDs to the final .txt file
        logger.info(
-            f"
+            f"Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
         )
         try:
             with open(final_hits_txt_file, "w") as txt_out:
-                for
-
-                ):  # Sort for consistent output
+                # Sort IDs for consistent output
+                for seq_id in sorted(list(hit_sequence_ids)):
                     txt_out.write(f"{seq_id}\n")
-            logger.info(
-                f"PROCESSOR: Successfully wrote hit IDs to {final_hits_txt_file}"
-            )
+            logger.info(f"Successfully wrote hit IDs to {final_hits_txt_file}")
         except Exception as e:
-            logger.error(
-
+            logger.error(
+                f"Failed to write hit IDs to {final_hits_txt_file}: {e}",
+                exc_info=True,
+            )
+            # Ensure the file exists even if writing fails
+            if not final_hits_txt_file.exists():
+                final_hits_txt_file.touch()
 
         logger.info(
             f"PROCESSOR: MMseqs2 workflow and FASTA/TXT generation completed successfully. Intermediate outputs in {mmseqs_temp_dir}"
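Read as a whole, the new code path is much simpler than the one it replaces: write a headered CSV (header-only if the M8 file is missing), then collect unique target accessions for the TXT output; the FASTA subsetting step is gone entirely. A minimal standalone sketch of that logic follows. The file paths here are hypothetical stand-ins, and the column names beyond "query_id" and "bit_score" are assumed to be the standard twelve BLAST-tab (M8) fields; note also that the diff's added lines reference `m8_columns` while the unchanged context builds `csv_headers`, so a single name is used below.

    import csv
    from pathlib import Path

    # Hypothetical stand-ins for the processor's instance paths.
    m8_file = Path("results.m8")
    csv_file = Path("results_as.csv")
    txt_file = Path("target.txt")

    # Assumed header list: the standard twelve BLAST-tab (M8) columns.
    m8_columns = [
        "query_id", "target_id", "percent_identity", "alignment_length",
        "mismatches", "gap_openings", "query_start", "query_end",
        "target_start", "target_end", "e_value", "bit_score",
    ]

    # Convert the tab-separated, headerless M8 file to a CSV with headers;
    # if the M8 file is missing, only the header row is written.
    with open(csv_file, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(m8_columns)
        if m8_file.exists():
            with open(m8_file) as m8:
                for line in m8:
                    if line.strip():
                        writer.writerow(line.strip().split("\t"))

    # Collect unique target accessions (second M8 column) and write them
    # sorted, one per line, to the final .txt output.
    hit_ids = set()
    if m8_file.exists():
        with open(m8_file) as m8:
            for line in m8:
                columns = line.strip().split("\t")
                if len(columns) >= 2:
                    hit_ids.add(columns[1])
    with open(txt_file, "w") as txt:
        for seq_id in sorted(hit_ids):
            txt.write(f"{seq_id}\n")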
@@ -576,41 +525,25 @@ class MMSeqsProfileProcessor(Processor):
                 )
                 final_results_csv_file.touch()  # Create empty file in run_base_dir if not found
 
-
-
-
-                )
-                logger.info(f"Moved and renamed hits FASTA to {final_hits_fasta_file}")
-            else:
-                logger.warning(
-                    f"Intermediate hits FASTA {intermediate_hits_fasta_file} not found. Creating empty target file."
-                )
-                final_hits_fasta_file.touch()  # Create empty file in run_base_dir if not found
+            logger.info(
+                f"MMSeqsProfileProcessor run completed for {input_file}. Output CSV: {final_results_csv_file}"
+            )
 
+        except Exception as e:
+            logger.error(
+                f"MMSeqsProfileProcessor failed for {input_file}: {e}", exc_info=True
+            )
+            raise
         finally:
-            #
+            # --- Cleanup --- #
+            logger.info(f"Cleaning up temporary directory: {mmseqs_temp_dir}")
             if mmseqs_temp_dir.exists():
                 shutil.rmtree(mmseqs_temp_dir)
+            if local_target_file.exists() and local_target_file != Path(input_file):
                 logger.info(
-                    f"
+                    f"Cleaning up local copy of target file: {local_target_file}"
                 )
+                local_target_file.unlink()
+            logger.info("MMSeqsProfileProcessor cleanup finished.")
 
-
-            # so it does not get uploaded with the results.
-            if local_target_file.exists():
-                try:
-                    local_target_file.unlink()
-                    logger.info(
-                        f"Cleaned up copied input file from run directory: {local_target_file}"
-                    )
-                except OSError as e:
-                    logger.error(
-                        f"Error deleting copied input file {local_target_file}: {e}"
-                    )
-
-        # The run_base_dir (containing only the final, meaningfully named output files)
-        # will be cleaned up by the Operator after its contents are uploaded.
-
-        return str(
-            run_base_dir
-        )  # Return the path to the directory containing meaningfully named results
+        return str(run_base_dir)  # Return the path to the directory containing outputs
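The structural change in this hunk is easy to miss among the deletions: the whole workflow body now sits in a try/except/finally, with cleanup guaranteed and failures re-raised after logging. Condensed from the added lines, the new control flow reads:

    try:
        # ... MMseqs2 workflow, CSV and TXT generation ...
        logger.info(
            f"MMSeqsProfileProcessor run completed for {input_file}. Output CSV: {final_results_csv_file}"
        )
    except Exception as e:
        # Log with traceback, then propagate so the caller sees the failure.
        logger.error(f"MMSeqsProfileProcessor failed for {input_file}: {e}", exc_info=True)
        raise
    finally:
        # --- Cleanup --- #
        if mmseqs_temp_dir.exists():
            shutil.rmtree(mmseqs_temp_dir)
        # Remove the local copy of the target file, but never the original input.
        if local_target_file.exists() and local_target_file != Path(input_file):
            local_target_file.unlink()
    return str(run_base_dir)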
dayhoff_tools/deployment/swarm.py
CHANGED
@@ -439,11 +439,6 @@ class Operator:
         - For AWS spot instances, uses IMDSv2 to check instance-action metadata
         - For GCP preemptible VMs, checks both maintenance-event and preempted metadata
         """
-        logger.info(
-            "DEBUG: _check_for_termination has been temporarily disabled for testing."
-        )
-        return  # DEBUG: Temporarily disable to test if this is causing premature shutdown
-
         while not _shutdown_requested.is_set():
             try:
                 # Check AWS spot termination using IMDSv2 (token-based auth)
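This hunk simply removes the debug early-return, re-enabling the termination-watch loop. For context, a minimal probe of the kind the docstring describes might look like the sketch below. The metadata URLs and headers are the standard AWS IMDSv2 and GCP endpoints; the function name and exact logic are illustrative, and the actual swarm.py implementation is not shown in this diff.

    import requests

    def spot_termination_imminent(timeout: float = 2.0) -> bool:
        """Best-effort check of AWS and GCP metadata for pending termination."""
        # AWS: IMDSv2 requires fetching a session token before reading metadata.
        try:
            token = requests.put(
                "http://169.254.169.254/latest/api/token",
                headers={"X-aws-ec2-metadata-token-ttl-seconds": "60"},
                timeout=timeout,
            ).text
            resp = requests.get(
                "http://169.254.169.254/latest/meta-data/spot/instance-action",
                headers={"X-aws-ec2-metadata-token": token},
                timeout=timeout,
            )
            if resp.status_code == 200:  # present only when termination is scheduled
                return True
        except requests.RequestException:
            pass  # not on AWS, or metadata service unreachable
        # GCP: the preempted flag flips to TRUE when the VM is being preempted.
        try:
            resp = requests.get(
                "http://metadata.google.internal/computeMetadata/v1/instance/preempted",
                headers={"Metadata-Flavor": "Google"},
                timeout=timeout,
            )
            if resp.status_code == 200 and resp.text.strip() == "TRUE":
                return True
        except requests.RequestException:
            pass  # not on GCP, or metadata service unreachable
        return False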
dayhoff_tools/embedders.py
CHANGED
{dayhoff_tools-1.1.25.dist-info → dayhoff_tools-1.1.27.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: dayhoff-tools
-Version: 1.1.25
+Version: 1.1.27
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com
@@ -10,22 +10,32 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Provides-Extra: embedders
 Provides-Extra: full
 Requires-Dist: biopython (>=1.84) ; extra == "full"
+Requires-Dist: biopython (>=1.85) ; extra == "embedders"
 Requires-Dist: boto3 (>=1.36.8) ; extra == "full"
 Requires-Dist: docker (>=7.1.0) ; extra == "full"
+Requires-Dist: fair-esm (>=2.0.0) ; extra == "embedders"
 Requires-Dist: fair-esm (>=2.0.0) ; extra == "full"
 Requires-Dist: firebase-admin (>=6.5.0)
 Requires-Dist: h5py (>=3.11.0) ; extra == "full"
+Requires-Dist: h5py (>=3.13.0) ; extra == "embedders"
+Requires-Dist: numpy (>=1.26.4) ; extra == "embedders"
+Requires-Dist: pandas (>=2.2.3) ; extra == "embedders"
 Requires-Dist: pandas (>=2.2.3) ; extra == "full"
 Requires-Dist: pyyaml (>=6.0)
 Requires-Dist: questionary (>=2.0.1)
 Requires-Dist: rdkit-pypi (>=2022.9.5) ; extra == "full"
 Requires-Dist: requests (>=2.31.0)
+Requires-Dist: sentencepiece (>=0.2.0) ; extra == "embedders"
 Requires-Dist: sentencepiece (>=0.2.0) ; extra == "full"
 Requires-Dist: sqlalchemy (>=2.0.40,<3.0.0) ; extra == "full"
 Requires-Dist: toml (>=0.10)
+Requires-Dist: torch (>=2.4.0) ; extra == "embedders"
+Requires-Dist: tqdm (>=4.67.1) ; extra == "embedders"
 Requires-Dist: transformers (==4.36.2) ; extra == "full"
+Requires-Dist: transformers (>=4.36.2) ; extra == "embedders"
 Requires-Dist: typer (>=0.9.0)
 Description-Content-Type: text/markdown
 
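The practical effect of the new `embedders` extra is that the embedding stack (torch, transformers, fair-esm, h5py, sentencepiece, numpy, pandas, tqdm) can be installed without pulling in the rest of the heavier `full` extra:

    pip install "dayhoff-tools[embedders]"

Note that `embedders` also loosens the transformers requirement from the exact pin ==4.36.2 (still used by `full`) to >=4.36.2.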
{dayhoff_tools-1.1.25.dist-info → dayhoff_tools-1.1.27.dist-info}/RECORD
CHANGED
@@ -11,9 +11,9 @@ dayhoff_tools/deployment/deploy_aws.py,sha256=O0gQxHioSU_sNU8T8MD4wSOPvWc--V8eRR
 dayhoff_tools/deployment/deploy_gcp.py,sha256=DxBM4sUzwPK9RWLP9bSfr38n1HHl-TVrp4TsbdN8pUA,5795
 dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
 dayhoff_tools/deployment/job_runner.py,sha256=4tmdplpvqSE9bVxRWHo2U5kwkYrYod0Uwzpg2Q7qG5o,4850
-dayhoff_tools/deployment/processors.py,sha256=
-dayhoff_tools/deployment/swarm.py,sha256=
-dayhoff_tools/embedders.py,sha256=
+dayhoff_tools/deployment/processors.py,sha256=Ao4hU4rEXlyvLq-8wt8syqnfB05N7fIKYtjjdKCmx3g,22695
+dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
+dayhoff_tools/embedders.py,sha256=svP_ksm3FdyVZ8i8R9R5uoGu2qI_hVQ_eztG0drXkN8,36477
 dayhoff_tools/fasta.py,sha256=HJ25D_u5F-tU6fZMkJfIhvqMSmnR32JK1QdCPXoHJ5g,49785
 dayhoff_tools/file_ops.py,sha256=JlGowvr-CUJFidV-4g_JmhUTN9bsYuaxtqKmnKomm-Q,8506
 dayhoff_tools/h5.py,sha256=j1nxxaiHsMidVX_XwB33P1Pz9d7K8ZKiDZwJWQUUQSY,21158
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
 dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
 dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
 dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
-dayhoff_tools-1.1.25.dist-info/METADATA,sha256=
-dayhoff_tools-1.1.25.dist-info/WHEEL,sha256=
-dayhoff_tools-1.1.25.dist-info/entry_points.txt,sha256=
-dayhoff_tools-1.1.25.dist-info/RECORD,,
+dayhoff_tools-1.1.27.dist-info/METADATA,sha256=b49Gayl-plG0GhoElOAHZm_jAG2qWHNEdkcRbMfEJTM,2761
+dayhoff_tools-1.1.27.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+dayhoff_tools-1.1.27.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
+dayhoff_tools-1.1.27.dist-info/RECORD,,
{dayhoff_tools-1.1.25.dist-info → dayhoff_tools-1.1.27.dist-info}/WHEEL
File without changes
{dayhoff_tools-1.1.25.dist-info → dayhoff_tools-1.1.27.dist-info}/entry_points.txt
File without changes