dayhoff-tools 1.1.25__py3-none-any.whl → 1.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -273,12 +273,10 @@ class MMSeqsProfileProcessor(Processor):
273
273
 
274
274
  # Define INTERMEDIATE output file paths within mmseqs_temp_dir
275
275
  intermediate_results_m8_file = mmseqs_temp_dir / "results.m8"
276
- intermediate_hits_fasta_file = mmseqs_temp_dir / "results.fasta"
277
276
  intermediate_results_as_csv_file = mmseqs_temp_dir / "results_as.csv"
278
277
 
279
278
  # Define FINAL output file paths within run_base_dir, using target stem
280
279
  final_results_csv_file = run_base_dir / f"{target_fasta_stem}.csv"
281
- final_hits_fasta_file = run_base_dir / f"{target_fasta_stem}.fasta"
282
280
  final_hits_txt_file = run_base_dir / f"{target_fasta_stem}.txt"
283
281
 
284
282
  # --- MMseqs2 Workflow Paths (intermediate files in mmseqs_temp_dir) ---
@@ -416,7 +414,7 @@ class MMSeqsProfileProcessor(Processor):
416
414
 
417
415
  # 8.5 Convert M8 to CSV with headers
418
416
  logger.info(
419
- f"Converting M8 results {intermediate_results_m8_file} to CSV {intermediate_results_as_csv_file}"
417
+ f"Converting M8 results to CSV: {intermediate_results_m8_file} -> {intermediate_results_as_csv_file}"
420
418
  )
421
419
  csv_headers = [
422
420
  "query_id",
@@ -433,130 +431,81 @@ class MMSeqsProfileProcessor(Processor):
433
431
  "bit_score",
434
432
  ]
435
433
  try:
436
- if intermediate_results_m8_file.exists():
437
- with open(intermediate_results_m8_file, "r") as m8_in, open(
438
- intermediate_results_as_csv_file, "w", newline=""
439
- ) as csv_out:
440
- writer = csv.writer(csv_out)
441
- writer.writerow(csv_headers)
442
- for line in m8_in:
443
- if line.strip(): # Ensure line is not empty
444
- columns = line.strip().split("\t")
445
- writer.writerow(columns)
446
- logger.info(
447
- f"Successfully converted M8 to CSV: {intermediate_results_as_csv_file}"
448
- )
449
- else:
434
+ if not intermediate_results_m8_file.exists():
450
435
  logger.warning(
451
- f"Intermediate M8 file {intermediate_results_m8_file} not found for CSV conversion. CSV will be empty or not created."
436
+ f"M8 results file {intermediate_results_m8_file} not found. CSV will be empty."
452
437
  )
453
- # Ensure the CSV file is touched if M8 was missing, so downstream move doesn't fail
454
- intermediate_results_as_csv_file.touch()
438
+ # Create an empty CSV with headers if M8 is missing
439
+ with open(
440
+ intermediate_results_as_csv_file, "w", newline=""
441
+ ) as csvfile:
442
+ writer = csv.writer(csvfile)
443
+ writer.writerow(m8_columns)
444
+ else:
445
+ with open(intermediate_results_m8_file, "r") as m8file, open(
446
+ intermediate_results_as_csv_file, "w", newline=""
447
+ ) as csvfile:
448
+ writer = csv.writer(csvfile)
449
+ writer.writerow(m8_columns)
450
+ for line in m8file:
451
+ writer.writerow(line.strip().split("\t"))
455
452
  except Exception as e:
456
- logger.error(f"Failed to convert M8 to CSV: {e}")
457
- # Touch the CSV file in case of error to prevent downstream errors if it's expected
453
+ logger.error(f"Error converting M8 to CSV: {e}", exc_info=True)
454
+ # Ensure an empty csv is created on error to prevent downstream issues
458
455
  if not intermediate_results_as_csv_file.exists():
459
- intermediate_results_as_csv_file.touch()
460
- # We might still want to raise e here, depending on desired error handling for CSV conversion failure
456
+ with open(
457
+ intermediate_results_as_csv_file, "w", newline=""
458
+ ) as csvfile:
459
+ writer = csv.writer(csvfile)
460
+ writer.writerow(m8_columns) # write headers even on error
461
461
 
462
- # 9. Extract hit sequences from M8 results using subset_fasta
462
+ # 9. Extract hit sequence IDs from M8 results for the TXT file
463
+ hit_sequence_ids = set()
463
464
  logger.info(
464
- f"PROCESSOR: Parsing M8 results from: {intermediate_results_m8_file}"
465
+ f"Extracting hit IDs from {intermediate_results_m8_file} for TXT output."
465
466
  )
466
- hit_sequence_ids = set()
467
467
  try:
468
- if not intermediate_results_m8_file.exists():
469
- logger.warning(
470
- f"PROCESSOR: M8 results file {intermediate_results_m8_file} not found. Hits FASTA will be empty.",
471
- exc_info=True,
472
- )
473
- intermediate_hits_fasta_file.touch() # Create empty hits file
474
- else:
468
+ if intermediate_results_m8_file.exists():
475
469
  with open(intermediate_results_m8_file, "r") as m8_file:
476
470
  for line in m8_file:
477
- if line.strip():
471
+ if line.strip(): # Check if line is not empty
478
472
  columns = line.strip().split("\t")
479
473
  if len(columns) >= 2:
480
- hit_sequence_ids.add(columns[1])
474
+ hit_sequence_ids.add(
475
+ columns[1]
476
+ ) # Add target_accession
481
477
  logger.info(
482
- f"PROCESSOR: Found {len(hit_sequence_ids)} unique target IDs in M8 results."
478
+ f"Found {len(hit_sequence_ids)} unique hit IDs in M8 file."
483
479
  )
484
-
485
- if not hit_sequence_ids:
486
- logger.warning(
487
- f"PROCESSOR: No target IDs found in {intermediate_results_m8_file} after parsing. Output hits FASTA will be empty.",
488
- exc_info=True,
489
- )
490
- intermediate_hits_fasta_file.touch()
491
- else:
492
- logger.info(f"PROCESSOR: === CALLING subset_fasta ===")
493
- logger.info(
494
- f"PROCESSOR: Input FASTA for subset_fasta: {local_target_file}"
495
- )
496
- logger.info(
497
- f"PROCESSOR: Output FASTA for subset_fasta: {intermediate_hits_fasta_file}"
498
- )
499
- logger.info(
500
- f"PROCESSOR: Number of target IDs for subset_fasta: {len(hit_sequence_ids)}"
501
- )
502
- try:
503
- subset_fasta(
504
- fasta_file=str(local_target_file),
505
- output_path=str(intermediate_hits_fasta_file),
506
- target_ids=hit_sequence_ids,
507
- exclude=False,
508
- return_written_ids=False,
509
- )
510
- logger.info(
511
- f"PROCESSOR: === RETURNED from subset_fasta ==="
512
- )
513
- logger.info(
514
- f"PROCESSOR: Successfully created hits FASTA: {intermediate_hits_fasta_file} using subset_fasta."
515
- )
516
- # More specific error catching can be added if subset_fasta raises custom exceptions
517
- except FileNotFoundError as e_fnf:
518
- logger.error(
519
- f"PROCESSOR: subset_fasta FileNotFoundError: {e_fnf}. Ensuring {intermediate_hits_fasta_file} exists as empty.",
520
- exc_info=True,
521
- )
522
- if not intermediate_hits_fasta_file.exists():
523
- intermediate_hits_fasta_file.touch()
524
- raise
525
- except (
526
- Exception
527
- ) as e_sub: # Catch any other exception from subset_fasta
528
- logger.error(
529
- f"PROCESSOR: subset_fasta failed to create {intermediate_hits_fasta_file}: {e_sub}",
530
- exc_info=True,
531
- )
532
- if not intermediate_hits_fasta_file.exists():
533
- intermediate_hits_fasta_file.touch()
534
- raise
535
- except Exception as e_m8_proc:
480
+ else:
481
+ logger.warning(
482
+ f"Intermediate M8 file {intermediate_results_m8_file} not found. Hit TXT file will be empty."
483
+ )
484
+ except Exception as e:
536
485
  logger.error(
537
- f"PROCESSOR: Error processing M8 file {intermediate_results_m8_file} or its hits extraction: {e_m8_proc}",
486
+ f"Error reading M8 file {intermediate_results_m8_file} for hit ID extraction: {e}",
538
487
  exc_info=True,
539
488
  )
540
- if not intermediate_hits_fasta_file.exists():
541
- intermediate_hits_fasta_file.touch()
542
- raise
489
+ # Proceed even if M8 reading fails, TXT will be empty
543
490
 
544
- # 10. Write the set of hit sequence IDs to a .txt file
491
+ # 10. Write the set of hit sequence IDs to the final .txt file
545
492
  logger.info(
546
- f"PROCESSOR: Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
493
+ f"Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
547
494
  )
548
495
  try:
549
496
  with open(final_hits_txt_file, "w") as txt_out:
550
- for seq_id in sorted(
551
- list(hit_sequence_ids)
552
- ): # Sort for consistent output
497
+ # Sort IDs for consistent output
498
+ for seq_id in sorted(list(hit_sequence_ids)):
553
499
  txt_out.write(f"{seq_id}\n")
554
- logger.info(
555
- f"PROCESSOR: Successfully wrote hit IDs to {final_hits_txt_file}"
556
- )
500
+ logger.info(f"Successfully wrote hit IDs to {final_hits_txt_file}")
557
501
  except Exception as e:
558
- logger.error(f"Failed to write hit IDs to {final_hits_txt_file}: {e}")
559
- # The main workflow should still proceed even if this supplementary file fails
502
+ logger.error(
503
+ f"Failed to write hit IDs to {final_hits_txt_file}: {e}",
504
+ exc_info=True,
505
+ )
506
+ # Ensure the file exists even if writing fails
507
+ if not final_hits_txt_file.exists():
508
+ final_hits_txt_file.touch()
560
509
 
561
510
  logger.info(
562
511
  f"PROCESSOR: MMseqs2 workflow and FASTA/TXT generation completed successfully. Intermediate outputs in {mmseqs_temp_dir}"
@@ -576,41 +525,25 @@ class MMSeqsProfileProcessor(Processor):
576
525
  )
577
526
  final_results_csv_file.touch() # Create empty file in run_base_dir if not found
578
527
 
579
- if intermediate_hits_fasta_file.exists():
580
- shutil.move(
581
- str(intermediate_hits_fasta_file), str(final_hits_fasta_file)
582
- )
583
- logger.info(f"Moved and renamed hits FASTA to {final_hits_fasta_file}")
584
- else:
585
- logger.warning(
586
- f"Intermediate hits FASTA {intermediate_hits_fasta_file} not found. Creating empty target file."
587
- )
588
- final_hits_fasta_file.touch() # Create empty file in run_base_dir if not found
528
+ logger.info(
529
+ f"MMSeqsProfileProcessor run completed for {input_file}. Output CSV: {final_results_csv_file}"
530
+ )
589
531
 
532
+ except Exception as e:
533
+ logger.error(
534
+ f"MMSeqsProfileProcessor failed for {input_file}: {e}", exc_info=True
535
+ )
536
+ raise
590
537
  finally:
591
- # Clean up the MMseqs2 temporary directory (mmseqs_tmp) which contains intermediate DBs etc.
538
+ # --- Cleanup --- #
539
+ logger.info(f"Cleaning up temporary directory: {mmseqs_temp_dir}")
592
540
  if mmseqs_temp_dir.exists():
593
541
  shutil.rmtree(mmseqs_temp_dir)
542
+ if local_target_file.exists() and local_target_file != Path(input_file):
594
543
  logger.info(
595
- f"Cleaned up MMseqs2 temporary directory: {mmseqs_temp_dir}"
544
+ f"Cleaning up local copy of target file: {local_target_file}"
596
545
  )
546
+ local_target_file.unlink()
547
+ logger.info("MMSeqsProfileProcessor cleanup finished.")
597
548
 
598
- # Clean up the copied input file (local_target_file) from the run_base_dir
599
- # so it does not get uploaded with the results.
600
- if local_target_file.exists():
601
- try:
602
- local_target_file.unlink()
603
- logger.info(
604
- f"Cleaned up copied input file from run directory: {local_target_file}"
605
- )
606
- except OSError as e:
607
- logger.error(
608
- f"Error deleting copied input file {local_target_file}: {e}"
609
- )
610
-
611
- # The run_base_dir (containing only the final, meaningfully named output files)
612
- # will be cleaned up by the Operator after its contents are uploaded.
613
-
614
- return str(
615
- run_base_dir
616
- ) # Return the path to the directory containing meaningfully named results
549
+ return str(run_base_dir) # Return the path to the directory containing outputs
@@ -439,11 +439,6 @@ class Operator:
439
439
  - For AWS spot instances, uses IMDSv2 to check instance-action metadata
440
440
  - For GCP preemptible VMs, checks both maintenance-event and preempted metadata
441
441
  """
442
- logger.info(
443
- "DEBUG: _check_for_termination has been temporarily disabled for testing."
444
- )
445
- return # DEBUG: Temporarily disable to test if this is causing premature shutdown
446
-
447
442
  while not _shutdown_requested.is_set():
448
443
  try:
449
444
  # Check AWS spot termination using IMDSv2 (token-based auth)
@@ -1,7 +1,6 @@
1
1
  import logging
2
2
  import os
3
3
  import time
4
- from abc import ABC, abstractmethod
5
4
  from typing import Dict, List, Literal, Optional, Tuple, cast
6
5
 
7
6
  import h5py
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dayhoff-tools
3
- Version: 1.1.25
3
+ Version: 1.1.27
4
4
  Summary: Common tools for all the repos at Dayhoff Labs
5
5
  Author: Daniel Martin-Alarcon
6
6
  Author-email: dma@dayhofflabs.com
@@ -10,22 +10,32 @@ Classifier: Programming Language :: Python :: 3.10
10
10
  Classifier: Programming Language :: Python :: 3.11
11
11
  Classifier: Programming Language :: Python :: 3.12
12
12
  Classifier: Programming Language :: Python :: 3.13
13
+ Provides-Extra: embedders
13
14
  Provides-Extra: full
14
15
  Requires-Dist: biopython (>=1.84) ; extra == "full"
16
+ Requires-Dist: biopython (>=1.85) ; extra == "embedders"
15
17
  Requires-Dist: boto3 (>=1.36.8) ; extra == "full"
16
18
  Requires-Dist: docker (>=7.1.0) ; extra == "full"
19
+ Requires-Dist: fair-esm (>=2.0.0) ; extra == "embedders"
17
20
  Requires-Dist: fair-esm (>=2.0.0) ; extra == "full"
18
21
  Requires-Dist: firebase-admin (>=6.5.0)
19
22
  Requires-Dist: h5py (>=3.11.0) ; extra == "full"
23
+ Requires-Dist: h5py (>=3.13.0) ; extra == "embedders"
24
+ Requires-Dist: numpy (>=1.26.4) ; extra == "embedders"
25
+ Requires-Dist: pandas (>=2.2.3) ; extra == "embedders"
20
26
  Requires-Dist: pandas (>=2.2.3) ; extra == "full"
21
27
  Requires-Dist: pyyaml (>=6.0)
22
28
  Requires-Dist: questionary (>=2.0.1)
23
29
  Requires-Dist: rdkit-pypi (>=2022.9.5) ; extra == "full"
24
30
  Requires-Dist: requests (>=2.31.0)
31
+ Requires-Dist: sentencepiece (>=0.2.0) ; extra == "embedders"
25
32
  Requires-Dist: sentencepiece (>=0.2.0) ; extra == "full"
26
33
  Requires-Dist: sqlalchemy (>=2.0.40,<3.0.0) ; extra == "full"
27
34
  Requires-Dist: toml (>=0.10)
35
+ Requires-Dist: torch (>=2.4.0) ; extra == "embedders"
36
+ Requires-Dist: tqdm (>=4.67.1) ; extra == "embedders"
28
37
  Requires-Dist: transformers (==4.36.2) ; extra == "full"
38
+ Requires-Dist: transformers (>=4.36.2) ; extra == "embedders"
29
39
  Requires-Dist: typer (>=0.9.0)
30
40
  Description-Content-Type: text/markdown
31
41
 
@@ -11,9 +11,9 @@ dayhoff_tools/deployment/deploy_aws.py,sha256=O0gQxHioSU_sNU8T8MD4wSOPvWc--V8eRR
11
11
  dayhoff_tools/deployment/deploy_gcp.py,sha256=DxBM4sUzwPK9RWLP9bSfr38n1HHl-TVrp4TsbdN8pUA,5795
12
12
  dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
13
13
  dayhoff_tools/deployment/job_runner.py,sha256=4tmdplpvqSE9bVxRWHo2U5kwkYrYod0Uwzpg2Q7qG5o,4850
14
- dayhoff_tools/deployment/processors.py,sha256=kdXbS354DUKCN-kkj2sbt6T06cNG8Hphjj9e_nyCt1g,26676
15
- dayhoff_tools/deployment/swarm.py,sha256=Xoe-lLQYDT3FwCrPzImgpbHdWRmsK6WERh1IMMNWb2c,21898
16
- dayhoff_tools/embedders.py,sha256=CRgcb2z7KeeFrRQawyUZuJ4Yi0-J5jSr0hwuRhjG_FI,36513
14
+ dayhoff_tools/deployment/processors.py,sha256=Ao4hU4rEXlyvLq-8wt8syqnfB05N7fIKYtjjdKCmx3g,22695
15
+ dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
16
+ dayhoff_tools/embedders.py,sha256=svP_ksm3FdyVZ8i8R9R5uoGu2qI_hVQ_eztG0drXkN8,36477
17
17
  dayhoff_tools/fasta.py,sha256=HJ25D_u5F-tU6fZMkJfIhvqMSmnR32JK1QdCPXoHJ5g,49785
18
18
  dayhoff_tools/file_ops.py,sha256=JlGowvr-CUJFidV-4g_JmhUTN9bsYuaxtqKmnKomm-Q,8506
19
19
  dayhoff_tools/h5.py,sha256=j1nxxaiHsMidVX_XwB33P1Pz9d7K8ZKiDZwJWQUUQSY,21158
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
26
26
  dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
27
27
  dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
28
28
  dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
29
- dayhoff_tools-1.1.25.dist-info/METADATA,sha256=u9c1AI8g1qTnwBU3rPpoZm9BCk1AMu3AZry03FaeduI,2225
30
- dayhoff_tools-1.1.25.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
31
- dayhoff_tools-1.1.25.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
32
- dayhoff_tools-1.1.25.dist-info/RECORD,,
29
+ dayhoff_tools-1.1.27.dist-info/METADATA,sha256=b49Gayl-plG0GhoElOAHZm_jAG2qWHNEdkcRbMfEJTM,2761
30
+ dayhoff_tools-1.1.27.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
31
+ dayhoff_tools-1.1.27.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
32
+ dayhoff_tools-1.1.27.dist-info/RECORD,,