dayhoff-tools 1.1.22__tar.gz → 1.1.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/PKG-INFO +1 -1
  2. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/cli/utility_commands.py +1 -1
  3. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/deployment/processors.py +40 -20
  4. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/fasta.py +128 -56
  5. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/pyproject.toml +1 -1
  6. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/README.md +0 -0
  7. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/__init__.py +0 -0
  8. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/chemistry/standardizer.py +0 -0
  9. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/chemistry/utils.py +0 -0
  10. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/cli/__init__.py +0 -0
  11. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/cli/cloud_commands.py +0 -0
  12. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/cli/main.py +0 -0
  13. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/cli/swarm_commands.py +0 -0
  14. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/deployment/base.py +0 -0
  15. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/deployment/deploy_aws.py +0 -0
  16. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
  17. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/deployment/deploy_utils.py +0 -0
  18. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/deployment/job_runner.py +0 -0
  19. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/deployment/swarm.py +0 -0
  20. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/embedders.py +0 -0
  21. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/file_ops.py +0 -0
  22. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/h5.py +0 -0
  23. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/intake/gcp.py +0 -0
  24. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/intake/gtdb.py +0 -0
  25. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/intake/kegg.py +0 -0
  26. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/intake/mmseqs.py +0 -0
  27. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/intake/structure.py +0 -0
  28. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/intake/uniprot.py +0 -0
  29. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/logs.py +0 -0
  30. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/sqlite.py +0 -0
  31. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/warehouse.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: dayhoff-tools
- Version: 1.1.22
+ Version: 1.1.24
  Summary: Common tools for all the repos at Dayhoff Labs
  Author: Daniel Martin-Alarcon
  Author-email: dma@dayhofflabs.com
dayhoff_tools/cli/utility_commands.py
@@ -577,7 +577,7 @@ def update_dependencies(
  "dayhoff-tools"
  ) # Use the actual package name
  pattern = re.compile(
- rf"^(\\s*['\"])({package_name_re})(\\[[^\\]]+\\])?(?:[^'\"[\,\\s]*)?(['\"].*)$",
+ rf"^(\\s*['\"])({package_name_re})(\\[[^\]]+\\])?(?:[^'\"[\,\\s]*)?(['\"].*)$",
  re.MULTILINE,
  )
  # package_name variable is still 'dayhoff-tools'
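For orientation, a minimal, hypothetical sketch of how a pattern of this shape rewrites a pinned dependency line in pyproject.toml; the simplified character classes, the sample line, and the ">=1.1.24" pin are illustrative, not taken from the package source.

import re

# Hypothetical, simplified cousin of the pattern above: capture the opening
# quote, the package name, optional extras, and the closing quote so the
# version constraint in between can be rewritten.
package_name_re = re.escape("dayhoff-tools")
pattern = re.compile(
    rf"^(\s*['\"])({package_name_re})(\[[^\]]+\])?[^'\"]*(['\"].*)$",
    re.MULTILINE,
)

line = '    "dayhoff-tools[full]>=1.1.22",'
updated = pattern.sub(r"\g<1>\g<2>\g<3>>=1.1.24\g<4>", line)
print(updated)  # ->     "dayhoff-tools[full]>=1.1.24",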
dayhoff_tools/deployment/processors.py
@@ -460,35 +460,44 @@ class MMSeqsProfileProcessor(Processor):
  # We might still want to raise e here, depending on desired error handling for CSV conversion failure

  # 9. Extract hit sequences from M8 results using subset_fasta
- logger.info(f"Parsing M8 results from: {intermediate_results_m8_file}")
+ logger.info(
+ f"PROCESSOR: Parsing M8 results from: {intermediate_results_m8_file}"
+ )
  hit_sequence_ids = set()
  try:
  if not intermediate_results_m8_file.exists():
  logger.warning(
- f"M8 results file {intermediate_results_m8_file} not found. Hits FASTA will be empty."
+ f"PROCESSOR: M8 results file {intermediate_results_m8_file} not found. Hits FASTA will be empty.",
+ exc_info=True,
  )
  intermediate_hits_fasta_file.touch() # Create empty hits file
  else:
  with open(intermediate_results_m8_file, "r") as m8_file:
  for line in m8_file:
- if line.strip(): # Ensure line is not empty
+ if line.strip():
  columns = line.strip().split("\t")
  if len(columns) >= 2:
- hit_sequence_ids.add(
- columns[1]
- ) # Target ID is the second column
+ hit_sequence_ids.add(columns[1])
  logger.info(
- f"Found {len(hit_sequence_ids)} unique target IDs in M8 results."
+ f"PROCESSOR: Found {len(hit_sequence_ids)} unique target IDs in M8 results."
  )

  if not hit_sequence_ids:
  logger.warning(
- f"No target IDs found in {intermediate_results_m8_file} after parsing. Output hits FASTA will be empty."
+ f"PROCESSOR: No target IDs found in {intermediate_results_m8_file} after parsing. Output hits FASTA will be empty.",
+ exc_info=True,
  )
- intermediate_hits_fasta_file.touch() # Create empty file
+ intermediate_hits_fasta_file.touch()
  else:
+ logger.info(f"PROCESSOR: === CALLING subset_fasta ===")
+ logger.info(
+ f"PROCESSOR: Input FASTA for subset_fasta: {local_target_file}"
+ )
+ logger.info(
+ f"PROCESSOR: Output FASTA for subset_fasta: {intermediate_hits_fasta_file}"
+ )
  logger.info(
- f"Extracting {len(hit_sequence_ids)} hit sequences from {local_target_file} to {intermediate_hits_fasta_file} using subset_fasta."
+ f"PROCESSOR: Number of target IDs for subset_fasta: {len(hit_sequence_ids)}"
  )
  try:
  subset_fasta(
@@ -499,25 +508,34 @@ class MMSeqsProfileProcessor(Processor):
  return_written_ids=False,
  )
  logger.info(
- f"Successfully created hits FASTA: {intermediate_hits_fasta_file} using subset_fasta."
+ f"PROCESSOR: === RETURNED from subset_fasta ==="
+ )
+ logger.info(
+ f"PROCESSOR: Successfully created hits FASTA: {intermediate_hits_fasta_file} using subset_fasta."
  )
- except FileNotFoundError as e:
+ # More specific error catching can be added if subset_fasta raises custom exceptions
+ except FileNotFoundError as e_fnf:
  logger.error(
- f"subset_fasta FileNotFoundError: {e}. Ensuring {intermediate_hits_fasta_file} exists as empty."
+ f"PROCESSOR: subset_fasta FileNotFoundError: {e_fnf}. Ensuring {intermediate_hits_fasta_file} exists as empty.",
+ exc_info=True,
  )
  if not intermediate_hits_fasta_file.exists():
  intermediate_hits_fasta_file.touch()
  raise
- except Exception as e:
+ except (
+ Exception
+ ) as e_sub: # Catch any other exception from subset_fasta
  logger.error(
- f"subset_fasta failed to create {intermediate_hits_fasta_file}: {e}"
+ f"PROCESSOR: subset_fasta failed to create {intermediate_hits_fasta_file}: {e_sub}",
+ exc_info=True,
  )
  if not intermediate_hits_fasta_file.exists():
  intermediate_hits_fasta_file.touch()
  raise
- except Exception as e:
+ except Exception as e_m8_proc:
  logger.error(
- f"Error processing M8 file {intermediate_results_m8_file} or its hits extraction: {e}"
+ f"PROCESSOR: Error processing M8 file {intermediate_results_m8_file} or its hits extraction: {e_m8_proc}",
+ exc_info=True,
  )
  if not intermediate_hits_fasta_file.exists():
  intermediate_hits_fasta_file.touch()
@@ -525,7 +543,7 @@ class MMSeqsProfileProcessor(Processor):

  # 10. Write the set of hit sequence IDs to a .txt file
  logger.info(
- f"Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
+ f"PROCESSOR: Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
  )
  try:
  with open(final_hits_txt_file, "w") as txt_out:
@@ -533,13 +551,15 @@ class MMSeqsProfileProcessor(Processor):
  list(hit_sequence_ids)
  ): # Sort for consistent output
  txt_out.write(f"{seq_id}\n")
- logger.info(f"Successfully wrote hit IDs to {final_hits_txt_file}")
+ logger.info(
+ f"PROCESSOR: Successfully wrote hit IDs to {final_hits_txt_file}"
+ )
  except Exception as e:
  logger.error(f"Failed to write hit IDs to {final_hits_txt_file}: {e}")
  # The main workflow should still proceed even if this supplementary file fails

  logger.info(
- f"MMseqs2 workflow completed successfully. Intermediate outputs in {mmseqs_temp_dir}"
+ f"PROCESSOR: MMseqs2 workflow and FASTA/TXT generation completed successfully. Intermediate outputs in {mmseqs_temp_dir}"
  )

  # Move and rename final output files from mmseqs_temp_dir to run_base_dir
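Taken together, the patched step 9 reads the MMseqs2 tabular (.m8) output and hands the collected target IDs to subset_fasta. A hedged, standalone sketch of that flow follows; the file names are placeholders, and the keyword arguments mirror the parameter names visible in dayhoff_tools/fasta.py, though their defaults may differ.

from pathlib import Path

from dayhoff_tools.fasta import subset_fasta

# Collect target IDs (column 2) from an MMseqs2 .m8 results file.
m8_path = Path("results.m8")  # placeholder path
hit_ids: set[str] = set()
with open(m8_path) as m8_file:
    for line in m8_file:
        columns = line.strip().split("\t")
        if len(columns) >= 2:
            hit_ids.add(columns[1])

# Subset the searched FASTA down to just those hits.
subset_fasta(
    fasta_file="targets.fasta",   # placeholder input
    output_path="hits.fasta",     # placeholder output
    target_ids=hit_ids,
    exclude=False,
    return_written_ids=False,
)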
dayhoff_tools/fasta.py
@@ -8,7 +8,7 @@ import sqlite3
  import time
  from functools import partial
  from pathlib import Path
- from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
+ from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union

  import requests
  from Bio import SeqIO
@@ -604,29 +604,48 @@ def process_chunk(
  ) -> Tuple[List[str], Set[str]]:
  output_sequences = []
  written_ids = set()
- current_id = ""
- current_seq = []
-
- def id_matches(seq_id: str) -> bool:
- return any(part.lower() in target_ids_lower for part in seq_id.split("|"))
-
- for line in chunk:
- line = line.strip()
- if line.startswith(">"):
- if current_id and current_seq:
- if id_matches(current_id) != exclude:
- output_sequences.append(f">{current_id}\n{''.join(current_seq)}\n")
- written_ids.add(current_id)
- current_id = line[1:]
- current_seq = []
- elif current_id:
- current_seq.append(line)
-
- # Process the last sequence in the chunk
- if current_id and current_seq and id_matches(current_id) != exclude:
- output_sequences.append(f">{current_id}\n{''.join(current_seq)}\n")
- written_ids.add(current_id)
+ current_id: str = ""
+ current_seq: List[str] = []
+
+ # Get a unique worker ID, could be process ID
+ worker_id = os.getpid()
+ logger.debug(
+ f"SUBSET_FASTA_PROCESS_CHUNK: Worker {worker_id} processing a chunk. Target IDs count: {len(target_ids_lower)}, Exclude: {exclude}"
+ )
+ try:
+
+ def id_matches(seq_id: str) -> bool:
+ return any(part.lower() in target_ids_lower for part in seq_id.split("|"))
+
+ for line in chunk:
+ line = line.strip()
+ if line.startswith(">"):
+ if current_id and current_seq:
+ if id_matches(current_id) != exclude:
+ output_sequences.append(
+ f">{current_id}\n{''.join(current_seq)}\n"
+ )
+ written_ids.add(current_id)
+ current_id = line[1:]
+ current_seq = []
+ elif current_id:
+ current_seq.append(line)
+
+ # Process the last sequence in the chunk
+ if current_id and current_seq and id_matches(current_id) != exclude:
+ output_sequences.append(f">{current_id}\n{''.join(current_seq)}\n")
+ written_ids.add(current_id)

+ except Exception as e:
+ logger.error(
+ f"SUBSET_FASTA_PROCESS_CHUNK: Worker {worker_id} encountered error: {e}",
+ exc_info=True,
+ )
+ # Re-raising the exception so the main process's pool error handling can catch it
+ raise
+ logger.debug(
+ f"SUBSET_FASTA_PROCESS_CHUNK: Worker {worker_id} finished chunk. Output sequences: {len(output_sequences)}, Written IDs: {len(written_ids)}"
+ )
  return output_sequences, written_ids

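To make the per-chunk contract concrete, here is a hedged illustration: a chunk is a list of raw FASTA lines, and the worker returns the matching records plus the header IDs it wrote. The call order is inferred from the partial() binding shown in the next hunk; the sample records are invented.

from dayhoff_tools.fasta import process_chunk

# Invented two-record chunk; only the first header contains the target ID.
chunk = [
    ">sp|P12345|EXAMPLE some description\n",
    "MKTAYIAKQR\n",
    ">sp|Q99999|OTHER another description\n",
    "MENLYFQGAA\n",
]
sequences, written_ids = process_chunk(
    chunk, target_ids_lower={"p12345"}, exclude=False
)
# sequences holds the full FASTA record for sp|P12345|...; written_ids holds its header line.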
@@ -655,50 +674,98 @@ def subset_fasta(
  Raises:
  FileExistsError: If the output file already exists.
  """
+ logger.info(
+ f"SUBSET_FASTA: Starting for input '{fasta_file}', output '{output_path}'. Target IDs: {len(target_ids)}, Exclude: {exclude}"
+ )
  _check_output_file(output_path)

  target_ids_lower = {id.lower() for id in target_ids}
  total_size = os.path.getsize(fasta_file)
- chunk_size = max(
- 1, total_size // (multiprocessing.cpu_count() * 2)
- ) # Adjust chunk size based on CPU count

- def chunk_reader(file_obj, chunk_size: int):
+ # Determine a reasonable number of processes
+ num_processes = multiprocessing.cpu_count()
+ # Adjust chunk size based on number of processes to balance load vs memory
+ # Aim for at least a few chunks per process if possible, but not too many small chunks.
+ # This is a heuristic and might need tuning.
+ # Let's make chunks reasonably large, e.g., 10-50MB, or ensure at least num_processes chunks.
+ # If total_size is very small, chunk_size could become 0 if not handled.
+ desired_chunk_size_mb = 32
+ chunk_size = max(1, desired_chunk_size_mb * 1024 * 1024)
+ num_chunks = max(1, math.ceil(total_size / chunk_size))
+
+ def chunk_reader(
+ file_obj, cs: int
+ ) -> Iterator[List[str]]: # Explicitly Iterator[List[str]]
  chunk = []
  chunk_bytes = 0
  for line in file_obj:
  chunk.append(line)
  chunk_bytes += len(line)
- if chunk_bytes >= chunk_size and line.startswith(">"):
+ if chunk_bytes >= cs and line.startswith(">"):
  yield chunk
  chunk = [line]
  chunk_bytes = len(line)
  if chunk:
  yield chunk

- open_func = gzip.open if fasta_file.endswith(".gz") else open
- mode = "rt" if fasta_file.endswith(".gz") else "r"
+ mode = "rt" # text mode for both gzip and regular open

- with open_func(fasta_file, mode) as input_file:
- with multiprocessing.Pool() as pool:
- process_func = partial(
- process_chunk, target_ids_lower=target_ids_lower, exclude=exclude
+ all_written_ids: Set[str] = set()
+ try:
+ with open(fasta_file, mode) as input_file:
+ logger.info(
+ f"SUBSET_FASTA: Using up to {num_processes} worker processes for {num_chunks} potential chunks."
  )
- results = list(
- tqdm(
- pool.imap(process_func, chunk_reader(input_file, chunk_size)),
- total=total_size // chunk_size,
- desc="Processing FASTA",
+
+ with multiprocessing.Pool(processes=num_processes) as pool:
+ logger.info(
+ f"SUBSET_FASTA: Multiprocessing pool created (intended processes: {num_processes})."
  )
- )

- all_written_ids = set()
- with open(output_path, "w") as output_file:
- for output_sequences, written_ids in results:
- output_file.writelines(output_sequences)
- all_written_ids.update(written_ids)
+ process_func = partial(
+ process_chunk, target_ids_lower=target_ids_lower, exclude=exclude
+ )

- print(f"Wrote {len(all_written_ids)} sequences to {output_path}")
+ # Using imap_unordered can sometimes be better for memory with many results,
+ # as results are processed as they complete.
+ # However, for aggregation later, order doesn't strictly matter for building the final set/list of strings.
+ # tqdm will work with imap and imap_unordered.
+
+ # Calculate total for tqdm more robustly
+ actual_num_chunks_for_tqdm = num_chunks # Use the calculated num_chunks
+
+ try:
+ results_buffer = []
+ for result_tuple in tqdm(
+ pool.imap(process_func, chunk_reader(input_file, chunk_size)),
+ total=actual_num_chunks_for_tqdm, # Use calculated number of chunks
+ desc="Processing FASTA (subset_fasta)",
+ ):
+ results_buffer.append(result_tuple)
+ logger.debug("SUBSET_FASTA: pool.imap completed.")
+ except Exception as e_pool:
+ logger.error(
+ f"SUBSET_FASTA: Error during multiprocessing pool.imap: {e_pool}",
+ exc_info=True,
+ )
+ raise
+
+ logger.debug(
+ f"SUBSET_FASTA: Aggregating results from {len(results_buffer)} processed chunks."
+ )
+ with open(output_path, "w") as output_file:
+ for output_sequences, written_ids_chunk in results_buffer:
+ output_file.writelines(output_sequences)
+ all_written_ids.update(written_ids_chunk)
+ except Exception as e_main:
+ logger.error(
+ f"SUBSET_FASTA: Error in main processing logic: {e_main}", exc_info=True
+ )
+ raise
+
+ logger.info(
+ f"SUBSET_FASTA: Wrote {len(all_written_ids)} sequences to {output_path}. Finished."
+ )
  return all_written_ids if return_written_ids else None

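The old code sized chunks by CPU count; the new code targets a fixed ~32 MB chunk and derives the chunk count from the file size. A small worked example of that arithmetic, with an illustrative file size:

import math

total_size = 1_000_000_000  # illustrative ~1 GB FASTA file
desired_chunk_size_mb = 32
chunk_size = max(1, desired_chunk_size_mb * 1024 * 1024)  # 33_554_432 bytes
num_chunks = max(1, math.ceil(total_size / chunk_size))   # 30 chunks
print(chunk_size, num_chunks)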
@@ -779,7 +846,7 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> No
  batch = []

  for protein_id, sequence in tqdm(
- _protein_generator(fasta_file),
+ _protein_generator(Path(fasta_file)), # Pass as Path object
  total=estimated_records,
  desc="Processing proteins",
  ):
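A hedged usage sketch for the function patched here, using the signature shown in the hunk header; the file names are placeholders.

from dayhoff_tools.fasta import fasta_to_sqlite

# Build a SQLite database of (protein_id, sequence) rows from a FASTA file.
fasta_to_sqlite("proteins.fasta", "proteins.db", batch_size=1000)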
@@ -804,22 +871,27 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> No
  print(f"Conversion completed. SQLite database saved to {db_file}")


- def _protein_generator(fasta_path: Path) -> Iterator[tuple[str, str]]:
+ def _protein_generator(
+ fasta_path: Path,
+ ) -> Iterator[tuple[str, str]]: # fasta_path is Path
  """
  Generate protein data from a FASTA file.
-
  Args:
  fasta_path (Path): Path to the FASTA file.
-
  Yields:
  tuple[str, str]: A tuple containing protein_id and sequence.
  """
- for record in SeqIO.parse(fasta_path, "fasta"):
- protein_id = record.id.split()[
- 0
- ] # Assumes the first part of the id is the protein_id
- sequence = str(record.seq)
- yield protein_id, sequence
+ # Ensure we use 'rt' for text mode reading, especially if gzipped
+ open_func = gzip.open if str(fasta_path).endswith(".gz") else open
+ mode = "rt"
+
+ with open_func(fasta_path, mode) as handle:
+ for record in SeqIO.parse(handle, "fasta"):
+ protein_id = record.id.split()[
+ 0
+ ] # Assumes the first part of the id is the protein_id
+ sequence = str(record.seq)
+ yield protein_id, sequence


  def check_fasta_duplicates(fasta_path: str) -> tuple[set[str], set[str]]:
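The new generator opens gzipped FASTA files in text mode before handing them to Bio.SeqIO. A minimal standalone sketch of the same pattern, assuming Biopython is installed; the function name and input path are hypothetical.

import gzip
from pathlib import Path
from typing import Iterator, Tuple

from Bio import SeqIO

def iter_proteins(fasta_path: Path) -> Iterator[Tuple[str, str]]:
    # Open .gz files with gzip.open and plain files with open, both in text
    # mode, so SeqIO.parse always receives decoded lines.
    open_func = gzip.open if str(fasta_path).endswith(".gz") else open
    with open_func(fasta_path, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            yield record.id.split()[0], str(record.seq)

for protein_id, sequence in iter_proteins(Path("proteins.fasta.gz")):
    print(protein_id, len(sequence))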
pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"

  [project]
  name = "dayhoff-tools"
- version = "1.1.22"
+ version = "1.1.24"
  description = "Common tools for all the repos at Dayhoff Labs"
  authors = [
  {name = "Daniel Martin-Alarcon", email = "dma@dayhofflabs.com"}