dayhoff-tools 1.1.36__py3-none-any.whl → 1.1.38__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -61,6 +61,7 @@ def create_batch_job_config(config: dict, image_uri: str) -> dict:
61
61
  "allocation_policy", # Goes into batch_config.allocationPolicy
62
62
  "logs_policy", # Goes into batch_config.logsPolicy
63
63
  "batch_job", # Contains detailed task and resource specs
64
+ "image_uri",
64
65
  # Keys like job_name, region, registry_uri, repository are used by other functions
65
66
  # or for other purposes, not directly for constructing the core batch_config JSON here.
66
67
  }
dayhoff_tools/fasta.py CHANGED
@@ -13,8 +13,6 @@ from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
13
13
  import requests
14
14
  from Bio import SeqIO
15
15
  from Bio.SeqRecord import SeqRecord
16
- from tqdm import tqdm
17
- from tqdm.notebook import tqdm as tqdm_notebook
18
16
 
19
17
  logger = logging.getLogger(__name__)
20
18
 
@@ -140,6 +138,8 @@ def combine_fasta_files(input_path: Union[str, List[str]], output_path: str) ->
140
138
  Raises:
141
139
  FileExistsError: If the output file already exists.
142
140
  """
141
+ from tqdm import tqdm
142
+
143
143
  _check_output_file(output_path)
144
144
 
145
145
  if isinstance(input_path, str):
@@ -290,6 +290,11 @@ def split_fasta(
290
290
  Returns:
291
291
  int: The number of output files created.
292
292
  """
293
+ from typing import TYPE_CHECKING, Optional
294
+
295
+ if TYPE_CHECKING:
296
+ from tqdm import tqdm
297
+
293
298
  # Ensure the target folder exists
294
299
  os.makedirs(target_folder, exist_ok=True)
295
300
 
@@ -299,7 +304,7 @@ def split_fasta(
299
304
  files_created = 0
300
305
  current_output_file_sequence_count = 0
301
306
  current_output_file_bytes_written = 0
302
- pbar: tqdm | None = None
307
+ pbar: Optional["tqdm"] = None
303
308
  output_file = None # Will be opened when we encounter the first header line
304
309
  output_file_path = ""
305
310
 
@@ -314,6 +319,8 @@ def split_fasta(
314
319
  # Open the large FASTA file for reading
315
320
  with open(fasta_file, "r", buffering=1024 * 1024) as fasta:
316
321
  if show_progress:
322
+ from tqdm import tqdm
323
+
317
324
  total_size = os.path.getsize(fasta_file)
318
325
  pbar = tqdm(
319
326
  total=total_size,
@@ -441,6 +448,9 @@ def subtract_fasta_files(file1: str, file2: str, output_file: str):
441
448
  Raises:
442
449
  FileExistsError: If the output file already exists.
443
450
  """
451
+ from Bio import SeqIO
452
+ from tqdm import tqdm
453
+
444
454
  _check_output_file(output_file)
445
455
 
446
456
  # Load sequences from file1 with a progress bar
@@ -497,6 +507,8 @@ def simplify_fasta_ids(
497
507
  Raises:
498
508
  FileExistsError: If the output file already exists.
499
509
  """
510
+ from Bio import SeqIO
511
+
500
512
  _check_output_file(output_fasta)
501
513
 
502
514
  count = 0
@@ -575,6 +587,9 @@ def extract_ids_from_fasta(fasta_file: str) -> Set[str]:
575
587
  Raises:
576
588
  ValueError: If there's an issue reading or parsing the input file.
577
589
  """
590
+ from Bio import SeqIO
591
+ from tqdm import tqdm
592
+
578
593
  sequence_ids: Set[str] = set()
579
594
  try:
580
595
  estimated_records = estimate_sequences(fasta_file)
@@ -735,6 +750,8 @@ def subset_fasta(
735
750
  actual_num_chunks_for_tqdm = num_chunks # Use the calculated num_chunks
736
751
 
737
752
  try:
753
+ from tqdm import tqdm
754
+
738
755
  results_buffer = []
739
756
  for result_tuple in tqdm(
740
757
  pool.imap(process_func, chunk_reader(input_file, chunk_size)),
@@ -769,7 +786,7 @@ def subset_fasta(
769
786
  return all_written_ids if return_written_ids else None
770
787
 
771
788
 
772
- def load_fasta_as_dict(fasta_file: str) -> Dict[str, SeqRecord]:
789
+ def load_fasta_as_dict(fasta_file: str) -> Dict[str, "SeqRecord"]:
773
790
  """
774
791
  Load a FASTA file into a dictionary with record IDs as keys.
775
792
  Keep only the first instance of each identifier.
@@ -780,6 +797,10 @@ def load_fasta_as_dict(fasta_file: str) -> Dict[str, SeqRecord]:
780
797
  Returns:
781
798
  Dict[str, SeqRecord]: A dictionary with record IDs as keys and SeqRecord objects as values.
782
799
  """
800
+ from Bio import SeqIO
801
+ from Bio.SeqRecord import SeqRecord
802
+ from tqdm import tqdm
803
+
783
804
  record_dict: Dict[str, SeqRecord] = {}
784
805
  estimated_sequences = estimate_sequences(fasta_file)
785
806
 
@@ -815,6 +836,9 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> No
815
836
  Example:
816
837
  fasta_to_sqlite("proteins.fasta", "proteins.db")
817
838
  """
839
+ from Bio import SeqIO
840
+ from tqdm import tqdm
841
+
818
842
  _check_output_file(db_file)
819
843
 
820
844
  if not os.path.exists(fasta_file):
@@ -881,6 +905,8 @@ def _protein_generator(
881
905
  Yields:
882
906
  tuple[str, str]: A tuple containing protein_id and sequence.
883
907
  """
908
+ from Bio import SeqIO
909
+
884
910
  # Ensure we use 'rt' for text mode reading, especially if gzipped
885
911
  open_func = gzip.open if str(fasta_path).endswith(".gz") else open
886
912
  mode = "rt"
@@ -911,6 +937,9 @@ def check_fasta_duplicates(fasta_path: str) -> tuple[set[str], set[str]]:
911
937
  FileNotFoundError: If the input file doesn't exist
912
938
  ValueError: If the FASTA file is malformed
913
939
  """
940
+ from Bio import SeqIO
941
+ from tqdm import tqdm
942
+
914
943
  if not os.path.exists(fasta_path):
915
944
  raise FileNotFoundError(f"FASTA file not found: {fasta_path}")
916
945
 
@@ -987,6 +1016,12 @@ def clean_fasta_duplicates(
987
1016
  FileExistsError: If the output file already exists
988
1017
  FileNotFoundError: If the input file doesn't exist
989
1018
  """
1019
+ from Bio import SeqIO
1020
+ from tqdm import tqdm
1021
+
1022
+ if not os.path.exists(input_path):
1023
+ raise FileNotFoundError(f"Input FASTA file not found: {input_path}")
1024
+
990
1025
  _check_output_file(output_path)
991
1026
 
992
1027
  # First pass: collect sequence hashes for each ID
@@ -1075,6 +1110,8 @@ def fetch_uniprot_fasta(
1075
1110
  Returns:
1076
1111
  tuple: (success_count, failed_count, output_filepath, failed_accessions)
1077
1112
  """
1113
+ from tqdm.notebook import tqdm as tqdm_notebook
1114
+
1078
1115
  # Convert set to list for batch processing
1079
1116
  accession_list = list(accession_set)
1080
1117
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dayhoff-tools
3
- Version: 1.1.36
3
+ Version: 1.1.38
4
4
  Summary: Common tools for all the repos at Dayhoff Labs
5
5
  Author: Daniel Martin-Alarcon
6
6
  Author-email: dma@dayhofflabs.com
@@ -8,13 +8,13 @@ dayhoff_tools/cli/swarm_commands.py,sha256=5EyKj8yietvT5lfoz8Zx0iQvVaNgc3SJX1z2z
8
8
  dayhoff_tools/cli/utility_commands.py,sha256=ER4VrJt4hu904MwrcltUXjwBWT4uFrP-aPXjdXyT3F8,24685
9
9
  dayhoff_tools/deployment/base.py,sha256=8tXwsPYvRo-zV-aNhHw1c7Rji-KWg8S5xoCCznFnVVI,17412
10
10
  dayhoff_tools/deployment/deploy_aws.py,sha256=jQyQ0fbm2793jEHFO84lr5tNqiOpdBg6U0S5zCVJr1M,17884
11
- dayhoff_tools/deployment/deploy_gcp.py,sha256=jiEE_tBVeSavAI8o_6qPDPpaoXKexcaNIa4uXcv3y0M,8839
11
+ dayhoff_tools/deployment/deploy_gcp.py,sha256=xgaOVsUDmP6wSEMYNkm1yRNcVskfdz80qJtCulkBIAM,8860
12
12
  dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
13
13
  dayhoff_tools/deployment/job_runner.py,sha256=hljvFpH2Bw96uYyUup5Ths72PZRL_X27KxlYzBMgguo,5086
14
14
  dayhoff_tools/deployment/processors.py,sha256=A7zvF47TfCkuLTCvaqZmk1M9ZgZcv6CAoXZCV6rEXuE,34599
15
15
  dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
16
16
  dayhoff_tools/embedders.py,sha256=svP_ksm3FdyVZ8i8R9R5uoGu2qI_hVQ_eztG0drXkN8,36477
17
- dayhoff_tools/fasta.py,sha256=Ls6AG84IgG8COgAefqB3KS6iMbnixP_Up5EwUur-VUs,49780
17
+ dayhoff_tools/fasta.py,sha256=_kA2Cpiy7JAGbBqLrjElkzbcUD_p-nO2d5Aj1LVmOvc,50509
18
18
  dayhoff_tools/file_ops.py,sha256=JlGowvr-CUJFidV-4g_JmhUTN9bsYuaxtqKmnKomm-Q,8506
19
19
  dayhoff_tools/h5.py,sha256=j1nxxaiHsMidVX_XwB33P1Pz9d7K8ZKiDZwJWQUUQSY,21158
20
20
  dayhoff_tools/intake/gcp.py,sha256=uCeEskhbEwJIYpN6ne6siT1dbpTizCjjel-hRe0kReE,3030
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
26
26
  dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
27
27
  dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
28
28
  dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
29
- dayhoff_tools-1.1.36.dist-info/METADATA,sha256=71FGAv8K1KGE-Q7MN3W2bfZYDHCYUb7GGnsLkwPLQPw,2843
30
- dayhoff_tools-1.1.36.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
31
- dayhoff_tools-1.1.36.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
32
- dayhoff_tools-1.1.36.dist-info/RECORD,,
29
+ dayhoff_tools-1.1.38.dist-info/METADATA,sha256=nDSK0SHTOMdieTxWDLScNArXB4g5TLAocONnt4xD89k,2843
30
+ dayhoff_tools-1.1.38.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
31
+ dayhoff_tools-1.1.38.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
32
+ dayhoff_tools-1.1.38.dist-info/RECORD,,