dayhoff-tools 1.1.37__py3-none-any.whl → 1.1.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -61,6 +61,7 @@ def create_batch_job_config(config: dict, image_uri: str) -> dict:
61
61
  "allocation_policy", # Goes into batch_config.allocationPolicy
62
62
  "logs_policy", # Goes into batch_config.logsPolicy
63
63
  "batch_job", # Contains detailed task and resource specs
64
+ "image_uri",
64
65
  # Keys like job_name, region, registry_uri, repository are used by other functions
65
66
  # or for other purposes, not directly for constructing the core batch_config JSON here.
66
67
  }
@@ -7,11 +7,14 @@ import h5py
7
7
  import numpy as np
8
8
  import pandas as pd
9
9
  import torch
10
+ import torch.utils.data
10
11
  from dayhoff_tools.deployment.processors import Processor
11
12
  from dayhoff_tools.fasta import (
12
13
  clean_noncanonical_fasta,
13
14
  clean_noncanonical_fasta_to_dict,
14
15
  )
16
+ from esm import FastaBatchedDataset, pretrained
17
+ from transformers import T5EncoderModel, T5Tokenizer
15
18
 
16
19
  logger = logging.getLogger(__name__)
17
20
 
@@ -49,8 +52,6 @@ class ESMEmbedder(Processor):
49
52
 
50
53
  def _load_model(self):
51
54
  """Download pre-trained model and load onto device"""
52
- from esm import pretrained
53
-
54
55
  self.model, self.alphabet = pretrained.load_model_and_alphabet(self.model_name)
55
56
  self.model.eval()
56
57
  if torch.cuda.is_available():
@@ -61,8 +62,6 @@ class ESMEmbedder(Processor):
61
62
 
62
63
  def _load_dataset(self, fasta_file: str) -> None:
63
64
  """Load FASTA file into batched dataset and dataloader"""
64
- from esm import FastaBatchedDataset
65
-
66
65
  if not fasta_file.endswith(".fasta"):
67
66
  raise ValueError("Input file must have .fasta extension.")
68
67
 
@@ -764,8 +763,6 @@ class ProstT5Embedder(Embedder):
764
763
  The model automatically selects half precision (float16) when running on GPU
765
764
  and full precision (float32) when running on CPU.
766
765
  """
767
- from transformers import T5EncoderModel, T5Tokenizer
768
-
769
766
  tokenizer = T5Tokenizer.from_pretrained(
770
767
  "Rostlab/ProstT5", do_lower_case=False, legacy=True
771
768
  )
@@ -854,8 +851,6 @@ class T5Embedder(Embedder):
854
851
  The model automatically handles memory management and batch processing
855
852
  based on sequence sizes and available resources.
856
853
  """
857
- from transformers import T5EncoderModel, T5Tokenizer
858
-
859
854
  tokenizer = T5Tokenizer.from_pretrained(
860
855
  "Rostlab/prot_t5_xl_half_uniref50-enc", do_lower_case=False
861
856
  )
dayhoff_tools/fasta.py CHANGED
@@ -11,6 +11,8 @@ from pathlib import Path
11
11
  from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
12
12
 
13
13
  import requests
14
+ from Bio import SeqIO
15
+ from Bio.SeqRecord import SeqRecord
14
16
 
15
17
  logger = logging.getLogger(__name__)
16
18
 
@@ -136,6 +138,8 @@ def combine_fasta_files(input_path: Union[str, List[str]], output_path: str) ->
136
138
  Raises:
137
139
  FileExistsError: If the output file already exists.
138
140
  """
141
+ from tqdm import tqdm
142
+
139
143
  _check_output_file(output_path)
140
144
 
141
145
  if isinstance(input_path, str):
@@ -286,6 +290,11 @@ def split_fasta(
286
290
  Returns:
287
291
  int: The number of output files created.
288
292
  """
293
+ from typing import TYPE_CHECKING, Optional
294
+
295
+ if TYPE_CHECKING:
296
+ from tqdm import tqdm
297
+
289
298
  # Ensure the target folder exists
290
299
  os.makedirs(target_folder, exist_ok=True)
291
300
 
@@ -295,7 +304,7 @@ def split_fasta(
295
304
  files_created = 0
296
305
  current_output_file_sequence_count = 0
297
306
  current_output_file_bytes_written = 0
298
- pbar: tqdm | None = None
307
+ pbar: Optional["tqdm"] = None
299
308
  output_file = None # Will be opened when we encounter the first header line
300
309
  output_file_path = ""
301
310
 
@@ -310,6 +319,8 @@ def split_fasta(
310
319
  # Open the large FASTA file for reading
311
320
  with open(fasta_file, "r", buffering=1024 * 1024) as fasta:
312
321
  if show_progress:
322
+ from tqdm import tqdm
323
+
313
324
  total_size = os.path.getsize(fasta_file)
314
325
  pbar = tqdm(
315
326
  total=total_size,
@@ -438,6 +449,7 @@ def subtract_fasta_files(file1: str, file2: str, output_file: str):
438
449
  FileExistsError: If the output file already exists.
439
450
  """
440
451
  from Bio import SeqIO
452
+ from tqdm import tqdm
441
453
 
442
454
  _check_output_file(output_file)
443
455
 
@@ -738,6 +750,8 @@ def subset_fasta(
738
750
  actual_num_chunks_for_tqdm = num_chunks # Use the calculated num_chunks
739
751
 
740
752
  try:
753
+ from tqdm import tqdm
754
+
741
755
  results_buffer = []
742
756
  for result_tuple in tqdm(
743
757
  pool.imap(process_func, chunk_reader(input_file, chunk_size)),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dayhoff-tools
3
- Version: 1.1.37
3
+ Version: 1.1.39
4
4
  Summary: Common tools for all the repos at Dayhoff Labs
5
5
  Author: Daniel Martin-Alarcon
6
6
  Author-email: dma@dayhofflabs.com
@@ -8,13 +8,13 @@ dayhoff_tools/cli/swarm_commands.py,sha256=5EyKj8yietvT5lfoz8Zx0iQvVaNgc3SJX1z2z
8
8
  dayhoff_tools/cli/utility_commands.py,sha256=ER4VrJt4hu904MwrcltUXjwBWT4uFrP-aPXjdXyT3F8,24685
9
9
  dayhoff_tools/deployment/base.py,sha256=8tXwsPYvRo-zV-aNhHw1c7Rji-KWg8S5xoCCznFnVVI,17412
10
10
  dayhoff_tools/deployment/deploy_aws.py,sha256=jQyQ0fbm2793jEHFO84lr5tNqiOpdBg6U0S5zCVJr1M,17884
11
- dayhoff_tools/deployment/deploy_gcp.py,sha256=jiEE_tBVeSavAI8o_6qPDPpaoXKexcaNIa4uXcv3y0M,8839
11
+ dayhoff_tools/deployment/deploy_gcp.py,sha256=xgaOVsUDmP6wSEMYNkm1yRNcVskfdz80qJtCulkBIAM,8860
12
12
  dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
13
13
  dayhoff_tools/deployment/job_runner.py,sha256=hljvFpH2Bw96uYyUup5Ths72PZRL_X27KxlYzBMgguo,5086
14
14
  dayhoff_tools/deployment/processors.py,sha256=A7zvF47TfCkuLTCvaqZmk1M9ZgZcv6CAoXZCV6rEXuE,34599
15
15
  dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
16
- dayhoff_tools/embedders.py,sha256=yO13jwVSqd-gk-PUhBE_7lnlRUIHn2tirxZ_mFAVAgM,36557
17
- dayhoff_tools/fasta.py,sha256=UARjl3w4O6hlZSgzuZZkG-Mi89TLxvxUDL-WEt0-5OU,50210
16
+ dayhoff_tools/embedders.py,sha256=svP_ksm3FdyVZ8i8R9R5uoGu2qI_hVQ_eztG0drXkN8,36477
17
+ dayhoff_tools/fasta.py,sha256=_kA2Cpiy7JAGbBqLrjElkzbcUD_p-nO2d5Aj1LVmOvc,50509
18
18
  dayhoff_tools/file_ops.py,sha256=JlGowvr-CUJFidV-4g_JmhUTN9bsYuaxtqKmnKomm-Q,8506
19
19
  dayhoff_tools/h5.py,sha256=j1nxxaiHsMidVX_XwB33P1Pz9d7K8ZKiDZwJWQUUQSY,21158
20
20
  dayhoff_tools/intake/gcp.py,sha256=uCeEskhbEwJIYpN6ne6siT1dbpTizCjjel-hRe0kReE,3030
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
26
26
  dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
27
27
  dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
28
28
  dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
29
- dayhoff_tools-1.1.37.dist-info/METADATA,sha256=wN6qyLCZ5vAQaZdS-8KNuzFnlHIWSz1moDJSugGyVjY,2843
30
- dayhoff_tools-1.1.37.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
31
- dayhoff_tools-1.1.37.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
32
- dayhoff_tools-1.1.37.dist-info/RECORD,,
29
+ dayhoff_tools-1.1.39.dist-info/METADATA,sha256=c-7TpBB15X71z48kz2BUHFFqWddHgywH88BLequz3d0,2843
30
+ dayhoff_tools-1.1.39.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
31
+ dayhoff_tools-1.1.39.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
32
+ dayhoff_tools-1.1.39.dist-info/RECORD,,