dayhoff-tools 1.1.36__py3-none-any.whl → 1.1.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,14 +7,11 @@ import h5py
7
7
  import numpy as np
8
8
  import pandas as pd
9
9
  import torch
10
- import torch.utils.data
11
10
  from dayhoff_tools.deployment.processors import Processor
12
11
  from dayhoff_tools.fasta import (
13
12
  clean_noncanonical_fasta,
14
13
  clean_noncanonical_fasta_to_dict,
15
14
  )
16
- from esm import FastaBatchedDataset, pretrained
17
- from transformers import T5EncoderModel, T5Tokenizer
18
15
 
19
16
  logger = logging.getLogger(__name__)
20
17
 
@@ -52,6 +49,8 @@ class ESMEmbedder(Processor):
52
49
 
53
50
  def _load_model(self):
54
51
  """Download pre-trained model and load onto device"""
52
+ from esm import pretrained
53
+
55
54
  self.model, self.alphabet = pretrained.load_model_and_alphabet(self.model_name)
56
55
  self.model.eval()
57
56
  if torch.cuda.is_available():
@@ -62,6 +61,8 @@ class ESMEmbedder(Processor):
62
61
 
63
62
  def _load_dataset(self, fasta_file: str) -> None:
64
63
  """Load FASTA file into batched dataset and dataloader"""
64
+ from esm import FastaBatchedDataset
65
+
65
66
  if not fasta_file.endswith(".fasta"):
66
67
  raise ValueError("Input file must have .fasta extension.")
67
68
 
@@ -763,6 +764,8 @@ class ProstT5Embedder(Embedder):
763
764
  The model automatically selects half precision (float16) when running on GPU
764
765
  and full precision (float32) when running on CPU.
765
766
  """
767
+ from transformers import T5EncoderModel, T5Tokenizer
768
+
766
769
  tokenizer = T5Tokenizer.from_pretrained(
767
770
  "Rostlab/ProstT5", do_lower_case=False, legacy=True
768
771
  )
@@ -851,6 +854,8 @@ class T5Embedder(Embedder):
851
854
  The model automatically handles memory management and batch processing
852
855
  based on sequence sizes and available resources.
853
856
  """
857
+ from transformers import T5EncoderModel, T5Tokenizer
858
+
854
859
  tokenizer = T5Tokenizer.from_pretrained(
855
860
  "Rostlab/prot_t5_xl_half_uniref50-enc", do_lower_case=False
856
861
  )
dayhoff_tools/fasta.py CHANGED
@@ -11,10 +11,6 @@ from pathlib import Path
11
11
  from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
12
12
 
13
13
  import requests
14
- from Bio import SeqIO
15
- from Bio.SeqRecord import SeqRecord
16
- from tqdm import tqdm
17
- from tqdm.notebook import tqdm as tqdm_notebook
18
14
 
19
15
  logger = logging.getLogger(__name__)
20
16
 
@@ -441,6 +437,8 @@ def subtract_fasta_files(file1: str, file2: str, output_file: str):
441
437
  Raises:
442
438
  FileExistsError: If the output file already exists.
443
439
  """
440
+ from Bio import SeqIO
441
+
444
442
  _check_output_file(output_file)
445
443
 
446
444
  # Load sequences from file1 with a progress bar
@@ -497,6 +495,8 @@ def simplify_fasta_ids(
497
495
  Raises:
498
496
  FileExistsError: If the output file already exists.
499
497
  """
498
+ from Bio import SeqIO
499
+
500
500
  _check_output_file(output_fasta)
501
501
 
502
502
  count = 0
@@ -575,6 +575,9 @@ def extract_ids_from_fasta(fasta_file: str) -> Set[str]:
575
575
  Raises:
576
576
  ValueError: If there's an issue reading or parsing the input file.
577
577
  """
578
+ from Bio import SeqIO
579
+ from tqdm import tqdm
580
+
578
581
  sequence_ids: Set[str] = set()
579
582
  try:
580
583
  estimated_records = estimate_sequences(fasta_file)
@@ -769,7 +772,7 @@ def subset_fasta(
769
772
  return all_written_ids if return_written_ids else None
770
773
 
771
774
 
772
- def load_fasta_as_dict(fasta_file: str) -> Dict[str, SeqRecord]:
775
+ def load_fasta_as_dict(fasta_file: str) -> Dict[str, "SeqRecord"]:
773
776
  """
774
777
  Load a FASTA file into a dictionary with record IDs as keys.
775
778
  Keep only the first instance of each identifier.
@@ -780,6 +783,10 @@ def load_fasta_as_dict(fasta_file: str) -> Dict[str, SeqRecord]:
780
783
  Returns:
781
784
  Dict[str, SeqRecord]: A dictionary with record IDs as keys and SeqRecord objects as values.
782
785
  """
786
+ from Bio import SeqIO
787
+ from Bio.SeqRecord import SeqRecord
788
+ from tqdm import tqdm
789
+
783
790
  record_dict: Dict[str, SeqRecord] = {}
784
791
  estimated_sequences = estimate_sequences(fasta_file)
785
792
 
@@ -815,6 +822,9 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> No
815
822
  Example:
816
823
  fasta_to_sqlite("proteins.fasta", "proteins.db")
817
824
  """
825
+ from Bio import SeqIO
826
+ from tqdm import tqdm
827
+
818
828
  _check_output_file(db_file)
819
829
 
820
830
  if not os.path.exists(fasta_file):
@@ -881,6 +891,8 @@ def _protein_generator(
881
891
  Yields:
882
892
  tuple[str, str]: A tuple containing protein_id and sequence.
883
893
  """
894
+ from Bio import SeqIO
895
+
884
896
  # Ensure we use 'rt' for text mode reading, especially if gzipped
885
897
  open_func = gzip.open if str(fasta_path).endswith(".gz") else open
886
898
  mode = "rt"
@@ -911,6 +923,9 @@ def check_fasta_duplicates(fasta_path: str) -> tuple[set[str], set[str]]:
911
923
  FileNotFoundError: If the input file doesn't exist
912
924
  ValueError: If the FASTA file is malformed
913
925
  """
926
+ from Bio import SeqIO
927
+ from tqdm import tqdm
928
+
914
929
  if not os.path.exists(fasta_path):
915
930
  raise FileNotFoundError(f"FASTA file not found: {fasta_path}")
916
931
 
@@ -987,6 +1002,12 @@ def clean_fasta_duplicates(
987
1002
  FileExistsError: If the output file already exists
988
1003
  FileNotFoundError: If the input file doesn't exist
989
1004
  """
1005
+ from Bio import SeqIO
1006
+ from tqdm import tqdm
1007
+
1008
+ if not os.path.exists(input_path):
1009
+ raise FileNotFoundError(f"Input FASTA file not found: {input_path}")
1010
+
990
1011
  _check_output_file(output_path)
991
1012
 
992
1013
  # First pass: collect sequence hashes for each ID
@@ -1075,6 +1096,8 @@ def fetch_uniprot_fasta(
1075
1096
  Returns:
1076
1097
  tuple: (success_count, failed_count, output_filepath, failed_accessions)
1077
1098
  """
1099
+ from tqdm.notebook import tqdm as tqdm_notebook
1100
+
1078
1101
  # Convert set to list for batch processing
1079
1102
  accession_list = list(accession_set)
1080
1103
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dayhoff-tools
3
- Version: 1.1.36
3
+ Version: 1.1.37
4
4
  Summary: Common tools for all the repos at Dayhoff Labs
5
5
  Author: Daniel Martin-Alarcon
6
6
  Author-email: dma@dayhofflabs.com
@@ -13,8 +13,8 @@ dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqq
13
13
  dayhoff_tools/deployment/job_runner.py,sha256=hljvFpH2Bw96uYyUup5Ths72PZRL_X27KxlYzBMgguo,5086
14
14
  dayhoff_tools/deployment/processors.py,sha256=A7zvF47TfCkuLTCvaqZmk1M9ZgZcv6CAoXZCV6rEXuE,34599
15
15
  dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
16
- dayhoff_tools/embedders.py,sha256=svP_ksm3FdyVZ8i8R9R5uoGu2qI_hVQ_eztG0drXkN8,36477
17
- dayhoff_tools/fasta.py,sha256=Ls6AG84IgG8COgAefqB3KS6iMbnixP_Up5EwUur-VUs,49780
16
+ dayhoff_tools/embedders.py,sha256=yO13jwVSqd-gk-PUhBE_7lnlRUIHn2tirxZ_mFAVAgM,36557
17
+ dayhoff_tools/fasta.py,sha256=UARjl3w4O6hlZSgzuZZkG-Mi89TLxvxUDL-WEt0-5OU,50210
18
18
  dayhoff_tools/file_ops.py,sha256=JlGowvr-CUJFidV-4g_JmhUTN9bsYuaxtqKmnKomm-Q,8506
19
19
  dayhoff_tools/h5.py,sha256=j1nxxaiHsMidVX_XwB33P1Pz9d7K8ZKiDZwJWQUUQSY,21158
20
20
  dayhoff_tools/intake/gcp.py,sha256=uCeEskhbEwJIYpN6ne6siT1dbpTizCjjel-hRe0kReE,3030
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
26
26
  dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
27
27
  dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
28
28
  dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
29
- dayhoff_tools-1.1.36.dist-info/METADATA,sha256=71FGAv8K1KGE-Q7MN3W2bfZYDHCYUb7GGnsLkwPLQPw,2843
30
- dayhoff_tools-1.1.36.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
31
- dayhoff_tools-1.1.36.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
32
- dayhoff_tools-1.1.36.dist-info/RECORD,,
29
+ dayhoff_tools-1.1.37.dist-info/METADATA,sha256=wN6qyLCZ5vAQaZdS-8KNuzFnlHIWSz1moDJSugGyVjY,2843
30
+ dayhoff_tools-1.1.37.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
31
+ dayhoff_tools-1.1.37.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
32
+ dayhoff_tools-1.1.37.dist-info/RECORD,,