dayhoff-tools 1.1.29__py3-none-any.whl → 1.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -94,9 +94,9 @@ def run_command() -> None:
94
94
  stderr=None, # Use parent's stderr
95
95
  )
96
96
 
97
- logger.info("Command completed successfully")
97
+ logger.info("Job command completed successfully")
98
98
  except subprocess.CalledProcessError as e:
99
- logger.error(f"Command failed with return code: {e.returncode}")
99
+ logger.error(f"Job command failed with return code: {e.returncode}")
100
100
  raise
101
101
  except Exception as e:
102
102
  logger.error(f"Error executing command: {str(e)}")
@@ -133,6 +133,13 @@ def run_job(
133
133
  logger = logging.getLogger(__name__)
134
134
 
135
135
  logger.info(f"Job runner starting in mode: {mode}")
136
+ import importlib.metadata
137
+
138
+ try:
139
+ version = importlib.metadata.version("dayhoff-tools")
140
+ logger.info(f"dayhoff-tools version: {version}")
141
+ except importlib.metadata.PackageNotFoundError:
142
+ logger.warning("Could not determine dayhoff-tools version")
136
143
 
137
144
  if mode not in ["setup", "execute", "setup_and_execute"]:
138
145
  logger.error(f"Invalid mode: {mode}")
@@ -146,8 +153,6 @@ def run_job(
146
153
  if mode in ["execute", "setup_and_execute"]:
147
154
  run_command()
148
155
 
149
- logger.info("Job completed successfully")
150
-
151
156
  except Exception as e:
152
157
  logger.error(f"Job failed with error: {str(e)}", exc_info=True)
153
158
  sys.exit(1)
@@ -1,4 +1,5 @@
1
1
  import csv
2
+ import json
2
3
  import logging
3
4
  import os
4
5
  import shlex
@@ -7,6 +8,9 @@ import subprocess
7
8
  from abc import ABC, abstractmethod
8
9
  from pathlib import Path
9
10
 
11
+ from Bio import SeqIO
12
+ from Bio.Seq import Seq
13
+
10
14
  logger = logging.getLogger(__name__)
11
15
 
12
16
 
@@ -23,6 +27,276 @@ class Processor(ABC):
23
27
  return output_path
24
28
 
25
29
 
30
+ class InterProScanProcessor(Processor):
31
+ """Processes a single FASTA file using InterProScan and extracts target domains.
32
+
33
+ This processor handles the analysis of protein sequences using InterProScan,
34
+ and extracts specific domains based on their InterPro accession IDs.
35
+ It maps sequence identifiers correctly using MD5 hashes from the TSV output
36
+ to handle differences in sequence ID representation between input FASTA and
37
+ InterProScan JSON output.
38
+ """
39
+
40
+ def __init__(
41
+ self,
42
+ interproscan_install_dir: str, # Path to the InterProScan installation
43
+ interproscan_temp_dir_mount: str, # Path to temporary directory for InterProScan
44
+ num_threads: int, # Number of CPU threads for InterProScan to use
45
+ output_formats: list[
46
+ str
47
+ ], # List of desired output formats (e.g., ["JSON", "TSV"])
48
+ target_iprs: set[str], # Set of InterPro IDs to extract domains for
49
+ other_interproscan_options: (
50
+ str | None
51
+ ) = None, # Additional command-line options
52
+ ):
53
+ """Initialize the InterProScanProcessor.
54
+
55
+ Args:
56
+ interproscan_install_dir: Path to the InterProScan installation directory.
57
+ interproscan_temp_dir_mount: Path to the temporary directory for InterProScan.
58
+ num_threads: Number of CPU threads for InterProScan to use.
59
+ output_formats: List of desired output formats (e.g., ["JSON", "TSV"]).
60
+ target_iprs: A set of InterPro accession IDs to extract domain sequences for.
61
+ other_interproscan_options: Additional command-line options for interproscan.sh.
62
+ """
63
+ self.interproscan_sh_path = Path(interproscan_install_dir) / "interproscan.sh"
64
+ if not self.interproscan_sh_path.is_file():
65
+ raise FileNotFoundError(
66
+ f"interproscan.sh not found at {self.interproscan_sh_path}"
67
+ )
68
+
69
+ self.interproscan_temp_dir_mount = Path(interproscan_temp_dir_mount)
70
+ # Ensure the temp directory exists
71
+ self.interproscan_temp_dir_mount.mkdir(parents=True, exist_ok=True)
72
+
73
+ self.num_threads = num_threads
74
+ self.output_formats = output_formats
75
+
76
+ # Ensure both JSON and TSV formats are included for domain extraction
77
+ if "JSON" not in self.output_formats:
78
+ self.output_formats.append("JSON")
79
+ if "TSV" not in self.output_formats:
80
+ self.output_formats.append("TSV")
81
+
82
+ self.target_iprs = target_iprs
83
+ self.other_options = (
84
+ other_interproscan_options if other_interproscan_options else ""
85
+ )
86
+
87
+ logger.info(
88
+ f"InterProScanProcessor initialized with script: {self.interproscan_sh_path}"
89
+ )
90
+ logger.info(
91
+ f"Temp dir mount for InterProScan: {self.interproscan_temp_dir_mount}"
92
+ )
93
+ logger.info(f"Target IPRs: {self.target_iprs}")
94
+
95
+ def run(self, input_file: str) -> str:
96
+ """Run InterProScan on the input FASTA file and extract domain sequences.
97
+
98
+ This method processes a FASTA file through InterProScan, extracts domains
99
+ of interest based on the target_iprs list, and writes the extracted domains
100
+ to a separate FASTA file. Domain sequences are correctly mapped using MD5 hashes
101
+ from the TSV output to handle differences in sequence ID representation.
102
+
103
+ Args:
104
+ input_file: Path to the input FASTA file.
105
+
106
+ Returns:
107
+ Path to the output directory containing extracted domain sequences and raw results.
108
+ """
109
+ input_file_path = Path(input_file).resolve()
110
+ input_file_stem = input_file_path.stem
111
+
112
+ # Create output directory structure
113
+ chunk_output_dir = Path(f"results_{input_file_stem}").resolve()
114
+ chunk_output_dir.mkdir(parents=True, exist_ok=True)
115
+
116
+ raw_ipr_output_dir = chunk_output_dir / "raw_ipr_output"
117
+ raw_ipr_output_dir.mkdir(parents=True, exist_ok=True)
118
+
119
+ # --- Clean input FASTA file to remove stop codons ---
120
+ cleaned_input_file_path = (
121
+ raw_ipr_output_dir / f"{input_file_stem}_cleaned.fasta"
122
+ )
123
+ logger.info(
124
+ f"Cleaning input FASTA file: {input_file_path} to remove '*' characters."
125
+ )
126
+ cleaned_records = []
127
+ has_asterisks = False
128
+
129
+ for record in SeqIO.parse(input_file_path, "fasta"):
130
+ original_seq_str = str(record.seq)
131
+ if "*" in original_seq_str:
132
+ has_asterisks = True
133
+ cleaned_seq_str = original_seq_str.replace("*", "")
134
+ record.seq = Seq(cleaned_seq_str)
135
+ logger.debug(f"Removed '*' from sequence {record.id}")
136
+ cleaned_records.append(record)
137
+
138
+ if has_asterisks:
139
+ SeqIO.write(cleaned_records, cleaned_input_file_path, "fasta")
140
+ logger.info(f"Cleaned FASTA written to {cleaned_input_file_path}")
141
+ ipr_input_file_to_use = cleaned_input_file_path
142
+ else:
143
+ logger.info(
144
+ f"No '*' characters found in {input_file_path}. Using original."
145
+ )
146
+ ipr_input_file_to_use = input_file_path
147
+ # --- End of cleaning ---
148
+
149
+ # Set up InterProScan output base path
150
+ ipr_output_base = raw_ipr_output_dir / input_file_stem
151
+
152
+ # Build the InterProScan command
153
+ cmd = [
154
+ str(self.interproscan_sh_path),
155
+ "-i",
156
+ str(ipr_input_file_to_use),
157
+ "-b",
158
+ str(ipr_output_base),
159
+ "-f",
160
+ ",".join(self.output_formats),
161
+ "--cpu",
162
+ str(self.num_threads),
163
+ "--tempdir",
164
+ str(self.interproscan_temp_dir_mount),
165
+ "--disable-precalc",
166
+ ]
167
+
168
+ # Add additional options if provided
169
+ if self.other_options:
170
+ cmd.extend(self.other_options.split())
171
+
172
+ # Run InterProScan
173
+ logger.info(f"Running InterProScan command: {' '.join(cmd)}")
174
+ try:
175
+ process = subprocess.run(cmd, check=True, capture_output=True, text=True)
176
+ logger.info(f"InterProScan STDOUT: {process.stdout}")
177
+ if process.stderr:
178
+ logger.info(f"InterProScan STDERR: {process.stderr}")
179
+ except subprocess.CalledProcessError as e:
180
+ logger.error(f"InterProScan failed for {input_file_path}")
181
+ logger.error(f"Return code: {e.returncode}")
182
+ logger.error(f"STDOUT: {e.stdout}")
183
+ logger.error(f"STDERR: {e.stderr}")
184
+ # Create a failure marker file
185
+ Path(chunk_output_dir / "INTERPROSCAN_FAILED.txt").touch()
186
+ return str(chunk_output_dir)
187
+
188
+ # Define paths for output files
189
+ extracted_domains_fasta_path = (
190
+ chunk_output_dir / f"{input_file_stem}_extracted_domains.fasta"
191
+ )
192
+ json_output_path = ipr_output_base.with_suffix(".json")
193
+ tsv_output_path = ipr_output_base.with_suffix(".tsv")
194
+
195
+ # Check for required output formats
196
+ if "JSON" not in self.output_formats or not json_output_path.is_file():
197
+ logger.warning(
198
+ f"JSON output format not requested or file not found: {json_output_path}. Cannot extract domains."
199
+ )
200
+ return str(chunk_output_dir)
201
+
202
+ if "TSV" not in self.output_formats or not tsv_output_path.is_file():
203
+ logger.warning(
204
+ f"TSV output format not found: {tsv_output_path}. This is needed to map sequence IDs."
205
+ )
206
+ return str(chunk_output_dir)
207
+
208
+ # Extract domains using the JSON and TSV outputs
209
+ try:
210
+ # Create MD5 to sequence ID mapping from TSV
211
+ md5_to_id = {}
212
+ with open(tsv_output_path, "r") as f:
213
+ for line in f:
214
+ parts = line.strip().split("\t")
215
+ if len(parts) >= 3: # Ensure there are enough columns
216
+ seq_id = parts[0]
217
+ md5 = parts[1]
218
+ md5_to_id[md5] = seq_id
219
+
220
+ logger.debug(f"Created MD5 to ID mapping with {len(md5_to_id)} entries")
221
+
222
+ # Load protein sequences for coordinate mapping
223
+ protein_sequences = SeqIO.to_dict(
224
+ SeqIO.parse(ipr_input_file_to_use, "fasta")
225
+ )
226
+
227
+ # Process JSON for domain extraction
228
+ extracted_count = 0
229
+ with (
230
+ open(extracted_domains_fasta_path, "w") as f_out,
231
+ open(json_output_path, "r") as f_json,
232
+ ):
233
+ data = json.load(f_json)
234
+ if "results" not in data:
235
+ logger.info(f"No 'results' key in JSON output {json_output_path}")
236
+ return str(chunk_output_dir)
237
+
238
+ for result in data.get("results", []):
239
+ # Map sequence via MD5 hash
240
+ md5 = result.get("md5")
241
+ if not md5 or md5 not in md5_to_id:
242
+ logger.debug(f"MD5 hash not found in mapping: {md5}")
243
+ continue
244
+
245
+ protein_acc = md5_to_id[md5]
246
+ if protein_acc not in protein_sequences:
247
+ logger.debug(f"Sequence ID not found in FASTA: {protein_acc}")
248
+ continue
249
+
250
+ original_seq_record = protein_sequences[protein_acc]
251
+ for match in result.get("matches", []):
252
+ # Extract the InterPro domain entry
253
+ signature = match.get("signature", {})
254
+ entry = signature.get("entry")
255
+ if not entry or entry.get("accession") not in self.target_iprs:
256
+ continue
257
+
258
+ ipr_id = entry.get("accession")
259
+ ipr_desc = entry.get("description", "N/A").replace(" ", "_")
260
+ logger.info(
261
+ f"Found target domain {ipr_id} ({ipr_desc}) in sequence {protein_acc}"
262
+ )
263
+
264
+ for location in match.get("locations", []):
265
+ start = location.get("start")
266
+ end = location.get("end")
267
+ if start is not None and end is not None:
268
+ domain_seq_str = str(
269
+ original_seq_record.seq[start - 1 : end]
270
+ )
271
+ domain_fasta_header = f">{original_seq_record.id}|{ipr_id}|{start}-{end}|{ipr_desc}"
272
+ f_out.write(f"{domain_fasta_header}\n")
273
+ f_out.write(f"{domain_seq_str}\n")
274
+ extracted_count += 1
275
+ logger.debug(
276
+ f"Extracted domain {ipr_id} ({start}-{end}) from {protein_acc}"
277
+ )
278
+
279
+ logger.info(
280
+ f"Extracted {extracted_count} domain sequences to {extracted_domains_fasta_path}"
281
+ )
282
+
283
+ except FileNotFoundError:
284
+ logger.error(
285
+ f"Input FASTA file {ipr_input_file_to_use} not found during domain extraction."
286
+ )
287
+ except json.JSONDecodeError:
288
+ logger.error(f"Error decoding JSON from {json_output_path}.")
289
+ except Exception as e:
290
+ logger.error(f"Error during domain extraction: {e}", exc_info=True)
291
+
292
+ # Clean up if the input file was a temporary one
293
+ if has_asterisks and cleaned_input_file_path != input_file_path:
294
+ if cleaned_input_file_path.exists():
295
+ cleaned_input_file_path.unlink()
296
+
297
+ return str(chunk_output_dir)
298
+
299
+
26
300
  class BoltzPredictor(Processor):
27
301
  """Processor for running Boltz docking predictions.
28
302
 
@@ -438,7 +712,7 @@ class MMSeqsProfileProcessor(Processor):
438
712
  intermediate_results_as_csv_file, "w", newline=""
439
713
  ) as csvfile:
440
714
  writer = csv.writer(csvfile)
441
- writer.writerow(m8_columns)
715
+ writer.writerow(csv_headers)
442
716
  else:
443
717
  with (
444
718
  open(intermediate_results_m8_file, "r") as m8file,
@@ -447,7 +721,7 @@ class MMSeqsProfileProcessor(Processor):
447
721
  ) as csvfile,
448
722
  ):
449
723
  writer = csv.writer(csvfile)
450
- writer.writerow(m8_columns)
724
+ writer.writerow(csv_headers)
451
725
  for line in m8file:
452
726
  writer.writerow(line.strip().split("\t"))
453
727
  except Exception as e:
@@ -458,7 +732,7 @@ class MMSeqsProfileProcessor(Processor):
458
732
  intermediate_results_as_csv_file, "w", newline=""
459
733
  ) as csvfile:
460
734
  writer = csv.writer(csvfile)
461
- writer.writerow(m8_columns) # write headers even on error
735
+ writer.writerow(csv_headers) # write headers even on error
462
736
 
463
737
  # 9. Extract hit sequence IDs from M8 results for the TXT file
464
738
  hit_sequence_ids = set()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dayhoff-tools
3
- Version: 1.1.29
3
+ Version: 1.1.31
4
4
  Summary: Common tools for all the repos at Dayhoff Labs
5
5
  Author: Daniel Martin-Alarcon
6
6
  Author-email: dma@dayhofflabs.com
@@ -38,6 +38,7 @@ Requires-Dist: tqdm (>=4.67.1) ; extra == "full"
38
38
  Requires-Dist: transformers (==4.36.2) ; extra == "full"
39
39
  Requires-Dist: transformers (>=4.36.2) ; extra == "embedders"
40
40
  Requires-Dist: typer (>=0.9.0)
41
+ Requires-Dist: tzdata (>=2025.2)
41
42
  Description-Content-Type: text/markdown
42
43
 
43
44
  # dayhoff-tools
@@ -10,8 +10,8 @@ dayhoff_tools/deployment/base.py,sha256=8tXwsPYvRo-zV-aNhHw1c7Rji-KWg8S5xoCCznFn
10
10
  dayhoff_tools/deployment/deploy_aws.py,sha256=O0gQxHioSU_sNU8T8MD4wSOPvWc--V8eRRZzlRu035I,16446
11
11
  dayhoff_tools/deployment/deploy_gcp.py,sha256=DxBM4sUzwPK9RWLP9bSfr38n1HHl-TVrp4TsbdN8pUA,5795
12
12
  dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
13
- dayhoff_tools/deployment/job_runner.py,sha256=4tmdplpvqSE9bVxRWHo2U5kwkYrYod0Uwzpg2Q7qG5o,4850
14
- dayhoff_tools/deployment/processors.py,sha256=HFu3Ty7eebWLnyHQIjLuqjsfnqtnDbFZFH7z4eHDZIo,22730
13
+ dayhoff_tools/deployment/job_runner.py,sha256=hljvFpH2Bw96uYyUup5Ths72PZRL_X27KxlYzBMgguo,5086
14
+ dayhoff_tools/deployment/processors.py,sha256=q2io07xO6f6twEH1iLz9GFoGaKh76qC4kcv519Q2G7g,34583
15
15
  dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
16
16
  dayhoff_tools/embedders.py,sha256=svP_ksm3FdyVZ8i8R9R5uoGu2qI_hVQ_eztG0drXkN8,36477
17
17
  dayhoff_tools/fasta.py,sha256=Ls6AG84IgG8COgAefqB3KS6iMbnixP_Up5EwUur-VUs,49780
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
26
26
  dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
27
27
  dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
28
28
  dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
29
- dayhoff_tools-1.1.29.dist-info/METADATA,sha256=MvXJheXMvDNdVfreK1ZRPeupmBAv0D5LxP5qZilW0Fg,2810
30
- dayhoff_tools-1.1.29.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
31
- dayhoff_tools-1.1.29.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
32
- dayhoff_tools-1.1.29.dist-info/RECORD,,
29
+ dayhoff_tools-1.1.31.dist-info/METADATA,sha256=16xeYUw5Vk8m3cvEudAPbzXBv_CWnAq3Hn5ihSbpnIo,2843
30
+ dayhoff_tools-1.1.31.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
31
+ dayhoff_tools-1.1.31.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
32
+ dayhoff_tools-1.1.31.dist-info/RECORD,,