dayhoff-tools 1.1.38.tar.gz → 1.1.40.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/PKG-INFO +1 -1
  2. dayhoff_tools-1.1.40/dayhoff_tools/deployment/processors.py +402 -0
  3. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/pyproject.toml +1 -1
  4. dayhoff_tools-1.1.38/dayhoff_tools/deployment/processors.py +0 -824
  5. dayhoff_tools-1.1.38/dayhoff_tools/embedders.py +0 -892
  6. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/README.md +0 -0
  7. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/__init__.py +0 -0
  8. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/chemistry/standardizer.py +0 -0
  9. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/chemistry/utils.py +0 -0
  10. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/cli/__init__.py +0 -0
  11. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/cli/cloud_commands.py +0 -0
  12. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/cli/main.py +0 -0
  13. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/cli/swarm_commands.py +0 -0
  14. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/cli/utility_commands.py +0 -0
  15. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/deployment/base.py +0 -0
  16. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/deployment/deploy_aws.py +0 -0
  17. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
  18. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/deployment/deploy_utils.py +0 -0
  19. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/deployment/job_runner.py +0 -0
  20. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/deployment/swarm.py +0 -0
  21. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/fasta.py +0 -0
  22. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/file_ops.py +0 -0
  23. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/h5.py +0 -0
  24. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/intake/gcp.py +0 -0
  25. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/intake/gtdb.py +0 -0
  26. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/intake/kegg.py +0 -0
  27. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/intake/mmseqs.py +0 -0
  28. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/intake/structure.py +0 -0
  29. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/intake/uniprot.py +0 -0
  30. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/logs.py +0 -0
  31. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/sqlite.py +0 -0
  32. {dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/dayhoff_tools/warehouse.py +0 -0
{dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: dayhoff-tools
- Version: 1.1.38
+ Version: 1.1.40
  Summary: Common tools for all the repos at Dayhoff Labs
  Author: Daniel Martin-Alarcon
  Author-email: dma@dayhofflabs.com
dayhoff_tools-1.1.40/dayhoff_tools/deployment/processors.py
@@ -0,0 +1,402 @@
+ import csv
+ import json
+ import logging
+ import os
+ import shlex
+ import shutil
+ import subprocess
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+
+ logger = logging.getLogger(__name__)
+
+
+ class Processor(ABC):
+     """Processes data locally. Abstract class for specific calculations.
+     Takes in a single file and produces a single file or folder of outputs."""
+
+     @abstractmethod
+     def run(self, input_file: str) -> str:
+         """Do the calculation, including reading from input_file
+         and writing to output_file."""
+         output_path = "output_file"
+
+         return output_path
+
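For orientation, a concrete Processor only needs to implement run(). The sketch below is illustrative rather than part of the package (CopyProcessor does not exist in dayhoff-tools); it assumes the base class is importable from dayhoff_tools.deployment.processors:

import shutil

from dayhoff_tools.deployment.processors import Processor


class CopyProcessor(Processor):
    """Hypothetical subclass: 'processes' a file by copying it."""

    def run(self, input_file: str) -> str:
        # Satisfy the ABC contract: read input_file, produce one
        # output artifact, and return its path.
        output_path = f"{input_file}.out"
        shutil.copyfile(input_file, output_path)
        return output_path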
+ class InterProScanProcessor(Processor):
+     """Processes a single FASTA file using InterProScan and extracts target domains.
+
+     This processor handles the analysis of protein sequences using InterProScan,
+     and extracts specific domains based on their InterPro accession IDs.
+     It maps sequence identifiers correctly using MD5 hashes from the TSV output
+     to handle differences in sequence ID representation between input FASTA and
+     InterProScan JSON output.
+     """
+
+     def __init__(
+         self,
+         interproscan_install_dir: str,  # Path to the InterProScan installation
+         interproscan_temp_dir_mount: str,  # Path to temporary directory for InterProScan
+         num_threads: int,  # Number of CPU threads for InterProScan to use
+         output_formats: list[str],  # List of desired output formats (e.g., ["JSON", "TSV"])
+         target_iprs: set[str],  # Set of InterPro IDs to extract domains for
+         other_interproscan_options: str | None = None,  # Additional command-line options
+     ):
+         """Initialize the InterProScanProcessor.
+
+         Args:
+             interproscan_install_dir: Path to the InterProScan installation directory.
+             interproscan_temp_dir_mount: Path to the temporary directory for InterProScan.
+             num_threads: Number of CPU threads for InterProScan to use.
+             output_formats: List of desired output formats (e.g., ["JSON", "TSV"]).
+             target_iprs: A set of InterPro accession IDs to extract domain sequences for.
+             other_interproscan_options: Additional command-line options for interproscan.sh.
+         """
+         self.interproscan_sh_path = Path(interproscan_install_dir) / "interproscan.sh"
+         if not self.interproscan_sh_path.is_file():
+             raise FileNotFoundError(
+                 f"interproscan.sh not found at {self.interproscan_sh_path}"
+             )
+
+         self.interproscan_temp_dir_mount = Path(interproscan_temp_dir_mount)
+         # Ensure the temp directory exists
+         self.interproscan_temp_dir_mount.mkdir(parents=True, exist_ok=True)
+
+         self.num_threads = num_threads
+         self.output_formats = output_formats
+
+         # Ensure both JSON and TSV formats are included for domain extraction
+         if "JSON" not in self.output_formats:
+             self.output_formats.append("JSON")
+         if "TSV" not in self.output_formats:
+             self.output_formats.append("TSV")
+
+         self.target_iprs = target_iprs
+         self.other_options = (
+             other_interproscan_options if other_interproscan_options else ""
+         )
+
+         logger.info(
+             f"InterProScanProcessor initialized with script: {self.interproscan_sh_path}"
+         )
+         logger.info(
+             f"Temp dir mount for InterProScan: {self.interproscan_temp_dir_mount}"
+         )
+         logger.info(f"Target IPRs: {self.target_iprs}")
+
+     def run(self, input_file: str) -> str:
+         """Run InterProScan on the input FASTA file and extract domain sequences.
+
+         This method processes a FASTA file through InterProScan, extracts domains
+         of interest based on the target_iprs set, and writes the extracted domains
+         to a separate FASTA file. Domain sequences are correctly mapped using MD5 hashes
+         from the TSV output to handle differences in sequence ID representation.
+
+         Args:
+             input_file: Path to the input FASTA file.
+
+         Returns:
+             Path to the output directory containing extracted domain sequences and raw results.
+         """
+         from Bio import SeqIO
+         from Bio.Seq import Seq
+
+         input_file_path = Path(input_file).resolve()
+         input_file_stem = input_file_path.stem
+
+         # Create output directory structure
+         chunk_output_dir = Path(f"results_{input_file_stem}").resolve()
+         chunk_output_dir.mkdir(parents=True, exist_ok=True)
+
+         raw_ipr_output_dir = chunk_output_dir / "raw_ipr_output"
+         raw_ipr_output_dir.mkdir(parents=True, exist_ok=True)
+
+         # --- Clean input FASTA file to remove stop codons ---
+         cleaned_input_file_path = (
+             raw_ipr_output_dir / f"{input_file_stem}_cleaned.fasta"
+         )
+         logger.info(
+             f"Cleaning input FASTA file: {input_file_path} to remove '*' characters."
+         )
+         cleaned_records = []
+         has_asterisks = False
+
+         for record in SeqIO.parse(input_file_path, "fasta"):
+             original_seq_str = str(record.seq)
+             if "*" in original_seq_str:
+                 has_asterisks = True
+                 cleaned_seq_str = original_seq_str.replace("*", "")
+                 record.seq = Seq(cleaned_seq_str)
+                 logger.debug(f"Removed '*' from sequence {record.id}")
+             cleaned_records.append(record)
+
+         if has_asterisks:
+             SeqIO.write(cleaned_records, cleaned_input_file_path, "fasta")
+             logger.info(f"Cleaned FASTA written to {cleaned_input_file_path}")
+             ipr_input_file_to_use = cleaned_input_file_path
+         else:
+             logger.info(
+                 f"No '*' characters found in {input_file_path}. Using original."
+             )
+             ipr_input_file_to_use = input_file_path
+         # --- End of cleaning ---
+
+         # Set up InterProScan output base path
+         ipr_output_base = raw_ipr_output_dir / input_file_stem
+
+         # Build the InterProScan command
+         cmd = [
+             str(self.interproscan_sh_path),
+             "-i",
+             str(ipr_input_file_to_use),
+             "-b",
+             str(ipr_output_base),
+             "-f",
+             ",".join(self.output_formats),
+             "--cpu",
+             str(self.num_threads),
+             "--tempdir",
+             str(self.interproscan_temp_dir_mount),
+             "--disable-precalc",
+         ]
+
+         # Add additional options if provided
+         if self.other_options:
+             cmd.extend(self.other_options.split())
+
+         # Run InterProScan
+         logger.info(f"Running InterProScan command: {' '.join(cmd)}")
+         try:
+             process = subprocess.run(cmd, check=True, capture_output=True, text=True)
+             logger.info(f"InterProScan STDOUT: {process.stdout}")
+             if process.stderr:
+                 logger.info(f"InterProScan STDERR: {process.stderr}")
+         except subprocess.CalledProcessError as e:
+             logger.error(f"InterProScan failed for {input_file_path}")
+             logger.error(f"Return code: {e.returncode}")
+             logger.error(f"STDOUT: {e.stdout}")
+             logger.error(f"STDERR: {e.stderr}")
+             # Create a failure marker file
+             Path(chunk_output_dir / "INTERPROSCAN_FAILED.txt").touch()
+             return str(chunk_output_dir)
+
+         # Define paths for output files
+         extracted_domains_fasta_path = (
+             chunk_output_dir / f"{input_file_stem}_extracted_domains.fasta"
+         )
+         json_output_path = ipr_output_base.with_suffix(".json")
+         tsv_output_path = ipr_output_base.with_suffix(".tsv")
+
+         # Check for required output formats
+         if "JSON" not in self.output_formats or not json_output_path.is_file():
+             logger.warning(
+                 f"JSON output format not requested or file not found: {json_output_path}. Cannot extract domains."
+             )
+             return str(chunk_output_dir)
+
+         if "TSV" not in self.output_formats or not tsv_output_path.is_file():
+             logger.warning(
+                 f"TSV output format not found: {tsv_output_path}. This is needed to map sequence IDs."
+             )
+             return str(chunk_output_dir)
+
+         # Extract domains using the JSON and TSV outputs
+         try:
+             # Create MD5 to sequence ID mapping from TSV
+             md5_to_id = {}
+             with open(tsv_output_path, "r") as f:
+                 for line in f:
+                     parts = line.strip().split("\t")
+                     if len(parts) >= 3:  # Ensure there are enough columns
+                         seq_id = parts[0]
+                         md5 = parts[1]
+                         md5_to_id[md5] = seq_id
+
+             logger.debug(f"Created MD5 to ID mapping with {len(md5_to_id)} entries")
+
+             # Load protein sequences for coordinate mapping
+             protein_sequences = SeqIO.to_dict(
+                 SeqIO.parse(ipr_input_file_to_use, "fasta")
+             )
+
+             # Process JSON for domain extraction
+             extracted_count = 0
+             with (
+                 open(extracted_domains_fasta_path, "w") as f_out,
+                 open(json_output_path, "r") as f_json,
+             ):
+                 data = json.load(f_json)
+                 if "results" not in data:
+                     logger.info(f"No 'results' key in JSON output {json_output_path}")
+                     return str(chunk_output_dir)
+
+                 for result in data.get("results", []):
+                     # Map sequence via MD5 hash
+                     md5 = result.get("md5")
+                     if not md5 or md5 not in md5_to_id:
+                         logger.debug(f"MD5 hash not found in mapping: {md5}")
+                         continue
+
+                     protein_acc = md5_to_id[md5]
+                     if protein_acc not in protein_sequences:
+                         logger.debug(f"Sequence ID not found in FASTA: {protein_acc}")
+                         continue
+
+                     original_seq_record = protein_sequences[protein_acc]
+                     for match in result.get("matches", []):
+                         # Extract the InterPro domain entry
+                         signature = match.get("signature", {})
+                         entry = signature.get("entry")
+                         if not entry or entry.get("accession") not in self.target_iprs:
+                             continue
+
+                         ipr_id = entry.get("accession")
+                         ipr_desc = entry.get("description", "N/A").replace(" ", "_")
+                         logger.info(
+                             f"Found target domain {ipr_id} ({ipr_desc}) in sequence {protein_acc}"
+                         )
+
+                         for location in match.get("locations", []):
+                             start = location.get("start")
+                             end = location.get("end")
+                             if start is not None and end is not None:
+                                 domain_seq_str = str(
+                                     original_seq_record.seq[start - 1 : end]
+                                 )
+                                 domain_fasta_header = f">{original_seq_record.id}|{ipr_id}|{start}-{end}|{ipr_desc}"
+                                 f_out.write(f"{domain_fasta_header}\n")
+                                 f_out.write(f"{domain_seq_str}\n")
+                                 extracted_count += 1
+                                 logger.debug(
+                                     f"Extracted domain {ipr_id} ({start}-{end}) from {protein_acc}"
+                                 )
+
+             logger.info(
+                 f"Extracted {extracted_count} domain sequences to {extracted_domains_fasta_path}"
+             )
+
+         except FileNotFoundError:
+             logger.error(
+                 f"Input FASTA file {ipr_input_file_to_use} not found during domain extraction."
+             )
+         except json.JSONDecodeError:
+             logger.error(f"Error decoding JSON from {json_output_path}.")
+         except Exception as e:
+             logger.error(f"Error during domain extraction: {e}", exc_info=True)
+
+         # Clean up if the input file was a temporary one
+         if has_asterisks and cleaned_input_file_path != input_file_path:
+             if cleaned_input_file_path.exists():
+                 cleaned_input_file_path.unlink()
+
+         return str(chunk_output_dir)
+
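A usage sketch for the class above; every path, thread count, and IPR accession below is a placeholder, and a local InterProScan install plus Biopython (imported inside run()) are assumed. The MD5-based mapping works because the standard InterProScan TSV output puts the protein accession in column 1 and the sequence MD5 digest in column 2, which is exactly what run() reads back. Note also that InterProScan domain locations are 1-based and inclusive, hence the seq[start - 1 : end] slice.

from dayhoff_tools.deployment.processors import InterProScanProcessor

processor = InterProScanProcessor(
    interproscan_install_dir="/opt/interproscan",  # placeholder path
    interproscan_temp_dir_mount="/tmp/ipr_temp",  # placeholder path
    num_threads=8,
    output_formats=["JSON", "TSV"],  # __init__ enforces both in any case
    target_iprs={"IPR000001"},  # placeholder accession
)
results_dir = processor.run("chunk_001.fasta")
# results_dir contains chunk_001_extracted_domains.fasta plus raw_ipr_output/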
+ class BoltzPredictor(Processor):
+     """Processor for running Boltz docking predictions.
+
+     This class wraps the Boltz docking tool to predict protein structures
+     from sequence data.
+     """
+
+     def __init__(self, num_workers: int, boltz_options: str | None = None):
+         """Initialize the BoltzPredictor.
+
+         Args:
+             num_workers: Number of worker threads to use as a default.
+                 This can be overridden if --num_workers is present
+                 in boltz_options.
+             boltz_options: A string containing additional command-line options
+                 to pass to the Boltz predictor. Options should be
+                 space-separated (e.g., "--option1 value1 --option2").
+         """
+         self.num_workers = num_workers
+         self.boltz_options = boltz_options
+
+     def run(self, input_file: str) -> str:
+         """Run Boltz prediction on the input file.
+
+         Constructs the command using the input file, default number of workers,
+         and any additional options provided via `boltz_options`. If `--num_workers`
+         is specified in `boltz_options`, it overrides the default `num_workers`.
+
+         Args:
+             input_file: Path to the input file containing sequences
+
+         Returns:
+             Path to the output directory created by Boltz
+
+         Raises:
+             subprocess.CalledProcessError: If Boltz prediction fails
+         """
+         # Determine expected output directory name
+         input_base = os.path.splitext(os.path.basename(input_file))[0]
+         expected_output_dir = f"boltz_results_{input_base}"
+         logger.info(f"Expected output directory: {expected_output_dir}")
+
+         # Start building the command
+         cmd = ["boltz", "predict", input_file]
+
+         # Parse additional options if provided
+         additional_args = []
+         num_workers_in_opts = False
+         if self.boltz_options:
+             try:
+                 parsed_opts = shlex.split(self.boltz_options)
+                 additional_args.extend(parsed_opts)
+                 if "--num_workers" in parsed_opts:
+                     num_workers_in_opts = True
+                     logger.info(
+                         f"Using --num_workers from BOLTZ_OPTIONS: {self.boltz_options}"
+                     )
+             except ValueError as e:
+                 logger.error(f"Error parsing BOLTZ_OPTIONS '{self.boltz_options}': {e}")
+                 # Decide if we should raise an error or proceed without options
+                 # For now, proceed without the additional options
+                 additional_args = []  # Clear potentially partially parsed args
+
+         # Add num_workers if not specified in options
+         if not num_workers_in_opts:
+             logger.info(f"Using default num_workers: {self.num_workers}")
+             cmd.extend(["--num_workers", str(self.num_workers)])
+
+         # Add the parsed additional arguments
+         cmd.extend(additional_args)
+
+         # Log the final command
+         # Use shlex.join for safer command logging, especially if paths/args have spaces
+         try:
+             safe_cmd_str = shlex.join(cmd)
+             logger.info(f"Running command: {safe_cmd_str}")
+         except AttributeError:  # shlex.join is Python 3.8+
+             logger.info(f"Running command: {' '.join(cmd)}")
+
+         # Stream output in real-time
+         process = subprocess.Popen(
+             cmd,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.STDOUT,
+             text=True,
+             bufsize=1,
+         )
+
+         stdout = process.stdout
+         if stdout:
+             for line in iter(stdout.readline, ""):
+                 logger.info(f"BOLTZ: {line.rstrip()}")
+
+         # Wait for process to complete
+         return_code = process.wait()
+         if return_code != 0:
+             logger.error(f"Boltz prediction failed with exit code {return_code}")
+             raise subprocess.CalledProcessError(return_code, cmd)
+
+         logger.info(
+             f"Boltz prediction completed successfully. Output in {expected_output_dir}"
+         )
+         return expected_output_dir
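A matching sketch for BoltzPredictor; the input filename is a placeholder, and the boltz CLI is assumed to be installed and on PATH. The --num_workers flag is the one the class itself manages:

from dayhoff_tools.deployment.processors import BoltzPredictor

predictor = BoltzPredictor(num_workers=4)
output_dir = predictor.run("protein.yaml")  # placeholder input file
# output_dir == "boltz_results_protein"

# A --num_workers inside boltz_options takes precedence over the default:
predictor = BoltzPredictor(num_workers=4, boltz_options="--num_workers 8")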
{dayhoff_tools-1.1.38 → dayhoff_tools-1.1.40}/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 
  [project]
  name = "dayhoff-tools"
- version = "1.1.38"
+ version = "1.1.40"
  description = "Common tools for all the repos at Dayhoff Labs"
  authors = [
      {name = "Daniel Martin-Alarcon", email = "dma@dayhofflabs.com"}