dayhoff-tools 1.1.29__py3-none-any.whl → 1.1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools/deployment/job_runner.py +9 -4
- dayhoff_tools/deployment/processors.py +277 -3
- {dayhoff_tools-1.1.29.dist-info → dayhoff_tools-1.1.31.dist-info}/METADATA +2 -1
- {dayhoff_tools-1.1.29.dist-info → dayhoff_tools-1.1.31.dist-info}/RECORD +6 -6
- {dayhoff_tools-1.1.29.dist-info → dayhoff_tools-1.1.31.dist-info}/WHEEL +0 -0
- {dayhoff_tools-1.1.29.dist-info → dayhoff_tools-1.1.31.dist-info}/entry_points.txt +0 -0
**dayhoff_tools/deployment/job_runner.py**

Note: some removed lines below are truncated in the registry's diff view and are reproduced as shown.

```diff
@@ -94,9 +94,9 @@ def run_command() -> None:
             stderr=None,  # Use parent's stderr
         )
 
-        logger.info("
+        logger.info("Job command completed successfully")
     except subprocess.CalledProcessError as e:
-        logger.error(f"
+        logger.error(f"Job command failed with return code: {e.returncode}")
         raise
     except Exception as e:
         logger.error(f"Error executing command: {str(e)}")
```
```diff
@@ -133,6 +133,13 @@ def run_job(
     logger = logging.getLogger(__name__)
 
     logger.info(f"Job runner starting in mode: {mode}")
+    import importlib.metadata
+
+    try:
+        version = importlib.metadata.version("dayhoff-tools")
+        logger.info(f"dayhoff-tools version: {version}")
+    except importlib.metadata.PackageNotFoundError:
+        logger.warning("Could not determine dayhoff-tools version")
 
     if mode not in ["setup", "execute", "setup_and_execute"]:
         logger.error(f"Invalid mode: {mode}")
```
```diff
@@ -146,8 +153,6 @@ def run_job(
         if mode in ["execute", "setup_and_execute"]:
             run_command()
 
-        logger.info("Job completed successfully")
-
     except Exception as e:
         logger.error(f"Job failed with error: {str(e)}", exc_info=True)
         sys.exit(1)
```
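The second hunk above adds version logging at job startup. As a minimal standalone sketch of the same pattern (the `basicConfig` call here is illustrative setup, not part of the diff):

```python
import importlib.metadata
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    # version() reads the installed distribution's metadata, so it reports
    # the wheel actually installed in the environment.
    version = importlib.metadata.version("dayhoff-tools")
    logger.info(f"dayhoff-tools version: {version}")
except importlib.metadata.PackageNotFoundError:
    # Raised when the distribution is not installed (e.g., a bare source checkout).
    logger.warning("Could not determine dayhoff-tools version")
```

Logging the version this way makes each job's container logs self-identifying about which wheel they actually ran with.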
**dayhoff_tools/deployment/processors.py**

```diff
@@ -1,4 +1,5 @@
 import csv
+import json
 import logging
 import os
 import shlex
```
```diff
@@ -7,6 +8,9 @@ import subprocess
 from abc import ABC, abstractmethod
 from pathlib import Path
 
+from Bio import SeqIO
+from Bio.Seq import Seq
+
 logger = logging.getLogger(__name__)
 
 
```
```diff
@@ -23,6 +27,276 @@ class Processor(ABC):
         return output_path
 
 
+class InterProScanProcessor(Processor):
+    """Processes a single FASTA file using InterProScan and extracts target domains.
+
+    This processor handles the analysis of protein sequences using InterProScan,
+    and extracts specific domains based on their InterPro accession IDs.
+    It maps sequence identifiers correctly using MD5 hashes from the TSV output
+    to handle differences in sequence ID representation between input FASTA and
+    InterProScan JSON output.
+    """
+
+    def __init__(
+        self,
+        interproscan_install_dir: str,  # Path to the InterProScan installation
+        interproscan_temp_dir_mount: str,  # Path to temporary directory for InterProScan
+        num_threads: int,  # Number of CPU threads for InterProScan to use
+        output_formats: list[
+            str
+        ],  # List of desired output formats (e.g., ["JSON", "TSV"])
+        target_iprs: set[str],  # Set of InterPro IDs to extract domains for
+        other_interproscan_options: (
+            str | None
+        ) = None,  # Additional command-line options
+    ):
+        """Initialize the InterProScanProcessor.
+
+        Args:
+            interproscan_install_dir: Path to the InterProScan installation directory.
+            interproscan_temp_dir_mount: Path to the temporary directory for InterProScan.
+            num_threads: Number of CPU threads for InterProScan to use.
+            output_formats: List of desired output formats (e.g., ["JSON", "TSV"]).
+            target_iprs: A set of InterPro accession IDs to extract domain sequences for.
+            other_interproscan_options: Additional command-line options for interproscan.sh.
+        """
+        self.interproscan_sh_path = Path(interproscan_install_dir) / "interproscan.sh"
+        if not self.interproscan_sh_path.is_file():
+            raise FileNotFoundError(
+                f"interproscan.sh not found at {self.interproscan_sh_path}"
+            )
+
+        self.interproscan_temp_dir_mount = Path(interproscan_temp_dir_mount)
+        # Ensure the temp directory exists
+        self.interproscan_temp_dir_mount.mkdir(parents=True, exist_ok=True)
+
+        self.num_threads = num_threads
+        self.output_formats = output_formats
+
+        # Ensure both JSON and TSV formats are included for domain extraction
+        if "JSON" not in self.output_formats:
+            self.output_formats.append("JSON")
+        if "TSV" not in self.output_formats:
+            self.output_formats.append("TSV")
+
+        self.target_iprs = target_iprs
+        self.other_options = (
+            other_interproscan_options if other_interproscan_options else ""
+        )
+
+        logger.info(
+            f"InterProScanProcessor initialized with script: {self.interproscan_sh_path}"
+        )
+        logger.info(
+            f"Temp dir mount for InterProScan: {self.interproscan_temp_dir_mount}"
+        )
+        logger.info(f"Target IPRs: {self.target_iprs}")
+
+    def run(self, input_file: str) -> str:
+        """Run InterProScan on the input FASTA file and extract domain sequences.
+
+        This method processes a FASTA file through InterProScan, extracts domains
+        of interest based on the target_iprs list, and writes the extracted domains
+        to a separate FASTA file. Domain sequences are correctly mapped using MD5 hashes
+        from the TSV output to handle differences in sequence ID representation.
+
+        Args:
+            input_file: Path to the input FASTA file.
+
+        Returns:
+            Path to the output directory containing extracted domain sequences and raw results.
+        """
+        input_file_path = Path(input_file).resolve()
+        input_file_stem = input_file_path.stem
+
+        # Create output directory structure
+        chunk_output_dir = Path(f"results_{input_file_stem}").resolve()
+        chunk_output_dir.mkdir(parents=True, exist_ok=True)
+
+        raw_ipr_output_dir = chunk_output_dir / "raw_ipr_output"
+        raw_ipr_output_dir.mkdir(parents=True, exist_ok=True)
+
+        # --- Clean input FASTA file to remove stop codons ---
+        cleaned_input_file_path = (
+            raw_ipr_output_dir / f"{input_file_stem}_cleaned.fasta"
+        )
+        logger.info(
+            f"Cleaning input FASTA file: {input_file_path} to remove '*' characters."
+        )
+        cleaned_records = []
+        has_asterisks = False
+
+        for record in SeqIO.parse(input_file_path, "fasta"):
+            original_seq_str = str(record.seq)
+            if "*" in original_seq_str:
+                has_asterisks = True
+                cleaned_seq_str = original_seq_str.replace("*", "")
+                record.seq = Seq(cleaned_seq_str)
+                logger.debug(f"Removed '*' from sequence {record.id}")
+            cleaned_records.append(record)
+
+        if has_asterisks:
+            SeqIO.write(cleaned_records, cleaned_input_file_path, "fasta")
+            logger.info(f"Cleaned FASTA written to {cleaned_input_file_path}")
+            ipr_input_file_to_use = cleaned_input_file_path
+        else:
+            logger.info(
+                f"No '*' characters found in {input_file_path}. Using original."
+            )
+            ipr_input_file_to_use = input_file_path
+        # --- End of cleaning ---
+
+        # Set up InterProScan output base path
+        ipr_output_base = raw_ipr_output_dir / input_file_stem
+
+        # Build the InterProScan command
+        cmd = [
+            str(self.interproscan_sh_path),
+            "-i",
+            str(ipr_input_file_to_use),
+            "-b",
+            str(ipr_output_base),
+            "-f",
+            ",".join(self.output_formats),
+            "--cpu",
+            str(self.num_threads),
+            "--tempdir",
+            str(self.interproscan_temp_dir_mount),
+            "--disable-precalc",
+        ]
+
+        # Add additional options if provided
+        if self.other_options:
+            cmd.extend(self.other_options.split())
+
+        # Run InterProScan
+        logger.info(f"Running InterProScan command: {' '.join(cmd)}")
+        try:
+            process = subprocess.run(cmd, check=True, capture_output=True, text=True)
+            logger.info(f"InterProScan STDOUT: {process.stdout}")
+            if process.stderr:
+                logger.info(f"InterProScan STDERR: {process.stderr}")
+        except subprocess.CalledProcessError as e:
+            logger.error(f"InterProScan failed for {input_file_path}")
+            logger.error(f"Return code: {e.returncode}")
+            logger.error(f"STDOUT: {e.stdout}")
+            logger.error(f"STDERR: {e.stderr}")
+            # Create a failure marker file
+            Path(chunk_output_dir / "INTERPROSCAN_FAILED.txt").touch()
+            return str(chunk_output_dir)
+
+        # Define paths for output files
+        extracted_domains_fasta_path = (
+            chunk_output_dir / f"{input_file_stem}_extracted_domains.fasta"
+        )
+        json_output_path = ipr_output_base.with_suffix(".json")
+        tsv_output_path = ipr_output_base.with_suffix(".tsv")
+
+        # Check for required output formats
+        if "JSON" not in self.output_formats or not json_output_path.is_file():
+            logger.warning(
+                f"JSON output format not requested or file not found: {json_output_path}. Cannot extract domains."
+            )
+            return str(chunk_output_dir)
+
+        if "TSV" not in self.output_formats or not tsv_output_path.is_file():
+            logger.warning(
+                f"TSV output format not found: {tsv_output_path}. This is needed to map sequence IDs."
+            )
+            return str(chunk_output_dir)
+
+        # Extract domains using the JSON and TSV outputs
+        try:
+            # Create MD5 to sequence ID mapping from TSV
+            md5_to_id = {}
+            with open(tsv_output_path, "r") as f:
+                for line in f:
+                    parts = line.strip().split("\t")
+                    if len(parts) >= 3:  # Ensure there are enough columns
+                        seq_id = parts[0]
+                        md5 = parts[1]
+                        md5_to_id[md5] = seq_id
+
+            logger.debug(f"Created MD5 to ID mapping with {len(md5_to_id)} entries")
+
+            # Load protein sequences for coordinate mapping
+            protein_sequences = SeqIO.to_dict(
+                SeqIO.parse(ipr_input_file_to_use, "fasta")
+            )
+
+            # Process JSON for domain extraction
+            extracted_count = 0
+            with (
+                open(extracted_domains_fasta_path, "w") as f_out,
+                open(json_output_path, "r") as f_json,
+            ):
+                data = json.load(f_json)
+                if "results" not in data:
+                    logger.info(f"No 'results' key in JSON output {json_output_path}")
+                    return str(chunk_output_dir)
+
+                for result in data.get("results", []):
+                    # Map sequence via MD5 hash
+                    md5 = result.get("md5")
+                    if not md5 or md5 not in md5_to_id:
+                        logger.debug(f"MD5 hash not found in mapping: {md5}")
+                        continue
+
+                    protein_acc = md5_to_id[md5]
+                    if protein_acc not in protein_sequences:
+                        logger.debug(f"Sequence ID not found in FASTA: {protein_acc}")
+                        continue
+
+                    original_seq_record = protein_sequences[protein_acc]
+                    for match in result.get("matches", []):
+                        # Extract the InterPro domain entry
+                        signature = match.get("signature", {})
+                        entry = signature.get("entry")
+                        if not entry or entry.get("accession") not in self.target_iprs:
+                            continue
+
+                        ipr_id = entry.get("accession")
+                        ipr_desc = entry.get("description", "N/A").replace(" ", "_")
+                        logger.info(
+                            f"Found target domain {ipr_id} ({ipr_desc}) in sequence {protein_acc}"
+                        )
+
+                        for location in match.get("locations", []):
+                            start = location.get("start")
+                            end = location.get("end")
+                            if start is not None and end is not None:
+                                domain_seq_str = str(
+                                    original_seq_record.seq[start - 1 : end]
+                                )
+                                domain_fasta_header = f">{original_seq_record.id}|{ipr_id}|{start}-{end}|{ipr_desc}"
+                                f_out.write(f"{domain_fasta_header}\n")
+                                f_out.write(f"{domain_seq_str}\n")
+                                extracted_count += 1
+                                logger.debug(
+                                    f"Extracted domain {ipr_id} ({start}-{end}) from {protein_acc}"
+                                )
+
+            logger.info(
+                f"Extracted {extracted_count} domain sequences to {extracted_domains_fasta_path}"
+            )
+
+        except FileNotFoundError:
+            logger.error(
+                f"Input FASTA file {ipr_input_file_to_use} not found during domain extraction."
+            )
+        except json.JSONDecodeError:
+            logger.error(f"Error decoding JSON from {json_output_path}.")
+        except Exception as e:
+            logger.error(f"Error during domain extraction: {e}", exc_info=True)
+
+        # Clean up if the input file was a temporary one
+        if has_asterisks and cleaned_input_file_path != input_file_path:
+            if cleaned_input_file_path.exists():
+                cleaned_input_file_path.unlink()
+
+        return str(chunk_output_dir)
+
+
 class BoltzPredictor(Processor):
     """Processor for running Boltz docking predictions.
 
```
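For orientation, a hypothetical use of the new `InterProScanProcessor` might look like the sketch below; the paths, thread count, and InterPro accession are placeholder values, not taken from this diff:

```python
from dayhoff_tools.deployment.processors import InterProScanProcessor

processor = InterProScanProcessor(
    interproscan_install_dir="/opt/interproscan",  # hypothetical install path
    interproscan_temp_dir_mount="/tmp/ipr_temp",  # hypothetical temp mount
    num_threads=8,
    output_formats=["JSON"],  # __init__ appends TSV (and JSON) if missing
    target_iprs={"IPR036291"},  # placeholder InterPro accession
)

# run() returns the per-chunk results directory, which holds
# <stem>_extracted_domains.fasta plus raw InterProScan output, or an
# INTERPROSCAN_FAILED.txt marker if the scan itself failed.
results_dir = processor.run("chunk_001.fasta")
```

The TSV output is required alongside the JSON because InterProScan's JSON identifies each sequence by MD5 hash, while the TSV pairs the original FASTA ID (column 1) with that hash (column 2); the processor joins the two to recover the caller's sequence IDs before slicing domain regions out of the input sequences.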
```diff
@@ -438,7 +712,7 @@ class MMSeqsProfileProcessor(Processor):
                     intermediate_results_as_csv_file, "w", newline=""
                 ) as csvfile:
                     writer = csv.writer(csvfile)
-                    writer.writerow(
+                    writer.writerow(csv_headers)
             else:
                 with (
                     open(intermediate_results_m8_file, "r") as m8file,
```
```diff
@@ -447,7 +721,7 @@ class MMSeqsProfileProcessor(Processor):
                     ) as csvfile,
                 ):
                     writer = csv.writer(csvfile)
-                    writer.writerow(
+                    writer.writerow(csv_headers)
                     for line in m8file:
                         writer.writerow(line.strip().split("\t"))
         except Exception as e:
```
```diff
@@ -458,7 +732,7 @@ class MMSeqsProfileProcessor(Processor):
                 intermediate_results_as_csv_file, "w", newline=""
             ) as csvfile:
                 writer = csv.writer(csvfile)
-                writer.writerow(
+                writer.writerow(csv_headers)  # write headers even on error
 
         # 9. Extract hit sequence IDs from M8 results for the TXT file
         hit_sequence_ids = set()
```
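The three hunks above replace a `writer.writerow(` call whose argument is cut off in this view with `writer.writerow(csv_headers)`, so the intermediate CSV always starts with a header row, whether hits were found, the M8 file was empty, or an error occurred. A reduced sketch of the M8-to-CSV conversion these blocks perform (the header names are illustrative; `csv_headers` is defined elsewhere in `MMSeqsProfileProcessor` and is not shown in this diff):

```python
import csv

# Hypothetical column names for MMseqs2 tabular (M8) output;
# the real csv_headers list lives elsewhere in the class.
csv_headers = ["query", "target", "pident", "alnlen", "evalue", "bits"]

with (
    open("results.m8", "r") as m8file,
    open("results.csv", "w", newline="") as csvfile,
):
    writer = csv.writer(csvfile)
    writer.writerow(csv_headers)  # header row first, even if no hits follow
    for line in m8file:
        writer.writerow(line.strip().split("\t"))
```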
**{dayhoff_tools-1.1.29.dist-info → dayhoff_tools-1.1.31.dist-info}/METADATA**

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: dayhoff-tools
-Version: 1.1.29
+Version: 1.1.31
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com
```
```diff
@@ -38,6 +38,7 @@ Requires-Dist: tqdm (>=4.67.1) ; extra == "full"
 Requires-Dist: transformers (==4.36.2) ; extra == "full"
 Requires-Dist: transformers (>=4.36.2) ; extra == "embedders"
 Requires-Dist: typer (>=0.9.0)
+Requires-Dist: tzdata (>=2025.2)
 Description-Content-Type: text/markdown
 
 # dayhoff-tools
```
**{dayhoff_tools-1.1.29.dist-info → dayhoff_tools-1.1.31.dist-info}/RECORD**

```diff
@@ -10,8 +10,8 @@ dayhoff_tools/deployment/base.py,sha256=8tXwsPYvRo-zV-aNhHw1c7Rji-KWg8S5xoCCznFn
 dayhoff_tools/deployment/deploy_aws.py,sha256=O0gQxHioSU_sNU8T8MD4wSOPvWc--V8eRRZzlRu035I,16446
 dayhoff_tools/deployment/deploy_gcp.py,sha256=DxBM4sUzwPK9RWLP9bSfr38n1HHl-TVrp4TsbdN8pUA,5795
 dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
-dayhoff_tools/deployment/job_runner.py,sha256=
-dayhoff_tools/deployment/processors.py,sha256=
+dayhoff_tools/deployment/job_runner.py,sha256=hljvFpH2Bw96uYyUup5Ths72PZRL_X27KxlYzBMgguo,5086
+dayhoff_tools/deployment/processors.py,sha256=q2io07xO6f6twEH1iLz9GFoGaKh76qC4kcv519Q2G7g,34583
 dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
 dayhoff_tools/embedders.py,sha256=svP_ksm3FdyVZ8i8R9R5uoGu2qI_hVQ_eztG0drXkN8,36477
 dayhoff_tools/fasta.py,sha256=Ls6AG84IgG8COgAefqB3KS6iMbnixP_Up5EwUur-VUs,49780
```
```diff
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
 dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
 dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
 dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
-dayhoff_tools-1.1.
-dayhoff_tools-1.1.
-dayhoff_tools-1.1.
-dayhoff_tools-1.1.
+dayhoff_tools-1.1.31.dist-info/METADATA,sha256=16xeYUw5Vk8m3cvEudAPbzXBv_CWnAq3Hn5ihSbpnIo,2843
+dayhoff_tools-1.1.31.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+dayhoff_tools-1.1.31.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
+dayhoff_tools-1.1.31.dist-info/RECORD,,
```

**{dayhoff_tools-1.1.29.dist-info → dayhoff_tools-1.1.31.dist-info}/WHEEL**: file without changes

**{dayhoff_tools-1.1.29.dist-info → dayhoff_tools-1.1.31.dist-info}/entry_points.txt**: file without changes