dayhoff-tools 1.1.10__py3-none-any.whl → 1.13.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools/__init__.py +10 -0
- dayhoff_tools/cli/cloud_commands.py +179 -43
- dayhoff_tools/cli/engine1/__init__.py +323 -0
- dayhoff_tools/cli/engine1/engine_core.py +703 -0
- dayhoff_tools/cli/engine1/engine_lifecycle.py +136 -0
- dayhoff_tools/cli/engine1/engine_maintenance.py +431 -0
- dayhoff_tools/cli/engine1/engine_management.py +505 -0
- dayhoff_tools/cli/engine1/shared.py +501 -0
- dayhoff_tools/cli/engine1/studio_commands.py +825 -0
- dayhoff_tools/cli/engines_studios/__init__.py +6 -0
- dayhoff_tools/cli/engines_studios/api_client.py +351 -0
- dayhoff_tools/cli/engines_studios/auth.py +144 -0
- dayhoff_tools/cli/engines_studios/engine-studio-cli.md +1230 -0
- dayhoff_tools/cli/engines_studios/engine_commands.py +1151 -0
- dayhoff_tools/cli/engines_studios/progress.py +260 -0
- dayhoff_tools/cli/engines_studios/simulators/cli-simulators.md +151 -0
- dayhoff_tools/cli/engines_studios/simulators/demo.sh +75 -0
- dayhoff_tools/cli/engines_studios/simulators/engine_list_simulator.py +319 -0
- dayhoff_tools/cli/engines_studios/simulators/engine_status_simulator.py +369 -0
- dayhoff_tools/cli/engines_studios/simulators/idle_status_simulator.py +476 -0
- dayhoff_tools/cli/engines_studios/simulators/simulator_utils.py +180 -0
- dayhoff_tools/cli/engines_studios/simulators/studio_list_simulator.py +374 -0
- dayhoff_tools/cli/engines_studios/simulators/studio_status_simulator.py +164 -0
- dayhoff_tools/cli/engines_studios/studio_commands.py +755 -0
- dayhoff_tools/cli/main.py +106 -7
- dayhoff_tools/cli/utility_commands.py +896 -179
- dayhoff_tools/deployment/base.py +70 -6
- dayhoff_tools/deployment/deploy_aws.py +165 -25
- dayhoff_tools/deployment/deploy_gcp.py +78 -5
- dayhoff_tools/deployment/deploy_utils.py +20 -7
- dayhoff_tools/deployment/job_runner.py +9 -4
- dayhoff_tools/deployment/processors.py +230 -418
- dayhoff_tools/deployment/swarm.py +47 -12
- dayhoff_tools/embedders.py +28 -26
- dayhoff_tools/fasta.py +181 -64
- dayhoff_tools/warehouse.py +268 -1
- {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/METADATA +20 -5
- dayhoff_tools-1.13.12.dist-info/RECORD +54 -0
- {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/WHEEL +1 -1
- dayhoff_tools-1.1.10.dist-info/RECORD +0 -32
- {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/entry_points.txt +0 -0
|
@@ -1,7 +1,5 @@
|
|
|
1
|
+
import json
|
|
1
2
|
import logging
|
|
2
|
-
import os
|
|
3
|
-
import shlex
|
|
4
|
-
import shutil
|
|
5
3
|
import subprocess
|
|
6
4
|
from abc import ABC, abstractmethod
|
|
7
5
|
from pathlib import Path
|
|
@@ -22,460 +20,274 @@ class Processor(ABC):
|
|
|
22
20
|
return output_path
|
|
23
21
|
|
|
24
22
|
|
|
25
|
-
class
|
|
26
|
-
"""
|
|
23
|
+
class InterProScanProcessor(Processor):
|
|
24
|
+
"""Processes a single FASTA file using InterProScan and extracts target domains.
|
|
27
25
|
|
|
28
|
-
This
|
|
29
|
-
|
|
26
|
+
This processor handles the analysis of protein sequences using InterProScan,
|
|
27
|
+
and extracts specific domains based on their InterPro accession IDs.
|
|
28
|
+
It maps sequence identifiers correctly using MD5 hashes from the TSV output
|
|
29
|
+
to handle differences in sequence ID representation between input FASTA and
|
|
30
|
+
InterProScan JSON output.
|
|
30
31
|
"""
|
|
31
32
|
|
|
32
|
-
def __init__(
|
|
33
|
-
|
|
33
|
+
def __init__(
|
|
34
|
+
self,
|
|
35
|
+
interproscan_install_dir: str, # Path to the InterProScan installation
|
|
36
|
+
interproscan_temp_dir_mount: str, # Path to temporary directory for InterProScan
|
|
37
|
+
num_threads: int, # Number of CPU threads for InterProScan to use
|
|
38
|
+
output_formats: list[
|
|
39
|
+
str
|
|
40
|
+
], # List of desired output formats (e.g., ["JSON", "TSV"])
|
|
41
|
+
target_iprs: set[str], # Set of InterPro IDs to extract domains for
|
|
42
|
+
other_interproscan_options: (
|
|
43
|
+
str | None
|
|
44
|
+
) = None, # Additional command-line options
|
|
45
|
+
):
|
|
46
|
+
"""Initialize the InterProScanProcessor.
|
|
34
47
|
|
|
35
48
|
Args:
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
49
|
+
interproscan_install_dir: Path to the InterProScan installation directory.
|
|
50
|
+
interproscan_temp_dir_mount: Path to the temporary directory for InterProScan.
|
|
51
|
+
num_threads: Number of CPU threads for InterProScan to use.
|
|
52
|
+
output_formats: List of desired output formats (e.g., ["JSON", "TSV"]).
|
|
53
|
+
target_iprs: A set of InterPro accession IDs to extract domain sequences for.
|
|
54
|
+
other_interproscan_options: Additional command-line options for interproscan.sh.
|
|
42
55
|
"""
|
|
43
|
-
self.
|
|
44
|
-
self.
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
56
|
+
self.interproscan_sh_path = Path(interproscan_install_dir) / "interproscan.sh"
|
|
57
|
+
if not self.interproscan_sh_path.is_file():
|
|
58
|
+
raise FileNotFoundError(
|
|
59
|
+
f"interproscan.sh not found at {self.interproscan_sh_path}"
|
|
60
|
+
)
|
|
48
61
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
62
|
+
self.interproscan_temp_dir_mount = Path(interproscan_temp_dir_mount)
|
|
63
|
+
# Ensure the temp directory exists
|
|
64
|
+
self.interproscan_temp_dir_mount.mkdir(parents=True, exist_ok=True)
|
|
52
65
|
|
|
53
|
-
|
|
54
|
-
|
|
66
|
+
self.num_threads = num_threads
|
|
67
|
+
self.output_formats = output_formats
|
|
55
68
|
|
|
56
|
-
|
|
57
|
-
|
|
69
|
+
# Ensure both JSON and TSV formats are included for domain extraction
|
|
70
|
+
if "JSON" not in self.output_formats:
|
|
71
|
+
self.output_formats.append("JSON")
|
|
72
|
+
if "TSV" not in self.output_formats:
|
|
73
|
+
self.output_formats.append("TSV")
|
|
58
74
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
# Determine expected output directory name
|
|
63
|
-
input_base = os.path.splitext(os.path.basename(input_file))[0]
|
|
64
|
-
expected_output_dir = f"boltz_results_{input_base}"
|
|
65
|
-
logger.info(f"Expected output directory: {expected_output_dir}")
|
|
66
|
-
|
|
67
|
-
# Start building the command
|
|
68
|
-
cmd = ["boltz", "predict", input_file]
|
|
69
|
-
|
|
70
|
-
# Parse additional options if provided
|
|
71
|
-
additional_args = []
|
|
72
|
-
num_workers_in_opts = False
|
|
73
|
-
if self.boltz_options:
|
|
74
|
-
try:
|
|
75
|
-
parsed_opts = shlex.split(self.boltz_options)
|
|
76
|
-
additional_args.extend(parsed_opts)
|
|
77
|
-
if "--num_workers" in parsed_opts:
|
|
78
|
-
num_workers_in_opts = True
|
|
79
|
-
logger.info(
|
|
80
|
-
f"Using --num_workers from BOLTZ_OPTIONS: {self.boltz_options}"
|
|
81
|
-
)
|
|
82
|
-
except ValueError as e:
|
|
83
|
-
logger.error(f"Error parsing BOLTZ_OPTIONS '{self.boltz_options}': {e}")
|
|
84
|
-
# Decide if we should raise an error or proceed without options
|
|
85
|
-
# For now, proceed without the additional options
|
|
86
|
-
additional_args = [] # Clear potentially partially parsed args
|
|
87
|
-
|
|
88
|
-
# Add num_workers if not specified in options
|
|
89
|
-
if not num_workers_in_opts:
|
|
90
|
-
logger.info(f"Using default num_workers: {self.num_workers}")
|
|
91
|
-
cmd.extend(["--num_workers", str(self.num_workers)])
|
|
92
|
-
|
|
93
|
-
# Add the parsed additional arguments
|
|
94
|
-
cmd.extend(additional_args)
|
|
95
|
-
|
|
96
|
-
# Log the final command
|
|
97
|
-
# Use shlex.join for safer command logging, especially if paths/args have spaces
|
|
98
|
-
try:
|
|
99
|
-
safe_cmd_str = shlex.join(cmd)
|
|
100
|
-
logger.info(f"Running command: {safe_cmd_str}")
|
|
101
|
-
except AttributeError: # shlex.join is Python 3.8+
|
|
102
|
-
logger.info(f"Running command: {' '.join(cmd)}")
|
|
103
|
-
|
|
104
|
-
# Stream output in real-time
|
|
105
|
-
process = subprocess.Popen(
|
|
106
|
-
cmd,
|
|
107
|
-
stdout=subprocess.PIPE,
|
|
108
|
-
stderr=subprocess.STDOUT,
|
|
109
|
-
text=True,
|
|
110
|
-
bufsize=1,
|
|
75
|
+
self.target_iprs = target_iprs
|
|
76
|
+
self.other_options = (
|
|
77
|
+
other_interproscan_options if other_interproscan_options else ""
|
|
111
78
|
)
|
|
112
79
|
|
|
113
|
-
stdout = process.stdout
|
|
114
|
-
if stdout:
|
|
115
|
-
for line in iter(stdout.readline, ""):
|
|
116
|
-
logger.info(f"BOLTZ: {line.rstrip()}")
|
|
117
|
-
|
|
118
|
-
# Wait for process to complete
|
|
119
|
-
return_code = process.wait()
|
|
120
|
-
if return_code != 0:
|
|
121
|
-
logger.error(f"Boltz prediction failed with exit code {return_code}")
|
|
122
|
-
raise subprocess.CalledProcessError(return_code, cmd)
|
|
123
|
-
|
|
124
80
|
logger.info(
|
|
125
|
-
f"
|
|
81
|
+
f"InterProScanProcessor initialized with script: {self.interproscan_sh_path}"
|
|
126
82
|
)
|
|
127
|
-
return expected_output_dir
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
class MMSeqsProfileProcessor(Processor):
|
|
131
|
-
"""Processor for running MMseqs2 profile searches.
|
|
132
|
-
|
|
133
|
-
This class wraps the MMseqs2 workflow to perform a profile-based search
|
|
134
|
-
against a target database using a query FASTA.
|
|
135
|
-
"""
|
|
136
|
-
|
|
137
|
-
def __init__(
|
|
138
|
-
self,
|
|
139
|
-
query_fasta_path_in_image: str,
|
|
140
|
-
num_threads: int = 8,
|
|
141
|
-
mmseqs_args: dict | None = None,
|
|
142
|
-
):
|
|
143
|
-
"""Initialize the MMSeqsProfileProcessor.
|
|
144
|
-
|
|
145
|
-
Args:
|
|
146
|
-
query_fasta_path_in_image: Path to the query FASTA file. This path is expected
|
|
147
|
-
to be accessible within the execution environment (e.g.,
|
|
148
|
-
packaged in a Docker image).
|
|
149
|
-
num_threads: Number of threads to use for MMseqs2 commands.
|
|
150
|
-
mmseqs_args: A dictionary of additional MMseqs2 parameters.
|
|
151
|
-
Expected keys: "memory_limit_gb", "evalue", "sensitivity",
|
|
152
|
-
"max_seqs_search", "min_seq_id_cluster", "max_seqs_profile_msa".
|
|
153
|
-
Defaults are used if not provided.
|
|
154
|
-
"""
|
|
155
|
-
if not Path(query_fasta_path_in_image).is_file():
|
|
156
|
-
raise FileNotFoundError(
|
|
157
|
-
f"Query FASTA file not found at: {query_fasta_path_in_image}"
|
|
158
|
-
)
|
|
159
|
-
self.query_fasta_path = query_fasta_path_in_image
|
|
160
|
-
self.num_threads = str(num_threads) # MMseqs2 expects string for threads
|
|
161
|
-
|
|
162
|
-
default_mmseqs_args = {
|
|
163
|
-
"memory_limit_gb": "25",
|
|
164
|
-
"evalue": "10",
|
|
165
|
-
"sensitivity": "7.5",
|
|
166
|
-
"max_seqs_search": "300",
|
|
167
|
-
"min_seq_id_cluster": "0.8",
|
|
168
|
-
"max_seqs_profile_msa": "1000",
|
|
169
|
-
}
|
|
170
|
-
if mmseqs_args:
|
|
171
|
-
self.mmseqs_args = {**default_mmseqs_args, **mmseqs_args}
|
|
172
|
-
else:
|
|
173
|
-
self.mmseqs_args = default_mmseqs_args
|
|
174
|
-
|
|
175
83
|
logger.info(
|
|
176
|
-
f"
|
|
84
|
+
f"Temp dir mount for InterProScan: {self.interproscan_temp_dir_mount}"
|
|
177
85
|
)
|
|
178
|
-
logger.info(f"
|
|
179
|
-
logger.info(f"Num threads: {self.num_threads}")
|
|
180
|
-
|
|
181
|
-
def _run_mmseqs_command(
|
|
182
|
-
self, command_parts: list[str], step_description: str, work_dir: Path
|
|
183
|
-
):
|
|
184
|
-
"""Runs an MMseqs2 command and logs its execution.
|
|
185
|
-
|
|
186
|
-
Args:
|
|
187
|
-
command_parts: A list of strings representing the command and its arguments.
|
|
188
|
-
step_description: A human-readable description of the MMseqs2 step.
|
|
189
|
-
work_dir: The working directory for the command.
|
|
190
|
-
|
|
191
|
-
Raises:
|
|
192
|
-
subprocess.CalledProcessError: If the MMseqs2 command returns a non-zero exit code.
|
|
193
|
-
"""
|
|
194
|
-
full_command = " ".join(command_parts)
|
|
195
|
-
logger.info(f"Running MMseqs2 step in {work_dir}: {step_description}")
|
|
196
|
-
logger.info(f"Command: {full_command}")
|
|
197
|
-
try:
|
|
198
|
-
process = subprocess.run(
|
|
199
|
-
command_parts,
|
|
200
|
-
check=True,
|
|
201
|
-
stdout=subprocess.PIPE,
|
|
202
|
-
stderr=subprocess.PIPE,
|
|
203
|
-
text=True,
|
|
204
|
-
cwd=work_dir, # Run command in the specified working directory
|
|
205
|
-
)
|
|
206
|
-
if process.stdout:
|
|
207
|
-
logger.info(f"MMseqs2 stdout: {process.stdout.strip()}")
|
|
208
|
-
if process.stderr: # MMseqs often outputs informational messages to stderr
|
|
209
|
-
logger.info(f"MMseqs2 stderr: {process.stderr.strip()}")
|
|
210
|
-
logger.info(f"MMseqs2 step '{step_description}' completed successfully.")
|
|
211
|
-
except subprocess.CalledProcessError as e:
|
|
212
|
-
logger.error(f"MMseqs2 step '{step_description}' failed in {work_dir}.")
|
|
213
|
-
if e.stdout:
|
|
214
|
-
logger.error(f"MMseqs2 stdout: {e.stdout.strip()}")
|
|
215
|
-
if e.stderr:
|
|
216
|
-
logger.error(f"MMseqs2 stderr: {e.stderr.strip()}")
|
|
217
|
-
raise
|
|
86
|
+
logger.info(f"Target IPRs: {self.target_iprs}")
|
|
218
87
|
|
|
219
88
|
def run(self, input_file: str) -> str:
|
|
220
|
-
"""Run
|
|
89
|
+
"""Run InterProScan on the input FASTA file and extract domain sequences.
|
|
221
90
|
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
(e.g., {target_stem}_results.m8 and {target_stem}_hits.fasta).
|
|
91
|
+
This method processes a FASTA file through InterProScan, extracts domains
|
|
92
|
+
of interest based on the target_iprs list, and writes the extracted domains
|
|
93
|
+
to a separate FASTA file. Domain sequences are correctly mapped using MD5 hashes
|
|
94
|
+
from the TSV output to handle differences in sequence ID representation.
|
|
227
95
|
|
|
228
96
|
Args:
|
|
229
|
-
input_file: Path to the input
|
|
97
|
+
input_file: Path to the input FASTA file.
|
|
230
98
|
|
|
231
99
|
Returns:
|
|
232
|
-
Path to the output directory
|
|
233
|
-
the meaningfully named result files.
|
|
234
|
-
|
|
235
|
-
Raises:
|
|
236
|
-
subprocess.CalledProcessError: If any MMseqs2 command fails.
|
|
237
|
-
FileNotFoundError: If the input_file is not found.
|
|
100
|
+
Path to the output directory containing extracted domain sequences and raw results.
|
|
238
101
|
"""
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
input_file_path = Path(input_file).resolve() # Ensure absolute path
|
|
243
|
-
target_fasta_filename = input_file_path.name
|
|
244
|
-
target_fasta_stem = input_file_path.stem # Get stem for naming
|
|
245
|
-
|
|
246
|
-
# Create a unique base directory for this run's outputs and temp files
|
|
247
|
-
# This directory will be returned and subsequently uploaded by the Operator
|
|
248
|
-
run_base_dir_name = f"{target_fasta_stem}" # Use stem as the dir name
|
|
249
|
-
run_base_dir = Path(run_base_dir_name).resolve()
|
|
250
|
-
run_base_dir.mkdir(parents=True, exist_ok=True)
|
|
251
|
-
logger.info(f"Created run base directory: {run_base_dir}")
|
|
252
|
-
|
|
253
|
-
# Define local paths within the run_base_dir
|
|
254
|
-
local_target_file = run_base_dir / target_fasta_filename
|
|
255
|
-
# Copy the target file into the run directory to keep inputs and outputs together
|
|
256
|
-
shutil.copy(input_file_path, local_target_file)
|
|
257
|
-
logger.info(f"Copied target file {input_file_path} to {local_target_file}")
|
|
258
|
-
|
|
259
|
-
# Query file is already specified by self.query_fasta_path (path in image)
|
|
260
|
-
local_query_file = Path(self.query_fasta_path).resolve()
|
|
261
|
-
|
|
262
|
-
# Temporary directory for MMseqs2 intermediate files, created inside run_base_dir
|
|
263
|
-
mmseqs_temp_dir = run_base_dir / "mmseqs_tmp"
|
|
264
|
-
mmseqs_temp_dir.mkdir(parents=True, exist_ok=True)
|
|
265
|
-
logger.info(f"Created MMseqs2 temporary directory: {mmseqs_temp_dir}")
|
|
266
|
-
|
|
267
|
-
# Define INTERMEDIATE output file paths within mmseqs_temp_dir
|
|
268
|
-
intermediate_results_m8_file = mmseqs_temp_dir / "results.m8"
|
|
269
|
-
intermediate_hits_fasta_file = mmseqs_temp_dir / "results.fasta"
|
|
270
|
-
|
|
271
|
-
# Define FINAL output file paths within run_base_dir, using target stem
|
|
272
|
-
final_results_m8_file = run_base_dir / f"{target_fasta_stem}_results.m8"
|
|
273
|
-
final_hits_fasta_file = run_base_dir / f"{target_fasta_stem}_hits.fasta"
|
|
274
|
-
|
|
275
|
-
# --- MMseqs2 Workflow Paths (intermediate files in mmseqs_temp_dir) ---
|
|
276
|
-
query_db = mmseqs_temp_dir / "queryDB"
|
|
277
|
-
target_db = mmseqs_temp_dir / "targetDB"
|
|
278
|
-
# Ensure local_target_file is used for creating targetDB
|
|
279
|
-
target_db_input_file = local_target_file
|
|
280
|
-
|
|
281
|
-
query_db_cluster = mmseqs_temp_dir / "queryDB_cluster"
|
|
282
|
-
query_db_rep = mmseqs_temp_dir / "queryDB_rep"
|
|
283
|
-
aln_db = mmseqs_temp_dir / "alnDB"
|
|
284
|
-
profile_db = mmseqs_temp_dir / "profileDB"
|
|
285
|
-
result_db = mmseqs_temp_dir / "resultDB"
|
|
286
|
-
hits_db = mmseqs_temp_dir / "hitsDB"
|
|
102
|
+
from Bio import SeqIO
|
|
103
|
+
from Bio.Seq import Seq
|
|
287
104
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
self._run_mmseqs_command(
|
|
291
|
-
["mmseqs", "createdb", str(local_query_file), str(query_db)],
|
|
292
|
-
"Create query DB",
|
|
293
|
-
run_base_dir, # Working directory for the command
|
|
294
|
-
)
|
|
105
|
+
input_file_path = Path(input_file).resolve()
|
|
106
|
+
input_file_stem = input_file_path.stem
|
|
295
107
|
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
"Create target DB",
|
|
300
|
-
run_base_dir,
|
|
301
|
-
)
|
|
108
|
+
# Create output directory structure
|
|
109
|
+
chunk_output_dir = Path(f"results_{input_file_stem}").resolve()
|
|
110
|
+
chunk_output_dir.mkdir(parents=True, exist_ok=True)
|
|
302
111
|
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
[
|
|
306
|
-
"mmseqs",
|
|
307
|
-
"cluster",
|
|
308
|
-
str(query_db),
|
|
309
|
-
str(query_db_cluster),
|
|
310
|
-
str(
|
|
311
|
-
mmseqs_temp_dir / "tmp_cluster"
|
|
312
|
-
), # MMseqs needs a temp dir for cluster
|
|
313
|
-
"--min-seq-id",
|
|
314
|
-
self.mmseqs_args["min_seq_id_cluster"],
|
|
315
|
-
"--threads",
|
|
316
|
-
self.num_threads,
|
|
317
|
-
],
|
|
318
|
-
"Cluster query sequences",
|
|
319
|
-
run_base_dir,
|
|
320
|
-
)
|
|
112
|
+
raw_ipr_output_dir = chunk_output_dir / "raw_ipr_output"
|
|
113
|
+
raw_ipr_output_dir.mkdir(parents=True, exist_ok=True)
|
|
321
114
|
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
115
|
+
# --- Clean input FASTA file to remove stop codons ---
|
|
116
|
+
cleaned_input_file_path = (
|
|
117
|
+
raw_ipr_output_dir / f"{input_file_stem}_cleaned.fasta"
|
|
118
|
+
)
|
|
119
|
+
logger.info(
|
|
120
|
+
f"Cleaning input FASTA file: {input_file_path} to remove '*' characters."
|
|
121
|
+
)
|
|
122
|
+
cleaned_records = []
|
|
123
|
+
has_asterisks = False
|
|
124
|
+
|
|
125
|
+
for record in SeqIO.parse(input_file_path, "fasta"):
|
|
126
|
+
original_seq_str = str(record.seq)
|
|
127
|
+
if "*" in original_seq_str:
|
|
128
|
+
has_asterisks = True
|
|
129
|
+
cleaned_seq_str = original_seq_str.replace("*", "")
|
|
130
|
+
record.seq = Seq(cleaned_seq_str)
|
|
131
|
+
logger.debug(f"Removed '*' from sequence {record.id}")
|
|
132
|
+
cleaned_records.append(record)
|
|
133
|
+
|
|
134
|
+
if has_asterisks:
|
|
135
|
+
SeqIO.write(cleaned_records, cleaned_input_file_path, "fasta")
|
|
136
|
+
logger.info(f"Cleaned FASTA written to {cleaned_input_file_path}")
|
|
137
|
+
ipr_input_file_to_use = cleaned_input_file_path
|
|
138
|
+
else:
|
|
139
|
+
logger.info(
|
|
140
|
+
f"No '*' characters found in {input_file_path}. Using original."
|
|
333
141
|
)
|
|
142
|
+
ipr_input_file_to_use = input_file_path
|
|
143
|
+
# --- End of cleaning ---
|
|
144
|
+
|
|
145
|
+
# Set up InterProScan output base path
|
|
146
|
+
ipr_output_base = raw_ipr_output_dir / input_file_stem
|
|
147
|
+
|
|
148
|
+
# Build the InterProScan command
|
|
149
|
+
cmd = [
|
|
150
|
+
str(self.interproscan_sh_path),
|
|
151
|
+
"-i",
|
|
152
|
+
str(ipr_input_file_to_use),
|
|
153
|
+
"-b",
|
|
154
|
+
str(ipr_output_base),
|
|
155
|
+
"-f",
|
|
156
|
+
",".join(self.output_formats),
|
|
157
|
+
"--cpu",
|
|
158
|
+
str(self.num_threads),
|
|
159
|
+
"--tempdir",
|
|
160
|
+
str(self.interproscan_temp_dir_mount),
|
|
161
|
+
"--disable-precalc",
|
|
162
|
+
]
|
|
163
|
+
|
|
164
|
+
# Add additional options if provided
|
|
165
|
+
if self.other_options:
|
|
166
|
+
cmd.extend(self.other_options.split())
|
|
167
|
+
|
|
168
|
+
# Run InterProScan
|
|
169
|
+
logger.info(f"Running InterProScan command: {' '.join(cmd)}")
|
|
170
|
+
try:
|
|
171
|
+
process = subprocess.run(cmd, check=True, capture_output=True, text=True)
|
|
172
|
+
logger.info(f"InterProScan STDOUT: {process.stdout}")
|
|
173
|
+
if process.stderr:
|
|
174
|
+
logger.info(f"InterProScan STDERR: {process.stderr}")
|
|
175
|
+
except subprocess.CalledProcessError as e:
|
|
176
|
+
logger.error(f"InterProScan failed for {input_file_path}")
|
|
177
|
+
logger.error(f"Return code: {e.returncode}")
|
|
178
|
+
logger.error(f"STDOUT: {e.stdout}")
|
|
179
|
+
logger.error(f"STDERR: {e.stderr}")
|
|
180
|
+
# Create a failure marker file
|
|
181
|
+
Path(chunk_output_dir / "INTERPROSCAN_FAILED.txt").touch()
|
|
182
|
+
return str(chunk_output_dir)
|
|
183
|
+
|
|
184
|
+
# Define paths for output files
|
|
185
|
+
extracted_domains_fasta_path = (
|
|
186
|
+
chunk_output_dir / f"{input_file_stem}_extracted_domains.fasta"
|
|
187
|
+
)
|
|
188
|
+
json_output_path = ipr_output_base.with_suffix(".json")
|
|
189
|
+
tsv_output_path = ipr_output_base.with_suffix(".tsv")
|
|
334
190
|
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
"search",
|
|
340
|
-
str(query_db_rep),
|
|
341
|
-
str(query_db), # Search representative against full query DB
|
|
342
|
-
str(aln_db),
|
|
343
|
-
str(mmseqs_temp_dir / "tmp_search_msa"), # Temp for this search
|
|
344
|
-
"--max-seqs",
|
|
345
|
-
self.mmseqs_args["max_seqs_profile_msa"],
|
|
346
|
-
"--threads",
|
|
347
|
-
self.num_threads,
|
|
348
|
-
],
|
|
349
|
-
"Create MSA for profile",
|
|
350
|
-
run_base_dir,
|
|
191
|
+
# Check for required output formats
|
|
192
|
+
if "JSON" not in self.output_formats or not json_output_path.is_file():
|
|
193
|
+
logger.warning(
|
|
194
|
+
f"JSON output format not requested or file not found: {json_output_path}. Cannot extract domains."
|
|
351
195
|
)
|
|
196
|
+
return str(chunk_output_dir)
|
|
352
197
|
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
"mmseqs",
|
|
357
|
-
"result2profile",
|
|
358
|
-
str(query_db_rep), # Use query_db_rep as input for profile
|
|
359
|
-
str(query_db), # Full query DB as second arg
|
|
360
|
-
str(aln_db),
|
|
361
|
-
str(profile_db),
|
|
362
|
-
"--threads", # Added threads option
|
|
363
|
-
self.num_threads,
|
|
364
|
-
],
|
|
365
|
-
"Create profile DB",
|
|
366
|
-
run_base_dir,
|
|
198
|
+
if "TSV" not in self.output_formats or not tsv_output_path.is_file():
|
|
199
|
+
logger.warning(
|
|
200
|
+
f"TSV output format not found: {tsv_output_path}. This is needed to map sequence IDs."
|
|
367
201
|
)
|
|
202
|
+
return str(chunk_output_dir)
|
|
368
203
|
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
"-s",
|
|
387
|
-
self.mmseqs_args["sensitivity"],
|
|
388
|
-
],
|
|
389
|
-
"Perform profile search",
|
|
390
|
-
run_base_dir,
|
|
204
|
+
# Extract domains using the JSON and TSV outputs
|
|
205
|
+
try:
|
|
206
|
+
# Create MD5 to sequence ID mapping from TSV
|
|
207
|
+
md5_to_id = {}
|
|
208
|
+
with open(tsv_output_path, "r") as f:
|
|
209
|
+
for line in f:
|
|
210
|
+
parts = line.strip().split("\t")
|
|
211
|
+
if len(parts) >= 3: # Ensure there are enough columns
|
|
212
|
+
seq_id = parts[0]
|
|
213
|
+
md5 = parts[1]
|
|
214
|
+
md5_to_id[md5] = seq_id
|
|
215
|
+
|
|
216
|
+
logger.debug(f"Created MD5 to ID mapping with {len(md5_to_id)} entries")
|
|
217
|
+
|
|
218
|
+
# Load protein sequences for coordinate mapping
|
|
219
|
+
protein_sequences = SeqIO.to_dict(
|
|
220
|
+
SeqIO.parse(ipr_input_file_to_use, "fasta")
|
|
391
221
|
)
|
|
392
222
|
|
|
393
|
-
#
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
]
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
223
|
+
# Process JSON for domain extraction
|
|
224
|
+
extracted_count = 0
|
|
225
|
+
with (
|
|
226
|
+
open(extracted_domains_fasta_path, "w") as f_out,
|
|
227
|
+
open(json_output_path, "r") as f_json,
|
|
228
|
+
):
|
|
229
|
+
data = json.load(f_json)
|
|
230
|
+
if "results" not in data:
|
|
231
|
+
logger.info(f"No 'results' key in JSON output {json_output_path}")
|
|
232
|
+
return str(chunk_output_dir)
|
|
233
|
+
|
|
234
|
+
for result in data.get("results", []):
|
|
235
|
+
# Map sequence via MD5 hash
|
|
236
|
+
md5 = result.get("md5")
|
|
237
|
+
if not md5 or md5 not in md5_to_id:
|
|
238
|
+
logger.debug(f"MD5 hash not found in mapping: {md5}")
|
|
239
|
+
continue
|
|
240
|
+
|
|
241
|
+
protein_acc = md5_to_id[md5]
|
|
242
|
+
if protein_acc not in protein_sequences:
|
|
243
|
+
logger.debug(f"Sequence ID not found in FASTA: {protein_acc}")
|
|
244
|
+
continue
|
|
245
|
+
|
|
246
|
+
original_seq_record = protein_sequences[protein_acc]
|
|
247
|
+
for match in result.get("matches", []):
|
|
248
|
+
# Extract the InterPro domain entry
|
|
249
|
+
signature = match.get("signature", {})
|
|
250
|
+
entry = signature.get("entry")
|
|
251
|
+
if not entry or entry.get("accession") not in self.target_iprs:
|
|
252
|
+
continue
|
|
253
|
+
|
|
254
|
+
ipr_id = entry.get("accession")
|
|
255
|
+
ipr_desc = entry.get("description", "N/A").replace(" ", "_")
|
|
256
|
+
logger.info(
|
|
257
|
+
f"Found target domain {ipr_id} ({ipr_desc}) in sequence {protein_acc}"
|
|
258
|
+
)
|
|
259
|
+
|
|
260
|
+
for location in match.get("locations", []):
|
|
261
|
+
start = location.get("start")
|
|
262
|
+
end = location.get("end")
|
|
263
|
+
if start is not None and end is not None:
|
|
264
|
+
domain_seq_str = str(
|
|
265
|
+
original_seq_record.seq[start - 1 : end]
|
|
266
|
+
)
|
|
267
|
+
domain_fasta_header = f">{original_seq_record.id}|{ipr_id}|{start}-{end}|{ipr_desc}"
|
|
268
|
+
f_out.write(f"{domain_fasta_header}\n")
|
|
269
|
+
f_out.write(f"{domain_seq_str}\n")
|
|
270
|
+
extracted_count += 1
|
|
271
|
+
logger.debug(
|
|
272
|
+
f"Extracted domain {ipr_id} ({start}-{end}) from {protein_acc}"
|
|
273
|
+
)
|
|
408
274
|
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
["mmseqs", "createsubdb", str(result_db), str(target_db), str(hits_db)],
|
|
412
|
-
"Create hits subDB from target_db",
|
|
413
|
-
run_base_dir,
|
|
275
|
+
logger.info(
|
|
276
|
+
f"Extracted {extracted_count} domain sequences to {extracted_domains_fasta_path}"
|
|
414
277
|
)
|
|
415
278
|
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
"mmseqs",
|
|
420
|
-
"convert2fasta",
|
|
421
|
-
str(hits_db),
|
|
422
|
-
str(intermediate_hits_fasta_file),
|
|
423
|
-
],
|
|
424
|
-
"Convert hits to FASTA",
|
|
425
|
-
run_base_dir,
|
|
279
|
+
except FileNotFoundError:
|
|
280
|
+
logger.error(
|
|
281
|
+
f"Input FASTA file {ipr_input_file_to_use} not found during domain extraction."
|
|
426
282
|
)
|
|
283
|
+
except json.JSONDecodeError:
|
|
284
|
+
logger.error(f"Error decoding JSON from {json_output_path}.")
|
|
285
|
+
except Exception as e:
|
|
286
|
+
logger.error(f"Error during domain extraction: {e}", exc_info=True)
|
|
427
287
|
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
)
|
|
288
|
+
# Clean up if the input file was a temporary one
|
|
289
|
+
if has_asterisks and cleaned_input_file_path != input_file_path:
|
|
290
|
+
if cleaned_input_file_path.exists():
|
|
291
|
+
cleaned_input_file_path.unlink()
|
|
431
292
|
|
|
432
|
-
|
|
433
|
-
if intermediate_results_m8_file.exists():
|
|
434
|
-
shutil.move(
|
|
435
|
-
str(intermediate_results_m8_file), str(final_results_m8_file)
|
|
436
|
-
)
|
|
437
|
-
logger.info(f"Moved and renamed M8 results to {final_results_m8_file}")
|
|
438
|
-
else:
|
|
439
|
-
logger.warning(
|
|
440
|
-
f"Intermediate M8 file {intermediate_results_m8_file} not found. Creating empty target file."
|
|
441
|
-
)
|
|
442
|
-
final_results_m8_file.touch() # Create empty file in run_base_dir if not found
|
|
443
|
-
|
|
444
|
-
if intermediate_hits_fasta_file.exists():
|
|
445
|
-
shutil.move(
|
|
446
|
-
str(intermediate_hits_fasta_file), str(final_hits_fasta_file)
|
|
447
|
-
)
|
|
448
|
-
logger.info(f"Moved and renamed hits FASTA to {final_hits_fasta_file}")
|
|
449
|
-
else:
|
|
450
|
-
logger.warning(
|
|
451
|
-
f"Intermediate hits FASTA {intermediate_hits_fasta_file} not found. Creating empty target file."
|
|
452
|
-
)
|
|
453
|
-
final_hits_fasta_file.touch() # Create empty file in run_base_dir if not found
|
|
454
|
-
|
|
455
|
-
finally:
|
|
456
|
-
# Clean up the MMseqs2 temporary directory (mmseqs_tmp) which contains intermediate DBs etc.
|
|
457
|
-
if mmseqs_temp_dir.exists():
|
|
458
|
-
shutil.rmtree(mmseqs_temp_dir)
|
|
459
|
-
logger.info(
|
|
460
|
-
f"Cleaned up MMseqs2 temporary directory: {mmseqs_temp_dir}"
|
|
461
|
-
)
|
|
462
|
-
|
|
463
|
-
# Clean up the copied input file (local_target_file) from the run_base_dir
|
|
464
|
-
# so it does not get uploaded with the results.
|
|
465
|
-
if local_target_file.exists():
|
|
466
|
-
try:
|
|
467
|
-
local_target_file.unlink()
|
|
468
|
-
logger.info(
|
|
469
|
-
f"Cleaned up copied input file from run directory: {local_target_file}"
|
|
470
|
-
)
|
|
471
|
-
except OSError as e:
|
|
472
|
-
logger.error(
|
|
473
|
-
f"Error deleting copied input file {local_target_file}: {e}"
|
|
474
|
-
)
|
|
475
|
-
|
|
476
|
-
# The run_base_dir (containing only the final, meaningfully named output files)
|
|
477
|
-
# will be cleaned up by the Operator after its contents are uploaded.
|
|
478
|
-
|
|
479
|
-
return str(
|
|
480
|
-
run_base_dir
|
|
481
|
-
) # Return the path to the directory containing meaningfully named results
|
|
293
|
+
return str(chunk_output_dir)
|