dayhoff-tools 1.1.10__py3-none-any.whl → 1.13.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. dayhoff_tools/__init__.py +10 -0
  2. dayhoff_tools/cli/cloud_commands.py +179 -43
  3. dayhoff_tools/cli/engine1/__init__.py +323 -0
  4. dayhoff_tools/cli/engine1/engine_core.py +703 -0
  5. dayhoff_tools/cli/engine1/engine_lifecycle.py +136 -0
  6. dayhoff_tools/cli/engine1/engine_maintenance.py +431 -0
  7. dayhoff_tools/cli/engine1/engine_management.py +505 -0
  8. dayhoff_tools/cli/engine1/shared.py +501 -0
  9. dayhoff_tools/cli/engine1/studio_commands.py +825 -0
  10. dayhoff_tools/cli/engines_studios/__init__.py +6 -0
  11. dayhoff_tools/cli/engines_studios/api_client.py +351 -0
  12. dayhoff_tools/cli/engines_studios/auth.py +144 -0
  13. dayhoff_tools/cli/engines_studios/engine-studio-cli.md +1230 -0
  14. dayhoff_tools/cli/engines_studios/engine_commands.py +1151 -0
  15. dayhoff_tools/cli/engines_studios/progress.py +260 -0
  16. dayhoff_tools/cli/engines_studios/simulators/cli-simulators.md +151 -0
  17. dayhoff_tools/cli/engines_studios/simulators/demo.sh +75 -0
  18. dayhoff_tools/cli/engines_studios/simulators/engine_list_simulator.py +319 -0
  19. dayhoff_tools/cli/engines_studios/simulators/engine_status_simulator.py +369 -0
  20. dayhoff_tools/cli/engines_studios/simulators/idle_status_simulator.py +476 -0
  21. dayhoff_tools/cli/engines_studios/simulators/simulator_utils.py +180 -0
  22. dayhoff_tools/cli/engines_studios/simulators/studio_list_simulator.py +374 -0
  23. dayhoff_tools/cli/engines_studios/simulators/studio_status_simulator.py +164 -0
  24. dayhoff_tools/cli/engines_studios/studio_commands.py +755 -0
  25. dayhoff_tools/cli/main.py +106 -7
  26. dayhoff_tools/cli/utility_commands.py +896 -179
  27. dayhoff_tools/deployment/base.py +70 -6
  28. dayhoff_tools/deployment/deploy_aws.py +165 -25
  29. dayhoff_tools/deployment/deploy_gcp.py +78 -5
  30. dayhoff_tools/deployment/deploy_utils.py +20 -7
  31. dayhoff_tools/deployment/job_runner.py +9 -4
  32. dayhoff_tools/deployment/processors.py +230 -418
  33. dayhoff_tools/deployment/swarm.py +47 -12
  34. dayhoff_tools/embedders.py +28 -26
  35. dayhoff_tools/fasta.py +181 -64
  36. dayhoff_tools/warehouse.py +268 -1
  37. {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/METADATA +20 -5
  38. dayhoff_tools-1.13.12.dist-info/RECORD +54 -0
  39. {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/WHEEL +1 -1
  40. dayhoff_tools-1.1.10.dist-info/RECORD +0 -32
  41. {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/entry_points.txt +0 -0
@@ -1,7 +1,5 @@
1
+ import json
1
2
  import logging
2
- import os
3
- import shlex
4
- import shutil
5
3
  import subprocess
6
4
  from abc import ABC, abstractmethod
7
5
  from pathlib import Path
@@ -22,460 +20,274 @@ class Processor(ABC):
22
20
  return output_path
23
21
 
24
22
 
25
- class BoltzPredictor(Processor):
26
- """Processor for running Boltz docking predictions.
23
+ class InterProScanProcessor(Processor):
24
+ """Processes a single FASTA file using InterProScan and extracts target domains.
27
25
 
28
- This class wraps the Boltz docking tool to predict protein structures
29
- from sequence data.
26
+ This processor handles the analysis of protein sequences using InterProScan,
27
+ and extracts specific domains based on their InterPro accession IDs.
28
+ It maps sequence identifiers correctly using MD5 hashes from the TSV output
29
+ to handle differences in sequence ID representation between input FASTA and
30
+ InterProScan JSON output.
30
31
  """
31
32
 
32
- def __init__(self, num_workers: int, boltz_options: str | None = None):
33
- """Initialize the BoltzPredictor.
33
+ def __init__(
34
+ self,
35
+ interproscan_install_dir: str, # Path to the InterProScan installation
36
+ interproscan_temp_dir_mount: str, # Path to temporary directory for InterProScan
37
+ num_threads: int, # Number of CPU threads for InterProScan to use
38
+ output_formats: list[
39
+ str
40
+ ], # List of desired output formats (e.g., ["JSON", "TSV"])
41
+ target_iprs: set[str], # Set of InterPro IDs to extract domains for
42
+ other_interproscan_options: (
43
+ str | None
44
+ ) = None, # Additional command-line options
45
+ ):
46
+ """Initialize the InterProScanProcessor.
34
47
 
35
48
  Args:
36
- num_workers: Number of worker threads to use as a default.
37
- This can be overridden if --num_workers is present
38
- in boltz_options.
39
- boltz_options: A string containing additional command-line options
40
- to pass to the Boltz predictor. Options should be
41
- space-separated (e.g., "--option1 value1 --option2").
49
+ interproscan_install_dir: Path to the InterProScan installation directory.
50
+ interproscan_temp_dir_mount: Path to the temporary directory for InterProScan.
51
+ num_threads: Number of CPU threads for InterProScan to use.
52
+ output_formats: List of desired output formats (e.g., ["JSON", "TSV"]).
53
+ target_iprs: A set of InterPro accession IDs to extract domain sequences for.
54
+ other_interproscan_options: Additional command-line options for interproscan.sh.
42
55
  """
43
- self.num_workers = num_workers
44
- self.boltz_options = boltz_options
45
-
46
- def run(self, input_file: str) -> str:
47
- """Run Boltz prediction on the input file.
56
+ self.interproscan_sh_path = Path(interproscan_install_dir) / "interproscan.sh"
57
+ if not self.interproscan_sh_path.is_file():
58
+ raise FileNotFoundError(
59
+ f"interproscan.sh not found at {self.interproscan_sh_path}"
60
+ )
48
61
 
49
- Constructs the command using the input file, default number of workers,
50
- and any additional options provided via `boltz_options`. If `--num_workers`
51
- is specified in `boltz_options`, it overrides the default `num_workers`.
62
+ self.interproscan_temp_dir_mount = Path(interproscan_temp_dir_mount)
63
+ # Ensure the temp directory exists
64
+ self.interproscan_temp_dir_mount.mkdir(parents=True, exist_ok=True)
52
65
 
53
- Args:
54
- input_file: Path to the input file containing sequences
66
+ self.num_threads = num_threads
67
+ self.output_formats = output_formats
55
68
 
56
- Returns:
57
- Path to the output directory created by Boltz
69
+ # Ensure both JSON and TSV formats are included for domain extraction
70
+ if "JSON" not in self.output_formats:
71
+ self.output_formats.append("JSON")
72
+ if "TSV" not in self.output_formats:
73
+ self.output_formats.append("TSV")
58
74
 
59
- Raises:
60
- subprocess.CalledProcessError: If Boltz prediction fails
61
- """
62
- # Determine expected output directory name
63
- input_base = os.path.splitext(os.path.basename(input_file))[0]
64
- expected_output_dir = f"boltz_results_{input_base}"
65
- logger.info(f"Expected output directory: {expected_output_dir}")
66
-
67
- # Start building the command
68
- cmd = ["boltz", "predict", input_file]
69
-
70
- # Parse additional options if provided
71
- additional_args = []
72
- num_workers_in_opts = False
73
- if self.boltz_options:
74
- try:
75
- parsed_opts = shlex.split(self.boltz_options)
76
- additional_args.extend(parsed_opts)
77
- if "--num_workers" in parsed_opts:
78
- num_workers_in_opts = True
79
- logger.info(
80
- f"Using --num_workers from BOLTZ_OPTIONS: {self.boltz_options}"
81
- )
82
- except ValueError as e:
83
- logger.error(f"Error parsing BOLTZ_OPTIONS '{self.boltz_options}': {e}")
84
- # Decide if we should raise an error or proceed without options
85
- # For now, proceed without the additional options
86
- additional_args = [] # Clear potentially partially parsed args
87
-
88
- # Add num_workers if not specified in options
89
- if not num_workers_in_opts:
90
- logger.info(f"Using default num_workers: {self.num_workers}")
91
- cmd.extend(["--num_workers", str(self.num_workers)])
92
-
93
- # Add the parsed additional arguments
94
- cmd.extend(additional_args)
95
-
96
- # Log the final command
97
- # Use shlex.join for safer command logging, especially if paths/args have spaces
98
- try:
99
- safe_cmd_str = shlex.join(cmd)
100
- logger.info(f"Running command: {safe_cmd_str}")
101
- except AttributeError: # shlex.join is Python 3.8+
102
- logger.info(f"Running command: {' '.join(cmd)}")
103
-
104
- # Stream output in real-time
105
- process = subprocess.Popen(
106
- cmd,
107
- stdout=subprocess.PIPE,
108
- stderr=subprocess.STDOUT,
109
- text=True,
110
- bufsize=1,
75
+ self.target_iprs = target_iprs
76
+ self.other_options = (
77
+ other_interproscan_options if other_interproscan_options else ""
111
78
  )
112
79
 
113
- stdout = process.stdout
114
- if stdout:
115
- for line in iter(stdout.readline, ""):
116
- logger.info(f"BOLTZ: {line.rstrip()}")
117
-
118
- # Wait for process to complete
119
- return_code = process.wait()
120
- if return_code != 0:
121
- logger.error(f"Boltz prediction failed with exit code {return_code}")
122
- raise subprocess.CalledProcessError(return_code, cmd)
123
-
124
80
  logger.info(
125
- f"Boltz prediction completed successfully. Output in {expected_output_dir}"
81
+ f"InterProScanProcessor initialized with script: {self.interproscan_sh_path}"
126
82
  )
127
- return expected_output_dir
128
-
129
-
130
- class MMSeqsProfileProcessor(Processor):
131
- """Processor for running MMseqs2 profile searches.
132
-
133
- This class wraps the MMseqs2 workflow to perform a profile-based search
134
- against a target database using a query FASTA.
135
- """
136
-
137
- def __init__(
138
- self,
139
- query_fasta_path_in_image: str,
140
- num_threads: int = 8,
141
- mmseqs_args: dict | None = None,
142
- ):
143
- """Initialize the MMSeqsProfileProcessor.
144
-
145
- Args:
146
- query_fasta_path_in_image: Path to the query FASTA file. This path is expected
147
- to be accessible within the execution environment (e.g.,
148
- packaged in a Docker image).
149
- num_threads: Number of threads to use for MMseqs2 commands.
150
- mmseqs_args: A dictionary of additional MMseqs2 parameters.
151
- Expected keys: "memory_limit_gb", "evalue", "sensitivity",
152
- "max_seqs_search", "min_seq_id_cluster", "max_seqs_profile_msa".
153
- Defaults are used if not provided.
154
- """
155
- if not Path(query_fasta_path_in_image).is_file():
156
- raise FileNotFoundError(
157
- f"Query FASTA file not found at: {query_fasta_path_in_image}"
158
- )
159
- self.query_fasta_path = query_fasta_path_in_image
160
- self.num_threads = str(num_threads) # MMseqs2 expects string for threads
161
-
162
- default_mmseqs_args = {
163
- "memory_limit_gb": "25",
164
- "evalue": "10",
165
- "sensitivity": "7.5",
166
- "max_seqs_search": "300",
167
- "min_seq_id_cluster": "0.8",
168
- "max_seqs_profile_msa": "1000",
169
- }
170
- if mmseqs_args:
171
- self.mmseqs_args = {**default_mmseqs_args, **mmseqs_args}
172
- else:
173
- self.mmseqs_args = default_mmseqs_args
174
-
175
83
  logger.info(
176
- f"MMSeqsProfileProcessor initialized with query: {self.query_fasta_path}"
84
+ f"Temp dir mount for InterProScan: {self.interproscan_temp_dir_mount}"
177
85
  )
178
- logger.info(f"MMSeqs args: {self.mmseqs_args}")
179
- logger.info(f"Num threads: {self.num_threads}")
180
-
181
- def _run_mmseqs_command(
182
- self, command_parts: list[str], step_description: str, work_dir: Path
183
- ):
184
- """Runs an MMseqs2 command and logs its execution.
185
-
186
- Args:
187
- command_parts: A list of strings representing the command and its arguments.
188
- step_description: A human-readable description of the MMseqs2 step.
189
- work_dir: The working directory for the command.
190
-
191
- Raises:
192
- subprocess.CalledProcessError: If the MMseqs2 command returns a non-zero exit code.
193
- """
194
- full_command = " ".join(command_parts)
195
- logger.info(f"Running MMseqs2 step in {work_dir}: {step_description}")
196
- logger.info(f"Command: {full_command}")
197
- try:
198
- process = subprocess.run(
199
- command_parts,
200
- check=True,
201
- stdout=subprocess.PIPE,
202
- stderr=subprocess.PIPE,
203
- text=True,
204
- cwd=work_dir, # Run command in the specified working directory
205
- )
206
- if process.stdout:
207
- logger.info(f"MMseqs2 stdout: {process.stdout.strip()}")
208
- if process.stderr: # MMseqs often outputs informational messages to stderr
209
- logger.info(f"MMseqs2 stderr: {process.stderr.strip()}")
210
- logger.info(f"MMseqs2 step '{step_description}' completed successfully.")
211
- except subprocess.CalledProcessError as e:
212
- logger.error(f"MMseqs2 step '{step_description}' failed in {work_dir}.")
213
- if e.stdout:
214
- logger.error(f"MMseqs2 stdout: {e.stdout.strip()}")
215
- if e.stderr:
216
- logger.error(f"MMseqs2 stderr: {e.stderr.strip()}")
217
- raise
86
+ logger.info(f"Target IPRs: {self.target_iprs}")
218
87
 
219
88
  def run(self, input_file: str) -> str:
220
- """Run MMseqs2 profile search.
89
+ """Run InterProScan on the input FASTA file and extract domain sequences.
221
90
 
222
- The input_file is the target FASTA. The query FASTA is provided
223
- during initialization.
224
- The method creates an output directory (e.g., {target_stem})
225
- which contains the result files, now named meaningfully using the target stem
226
- (e.g., {target_stem}_results.m8 and {target_stem}_hits.fasta).
91
+ This method processes a FASTA file through InterProScan, extracts domains
92
+ of interest based on the target_iprs list, and writes the extracted domains
93
+ to a separate FASTA file. Domain sequences are correctly mapped using MD5 hashes
94
+ from the TSV output to handle differences in sequence ID representation.
227
95
 
228
96
  Args:
229
- input_file: Path to the input target FASTA file.
97
+ input_file: Path to the input FASTA file.
230
98
 
231
99
  Returns:
232
- Path to the output directory (e.g., {target_stem}) containing
233
- the meaningfully named result files.
234
-
235
- Raises:
236
- subprocess.CalledProcessError: If any MMseqs2 command fails.
237
- FileNotFoundError: If the input_file is not found.
100
+ Path to the output directory containing extracted domain sequences and raw results.
238
101
  """
239
- if not Path(input_file).is_file():
240
- raise FileNotFoundError(f"Input target FASTA file not found: {input_file}")
241
-
242
- input_file_path = Path(input_file).resolve() # Ensure absolute path
243
- target_fasta_filename = input_file_path.name
244
- target_fasta_stem = input_file_path.stem # Get stem for naming
245
-
246
- # Create a unique base directory for this run's outputs and temp files
247
- # This directory will be returned and subsequently uploaded by the Operator
248
- run_base_dir_name = f"{target_fasta_stem}" # Use stem as the dir name
249
- run_base_dir = Path(run_base_dir_name).resolve()
250
- run_base_dir.mkdir(parents=True, exist_ok=True)
251
- logger.info(f"Created run base directory: {run_base_dir}")
252
-
253
- # Define local paths within the run_base_dir
254
- local_target_file = run_base_dir / target_fasta_filename
255
- # Copy the target file into the run directory to keep inputs and outputs together
256
- shutil.copy(input_file_path, local_target_file)
257
- logger.info(f"Copied target file {input_file_path} to {local_target_file}")
258
-
259
- # Query file is already specified by self.query_fasta_path (path in image)
260
- local_query_file = Path(self.query_fasta_path).resolve()
261
-
262
- # Temporary directory for MMseqs2 intermediate files, created inside run_base_dir
263
- mmseqs_temp_dir = run_base_dir / "mmseqs_tmp"
264
- mmseqs_temp_dir.mkdir(parents=True, exist_ok=True)
265
- logger.info(f"Created MMseqs2 temporary directory: {mmseqs_temp_dir}")
266
-
267
- # Define INTERMEDIATE output file paths within mmseqs_temp_dir
268
- intermediate_results_m8_file = mmseqs_temp_dir / "results.m8"
269
- intermediate_hits_fasta_file = mmseqs_temp_dir / "results.fasta"
270
-
271
- # Define FINAL output file paths within run_base_dir, using target stem
272
- final_results_m8_file = run_base_dir / f"{target_fasta_stem}_results.m8"
273
- final_hits_fasta_file = run_base_dir / f"{target_fasta_stem}_hits.fasta"
274
-
275
- # --- MMseqs2 Workflow Paths (intermediate files in mmseqs_temp_dir) ---
276
- query_db = mmseqs_temp_dir / "queryDB"
277
- target_db = mmseqs_temp_dir / "targetDB"
278
- # Ensure local_target_file is used for creating targetDB
279
- target_db_input_file = local_target_file
280
-
281
- query_db_cluster = mmseqs_temp_dir / "queryDB_cluster"
282
- query_db_rep = mmseqs_temp_dir / "queryDB_rep"
283
- aln_db = mmseqs_temp_dir / "alnDB"
284
- profile_db = mmseqs_temp_dir / "profileDB"
285
- result_db = mmseqs_temp_dir / "resultDB"
286
- hits_db = mmseqs_temp_dir / "hitsDB"
102
+ from Bio import SeqIO
103
+ from Bio.Seq import Seq
287
104
 
288
- try:
289
- # 1. Create query database
290
- self._run_mmseqs_command(
291
- ["mmseqs", "createdb", str(local_query_file), str(query_db)],
292
- "Create query DB",
293
- run_base_dir, # Working directory for the command
294
- )
105
+ input_file_path = Path(input_file).resolve()
106
+ input_file_stem = input_file_path.stem
295
107
 
296
- # 2. Create target database
297
- self._run_mmseqs_command(
298
- ["mmseqs", "createdb", str(target_db_input_file), str(target_db)],
299
- "Create target DB",
300
- run_base_dir,
301
- )
108
+ # Create output directory structure
109
+ chunk_output_dir = Path(f"results_{input_file_stem}").resolve()
110
+ chunk_output_dir.mkdir(parents=True, exist_ok=True)
302
111
 
303
- # 3. Cluster query sequences
304
- self._run_mmseqs_command(
305
- [
306
- "mmseqs",
307
- "cluster",
308
- str(query_db),
309
- str(query_db_cluster),
310
- str(
311
- mmseqs_temp_dir / "tmp_cluster"
312
- ), # MMseqs needs a temp dir for cluster
313
- "--min-seq-id",
314
- self.mmseqs_args["min_seq_id_cluster"],
315
- "--threads",
316
- self.num_threads,
317
- ],
318
- "Cluster query sequences",
319
- run_base_dir,
320
- )
112
+ raw_ipr_output_dir = chunk_output_dir / "raw_ipr_output"
113
+ raw_ipr_output_dir.mkdir(parents=True, exist_ok=True)
321
114
 
322
- # 4. Create representative set from query clusters
323
- self._run_mmseqs_command(
324
- [
325
- "mmseqs",
326
- "createsubdb",
327
- str(query_db_cluster),
328
- str(query_db),
329
- str(query_db_rep),
330
- ],
331
- "Create representative query set",
332
- run_base_dir,
115
+ # --- Clean input FASTA file to remove stop codons ---
116
+ cleaned_input_file_path = (
117
+ raw_ipr_output_dir / f"{input_file_stem}_cleaned.fasta"
118
+ )
119
+ logger.info(
120
+ f"Cleaning input FASTA file: {input_file_path} to remove '*' characters."
121
+ )
122
+ cleaned_records = []
123
+ has_asterisks = False
124
+
125
+ for record in SeqIO.parse(input_file_path, "fasta"):
126
+ original_seq_str = str(record.seq)
127
+ if "*" in original_seq_str:
128
+ has_asterisks = True
129
+ cleaned_seq_str = original_seq_str.replace("*", "")
130
+ record.seq = Seq(cleaned_seq_str)
131
+ logger.debug(f"Removed '*' from sequence {record.id}")
132
+ cleaned_records.append(record)
133
+
134
+ if has_asterisks:
135
+ SeqIO.write(cleaned_records, cleaned_input_file_path, "fasta")
136
+ logger.info(f"Cleaned FASTA written to {cleaned_input_file_path}")
137
+ ipr_input_file_to_use = cleaned_input_file_path
138
+ else:
139
+ logger.info(
140
+ f"No '*' characters found in {input_file_path}. Using original."
333
141
  )
142
+ ipr_input_file_to_use = input_file_path
143
+ # --- End of cleaning ---
144
+
145
+ # Set up InterProScan output base path
146
+ ipr_output_base = raw_ipr_output_dir / input_file_stem
147
+
148
+ # Build the InterProScan command
149
+ cmd = [
150
+ str(self.interproscan_sh_path),
151
+ "-i",
152
+ str(ipr_input_file_to_use),
153
+ "-b",
154
+ str(ipr_output_base),
155
+ "-f",
156
+ ",".join(self.output_formats),
157
+ "--cpu",
158
+ str(self.num_threads),
159
+ "--tempdir",
160
+ str(self.interproscan_temp_dir_mount),
161
+ "--disable-precalc",
162
+ ]
163
+
164
+ # Add additional options if provided
165
+ if self.other_options:
166
+ cmd.extend(self.other_options.split())
167
+
168
+ # Run InterProScan
169
+ logger.info(f"Running InterProScan command: {' '.join(cmd)}")
170
+ try:
171
+ process = subprocess.run(cmd, check=True, capture_output=True, text=True)
172
+ logger.info(f"InterProScan STDOUT: {process.stdout}")
173
+ if process.stderr:
174
+ logger.info(f"InterProScan STDERR: {process.stderr}")
175
+ except subprocess.CalledProcessError as e:
176
+ logger.error(f"InterProScan failed for {input_file_path}")
177
+ logger.error(f"Return code: {e.returncode}")
178
+ logger.error(f"STDOUT: {e.stdout}")
179
+ logger.error(f"STDERR: {e.stderr}")
180
+ # Create a failure marker file
181
+ Path(chunk_output_dir / "INTERPROSCAN_FAILED.txt").touch()
182
+ return str(chunk_output_dir)
183
+
184
+ # Define paths for output files
185
+ extracted_domains_fasta_path = (
186
+ chunk_output_dir / f"{input_file_stem}_extracted_domains.fasta"
187
+ )
188
+ json_output_path = ipr_output_base.with_suffix(".json")
189
+ tsv_output_path = ipr_output_base.with_suffix(".tsv")
334
190
 
335
- # 5. Create MSA for profile generation
336
- self._run_mmseqs_command(
337
- [
338
- "mmseqs",
339
- "search",
340
- str(query_db_rep),
341
- str(query_db), # Search representative against full query DB
342
- str(aln_db),
343
- str(mmseqs_temp_dir / "tmp_search_msa"), # Temp for this search
344
- "--max-seqs",
345
- self.mmseqs_args["max_seqs_profile_msa"],
346
- "--threads",
347
- self.num_threads,
348
- ],
349
- "Create MSA for profile",
350
- run_base_dir,
191
+ # Check for required output formats
192
+ if "JSON" not in self.output_formats or not json_output_path.is_file():
193
+ logger.warning(
194
+ f"JSON output format not requested or file not found: {json_output_path}. Cannot extract domains."
351
195
  )
196
+ return str(chunk_output_dir)
352
197
 
353
- # 6. Create profile database
354
- self._run_mmseqs_command(
355
- [
356
- "mmseqs",
357
- "result2profile",
358
- str(query_db_rep), # Use query_db_rep as input for profile
359
- str(query_db), # Full query DB as second arg
360
- str(aln_db),
361
- str(profile_db),
362
- "--threads", # Added threads option
363
- self.num_threads,
364
- ],
365
- "Create profile DB",
366
- run_base_dir,
198
+ if "TSV" not in self.output_formats or not tsv_output_path.is_file():
199
+ logger.warning(
200
+ f"TSV output format not found: {tsv_output_path}. This is needed to map sequence IDs."
367
201
  )
202
+ return str(chunk_output_dir)
368
203
 
369
- # 7. Perform profile search
370
- self._run_mmseqs_command(
371
- [
372
- "mmseqs",
373
- "search",
374
- str(profile_db),
375
- str(target_db),
376
- str(result_db),
377
- str(mmseqs_temp_dir / "tmp_search_profile"), # Temp for this search
378
- "--split-memory-limit",
379
- f"{self.mmseqs_args['memory_limit_gb']}G",
380
- "-e",
381
- self.mmseqs_args["evalue"],
382
- "--max-seqs",
383
- self.mmseqs_args["max_seqs_search"],
384
- "--threads",
385
- self.num_threads,
386
- "-s",
387
- self.mmseqs_args["sensitivity"],
388
- ],
389
- "Perform profile search",
390
- run_base_dir,
204
+ # Extract domains using the JSON and TSV outputs
205
+ try:
206
+ # Create MD5 to sequence ID mapping from TSV
207
+ md5_to_id = {}
208
+ with open(tsv_output_path, "r") as f:
209
+ for line in f:
210
+ parts = line.strip().split("\t")
211
+ if len(parts) >= 3: # Ensure there are enough columns
212
+ seq_id = parts[0]
213
+ md5 = parts[1]
214
+ md5_to_id[md5] = seq_id
215
+
216
+ logger.debug(f"Created MD5 to ID mapping with {len(md5_to_id)} entries")
217
+
218
+ # Load protein sequences for coordinate mapping
219
+ protein_sequences = SeqIO.to_dict(
220
+ SeqIO.parse(ipr_input_file_to_use, "fasta")
391
221
  )
392
222
 
393
- # 8. Convert results to tabular format (M8) -> to intermediate file
394
- self._run_mmseqs_command(
395
- [
396
- "mmseqs",
397
- "convertalis",
398
- str(profile_db), # Query DB used for search (profileDB)
399
- str(target_db),
400
- str(result_db),
401
- str(intermediate_results_m8_file), # Output M8 file to temp dir
402
- "--threads",
403
- self.num_threads,
404
- ],
405
- "Convert results to M8",
406
- run_base_dir,
407
- )
223
+ # Process JSON for domain extraction
224
+ extracted_count = 0
225
+ with (
226
+ open(extracted_domains_fasta_path, "w") as f_out,
227
+ open(json_output_path, "r") as f_json,
228
+ ):
229
+ data = json.load(f_json)
230
+ if "results" not in data:
231
+ logger.info(f"No 'results' key in JSON output {json_output_path}")
232
+ return str(chunk_output_dir)
233
+
234
+ for result in data.get("results", []):
235
+ # Map sequence via MD5 hash
236
+ md5 = result.get("md5")
237
+ if not md5 or md5 not in md5_to_id:
238
+ logger.debug(f"MD5 hash not found in mapping: {md5}")
239
+ continue
240
+
241
+ protein_acc = md5_to_id[md5]
242
+ if protein_acc not in protein_sequences:
243
+ logger.debug(f"Sequence ID not found in FASTA: {protein_acc}")
244
+ continue
245
+
246
+ original_seq_record = protein_sequences[protein_acc]
247
+ for match in result.get("matches", []):
248
+ # Extract the InterPro domain entry
249
+ signature = match.get("signature", {})
250
+ entry = signature.get("entry")
251
+ if not entry or entry.get("accession") not in self.target_iprs:
252
+ continue
253
+
254
+ ipr_id = entry.get("accession")
255
+ ipr_desc = entry.get("description", "N/A").replace(" ", "_")
256
+ logger.info(
257
+ f"Found target domain {ipr_id} ({ipr_desc}) in sequence {protein_acc}"
258
+ )
259
+
260
+ for location in match.get("locations", []):
261
+ start = location.get("start")
262
+ end = location.get("end")
263
+ if start is not None and end is not None:
264
+ domain_seq_str = str(
265
+ original_seq_record.seq[start - 1 : end]
266
+ )
267
+ domain_fasta_header = f">{original_seq_record.id}|{ipr_id}|{start}-{end}|{ipr_desc}"
268
+ f_out.write(f"{domain_fasta_header}\n")
269
+ f_out.write(f"{domain_seq_str}\n")
270
+ extracted_count += 1
271
+ logger.debug(
272
+ f"Extracted domain {ipr_id} ({start}-{end}) from {protein_acc}"
273
+ )
408
274
 
409
- # 9. Create subdatabase of hits from original target_db
410
- self._run_mmseqs_command(
411
- ["mmseqs", "createsubdb", str(result_db), str(target_db), str(hits_db)],
412
- "Create hits subDB from target_db",
413
- run_base_dir,
275
+ logger.info(
276
+ f"Extracted {extracted_count} domain sequences to {extracted_domains_fasta_path}"
414
277
  )
415
278
 
416
- # 10. Convert hit sequences to FASTA -> to intermediate file
417
- self._run_mmseqs_command(
418
- [
419
- "mmseqs",
420
- "convert2fasta",
421
- str(hits_db),
422
- str(intermediate_hits_fasta_file),
423
- ],
424
- "Convert hits to FASTA",
425
- run_base_dir,
279
+ except FileNotFoundError:
280
+ logger.error(
281
+ f"Input FASTA file {ipr_input_file_to_use} not found during domain extraction."
426
282
  )
283
+ except json.JSONDecodeError:
284
+ logger.error(f"Error decoding JSON from {json_output_path}.")
285
+ except Exception as e:
286
+ logger.error(f"Error during domain extraction: {e}", exc_info=True)
427
287
 
428
- logger.info(
429
- f"MMseqs2 workflow completed successfully. Intermediate outputs in {mmseqs_temp_dir}"
430
- )
288
+ # Clean up if the input file was a temporary one
289
+ if has_asterisks and cleaned_input_file_path != input_file_path:
290
+ if cleaned_input_file_path.exists():
291
+ cleaned_input_file_path.unlink()
431
292
 
432
- # Move and rename final output files from mmseqs_temp_dir to run_base_dir
433
- if intermediate_results_m8_file.exists():
434
- shutil.move(
435
- str(intermediate_results_m8_file), str(final_results_m8_file)
436
- )
437
- logger.info(f"Moved and renamed M8 results to {final_results_m8_file}")
438
- else:
439
- logger.warning(
440
- f"Intermediate M8 file {intermediate_results_m8_file} not found. Creating empty target file."
441
- )
442
- final_results_m8_file.touch() # Create empty file in run_base_dir if not found
443
-
444
- if intermediate_hits_fasta_file.exists():
445
- shutil.move(
446
- str(intermediate_hits_fasta_file), str(final_hits_fasta_file)
447
- )
448
- logger.info(f"Moved and renamed hits FASTA to {final_hits_fasta_file}")
449
- else:
450
- logger.warning(
451
- f"Intermediate hits FASTA {intermediate_hits_fasta_file} not found. Creating empty target file."
452
- )
453
- final_hits_fasta_file.touch() # Create empty file in run_base_dir if not found
454
-
455
- finally:
456
- # Clean up the MMseqs2 temporary directory (mmseqs_tmp) which contains intermediate DBs etc.
457
- if mmseqs_temp_dir.exists():
458
- shutil.rmtree(mmseqs_temp_dir)
459
- logger.info(
460
- f"Cleaned up MMseqs2 temporary directory: {mmseqs_temp_dir}"
461
- )
462
-
463
- # Clean up the copied input file (local_target_file) from the run_base_dir
464
- # so it does not get uploaded with the results.
465
- if local_target_file.exists():
466
- try:
467
- local_target_file.unlink()
468
- logger.info(
469
- f"Cleaned up copied input file from run directory: {local_target_file}"
470
- )
471
- except OSError as e:
472
- logger.error(
473
- f"Error deleting copied input file {local_target_file}: {e}"
474
- )
475
-
476
- # The run_base_dir (containing only the final, meaningfully named output files)
477
- # will be cleaned up by the Operator after its contents are uploaded.
478
-
479
- return str(
480
- run_base_dir
481
- ) # Return the path to the directory containing meaningfully named results
293
+ return str(chunk_output_dir)