dayhoff-tools 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools/deployment/processors.py +305 -1
- {dayhoff_tools-1.1.5.dist-info → dayhoff_tools-1.1.6.dist-info}/METADATA +1 -1
- {dayhoff_tools-1.1.5.dist-info → dayhoff_tools-1.1.6.dist-info}/RECORD +5 -5
- {dayhoff_tools-1.1.5.dist-info → dayhoff_tools-1.1.6.dist-info}/WHEEL +0 -0
- {dayhoff_tools-1.1.5.dist-info → dayhoff_tools-1.1.6.dist-info}/entry_points.txt +0 -0
@@ -1,8 +1,10 @@
|
|
1
1
|
import logging
|
2
2
|
import os
|
3
|
-
import subprocess
|
4
3
|
import shlex
|
4
|
+
import shutil
|
5
|
+
import subprocess
|
5
6
|
from abc import ABC, abstractmethod
|
7
|
+
from pathlib import Path
|
6
8
|
|
7
9
|
# Module-level logger named after this module; the processors in this file log through it.
logger = logging.getLogger(__name__)
|
8
10
|
|
@@ -123,3 +125,305 @@ class BoltzPredictor(Processor):
|
|
123
125
|
f"Boltz prediction completed successfully. Output in {expected_output_dir}"
|
124
126
|
)
|
125
127
|
return expected_output_dir
|
128
|
+
|
129
|
+
|
130
|
+
class MMSeqsProfileProcessor(Processor):
    """Processor for running MMseqs2 profile searches.

    This class wraps the MMseqs2 workflow to perform a profile-based search
    against a target database using a query FASTA. The query sequences are
    clustered, a profile is built from the cluster representatives, and that
    profile is searched against the target FASTA passed to :meth:`run`.
    """

    def __init__(
        self,
        query_fasta_path_in_image: str,
        num_threads: int = 8,
        mmseqs_args: dict | None = None,
    ):
        """Initialize the MMSeqsProfileProcessor.

        Args:
            query_fasta_path_in_image: Path to the query FASTA file. This path is expected
                to be accessible within the execution environment (e.g.,
                packaged in a Docker image).
            num_threads: Number of threads to use for MMseqs2 commands.
            mmseqs_args: A dictionary of additional MMseqs2 parameters.
                Expected keys: "memory_limit_gb", "evalue", "sensitivity",
                "max_seqs_search", "min_seq_id_cluster", "max_seqs_profile_msa".
                Defaults are used for any key not provided. Values may be
                strings or numbers; they are converted to strings when the
                command line is built.

        Raises:
            FileNotFoundError: If the query FASTA file does not exist.
        """
        if not Path(query_fasta_path_in_image).is_file():
            raise FileNotFoundError(
                f"Query FASTA file not found at: {query_fasta_path_in_image}"
            )
        self.query_fasta_path = query_fasta_path_in_image
        self.num_threads = str(num_threads)  # MMseqs2 expects string for threads

        default_mmseqs_args = {
            "memory_limit_gb": "30",
            "evalue": "10",
            "sensitivity": "7.5",
            "max_seqs_search": "300",
            "min_seq_id_cluster": "0.8",
            "max_seqs_profile_msa": "1000",
        }
        if mmseqs_args:
            # Caller-supplied values override the defaults key by key.
            self.mmseqs_args = {**default_mmseqs_args, **mmseqs_args}
        else:
            self.mmseqs_args = default_mmseqs_args

        logger.info(
            f"MMSeqsProfileProcessor initialized with query: {self.query_fasta_path}"
        )
        logger.info(f"MMSeqs args: {self.mmseqs_args}")
        logger.info(f"Num threads: {self.num_threads}")

    def _run_mmseqs_command(
        self, command_parts: list[str], step_description: str, work_dir: Path
    ):
        """Runs an MMseqs2 command and logs its execution.

        Args:
            command_parts: A list representing the command and its arguments.
                Non-string elements (e.g. numbers coming from ``mmseqs_args``
                or ``Path`` objects) are converted with ``str()``.
            step_description: A human-readable description of the MMseqs2 step.
            work_dir: The working directory for the command.

        Raises:
            subprocess.CalledProcessError: If the MMseqs2 command returns a non-zero exit code.
        """
        # subprocess.run and str.join both reject ints/floats, so normalise the
        # argv once here; this lets callers pass numeric mmseqs_args values.
        argv = [str(part) for part in command_parts]
        # shlex.join quotes arguments containing spaces, so the logged command
        # can be copy-pasted into a shell unambiguously.
        full_command = shlex.join(argv)
        logger.info(f"Running MMseqs2 step in {work_dir}: {step_description}")
        logger.info(f"Command: {full_command}")
        try:
            process = subprocess.run(
                argv,
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                cwd=work_dir,  # Run command in the specified working directory
            )
            if process.stdout:
                logger.info(f"MMseqs2 stdout: {process.stdout.strip()}")
            if process.stderr:  # MMseqs often outputs informational messages to stderr
                logger.info(f"MMseqs2 stderr: {process.stderr.strip()}")
            logger.info(f"MMseqs2 step '{step_description}' completed successfully.")
        except subprocess.CalledProcessError as e:
            logger.error(f"MMseqs2 step '{step_description}' failed in {work_dir}.")
            if e.stdout:
                logger.error(f"MMseqs2 stdout: {e.stdout.strip()}")
            if e.stderr:
                logger.error(f"MMseqs2 stderr: {e.stderr.strip()}")
            raise

    def run(self, input_file: str) -> str:
        """Run MMseqs2 profile search.

        The input_file is the target FASTA. The query FASTA is provided
        during initialization.

        Args:
            input_file: Path to the input target FASTA file.

        Returns:
            Path to the output directory containing results.m8 and results.fasta
            (plus a copy of the target FASTA).

        Raises:
            subprocess.CalledProcessError: If any MMseqs2 command fails.
            FileNotFoundError: If the input_file is not found.
        """
        if not Path(input_file).is_file():
            raise FileNotFoundError(f"Input target FASTA file not found: {input_file}")

        input_file_path = Path(input_file).resolve()  # Ensure absolute path
        target_fasta_filename = input_file_path.name

        # Base directory for this run's outputs and temp files, created under
        # the current working directory. The name is derived from the target
        # file's stem, so it is NOT unique across runs on same-named inputs:
        # an existing directory is reused (exist_ok=True).
        # This directory will be returned and subsequently uploaded by the Operator
        run_base_dir_name = f"mmseqs_run_{Path(target_fasta_filename).stem}"
        run_base_dir = Path(run_base_dir_name).resolve()
        run_base_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Created run base directory: {run_base_dir}")

        # Define local paths within the run_base_dir
        local_target_file = run_base_dir / target_fasta_filename
        # Copy the target file into the run directory to keep inputs and outputs together.
        # shutil.copy raises SameFileError when source and destination are the
        # same file (e.g. the input already lives in the run directory), so
        # skip the copy in that case.
        if local_target_file.resolve() != input_file_path:
            shutil.copy(input_file_path, local_target_file)
            logger.info(f"Copied target file {input_file_path} to {local_target_file}")
        else:
            logger.info(
                f"Target file {input_file_path} is already in the run directory; skipping copy"
            )

        # Query file is already specified by self.query_fasta_path (path in image)
        local_query_file = Path(self.query_fasta_path).resolve()

        # Temporary directory for MMseqs2 intermediate files, created inside run_base_dir
        mmseqs_temp_dir = run_base_dir / "mmseqs_tmp"
        mmseqs_temp_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Created MMseqs2 temporary directory: {mmseqs_temp_dir}")

        # Define output file paths directly within run_base_dir
        local_results_m8_file = run_base_dir / "results.m8"
        local_hits_fasta_file = run_base_dir / "results.fasta"

        # --- MMseqs2 Workflow Paths (intermediate files in mmseqs_temp_dir) ---
        query_db = mmseqs_temp_dir / "queryDB"
        target_db = mmseqs_temp_dir / "targetDB"
        # Ensure local_target_file is used for creating targetDB
        target_db_input_file = local_target_file

        query_db_cluster = mmseqs_temp_dir / "queryDB_cluster"
        query_db_rep = mmseqs_temp_dir / "queryDB_rep"
        aln_db = mmseqs_temp_dir / "alnDB"
        profile_db = mmseqs_temp_dir / "profileDB"
        result_db = mmseqs_temp_dir / "resultDB"
        hits_db = mmseqs_temp_dir / "hitsDB"

        try:
            # 1. Create query database
            self._run_mmseqs_command(
                ["mmseqs", "createdb", str(local_query_file), str(query_db)],
                "Create query DB",
                run_base_dir,  # Working directory for the command
            )

            # 2. Create target database
            self._run_mmseqs_command(
                ["mmseqs", "createdb", str(target_db_input_file), str(target_db)],
                "Create target DB",
                run_base_dir,
            )

            # 3. Cluster query sequences
            self._run_mmseqs_command(
                [
                    "mmseqs",
                    "cluster",
                    str(query_db),
                    str(query_db_cluster),
                    str(
                        mmseqs_temp_dir / "tmp_cluster"
                    ),  # MMseqs needs a temp dir for cluster
                    "--min-seq-id",
                    self.mmseqs_args["min_seq_id_cluster"],
                    "--threads",
                    self.num_threads,
                ],
                "Cluster query sequences",
                run_base_dir,
            )

            # 4. Create representative set from query clusters
            self._run_mmseqs_command(
                [
                    "mmseqs",
                    "createsubdb",
                    str(query_db_cluster),
                    str(query_db),
                    str(query_db_rep),
                ],
                "Create representative query set",
                run_base_dir,
            )

            # 5. Create MSA for profile generation
            self._run_mmseqs_command(
                [
                    "mmseqs",
                    "search",
                    str(query_db_rep),
                    str(query_db),  # Search representative against full query DB
                    str(aln_db),
                    str(mmseqs_temp_dir / "tmp_search_msa"),  # Temp for this search
                    "--max-seqs",
                    self.mmseqs_args["max_seqs_profile_msa"],
                    "--threads",
                    self.num_threads,
                ],
                "Create MSA for profile",
                run_base_dir,
            )

            # 6. Create profile database
            self._run_mmseqs_command(
                [
                    "mmseqs",
                    "result2profile",
                    str(query_db_rep),  # Use query_db_rep as input for profile
                    str(query_db),  # Full query DB as second arg
                    str(aln_db),
                    str(profile_db),
                    "--threads",
                    self.num_threads,
                ],
                "Create profile DB",
                run_base_dir,
            )

            # 7. Perform profile search
            self._run_mmseqs_command(
                [
                    "mmseqs",
                    "search",
                    str(profile_db),
                    str(target_db),
                    str(result_db),
                    str(mmseqs_temp_dir / "tmp_search_profile"),  # Temp for this search
                    "--split-memory-limit",
                    f"{self.mmseqs_args['memory_limit_gb']}G",
                    "-e",
                    self.mmseqs_args["evalue"],
                    "--max-seqs",
                    self.mmseqs_args["max_seqs_search"],
                    "--threads",
                    self.num_threads,
                    "-s",
                    self.mmseqs_args["sensitivity"],
                ],
                "Perform profile search",
                run_base_dir,
            )

            # 8. Convert results to tabular format (M8)
            self._run_mmseqs_command(
                [
                    "mmseqs",
                    "convertalis",
                    str(profile_db),  # Query DB used for search (profileDB)
                    str(target_db),
                    str(result_db),
                    str(local_results_m8_file),  # Output M8 file
                    "--threads",
                    self.num_threads,
                ],
                "Convert results to M8",
                run_base_dir,
            )

            # 9. Create subdatabase of hits from original target_db
            # NOTE(review): createsubdb selects entries by the keys of its
            # first argument; result_db is presumably keyed by query/profile
            # entries rather than by target hits - confirm this extracts the
            # intended target sequences.
            self._run_mmseqs_command(
                ["mmseqs", "createsubdb", str(result_db), str(target_db), str(hits_db)],
                "Create hits subDB from target_db",
                run_base_dir,
            )

            # 10. Convert hit sequences to FASTA
            self._run_mmseqs_command(
                ["mmseqs", "convert2fasta", str(hits_db), str(local_hits_fasta_file)],
                "Convert hits to FASTA",
                run_base_dir,
            )

            logger.info(
                f"MMseqs2 workflow completed successfully. Outputs in {run_base_dir}"
            )

        finally:
            # Clean up the MMseqs2 temporary directory, whether or not the
            # workflow succeeded, so only final outputs remain in run_base_dir.
            if mmseqs_temp_dir.exists():
                shutil.rmtree(mmseqs_temp_dir)
                logger.info(
                    f"Cleaned up MMseqs2 temporary directory: {mmseqs_temp_dir}"
                )
            # The input_file (original target) is managed by the Operator
            # The local_target_file (copy inside run_base_dir) will be cleaned up
            # by the Operator when run_base_dir is deleted after upload.

        return str(run_base_dir)  # Return the path to the directory containing results
|
@@ -11,7 +11,7 @@ dayhoff_tools/deployment/deploy_aws.py,sha256=O0gQxHioSU_sNU8T8MD4wSOPvWc--V8eRR
|
|
11
11
|
dayhoff_tools/deployment/deploy_gcp.py,sha256=DxBM4sUzwPK9RWLP9bSfr38n1HHl-TVrp4TsbdN8pUA,5795
|
12
12
|
dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
|
13
13
|
dayhoff_tools/deployment/job_runner.py,sha256=4tmdplpvqSE9bVxRWHo2U5kwkYrYod0Uwzpg2Q7qG5o,4850
|
14
|
-
dayhoff_tools/deployment/processors.py,sha256=
|
14
|
+
dayhoff_tools/deployment/processors.py,sha256=yex3bWunFLuzHU6jV-b0ab4w8r-AI4qkx907ManROAg,16857
|
15
15
|
dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
|
16
16
|
dayhoff_tools/embedders.py,sha256=CRgcb2z7KeeFrRQawyUZuJ4Yi0-J5jSr0hwuRhjG_FI,36513
|
17
17
|
dayhoff_tools/fasta.py,sha256=e7xw3pInoupqCGE0-fJTOzmW_earL1M7qPyoqIPfUT4,46269
|
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
|
|
26
26
|
dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
|
27
27
|
dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
|
28
28
|
dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
|
29
|
-
dayhoff_tools-1.1.
|
30
|
-
dayhoff_tools-1.1.
|
31
|
-
dayhoff_tools-1.1.
|
32
|
-
dayhoff_tools-1.1.
|
29
|
+
dayhoff_tools-1.1.6.dist-info/METADATA,sha256=HRrnLM_xID_Tq-ksoVrDkuYVxQWniXpY0FtdhyMU5DE,2252
|
30
|
+
dayhoff_tools-1.1.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
31
|
+
dayhoff_tools-1.1.6.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
|
32
|
+
dayhoff_tools-1.1.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|