dh-cli 0.2.11__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dh_cli-0.2.11 → dh_cli-0.3.1}/PKG-INFO +1 -1
- {dh_cli-0.2.11 → dh_cli-0.3.1}/pyproject.toml +1 -1
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/__init__.py +9 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/finalize.py +54 -0
- dh_cli-0.3.1/src/dh_cli/batch/commands/protmpnn.py +418 -0
- dh_cli-0.3.1/src/dh_cli/batch/commands/protmpnn_to_boltz.py +249 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/.gitignore +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/LICENSE +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/README.md +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/__init__.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/aws_batch.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/__init__.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/boltz.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/cancel.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/clean.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/embed_t5.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/list_jobs.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/local.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/logs.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/retry.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/status.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/submit.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/train.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/wait_for.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/fasta_utils.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/h5_utils.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/job_id.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/manifest.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/s3_transport.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/cloud_commands.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/codeartifact.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/engines_studios/__init__.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/engines_studios/api_client.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/engines_studios/auth.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/engines_studios/engine_commands.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/engines_studios/progress.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/engines_studios/ssh_config.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/engines_studios/studio_commands.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/github_commands.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/main.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/utility_commands.py +0 -0
- {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/warehouse.py +0 -0
|
@@ -18,6 +18,8 @@ from .commands.finalize import finalize
|
|
|
18
18
|
from .commands.list_jobs import list_jobs
|
|
19
19
|
from .commands.local import local
|
|
20
20
|
from .commands.logs import logs
|
|
21
|
+
from .commands.protmpnn import protmpnn
|
|
22
|
+
from .commands.protmpnn_to_boltz import protmpnn_to_boltz
|
|
21
23
|
from .commands.retry import retry
|
|
22
24
|
from .commands.status import status
|
|
23
25
|
from .commands.submit import submit
|
|
@@ -49,6 +51,11 @@ def batch_cli():
|
|
|
49
51
|
Structure Prediction:
|
|
50
52
|
boltz Predict protein structures with Boltz
|
|
51
53
|
|
|
54
|
+
\b
|
|
55
|
+
Sequence Design:
|
|
56
|
+
protmpnn Design sequences with ProtMPNN/LigandMPNN
|
|
57
|
+
protmpnn-to-boltz Convert top variants to Boltz validation YAMLs
|
|
58
|
+
|
|
52
59
|
\b
|
|
53
60
|
ML Training:
|
|
54
61
|
train Submit hatchery ML training jobs
|
|
@@ -91,6 +98,8 @@ batch_cli.add_command(wait_for, name="wait-for")
|
|
|
91
98
|
# Register pipeline commands
|
|
92
99
|
batch_cli.add_command(embed_t5, name="embed-t5")
|
|
93
100
|
batch_cli.add_command(boltz)
|
|
101
|
+
batch_cli.add_command(protmpnn)
|
|
102
|
+
batch_cli.add_command(protmpnn_to_boltz, name="protmpnn-to-boltz")
|
|
94
103
|
batch_cli.add_command(train)
|
|
95
104
|
|
|
96
105
|
__all__ = ["batch_cli"]
|
|
@@ -145,6 +145,8 @@ def finalize(
|
|
|
145
145
|
)
|
|
146
146
|
)
|
|
147
147
|
_finalize_boltz(output_dir, output_path, full_output=full_output)
|
|
148
|
+
elif manifest.pipeline == "protmpnn":
|
|
149
|
+
_finalize_protmpnn(output_dir, output_path)
|
|
148
150
|
else:
|
|
149
151
|
_finalize_generic(output_dir, output_path)
|
|
150
152
|
|
|
@@ -447,3 +449,55 @@ def _finalize_generic(output_dir: Path, output_path: Path):
|
|
|
447
449
|
click.echo(f"Copying output directory to {output_path}...")
|
|
448
450
|
shutil.copytree(output_dir, output_path)
|
|
449
451
|
click.echo(click.style("✓ Output copied successfully", fg="green"))
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def _finalize_protmpnn(output_dir: Path, output_path: Path):
    """Merge per-worker ProtMPNN results into final output.

    1. Merge results_worker_*.csv into results.csv (sorted by confidence)
    2. Flatten all seqs/ and pdbs/ into output seqs/ and pdbs/ dirs
    3. Print summary

    Args:
        output_dir: Directory holding ``results_worker_*.csv`` files and
            sub-directories that may each contain ``seqs/`` and ``pdbs/``.
        output_path: Destination directory; created if it does not exist.

    Raises:
        SystemExit: With code 1 when no worker CSV exists or none is readable.
    """
    import pandas as pd

    csv_files = sorted(output_dir.glob("results_worker_*.csv"))
    if not csv_files:
        click.echo("No results_worker_*.csv files found in output directory.", err=True)
        raise SystemExit(1)

    # A zero-byte worker CSV makes pandas raise EmptyDataError; skip such
    # files so one bad worker does not block finalizing the others.
    frames = []
    for csv_file in csv_files:
        try:
            frames.append(pd.read_csv(csv_file))
        except pd.errors.EmptyDataError:
            click.echo(
                f"Warning: skipping empty results file {csv_file.name}", err=True
            )
    if not frames:
        click.echo("No readable results_worker_*.csv files found.", err=True)
        raise SystemExit(1)

    merged = pd.concat(frames, ignore_index=True)
    has_confidence = "overall_confidence" in merged.columns
    if has_confidence:
        merged = merged.sort_values("overall_confidence", ascending=False)
    else:
        click.echo(
            "Warning: no 'overall_confidence' column; results left unsorted",
            err=True,
        )

    output_path.mkdir(parents=True, exist_ok=True)
    merged.to_csv(output_path / "results.csv", index=False)

    num_variants = len(merged)
    if "config_name" in merged.columns:
        num_configs = merged["config_name"].nunique()
    else:
        num_configs = "unknown"

    # Snapshot the sub-directories once and reuse the listing for both kinds.
    source_dirs = sorted(p for p in output_dir.iterdir() if p.is_dir())
    for kind, pattern in (("seqs", "*.fa"), ("pdbs", "*.pdb")):
        dest = output_path / kind
        dest.mkdir(exist_ok=True)
        copied_names: set[str] = set()
        for config_dir in source_dirs:
            kind_dir = config_dir / kind
            if not kind_dir.exists():
                continue
            for src in sorted(kind_dir.glob(pattern)):
                # Flattening loses the parent directory, so identical file
                # names from different directories overwrite each other.
                if src.name in copied_names:
                    click.echo(
                        f"Warning: {kind}/{src.name} from {config_dir.name} "
                        "overwrites a file with the same name",
                        err=True,
                    )
                copied_names.add(src.name)
                shutil.copy2(src, dest / src.name)

    if num_variants > 0 and has_confidence:
        top_conf = merged.iloc[0]["overall_confidence"]
    else:
        top_conf = "N/A"

    click.echo(
        f"{num_variants} variants from {num_configs} config(s), "
        f"top confidence: {top_conf}"
    )
    click.echo(click.style(f"Results: {output_path / 'results.csv'}", fg="green"))
|
|
@@ -0,0 +1,418 @@
|
|
|
1
|
+
"""ProtMPNN/LigandMPNN sequence design pipeline command."""
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
import os
|
|
5
|
+
import shutil
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
|
|
10
|
+
from ..aws_batch import BatchClient, BatchError
|
|
11
|
+
from ..job_id import generate_job_id
|
|
12
|
+
from ..manifest import (
|
|
13
|
+
BATCH_JOBS_BASE,
|
|
14
|
+
BatchConfig,
|
|
15
|
+
InputConfig,
|
|
16
|
+
JobManifest,
|
|
17
|
+
JobStatus,
|
|
18
|
+
OutputConfig,
|
|
19
|
+
create_job_directory,
|
|
20
|
+
get_job_dir,
|
|
21
|
+
save_manifest,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
# Batch queue used when --queue is not supplied.
DEFAULT_QUEUE = "t4-1x-spot"

# Worker auto-sizing: roughly one worker per FILES_PER_WORKER configs,
# capped at MAX_WORKERS.
FILES_PER_WORKER = 10
MAX_WORKERS = 50

# Job definition name and container image used for ProtMPNN runs.
DEFAULT_JOB_DEFINITION = "dayhoff-protmpnn"
DEFAULT_IMAGE_URI = "074735440724.dkr.ecr.us-east-1.amazonaws.com/dayhoff:protmpnn-latest"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@click.command()
@click.argument("input_dir", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--workers",
    default=None,
    type=click.IntRange(min=1),
    help="Number of parallel workers [default: ~1 per 10 files]",
)
@click.option(
    "--queue",
    default=DEFAULT_QUEUE,
    help=f"Batch queue [default: {DEFAULT_QUEUE}]",
)
@click.option("--dry-run", is_flag=True, help="Show plan without submitting")
@click.option(
    "--local",
    "run_local",
    is_flag=True,
    help="Force local execution via Docker",
)
@click.option(
    "--remote",
    "run_remote",
    is_flag=True,
    help="Force Batch submission (override auto-detect)",
)
@click.option(
    "--shell",
    "run_shell",
    is_flag=True,
    help="Drop into container shell for debugging",
)
@click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
def protmpnn(input_dir, workers, queue, dry_run, run_local, run_remote, run_shell, base_path):
    """Design protein sequences with ProtMPNN/LigandMPNN.

    Processes a directory of YAML config files, each specifying a PDB
    structure and design parameters. Generates variant sequences ranked
    by confidence.

    \b
    Examples:
      # Run on a GPU engine (auto-detects GPU, runs locally)
      dh batch protmpnn input/

      # Force remote Batch submission
      dh batch protmpnn input/ --remote

      # Preview what would run
      dh batch protmpnn input/ --dry-run

      # Run locally via Docker
      dh batch protmpnn input/ --local

    \b
    After remote job completes:
      dh batch status <job-id>
      dh batch finalize <job-id> --output ./results/

    \b
    YAML config format:
      version: 1
      pdb_path: 6DHI.pdb
      model_type: ligand_mpnn
      num_sequences: 20
      seed: 42
      temperature: 0.1
      fixed_residues: "A42 A181 A217 A218"
    """
    # Mode resolution: --shell, then --local, then GPU auto-detect (unless
    # --remote or --dry-run), otherwise Batch submission.
    # Reject contradictory flags instead of silently picking one of them.
    if sum((run_local, run_remote, run_shell)) > 1:
        raise click.UsageError(
            "--local, --remote and --shell are mutually exclusive"
        )
    # The local and shell paths have no preview mode; without this guard a
    # "dry run" would actually start the container.
    if dry_run and (run_local or run_shell):
        raise click.UsageError(
            "--dry-run only applies to Batch submission and cannot be "
            "combined with --local or --shell"
        )

    input_path = Path(input_dir).resolve()

    if run_shell:
        _run_shell_mode(input_path)
        return

    if run_local:
        _run_local_mode(input_path)
        return

    # Auto-detect GPU for smart defaulting
    if not run_remote and not dry_run:
        if _has_local_gpu():
            click.echo("GPU detected — running locally (use --remote to override)")
            _run_local_mode(input_path)
            return

    _submit_batch_job(input_path, workers, queue, dry_run, base_path)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _has_local_gpu() -> bool:
|
|
123
|
+
"""Check if a local NVIDIA GPU is available."""
|
|
124
|
+
import subprocess
|
|
125
|
+
|
|
126
|
+
try:
|
|
127
|
+
result = subprocess.run(
|
|
128
|
+
["nvidia-smi"],
|
|
129
|
+
capture_output=True,
|
|
130
|
+
timeout=5,
|
|
131
|
+
)
|
|
132
|
+
return result.returncode == 0
|
|
133
|
+
except (FileNotFoundError, subprocess.TimeoutExpired):
|
|
134
|
+
return False
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _count_yaml_files(input_path: Path) -> int:
|
|
138
|
+
return len(list(input_path.glob("*.yaml")))
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _copy_inputs_to_job_dir(input_path: Path, job_dir: Path) -> int:
|
|
142
|
+
"""Copy input YAML and PDB files to job directory."""
|
|
143
|
+
input_dir = job_dir / "input"
|
|
144
|
+
input_dir.mkdir(parents=True, exist_ok=True)
|
|
145
|
+
|
|
146
|
+
count = 0
|
|
147
|
+
for yaml_file in sorted(input_path.glob("*.yaml")):
|
|
148
|
+
shutil.copy2(yaml_file, input_dir / yaml_file.name)
|
|
149
|
+
count += 1
|
|
150
|
+
|
|
151
|
+
# Copy PDB files alongside YAMLs
|
|
152
|
+
for pdb_file in sorted(input_path.glob("*.pdb")):
|
|
153
|
+
shutil.copy2(pdb_file, input_dir / pdb_file.name)
|
|
154
|
+
|
|
155
|
+
return count
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _submit_batch_job(
    input_path: Path,
    workers: int | None,
    queue: str,
    dry_run: bool,
    base_path: str,
):
    """Submit ProtMPNN job to AWS Batch.

    Counts the YAML configs in *input_path*, sizes the job array, prints the
    plan, and (unless *dry_run*) asks for confirmation, copies the inputs
    into a new job directory, writes the manifest, and submits the job.

    Args:
        input_path: Directory containing the YAML configs and PDB files.
        workers: Requested number of workers, or None to derive it from the
            number of configs (one per FILES_PER_WORKER, capped at MAX_WORKERS).
        queue: Batch queue to submit to.
        dry_run: When True, print the plan and return without side effects.
        base_path: Base path for job data.

    Raises:
        SystemExit: Code 1 when no YAML files are found, *workers* is below 1,
            or submission fails; code 0 when the user declines to submit.
    """
    click.echo(f"Scanning {input_path} for YAML files...")
    num_files = _count_yaml_files(input_path)

    if num_files == 0:
        click.echo(
            click.style("Error: No YAML files found in input directory", fg="red"),
            err=True,
        )
        raise SystemExit(1)

    click.echo(f"Found {num_files} config(s) to process")

    if workers is None:
        workers = max(1, min(math.ceil(num_files / FILES_PER_WORKER), MAX_WORKERS))
    elif workers < 1:
        # A zero or negative array size would divide by zero below.
        click.echo(
            click.style("Error: --workers must be at least 1", fg="red"),
            err=True,
        )
        raise SystemExit(1)

    # Never start more workers than there are configs to process.
    array_size = min(num_files, workers)

    job_id = generate_job_id("protmpnn")

    click.echo()
    click.echo(f"Job ID: {job_id}")
    click.echo(f"Input: {input_path}")
    click.echo(f"Configs: {num_files}")
    click.echo(f"Workers: {array_size}")
    files_per_worker = math.ceil(num_files / array_size)
    click.echo(f"Files per worker: ~{files_per_worker}")
    click.echo(f"Queue: {queue}")
    click.echo(f"Job definition: {DEFAULT_JOB_DEFINITION}")

    if dry_run:
        click.echo()
        click.echo(click.style("Dry run - job not submitted", fg="yellow"))
        return

    if not click.confirm("\nSubmit job?", default=True):
        click.echo("Cancelled.")
        raise SystemExit(0)
    click.echo()

    job_dir = create_job_directory(job_id, base_path)
    click.echo(f"Created job directory: {job_dir}")

    click.echo("Copying input files...")
    copied = _copy_inputs_to_job_dir(input_path, job_dir)
    click.echo(f"Copied {copied} YAML files")

    manifest = JobManifest(
        job_id=job_id,
        user=job_id.split("-")[0],
        pipeline="protmpnn",
        status=JobStatus.PENDING,
        image_uri=DEFAULT_IMAGE_URI,
        input=InputConfig(
            source=str(input_path),
            num_sequences=num_files,
            num_chunks=array_size,
        ),
        batch=BatchConfig(
            queue=queue,
            job_definition=DEFAULT_JOB_DEFINITION,
            array_size=array_size,
        ),
        output=OutputConfig(
            destination=None,
            finalized=False,
        ),
    )

    # Persist the PENDING manifest first so a failed submission is recorded.
    save_manifest(manifest, base_path)

    try:
        client = BatchClient()

        environment = {
            "JOB_DIR": str(job_dir),
            "JOB_ID": job_id,
            "BATCH_ARRAY_SIZE": str(array_size),
            "BATCH_NUM_FILES": str(num_files),
        }

        batch_job_id = client.submit_job(
            job_name=job_id,
            job_definition=DEFAULT_JOB_DEFINITION,
            job_queue=queue,
            array_size=array_size,
            environment=environment,
            timeout_seconds=1 * 3600,  # 1 hour
            retry_attempts=5,
        )

        manifest.status = JobStatus.SUBMITTED
        manifest.batch.job_id = batch_job_id
        save_manifest(manifest, base_path)

        click.echo()
        click.echo(click.style("Job submitted successfully!", fg="green"))
        click.echo()
        click.echo(f"AWS Batch Job ID: {batch_job_id}")
        click.echo()
        click.echo("Next steps:")
        click.echo(f" Check status: dh batch status {job_id}")
        click.echo(f" View logs: dh batch logs {job_id}")
        click.echo(f" Cancel: dh batch cancel {job_id}")
        click.echo()
        click.echo("After completion:")
        click.echo(
            f" Finalize: dh batch finalize {job_id} --output ./results/"
        )

    except BatchError as e:
        manifest.status = JobStatus.FAILED
        manifest.error_message = str(e)
        save_manifest(manifest, base_path)
        click.echo(click.style(f"Failed to submit job: {e}", fg="red"), err=True)
        raise SystemExit(1)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _run_local_mode(input_path: Path):
    """Run ProtMPNN locally in a Docker container.

    Copies the YAML and PDB inputs into a fresh
    ``<input_path>/.local_protmpnn_job`` directory, runs the ProtMPNN image
    as a single worker against it, then merges any ``results_worker_*.csv``
    files into ``output/results.csv`` sorted by ``overall_confidence``.

    Args:
        input_path: Directory containing the YAML configs and PDB files.

    Raises:
        SystemExit: Code 1 when no YAML files exist or the ``docker``
            executable cannot be found; the container's exit code when it
            exits non-zero.
    """
    import subprocess

    click.echo("Running ProtMPNN locally in container...")
    click.echo(f"Input directory: {input_path}")

    yaml_files = list(input_path.glob("*.yaml"))
    if not yaml_files:
        click.echo(click.style("Error: No YAML files found", fg="red"), err=True)
        raise SystemExit(1)

    click.echo(f"Found {len(yaml_files)} config file(s)")

    temp_job_dir = input_path / ".local_protmpnn_job"
    temp_input_dir = temp_job_dir / "input"
    temp_output_dir = temp_job_dir / "output"

    # Start from a clean job directory so stale outputs never leak into a run.
    if temp_job_dir.exists():
        shutil.rmtree(temp_job_dir)

    temp_input_dir.mkdir(parents=True)
    temp_output_dir.mkdir(parents=True)

    for yaml_file in yaml_files:
        shutil.copy2(yaml_file, temp_input_dir / yaml_file.name)
    for pdb_file in input_path.glob("*.pdb"):
        shutil.copy2(pdb_file, temp_input_dir / pdb_file.name)

    click.echo(f"Output will be at: {temp_output_dir}/")
    click.echo()

    cmd = [
        "docker",
        "run",
        "--rm",
        "--gpus",
        "all",
        "-v",
        "/primordial:/primordial",
        "-v",
        f"{temp_job_dir}:{temp_job_dir}",
        "-e",
        f"JOB_DIR={temp_job_dir}",
        "-e",
        "AWS_BATCH_JOB_ARRAY_INDEX=0",
        "-e",
        "BATCH_ARRAY_SIZE=1",
        "-e",
        f"BATCH_NUM_FILES={len(yaml_files)}",
        DEFAULT_IMAGE_URI,
    ]

    click.echo(f"Running: {' '.join(cmd)}")
    click.echo()

    # Only the launch is guarded: a FileNotFoundError here means the docker
    # executable is missing. Errors from result processing must not be
    # reported as "Docker not found".
    try:
        result = subprocess.run(cmd)
    except FileNotFoundError:
        click.echo(
            click.style(
                "Error: Docker not found. Is Docker installed and running?",
                fg="red",
            ),
            err=True,
        )
        raise SystemExit(1)

    if result.returncode != 0:
        click.echo(
            click.style(
                f"Container exited with code {result.returncode}", fg="red"
            ),
            err=True,
        )
        raise SystemExit(result.returncode)

    # Sorted so the merge order does not depend on directory listing order.
    csv_files = sorted(temp_output_dir.glob("results_worker_*.csv"))
    if not csv_files:
        click.echo(click.style("Warning: No results CSV found", fg="yellow"))
        return

    # Merge worker CSVs into results.csv for local mode
    import pandas as pd

    dfs = [pd.read_csv(f) for f in csv_files]
    merged = pd.concat(dfs, ignore_index=True)
    merged = merged.sort_values("overall_confidence", ascending=False)
    merged.to_csv(temp_output_dir / "results.csv", index=False)

    click.echo()
    click.echo(click.style("Design complete!", fg="green"))
    click.echo(f"Results: {temp_output_dir / 'results.csv'}")
    click.echo(f" {len(merged)} variants generated")
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _run_shell_mode(input_path: Path):
    """Open an interactive bash shell in the ProtMPNN container for debugging.

    *input_path* is bind-mounted at ``/input`` and ``JOB_DIR`` points at it.
    Exits with status 1 if the ``docker`` executable cannot be found.
    """
    import subprocess

    click.echo("Dropping into container shell...")
    click.echo("Input will be available at: /input/")
    click.echo()

    mounts = ["-v", "/primordial:/primordial", "-v", f"{input_path}:/input"]
    env_vars = ["-e", "JOB_DIR=/input", "-e", "AWS_BATCH_JOB_ARRAY_INDEX=0"]
    cmd = (
        ["docker", "run", "--rm", "-it", "--gpus", "all"]
        + mounts
        + env_vars
        + ["--entrypoint", "/bin/bash", DEFAULT_IMAGE_URI]
    )

    click.echo(f"Running: {' '.join(cmd)}")
    click.echo()

    try:
        subprocess.run(cmd)
    except FileNotFoundError:
        message = click.style(
            "Error: Docker not found. Is Docker installed and running?",
            fg="red",
        )
        click.echo(message, err=True)
        raise SystemExit(1)
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""Convert ProtMPNN results to Boltz input YAMLs for structural validation."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import yaml
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@click.command("protmpnn-to-boltz")
@click.argument("results_dir", type=click.Path(exists=True))
@click.option(
    "--top",
    default=10,
    type=click.IntRange(min=1),
    help="Number of top variants to convert",
)
@click.option(
    "--output",
    "-o",
    default=None,
    type=click.Path(),
    help="Output directory for Boltz YAMLs [default: boltz_input/]",
)
@click.option(
    "--config",
    "config_dir",
    default=None,
    type=click.Path(exists=True),
    help="Directory containing original ProtMPNN config YAMLs (for ligand_smiles)",
)
def protmpnn_to_boltz(results_dir, top, output, config_dir):
    """Convert top ProtMPNN variants to Boltz YAML configs.

    Takes a ProtMPNN results directory (containing results.csv) and generates
    Boltz-format YAML files for structural validation of the top-N variants.

    \b
    Examples:
      # Convert top 10 from local run
      dh batch protmpnn-to-boltz input/.local_protmpnn_job/output/ --output boltz_in/

      # Convert top 20, pull ligand SMILES from original configs
      dh batch protmpnn-to-boltz results/ --top 20 --config input/ -o boltz_in/

    \b
    The generated Boltz YAMLs can be used directly:
      dh batch boltz boltz_in/
      dh batch boltz --local boltz_in/
    """
    # --top is restricted to >= 1: DataFrame.head() with a negative count
    # returns "all but the last n rows", which is never what the user meant.
    results_path = Path(results_dir).resolve()
    csv_path = results_path / "results.csv"

    if not csv_path.exists():
        # Try worker CSVs if results.csv not found (e.g. raw output before finalize)
        worker_csvs = sorted(results_path.glob("results_worker_*.csv"))
        if worker_csvs:
            dfs = [pd.read_csv(f) for f in worker_csvs]
            df = pd.concat(dfs, ignore_index=True)
            df = df.sort_values("overall_confidence", ascending=False)
        else:
            click.echo(
                click.style(
                    "Error: No results.csv or results_worker_*.csv found", fg="red"
                ),
                err=True,
            )
            raise SystemExit(1)
    else:
        # results.csv is taken in file order; it is not re-sorted here.
        df = pd.read_csv(csv_path)

    if len(df) == 0:
        click.echo(click.style("Error: Results CSV is empty", fg="red"), err=True)
        raise SystemExit(1)

    top_n = min(top, len(df))
    top_variants = df.head(top_n)

    # Resolve ligand SMILES from original config YAMLs
    ligand_map = _load_ligand_smiles(config_dir, results_path)

    output_path = Path(output or "boltz_input").resolve()
    output_path.mkdir(parents=True, exist_ok=True)

    click.echo(f"Converting top {top_n} variants to Boltz format...")

    generated = []
    for idx, row in top_variants.iterrows():
        config_name = row.get("config_name", "unknown")
        variant_id = int(row.get("variant_id", idx))
        sequence = row["sequence"]
        confidence = row.get("overall_confidence", float("nan"))

        boltz_yaml = _build_boltz_yaml(
            sequence=sequence,
            config_name=config_name,
            variant_id=variant_id,
            ligand_smiles=ligand_map.get(config_name),
        )

        filename = f"{config_name}_var{variant_id:03d}.yaml"
        yaml_path = output_path / filename

        with open(yaml_path, "w", encoding="utf-8") as f:
            yaml.dump(boltz_yaml, f, default_flow_style=False, sort_keys=False)

        generated.append((filename, confidence))

    # Copy PDB files for reference if available
    pdbs_src = results_path / "pdbs"
    if pdbs_src.exists():
        pdbs_dest = output_path / "reference_pdbs"
        pdbs_dest.mkdir(exist_ok=True)
        for pdb in pdbs_src.glob("*.pdb"):
            shutil.copy2(pdb, pdbs_dest / pdb.name)

    # Generate PyMOL visualization script
    _write_pymol_script(output_path, results_path, generated, ligand_map)

    click.echo()
    click.echo(click.style(f"Generated {len(generated)} Boltz configs", fg="green"))
    click.echo(f"Output: {output_path}/")
    click.echo()
    click.echo("Next steps:")
    click.echo(f" dh batch boltz {output_path}/")
    click.echo(f" dh batch boltz --local {output_path}/")
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _load_ligand_smiles(
    config_dir: str | None, results_path: Path
) -> dict[str, str | None]:
    """Load ligand_smiles from original ProtMPNN config YAMLs.

    Searches config_dir first, then falls back to the input/ sibling
    of the results directory (common in local runs). When the same config
    name appears in both places, the entry from config_dir wins.

    Args:
        config_dir: Optional directory of original config YAMLs.
        results_path: Results directory; when it is named ``output`` its
            sibling ``input`` directory is also searched.

    Returns:
        Mapping of YAML file stem to its non-empty ``ligand_smiles`` value.
        Configs without one are omitted.
    """
    search_dirs: list[Path] = []
    if config_dir:
        search_dirs.append(Path(config_dir))

    # For local runs the results directory is <job_dir>/output and the
    # staged config YAMLs are next to it in <job_dir>/input.
    if results_path.name == "output":
        input_dir = results_path.parent / "input"
        if input_dir.exists():
            search_dirs.append(input_dir)

    smiles_map: dict[str, str | None] = {}
    for search_dir in search_dirs:
        for yaml_file in sorted(search_dir.glob("*.yaml")):
            # An earlier search dir takes precedence; never overwrite it.
            if yaml_file.stem in smiles_map:
                continue
            try:
                with open(yaml_file, encoding="utf-8") as f:
                    data = yaml.safe_load(f)
            except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
                # Best effort: an unreadable config only costs its ligand,
                # but say so instead of dropping it silently.
                click.echo(
                    f"Warning: could not read {yaml_file}: {exc}", err=True
                )
                continue
            if isinstance(data, dict) and data.get("ligand_smiles"):
                smiles_map[yaml_file.stem] = data["ligand_smiles"]

    return smiles_map
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _build_boltz_yaml(
|
|
161
|
+
sequence: str,
|
|
162
|
+
config_name: str,
|
|
163
|
+
variant_id: int,
|
|
164
|
+
ligand_smiles: str | None = None,
|
|
165
|
+
) -> dict:
|
|
166
|
+
"""Build a Boltz-format YAML dict for a single variant."""
|
|
167
|
+
sequences = [
|
|
168
|
+
{
|
|
169
|
+
"protein": {
|
|
170
|
+
"id": "A",
|
|
171
|
+
"sequence": sequence,
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
]
|
|
175
|
+
|
|
176
|
+
if ligand_smiles:
|
|
177
|
+
sequences.append(
|
|
178
|
+
{
|
|
179
|
+
"ligand": {
|
|
180
|
+
"id": "B",
|
|
181
|
+
"smiles": ligand_smiles,
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
return {
|
|
187
|
+
"version": 1,
|
|
188
|
+
"sequences": sequences,
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _write_pymol_script(
|
|
193
|
+
output_path: Path,
|
|
194
|
+
results_path: Path,
|
|
195
|
+
generated: list[tuple[str, float]],
|
|
196
|
+
ligand_map: dict[str, str | None],
|
|
197
|
+
):
|
|
198
|
+
"""Generate a PyMOL script for visualizing WT + variant structures.
|
|
199
|
+
|
|
200
|
+
This script is designed to be run after Boltz validation completes,
|
|
201
|
+
loading the predicted structures and aligning them to the WT.
|
|
202
|
+
"""
|
|
203
|
+
pdbs_dir = results_path / "pdbs"
|
|
204
|
+
wt_pdbs = sorted(pdbs_dir.glob("*.pdb")) if pdbs_dir.exists() else []
|
|
205
|
+
|
|
206
|
+
lines = [
|
|
207
|
+
"# PyMOL visualization script for ProtMPNN variants",
|
|
208
|
+
"# Generated by: dh batch protmpnn-to-boltz",
|
|
209
|
+
"#",
|
|
210
|
+
"# Usage: pymol view_variants.pml",
|
|
211
|
+
"# or: pymol -r view_variants.pml",
|
|
212
|
+
"",
|
|
213
|
+
"from pymol import cmd",
|
|
214
|
+
"",
|
|
215
|
+
]
|
|
216
|
+
|
|
217
|
+
if wt_pdbs:
|
|
218
|
+
wt_pdb = wt_pdbs[0]
|
|
219
|
+
lines.append(f'cmd.load("reference_pdbs/{wt_pdb.name}", "wildtype")')
|
|
220
|
+
lines.append('cmd.color("gray80", "wildtype")')
|
|
221
|
+
lines.append("")
|
|
222
|
+
|
|
223
|
+
lines.append("# Load variant structures after Boltz validation")
|
|
224
|
+
lines.append("# Boltz outputs will be in the finalized results directory")
|
|
225
|
+
for filename, confidence in generated:
|
|
226
|
+
obj_name = filename.replace(".yaml", "")
|
|
227
|
+
lines.append(f"# {obj_name}: confidence={confidence:.3f}")
|
|
228
|
+
|
|
229
|
+
lines.extend([
|
|
230
|
+
"",
|
|
231
|
+
"# Align all objects to wildtype",
|
|
232
|
+
'for obj in cmd.get_object_list():',
|
|
233
|
+
' if obj != "wildtype":',
|
|
234
|
+
' cmd.align(obj, "wildtype")',
|
|
235
|
+
"",
|
|
236
|
+
"# Show cartoon representation",
|
|
237
|
+
"cmd.show('cartoon')",
|
|
238
|
+
"cmd.hide('lines')",
|
|
239
|
+
"",
|
|
240
|
+
"# Highlight mutations (after loading Boltz results)",
|
|
241
|
+
"# cmd.select('mutations', 'wildtype and not (same sequence as variant)')",
|
|
242
|
+
"",
|
|
243
|
+
"cmd.zoom()",
|
|
244
|
+
"print('Loaded variant structures. Align Boltz results manually.')",
|
|
245
|
+
])
|
|
246
|
+
|
|
247
|
+
script_path = output_path / "view_variants.pml"
|
|
248
|
+
with open(script_path, "w") as f:
|
|
249
|
+
f.write("\n".join(lines) + "\n")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|