dh-cli 0.2.11__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. {dh_cli-0.2.11 → dh_cli-0.3.1}/PKG-INFO +1 -1
  2. {dh_cli-0.2.11 → dh_cli-0.3.1}/pyproject.toml +1 -1
  3. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/__init__.py +9 -0
  4. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/finalize.py +54 -0
  5. dh_cli-0.3.1/src/dh_cli/batch/commands/protmpnn.py +418 -0
  6. dh_cli-0.3.1/src/dh_cli/batch/commands/protmpnn_to_boltz.py +249 -0
  7. {dh_cli-0.2.11 → dh_cli-0.3.1}/.gitignore +0 -0
  8. {dh_cli-0.2.11 → dh_cli-0.3.1}/LICENSE +0 -0
  9. {dh_cli-0.2.11 → dh_cli-0.3.1}/README.md +0 -0
  10. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/__init__.py +0 -0
  11. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/aws_batch.py +0 -0
  12. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/__init__.py +0 -0
  13. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/boltz.py +0 -0
  14. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/cancel.py +0 -0
  15. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/clean.py +0 -0
  16. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/embed_t5.py +0 -0
  17. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/list_jobs.py +0 -0
  18. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/local.py +0 -0
  19. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/logs.py +0 -0
  20. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/retry.py +0 -0
  21. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/status.py +0 -0
  22. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/submit.py +0 -0
  23. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/train.py +0 -0
  24. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/commands/wait_for.py +0 -0
  25. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/fasta_utils.py +0 -0
  26. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/h5_utils.py +0 -0
  27. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/job_id.py +0 -0
  28. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/manifest.py +0 -0
  29. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/batch/s3_transport.py +0 -0
  30. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/cloud_commands.py +0 -0
  31. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/codeartifact.py +0 -0
  32. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/engines_studios/__init__.py +0 -0
  33. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/engines_studios/api_client.py +0 -0
  34. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/engines_studios/auth.py +0 -0
  35. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/engines_studios/engine_commands.py +0 -0
  36. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/engines_studios/progress.py +0 -0
  37. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/engines_studios/ssh_config.py +0 -0
  38. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/engines_studios/studio_commands.py +0 -0
  39. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/github_commands.py +0 -0
  40. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/main.py +0 -0
  41. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/utility_commands.py +0 -0
  42. {dh_cli-0.2.11 → dh_cli-0.3.1}/src/dh_cli/warehouse.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dh-cli
3
- Version: 0.2.11
3
+ Version: 0.3.1
4
4
  Summary: Dayhoff Labs developer CLI
5
5
  Author-email: Dayhoff Labs <dev@dayhofflabs.com>
6
6
  License: # PolyForm Noncommercial License 1.0.0
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "dh-cli"
7
- version = "0.2.11"
7
+ version = "0.3.1"
8
8
  description = "Dayhoff Labs developer CLI"
9
9
  requires-python = ">=3.11"
10
10
  readme = "README.md"
@@ -18,6 +18,8 @@ from .commands.finalize import finalize
18
18
  from .commands.list_jobs import list_jobs
19
19
  from .commands.local import local
20
20
  from .commands.logs import logs
21
+ from .commands.protmpnn import protmpnn
22
+ from .commands.protmpnn_to_boltz import protmpnn_to_boltz
21
23
  from .commands.retry import retry
22
24
  from .commands.status import status
23
25
  from .commands.submit import submit
@@ -49,6 +51,11 @@ def batch_cli():
49
51
  Structure Prediction:
50
52
  boltz Predict protein structures with Boltz
51
53
 
54
+ \b
55
+ Sequence Design:
56
+ protmpnn Design sequences with ProtMPNN/LigandMPNN
57
+ protmpnn-to-boltz Convert top variants to Boltz validation YAMLs
58
+
52
59
  \b
53
60
  ML Training:
54
61
  train Submit hatchery ML training jobs
@@ -91,6 +98,8 @@ batch_cli.add_command(wait_for, name="wait-for")
91
98
  # Register pipeline commands
92
99
  batch_cli.add_command(embed_t5, name="embed-t5")
93
100
  batch_cli.add_command(boltz)
101
+ batch_cli.add_command(protmpnn)
102
+ batch_cli.add_command(protmpnn_to_boltz, name="protmpnn-to-boltz")
94
103
  batch_cli.add_command(train)
95
104
 
96
105
  __all__ = ["batch_cli"]
@@ -145,6 +145,8 @@ def finalize(
145
145
  )
146
146
  )
147
147
  _finalize_boltz(output_dir, output_path, full_output=full_output)
148
+ elif manifest.pipeline == "protmpnn":
149
+ _finalize_protmpnn(output_dir, output_path)
148
150
  else:
149
151
  _finalize_generic(output_dir, output_path)
150
152
 
@@ -447,3 +449,55 @@ def _finalize_generic(output_dir: Path, output_path: Path):
447
449
  click.echo(f"Copying output directory to {output_path}...")
448
450
  shutil.copytree(output_dir, output_path)
449
451
  click.echo(click.style("✓ Output copied successfully", fg="green"))
452
+
453
+
454
def _finalize_protmpnn(output_dir: Path, output_path: Path):
    """Merge per-worker ProtMPNN results into final output.

    1. Merge results_worker_*.csv into results.csv (sorted by confidence)
    2. Flatten all seqs/ and pdbs/ into output seqs/ and pdbs/ dirs
    3. Print summary

    Args:
        output_dir: Job output directory containing ``results_worker_*.csv``
            files and sub-directories that may hold ``seqs/`` and ``pdbs/``.
        output_path: Destination directory; created when missing.

    Raises:
        SystemExit: With code 1 when no ``results_worker_*.csv`` exists.
    """
    import pandas as pd

    csv_files = sorted(output_dir.glob("results_worker_*.csv"))
    if not csv_files:
        click.echo("No results_worker_*.csv files found in output directory.", err=True)
        raise SystemExit(1)

    output_path.mkdir(parents=True, exist_ok=True)

    frames = [pd.read_csv(csv_file) for csv_file in csv_files]
    merged = pd.concat(frames, ignore_index=True)
    merged = merged.sort_values("overall_confidence", ascending=False)
    merged.to_csv(output_path / "results.csv", index=False)

    num_variants = len(merged)
    num_configs = merged["config_name"].nunique()

    # Path.iterdir() yields entries in arbitrary order. Sorting makes the
    # outcome reproducible when two sub-directories contain a file with the
    # same name (the last one in sorted order wins).
    config_dirs = sorted(entry for entry in output_dir.iterdir() if entry.is_dir())
    _flatten_protmpnn_files(config_dirs, "seqs", "*.fa", output_path / "seqs")
    _flatten_protmpnn_files(config_dirs, "pdbs", "*.pdb", output_path / "pdbs")

    top_conf = merged.iloc[0]["overall_confidence"] if num_variants > 0 else "N/A"

    click.echo(
        f"{num_variants} variants from {num_configs} config(s), "
        f"top confidence: {top_conf}"
    )
    click.echo(click.style(f"Results: {output_path / 'results.csv'}", fg="green"))


def _flatten_protmpnn_files(config_dirs, subdir: str, pattern: str, dest: Path):
    """Copy files matching *pattern* from every ``<config_dir>/<subdir>`` into *dest*."""
    dest.mkdir(exist_ok=True)
    for config_dir in config_dirs:
        source_dir = config_dir / subdir
        if not source_dir.exists():
            continue
        for source_file in sorted(source_dir.glob(pattern)):
            shutil.copy2(source_file, dest / source_file.name)
@@ -0,0 +1,418 @@
1
+ """ProtMPNN/LigandMPNN sequence design pipeline command."""
2
+
3
+ import math
4
+ import os
5
+ import shutil
6
+ from pathlib import Path
7
+
8
+ import click
9
+
10
+ from ..aws_batch import BatchClient, BatchError
11
+ from ..job_id import generate_job_id
12
+ from ..manifest import (
13
+ BATCH_JOBS_BASE,
14
+ BatchConfig,
15
+ InputConfig,
16
+ JobManifest,
17
+ JobStatus,
18
+ OutputConfig,
19
+ create_job_directory,
20
+ get_job_dir,
21
+ save_manifest,
22
+ )
23
+
24
# Batch queue used when --queue is not supplied on the command line.
DEFAULT_QUEUE = "t4-1x-spot"
# Upper bound applied to the automatically computed worker count.
MAX_WORKERS = 50
# Number of YAML configs per worker assumed when auto-sizing the job array.
FILES_PER_WORKER = 10
# Job definition name passed to BatchClient.submit_job and stored in the manifest.
DEFAULT_JOB_DEFINITION = "dayhoff-protmpnn"
# Container image recorded in the manifest and run by the local/shell modes.
DEFAULT_IMAGE_URI = (
    "074735440724.dkr.ecr.us-east-1.amazonaws.com/dayhoff:protmpnn-latest"
)
31
+
32
+
33
@click.command()
@click.argument("input_dir", type=click.Path(exists=True))
@click.option(
    "--workers",
    default=None,
    type=int,
    help="Number of parallel workers [default: ~1 per 10 files]",
)
@click.option(
    "--queue",
    default=DEFAULT_QUEUE,
    help=f"Batch queue [default: {DEFAULT_QUEUE}]",
)
@click.option("--dry-run", is_flag=True, help="Show plan without submitting")
@click.option(
    "--local",
    "run_local",
    is_flag=True,
    help="Force local execution via Docker",
)
@click.option(
    "--remote",
    "run_remote",
    is_flag=True,
    help="Force Batch submission (override auto-detect)",
)
@click.option(
    "--shell",
    "run_shell",
    is_flag=True,
    help="Drop into container shell for debugging",
)
@click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
def protmpnn(input_dir, workers, queue, dry_run, run_local, run_remote, run_shell, base_path):
    """Design protein sequences with ProtMPNN/LigandMPNN.

    Processes a directory of YAML config files, each specifying a PDB
    structure and design parameters. Generates variant sequences ranked
    by confidence.

    \b
    Examples:
      # Run on a GPU engine (auto-detects GPU, runs locally)
      dh batch protmpnn input/

      # Force remote Batch submission
      dh batch protmpnn input/ --remote

      # Preview what would run
      dh batch protmpnn input/ --dry-run

      # Run locally via Docker
      dh batch protmpnn input/ --local

    \b
    After remote job completes:
      dh batch status <job-id>
      dh batch finalize <job-id> --output ./results/

    \b
    YAML config format:
      version: 1
      pdb_path: 6DHI.pdb
      model_type: ligand_mpnn
      num_sequences: 20
      seed: 42
      temperature: 0.1
      fixed_residues: "A42 A181 A217 A218"
    """
    # --dry-run only describes a Batch submission. Previously it was silently
    # ignored together with --local/--shell and the container was started
    # anyway, which is the opposite of what a preview flag promises.
    if dry_run and (run_shell or run_local):
        raise click.UsageError(
            "--dry-run previews a Batch submission and cannot be combined "
            "with --local or --shell"
        )

    input_path = Path(input_dir).resolve()

    # Mode precedence: --shell, then --local, then GPU auto-detection,
    # and finally Batch submission.
    if run_shell:
        _run_shell_mode(input_path)
        return

    if run_local:
        _run_local_mode(input_path)
        return

    # Auto-detect GPU for smart defaulting; skipped for --remote and --dry-run
    # so that both always go through the Batch submission path.
    if not run_remote and not dry_run:
        if _has_local_gpu():
            click.echo("GPU detected — running locally (use --remote to override)")
            _run_local_mode(input_path)
            return

    _submit_batch_job(input_path, workers, queue, dry_run, base_path)
120
+
121
+
122
+ def _has_local_gpu() -> bool:
123
+ """Check if a local NVIDIA GPU is available."""
124
+ import subprocess
125
+
126
+ try:
127
+ result = subprocess.run(
128
+ ["nvidia-smi"],
129
+ capture_output=True,
130
+ timeout=5,
131
+ )
132
+ return result.returncode == 0
133
+ except (FileNotFoundError, subprocess.TimeoutExpired):
134
+ return False
135
+
136
+
137
+ def _count_yaml_files(input_path: Path) -> int:
138
+ return len(list(input_path.glob("*.yaml")))
139
+
140
+
141
+ def _copy_inputs_to_job_dir(input_path: Path, job_dir: Path) -> int:
142
+ """Copy input YAML and PDB files to job directory."""
143
+ input_dir = job_dir / "input"
144
+ input_dir.mkdir(parents=True, exist_ok=True)
145
+
146
+ count = 0
147
+ for yaml_file in sorted(input_path.glob("*.yaml")):
148
+ shutil.copy2(yaml_file, input_dir / yaml_file.name)
149
+ count += 1
150
+
151
+ # Copy PDB files alongside YAMLs
152
+ for pdb_file in sorted(input_path.glob("*.pdb")):
153
+ shutil.copy2(pdb_file, input_dir / pdb_file.name)
154
+
155
+ return count
156
+
157
+
158
def _submit_batch_job(
    input_path: Path,
    workers: int | None,
    queue: str,
    dry_run: bool,
    base_path: str,
):
    """Submit ProtMPNN job to AWS Batch.

    Args:
        input_path: Directory scanned for ``*.yaml`` configs.
        workers: Requested worker count, or None to derive it from the number
            of configs (FILES_PER_WORKER per worker, capped at MAX_WORKERS).
        queue: Batch queue name.
        dry_run: When true, print the plan and return without submitting.
        base_path: Base path under which the job directory is created.

    Raises:
        SystemExit: Code 1 when no YAML file is found, ``workers`` is below 1,
            or submission fails; code 0 when the user declines the prompt.
    """
    click.echo(f"Scanning {input_path} for YAML files...")
    num_files = _count_yaml_files(input_path)

    if num_files == 0:
        click.echo(
            click.style("Error: No YAML files found in input directory", fg="red"),
            err=True,
        )
        raise SystemExit(1)

    click.echo(f"Found {num_files} config(s) to process")

    if workers is None:
        workers = max(1, min(math.ceil(num_files / FILES_PER_WORKER), MAX_WORKERS))
    elif workers < 1:
        # A zero worker count would make array_size 0 and crash below with a
        # ZeroDivisionError; a negative one would be submitted as-is.
        click.echo(
            click.style("Error: --workers must be at least 1", fg="red"),
            err=True,
        )
        raise SystemExit(1)
    # Never start more array children than there are configs to process.
    array_size = min(num_files, workers)

    job_id = generate_job_id("protmpnn")

    click.echo()
    click.echo(f"Job ID: {job_id}")
    click.echo(f"Input: {input_path}")
    click.echo(f"Configs: {num_files}")
    click.echo(f"Workers: {array_size}")
    files_per_worker = math.ceil(num_files / array_size)
    click.echo(f"Files per worker: ~{files_per_worker}")
    click.echo(f"Queue: {queue}")
    click.echo(f"Job definition: {DEFAULT_JOB_DEFINITION}")

    if dry_run:
        click.echo()
        click.echo(click.style("Dry run - job not submitted", fg="yellow"))
        return

    if not click.confirm("\nSubmit job?", default=True):
        click.echo("Cancelled.")
        raise SystemExit(0)
    click.echo()

    job_dir = create_job_directory(job_id, base_path)
    click.echo(f"Created job directory: {job_dir}")

    click.echo("Copying input files...")
    copied = _copy_inputs_to_job_dir(input_path, job_dir)
    click.echo(f"Copied {copied} YAML files")

    manifest = JobManifest(
        job_id=job_id,
        user=job_id.split("-")[0],
        pipeline="protmpnn",
        status=JobStatus.PENDING,
        image_uri=DEFAULT_IMAGE_URI,
        input=InputConfig(
            source=str(input_path),
            num_sequences=num_files,
            num_chunks=array_size,
        ),
        batch=BatchConfig(
            queue=queue,
            job_definition=DEFAULT_JOB_DEFINITION,
            array_size=array_size,
        ),
        output=OutputConfig(
            destination=None,
            finalized=False,
        ),
    )

    # Persist the manifest before submitting so that a failed submission can
    # still be recorded against it.
    save_manifest(manifest, base_path)

    try:
        client = BatchClient()

        environment = {
            "JOB_DIR": str(job_dir),
            "JOB_ID": job_id,
            "BATCH_ARRAY_SIZE": str(array_size),
            "BATCH_NUM_FILES": str(num_files),
        }

        batch_job_id = client.submit_job(
            job_name=job_id,
            job_definition=DEFAULT_JOB_DEFINITION,
            job_queue=queue,
            array_size=array_size,
            environment=environment,
            timeout_seconds=1 * 3600,  # 1 hour
            retry_attempts=5,
        )

        manifest.status = JobStatus.SUBMITTED
        manifest.batch.job_id = batch_job_id
        save_manifest(manifest, base_path)

        click.echo()
        click.echo(click.style("Job submitted successfully!", fg="green"))
        click.echo()
        click.echo(f"AWS Batch Job ID: {batch_job_id}")
        click.echo()
        click.echo("Next steps:")
        click.echo(f" Check status: dh batch status {job_id}")
        click.echo(f" View logs: dh batch logs {job_id}")
        click.echo(f" Cancel: dh batch cancel {job_id}")
        click.echo()
        click.echo("After completion:")
        click.echo(
            f" Finalize: dh batch finalize {job_id} --output ./results/"
        )

    except BatchError as e:
        manifest.status = JobStatus.FAILED
        manifest.error_message = str(e)
        save_manifest(manifest, base_path)
        click.echo(click.style(f"Failed to submit job: {e}", fg="red"), err=True)
        raise SystemExit(1)
280
+
281
+
282
def _run_local_mode(input_path: Path):
    """Run ProtMPNN locally in a Docker container.

    Stages the YAML and PDB files from *input_path* into
    ``<input_path>/.local_protmpnn_job/input`` (wiping any previous run),
    runs the container as a single-worker job, and merges the worker CSVs
    into ``output/results.csv``.

    Raises:
        SystemExit: Code 1 when no YAML file is found or Docker is missing;
            the container's exit code when it exits non-zero.
    """
    import subprocess

    click.echo("Running ProtMPNN locally in container...")
    click.echo(f"Input directory: {input_path}")

    yaml_files = list(input_path.glob("*.yaml"))
    if not yaml_files:
        click.echo(click.style("Error: No YAML files found", fg="red"), err=True)
        raise SystemExit(1)

    click.echo(f"Found {len(yaml_files)} config file(s)")

    temp_job_dir = input_path / ".local_protmpnn_job"
    temp_input_dir = temp_job_dir / "input"
    temp_output_dir = temp_job_dir / "output"

    # Start from a clean slate so results of an earlier run cannot leak in.
    if temp_job_dir.exists():
        shutil.rmtree(temp_job_dir)

    temp_input_dir.mkdir(parents=True)
    temp_output_dir.mkdir(parents=True)

    for yaml_file in yaml_files:
        shutil.copy2(yaml_file, temp_input_dir / yaml_file.name)
    for pdb_file in input_path.glob("*.pdb"):
        shutil.copy2(pdb_file, temp_input_dir / pdb_file.name)

    click.echo(f"Output will be at: {temp_output_dir}/")
    click.echo()

    # The environment mimics a one-element Batch array so the container
    # entrypoint processes every config in a single worker.
    cmd = [
        "docker",
        "run",
        "--rm",
        "--gpus",
        "all",
        "-v",
        "/primordial:/primordial",
        "-v",
        f"{temp_job_dir}:{temp_job_dir}",
        "-e",
        f"JOB_DIR={temp_job_dir}",
        "-e",
        "AWS_BATCH_JOB_ARRAY_INDEX=0",
        "-e",
        "BATCH_ARRAY_SIZE=1",
        "-e",
        f"BATCH_NUM_FILES={len(yaml_files)}",
        DEFAULT_IMAGE_URI,
    ]

    click.echo(f"Running: {' '.join(cmd)}")
    click.echo()

    # Only the process launch is guarded: a FileNotFoundError raised later
    # (e.g. while reading CSVs) must not be misreported as a missing Docker.
    try:
        result = subprocess.run(cmd)
    except FileNotFoundError:
        click.echo(
            click.style(
                "Error: Docker not found. Is Docker installed and running?",
                fg="red",
            ),
            err=True,
        )
        raise SystemExit(1)

    if result.returncode != 0:
        click.echo(
            click.style(
                f"Container exited with code {result.returncode}", fg="red"
            ),
            err=True,
        )
        raise SystemExit(result.returncode)

    csv_files = list(temp_output_dir.glob("results_worker_*.csv"))
    if not csv_files:
        click.echo(click.style("Warning: No results CSV found", fg="yellow"))
        return

    # Merge worker CSVs into results.csv for local mode
    import pandas as pd

    dfs = [pd.read_csv(f) for f in csv_files]
    merged = pd.concat(dfs, ignore_index=True)
    merged = merged.sort_values("overall_confidence", ascending=False)
    merged.to_csv(temp_output_dir / "results.csv", index=False)

    click.echo()
    click.echo(click.style("Design complete!", fg="green"))
    click.echo(f"Results: {temp_output_dir / 'results.csv'}")
    click.echo(f" {len(merged)} variants generated")
375
+
376
+
377
def _run_shell_mode(input_path: Path):
    """Start an interactive bash session inside the ProtMPNN image.

    *input_path* is bind-mounted at ``/input`` and exported as ``JOB_DIR``.

    Raises:
        SystemExit: Code 1 when the ``docker`` executable cannot be found.
    """
    import subprocess

    click.echo("Dropping into container shell...")
    click.echo("Input will be available at: /input/")
    click.echo()

    volume_args = ["-v", "/primordial:/primordial", "-v", f"{input_path}:/input"]
    env_args = ["-e", "JOB_DIR=/input", "-e", "AWS_BATCH_JOB_ARRAY_INDEX=0"]
    cmd = [
        "docker",
        "run",
        "--rm",
        "-it",
        "--gpus",
        "all",
        *volume_args,
        *env_args,
        "--entrypoint",
        "/bin/bash",
        DEFAULT_IMAGE_URI,
    ]

    click.echo(f"Running: {' '.join(cmd)}")
    click.echo()

    try:
        subprocess.run(cmd)
    except FileNotFoundError:
        message = click.style(
            "Error: Docker not found. Is Docker installed and running?",
            fg="red",
        )
        click.echo(message, err=True)
        raise SystemExit(1)
@@ -0,0 +1,249 @@
1
+ """Convert ProtMPNN results to Boltz input YAMLs for structural validation."""
2
+
3
+ import shutil
4
+ from pathlib import Path
5
+
6
+ import click
7
+ import pandas as pd
8
+ import yaml
9
+
10
+
11
@click.command("protmpnn-to-boltz")
@click.argument("results_dir", type=click.Path(exists=True))
@click.option(
    "--top",
    default=10,
    # head(-n) would return "all but the last n" rows, so non-positive values
    # are rejected up front instead of silently producing a surprising set.
    type=click.IntRange(min=1),
    help="Number of top variants to convert",
)
@click.option(
    "--output",
    "-o",
    default=None,
    type=click.Path(),
    help="Output directory for Boltz YAMLs [default: boltz_input/]",
)
@click.option(
    "--config",
    "config_dir",
    default=None,
    type=click.Path(exists=True),
    help="Directory containing original ProtMPNN config YAMLs (for ligand_smiles)",
)
def protmpnn_to_boltz(results_dir, top, output, config_dir):
    """Convert top ProtMPNN variants to Boltz YAML configs.

    Takes a ProtMPNN results directory (containing results.csv) and generates
    Boltz-format YAML files for structural validation of the top-N variants.

    \b
    Examples:
      # Convert top 10 from local run
      dh batch protmpnn-to-boltz input/.local_protmpnn_job/output/ --output boltz_in/

      # Convert top 20, pull ligand SMILES from original configs
      dh batch protmpnn-to-boltz results/ --top 20 --config input/ -o boltz_in/

    \b
    The generated Boltz YAMLs can be used directly:
      dh batch boltz boltz_in/
      dh batch boltz --local boltz_in/
    """
    results_path = Path(results_dir).resolve()
    csv_path = results_path / "results.csv"

    if csv_path.exists():
        # results.csv is used in the row order it was written in.
        df = pd.read_csv(csv_path)
    else:
        # Try worker CSVs if results.csv not found (e.g. raw output before finalize)
        worker_csvs = sorted(results_path.glob("results_worker_*.csv"))
        if not worker_csvs:
            click.echo(
                click.style(
                    "Error: No results.csv or results_worker_*.csv found", fg="red"
                ),
                err=True,
            )
            raise SystemExit(1)
        dfs = [pd.read_csv(f) for f in worker_csvs]
        df = pd.concat(dfs, ignore_index=True)
        df = df.sort_values("overall_confidence", ascending=False)

    if len(df) == 0:
        click.echo(click.style("Error: Results CSV is empty", fg="red"), err=True)
        raise SystemExit(1)

    top_n = min(top, len(df))
    top_variants = df.head(top_n)

    # Resolve ligand SMILES from original config YAMLs
    ligand_map = _load_ligand_smiles(config_dir, results_path)

    output_path = Path(output or "boltz_input").resolve()
    output_path.mkdir(parents=True, exist_ok=True)

    click.echo(f"Converting top {top_n} variants to Boltz format...")

    generated = []
    for idx, row in top_variants.iterrows():
        config_name = row.get("config_name", "unknown")
        # Fall back to the row label when the CSV has no variant_id column.
        variant_id = int(row.get("variant_id", idx))
        sequence = row["sequence"]
        confidence = row.get("overall_confidence", float("nan"))

        boltz_yaml = _build_boltz_yaml(
            sequence=sequence,
            config_name=config_name,
            variant_id=variant_id,
            ligand_smiles=ligand_map.get(config_name),
        )

        filename = f"{config_name}_var{variant_id:03d}.yaml"
        yaml_path = output_path / filename

        with open(yaml_path, "w") as f:
            yaml.dump(boltz_yaml, f, default_flow_style=False, sort_keys=False)

        generated.append((filename, confidence))

    # Copy PDB files for reference if available
    pdbs_src = results_path / "pdbs"
    if pdbs_src.exists():
        pdbs_dest = output_path / "reference_pdbs"
        pdbs_dest.mkdir(exist_ok=True)
        for pdb in pdbs_src.glob("*.pdb"):
            shutil.copy2(pdb, pdbs_dest / pdb.name)

    # Generate PyMOL visualization script
    _write_pymol_script(output_path, results_path, generated, ligand_map)

    click.echo()
    click.echo(click.style(f"Generated {len(generated)} Boltz configs", fg="green"))
    click.echo(f"Output: {output_path}/")
    click.echo()
    click.echo("Next steps:")
    click.echo(f" dh batch boltz {output_path}/")
    click.echo(f" dh batch boltz --local {output_path}/")
123
+
124
+
125
+ def _load_ligand_smiles(
126
+ config_dir: str | None, results_path: Path
127
+ ) -> dict[str, str | None]:
128
+ """Load ligand_smiles from original ProtMPNN config YAMLs.
129
+
130
+ Searches config_dir first, then falls back to the input/ sibling
131
+ of the results directory (common in local runs).
132
+ """
133
+ smiles_map: dict[str, str | None] = {}
134
+
135
+ search_dirs = []
136
+ if config_dir:
137
+ search_dirs.append(Path(config_dir))
138
+
139
+ # For local runs: results are at input/.local_protmpnn_job/output/
140
+ # Config YAMLs are at input/
141
+ if results_path.name == "output":
142
+ job_dir = results_path.parent
143
+ input_dir = job_dir / "input"
144
+ if input_dir.exists():
145
+ search_dirs.append(input_dir)
146
+
147
+ for search_dir in search_dirs:
148
+ for yaml_file in search_dir.glob("*.yaml"):
149
+ try:
150
+ with open(yaml_file) as f:
151
+ data = yaml.safe_load(f)
152
+ if isinstance(data, dict) and data.get("ligand_smiles"):
153
+ smiles_map[yaml_file.stem] = data["ligand_smiles"]
154
+ except Exception:
155
+ continue
156
+
157
+ return smiles_map
158
+
159
+
160
+ def _build_boltz_yaml(
161
+ sequence: str,
162
+ config_name: str,
163
+ variant_id: int,
164
+ ligand_smiles: str | None = None,
165
+ ) -> dict:
166
+ """Build a Boltz-format YAML dict for a single variant."""
167
+ sequences = [
168
+ {
169
+ "protein": {
170
+ "id": "A",
171
+ "sequence": sequence,
172
+ }
173
+ }
174
+ ]
175
+
176
+ if ligand_smiles:
177
+ sequences.append(
178
+ {
179
+ "ligand": {
180
+ "id": "B",
181
+ "smiles": ligand_smiles,
182
+ }
183
+ }
184
+ )
185
+
186
+ return {
187
+ "version": 1,
188
+ "sequences": sequences,
189
+ }
190
+
191
+
192
+ def _write_pymol_script(
193
+ output_path: Path,
194
+ results_path: Path,
195
+ generated: list[tuple[str, float]],
196
+ ligand_map: dict[str, str | None],
197
+ ):
198
+ """Generate a PyMOL script for visualizing WT + variant structures.
199
+
200
+ This script is designed to be run after Boltz validation completes,
201
+ loading the predicted structures and aligning them to the WT.
202
+ """
203
+ pdbs_dir = results_path / "pdbs"
204
+ wt_pdbs = sorted(pdbs_dir.glob("*.pdb")) if pdbs_dir.exists() else []
205
+
206
+ lines = [
207
+ "# PyMOL visualization script for ProtMPNN variants",
208
+ "# Generated by: dh batch protmpnn-to-boltz",
209
+ "#",
210
+ "# Usage: pymol view_variants.pml",
211
+ "# or: pymol -r view_variants.pml",
212
+ "",
213
+ "from pymol import cmd",
214
+ "",
215
+ ]
216
+
217
+ if wt_pdbs:
218
+ wt_pdb = wt_pdbs[0]
219
+ lines.append(f'cmd.load("reference_pdbs/{wt_pdb.name}", "wildtype")')
220
+ lines.append('cmd.color("gray80", "wildtype")')
221
+ lines.append("")
222
+
223
+ lines.append("# Load variant structures after Boltz validation")
224
+ lines.append("# Boltz outputs will be in the finalized results directory")
225
+ for filename, confidence in generated:
226
+ obj_name = filename.replace(".yaml", "")
227
+ lines.append(f"# {obj_name}: confidence={confidence:.3f}")
228
+
229
+ lines.extend([
230
+ "",
231
+ "# Align all objects to wildtype",
232
+ 'for obj in cmd.get_object_list():',
233
+ ' if obj != "wildtype":',
234
+ ' cmd.align(obj, "wildtype")',
235
+ "",
236
+ "# Show cartoon representation",
237
+ "cmd.show('cartoon')",
238
+ "cmd.hide('lines')",
239
+ "",
240
+ "# Highlight mutations (after loading Boltz results)",
241
+ "# cmd.select('mutations', 'wildtype and not (same sequence as variant)')",
242
+ "",
243
+ "cmd.zoom()",
244
+ "print('Loaded variant structures. Align Boltz results manually.')",
245
+ ])
246
+
247
+ script_path = output_path / "view_variants.pml"
248
+ with open(script_path, "w") as f:
249
+ f.write("\n".join(lines) + "\n")
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes