dh-cli 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. {dh_cli-0.3.0 → dh_cli-0.3.2}/PKG-INFO +1 -1
  2. {dh_cli-0.3.0 → dh_cli-0.3.2}/pyproject.toml +1 -1
  3. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/__init__.py +9 -0
  4. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/commands/finalize.py +55 -0
  5. dh_cli-0.3.2/src/dh_cli/batch/commands/protmpnn.py +543 -0
  6. dh_cli-0.3.2/src/dh_cli/batch/commands/protmpnn_to_boltz.py +249 -0
  7. {dh_cli-0.3.0 → dh_cli-0.3.2}/.gitignore +0 -0
  8. {dh_cli-0.3.0 → dh_cli-0.3.2}/LICENSE +0 -0
  9. {dh_cli-0.3.0 → dh_cli-0.3.2}/README.md +0 -0
  10. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/__init__.py +0 -0
  11. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/aws_batch.py +0 -0
  12. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/commands/__init__.py +0 -0
  13. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/commands/boltz.py +0 -0
  14. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/commands/cancel.py +0 -0
  15. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/commands/clean.py +0 -0
  16. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/commands/embed_t5.py +0 -0
  17. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/commands/list_jobs.py +0 -0
  18. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/commands/local.py +0 -0
  19. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/commands/logs.py +0 -0
  20. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/commands/retry.py +0 -0
  21. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/commands/status.py +0 -0
  22. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/commands/submit.py +0 -0
  23. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/commands/train.py +0 -0
  24. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/commands/wait_for.py +0 -0
  25. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/fasta_utils.py +0 -0
  26. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/h5_utils.py +0 -0
  27. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/job_id.py +0 -0
  28. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/manifest.py +0 -0
  29. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/batch/s3_transport.py +0 -0
  30. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/cloud_commands.py +0 -0
  31. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/codeartifact.py +0 -0
  32. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/engines_studios/__init__.py +0 -0
  33. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/engines_studios/api_client.py +0 -0
  34. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/engines_studios/auth.py +0 -0
  35. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/engines_studios/engine_commands.py +0 -0
  36. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/engines_studios/progress.py +0 -0
  37. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/engines_studios/ssh_config.py +0 -0
  38. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/engines_studios/studio_commands.py +0 -0
  39. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/github_commands.py +0 -0
  40. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/main.py +0 -0
  41. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/utility_commands.py +0 -0
  42. {dh_cli-0.3.0 → dh_cli-0.3.2}/src/dh_cli/warehouse.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dh-cli
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Dayhoff Labs developer CLI
5
5
  Author-email: Dayhoff Labs <dev@dayhofflabs.com>
6
6
  License: # PolyForm Noncommercial License 1.0.0
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "dh-cli"
7
- version = "0.3.0"
7
+ version = "0.3.2"
8
8
  description = "Dayhoff Labs developer CLI"
9
9
  requires-python = ">=3.11"
10
10
  readme = "README.md"
@@ -18,6 +18,8 @@ from .commands.finalize import finalize
18
18
  from .commands.list_jobs import list_jobs
19
19
  from .commands.local import local
20
20
  from .commands.logs import logs
21
+ from .commands.protmpnn import protmpnn
22
+ from .commands.protmpnn_to_boltz import protmpnn_to_boltz
21
23
  from .commands.retry import retry
22
24
  from .commands.status import status
23
25
  from .commands.submit import submit
@@ -49,6 +51,11 @@ def batch_cli():
49
51
  Structure Prediction:
50
52
  boltz Predict protein structures with Boltz
51
53
 
54
+ \b
55
+ Sequence Design:
56
+ protmpnn Design sequences with ProtMPNN/LigandMPNN
57
+ protmpnn-to-boltz Convert top variants to Boltz validation YAMLs
58
+
52
59
  \b
53
60
  ML Training:
54
61
  train Submit hatchery ML training jobs
@@ -91,6 +98,8 @@ batch_cli.add_command(wait_for, name="wait-for")
91
98
  # Register pipeline commands
92
99
  batch_cli.add_command(embed_t5, name="embed-t5")
93
100
  batch_cli.add_command(boltz)
101
+ batch_cli.add_command(protmpnn)
102
+ batch_cli.add_command(protmpnn_to_boltz, name="protmpnn-to-boltz")
94
103
  batch_cli.add_command(train)
95
104
 
96
105
  __all__ = ["batch_cli"]
@@ -145,6 +145,8 @@ def finalize(
145
145
  )
146
146
  )
147
147
  _finalize_boltz(output_dir, output_path, full_output=full_output)
148
+ elif manifest.pipeline == "protmpnn":
149
+ _finalize_protmpnn(output_dir, output_path)
148
150
  else:
149
151
  _finalize_generic(output_dir, output_path)
150
152
 
@@ -447,3 +449,56 @@ def _finalize_generic(output_dir: Path, output_path: Path):
447
449
  click.echo(f"Copying output directory to {output_path}...")
448
450
  shutil.copytree(output_dir, output_path)
449
451
  click.echo(click.style("✓ Output copied successfully", fg="green"))
452
+
453
+
454
def _finalize_protmpnn(output_dir: Path, output_path: Path):
    """Merge per-worker ProtMPNN results into the final output directory.

    Steps:
      1. Concatenate every ``results_worker_*.csv`` found directly in
         ``output_dir`` and write it to ``output_path / "results.csv"``,
         sorted by ``overall_confidence`` (highest first).
      2. Flatten each sub-directory's ``seqs/*.fa`` into
         ``output_path / "seqs"`` and its ``pdbs/*.pdb`` and
         ``backbones/*.pdb`` into ``output_path / "pdbs"``.
      3. Print a one-line summary and the path of the merged CSV.

    Args:
        output_dir: Job output directory holding the worker CSVs and the
            per-config sub-directories.
        output_path: Destination directory (created if missing).

    Raises:
        SystemExit: With code 1 when no worker CSV exists or when the merged
            table lacks the ``overall_confidence`` / ``config_name`` columns.
    """
    import pandas as pd

    csv_files = sorted(output_dir.glob("results_worker_*.csv"))
    if not csv_files:
        click.echo("No results_worker_*.csv files found in output directory.", err=True)
        raise SystemExit(1)

    merged = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)

    # Fail with a readable message instead of a KeyError traceback when the
    # worker output does not have the columns the summary relies on.
    required_columns = ("overall_confidence", "config_name")
    missing = [col for col in required_columns if col not in merged.columns]
    if missing:
        click.echo(
            f"Worker CSVs are missing required column(s): {', '.join(missing)}",
            err=True,
        )
        raise SystemExit(1)

    output_path.mkdir(parents=True, exist_ok=True)

    merged = merged.sort_values("overall_confidence", ascending=False)
    merged.to_csv(output_path / "results.csv", index=False)

    num_variants = len(merged)
    num_configs = merged["config_name"].nunique()

    seqs_dest = output_path / "seqs"
    pdbs_dest = output_path / "pdbs"
    seqs_dest.mkdir(exist_ok=True)
    pdbs_dest.mkdir(exist_ok=True)

    # One pass over the sub-directories handles both sequences and structures.
    # NOTE: files are flattened by name only, so identically named files from
    # different sub-directories overwrite each other (the last one copied wins).
    for config_dir in output_dir.iterdir():
        if not config_dir.is_dir():
            continue

        config_seqs = config_dir / "seqs"
        if config_seqs.exists():
            for fa_file in config_seqs.glob("*.fa"):
                shutil.copy2(fa_file, seqs_dest / fa_file.name)

        for subdir_name in ("pdbs", "backbones"):
            config_pdbs = config_dir / subdir_name
            if config_pdbs.exists():
                for pdb_file in config_pdbs.glob("*.pdb"):
                    shutil.copy2(pdb_file, pdbs_dest / pdb_file.name)

    # After the descending sort the first row carries the best confidence.
    top_conf = merged.iloc[0]["overall_confidence"] if num_variants > 0 else "N/A"

    click.echo(
        f"{num_variants} variants from {num_configs} config(s), "
        f"top confidence: {top_conf}"
    )
    click.echo(click.style(f"Results: {output_path / 'results.csv'}", fg="green"))
@@ -0,0 +1,543 @@
1
+ """ProtMPNN/LigandMPNN sequence design pipeline command."""
2
+
3
+ import math
4
+ import os
5
+ import shutil
6
+ from pathlib import Path
7
+
8
+ import click
9
+
10
+ from ..aws_batch import BatchClient, BatchError, resolve_dependency
11
+ from ..job_id import generate_job_id, get_aws_username
12
+ from ..manifest import (
13
+ BATCH_JOBS_BASE,
14
+ BatchConfig,
15
+ InputConfig,
16
+ JobManifest,
17
+ JobStatus,
18
+ OutputConfig,
19
+ create_job_directory,
20
+ get_job_dir,
21
+ save_manifest,
22
+ )
23
+
24
# Batch queue used when --queue is not supplied.
DEFAULT_QUEUE = "t4-1x-spot"

# Upper bound applied when the worker count is chosen automatically.
MAX_WORKERS = 50

# Number of YAML configs per worker targeted by the automatic sizing.
FILES_PER_WORKER = 10

# AWS Batch job definition submitted for ProtMPNN jobs.
DEFAULT_JOB_DEFINITION = "dayhoff-protmpnn"

# Container image recorded in the manifest and run by the Docker-based modes.
DEFAULT_IMAGE_URI = "074735440724.dkr.ecr.us-east-1.amazonaws.com/dayhoff:protmpnn-latest"
31
+
32
+
33
# Option validation notes:
#   * input_dir must be a directory (a regular file can never contain configs).
#   * --workers must be >= 1; zero or negative values would otherwise produce a
#     non-positive array size further down the submission path.
#   * --auto-validate-top must be >= 0; 0 keeps validation disabled.
@click.command()
@click.argument("input_dir", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--workers",
    default=None,
    type=click.IntRange(min=1),
    help="Number of parallel workers [default: ~1 per 10 files]",
)
@click.option(
    "--queue",
    default=DEFAULT_QUEUE,
    help=f"Batch queue [default: {DEFAULT_QUEUE}]",
)
@click.option("--dry-run", is_flag=True, help="Show plan without submitting")
@click.option(
    "--local",
    "run_local",
    is_flag=True,
    help="Force local execution via Docker",
)
@click.option(
    "--remote",
    "run_remote",
    is_flag=True,
    help="Force Batch submission (override auto-detect)",
)
@click.option(
    "--shell",
    "run_shell",
    is_flag=True,
    help="Drop into container shell for debugging",
)
@click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
@click.option("--after", "after", multiple=True, help="Job ID(s) to wait for before starting")
@click.option(
    "--auto-validate-top",
    type=click.IntRange(min=0),
    default=None,
    help="Auto-submit Boltz validation for top N variants after completion",
)
def protmpnn(
    input_dir,
    workers,
    queue,
    dry_run,
    run_local,
    run_remote,
    run_shell,
    base_path,
    after,
    auto_validate_top,
):
    """Design protein sequences with ProtMPNN/LigandMPNN.

    Processes a directory of YAML config files, each specifying a PDB
    structure and design parameters. Generates variant sequences ranked
    by confidence.

    \b
    Examples:
      # Run on a GPU engine (auto-detects GPU, runs locally)
      dh batch protmpnn input/

      # Force remote Batch submission
      dh batch protmpnn input/ --remote

      # Preview what would run
      dh batch protmpnn input/ --dry-run

      # Run locally via Docker
      dh batch protmpnn input/ --local

    \b
    After remote job completes:
      dh batch status <job-id>
      dh batch finalize <job-id> --output ./results/

    \b
    YAML config format:
      version: 1
      pdb_path: 6DHI.pdb
      model_type: ligand_mpnn
      num_sequences: 20
      seed: 42
      temperature: 0.1
      fixed_residues: "A42 A181 A217 A218"
    """
    input_path = Path(input_dir).resolve()

    # Mode precedence: --shell, then --local, then GPU auto-detection, then
    # Batch submission. --local therefore wins if combined with --remote.
    if run_shell:
        _run_shell_mode(input_path)
        return

    if run_local:
        _run_local_mode(input_path, auto_validate_top, base_path)
        return

    # Auto-detect GPU for smart defaulting; skipped for --remote and --dry-run
    # so those always go through the Batch planning path.
    if not run_remote and not dry_run:
        if _has_local_gpu():
            click.echo("GPU detected — running locally (use --remote to override)")
            _run_local_mode(input_path, auto_validate_top, base_path)
            return

    _submit_batch_job(input_path, workers, queue, dry_run, base_path, after, auto_validate_top)
127
+
128
+
129
+ def _has_local_gpu() -> bool:
130
+ """Check if a local NVIDIA GPU is available."""
131
+ import subprocess
132
+
133
+ try:
134
+ result = subprocess.run(
135
+ ["nvidia-smi"],
136
+ capture_output=True,
137
+ timeout=5,
138
+ )
139
+ return result.returncode == 0
140
+ except (FileNotFoundError, subprocess.TimeoutExpired):
141
+ return False
142
+
143
+
144
+ def _count_yaml_files(input_path: Path) -> int:
145
+ return len(list(input_path.glob("*.yaml")))
146
+
147
+
148
+ def _copy_inputs_to_job_dir(input_path: Path, job_dir: Path) -> int:
149
+ """Copy input YAML and PDB files to job directory."""
150
+ input_dir = job_dir / "input"
151
+ input_dir.mkdir(parents=True, exist_ok=True)
152
+
153
+ count = 0
154
+ for yaml_file in sorted(input_path.glob("*.yaml")):
155
+ shutil.copy2(yaml_file, input_dir / yaml_file.name)
156
+ count += 1
157
+
158
+ # Copy PDB files alongside YAMLs
159
+ for pdb_file in sorted(input_path.glob("*.pdb")):
160
+ shutil.copy2(pdb_file, input_dir / pdb_file.name)
161
+
162
+ return count
163
+
164
+
165
def _submit_batch_job(
    input_path: Path,
    workers: int | None,
    queue: str,
    dry_run: bool,
    base_path: str,
    after: tuple[str, ...] = (),
    auto_validate_top: int | None = None,
):
    """Submit a ProtMPNN array job to AWS Batch.

    Counts the YAML configs in ``input_path``, sizes the array job, prints the
    plan and (unless ``dry_run``) asks for confirmation. It then creates the
    job directory, copies the inputs, writes the manifest and submits the job.

    Args:
        input_path: Directory containing the ``*.yaml`` configs (and PDBs).
        workers: Requested worker count, or None to derive it from the number
            of configs (about one per ``FILES_PER_WORKER``, capped at
            ``MAX_WORKERS``). An explicit value is not capped, but the array
            size never exceeds the number of configs.
        queue: Batch queue name.
        dry_run: Print the plan and return without creating anything.
        base_path: Base path for job data.
        after: Job IDs this job should wait for.
        auto_validate_top: When truthy, print follow-up instructions for Boltz
            validation of the top N variants.

    Raises:
        SystemExit: Code 1 when no YAML files are found, ``workers`` is below
            1, or submission fails with ``BatchError``; code 0 when the user
            declines the confirmation prompt.
    """
    click.echo(f"Scanning {input_path} for YAML files...")
    num_files = _count_yaml_files(input_path)

    if num_files == 0:
        click.echo(
            click.style("Error: No YAML files found in input directory", fg="red"),
            err=True,
        )
        raise SystemExit(1)

    click.echo(f"Found {num_files} config(s) to process")

    if workers is None:
        workers = max(1, min(math.ceil(num_files / FILES_PER_WORKER), MAX_WORKERS))
    elif workers < 1:
        # A non-positive worker count would make array_size <= 0 and crash the
        # files-per-worker division below; reject it up front instead.
        click.echo(
            click.style("Error: --workers must be at least 1", fg="red"),
            err=True,
        )
        raise SystemExit(1)
    array_size = min(num_files, workers)

    job_id = generate_job_id("protmpnn")

    click.echo()
    click.echo(f"Job ID: {job_id}")
    click.echo(f"Input: {input_path}")
    click.echo(f"Configs: {num_files}")
    click.echo(f"Workers: {array_size}")
    files_per_worker = math.ceil(num_files / array_size)
    click.echo(f"Files per worker: ~{files_per_worker}")
    click.echo(f"Queue: {queue}")
    click.echo(f"Job definition: {DEFAULT_JOB_DEFINITION}")

    if dry_run:
        click.echo()
        click.echo(click.style("Dry run - job not submitted", fg="yellow"))
        return

    if not click.confirm("\nSubmit job?", default=True):
        click.echo("Cancelled.")
        raise SystemExit(0)
    click.echo()

    job_dir = create_job_directory(job_id, base_path)
    click.echo(f"Created job directory: {job_dir}")

    click.echo("Copying input files...")
    copied = _copy_inputs_to_job_dir(input_path, job_dir)
    click.echo(f"Copied {copied} YAML files")

    manifest = JobManifest(
        job_id=job_id,
        # The user name is taken from the leading "-"-separated part of the job ID.
        user=job_id.split("-")[0],
        pipeline="protmpnn",
        status=JobStatus.PENDING,
        image_uri=DEFAULT_IMAGE_URI,
        input=InputConfig(
            source=str(input_path),
            num_sequences=num_files,
            num_chunks=array_size,
        ),
        batch=BatchConfig(
            queue=queue,
            job_definition=DEFAULT_JOB_DEFINITION,
            array_size=array_size,
        ),
        output=OutputConfig(
            destination=None,
            finalized=False,
        ),
        depends_on=list(after) if after else None,
    )

    # Persist the PENDING manifest first so a failed submission is still recorded.
    save_manifest(manifest, base_path)

    try:
        resolved = [resolve_dependency(jid, base_path) for jid in after]
        # Dependencies that resolve to None are dropped; an empty list becomes None.
        depends_on = [{"jobId": aws_id} for aws_id in resolved if aws_id is not None] or None

        client = BatchClient()

        environment = {
            "JOB_DIR": str(job_dir),
            "JOB_ID": job_id,
            "BATCH_ARRAY_SIZE": str(array_size),
            "BATCH_NUM_FILES": str(num_files),
        }

        batch_job_id = client.submit_job(
            job_name=job_id,
            job_definition=DEFAULT_JOB_DEFINITION,
            job_queue=queue,
            array_size=array_size,
            environment=environment,
            timeout_seconds=1 * 3600,  # 1 hour
            retry_attempts=5,
            depends_on=depends_on,
            share_identifier=get_aws_username(),
        )

        manifest.status = JobStatus.SUBMITTED
        manifest.batch.job_id = batch_job_id
        save_manifest(manifest, base_path)

        click.echo()
        click.echo(click.style("Job submitted successfully!", fg="green"))
        click.echo()
        click.echo(f"AWS Batch Job ID: {batch_job_id}")
        if depends_on:
            click.echo(f"Waiting on: {', '.join(after)}")
        click.echo()
        click.echo("Next steps:")
        click.echo(f" Check status: dh batch status {job_id}")
        click.echo(f" View logs: dh batch logs {job_id}")
        click.echo(f" Cancel: dh batch cancel {job_id}")
        click.echo()
        click.echo("After completion:")
        click.echo(
            f" Finalize: dh batch finalize {job_id} --output ./results/"
        )

        if auto_validate_top:
            _submit_boltz_validation(
                job_id, batch_job_id, job_dir, auto_validate_top, base_path
            )

    except BatchError as e:
        manifest.status = JobStatus.FAILED
        manifest.error_message = str(e)
        save_manifest(manifest, base_path)
        click.echo(click.style(f"Failed to submit job: {e}", fg="red"), err=True)
        raise SystemExit(1)
302
+
303
+
304
def _run_local_mode(input_path: Path, auto_validate_top: int | None = None, base_path: str = BATCH_JOBS_BASE):
    """Run ProtMPNN locally in a Docker container.

    Stages the ``*.yaml`` and ``*.pdb`` files from ``input_path`` into
    ``input_path / ".local_protmpnn_job" / "input"`` (any previous staging
    directory is deleted first). It then runs the ProtMPNN image as a single
    worker and merges the worker CSVs into ``output/results.csv``.

    Args:
        input_path: Directory containing the config YAMLs and PDB files.
        auto_validate_top: When truthy, run local Boltz validation on the top
            N variants after a successful design run.
        base_path: Accepted for signature compatibility; not used here.

    Raises:
        SystemExit: Code 1 when no YAML files are found or Docker cannot be
            launched; the container's exit code when it exits non-zero.
    """
    import subprocess

    click.echo("Running ProtMPNN locally in container...")
    click.echo(f"Input directory: {input_path}")

    yaml_files = list(input_path.glob("*.yaml"))
    if not yaml_files:
        click.echo(click.style("Error: No YAML files found", fg="red"), err=True)
        raise SystemExit(1)

    click.echo(f"Found {len(yaml_files)} config file(s)")

    temp_job_dir = input_path / ".local_protmpnn_job"
    temp_input_dir = temp_job_dir / "input"
    temp_output_dir = temp_job_dir / "output"

    # Start from a clean staging area so stale results never leak into this run.
    if temp_job_dir.exists():
        shutil.rmtree(temp_job_dir)

    temp_input_dir.mkdir(parents=True)
    temp_output_dir.mkdir(parents=True)

    for yaml_file in yaml_files:
        shutil.copy2(yaml_file, temp_input_dir / yaml_file.name)
    for pdb_file in input_path.glob("*.pdb"):
        shutil.copy2(pdb_file, temp_input_dir / pdb_file.name)

    click.echo(f"Output will be at: {temp_output_dir}/")
    click.echo()

    # The job directory is mounted at the same path inside the container, and
    # the array-job environment is set for a single worker (index 0 of 1).
    cmd = [
        "docker",
        "run",
        "--rm",
        "--gpus",
        "all",
        "-v",
        "/primordial:/primordial",
        "-v",
        f"{temp_job_dir}:{temp_job_dir}",
        "-e",
        f"JOB_DIR={temp_job_dir}",
        "-e",
        "AWS_BATCH_JOB_ARRAY_INDEX=0",
        "-e",
        "BATCH_ARRAY_SIZE=1",
        "-e",
        f"BATCH_NUM_FILES={len(yaml_files)}",
        DEFAULT_IMAGE_URI,
    ]

    click.echo(f"Running: {' '.join(cmd)}")
    click.echo()

    # Only the process launch is guarded: a FileNotFoundError raised later
    # (e.g. by the result merge or the Boltz conversion) must not be reported
    # as a missing Docker installation.
    try:
        result = subprocess.run(cmd)
    except FileNotFoundError:
        click.echo(
            click.style(
                "Error: Docker not found. Is Docker installed and running?",
                fg="red",
            ),
            err=True,
        )
        raise SystemExit(1)

    if result.returncode != 0:
        click.echo(
            click.style(
                f"Container exited with code {result.returncode}", fg="red"
            ),
            err=True,
        )
        raise SystemExit(result.returncode)

    csv_files = sorted(temp_output_dir.glob("results_worker_*.csv"))
    if not csv_files:
        click.echo(click.style("Warning: No results CSV found", fg="yellow"))
        return

    # Merge worker CSVs into results.csv for local mode
    import pandas as pd

    dfs = [pd.read_csv(f) for f in csv_files]
    merged = pd.concat(dfs, ignore_index=True)
    merged = merged.sort_values("overall_confidence", ascending=False)
    merged.to_csv(temp_output_dir / "results.csv", index=False)

    click.echo()
    click.echo(click.style("Design complete!", fg="green"))
    click.echo(f"Results: {temp_output_dir / 'results.csv'}")
    click.echo(f" {len(merged)} variants generated")

    if auto_validate_top:
        _run_local_boltz_validation(
            temp_output_dir, input_path, auto_validate_top
        )
402
+
403
+
404
def _run_shell_mode(input_path: Path):
    """Open an interactive bash shell inside the ProtMPNN container.

    ``input_path`` is mounted at ``/input`` and exposed as ``JOB_DIR``; the
    image entrypoint is replaced by ``/bin/bash``. The shell's exit status is
    not inspected.

    Raises:
        SystemExit: Code 1 when the ``docker`` executable cannot be found.
    """
    import subprocess

    click.echo("Dropping into container shell...")
    click.echo("Input will be available at: /input/")
    click.echo()

    run_flags = ["--rm", "-it", "--gpus", "all"]
    mounts = ["-v", "/primordial:/primordial", "-v", f"{input_path}:/input"]
    env_vars = ["-e", "JOB_DIR=/input", "-e", "AWS_BATCH_JOB_ARRAY_INDEX=0"]
    cmd = [
        "docker",
        "run",
        *run_flags,
        *mounts,
        *env_vars,
        "--entrypoint",
        "/bin/bash",
        DEFAULT_IMAGE_URI,
    ]

    click.echo(f"Running: {' '.join(cmd)}")
    click.echo()

    try:
        subprocess.run(cmd)
    except FileNotFoundError:
        message = click.style(
            "Error: Docker not found. Is Docker installed and running?",
            fg="red",
        )
        click.echo(message, err=True)
        raise SystemExit(1)
446
+
447
+
448
def _convert_to_boltz(results_dir: Path, config_dir: Path, top_n: int) -> Path:
    """Write Boltz YAMLs for the top variants and return their directory.

    Reads ``results_dir / "results.csv"`` when present. Otherwise it merges
    the ``results_worker_*.csv`` files and sorts them by
    ``overall_confidence`` (descending). The first ``top_n`` rows are
    converted, one YAML per row, into ``results_dir.parent / "boltz_input"``,
    and a PyMOL helper script is written alongside them.

    Raises:
        FileNotFoundError: When neither a merged nor any worker CSV exists.
    """
    from .protmpnn_to_boltz import (
        _build_boltz_yaml,
        _load_ligand_smiles,
        _write_pymol_script,
    )

    import pandas as pd
    import yaml

    merged_csv = results_dir / "results.csv"
    if merged_csv.exists():
        # An existing results.csv is used in its stored row order.
        df = pd.read_csv(merged_csv)
    else:
        worker_csvs = sorted(results_dir.glob("results_worker_*.csv"))
        if not worker_csvs:
            raise FileNotFoundError(f"No results CSV in {results_dir}")
        frames = [pd.read_csv(path) for path in worker_csvs]
        df = pd.concat(frames, ignore_index=True)
        df = df.sort_values("overall_confidence", ascending=False)

    selected = df.head(min(top_n, len(df)))
    ligand_map = _load_ligand_smiles(str(config_dir), results_dir)

    boltz_dir = results_dir.parent / "boltz_input"
    boltz_dir.mkdir(parents=True, exist_ok=True)

    generated = []
    for row_label, row in selected.iterrows():
        config_name = row.get("config_name", "unknown")
        # Fall back to the row label when the CSV carries no variant_id column.
        variant_id = int(row.get("variant_id", row_label))

        document = _build_boltz_yaml(
            sequence=row["sequence"],
            config_name=config_name,
            variant_id=variant_id,
            ligand_smiles=ligand_map.get(config_name),
        )

        filename = f"{config_name}_var{variant_id:03d}.yaml"
        with open(boltz_dir / filename, "w") as handle:
            yaml.dump(document, handle, default_flow_style=False, sort_keys=False)

        generated.append((filename, row.get("overall_confidence", float("nan"))))

    _write_pymol_script(boltz_dir, results_dir, generated, ligand_map)
    return boltz_dir
499
+
500
+
501
def _run_local_boltz_validation(
    results_dir: Path, original_input_path: Path, top_n: int
):
    """Generate Boltz configs for the top variants, then run Boltz locally.

    ``original_input_path`` is passed on as the directory searched for the
    original ProtMPNN configs. The Boltz local runner is imported lazily,
    after the conversion has finished.
    """
    click.echo()
    click.echo(f"Auto-validating top {top_n} variants with Boltz...")

    boltz_dir = _convert_to_boltz(results_dir, original_input_path, top_n)
    yaml_count = sum(1 for _ in boltz_dir.glob("*.yaml"))
    click.echo(f"Generated {yaml_count} Boltz configs at {boltz_dir}/")
    click.echo()

    from .boltz import _run_local_mode as run_boltz_locally

    run_boltz_locally(boltz_dir)
516
+
517
+
518
def _submit_boltz_validation(
    protmpnn_job_id: str,
    protmpnn_aws_job_id: str,
    job_dir: Path,
    top_n: int,
    base_path: str,
):
    """Print follow-up instructions for Boltz validation of a ProtMPNN job.

    This function only writes guidance to the console: it submits nothing and
    records nothing. ``protmpnn_aws_job_id``, ``job_dir`` and ``base_path`` are
    accepted but not used.

    NOTE(review): the printed hint refers to a ``--auto-validate-top`` option
    of ``dh batch finalize``; confirm that the finalize command provides it.
    """
    finalize_hint = f" dh batch finalize {protmpnn_job_id} --auto-validate-top {top_n}"
    manual_hint = (
        "Or manually: dh batch protmpnn-to-boltz <results_dir> --top "
        f"{top_n} && dh batch boltz <boltz_dir>"
    )

    # None entries produce the same bare newline as an argument-less echo.
    messages = (
        None,
        f"Boltz validation for top {top_n} will run after ProtMPNN finalize.",
        "After ProtMPNN completes, finalize will auto-convert and submit Boltz:",
        finalize_hint,
        None,
        manual_hint,
    )
    for message in messages:
        click.echo(message)
@@ -0,0 +1,249 @@
1
+ """Convert ProtMPNN results to Boltz input YAMLs for structural validation."""
2
+
3
+ import shutil
4
+ from pathlib import Path
5
+
6
+ import click
7
+ import pandas as pd
8
+ import yaml
9
+
10
+
11
# Option validation notes:
#   * results_dir and --config must be directories.
#   * --top must be >= 1; zero or a negative value would make DataFrame.head()
#     select nothing or "all but the last N" rows instead of the top N.
@click.command("protmpnn-to-boltz")
@click.argument("results_dir", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--top",
    default=10,
    type=click.IntRange(min=1),
    help="Number of top variants to convert",
)
@click.option(
    "--output",
    "-o",
    default=None,
    type=click.Path(),
    help="Output directory for Boltz YAMLs [default: boltz_input/]",
)
@click.option(
    "--config",
    "config_dir",
    default=None,
    type=click.Path(exists=True, file_okay=False),
    help="Directory containing original ProtMPNN config YAMLs (for ligand_smiles)",
)
def protmpnn_to_boltz(results_dir, top, output, config_dir):
    """Convert top ProtMPNN variants to Boltz YAML configs.

    Takes a ProtMPNN results directory (containing results.csv) and generates
    Boltz-format YAML files for structural validation of the top-N variants.

    \b
    Examples:
      # Convert top 10 from local run
      dh batch protmpnn-to-boltz input/.local_protmpnn_job/output/ --output boltz_in/

      # Convert top 20, pull ligand SMILES from original configs
      dh batch protmpnn-to-boltz results/ --top 20 --config input/ -o boltz_in/

    \b
    The generated Boltz YAMLs can be used directly:
      dh batch boltz boltz_in/
      dh batch boltz --local boltz_in/
    """
    results_path = Path(results_dir).resolve()
    csv_path = results_path / "results.csv"

    if csv_path.exists():
        # An existing results.csv is used in its stored row order.
        df = pd.read_csv(csv_path)
    else:
        # Try worker CSVs if results.csv not found (e.g. raw output before finalize)
        worker_csvs = sorted(results_path.glob("results_worker_*.csv"))
        if not worker_csvs:
            click.echo(
                click.style(
                    "Error: No results.csv or results_worker_*.csv found", fg="red"
                ),
                err=True,
            )
            raise SystemExit(1)
        dfs = [pd.read_csv(f) for f in worker_csvs]
        df = pd.concat(dfs, ignore_index=True)
        df = df.sort_values("overall_confidence", ascending=False)

    if len(df) == 0:
        click.echo(click.style("Error: Results CSV is empty", fg="red"), err=True)
        raise SystemExit(1)

    top_n = min(top, len(df))
    top_variants = df.head(top_n)

    # Resolve ligand SMILES from original config YAMLs
    ligand_map = _load_ligand_smiles(config_dir, results_path)

    output_path = Path(output or "boltz_input").resolve()
    output_path.mkdir(parents=True, exist_ok=True)

    click.echo(f"Converting top {top_n} variants to Boltz format...")

    generated = []
    for idx, row in top_variants.iterrows():
        config_name = row.get("config_name", "unknown")
        # Fall back to the row label when the CSV carries no variant_id column.
        variant_id = int(row.get("variant_id", idx))
        sequence = row["sequence"]
        confidence = row.get("overall_confidence", float("nan"))

        boltz_yaml = _build_boltz_yaml(
            sequence=sequence,
            config_name=config_name,
            variant_id=variant_id,
            ligand_smiles=ligand_map.get(config_name),
        )

        filename = f"{config_name}_var{variant_id:03d}.yaml"
        yaml_path = output_path / filename

        with open(yaml_path, "w", encoding="utf-8") as f:
            yaml.dump(boltz_yaml, f, default_flow_style=False, sort_keys=False)

        generated.append((filename, confidence))

    # Copy PDB files for reference if available
    pdbs_src = results_path / "pdbs"
    if pdbs_src.exists():
        pdbs_dest = output_path / "reference_pdbs"
        pdbs_dest.mkdir(exist_ok=True)
        for pdb in pdbs_src.glob("*.pdb"):
            shutil.copy2(pdb, pdbs_dest / pdb.name)

    # Generate PyMOL visualization script
    _write_pymol_script(output_path, results_path, generated, ligand_map)

    click.echo()
    click.echo(click.style(f"Generated {len(generated)} Boltz configs", fg="green"))
    click.echo(f"Output: {output_path}/")
    click.echo()
    click.echo("Next steps:")
    click.echo(f" dh batch boltz {output_path}/")
    click.echo(f" dh batch boltz --local {output_path}/")
123
+
124
+
125
+ def _load_ligand_smiles(
126
+ config_dir: str | None, results_path: Path
127
+ ) -> dict[str, str | None]:
128
+ """Load ligand_smiles from original ProtMPNN config YAMLs.
129
+
130
+ Searches config_dir first, then falls back to the input/ sibling
131
+ of the results directory (common in local runs).
132
+ """
133
+ smiles_map: dict[str, str | None] = {}
134
+
135
+ search_dirs = []
136
+ if config_dir:
137
+ search_dirs.append(Path(config_dir))
138
+
139
+ # For local runs: results are at input/.local_protmpnn_job/output/
140
+ # Config YAMLs are at input/
141
+ if results_path.name == "output":
142
+ job_dir = results_path.parent
143
+ input_dir = job_dir / "input"
144
+ if input_dir.exists():
145
+ search_dirs.append(input_dir)
146
+
147
+ for search_dir in search_dirs:
148
+ for yaml_file in search_dir.glob("*.yaml"):
149
+ try:
150
+ with open(yaml_file) as f:
151
+ data = yaml.safe_load(f)
152
+ if isinstance(data, dict) and data.get("ligand_smiles"):
153
+ smiles_map[yaml_file.stem] = data["ligand_smiles"]
154
+ except Exception:
155
+ continue
156
+
157
+ return smiles_map
158
+
159
+
160
+ def _build_boltz_yaml(
161
+ sequence: str,
162
+ config_name: str,
163
+ variant_id: int,
164
+ ligand_smiles: str | None = None,
165
+ ) -> dict:
166
+ """Build a Boltz-format YAML dict for a single variant."""
167
+ sequences = [
168
+ {
169
+ "protein": {
170
+ "id": "A",
171
+ "sequence": sequence,
172
+ }
173
+ }
174
+ ]
175
+
176
+ if ligand_smiles:
177
+ sequences.append(
178
+ {
179
+ "ligand": {
180
+ "id": "B",
181
+ "smiles": ligand_smiles,
182
+ }
183
+ }
184
+ )
185
+
186
+ return {
187
+ "version": 1,
188
+ "sequences": sequences,
189
+ }
190
+
191
+
192
+ def _write_pymol_script(
193
+ output_path: Path,
194
+ results_path: Path,
195
+ generated: list[tuple[str, float]],
196
+ ligand_map: dict[str, str | None],
197
+ ):
198
+ """Generate a PyMOL script for visualizing WT + variant structures.
199
+
200
+ This script is designed to be run after Boltz validation completes,
201
+ loading the predicted structures and aligning them to the WT.
202
+ """
203
+ pdbs_dir = results_path / "pdbs"
204
+ wt_pdbs = sorted(pdbs_dir.glob("*.pdb")) if pdbs_dir.exists() else []
205
+
206
+ lines = [
207
+ "# PyMOL visualization script for ProtMPNN variants",
208
+ "# Generated by: dh batch protmpnn-to-boltz",
209
+ "#",
210
+ "# Usage: pymol view_variants.pml",
211
+ "# or: pymol -r view_variants.pml",
212
+ "",
213
+ "from pymol import cmd",
214
+ "",
215
+ ]
216
+
217
+ if wt_pdbs:
218
+ wt_pdb = wt_pdbs[0]
219
+ lines.append(f'cmd.load("reference_pdbs/{wt_pdb.name}", "wildtype")')
220
+ lines.append('cmd.color("gray80", "wildtype")')
221
+ lines.append("")
222
+
223
+ lines.append("# Load variant structures after Boltz validation")
224
+ lines.append("# Boltz outputs will be in the finalized results directory")
225
+ for filename, confidence in generated:
226
+ obj_name = filename.replace(".yaml", "")
227
+ lines.append(f"# {obj_name}: confidence={confidence:.3f}")
228
+
229
+ lines.extend([
230
+ "",
231
+ "# Align all objects to wildtype",
232
+ 'for obj in cmd.get_object_list():',
233
+ ' if obj != "wildtype":',
234
+ ' cmd.align(obj, "wildtype")',
235
+ "",
236
+ "# Show cartoon representation",
237
+ "cmd.show('cartoon')",
238
+ "cmd.hide('lines')",
239
+ "",
240
+ "# Highlight mutations (after loading Boltz results)",
241
+ "# cmd.select('mutations', 'wildtype and not (same sequence as variant)')",
242
+ "",
243
+ "cmd.zoom()",
244
+ "print('Loaded variant structures. Align Boltz results manually.')",
245
+ ])
246
+
247
+ script_path = output_path / "view_variants.pml"
248
+ with open(script_path, "w") as f:
249
+ f.write("\n".join(lines) + "\n")
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes