celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/tools/structure.py ADDED
@@ -0,0 +1,882 @@
1
+ """
2
+ Structure prediction tools: AlphaFold fetch, docking, MD simulation, FEP, binding sites.
3
+
4
+ ternary_predict and batch_screen require the TernaryPred sister project
5
+ (github.com/celltype/TernaryPred) to be installed locally. All other tools
6
+ work standalone.
7
+ """
8
+
9
import json
import os
import subprocess
import sys
from pathlib import Path

from ct.tools import registry
from ct.tools.http_client import request
15
+
16
# Location of the TernaryPred checkout used by ternary_predict / batch_screen.
# Defaults to ~/Projects/CellType/TernaryPred; set the TERNARYPRED_DIR
# environment variable to point at an installation somewhere else.
TERNARYPRED_DIR = Path(
    os.environ.get("TERNARYPRED_DIR")
    or Path.home() / "Projects" / "CellType" / "TernaryPred"
).expanduser()
17
+
18
+
19
@registry.register(
    name="structure.ternary_predict",
    description="Predict ternary complex structure (E3 ligase + compound + target) using DeepTernary (requires TernaryPred installation)",
    category="structure",
    parameters={
        "smiles": "Compound SMILES string",
        "target_pdb": "Path to target protein PDB",
        "e3": "E3 ligase: CRBN or VHL",
    },
    usage_guide="You want to predict how a molecular glue or PROTAC forms a ternary complex between E3 ligase and target. Use when you have a compound SMILES and target structure. Requires TernaryPred installation.",
)
def ternary_predict(smiles: str, target_pdb: str, e3: str = "CRBN",
                    name: str = "prediction", **kwargs) -> dict:
    """Predict a ternary complex by running the TernaryPred DeepTernary script.

    Args:
        smiles: Compound SMILES string, passed to the script as ``--smiles``.
        target_pdb: Path to the target protein PDB, passed as ``--target``.
        e3: E3 ligase name, ``"CRBN"`` or ``"VHL"`` (case-insensitive).
        name: Label for the prediction, passed as ``--name``.
        **kwargs: Accepted for registry compatibility; not used here.

    Returns:
        On success a dict with ``summary``, ``stdout`` and ``e3``. On failure
        (TernaryPred missing, unknown ligase, timeout, non-zero exit) a dict
        with ``error`` and ``summary``.
    """
    script = TERNARYPRED_DIR / "scripts" / "predict_deepternary.py"
    if not script.exists():
        return {
            "error": (
                f"TernaryPred not installed. This tool requires the TernaryPred sister project.\n"
                f"Install: git clone git@github.com:celltype/TernaryPred.git {TERNARYPRED_DIR}"
            ),
            "summary": "Ternary prediction unavailable — TernaryPred not installed",
        }

    # Reject unknown ligases explicitly: previously anything other than the
    # exact string "CRBN" (including "crbn") silently used the VHL structure.
    e3_structures = {"CRBN": "crbn_5fqd", "VHL": "vhl_4w9h"}
    e3_key = str(e3).strip().upper()
    if e3_key not in e3_structures:
        message = f"Unknown E3 ligase '{e3}'. Choose from: {', '.join(e3_structures)}"
        return {"error": message, "summary": message}
    e3_path = TERNARYPRED_DIR / "data" / "e3_structures" / f"{e3_structures[e3_key]}.pdb"

    cmd = [
        sys.executable, str(script), "single",
        "--smiles", smiles,
        "--e3", str(e3_path),
        "--target", target_pdb,
        "--name", name,
        "--device", "cuda",
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    except subprocess.TimeoutExpired:
        # Report a timeout the same way as every other failure instead of
        # letting the exception escape to the caller.
        message = "DeepTernary timed out after 120s"
        return {"error": message, "summary": message}

    if result.returncode != 0:
        message = f"DeepTernary failed: {result.stderr[:500]}"
        return {"error": message, "summary": message}
    return {
        "summary": f"Ternary complex predicted for {name}",
        "stdout": result.stdout,
        "e3": e3_key,
    }
63
+
64
+
65
@registry.register(
    name="structure.batch_screen",
    description="Screen compounds against a protein target panel for ternary compatibility (requires TernaryPred installation)",
    category="structure",
    parameters={
        "compounds_csv": "CSV with compound_id and smiles columns",
        "targets_csv": "CSV with target_id and structure_path columns",
        "e3": "E3 ligase: CRBN or VHL",
    },
    usage_guide="You need to screen many compounds against many targets for ternary complex formation. Use for large-scale virtual screening campaigns. Long-running — launches background process.",
)
def batch_screen(compounds_csv: str, targets_csv: str, e3: str = "CRBN",
                 max_compounds: int = None, max_targets: int = None, **kwargs) -> dict:
    """Launch a background TernaryPred batch screening run.

    Args:
        compounds_csv: CSV file passed to the script as ``--compounds``.
        targets_csv: CSV file passed to the script as ``--targets``.
        e3: E3 ligase name, ``"CRBN"`` or ``"VHL"`` (case-insensitive).
        max_compounds: Optional cap forwarded as ``--max-compounds``.
        max_targets: Optional cap forwarded as ``--max-targets``.
        **kwargs: Accepted for registry compatibility; not used here.

    Returns:
        A dict with ``summary``, ``pid``, ``note`` and ``log`` (path of the
        file receiving the child's output, or ``None`` if it could not be
        created), or a dict with ``error`` and ``summary`` on failure.
    """
    script = TERNARYPRED_DIR / "scripts" / "predict_deepternary.py"
    if not script.exists():
        return {
            "error": (
                f"TernaryPred not installed. This tool requires the TernaryPred sister project.\n"
                f"Install: git clone git@github.com:celltype/TernaryPred.git {TERNARYPRED_DIR}"
            ),
            "summary": "Batch screening unavailable — TernaryPred not installed",
        }

    # Reject unknown ligases explicitly: previously anything other than the
    # exact string "CRBN" silently used the VHL structure.
    e3_structures = {"CRBN": "crbn_5fqd", "VHL": "vhl_4w9h"}
    e3_key = str(e3).strip().upper()
    if e3_key not in e3_structures:
        message = f"Unknown E3 ligase '{e3}'. Choose from: {', '.join(e3_structures)}"
        return {"error": message, "summary": message}
    e3_path = TERNARYPRED_DIR / "data" / "e3_structures" / f"{e3_structures[e3_key]}.pdb"

    outdir = TERNARYPRED_DIR / "predictions" / "ternary_complexes"
    cmd = [
        sys.executable, str(script), "batch",
        "--compounds", compounds_csv,
        "--targets", targets_csv,
        "--e3", str(e3_path),
        "--outdir", str(outdir),
        "--resume",
    ]
    if max_compounds:
        cmd += ["--max-compounds", str(max_compounds)]
    if max_targets:
        cmd += ["--max-targets", str(max_targets)]

    # The child is never waited on, so its output must not go to a pipe: an
    # unread pipe fills up and blocks the child. Send it to a log file
    # instead, falling back to discarding it if the file cannot be created.
    log_path = outdir / "batch_screen.log"
    try:
        outdir.mkdir(parents=True, exist_ok=True)
        log_file = open(log_path, "ab")
    except OSError:
        log_file = None
        log_path = None

    try:
        process = subprocess.Popen(
            cmd,
            stdout=log_file if log_file is not None else subprocess.DEVNULL,
            stderr=subprocess.STDOUT if log_file is not None else subprocess.DEVNULL,
        )
    finally:
        # The child holds its own copy of the descriptor; ours can be closed.
        if log_file is not None:
            log_file.close()

    return {
        "summary": f"Batch screening started (PID: {process.pid})",
        "pid": process.pid,
        "note": "Long-running process. Check TernaryPred/predictions/ for results.",
        "log": str(log_path) if log_path is not None else None,
    }
112
+
113
+
114
@registry.register(
    name="structure.alphafold_fetch",
    description="Download AlphaFold predicted structure for a protein",
    category="structure",
    parameters={"uniprot_id": "UniProt ID"},
    usage_guide="You need a 3D structure for a target protein and no experimental structure is available. Fetches AlphaFold prediction. Use before ternary_predict or structure analysis.",
)
def alphafold_fetch(uniprot_id: str, **kwargs) -> dict:
    """Download (or reuse a cached copy of) the AlphaFold model for a protein.

    Args:
        uniprot_id: UniProt accession. Surrounding whitespace is stripped and
            the value is upper-cased before use.
        **kwargs: Accepted for registry compatibility; not used here.

    Returns:
        On success a dict with ``summary``, ``path`` (file under
        ``~/.ct/cache/alphafold``) and ``cached``. On failure a dict with
        ``error`` and ``summary``.
    """
    import re

    # The identifier is interpolated into both a file name and a URL, so only
    # plain accession-like strings are accepted. This blocks path separators
    # and other characters that could escape the cache directory.
    accession = str(uniprot_id).strip().upper()
    if not re.fullmatch(r"[A-Z0-9]+(?:-[0-9]+)?", accession):
        message = f"Invalid UniProt ID: {uniprot_id!r}"
        return {"error": message, "summary": message}

    cache_dir = Path.home() / ".ct" / "cache" / "alphafold"
    cache_dir.mkdir(parents=True, exist_ok=True)

    file_name = f"AF-{accession}-F1-model_v4.pdb"
    output_path = cache_dir / file_name

    if output_path.exists():
        return {
            "summary": f"AlphaFold structure for {accession} (cached)",
            "path": str(output_path),
            "cached": True,
        }

    url = f"https://alphafold.ebi.ac.uk/files/{file_name}"
    response, error = request(
        "GET",
        url,
        timeout=30,
        retries=2,
        raise_for_status=False,
    )
    if error:
        message = f"Failed to fetch AlphaFold structure: {error}"
        return {"error": message, "summary": message}

    # A 200 response is only trusted if the body starts with a HEADER record.
    if response.status_code == 200 and response.text.startswith("HEADER"):
        output_path.write_text(response.text)
        return {
            "summary": f"Downloaded AlphaFold structure for {accession}",
            "path": str(output_path),
            "cached": False,
        }

    message = f"AlphaFold structure not available for {accession} (HTTP {response.status_code})"
    return {"error": message, "summary": message}
154
@registry.register(
    name="structure.compound_3d",
    description="Generate 3D conformer from SMILES and save as SDF",
    category="structure",
    parameters={"smiles": "SMILES string", "output_path": "Output SDF path"},
    usage_guide="You need a 3D structure for a small molecule compound. Use before docking or ternary complex prediction when you only have a SMILES string.",
)
def compound_3d(smiles: str, output_path: str = None, **kwargs) -> dict:
    """Generate a 3D conformer for a compound and optionally write it as SDF.

    Args:
        smiles: SMILES string of the compound.
        output_path: If given, the conformer is written to this SDF path.
        **kwargs: Accepted for registry compatibility; not used here.

    Returns:
        On success a dict with ``summary``, ``smiles``, ``n_atoms`` and
        ``output_path``. On failure (RDKit missing, invalid SMILES, embedding
        failed) a dict with ``error`` and ``summary``.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import AllChem, Descriptors
    except ImportError:
        message = "RDKit is required for 3D conformer generation (pip install rdkit)"
        return {"error": message, "summary": message}

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {"error": f"Invalid SMILES: {smiles}", "summary": f"Invalid SMILES: {smiles}"}
    mol = Chem.AddHs(mol)

    if AllChem.EmbedMolecule(mol, AllChem.ETKDGv3()) != 0:
        # Retry with a fixed seed and random starting coordinates. The seed
        # has to be set on the parameter object: the params overload of
        # EmbedMolecule does not take a separate randomSeed argument.
        retry_params = AllChem.ETKDGv3()
        retry_params.randomSeed = 42
        retry_params.useRandomCoords = True
        if AllChem.EmbedMolecule(mol, retry_params) != 0:
            message = f"Could not generate a 3D conformer for: {smiles}"
            return {"error": message, "summary": message}
    AllChem.MMFFOptimizeMolecule(mol)

    if output_path:
        writer = Chem.SDWriter(str(output_path))
        try:
            writer.write(mol)
        finally:
            writer.close()

    return {
        "summary": f"3D conformer generated ({Descriptors.MolWt(mol):.1f} Da, {mol.GetNumAtoms()} atoms)",
        "smiles": smiles,
        "n_atoms": mol.GetNumAtoms(),
        "output_path": output_path,
    }
186
+
187
+
188
+ # ---------------------------------------------------------------------------
189
+ # Docking
190
+ # ---------------------------------------------------------------------------
191
+
192
+
193
+ def _resolve_pdb(target_pdb: str) -> dict:
194
+ """Resolve a target identifier to a local PDB file path.
195
+
196
+ Accepts either a local file path or a UniProt ID. When a UniProt ID is
197
+ given the AlphaFold structure is fetched first via *alphafold_fetch*.
198
+
199
+ Returns ``{"path": str}`` on success or ``{"error": str}`` on failure.
200
+ """
201
+ p = Path(target_pdb)
202
+ if p.exists():
203
+ return {"path": str(p)}
204
+
205
+ # Treat as UniProt ID — try AlphaFold download
206
+ result = alphafold_fetch(target_pdb)
207
+ if "error" in result:
208
+ return {"error": f"Could not resolve target '{target_pdb}': {result['error']}", "summary": f"Could not resolve target '{target_pdb}': {result['error']}"}
209
+ return {"path": result["path"]}
210
+
211
+
212
+ def _prepare_ligand_pdbqt(smiles: str, work_dir: Path) -> dict:
213
+ """Generate a 3D conformer from SMILES and convert to PDBQT for Vina.
214
+
215
+ Returns ``{"path": str, "mol": rdkit.Mol}`` on success,
216
+ ``{"error": str}`` on failure.
217
+ """
218
+ try:
219
+ from rdkit import Chem
220
+ from rdkit.Chem import AllChem
221
+ except ImportError:
222
+ return {"error": "RDKit is required for ligand preparation (pip install rdkit)", "summary": "RDKit is required for ligand preparation (pip install rdkit)"}
223
+ mol = Chem.MolFromSmiles(smiles)
224
+ if mol is None:
225
+ return {"error": f"Invalid SMILES: {smiles}", "summary": f"Invalid SMILES: {smiles}"}
226
+ mol = Chem.AddHs(mol)
227
+ res = AllChem.EmbedMolecule(mol, AllChem.ETKDGv3())
228
+ if res != 0:
229
+ AllChem.EmbedMolecule(mol, AllChem.ETKDGv3(), randomSeed=42)
230
+ AllChem.MMFFOptimizeMolecule(mol)
231
+
232
+ sdf_path = work_dir / "ligand.sdf"
233
+ writer = Chem.SDWriter(str(sdf_path))
234
+ writer.write(mol)
235
+ writer.close()
236
+
237
+ pdbqt_path = work_dir / "ligand.pdbqt"
238
+
239
+ # Try obabel conversion
240
+ try:
241
+ conv = subprocess.run(
242
+ ["obabel", str(sdf_path), "-O", str(pdbqt_path), "--gen3d"],
243
+ capture_output=True, text=True, timeout=30,
244
+ )
245
+ if conv.returncode == 0 and pdbqt_path.exists():
246
+ return {"path": str(pdbqt_path), "mol": mol, "obabel_fallback": False}
247
+ except FileNotFoundError:
248
+ pass
249
+
250
+ # Fallback: write a minimal PDBQT from coordinates
251
+ # Note: lacks proper atom types and charges — adequate for scoring but
252
+ # install Open Babel for production docking: conda install -c conda-forge openbabel
253
+ conf = mol.GetConformer()
254
+ lines = []
255
+ for i, atom in enumerate(mol.GetAtoms()):
256
+ pos = conf.GetAtomPosition(i)
257
+ element = atom.GetSymbol()
258
+ lines.append(
259
+ f"ATOM {i+1:5d} {element:<3s} LIG A 1 "
260
+ f"{pos.x:8.3f}{pos.y:8.3f}{pos.z:8.3f} 1.00 0.00 "
261
+ f"{element:>2s}"
262
+ )
263
+ pdbqt_path.write_text("\n".join(lines) + "\n")
264
+ return {"path": str(pdbqt_path), "mol": mol, "obabel_fallback": True}
265
+
266
+
267
+ def _detect_search_box(pdb_path: str) -> dict:
268
+ """Compute a bounding-box centre and size from PDB ATOM coordinates.
269
+
270
+ Returns ``{"center_x", "center_y", "center_z", "size_x", "size_y",
271
+ "size_z"}`` with a 10 A padding on each side.
272
+ """
273
+ xs, ys, zs = [], [], []
274
+ with open(pdb_path) as fh:
275
+ for line in fh:
276
+ if line.startswith(("ATOM", "HETATM")):
277
+ try:
278
+ xs.append(float(line[30:38]))
279
+ ys.append(float(line[38:46]))
280
+ zs.append(float(line[46:54]))
281
+ except (ValueError, IndexError):
282
+ continue
283
+ if not xs:
284
+ return {"error": "No atoms found in PDB", "summary": "No atoms found in PDB"}
285
+ padding = 10.0
286
+ return {
287
+ "center_x": round((min(xs) + max(xs)) / 2, 2),
288
+ "center_y": round((min(ys) + max(ys)) / 2, 2),
289
+ "center_z": round((min(zs) + max(zs)) / 2, 2),
290
+ "size_x": round(max(xs) - min(xs) + 2 * padding, 2),
291
+ "size_y": round(max(ys) - min(ys) + 2 * padding, 2),
292
+ "size_z": round(max(zs) - min(zs) + 2 * padding, 2),
293
+ }
294
+
295
+
296
def _parse_vina_poses(stdout: str) -> list[dict]:
    """Extract pose rows (mode, affinity, rmsd l.b., rmsd u.b.) from docking output."""
    poses = []
    for line in stdout.splitlines():
        parts = line.split()
        # Vina output: mode | affinity | rmsd_lb | rmsd_ub
        if len(parts) < 4:
            continue
        try:
            poses.append({
                "mode": int(parts[0]),
                "affinity_kcal_mol": float(parts[1]),
                "rmsd_lb": float(parts[2]),
                "rmsd_ub": float(parts[3]),
            })
        except ValueError:
            # Not a result row (header, separator, log text).
            continue
    return poses


@registry.register(
    name="structure.dock",
    description="Molecular docking: dock a ligand (SMILES) into a target protein (PDB path or UniProt ID)",
    category="structure",
    parameters={
        "smiles": "Ligand SMILES string",
        "target_pdb": "Path to target PDB file or UniProt ID for AlphaFold fetch",
        "method": "Docking method: vina (default), diffdock, gnina",
        "n_poses": "Number of docking poses to generate (default 5)",
    },
    usage_guide=(
        "You want to predict how a small molecule binds to a protein target. "
        "Use Vina for fast local docking, DiffDock for GPU-accelerated deep-learning "
        "docking, or gnina for CNN-scored docking. Returns binding poses with "
        "predicted affinities."
    ),
)
def dock(smiles: str, target_pdb: str, method: str = "vina",
         n_poses: int = 5, **kwargs) -> dict:
    """Dock a ligand into a protein target.

    * method='vina' — runs AutoDock Vina locally if installed, else submits
      to cloud compute.
    * method='diffdock' — always submitted as cloud GPU job.
    * method='gnina' — runs gnina locally if installed, else cloud.

    Args:
        smiles: Ligand SMILES string.
        target_pdb: Path to a PDB file, or a UniProt ID resolved through
            ``_resolve_pdb``.
        method: One of ``"vina"``, ``"diffdock"``, ``"gnina"``.
        n_poses: Number of poses to request; must be a positive integer.
        **kwargs: ``dry_run`` (default ``True``) is forwarded to cloud job
            submission.

    Returns:
        For a local run: ``summary``, ``method``, ``local``, ``poses``,
        ``output_path`` and ``search_box``. For a cloud submission:
        ``summary``, ``method`` and ``job`` (plus ``local`` for vina/gnina).
        On failure: ``error`` and ``summary``.
    """
    import shutil
    import tempfile

    valid_methods = ("vina", "diffdock", "gnina")
    if method not in valid_methods:
        message = f"Unknown docking method '{method}'. Choose from: {', '.join(valid_methods)}"
        return {"error": message, "summary": message}

    # Tool arguments may arrive as strings; normalise and reject non-positive
    # counts up front instead of failing inside the docking binary.
    try:
        n_poses = int(n_poses)
    except (TypeError, ValueError):
        n_poses = 0
    if n_poses < 1:
        message = "n_poses must be a positive integer"
        return {"error": message, "summary": message}

    # Resolve target PDB
    target = _resolve_pdb(target_pdb)
    if "error" in target:
        return {"error": target["error"], "summary": f"Docking failed: {target['error']}"}
    pdb_path = target["path"]

    # GPU-only methods go straight to cloud
    if method == "diffdock":
        from ct.tools.compute import submit_job
        job_result = submit_job(
            job_type="molecular_docking",
            params={
                "smiles": smiles,
                "target_pdb": pdb_path,
                "method": "diffdock",
                "n_poses": n_poses,
            },
            dry_run=kwargs.get("dry_run", True),
        )
        if "error" in job_result:
            return {
                "error": job_result["error"],
                "summary": f"DiffDock submission failed: {job_result['error']}",
            }
        return {
            "summary": f"DiffDock docking submitted for {smiles[:40]} into {Path(pdb_path).stem} ({n_poses} poses)",
            "method": "diffdock",
            "job": job_result,
        }

    # Vina / gnina — try local first. The scratch directory is only kept when
    # a local run succeeds (its output file is returned to the caller); every
    # other exit path removes it so failed runs do not accumulate temp dirs.
    work_dir = Path(tempfile.mkdtemp(prefix="ct_dock_"))
    keep_work_dir = False
    try:
        # Prepare ligand
        lig = _prepare_ligand_pdbqt(smiles, work_dir)
        if "error" in lig:
            return {"error": lig["error"], "summary": f"Ligand preparation failed: {lig['error']}"}

        # Compute search box from target
        box = _detect_search_box(pdb_path)
        if "error" in box:
            return {"error": box["error"], "summary": f"Search box detection failed: {box['error']}"}

        binary = method  # "vina" or "gnina"
        output_path = work_dir / "docking_out.pdbqt"

        # Check if binary is available locally. OSError covers both a missing
        # executable and one that exists but cannot be executed.
        try:
            subprocess.run([binary, "--version"], capture_output=True, timeout=5)
            local_available = True
        except (OSError, subprocess.TimeoutExpired):
            local_available = False

        if not local_available:
            # Submit as cloud job
            from ct.tools.compute import submit_job
            job_result = submit_job(
                job_type="molecular_docking",
                params={
                    "smiles": smiles,
                    "target_pdb": pdb_path,
                    "method": method,
                    "n_poses": n_poses,
                    "search_box": box,
                },
                dry_run=kwargs.get("dry_run", True),
            )
            if "error" in job_result:
                return {
                    "error": job_result["error"],
                    "summary": f"{method} not installed locally; cloud submission failed: {job_result['error']}",
                }
            return {
                "summary": (
                    f"{method} not installed locally — submitted cloud docking job for "
                    f"{smiles[:40]} into {Path(pdb_path).stem}"
                ),
                "method": method,
                "local": False,
                "job": job_result,
            }

        # Run locally
        # NOTE(review): the receptor is passed as the raw PDB file; AutoDock
        # Vina documents PDBQT as its receptor format — confirm whether a
        # receptor conversion step is needed for method='vina'.
        cmd = [
            binary,
            "--receptor", pdb_path,
            "--ligand", lig["path"],
            "--center_x", str(box["center_x"]),
            "--center_y", str(box["center_y"]),
            "--center_z", str(box["center_z"]),
            "--size_x", str(box["size_x"]),
            "--size_y", str(box["size_y"]),
            "--size_z", str(box["size_z"]),
            "--num_modes", str(n_poses),
            "--out", str(output_path),
        ]

        try:
            proc = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
        except subprocess.TimeoutExpired:
            return {"error": f"{method} timed out after 300s", "summary": "Docking timed out"}

        if proc.returncode != 0:
            return {"error": f"{method} failed: {proc.stderr[:500]}", "summary": "Docking failed"}

        # Parse output for binding affinities
        poses = _parse_vina_poses(proc.stdout)
        best = poses[0]["affinity_kcal_mol"] if poses else "N/A"

        obabel_note = ""
        if lig.get("obabel_fallback"):
            obabel_note = " (Note: Open Babel not installed — ligand PDBQT may lack proper atom types. Install: conda install -c conda-forge openbabel)"

        keep_work_dir = True
        return {
            "summary": (
                f"Docked {smiles[:40]} into {Path(pdb_path).stem}: "
                f"best pose {best} kcal/mol ({len(poses)} poses){obabel_note}"
            ),
            "method": method,
            "local": True,
            "poses": poses,
            "output_path": str(output_path),
            "search_box": box,
        }
    finally:
        if not keep_work_dir:
            shutil.rmtree(work_dir, ignore_errors=True)
469
+
470
+
471
+ # ---------------------------------------------------------------------------
472
+ # Molecular dynamics simulation
473
+ # ---------------------------------------------------------------------------
474
+
475
@registry.register(
    name="structure.md_simulate",
    description="Submit a molecular dynamics simulation job (OpenMM/GROMACS) to cloud GPU compute",
    category="structure",
    parameters={
        "pdb_path": "Path to input PDB structure",
        "duration_ns": "Simulation duration in nanoseconds (default 10.0)",
        "forcefield": "Force field: amber14 (default), charmm36, opls",
        "temperature_k": "Temperature in Kelvin (default 300.0)",
    },
    usage_guide=(
        "You want to run an MD simulation to study protein dynamics, ligand "
        "stability in a binding pocket, or conformational sampling. This is "
        "always a cloud GPU job — too computationally intensive for local execution."
    ),
)
def md_simulate(pdb_path: str, duration_ns: float = 10.0,
                forcefield: str = "amber14", temperature_k: float = 300.0,
                **kwargs) -> dict:
    """Validate the inputs, build a job configuration and submit an MD run.

    The job is always handed to ``compute.submit_job``; nothing is simulated
    locally. ``dry_run`` is read from ``kwargs`` and defaults to ``True``.

    Returns a dict with ``summary``, ``config``, ``estimated_hours`` and
    ``job`` on success, or ``error`` and ``summary`` on failure.
    """
    def _invalid(message: str) -> dict:
        # Validation failures report the same text under both keys.
        return {"error": message, "summary": message}

    supported = ("amber14", "charmm36", "opls")
    if forcefield not in supported:
        return _invalid(f"Unknown forcefield '{forcefield}'. Choose from: {', '.join(supported)}")
    if duration_ns <= 0:
        return _invalid("duration_ns must be positive")
    if temperature_k <= 0:
        return _invalid("temperature_k must be positive")

    structure = Path(pdb_path)
    if not structure.exists():
        return {
            "error": f"PDB file not found: {pdb_path}",
            "summary": "MD simulation failed: PDB file not found",
        }

    config = dict(
        pdb_path=str(structure),
        duration_ns=duration_ns,
        forcefield=forcefield,
        temperature_k=temperature_k,
        integrator="LangevinMiddle",
        timestep_fs=2.0,
        solvent="tip3p",
        ionic_strength_M=0.15,
        reporting_interval_ps=10.0,
        platform="CUDA",
    )

    from ct.tools.compute import submit_job
    submission = submit_job(
        job_type="molecular_dynamics",
        params={"n_samples": 1, "config": config},
        dry_run=kwargs.get("dry_run", True),
    )

    if "error" in submission:
        return {
            "error": submission["error"],
            "summary": f"MD simulation submission failed: {submission['error']}",
        }

    if submission.get("dry_run"):
        suffix = " [DRY RUN]"
    else:
        suffix = f" (job: {submission.get('job_id', 'dry-run')})"
    headline = (
        f"MD simulation submitted: {structure.stem} for {duration_ns}ns at "
        f"{temperature_k}K ({forcefield})"
    )

    # Rough heuristic: about one hour of wall time per simulated nanosecond.
    hours = duration_ns * 1.0
    return {
        "summary": headline + suffix,
        "config": config,
        "estimated_hours": round(hours, 1),
        "job": submission,
    }
554
+
555
+
556
+ # ---------------------------------------------------------------------------
557
+ # Free energy perturbation
558
+ # ---------------------------------------------------------------------------
559
+
560
@registry.register(
    name="structure.fep",
    description="Submit a free energy perturbation (FEP) calculation for relative binding free energy between two ligands",
    category="structure",
    parameters={
        "smiles_a": "SMILES for ligand A",
        "smiles_b": "SMILES for ligand B",
        "target_pdb": "Path to target protein PDB or UniProt ID",
        "method": "FEP method: openmm (default), gromacs",
    },
    usage_guide=(
        "You want to predict the relative binding free energy difference "
        "between two similar ligands to a target. Use for lead optimization "
        "when you need to rank-order compound modifications by binding affinity. "
        "Always a cloud GPU job."
    ),
)
def fep(smiles_a: str, smiles_b: str, target_pdb: str,
        method: str = "openmm", **kwargs) -> dict:
    """Validate a ligand pair and target, then submit an FEP job.

    Both SMILES strings are parsed with RDKit, the target is resolved through
    ``_resolve_pdb``, and the job goes to ``compute.submit_job``. ``dry_run``
    is read from ``kwargs`` and defaults to ``True``.

    Returns a dict with ``summary``, ``config``, ``estimated_hours``,
    ``transformation`` and ``job`` on success, or ``error`` and ``summary``
    on failure.
    """
    def _invalid(message: str) -> dict:
        # Validation failures report the same text under both keys.
        return {"error": message, "summary": message}

    supported = ("openmm", "gromacs")
    if method not in supported:
        return _invalid(f"Unknown FEP method '{method}'. Choose from: {', '.join(supported)}")

    # Validate SMILES
    try:
        from rdkit import Chem
    except ImportError:
        return _invalid("RDKit is required for FEP ligand validation (pip install rdkit)")

    # Parse both ligands before inspecting either result.
    ligands = [
        ("A", smiles_a, Chem.MolFromSmiles(smiles_a)),
        ("B", smiles_b, Chem.MolFromSmiles(smiles_b)),
    ]
    for label, text, parsed in ligands:
        if parsed is None:
            return _invalid(f"Invalid SMILES for ligand {label}: {text}")

    # Resolve target
    resolved = _resolve_pdb(target_pdb)
    if "error" in resolved:
        return {"error": resolved["error"], "summary": f"FEP failed: {resolved['error']}"}
    receptor = resolved["path"]

    config = dict(
        smiles_a=smiles_a,
        smiles_b=smiles_b,
        target_pdb=receptor,
        method=method,
        n_lambda_windows=12,
        simulation_time_per_window_ns=5.0,
        temperature_k=300.0,
        platform="CUDA",
    )

    from ct.tools.compute import submit_job
    submission = submit_job(
        job_type="molecular_dynamics",
        params={"n_samples": 1, "fep_config": config},
        dry_run=kwargs.get("dry_run", True),
    )

    if "error" in submission:
        return {
            "error": submission["error"],
            "summary": f"FEP submission failed: {submission['error']}",
        }

    if submission.get("dry_run"):
        suffix = " [DRY RUN]"
    else:
        suffix = f" (job: {submission.get('job_id', 'dry-run')})"
    headline = (
        f"FEP calculation submitted: {smiles_a[:30]} -> {smiles_b[:30]} "
        f"in {Path(receptor).stem} ({method})"
    )

    return {
        "summary": headline + suffix,
        "config": config,
        # Fixed estimate used for every ligand pair.
        "estimated_hours": 6.0,
        "transformation": {"ligand_a": smiles_a, "ligand_b": smiles_b},
        "job": submission,
    }
646
+
647
+
648
+ # ---------------------------------------------------------------------------
649
+ # Binding site / pocket detection
650
+ # ---------------------------------------------------------------------------
651
+
652
+ def _geometric_pocket_detection(pdb_path: str, min_residues: int = 5,
653
+ distance_cutoff: float = 8.0) -> list[dict]:
654
+ """Simple geometric pocket detection from PDB coordinates.
655
+
656
+ Parses ATOM records, identifies residues with buried atoms (low
657
+ neighbor-averaged solvent exposure), and clusters them by spatial
658
+ proximity. Returns a list of pocket dicts sorted by size.
659
+ """
660
+ import math
661
+
662
+ # Parse atoms
663
+ atoms = []
664
+ with open(pdb_path) as fh:
665
+ for line in fh:
666
+ if not line.startswith("ATOM"):
667
+ continue
668
+ try:
669
+ x = float(line[30:38])
670
+ y = float(line[38:46])
671
+ z = float(line[46:54])
672
+ resname = line[17:20].strip()
673
+ resid = int(line[22:26])
674
+ chain = line[21]
675
+ atoms.append({
676
+ "x": x, "y": y, "z": z,
677
+ "resname": resname, "resid": resid, "chain": chain,
678
+ })
679
+ except (ValueError, IndexError):
680
+ continue
681
+
682
+ if len(atoms) < 10:
683
+ return []
684
+
685
+ # Compute centre of mass
686
+ cx = sum(a["x"] for a in atoms) / len(atoms)
687
+ cy = sum(a["y"] for a in atoms) / len(atoms)
688
+ cz = sum(a["z"] for a in atoms) / len(atoms)
689
+
690
+ # For each residue, compute distance to COM and local density
691
+ residue_coords = {}
692
+ for a in atoms:
693
+ key = (a["chain"], a["resid"])
694
+ if key not in residue_coords:
695
+ residue_coords[key] = {"xs": [], "ys": [], "zs": [],
696
+ "resname": a["resname"], "chain": a["chain"],
697
+ "resid": a["resid"]}
698
+ residue_coords[key]["xs"].append(a["x"])
699
+ residue_coords[key]["ys"].append(a["y"])
700
+ residue_coords[key]["zs"].append(a["z"])
701
+
702
+ # Compute residue centres
703
+ residues = []
704
+ for key, rc in residue_coords.items():
705
+ rx = sum(rc["xs"]) / len(rc["xs"])
706
+ ry = sum(rc["ys"]) / len(rc["ys"])
707
+ rz = sum(rc["zs"]) / len(rc["zs"])
708
+ dist_to_com = math.sqrt((rx - cx)**2 + (ry - cy)**2 + (rz - cz)**2)
709
+ residues.append({
710
+ "chain": rc["chain"], "resid": rc["resid"], "resname": rc["resname"],
711
+ "x": rx, "y": ry, "z": rz, "dist_to_com": dist_to_com,
712
+ })
713
+
714
+ # Identify cavity residues: not too close to COM (core), not too far (surface)
715
+ dists = [r["dist_to_com"] for r in residues]
716
+ if not dists:
717
+ return []
718
+ median_dist = sorted(dists)[len(dists) // 2]
719
+ cavity_residues = [
720
+ r for r in residues
721
+ if median_dist * 0.4 < r["dist_to_com"] < median_dist * 1.2
722
+ ]
723
+
724
+ if len(cavity_residues) < min_residues:
725
+ cavity_residues = sorted(residues, key=lambda r: abs(r["dist_to_com"] - median_dist))[:max(min_residues, len(residues) // 4)]
726
+
727
+ # Simple clustering: greedy single-linkage
728
+ clusters = []
729
+ assigned = set()
730
+ for i, res in enumerate(cavity_residues):
731
+ if i in assigned:
732
+ continue
733
+ cluster = [res]
734
+ assigned.add(i)
735
+ for j, other in enumerate(cavity_residues):
736
+ if j in assigned:
737
+ continue
738
+ d = math.sqrt((res["x"] - other["x"])**2 +
739
+ (res["y"] - other["y"])**2 +
740
+ (res["z"] - other["z"])**2)
741
+ if d < distance_cutoff:
742
+ cluster.append(other)
743
+ assigned.add(j)
744
+ if len(cluster) >= min_residues:
745
+ clusters.append(cluster)
746
+
747
+ # Sort clusters by size descending
748
+ clusters.sort(key=lambda c: len(c), reverse=True)
749
+
750
+ pockets = []
751
+ for idx, cluster in enumerate(clusters[:5]): # top 5 pockets
752
+ xs = [r["x"] for r in cluster]
753
+ ys = [r["y"] for r in cluster]
754
+ zs = [r["z"] for r in cluster]
755
+
756
+ # Approximate volume as bounding box volume
757
+ vol = (max(xs) - min(xs)) * (max(ys) - min(ys)) * (max(zs) - min(zs))
758
+
759
+ # Druggability heuristic: more residues + larger volume + hydrophobic
760
+ # residues are better
761
+ hydrophobic = {"ALA", "VAL", "LEU", "ILE", "PHE", "TRP", "MET", "PRO"}
762
+ n_hydrophobic = sum(1 for r in cluster if r["resname"] in hydrophobic)
763
+ druggability = min(1.0, (len(cluster) / 20) * 0.5 + (n_hydrophobic / max(len(cluster), 1)) * 0.5)
764
+
765
+ pockets.append({
766
+ "pocket_id": idx + 1,
767
+ "n_residues": len(cluster),
768
+ "residue_ids": [f"{r['chain']}:{r['resname']}{r['resid']}" for r in cluster],
769
+ "center": {
770
+ "x": round(sum(xs) / len(xs), 2),
771
+ "y": round(sum(ys) / len(ys), 2),
772
+ "z": round(sum(zs) / len(zs), 2),
773
+ },
774
+ "volume_approx_A3": round(vol, 1),
775
+ "druggability_score": round(druggability, 3),
776
+ })
777
+
778
+ return pockets
779
+
780
+
781
def _parse_fpocket_info(info_text: str) -> list:
    """Parse the text of an fpocket ``*_info.txt`` file into pocket dicts.

    Each returned dict has a ``pocket_id`` (int), one snake_case key per
    field whose name contains ``Score`` (float when the value parses as a
    number, otherwise the raw string), and ``volume_A3`` when a field named
    exactly ``Volume`` with a numeric value is present.

    Fields that appear before the first pocket header are ignored, so every
    returned pocket carries a ``pocket_id``.
    """
    pockets = []
    current = None
    for raw_line in info_text.splitlines():
        line = raw_line.strip()

        if line.startswith("Pocket"):
            # A header looks like "Pocket 1 :". Anything else that merely
            # starts with "Pocket" is treated as an ordinary field line
            # instead of crashing on int().
            try:
                pocket_id = int(line.split()[1].rstrip(":"))
            except (IndexError, ValueError):
                pocket_id = None
            if pocket_id is not None:
                if current:
                    pockets.append(current)
                current = {"pocket_id": pocket_id}
                continue

        if current is None or ":" not in line:
            continue

        key, _, val = line.partition(":")
        key = key.strip()
        val = val.strip()

        if "Score" in key:
            field = key.lower().replace(" ", "_")
            try:
                current[field] = float(val)
            except ValueError:
                current[field] = val
        elif key == "Volume":
            # Exact match on purpose: a substring test would also hit the
            # later "Volume score" field and overwrite the real volume with
            # a unitless score.
            try:
                current["volume_A3"] = float(val)
            except ValueError:
                pass

    if current:
        pockets.append(current)
    return pockets


def _fpocket_pockets(resolved_path) -> list:
    """Run fpocket on *resolved_path* and return the parsed pockets.

    Returns an empty list when fpocket cannot be started, times out (120 s),
    exits with a non-zero status, or its info file cannot be read, so the
    caller can fall back to geometric detection.
    """
    try:
        proc = subprocess.run(
            ["fpocket", "-f", resolved_path],
            capture_output=True, text=True, timeout=120,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers a missing binary (FileNotFoundError) as well as
        # e.g. a non-executable one (PermissionError).
        return []
    if proc.returncode != 0:
        return []

    structure = Path(resolved_path)
    # The results are looked up in "<stem>_out/<stem>_info.txt" next to the
    # input structure file.
    info_file = structure.parent / f"{structure.stem}_out" / f"{structure.stem}_info.txt"
    try:
        info_text = info_file.read_text()
    except (OSError, UnicodeDecodeError):
        return []
    return _parse_fpocket_info(info_text)


@registry.register(
    name="structure.binding_site",
    description="Detect binding pockets in a protein structure using geometric analysis or fpocket",
    category="structure",
    parameters={
        "pdb_path": "Path to PDB file or UniProt ID for AlphaFold fetch",
        "method": "Detection method: fpocket (default), geometric",
    },
    usage_guide=(
        "You want to find druggable binding pockets in a protein structure. "
        "Use before docking to identify where to focus. fpocket is preferred "
        "if installed; geometric fallback uses coordinate-based clustering."
    ),
)
def binding_site(pdb_path: str, method: str = "fpocket", **kwargs) -> dict:
    """Detect binding pockets in a protein structure.

    * method='fpocket' — runs fpocket locally if installed, falls back to
      geometric detection when fpocket is unavailable, fails, or yields no
      parsable pockets.
    * method='geometric' — pure coordinate-based pocket detection.

    Args:
        pdb_path: Passed to ``_resolve_pdb``; per the tool registration this
            is a PDB file path or a UniProt ID.
        method: ``"fpocket"`` or ``"geometric"``.
        **kwargs: Accepted and ignored.

    Returns:
        On success a dict with ``summary``, ``method`` (the method that
        actually produced the result) and ``pockets``. On failure a dict
        with ``error`` and ``summary``.
    """
    valid_methods = ("fpocket", "geometric")
    if method not in valid_methods:
        message = f"Unknown method '{method}'. Choose from: {', '.join(valid_methods)}"
        return {"error": message, "summary": message}

    # Resolve PDB path (may be UniProt ID)
    target = _resolve_pdb(pdb_path)
    if "error" in target:
        return {"error": target["error"], "summary": f"Pocket detection failed: {target['error']}"}
    resolved_path = target["path"]
    protein_name = Path(resolved_path).stem

    if method == "fpocket":
        pockets = _fpocket_pockets(resolved_path)
        if pockets:
            top = pockets[0]
            # Only mention the volume when one was parsed, so the summary
            # never ends in an empty "()".
            site = "site 1"
            if "volume_A3" in top:
                site += f" (vol={top['volume_A3']}A^3)"
            return {
                "summary": (
                    f"Found {len(pockets)} binding pocket(s) in {protein_name} (fpocket): "
                    f"{site}"
                ),
                "method": "fpocket",
                "pockets": pockets,
            }
        # No usable fpocket result: fall through to geometric detection.

    # Geometric fallback
    pockets = _geometric_pocket_detection(resolved_path)

    if not pockets:
        return {
            "summary": f"No binding pockets detected in {protein_name}",
            "method": "geometric",
            "pockets": [],
        }

    top = pockets[0]
    return {
        "summary": (
            f"Found {len(pockets)} binding pocket(s) in {protein_name}: "
            f"site 1 (vol={top['volume_approx_A3']:.0f}A^3, "
            f"druggability={top['druggability_score']:.2f})"
        ),
        "method": "geometric",
        "pockets": pockets,
    }