celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/tools/safety.py ADDED
@@ -0,0 +1,1280 @@
1
+ """
2
+ Safety profiling tools: anti-target screening, multi-modal safety classification, SALL4 risk.
3
+
4
+ References crews-glue-discovery/analysis/safety_profile.md for classification logic
5
+ and anti-target lists.
6
+ """
7
+
8
+ import pandas as pd
9
+ import numpy as np
10
+ from ct.tools import registry
11
+ from ct.tools.http_client import request_json
12
+
13
+
14
# UniProt accession → gene symbol mapping for safety-relevant proteins.
# The proteomics matrix uses UniProt IDs as row index; all gene-symbol
# lookups must go through this mapping.
# NOTE(review): assumed one-to-one (no duplicate accessions or symbols);
# GENE_TO_UNIPROT below silently keeps only the last accession if a symbol
# ever repeats — confirm when extending this table.
UNIPROT_TO_GENE = {
    # SALL family (teratogenicity)
    "Q9UJQ4": "SALL4", "Q9NSC2": "SALL1", "Q9Y467": "SALL2", "Q8N3A9": "SALL3",
    # IKZF family (heme TFs / CRBN substrates)
    "Q13422": "IKZF1", "Q9UKT9": "IKZF3", "Q9H2S1": "IKZF4",
    "Q96PU5": "IKZF2", "Q9H193": "IKZF5",
    # Other CRBN substrates
    "P15170": "GSPT1", "Q8IYD1": "GSPT2", "P48729": "CSNK1A1", "Q96SW2": "ZFP91",
    # Tumor suppressors
    "P04637": "TP53", "P06400": "RB1", "P60484": "PTEN", "P25054": "APC",
    "P38398": "BRCA1", "P51587": "BRCA2", "P40337": "VHL",
    "P21359": "NF1", "P35240": "NF2",
    "P42771": "CDKN2A", "P42772": "CDKN2B", "P19544": "WT1",
    "Q13315": "SMAD4", "Q15831": "STK11", "Q969H0": "FBXW7", "Q92560": "BAP1",
    "O14497": "ARID1A", "Q8NFD5": "ARID1B", "Q68CP9": "ARID2",
    "O14686": "KMT2D", "Q8NEZ4": "KMT2C", "Q9BYW2": "SETD2",
    "Q01196": "RUNX1", "Q13761": "RUNX3", "P23771": "GATA3",
    "P10914": "IRF1", "O15524": "SOCS1",
    # Heme TFs
    "P15976": "GATA1", "P23769": "GATA2", "P17542": "TAL1",
    "P17947": "SPI1", "P49715": "CEBPA", "P17676": "CEBPB",
    "P10242": "MYB", "P41212": "ETV6", "Q01543": "FLI1",
    # TP63 (teratogenic)
    "Q9H3D4": "TP63",
}
# Reverse lookup: gene symbol → UniProt accession.
GENE_TO_UNIPROT = {v: k for k, v in UNIPROT_TO_GENE.items()}
43
+
44
+
45
def _gene_ids(gene_set, all_proteins):
    """Collect every identifier in *all_proteins* that names a gene in *gene_set*.

    Matches both bare gene symbols (when the index already uses symbols) and
    UniProt accessions whose mapped symbol, via ``UNIPROT_TO_GENE``, is one of
    the requested genes. Returns a new set; neither argument is mutated.
    """
    # Symbols present verbatim in the protein index.
    matched = set(gene_set).intersection(all_proteins)
    # UniProt accessions whose mapped gene symbol was requested.
    matched.update(
        accession
        for accession, symbol in UNIPROT_TO_GENE.items()
        if symbol in gene_set and accession in all_proteins
    )
    return matched
55
+
56
+
57
def _display_name(protein_id):
    """Format *protein_id* for humans: ``'GENE (UNIPROT)'`` when a mapping
    exists in ``UNIPROT_TO_GENE``, otherwise the identifier unchanged."""
    symbol = UNIPROT_TO_GENE.get(protein_id)
    return f"{symbol} ({protein_id})" if symbol else protein_id
63
+
64
+
65
# Known tumor suppressor genes (anti-targets for degradation)
TUMOR_SUPPRESSORS = {
    "TP53", "RB1", "PTEN", "APC", "BRCA1", "BRCA2", "VHL", "NF1", "NF2",
    "CDKN2A", "CDKN2B", "WT1", "SMAD4", "STK11", "FBXW7", "BAP1",
    "ARID1A", "ARID1B", "ARID2", "KMT2D", "KMT2C", "SETD2",
    "RUNX1", "RUNX3", "GATA3", "IRF1", "SOCS1",
}

# Essential hematopoietic transcription factors (high-risk degradation targets)
# NOTE(review): RUNX1 appears here AND in TUMOR_SUPPRESSORS, so a RUNX1 hit is
# flagged in both categories and its safety penalty is counted twice (5.0 + 3.0
# in antitarget_profile) — confirm this double-weighting is intentional.
HEME_TFS = {
    "IKZF1", "IKZF3", "IKZF4", "GATA1", "GATA2", "TAL1", "RUNX1",
    "SPI1", "CEBPA", "CEBPB", "MYB", "ETV6", "FLI1",
}

# Known teratogenicity-associated CRBN substrates
# NOTE(review): "p63" (lowercase alias) has no UNIPROT_TO_GENE entry, so it
# only matches if the proteomics index uses that exact string; "TP63" covers
# the symbol/UniProt (Q9H3D4) cases.
TERATOGENIC_SUBSTRATES = {
    "SALL4", "SALL1", "SALL3",  # limb development TFs
    "p63", "TP63",  # epithelial development
}

# Known CRBN neosubstrates (degraded by IMiDs/molecular glues)
# NOTE(review): "CK1A", "AIOLOS", "IKAROS" are aliases with no UNIPROT_TO_GENE
# entry; they match only if the data index uses those literal strings. Their
# canonical symbols (CSNK1A1, IKZF3, IKZF1) are already listed.
CRBN_SUBSTRATES = {
    "IKZF1", "IKZF3", "CK1A", "CSNK1A1", "GSPT1", "GSPT2",
    "ZFP91", "AIOLOS", "IKAROS",
}

# openFDA REST endpoints used by the FAERS/label tools below.
_OPENFDA_DRUG_EVENT_URL = "https://api.fda.gov/drug/event.json"
_OPENFDA_DRUG_LABEL_URL = "https://api.fda.gov/drug/label.json"
93
+
94
+
95
+ def _openfda_escape(term: str) -> str:
96
+ """Escape a value for openFDA search string usage."""
97
+ return str(term or "").replace("\\", "\\\\").replace('"', '\\"').strip()
98
+
99
+
100
def _openfda_total(search: str = "") -> tuple[int | None, str | None]:
    """Fetch the total matching-report count from the FAERS event endpoint.

    Returns ``(total, None)`` on success or ``(None, message)`` on failure.
    An empty *search* counts every report in the database (used as the
    denominator for disproportionality).
    """
    query: dict[str, str] = {"limit": "1"}
    if search:
        query["search"] = search

    payload, error = request_json(
        "GET",
        _OPENFDA_DRUG_EVENT_URL,
        params=query,
        timeout=20,
        retries=2,
    )
    if error:
        return None, error

    # meta.results.total is the authoritative match count; a missing or
    # non-numeric value is reported as an error rather than raised.
    total = payload.get("meta", {}).get("results", {}).get("total")
    try:
        return int(total), None
    except Exception:
        return None, "openFDA response missing total count"
121
+
122
+
123
+ def _faers_signal_metrics(
124
+ a: int,
125
+ b: int,
126
+ c: int,
127
+ d: int,
128
+ *,
129
+ min_case_count: int = 3,
130
+ ) -> dict:
131
+ """Compute basic disproportionality metrics (PRR/ROR/chi-square)."""
132
+ import math
133
+
134
+ a = max(int(a), 0)
135
+ b = max(int(b), 0)
136
+ c = max(int(c), 0)
137
+ d = max(int(d), 0)
138
+
139
+ # Haldane-Anscombe correction stabilizes estimates when cells are zero.
140
+ ac, bc, cc, dc = [x + 0.5 for x in (a, b, c, d)]
141
+
142
+ prr = (ac / (ac + bc)) / (cc / (cc + dc))
143
+ ror = (ac / bc) / (cc / dc)
144
+ se_log_ror = math.sqrt((1 / ac) + (1 / bc) + (1 / cc) + (1 / dc))
145
+ ror_ci95_lower = math.exp(math.log(ror) - 1.96 * se_log_ror)
146
+ ror_ci95_upper = math.exp(math.log(ror) + 1.96 * se_log_ror)
147
+
148
+ total = a + b + c + d
149
+ denom = (a + b) * (c + d) * (a + c) * (b + d)
150
+ chi_square = ((total * ((a * d - b * c) ** 2)) / denom) if denom > 0 else 0.0
151
+
152
+ # Classic pharmacovigilance heuristic gate.
153
+ signal = (a >= min_case_count) and (prr >= 2.0) and (chi_square >= 4.0)
154
+
155
+ return {
156
+ "prr": round(float(prr), 4),
157
+ "ror": round(float(ror), 4),
158
+ "ror_ci95_lower": round(float(ror_ci95_lower), 4),
159
+ "ror_ci95_upper": round(float(ror_ci95_upper), 4),
160
+ "chi_square": round(float(chi_square), 4),
161
+ "meets_signal_criteria": bool(signal),
162
+ }
163
+
164
+
165
@registry.register(
    name="safety.antitarget_profile",
    description="Screen degradation data for anti-target hits (tumor suppressors, essential genes, heme TFs)",
    category="safety",
    parameters={
        "compound_id": "Compound to profile (or 'all')",
        "lfc_threshold": "LFC threshold for degradation call (default -0.5)",
    },
    requires_data=["proteomics"],
    usage_guide="You need to check if a compound degrades dangerous off-targets (tumor suppressors, essential heme TFs, teratogenic substrates). Run this first in any safety assessment workflow.",
)
def antitarget_profile(compound_id: str = "all", lfc_threshold: float = -0.5, **kwargs) -> dict:
    """Screen proteomics data for degradation of anti-target proteins.

    Anti-targets: tumor suppressors, essential heme TFs, teratogenic substrates,
    and known CRBN substrates. Degrading these = safety liability.

    Parameters:
        compound_id: Single compound to profile, or "all" for every column of
            the proteomics matrix. Non-"all" IDs are normalized via
            resolve_compound first.
        lfc_threshold: A protein is called "degraded" when its LFC is strictly
            below this value (default -0.5).

    Returns:
        Dict with "summary", counts of screenable anti-targets, and "profiles"
        (per-compound rows). Note: for compound_id == "all" the profile rows
        come from a DataFrame and do NOT include "flagged_proteins"; the
        single-compound path returns the full rows including it. On missing
        proteomics data, returns a dict with an "error" key instead.
    """
    from ct.tools._compound_resolver import resolve_compound
    if compound_id != "all":
        compound_id = resolve_compound(compound_id, dataset="proteomics")

    try:
        from ct.data.loaders import load_proteomics
        prot = load_proteomics()
    except FileNotFoundError:
        return {
            "error": "Proteomics data not available.",
            "summary": "Proteomics data not available — skipping. Provide proteomics data for full analysis.",
        }

    compounds = [compound_id] if compound_id != "all" else prot.columns.tolist()
    all_proteins = set(prot.index)

    # Categorize known anti-targets present in data (handles both gene symbols and UniProt IDs)
    tsg_present = _gene_ids(TUMOR_SUPPRESSORS, all_proteins)
    heme_present = _gene_ids(HEME_TFS, all_proteins)
    terat_present = _gene_ids(TERATOGENIC_SUBSTRATES, all_proteins)
    crbn_present = _gene_ids(CRBN_SUBSTRATES, all_proteins)

    results = []
    for cpd in compounds:
        # Unresolvable / absent compounds are silently skipped; for a single
        # unknown compound this yields an empty results list.
        if cpd not in prot.columns:
            continue

        values = prot[cpd].dropna()
        degraded = values[values < lfc_threshold]

        # Check anti-target categories. A protein in two sets (e.g. RUNX1 is
        # both a tumor suppressor and a heme TF) is listed in both.
        hits = {
            "tumor_suppressors": sorted([p for p in degraded.index if p in tsg_present]),
            "heme_tfs": sorted([p for p in degraded.index if p in heme_present]),
            "teratogenic": sorted([p for p in degraded.index if p in terat_present]),
            "crbn_substrates": sorted([p for p in degraded.index if p in crbn_present]),
        }

        n_antitargets = sum(len(v) for v in hits.values())

        # Compute safety penalty score — weighted per category hit, so a
        # protein in multiple categories contributes each weight.
        penalty = 0.0
        for p in hits["teratogenic"]:
            penalty += 10.0  # highest risk
        for p in hits["heme_tfs"]:
            penalty += 5.0
        for p in hits["tumor_suppressors"]:
            penalty += 3.0
        for p in hits["crbn_substrates"]:
            penalty += 2.0

        # Get LFC values for flagged proteins
        flagged_details = []
        for category, proteins in hits.items():
            for p in proteins:
                flagged_details.append({
                    "protein": _display_name(p),
                    "protein_id": p,
                    "category": category,
                    "lfc": round(float(values[p]), 3),
                })

        results.append({
            "compound": cpd,
            "n_total_degraded": len(degraded),
            "n_antitargets": n_antitargets,
            "n_tumor_suppressors": len(hits["tumor_suppressors"]),
            "n_heme_tfs": len(hits["heme_tfs"]),
            "n_teratogenic": len(hits["teratogenic"]),
            "n_crbn_substrates": len(hits["crbn_substrates"]),
            "safety_penalty": round(penalty, 1),
            "flagged_proteins": flagged_details,
        })

    # Tabular view without the nested flagged_proteins column, sorted worst-first.
    df = pd.DataFrame([{k: v for k, v in r.items() if k != "flagged_proteins"} for r in results])
    if len(df) > 0:
        df = df.sort_values("safety_penalty", ascending=False)

    if compound_id != "all":
        r = results[0] if results else {}
        flagged_str = ", ".join([f"{d['protein']}({d['category']})" for d in r.get("flagged_proteins", [])])
        summary = (
            f"Anti-target profile for {compound_id}: "
            f"{r.get('n_antitargets', 0)} anti-targets hit, "
            f"penalty={r.get('safety_penalty', 0)}\n"
            f"Flagged: {flagged_str if flagged_str else 'none'}"
        )
    else:
        n_clean = (df["n_antitargets"] == 0).sum() if len(df) > 0 else 0
        summary = (
            f"Anti-target screening: {len(df)} compounds profiled\n"
            f"Clean (0 anti-targets): {n_clean}/{len(df)}"
        )

    return {
        "summary": summary,
        "n_screened": len(tsg_present | heme_present | terat_present | crbn_present),
        "antitarget_counts": {
            "tumor_suppressors": len(tsg_present),
            "heme_tfs": len(heme_present),
            "teratogenic": len(terat_present),
            "crbn_substrates": len(crbn_present),
        },
        "profiles": results if compound_id != "all" else df.to_dict("records"),
    }
287
+
288
+
289
@registry.register(
    name="safety.classify",
    description="Classify compound safety as SAFE/CAUTION/DANGEROUS based on multi-modal profiling",
    category="safety",
    parameters={
        "compound_id": "Compound to classify (or 'all')",
    },
    requires_data=["proteomics", "prism"],
    usage_guide="You need a quick safety verdict (SAFE/CAUTION/DANGEROUS) before advancing a compound. Combines anti-target profile with viability breadth. Run after antitarget_profile for full context.",
)
def classify(compound_id: str = "all", **kwargs) -> dict:
    """Multi-modal safety classification.

    Classification rules:
    - DANGEROUS: degrades any teratogenic substrate OR safety_penalty >= 15
    - CAUTION: degrades tumor suppressors OR heme TFs OR safety_penalty >= 5
    - SAFE: no anti-target degradation AND safety_penalty < 5

    Also considers viability breadth (% cell lines killed) as a toxicity signal.

    Returns a dict with "summary", per-compound "classifications", and the
    SAFE/CAUTION/DANGEROUS "distribution"; propagates the error dict from
    antitarget_profile or returns its own error dict if PRISM is missing.
    """
    # Get anti-target profile (handles missing proteomics internally)
    at_result = antitarget_profile(compound_id=compound_id)
    if "error" in at_result:
        return at_result

    profiles = at_result["profiles"]

    # Get viability breadth from PRISM
    try:
        from ct.data.loaders import load_prism
        prism = load_prism()
    except FileNotFoundError:
        return {
            "error": "PRISM data not available.",
            "summary": "PRISM data not available — skipping. Run: ct data pull prism",
        }

    results = []
    for profile in profiles:
        cpd = profile["compound"]
        penalty = profile["safety_penalty"]

        # Viability breadth: fraction of cell lines with mean LFC < -0.5 at
        # the single highest tested dose. Compounds absent from PRISM get 0.0.
        cpd_data = prism[prism["pert_name"] == cpd]
        breadth = 0.0
        if len(cpd_data) > 0:
            max_dose = cpd_data["pert_dose"].max()
            cpd_hd = cpd_data[cpd_data["pert_dose"] == max_dose]
            per_cell = cpd_hd.groupby("ccle_name")["LFC"].mean()
            breadth = float((per_cell < -0.5).mean())

        # Classification — checked in severity order, first match wins.
        if profile["n_teratogenic"] > 0 or penalty >= 15:
            classification = "DANGEROUS"
        elif profile["n_tumor_suppressors"] > 0 or profile["n_heme_tfs"] > 0 or penalty >= 5:
            classification = "CAUTION"
        elif breadth > 0.8:
            classification = "CAUTION"  # kills too many cell lines = nonspecific toxicity
        else:
            classification = "SAFE"

        # Safety score (0-100, higher = safer); floor at 0.
        safety_score = max(0, 100 - penalty * 5 - breadth * 30)

        results.append({
            "compound": cpd,
            "classification": classification,
            "safety_score": round(safety_score, 1),
            "safety_penalty": penalty,
            "viability_breadth": round(breadth, 3),
            "n_antitargets": profile["n_antitargets"],
            "n_tumor_suppressors": profile["n_tumor_suppressors"],
            "n_heme_tfs": profile["n_heme_tfs"],
            "n_teratogenic": profile["n_teratogenic"],
        })

    df = pd.DataFrame(results)

    if len(df) > 0:
        counts = df["classification"].value_counts().to_dict()
        safe = counts.get("SAFE", 0)
        caution = counts.get("CAUTION", 0)
        dangerous = counts.get("DANGEROUS", 0)
    else:
        safe = caution = dangerous = 0

    if compound_id != "all" and results:
        r = results[0]
        summary = (
            f"Safety classification for {compound_id}: {r['classification']}\n"
            f"Score: {r['safety_score']}/100, Penalty: {r['safety_penalty']}, "
            f"Viability breadth: {r['viability_breadth']:.1%}"
        )
    else:
        summary = (
            f"Safety classification: {len(df)} compounds\n"
            f"SAFE: {safe}, CAUTION: {caution}, DANGEROUS: {dangerous}"
        )

    return {
        "summary": summary,
        "classifications": results,
        "distribution": {"SAFE": safe, "CAUTION": caution, "DANGEROUS": dangerous},
    }
393
+
394
+
395
@registry.register(
    name="safety.sall4_risk",
    description="Assess SALL4 degradation risk for IMiD-type molecular glue compounds (teratogenicity marker)",
    category="safety",
    parameters={
        "compound_id": "Compound to check (or 'all')",
    },
    requires_data=["proteomics"],
    usage_guide="You are working with CRBN-based molecular glues and need to assess teratogenicity risk. SALL4 degradation was the molecular cause of thalidomide birth defects — critical safety check for any IMiD-type compound.",
)
def sall4_risk(compound_id: str = "all", **kwargs) -> dict:
    """Check for SALL4 degradation -- the key teratogenicity signal for IMiD-type compounds.

    SALL4 is a zinc finger TF essential for limb development. Its degradation by
    thalidomide via CRBN was the molecular cause of thalidomide teratogenicity.
    Any CRBN-based molecular glue that degrades SALL4 is a teratogenicity risk.

    Risk tiers per compound: HIGH (SALL4 LFC < -0.5), MODERATE (any other SALL
    member < -0.5), LOW (SALL4 between -0.5 and -0.3), else MINIMAL. Returns
    "UNKNOWN" risk_assessment if no SALL proteins are present in the data.
    """
    from ct.tools._compound_resolver import resolve_compound
    if compound_id != "all":
        compound_id = resolve_compound(compound_id, dataset="proteomics")

    try:
        from ct.data.loaders import load_proteomics
        prot = load_proteomics()
    except FileNotFoundError:
        return {
            "error": "Proteomics data not available.",
            "summary": "Proteomics data not available — skipping. Provide proteomics data for full analysis.",
        }

    # Check for SALL family proteins (handles both gene symbols and UniProt IDs).
    # NOTE(review): assumes the proteomics index contains strings (str.startswith) —
    # confirm against load_proteomics.
    sall_uniprot = {uid: gene for uid, gene in UNIPROT_TO_GENE.items() if gene.startswith("SALL")}
    sall_proteins = []  # list of (index_id, gene_symbol)
    for p in prot.index:
        if p.startswith("SALL"):
            sall_proteins.append((p, p))
        elif p in sall_uniprot:
            sall_proteins.append((p, sall_uniprot[p]))

    if not sall_proteins:
        return {
            "summary": "SALL proteins not detected in proteomics data -- cannot assess teratogenicity risk",
            "sall_proteins_in_data": [],
            "risk_assessment": "UNKNOWN",
        }

    compounds = [compound_id] if compound_id != "all" else prot.columns.tolist()
    results = []

    for cpd in compounds:
        # Skip compounds not present as columns (e.g. unresolved single ID).
        if cpd not in prot.columns:
            continue

        sall_values = {}  # gene_symbol -> LFC; NaN measurements are dropped
        for idx_id, gene in sall_proteins:
            val = prot.loc[idx_id, cpd]
            if pd.notna(val):
                sall_values[gene] = float(val)

        # Risk assessment — tiers checked from most to least severe.
        sall4_lfc = sall_values.get("SALL4")
        any_sall_degraded = any(v < -0.5 for v in sall_values.values())
        sall4_degraded = sall4_lfc is not None and sall4_lfc < -0.5

        if sall4_degraded:
            risk = "HIGH"
            risk_detail = f"SALL4 degraded (LFC={sall4_lfc:.2f}) -- thalidomide-like teratogenicity risk"
        elif any_sall_degraded:
            risk = "MODERATE"
            degraded_salls = {k: round(v, 3) for k, v in sall_values.items() if v < -0.5}
            risk_detail = f"SALL family member(s) degraded: {degraded_salls} -- potential teratogenicity"
        elif sall4_lfc is not None and sall4_lfc < -0.3:
            risk = "LOW"
            risk_detail = f"SALL4 mildly reduced (LFC={sall4_lfc:.2f}) -- monitor in follow-up"
        else:
            risk = "MINIMAL"
            risk_detail = "No SALL degradation detected"

        results.append({
            "compound": cpd,
            "risk_level": risk,
            "risk_detail": risk_detail,
            "sall_values": sall_values,
        })

    sall_names = [gene for _, gene in sall_proteins]

    if compound_id != "all" and results:
        r = results[0]
        summary = f"SALL4 risk for {compound_id}: {r['risk_level']} -- {r['risk_detail']}"
    else:
        # Tally compounds per risk tier for the aggregate summary.
        risk_counts = {}
        for r in results:
            risk_counts[r["risk_level"]] = risk_counts.get(r["risk_level"], 0) + 1
        summary = f"SALL4 risk assessment: {len(results)} compounds -- {risk_counts}"

    return {
        "summary": summary,
        "sall_proteins_in_data": sall_names,
        "assessments": results,
    }
496
+
497
+
498
@registry.register(
    name="safety.faers_signal_scan",
    description="Scan openFDA FAERS adverse-event reports for disproportionality signals (PRR/ROR) for a drug",
    category="safety",
    parameters={
        "drug_name": "Drug name to scan (generic or brand name)",
        "event": "Optional specific MedDRA preferred term to evaluate",
        "top_n": "If event not provided, evaluate top N reported events for this drug (default 5)",
        "min_case_count": "Minimum A-count threshold for signal flagging (default 3)",
    },
    usage_guide=(
        "Use for post-marketing pharmacovigilance triage. Computes disproportionality metrics "
        "(PRR/ROR/chi-square) from openFDA FAERS counts and flags candidate safety signals."
    ),
)
def faers_signal_scan(
    drug_name: str,
    event: str = "",
    top_n: int = 5,
    min_case_count: int = 3,
    **kwargs,
) -> dict:
    """Run a disproportionality safety scan using openFDA FAERS.

    Builds a 2x2 contingency table per adverse event (drug x event report
    counts) from three openFDA count queries, then scores each with
    _faers_signal_metrics. If *event* is empty, the top_n most-reported
    reaction terms for the drug are scanned. Returns a dict with "summary",
    per-event "signals" (sorted flagged-first, then by PRR/ROR), the
    flagging "criteria", and any per-event lookup "errors"; top-level
    lookup failures return an "error" dict instead.
    """
    if not drug_name or not drug_name.strip():
        return {"error": "drug_name is required", "summary": "No drug name provided"}

    drug_term = _openfda_escape(drug_name)
    if not drug_term:
        return {"error": "drug_name is required", "summary": "No drug name provided"}

    # Clamp user-supplied knobs to sane ranges (1..20 events, count >= 1).
    top_n = max(1, min(int(top_n or 5), 20))
    min_case_count = max(1, int(min_case_count or 3))

    drug_search = f'patient.drug.medicinalproduct.exact:"{drug_term}"'

    # Denominator: every report in FAERS; numerator base: reports mentioning the drug.
    all_total, error = _openfda_total("")
    if error:
        return {"error": f"openFDA total lookup failed: {error}", "summary": f"FAERS scan failed: {error}"}
    drug_total, error = _openfda_total(drug_search)
    if error:
        return {"error": f"openFDA drug lookup failed: {error}", "summary": f"FAERS scan failed for {drug_name}: {error}"}

    if drug_total <= 0:
        return {
            "drug_name": drug_name,
            "total_reports_for_drug": 0,
            "signals": [],
            "summary": f"No FAERS reports found for '{drug_name}'",
        }

    events_to_scan = []
    if event and event.strip():
        events_to_scan = [event.strip()]
    else:
        # No specific event requested: pull the drug's top_n reaction terms.
        data, count_error = request_json(
            "GET",
            _OPENFDA_DRUG_EVENT_URL,
            params={
                "search": drug_search,
                "count": "patient.reaction.reactionmeddrapt.exact",
                "limit": str(top_n),
            },
            timeout=20,
            retries=2,
        )
        if count_error:
            return {
                "error": f"openFDA event aggregation failed: {count_error}",
                "summary": f"FAERS scan failed for {drug_name}: {count_error}",
            }
        events_to_scan = [r.get("term", "") for r in data.get("results", []) if r.get("term")]
        if not events_to_scan:
            return {
                "drug_name": drug_name,
                "total_reports_for_drug": int(drug_total),
                "signals": [],
                "summary": f"FAERS reports found for '{drug_name}', but no reaction terms were returned",
            }

    signals = []
    for ev in events_to_scan:
        ev_term = _openfda_escape(ev)
        if not ev_term:
            continue

        event_search = f'patient.reaction.reactionmeddrapt.exact:"{ev_term}"'
        # NOTE(review): the literal "+AND+" assumes request_json passes this
        # search string through without URL-encoding the plus signs (openFDA
        # uses '+' as the space separator around AND). If request_json
        # percent-encodes params, this query would break — confirm against
        # ct/tools/http_client.py.
        both_search = f"{drug_search}+AND+{event_search}"

        event_total, event_err = _openfda_total(event_search)
        both_total, both_err = _openfda_total(both_search)
        if event_err or both_err:
            # Record the failure per event and keep scanning the rest.
            signals.append({
                "event": ev,
                "error": event_err or both_err,
            })
            continue

        # 2x2 table: a=drug+event, b=drug only, c=event only, d=neither.
        a = int(both_total)
        b = int(drug_total) - a
        c = int(event_total) - a
        d = int(all_total) - (a + b + c)
        if d < 0:
            d = 0

        metrics = _faers_signal_metrics(a, b, c, d, min_case_count=min_case_count)
        signals.append({
            "event": ev,
            "a_drug_and_event": a,
            "b_drug_no_event": b,
            "c_no_drug_event": c,
            "d_no_drug_no_event": d,
            **metrics,
        })

    # Flagged signals first, then strongest PRR/ROR.
    clean_signals = [s for s in signals if "error" not in s]
    clean_signals.sort(
        key=lambda x: (x.get("meets_signal_criteria", False), x.get("prr", 0.0), x.get("ror", 0.0)),
        reverse=True,
    )
    n_flagged = sum(1 for s in clean_signals if s.get("meets_signal_criteria"))
    error_count = len(signals) - len(clean_signals)

    if clean_signals:
        top = clean_signals[0]
        summary = (
            f"FAERS signal scan for {drug_name}: {len(clean_signals)} event(s) analyzed, "
            f"{n_flagged} flagged by PRR/ROR criteria. Top event: {top['event']} "
            f"(PRR={top['prr']}, ROR={top['ror']})."
        )
    else:
        summary = (
            f"FAERS signal scan for {drug_name}: no analyzable events returned"
            + (f" ({error_count} event lookup error(s))." if error_count else ".")
        )

    return {
        "summary": summary,
        "drug_name": drug_name,
        "event_filter": event.strip(),
        "total_reports_all_faers": int(all_total),
        "total_reports_for_drug": int(drug_total),
        "criteria": {
            "min_case_count": min_case_count,
            "prr_threshold": 2.0,
            "chi_square_threshold": 4.0,
        },
        "n_events_analyzed": len(clean_signals),
        "n_events_flagged": n_flagged,
        "n_event_lookup_errors": error_count,
        "signals": clean_signals,
        "errors": [s for s in signals if "error" in s],
    }
650
+
651
+
652
@registry.register(
    name="safety.label_risk_extract",
    description="Extract boxed warnings, contraindications, and key risk sections from openFDA drug labels",
    category="safety",
    parameters={
        "drug_name": "Drug name (generic or brand)",
        "max_labels": "Maximum label records to inspect (default 3)",
        "section_max_chars": "Max characters per extracted section (default 500)",
    },
    usage_guide=(
        "Use for rapid regulatory risk triage. Pulls key safety sections from FDA labels "
        "(boxed warning, contraindications, warnings, interactions, special populations)."
    ),
)
def label_risk_extract(
    drug_name: str,
    max_labels: int = 3,
    section_max_chars: int = 500,
    **kwargs,
) -> dict:
    """Extract key risk sections from openFDA drug label endpoint.

    Queries labels matching the generic, brand, or substance name, truncates
    each risk section to *section_max_chars*, and assigns a per-label risk
    level: HIGH if a boxed warning is present, MODERATE if contraindications
    or warnings are, else LOW. The overall risk_level is the worst across
    labels. Returns an "error" dict on query failure.
    """
    import re

    if not drug_name or not drug_name.strip():
        return {"error": "drug_name is required", "summary": "No drug name provided"}

    # Clamp user-supplied knobs (1..10 labels; 120..4000 chars per section).
    max_labels = max(1, min(int(max_labels or 3), 10))
    section_max_chars = max(120, min(int(section_max_chars or 500), 4000))
    drug_term = _openfda_escape(drug_name)

    # NOTE(review): literal "+OR+" assumes request_json passes this search
    # string through without URL-encoding the plus signs (openFDA query
    # syntax) — confirm against ct/tools/http_client.py.
    search = (
        f'openfda.generic_name.exact:"{drug_term}"'
        f'+OR+openfda.brand_name.exact:"{drug_term}"'
        f'+OR+openfda.substance_name.exact:"{drug_term}"'
    )
    data, error = request_json(
        "GET",
        _OPENFDA_DRUG_LABEL_URL,
        params={"search": search, "limit": str(max_labels)},
        timeout=20,
        retries=2,
    )
    if error:
        return {"error": f"openFDA label query failed: {error}", "summary": f"Label risk extraction failed: {error}"}

    results = data.get("results", [])
    if not results:
        return {
            "drug_name": drug_name,
            "labels_found": 0,
            "risk_level": "UNKNOWN",
            "summary": f"No openFDA label records found for '{drug_name}'",
            "labels": [],
        }

    def _extract_section(entry: dict, key: str) -> str:
        # Label sections may be a list of strings or a single string; normalize
        # whitespace and truncate with an ellipsis at section_max_chars.
        value = entry.get(key, [])
        if isinstance(value, list):
            text = " ".join(str(v).strip() for v in value if str(v).strip())
        elif isinstance(value, str):
            text = value.strip()
        else:
            text = ""
        text = re.sub(r"\s+", " ", text).strip()
        if len(text) > section_max_chars:
            text = text[: section_max_chars - 3] + "..."
        return text

    label_summaries = []
    for entry in results:
        # openfda metadata fields are lists when present; join the first few.
        openfda = entry.get("openfda", {})
        brand = ", ".join(openfda.get("brand_name", [])[:3]) if isinstance(openfda.get("brand_name"), list) else ""
        generic = ", ".join(openfda.get("generic_name", [])[:3]) if isinstance(openfda.get("generic_name"), list) else ""
        application = ", ".join(openfda.get("application_number", [])[:3]) if isinstance(openfda.get("application_number"), list) else ""
        manufacturer = ", ".join(openfda.get("manufacturer_name", [])[:2]) if isinstance(openfda.get("manufacturer_name"), list) else ""

        sections = {
            "boxed_warning": _extract_section(entry, "boxed_warning"),
            "contraindications": _extract_section(entry, "contraindications"),
            "warnings_and_cautions": _extract_section(entry, "warnings_and_cautions"),
            "warnings": _extract_section(entry, "warnings"),
            "adverse_reactions": _extract_section(entry, "adverse_reactions"),
            "drug_interactions": _extract_section(entry, "drug_interactions"),
            "use_in_specific_populations": _extract_section(entry, "use_in_specific_populations"),
        }

        has_boxed = bool(sections["boxed_warning"])
        has_contra = bool(sections["contraindications"])
        has_warn = bool(sections["warnings"] or sections["warnings_and_cautions"])

        # Presence-based triage: boxed warning dominates.
        if has_boxed:
            risk_level = "HIGH"
        elif has_contra or has_warn:
            risk_level = "MODERATE"
        else:
            risk_level = "LOW"

        flags = []
        if has_boxed:
            flags.append("boxed_warning")
        if has_contra:
            flags.append("contraindications")
        if has_warn:
            flags.append("warnings")

        label_summaries.append({
            "brand_name": brand,
            "generic_name": generic,
            "application_number": application,
            "manufacturer": manufacturer,
            "risk_level": risk_level,
            "risk_flags": flags,
            "sections": sections,
        })

    # Overall risk = worst per-label level.
    rank = {"HIGH": 3, "MODERATE": 2, "LOW": 1}
    overall_risk = max(label_summaries, key=lambda x: rank.get(x["risk_level"], 0))["risk_level"]
    boxed_count = sum(1 for l in label_summaries if "boxed_warning" in l.get("risk_flags", []))
    contra_count = sum(1 for l in label_summaries if "contraindications" in l.get("risk_flags", []))

    summary = (
        f"Label risk extraction for {drug_name}: {len(label_summaries)} label record(s), "
        f"overall risk={overall_risk}. Boxed warning present in {boxed_count} label(s); "
        f"contraindications present in {contra_count} label(s)."
    )

    return {
        "summary": summary,
        "drug_name": drug_name,
        "labels_found": len(label_summaries),
        "risk_level": overall_risk,
        "n_boxed_warning_labels": boxed_count,
        "n_contraindication_labels": contra_count,
        "labels": label_summaries,
    }
787
+
788
+
789
@registry.register(
    name="safety.admet_predict",
    description="Predict ADMET properties for a compound from SMILES using RDKit descriptors and heuristic rules",
    category="safety",
    parameters={
        "smiles": "SMILES string for the compound to profile",
    },
    usage_guide="You need a comprehensive ADMET (absorption, distribution, metabolism, excretion, toxicity) profile for a compound. Use early in lead optimization to flag liabilities before synthesis. Covers Lipinski, Veber, Ghose, lead-likeness, oral absorption, BBB, hERG, CYP, and solubility.",
)
def admet_predict(smiles: str, **kwargs) -> dict:
    """Predict ADMET properties from SMILES using RDKit descriptors and heuristic rules.

    Computes physicochemical properties and applies established medicinal chemistry
    filters (Lipinski Ro5, Veber, Ghose, lead-likeness) plus heuristic predictions
    for oral absorption, BBB penetration, hERG risk, CYP liability, and solubility.

    Args:
        smiles: SMILES string (or text containing one; extracted via
            ``_extract_smiles``) for the compound to profile.

    Returns:
        dict with ``summary``, ``verdict`` (FAVORABLE/ACCEPTABLE/UNFAVORABLE),
        ``properties``, ``filters``, ``predictions``, and ``flags``; or a dict
        with ``error``/``summary`` keys if RDKit is missing or the SMILES is
        unparseable.
    """
    from ct.tools.chemistry import _extract_smiles
    smiles = _extract_smiles(smiles)

    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, Crippen, Lipinski, rdMolDescriptors
    except ImportError:
        return {"error": "RDKit is required for ADMET prediction. Install with: pip install rdkit", "summary": "RDKit is required for ADMET prediction. Install with: pip install rdkit"}
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {"error": f"Invalid SMILES: {smiles}", "summary": f"Could not parse SMILES: {smiles}"}

    # --- Physicochemical descriptors ---
    mw = Descriptors.MolWt(mol)
    logp = Crippen.MolLogP(mol)
    tpsa = Descriptors.TPSA(mol)
    hba = Lipinski.NumHAcceptors(mol)
    hbd = Lipinski.NumHDonors(mol)
    rotatable_bonds = Descriptors.NumRotatableBonds(mol)
    aromatic_rings = Descriptors.NumAromaticRings(mol)
    num_rings = Descriptors.RingCount(mol)
    heavy_atoms = mol.GetNumHeavyAtoms()
    formula = rdMolDescriptors.CalcMolFormula(mol)
    # Molar refractivity is needed twice below (Ghose pass + details);
    # compute it once instead of calling Descriptors.MolMR twice.
    molmr = Descriptors.MolMR(mol)

    properties = {
        "smiles": smiles,
        "formula": formula,
        "molecular_weight": round(mw, 2),
        "logp": round(logp, 2),
        "tpsa": round(tpsa, 2),
        "hba": hba,
        "hbd": hbd,
        "rotatable_bonds": rotatable_bonds,
        "aromatic_rings": aromatic_rings,
        "num_rings": num_rings,
        "heavy_atoms": heavy_atoms,
    }

    # --- Drug-likeness filters ---
    filters = {}

    # Lipinski Rule of Five: pass with at most one violation.
    lipinski_violations = sum([
        mw > 500,
        logp > 5,
        hbd > 5,
        hba > 10,
    ])
    filters["lipinski_ro5"] = {
        "pass": lipinski_violations <= 1,
        "violations": lipinski_violations,
        "details": {
            "MW<=500": mw <= 500,
            "LogP<=5": logp <= 5,
            "HBD<=5": hbd <= 5,
            "HBA<=10": hba <= 10,
        },
    }

    # Veber rule (oral bioavailability)
    veber_pass = tpsa <= 140 and rotatable_bonds <= 10
    filters["veber"] = {
        "pass": veber_pass,
        "details": {
            "TPSA<=140": tpsa <= 140,
            "RotBonds<=10": rotatable_bonds <= 10,
        },
    }

    # Lead-likeness (Teague/Oprea)
    lead_like = mw <= 350 and logp <= 3.5 and rotatable_bonds <= 7
    filters["lead_likeness"] = {
        "pass": lead_like,
        "details": {
            "MW<=350": mw <= 350,
            "LogP<=3.5": logp <= 3.5,
            "RotBonds<=7": rotatable_bonds <= 7,
        },
    }

    # Ghose filter. NOTE(review): this uses heavy-atom count with 40-130
    # bounds as a proxy; the classic Ghose filter counts ALL atoms (incl. H)
    # with 20-70 bounds — confirm the intended variant before tightening.
    ghose_pass = (
        160 <= mw <= 480
        and -0.4 <= logp <= 5.6
        and 40 <= heavy_atoms <= 130  # using heavy atoms as proxy for atom count
        and 20 <= molmr <= 130
    )
    filters["ghose"] = {
        "pass": ghose_pass,
        "details": {
            "160<=MW<=480": 160 <= mw <= 480,
            "-0.4<=LogP<=5.6": -0.4 <= logp <= 5.6,
            # Include the atom-count criterion so 'details' explains every
            # component of 'pass' (previously omitted, so a failing filter
            # could show all-True details).
            "40<=HeavyAtoms<=130": 40 <= heavy_atoms <= 130,
            "20<=MR<=130": 20 <= molmr <= 130,
        },
    }

    # --- ADMET predictions (heuristic) ---
    predictions = {}

    # Oral absorption: TPSA/rotatable-bond heuristic; score penalizes
    # TPSA above 60 and rotatable bonds above 5, clamped to 0-100.
    oral_absorption = tpsa < 140 and rotatable_bonds <= 10
    oral_score = max(0, 100 - (max(0, tpsa - 60) * 0.8) - (max(0, rotatable_bonds - 5) * 5))
    predictions["oral_absorption"] = {
        "prediction": "likely" if oral_absorption else "poor",
        "score": round(min(100, oral_score), 1),
        "rationale": f"TPSA={tpsa:.0f} ({'<' if tpsa < 140 else '>='} 140), "
        f"RotBonds={rotatable_bonds} ({'<=' if rotatable_bonds <= 10 else '>'} 10)",
    }

    # BBB penetration: favors small, moderately lipophilic, low-TPSA molecules.
    bbb = tpsa < 90 and mw < 450 and 1 <= logp <= 3
    bbb_score = max(0, 100 - max(0, tpsa - 40) * 1.2 - max(0, mw - 300) * 0.3 - abs(logp - 2) * 15)
    predictions["bbb_penetration"] = {
        "prediction": "likely" if bbb else "unlikely",
        "score": round(min(100, bbb_score), 1),
        "rationale": f"TPSA={tpsa:.0f} ({'<' if tpsa < 90 else '>='} 90), "
        f"MW={mw:.0f} ({'<' if mw < 450 else '>='} 450), "
        f"LogP={logp:.1f} ({'in' if 1 <= logp <= 3 else 'outside'} 1-3)",
    }

    # hERG risk (rough heuristic)
    herg_risk = logp > 3.7 and mw > 400
    herg_concern = "elevated" if herg_risk else "low"
    predictions["herg_risk"] = {
        "prediction": herg_concern,
        "flag": herg_risk,
        "rationale": f"LogP={logp:.1f} ({'>' if logp > 3.7 else '<='} 3.7), "
        f"MW={mw:.0f} ({'>' if mw > 400 else '<='} 400). "
        f"Lipophilic, large molecules more likely to block hERG channel.",
    }

    # CYP liability: count simple structural risk factors.
    cyp_risk_factors = 0
    cyp_details = []
    if aromatic_rings >= 3:
        cyp_risk_factors += 1
        cyp_details.append(f"{aromatic_rings} aromatic rings (>=3)")
    if logp > 3:
        cyp_risk_factors += 1
        cyp_details.append(f"LogP={logp:.1f} (>3)")
    if mw > 500:
        cyp_risk_factors += 1
        cyp_details.append(f"MW={mw:.0f} (>500)")

    cyp_level = "high" if cyp_risk_factors >= 2 else "moderate" if cyp_risk_factors == 1 else "low"
    predictions["cyp_liability"] = {
        "prediction": cyp_level,
        "risk_factors": cyp_risk_factors,
        "details": cyp_details if cyp_details else ["No major CYP liability flags"],
    }

    # Solubility class (simplified Yalkowsky-based heuristic: logS ~ 0.5 - 0.01*(MP) - logP)
    # Without melting point, use MW as rough proxy: logS ~ 0.5 - 0.01*MW - logP
    log_s_est = 0.5 - 0.01 * mw - logp
    if log_s_est > -1:
        sol_class = "highly soluble"
    elif log_s_est > -3:
        sol_class = "soluble"
    elif log_s_est > -5:
        sol_class = "moderately soluble"
    elif log_s_est > -7:
        sol_class = "poorly soluble"
    else:
        sol_class = "insoluble"

    predictions["solubility"] = {
        "class": sol_class,
        "estimated_logS": round(log_s_est, 2),
        "rationale": f"Estimated logS={log_s_est:.2f} (Yalkowsky-type heuristic from MW and LogP)",
    }

    # --- Overall ADMET verdict ---
    flags = []
    if not filters["lipinski_ro5"]["pass"]:
        flags.append(f"Lipinski: {lipinski_violations} violations")
    if not veber_pass:
        flags.append("Fails Veber (oral bioavailability concern)")
    if herg_risk:
        flags.append("Elevated hERG risk")
    if cyp_level == "high":
        flags.append("High CYP liability")
    if sol_class in ("poorly soluble", "insoluble"):
        flags.append(f"Solubility: {sol_class}")

    if not flags:
        verdict = "FAVORABLE"
    elif len(flags) <= 2:
        verdict = "ACCEPTABLE"
    else:
        verdict = "UNFAVORABLE"

    summary_parts = [
        f"ADMET profile for {formula} (MW={mw:.0f}, LogP={logp:.1f}): {verdict}",
        f"Lipinski: {'PASS' if filters['lipinski_ro5']['pass'] else 'FAIL'} ({lipinski_violations} violations)",
        f"Oral absorption: {predictions['oral_absorption']['prediction']} (score {predictions['oral_absorption']['score']})",
        f"BBB: {predictions['bbb_penetration']['prediction']} (score {predictions['bbb_penetration']['score']})",
        f"hERG: {predictions['herg_risk']['prediction']}, CYP: {predictions['cyp_liability']['prediction']}",
        f"Solubility: {predictions['solubility']['class']} (logS~{log_s_est:.1f})",
    ]
    if flags:
        summary_parts.append(f"Flags: {'; '.join(flags)}")

    return {
        "summary": "\n".join(summary_parts),
        "verdict": verdict,
        "properties": properties,
        "filters": filters,
        "predictions": predictions,
        "flags": flags,
    }
1015
+
1016
+
1017
@registry.register(
    name="safety.ddi_predict",
    description="Predict drug-drug interaction potential based on CYP metabolism profile and molecular features",
    category="safety",
    parameters={
        "smiles": "SMILES string for the primary compound",
        "comedication_smiles": "SMILES string for a co-administered drug (optional)",
    },
    usage_guide="You need to assess drug-drug interaction risk for a compound, especially CYP-mediated interactions. Use when evaluating combination therapies or compounds likely to be co-prescribed. Identifies CYP inhibition/induction risk from structural features.",
)
def ddi_predict(smiles: str, comedication_smiles: str = None, **kwargs) -> dict:
    """Predict drug-drug interaction potential based on CYP metabolism profile.

    Uses structural features to estimate CYP inhibition risk for major isoforms
    (3A4, 2D6, 2C9, 2C19, 1A2). Optionally compares with a co-medication.

    Args:
        smiles: SMILES string (or text containing one) for the primary compound.
        comedication_smiles: Optional SMILES for a co-administered drug; when
            given, shared metabolic-pathway features are compared.

    Returns:
        dict with ``summary``, ``overall_risk`` (LOW/LOW-MODERATE/MODERATE/HIGH),
        ``cyp_profile``, ``motif_flags``, ``mechanism_based_inhibition``, and
        (when a co-medication is supplied) ``comedication_analysis``; or a dict
        with ``error``/``summary`` keys if RDKit is missing or the SMILES is
        unparseable.
    """
    from ct.tools.chemistry import _extract_smiles
    smiles = _extract_smiles(smiles)
    # Normalize the co-medication input the same way as the primary compound
    # (previously only the primary SMILES was extracted, so text-wrapped
    # co-medication input would fail to parse).
    if comedication_smiles:
        comedication_smiles = _extract_smiles(comedication_smiles)

    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, Crippen, rdMolDescriptors
    except ImportError:
        return {"error": "RDKit is required for DDI prediction. Install with: pip install rdkit", "summary": "RDKit is required for DDI prediction. Install with: pip install rdkit"}
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {"error": f"Invalid SMILES: {smiles}", "summary": f"Could not parse SMILES: {smiles}"}

    # Descriptors used by the isoform heuristics below.
    # (TPSA and HBA were previously computed here but never used — removed.)
    mw = Descriptors.MolWt(mol)
    logp = Crippen.MolLogP(mol)
    aromatic_rings = Descriptors.NumAromaticRings(mol)
    num_rings = Descriptors.RingCount(mol)

    # --- Detect structural motifs associated with CYP inhibition ---
    motif_flags = {}

    # Nitrogen heterocycles (CYP3A4 inhibition)
    n_heterocycle_pattern = Chem.MolFromSmarts("[nR]")  # ring nitrogen
    n_heterocycles = len(mol.GetSubstructMatches(n_heterocycle_pattern)) if n_heterocycle_pattern else 0
    motif_flags["nitrogen_heterocycles"] = n_heterocycles

    # Imidazole motif (strong CYP inhibition — azole antifungals)
    # Multiple SMARTS to catch both NH and N-substituted forms
    imidazole_patterns = [
        Chem.MolFromSmarts("c1cnc[nH]1"),  # unsubstituted
        Chem.MolFromSmarts("c1nccn1"),  # N-substituted imidazole
        Chem.MolFromSmarts("c1cncn1"),  # alternative numbering
    ]
    has_imidazole = any(
        pat is not None and bool(mol.GetSubstructMatches(pat))
        for pat in imidazole_patterns
    )

    # Triazole motif
    triazole_1 = Chem.MolFromSmarts("c1nncn1")
    triazole_2 = Chem.MolFromSmarts("c1nnn[nH]1")
    has_triazole = (
        (bool(mol.GetSubstructMatches(triazole_1)) if triazole_1 else False)
        or (bool(mol.GetSubstructMatches(triazole_2)) if triazole_2 else False)
    )
    motif_flags["has_imidazole"] = has_imidazole
    motif_flags["has_triazole"] = has_triazole
    motif_flags["has_azole"] = has_imidazole or has_triazole

    # Furanyl groups (mechanism-based CYP inhibition)
    furan = Chem.MolFromSmarts("c1ccoc1")
    has_furan = bool(mol.GetSubstructMatches(furan)) if furan else False
    motif_flags["has_furan"] = has_furan

    # Amine groups (CYP2D6 substrates/inhibitors); SMARTS excludes amides
    # and sulfonamides so only basic nitrogens are counted.
    basic_amine = Chem.MolFromSmarts("[NX3;!$(NC=O);!$(NS=O)]")
    n_basic_amines = len(mol.GetSubstructMatches(basic_amine)) if basic_amine else 0
    motif_flags["basic_amines"] = n_basic_amines

    # --- CYP isoform risk assessment ---
    cyp_profile = {}

    # CYP3A4 — the major drug-metabolizing enzyme
    cyp3a4_score = 0
    cyp3a4_reasons = []
    if has_imidazole or has_triazole:
        cyp3a4_score += 3
        cyp3a4_reasons.append("Azole motif (strong CYP3A4 inhibition)")
    if n_heterocycles >= 2:
        cyp3a4_score += 1
        cyp3a4_reasons.append(f"{n_heterocycles} nitrogen heterocycles")
    if mw > 400 and logp > 3:
        cyp3a4_score += 1
        cyp3a4_reasons.append(f"Large lipophilic molecule (MW={mw:.0f}, LogP={logp:.1f})")
    cyp_profile["CYP3A4"] = {
        "inhibition_risk": "high" if cyp3a4_score >= 3 else "moderate" if cyp3a4_score >= 1 else "low",
        "score": cyp3a4_score,
        "reasons": cyp3a4_reasons if cyp3a4_reasons else ["No major CYP3A4 inhibition flags"],
    }

    # CYP2D6
    cyp2d6_score = 0
    cyp2d6_reasons = []
    if n_basic_amines >= 1:
        cyp2d6_score += 1
        cyp2d6_reasons.append(f"{n_basic_amines} basic amine(s) — CYP2D6 substrate/inhibitor feature")
    if aromatic_rings >= 2 and n_basic_amines >= 1:
        cyp2d6_score += 1
        cyp2d6_reasons.append("Lipophilic amine — classic CYP2D6 inhibitor pharmacophore")
    cyp_profile["CYP2D6"] = {
        "inhibition_risk": "high" if cyp2d6_score >= 2 else "moderate" if cyp2d6_score >= 1 else "low",
        "score": cyp2d6_score,
        "reasons": cyp2d6_reasons if cyp2d6_reasons else ["No major CYP2D6 inhibition flags"],
    }

    # CYP2C9
    cyp2c9_score = 0
    cyp2c9_reasons = []
    if logp > 3 and aromatic_rings >= 2:
        cyp2c9_score += 1
        cyp2c9_reasons.append("Lipophilic aromatic compound")
    # Acidic groups — CYP2C9 substrates tend to be weak acids
    carboxylic = Chem.MolFromSmarts("[CX3](=O)[OX2H1]")
    has_acid = bool(mol.GetSubstructMatches(carboxylic)) if carboxylic else False
    if has_acid:
        cyp2c9_score += 1
        cyp2c9_reasons.append("Carboxylic acid group — CYP2C9 substrate feature")
    cyp_profile["CYP2C9"] = {
        "inhibition_risk": "moderate" if cyp2c9_score >= 1 else "low",
        "score": cyp2c9_score,
        "reasons": cyp2c9_reasons if cyp2c9_reasons else ["No major CYP2C9 inhibition flags"],
    }

    # CYP2C19
    cyp2c19_score = 0
    cyp2c19_reasons = []
    if has_imidazole:
        cyp2c19_score += 2
        cyp2c19_reasons.append("Imidazole motif (CYP2C19 inhibitor feature)")
    if n_heterocycles >= 2 and mw < 500:
        cyp2c19_score += 1
        cyp2c19_reasons.append("Multiple nitrogen heterocycles")
    cyp_profile["CYP2C19"] = {
        "inhibition_risk": "high" if cyp2c19_score >= 2 else "moderate" if cyp2c19_score >= 1 else "low",
        "score": cyp2c19_score,
        "reasons": cyp2c19_reasons if cyp2c19_reasons else ["No major CYP2C19 inhibition flags"],
    }

    # CYP1A2
    cyp1a2_score = 0
    cyp1a2_reasons = []
    if aromatic_rings >= 3:
        cyp1a2_score += 1
        cyp1a2_reasons.append(f"{aromatic_rings} aromatic rings — planar aromatic CYP1A2 substrate")
    # Fused ring systems
    if num_rings >= 3 and aromatic_rings >= 2:
        cyp1a2_score += 1
        cyp1a2_reasons.append("Polycyclic aromatic system")
    cyp_profile["CYP1A2"] = {
        "inhibition_risk": "moderate" if cyp1a2_score >= 1 else "low",
        "score": cyp1a2_score,
        "reasons": cyp1a2_reasons if cyp1a2_reasons else ["No major CYP1A2 inhibition flags"],
    }

    # --- Mechanism-based inhibition (MBI) risk ---
    mbi_risk = False
    mbi_reasons = []
    if has_furan:
        mbi_risk = True
        mbi_reasons.append("Furan ring — known MBI risk (bioactivated to reactive epoxide)")
    # Terminal alkyne
    alkyne = Chem.MolFromSmarts("[CX2]#[CX2H1]")
    if alkyne and mol.GetSubstructMatches(alkyne):
        mbi_risk = True
        mbi_reasons.append("Terminal alkyne — potential MBI via ketene intermediate")
    # Methylenedioxy
    mdp = Chem.MolFromSmarts("c1cc2OCOc2cc1")
    if mdp and mol.GetSubstructMatches(mdp):
        mbi_risk = True
        mbi_reasons.append("Methylenedioxy group — known CYP MBI risk (carbene formation)")

    # --- Overall DDI risk ---
    high_risk_cyps = [k for k, v in cyp_profile.items() if v["inhibition_risk"] == "high"]
    moderate_risk_cyps = [k for k, v in cyp_profile.items() if v["inhibition_risk"] == "moderate"]

    if high_risk_cyps or mbi_risk:
        overall_risk = "HIGH"
    elif len(moderate_risk_cyps) >= 2:
        overall_risk = "MODERATE"
    elif moderate_risk_cyps:
        overall_risk = "LOW-MODERATE"
    else:
        overall_risk = "LOW"

    # --- Co-medication analysis ---
    comedication_analysis = None
    if comedication_smiles:
        comol = Chem.MolFromSmiles(comedication_smiles)
        if comol is not None:
            co_mw = Descriptors.MolWt(comol)
            co_logp = Crippen.MolLogP(comol)
            co_aromatic = Descriptors.NumAromaticRings(comol)

            # Check if comedication shares metabolic pathway features
            co_n_het = Chem.MolFromSmarts("[nR]")
            co_n_heterocycles = len(comol.GetSubstructMatches(co_n_het)) if co_n_het else 0
            co_basic_amine = Chem.MolFromSmarts("[NX3;!$(NC=O);!$(NS=O)]")
            co_amines = len(comol.GetSubstructMatches(co_basic_amine)) if co_basic_amine else 0

            shared_pathways = []
            if (n_heterocycles >= 2 or has_imidazole) and co_n_heterocycles >= 2:
                shared_pathways.append("CYP3A4 (both contain N-heterocycles)")
            if n_basic_amines >= 1 and co_amines >= 1:
                shared_pathways.append("CYP2D6 (both contain basic amines)")
            if logp > 3 and co_logp > 3:
                shared_pathways.append("General CYP competition (both lipophilic)")

            interaction_risk = "high" if shared_pathways else "low"

            comedication_analysis = {
                "comedication_smiles": comedication_smiles,
                "comedication_mw": round(co_mw, 1),
                "comedication_logp": round(co_logp, 2),
                "shared_metabolic_pathways": shared_pathways,
                "interaction_risk": interaction_risk,
                "recommendation": (
                    f"Monitor for interactions via {', '.join(shared_pathways)}"
                    if shared_pathways
                    else "Low structural overlap in CYP-relevant features"
                ),
            }
        else:
            comedication_analysis = {"error": f"Invalid co-medication SMILES: {comedication_smiles}"}

    # --- Summary ---
    summary_lines = [
        f"DDI risk assessment: {overall_risk}",
    ]
    if high_risk_cyps:
        summary_lines.append(f"High CYP inhibition risk: {', '.join(high_risk_cyps)}")
    if moderate_risk_cyps:
        summary_lines.append(f"Moderate CYP inhibition risk: {', '.join(moderate_risk_cyps)}")
    if mbi_risk:
        summary_lines.append(f"Mechanism-based inhibition risk: {'; '.join(mbi_reasons)}")
    if motif_flags["has_azole"]:
        summary_lines.append("Contains azole motif — strong CYP inhibitor pharmacophore")
    if comedication_analysis and isinstance(comedication_analysis, dict) and "shared_metabolic_pathways" in comedication_analysis:
        if comedication_analysis["shared_metabolic_pathways"]:
            summary_lines.append(f"Co-medication interaction via: {', '.join(comedication_analysis['shared_metabolic_pathways'])}")
        else:
            summary_lines.append("Low metabolic pathway overlap with co-medication")

    result = {
        "summary": "\n".join(summary_lines),
        "overall_risk": overall_risk,
        "cyp_profile": cyp_profile,
        "motif_flags": motif_flags,
        "mechanism_based_inhibition": {
            "risk": mbi_risk,
            "reasons": mbi_reasons if mbi_reasons else ["No MBI structural alerts"],
        },
    }

    if comedication_analysis:
        result["comedication_analysis"] = comedication_analysis

    return result