celltype-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/tools/safety.py
ADDED
|
@@ -0,0 +1,1280 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Safety profiling tools: anti-target screening, multi-modal safety classification, SALL4 risk.
|
|
3
|
+
|
|
4
|
+
References crews-glue-discovery/analysis/safety_profile.md for classification logic
|
|
5
|
+
and anti-target lists.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import numpy as np
|
|
10
|
+
from ct.tools import registry
|
|
11
|
+
from ct.tools.http_client import request_json
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# UniProt accession → gene symbol mapping for safety-relevant proteins.
# The proteomics matrix uses UniProt IDs as row index; all gene-symbol
# lookups must go through this mapping.
#
# NOTE(review): the accessions for SALL3, IKZF2, IKZF4, IKZF5, ZFP91 and
# SMAD4 were corrected by hand (e.g. Q13315 is ATM, not SMAD4, and Q96SW2 is
# CRBN itself, not ZFP91). Re-verify all entries against UniProtKB reviewed
# human records before relying on them for a safety call.
UNIPROT_TO_GENE = {
    # SALL family (teratogenicity)
    "Q9UJQ4": "SALL4", "Q9NSC2": "SALL1", "Q9Y467": "SALL2", "Q9BXA9": "SALL3",
    # IKZF family (heme TFs / CRBN substrates)
    "Q13422": "IKZF1", "Q9UKT9": "IKZF3", "Q9H2S9": "IKZF4",
    "Q9UKS7": "IKZF2", "Q9H5V7": "IKZF5",
    # Other CRBN substrates
    "P15170": "GSPT1", "Q8IYD1": "GSPT2", "P48729": "CSNK1A1", "Q96JP5": "ZFP91",
    # Tumor suppressors
    "P04637": "TP53", "P06400": "RB1", "P60484": "PTEN", "P25054": "APC",
    "P38398": "BRCA1", "P51587": "BRCA2", "P40337": "VHL",
    "P21359": "NF1", "P35240": "NF2",
    "P42771": "CDKN2A", "P42772": "CDKN2B", "P19544": "WT1",
    "Q13485": "SMAD4", "Q15831": "STK11", "Q969H0": "FBXW7", "Q92560": "BAP1",
    "O14497": "ARID1A", "Q8NFD5": "ARID1B", "Q68CP9": "ARID2",
    "O14686": "KMT2D", "Q8NEZ4": "KMT2C", "Q9BYW2": "SETD2",
    "Q01196": "RUNX1", "Q13761": "RUNX3", "P23771": "GATA3",
    "P10914": "IRF1", "O15524": "SOCS1",
    # Heme TFs
    "P15976": "GATA1", "P23769": "GATA2", "P17542": "TAL1",
    "P17947": "SPI1", "P49715": "CEBPA", "P17676": "CEBPB",
    "P10242": "MYB", "P41212": "ETV6", "Q01543": "FLI1",
    # TP63 (teratogenic)
    "Q9H3D4": "TP63",
}
# Inverse lookup (gene symbol → accession); relies on gene symbols being unique above.
GENE_TO_UNIPROT = {v: k for k, v in UNIPROT_TO_GENE.items()}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _gene_ids(gene_set, all_proteins):
    """Return the IDs in *all_proteins* that correspond to any gene in *gene_set*.

    An ID matches either directly (the index already uses gene symbols) or
    through ``UNIPROT_TO_GENE`` (the index uses UniProt accessions).

    Args:
        gene_set: Iterable of gene symbols to look for.
        all_proteins: Iterable of row IDs present in the data (gene symbols
            and/or UniProt accessions).

    Returns:
        A new ``set`` of the matching IDs, spelled as they appear in
        *all_proteins*.
    """
    # Coerce both inputs so frozensets, lists or a pandas Index work too and
    # the result is always a mutable set independent of the arguments.
    wanted = set(gene_set)
    present = set(all_proteins)

    # Direct gene-symbol matches (in case the index already uses symbols)
    hits = wanted & present
    # UniProt-ID matches
    hits.update(
        uid
        for uid, gene in UNIPROT_TO_GENE.items()
        if gene in wanted and uid in present
    )
    return hits
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _display_name(protein_id):
    """Format *protein_id* for display.

    Mapped UniProt accessions are rendered as 'GENE (UNIPROT)'; any other ID
    is returned unchanged.
    """
    symbol = UNIPROT_TO_GENE.get(protein_id)
    return f"{symbol} ({protein_id})" if symbol else protein_id
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Tumor suppressor genes: degrading any of them is treated as an anti-target hit.
TUMOR_SUPPRESSORS = {
    "TP53", "RB1", "PTEN",
    "APC", "BRCA1", "BRCA2",
    "VHL", "NF1", "NF2",
    "CDKN2A", "CDKN2B", "WT1",
    "SMAD4", "STK11", "FBXW7",
    "BAP1", "ARID1A", "ARID1B",
    "ARID2", "KMT2D", "KMT2C",
    "SETD2", "RUNX1", "RUNX3",
    "GATA3", "IRF1", "SOCS1",
}

# Essential hematopoietic transcription factors (high-risk degradation targets).
HEME_TFS = {
    "IKZF1", "IKZF3", "IKZF4",
    "GATA1", "GATA2", "TAL1",
    "RUNX1", "SPI1", "CEBPA",
    "CEBPB", "MYB", "ETV6",
    "FLI1",
}

# CRBN substrates associated with teratogenicity.
TERATOGENIC_SUBSTRATES = {
    # limb development TFs
    "SALL4",
    "SALL1",
    "SALL3",
    # epithelial development (both spellings are matched)
    "p63",
    "TP63",
}

# Known CRBN neosubstrates (degraded by IMiDs/molecular glues).
CRBN_SUBSTRATES = {
    "IKZF1", "IKZF3",
    "CK1A", "CSNK1A1",
    "GSPT1", "GSPT2",
    "ZFP91",
    "AIOLOS", "IKAROS",
}

# openFDA endpoints queried by the FAERS and label tools in this module.
_OPENFDA_DRUG_EVENT_URL = "https://api.fda.gov/drug/event.json"
_OPENFDA_DRUG_LABEL_URL = "https://api.fda.gov/drug/label.json"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _openfda_escape(term: str) -> str:
    """Make *term* safe to embed inside a double-quoted openFDA search phrase.

    Falsy input yields an empty string; surrounding whitespace is removed.
    """
    text = str(term) if term else ""
    # Double the backslashes first: doing it after escaping the quotes would
    # also double the backslashes that the quote escaping introduces.
    text = text.replace("\\", "\\\\")
    text = text.replace('"', '\\"')
    return text.strip()
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _openfda_total(search: str = "") -> tuple[int | None, str | None]:
    """Return the number of openFDA drug-event records matching *search*.

    Args:
        search: openFDA search expression. An empty string sends no
            ``search`` parameter, i.e. counts every record.

    Returns:
        ``(total, None)`` on success, or ``(None, message)`` when the request
        reports an error or the payload carries no usable
        ``meta.results.total``.
    """
    # Only the metadata total is read, so ask for a single record.
    params = {"limit": "1"}
    if search:
        params["search"] = search

    data, error = request_json(
        "GET",
        _OPENFDA_DRUG_EVENT_URL,
        params=params,
        timeout=20,
        retries=2,
    )
    if error:
        return None, error

    missing = "openFDA response missing total count"

    # Walk the payload defensively: a None/non-dict body (or a null "meta" /
    # "results") is reported as a missing total instead of raising.
    meta = data.get("meta") if isinstance(data, dict) else None
    results = meta.get("results") if isinstance(meta, dict) else None
    total = results.get("total") if isinstance(results, dict) else None

    try:
        return int(total), None
    except (TypeError, ValueError, OverflowError):
        return None, missing
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _faers_signal_metrics(
|
|
124
|
+
a: int,
|
|
125
|
+
b: int,
|
|
126
|
+
c: int,
|
|
127
|
+
d: int,
|
|
128
|
+
*,
|
|
129
|
+
min_case_count: int = 3,
|
|
130
|
+
) -> dict:
|
|
131
|
+
"""Compute basic disproportionality metrics (PRR/ROR/chi-square)."""
|
|
132
|
+
import math
|
|
133
|
+
|
|
134
|
+
a = max(int(a), 0)
|
|
135
|
+
b = max(int(b), 0)
|
|
136
|
+
c = max(int(c), 0)
|
|
137
|
+
d = max(int(d), 0)
|
|
138
|
+
|
|
139
|
+
# Haldane-Anscombe correction stabilizes estimates when cells are zero.
|
|
140
|
+
ac, bc, cc, dc = [x + 0.5 for x in (a, b, c, d)]
|
|
141
|
+
|
|
142
|
+
prr = (ac / (ac + bc)) / (cc / (cc + dc))
|
|
143
|
+
ror = (ac / bc) / (cc / dc)
|
|
144
|
+
se_log_ror = math.sqrt((1 / ac) + (1 / bc) + (1 / cc) + (1 / dc))
|
|
145
|
+
ror_ci95_lower = math.exp(math.log(ror) - 1.96 * se_log_ror)
|
|
146
|
+
ror_ci95_upper = math.exp(math.log(ror) + 1.96 * se_log_ror)
|
|
147
|
+
|
|
148
|
+
total = a + b + c + d
|
|
149
|
+
denom = (a + b) * (c + d) * (a + c) * (b + d)
|
|
150
|
+
chi_square = ((total * ((a * d - b * c) ** 2)) / denom) if denom > 0 else 0.0
|
|
151
|
+
|
|
152
|
+
# Classic pharmacovigilance heuristic gate.
|
|
153
|
+
signal = (a >= min_case_count) and (prr >= 2.0) and (chi_square >= 4.0)
|
|
154
|
+
|
|
155
|
+
return {
|
|
156
|
+
"prr": round(float(prr), 4),
|
|
157
|
+
"ror": round(float(ror), 4),
|
|
158
|
+
"ror_ci95_lower": round(float(ror_ci95_lower), 4),
|
|
159
|
+
"ror_ci95_upper": round(float(ror_ci95_upper), 4),
|
|
160
|
+
"chi_square": round(float(chi_square), 4),
|
|
161
|
+
"meets_signal_criteria": bool(signal),
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
@registry.register(
    name="safety.antitarget_profile",
    description="Screen degradation data for anti-target hits (tumor suppressors, essential genes, heme TFs)",
    category="safety",
    parameters={
        "compound_id": "Compound to profile (or 'all')",
        "lfc_threshold": "LFC threshold for degradation call (default -0.5)",
    },
    requires_data=["proteomics"],
    usage_guide="You need to check if a compound degrades dangerous off-targets (tumor suppressors, essential heme TFs, teratogenic substrates). Run this first in any safety assessment workflow.",
)
def antitarget_profile(compound_id: str = "all", lfc_threshold: float = -0.5, **kwargs) -> dict:
    """Screen proteomics data for degradation of anti-target proteins.

    Anti-targets: tumor suppressors, essential heme TFs, teratogenic substrates,
    and known CRBN substrates. Degrading these = safety liability.

    Args:
        compound_id: Compound to profile, or "all" for every proteomics column.
        lfc_threshold: A protein counts as degraded when its LFC is strictly
            below this value.
        **kwargs: Accepted and ignored.

    Returns:
        On success a dict with ``summary``, ``n_screened`` (distinct
        anti-target IDs present in the data), ``antitarget_counts`` and
        ``profiles``. For a single compound ``profiles`` keeps the per-protein
        ``flagged_proteins`` detail; for "all" it is the per-compound table
        sorted by descending penalty without that detail. When the proteomics
        data cannot be loaded, a dict with ``error`` and ``summary``.
    """
    from ct.tools._compound_resolver import resolve_compound

    # Penalty added per degraded protein, by category (teratogenic = highest risk).
    penalty_weights = {
        "teratogenic": 10.0,
        "heme_tfs": 5.0,
        "tumor_suppressors": 3.0,
        "crbn_substrates": 2.0,
    }

    # Tool arguments may arrive as strings; compare against a real float.
    lfc_threshold = float(lfc_threshold)

    if compound_id != "all":
        compound_id = resolve_compound(compound_id, dataset="proteomics")

    try:
        from ct.data.loaders import load_proteomics
        prot = load_proteomics()
    except FileNotFoundError:
        return {
            "error": "Proteomics data not available.",
            "summary": "Proteomics data not available — skipping. Provide proteomics data for full analysis.",
        }

    compounds = [compound_id] if compound_id != "all" else prot.columns.tolist()
    all_proteins = set(prot.index)

    # Categorize known anti-targets present in data (handles both gene symbols and UniProt IDs).
    # Insertion order fixes the order of the flagged-protein listing.
    present = {
        "tumor_suppressors": _gene_ids(TUMOR_SUPPRESSORS, all_proteins),
        "heme_tfs": _gene_ids(HEME_TFS, all_proteins),
        "teratogenic": _gene_ids(TERATOGENIC_SUBSTRATES, all_proteins),
        "crbn_substrates": _gene_ids(CRBN_SUBSTRATES, all_proteins),
    }

    results = []
    for cpd in compounds:
        if cpd not in prot.columns:
            continue

        values = prot[cpd].dropna()
        degraded = values[values < lfc_threshold]

        # Degraded proteins per anti-target category. A protein that belongs
        # to several categories is listed (and penalized) once per category.
        hits = {
            category: sorted(p for p in degraded.index if p in ids)
            for category, ids in present.items()
        }

        n_antitargets = sum(len(v) for v in hits.values())

        # Compute safety penalty score
        penalty = sum(penalty_weights[category] * len(v) for category, v in hits.items())

        # Get LFC values for flagged proteins
        flagged_details = [
            {
                "protein": _display_name(p),
                "protein_id": p,
                "category": category,
                "lfc": round(float(values[p]), 3),
            }
            for category, proteins in hits.items()
            for p in proteins
        ]

        results.append({
            "compound": cpd,
            "n_total_degraded": len(degraded),
            "n_antitargets": n_antitargets,
            "n_tumor_suppressors": len(hits["tumor_suppressors"]),
            "n_heme_tfs": len(hits["heme_tfs"]),
            "n_teratogenic": len(hits["teratogenic"]),
            "n_crbn_substrates": len(hits["crbn_substrates"]),
            "safety_penalty": round(penalty, 1),
            "flagged_proteins": flagged_details,
        })

    df = pd.DataFrame([{k: v for k, v in r.items() if k != "flagged_proteins"} for r in results])
    if len(df) > 0:
        df = df.sort_values("safety_penalty", ascending=False)

    if compound_id != "all":
        if results:
            r = results[0]
            flagged_str = ", ".join([f"{d['protein']}({d['category']})" for d in r["flagged_proteins"]])
            summary = (
                f"Anti-target profile for {compound_id}: "
                f"{r['n_antitargets']} anti-targets hit, "
                f"penalty={r['safety_penalty']}\n"
                f"Flagged: {flagged_str if flagged_str else 'none'}"
            )
        else:
            # Do not report "0 anti-targets hit" for a compound that was never
            # screened: that would read as a clean safety result.
            summary = (
                f"Anti-target profile for {compound_id}: compound not found in "
                "proteomics data -- no anti-target assessment performed"
            )
    else:
        n_clean = int((df["n_antitargets"] == 0).sum()) if len(df) > 0 else 0
        summary = (
            f"Anti-target screening: {len(df)} compounds profiled\n"
            f"Clean (0 anti-targets): {n_clean}/{len(df)}"
        )

    return {
        "summary": summary,
        "n_screened": len(set().union(*present.values())),
        "antitarget_counts": {category: len(ids) for category, ids in present.items()},
        "profiles": results if compound_id != "all" else df.to_dict("records"),
    }
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
@registry.register(
    name="safety.classify",
    description="Classify compound safety as SAFE/CAUTION/DANGEROUS based on multi-modal profiling",
    category="safety",
    parameters={
        "compound_id": "Compound to classify (or 'all')",
    },
    requires_data=["proteomics", "prism"],
    usage_guide="You need a quick safety verdict (SAFE/CAUTION/DANGEROUS) before advancing a compound. Combines anti-target profile with viability breadth. Run after antitarget_profile for full context.",
)
def classify(compound_id: str = "all", **kwargs) -> dict:
    """Multi-modal safety classification.

    Classification rules, evaluated in this order:
    - DANGEROUS: degrades any teratogenic substrate OR safety_penalty >= 15
    - CAUTION: degrades tumor suppressors OR heme TFs OR safety_penalty >= 5
    - CAUTION: viability breadth > 0.8 (kills most cell lines)
    - SAFE: everything else. Note that a few CRBN-substrate-only hits keep
      the penalty below 5 and therefore still classify as SAFE.

    Viability breadth is the fraction of cell lines whose mean PRISM LFC at
    the compound's highest dose is below -0.5; it is 0.0 when PRISM has no
    usable rows for the compound.

    Args:
        compound_id: Compound to classify, or "all".
        **kwargs: Accepted and ignored.

    Returns:
        Dict with ``summary``, ``classifications`` (one record per compound)
        and ``distribution`` (count per class), or an ``error``/``summary``
        dict when proteomics or PRISM data is unavailable.
    """
    # Get anti-target profile (handles missing proteomics internally)
    at_result = antitarget_profile(compound_id=compound_id)
    if "error" in at_result:
        return at_result

    profiles = at_result["profiles"]

    # Get viability breadth from PRISM
    try:
        from ct.data.loaders import load_prism
        prism = load_prism()
    except FileNotFoundError:
        return {
            "error": "PRISM data not available.",
            "summary": "PRISM data not available — skipping. Run: ct data pull prism",
        }

    results = []
    for profile in profiles:
        cpd = profile["compound"]
        penalty = profile["safety_penalty"]

        # Viability breadth
        cpd_data = prism[prism["pert_name"] == cpd]
        breadth = 0.0
        if len(cpd_data) > 0:
            max_dose = cpd_data["pert_dose"].max()
            cpd_hd = cpd_data[cpd_data["pert_dose"] == max_dose]
            per_cell = cpd_hd.groupby("ccle_name")["LFC"].mean()
            # An empty selection (e.g. all doses missing) would give NaN,
            # which silently zeroes the safety score; keep the 0.0 default.
            if len(per_cell) > 0:
                breadth = float((per_cell < -0.5).mean())

        # Classification
        if profile["n_teratogenic"] > 0 or penalty >= 15:
            classification = "DANGEROUS"
        elif profile["n_tumor_suppressors"] > 0 or profile["n_heme_tfs"] > 0 or penalty >= 5:
            classification = "CAUTION"
        elif breadth > 0.8:
            classification = "CAUTION"  # kills too many cell lines = nonspecific toxicity
        else:
            classification = "SAFE"

        # Safety score (0-100, higher = safer)
        safety_score = max(0, 100 - penalty * 5 - breadth * 30)

        results.append({
            "compound": cpd,
            "classification": classification,
            "safety_score": round(safety_score, 1),
            "safety_penalty": penalty,
            "viability_breadth": round(breadth, 3),
            "n_antitargets": profile["n_antitargets"],
            "n_tumor_suppressors": profile["n_tumor_suppressors"],
            "n_heme_tfs": profile["n_heme_tfs"],
            "n_teratogenic": profile["n_teratogenic"],
        })

    df = pd.DataFrame(results)

    if len(df) > 0:
        counts = df["classification"].value_counts().to_dict()
        safe = counts.get("SAFE", 0)
        caution = counts.get("CAUTION", 0)
        dangerous = counts.get("DANGEROUS", 0)
    else:
        safe = caution = dangerous = 0

    if compound_id != "all" and results:
        r = results[0]
        summary = (
            f"Safety classification for {compound_id}: {r['classification']}\n"
            f"Score: {r['safety_score']}/100, Penalty: {r['safety_penalty']}, "
            f"Viability breadth: {r['viability_breadth']:.1%}"
        )
    else:
        summary = (
            f"Safety classification: {len(df)} compounds\n"
            f"SAFE: {safe}, CAUTION: {caution}, DANGEROUS: {dangerous}"
        )

    return {
        "summary": summary,
        "classifications": results,
        "distribution": {"SAFE": safe, "CAUTION": caution, "DANGEROUS": dangerous},
    }
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
@registry.register(
    name="safety.sall4_risk",
    description="Assess SALL4 degradation risk for IMiD-type molecular glue compounds (teratogenicity marker)",
    category="safety",
    parameters={
        "compound_id": "Compound to check (or 'all')",
    },
    requires_data=["proteomics"],
    usage_guide="You are working with CRBN-based molecular glues and need to assess teratogenicity risk. SALL4 degradation was the molecular cause of thalidomide birth defects — critical safety check for any IMiD-type compound.",
)
def sall4_risk(compound_id: str = "all", **kwargs) -> dict:
    """Check for SALL4 degradation -- the key teratogenicity signal for IMiD-type compounds.

    SALL4 is a zinc finger TF essential for limb development. Its degradation by
    thalidomide via CRBN was the molecular cause of thalidomide teratogenicity.
    Any CRBN-based molecular glue that degrades SALL4 is a teratogenicity risk.

    Risk levels per compound:
    - HIGH: SALL4 LFC < -0.5
    - MODERATE: another SALL family member has LFC < -0.5
    - LOW: SALL4 LFC < -0.3
    - MINIMAL: none of the above

    Args:
        compound_id: Compound to check, or "all" for every proteomics column.
        **kwargs: Accepted and ignored.

    Returns:
        Dict with ``summary``, ``sall_proteins_in_data`` and ``assessments``;
        a dict with ``risk_assessment`` "UNKNOWN" when no SALL protein is in
        the data; or an ``error``/``summary`` dict when proteomics data is
        unavailable.
    """
    from ct.tools._compound_resolver import resolve_compound
    if compound_id != "all":
        compound_id = resolve_compound(compound_id, dataset="proteomics")

    try:
        from ct.data.loaders import load_proteomics
        prot = load_proteomics()
    except FileNotFoundError:
        return {
            "error": "Proteomics data not available.",
            "summary": "Proteomics data not available — skipping. Provide proteomics data for full analysis.",
        }

    # Check for SALL family proteins (handles both gene symbols and UniProt IDs)
    sall_uniprot = {uid: gene for uid, gene in UNIPROT_TO_GENE.items() if gene.startswith("SALL")}
    sall_proteins = []  # list of (index_id, gene_symbol)
    for p in prot.index:
        # Index labels are not guaranteed to be strings (e.g. a NaN row label).
        if isinstance(p, str) and p.startswith("SALL"):
            sall_proteins.append((p, p))
        elif p in sall_uniprot:
            sall_proteins.append((p, sall_uniprot[p]))

    if not sall_proteins:
        return {
            "summary": "SALL proteins not detected in proteomics data -- cannot assess teratogenicity risk",
            "sall_proteins_in_data": [],
            "risk_assessment": "UNKNOWN",
        }

    compounds = [compound_id] if compound_id != "all" else prot.columns.tolist()
    results = []

    for cpd in compounds:
        if cpd not in prot.columns:
            continue

        sall_values = {}  # gene_symbol -> LFC
        for idx_id, gene in sall_proteins:
            val = prot.loc[idx_id, cpd]
            if pd.notna(val):
                sall_values[gene] = float(val)

        # Risk assessment
        sall4_lfc = sall_values.get("SALL4")
        degraded_salls = {k: round(v, 3) for k, v in sall_values.items() if v < -0.5}
        sall4_degraded = sall4_lfc is not None and sall4_lfc < -0.5

        if sall4_degraded:
            risk = "HIGH"
            risk_detail = f"SALL4 degraded (LFC={sall4_lfc:.2f}) -- thalidomide-like teratogenicity risk"
        elif degraded_salls:
            risk = "MODERATE"
            risk_detail = f"SALL family member(s) degraded: {degraded_salls} -- potential teratogenicity"
        elif sall4_lfc is not None and sall4_lfc < -0.3:
            risk = "LOW"
            risk_detail = f"SALL4 mildly reduced (LFC={sall4_lfc:.2f}) -- monitor in follow-up"
        else:
            risk = "MINIMAL"
            risk_detail = "No SALL degradation detected"

        results.append({
            "compound": cpd,
            "risk_level": risk,
            "risk_detail": risk_detail,
            "sall_values": sall_values,
        })

    sall_names = [gene for _, gene in sall_proteins]

    if compound_id != "all":
        if results:
            r = results[0]
            summary = f"SALL4 risk for {compound_id}: {r['risk_level']} -- {r['risk_detail']}"
        else:
            # The requested compound has no proteomics column: say so instead
            # of reporting an empty "0 compounds" tally.
            summary = (
                f"SALL4 risk for {compound_id}: compound not found in proteomics data "
                "-- risk not assessed"
            )
    else:
        risk_counts = {}
        for r in results:
            risk_counts[r["risk_level"]] = risk_counts.get(r["risk_level"], 0) + 1
        summary = f"SALL4 risk assessment: {len(results)} compounds -- {risk_counts}"

    return {
        "summary": summary,
        "sall_proteins_in_data": sall_names,
        "assessments": results,
    }
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
@registry.register(
    name="safety.faers_signal_scan",
    description="Scan openFDA FAERS adverse-event reports for disproportionality signals (PRR/ROR) for a drug",
    category="safety",
    parameters={
        "drug_name": "Drug name to scan (generic or brand name)",
        "event": "Optional specific MedDRA preferred term to evaluate",
        "top_n": "If event not provided, evaluate top N reported events for this drug (default 5)",
        "min_case_count": "Minimum A-count threshold for signal flagging (default 3)",
    },
    usage_guide=(
        "Use for post-marketing pharmacovigilance triage. Computes disproportionality metrics "
        "(PRR/ROR/chi-square) from openFDA FAERS counts and flags candidate safety signals."
    ),
)
def faers_signal_scan(
    drug_name: str,
    event: str = "",
    top_n: int = 5,
    min_case_count: int = 3,
    **kwargs,
) -> dict:
    """Run a disproportionality safety scan using openFDA FAERS.

    Builds a 2x2 table per event from four openFDA counts (all reports, drug
    reports, event reports, drug+event reports) and scores it with
    ``_faers_signal_metrics``.

    Args:
        drug_name: Drug name matched exactly against
            ``patient.drug.medicinalproduct``.
        event: Optional reaction term to evaluate. When empty, the drug's
            most reported reaction terms are evaluated instead.
        top_n: Number of top reaction terms to evaluate (clamped to 1..20).
        min_case_count: Minimum drug+event count for flagging (at least 1).
        **kwargs: Accepted and ignored.

    Returns:
        Dict with ``summary``, the totals, ``criteria``, ``signals`` (sorted
        with flagged events first, then by PRR/ROR descending) and ``errors``
        (per-event lookup failures); or a dict with ``error`` and ``summary``
        when a required lookup fails.
    """
    if not drug_name or not drug_name.strip():
        return {"error": "drug_name is required", "summary": "No drug name provided"}

    drug_term = _openfda_escape(drug_name)
    if not drug_term:
        return {"error": "drug_name is required", "summary": "No drug name provided"}

    # Normalize once; a None event must not break the final payload.
    event_filter = (event or "").strip()
    top_n = max(1, min(int(top_n or 5), 20))
    min_case_count = max(1, int(min_case_count or 3))

    drug_search = f'patient.drug.medicinalproduct.exact:"{drug_term}"'

    all_total, error = _openfda_total("")
    if error:
        return {"error": f"openFDA total lookup failed: {error}", "summary": f"FAERS scan failed: {error}"}
    drug_total, error = _openfda_total(drug_search)
    if error:
        return {"error": f"openFDA drug lookup failed: {error}", "summary": f"FAERS scan failed for {drug_name}: {error}"}

    if drug_total <= 0:
        return {
            "drug_name": drug_name,
            "total_reports_for_drug": 0,
            "signals": [],
            "summary": f"No FAERS reports found for '{drug_name}'",
        }

    if event_filter:
        events_to_scan = [event_filter]
    else:
        data, count_error = request_json(
            "GET",
            _OPENFDA_DRUG_EVENT_URL,
            params={
                "search": drug_search,
                "count": "patient.reaction.reactionmeddrapt.exact",
                "limit": str(top_n),
            },
            timeout=20,
            retries=2,
        )
        if count_error:
            return {
                "error": f"openFDA event aggregation failed: {count_error}",
                "summary": f"FAERS scan failed for {drug_name}: {count_error}",
            }
        events_to_scan = [r.get("term", "") for r in data.get("results", []) if r.get("term")]
        if not events_to_scan:
            return {
                "drug_name": drug_name,
                "total_reports_for_drug": int(drug_total),
                "signals": [],
                "summary": f"FAERS reports found for '{drug_name}', but no reaction terms were returned",
            }

    signals = []
    for ev in events_to_scan:
        ev_term = _openfda_escape(ev)
        if not ev_term:
            continue

        event_search = f'patient.reaction.reactionmeddrapt.exact:"{ev_term}"'
        # Join with a real space: the documented "+AND+" form is the
        # URL-encoded spelling of " AND ". Sent through `params`, a literal
        # "+" would be percent-encoded as %2B and reach openFDA as a plus sign.
        # NOTE(review): assumes request_json URL-encodes `params` -- confirm.
        both_search = f"{drug_search} AND {event_search}"

        event_total, event_err = _openfda_total(event_search)
        both_total, both_err = _openfda_total(both_search)
        if event_err or both_err:
            signals.append({
                "event": ev,
                "error": event_err or both_err,
            })
            continue

        # 2x2 contingency cells derived from the four marginal counts.
        a = int(both_total)
        b = int(drug_total) - a
        c = int(event_total) - a
        d = max(int(all_total) - (a + b + c), 0)

        metrics = _faers_signal_metrics(a, b, c, d, min_case_count=min_case_count)
        signals.append({
            "event": ev,
            "a_drug_and_event": a,
            "b_drug_no_event": b,
            "c_no_drug_event": c,
            "d_no_drug_no_event": d,
            **metrics,
        })

    clean_signals = [s for s in signals if "error" not in s]
    failed_signals = [s for s in signals if "error" in s]
    clean_signals.sort(
        key=lambda x: (x.get("meets_signal_criteria", False), x.get("prr", 0.0), x.get("ror", 0.0)),
        reverse=True,
    )
    n_flagged = sum(1 for s in clean_signals if s.get("meets_signal_criteria"))
    error_count = len(failed_signals)

    if clean_signals:
        top = clean_signals[0]
        summary = (
            f"FAERS signal scan for {drug_name}: {len(clean_signals)} event(s) analyzed, "
            f"{n_flagged} flagged by PRR/ROR criteria. Top event: {top['event']} "
            f"(PRR={top['prr']}, ROR={top['ror']})."
        )
    else:
        summary = (
            f"FAERS signal scan for {drug_name}: no analyzable events returned"
            + (f" ({error_count} event lookup error(s))." if error_count else ".")
        )

    return {
        "summary": summary,
        "drug_name": drug_name,
        "event_filter": event_filter,
        "total_reports_all_faers": int(all_total),
        "total_reports_for_drug": int(drug_total),
        "criteria": {
            "min_case_count": min_case_count,
            "prr_threshold": 2.0,
            "chi_square_threshold": 4.0,
        },
        "n_events_analyzed": len(clean_signals),
        "n_events_flagged": n_flagged,
        "n_event_lookup_errors": error_count,
        "signals": clean_signals,
        "errors": failed_signals,
    }
|
|
650
|
+
|
|
651
|
+
|
|
652
|
+
@registry.register(
    name="safety.label_risk_extract",
    description="Extract boxed warnings, contraindications, and key risk sections from openFDA drug labels",
    category="safety",
    parameters={
        "drug_name": "Drug name (generic or brand)",
        "max_labels": "Maximum label records to inspect (default 3)",
        "section_max_chars": "Max characters per extracted section (default 500)",
    },
    usage_guide=(
        "Use for rapid regulatory risk triage. Pulls key safety sections from FDA labels "
        "(boxed warning, contraindications, warnings, interactions, special populations)."
    ),
)
def label_risk_extract(
    drug_name: str,
    max_labels: int = 3,
    section_max_chars: int = 500,
    **kwargs,
) -> dict:
    """Query the openFDA drug-label endpoint and summarize key risk sections.

    Args:
        drug_name: Name matched exactly against the label's generic, brand and
            substance name fields.
        max_labels: Number of label records to request; clamped to 1..10.
        section_max_chars: Per-section character budget; clamped to 120..4000.
            Longer sections are cut and suffixed with "...".
        **kwargs: Accepted and ignored.

    Returns:
        A dict that always carries a "summary". On success it also holds the
        overall "risk_level" (HIGH when any label has a boxed warning, MODERATE
        when any has contraindications or warnings, LOW otherwise), per-flag
        label counts and the per-label details under "labels". When the name is
        blank or the request fails, an "error" key is returned instead.
    """
    import re

    if not (drug_name and drug_name.strip()):
        return {"error": "drug_name is required", "summary": "No drug name provided"}

    # Clamp caller-supplied limits; falsy values fall back to the defaults.
    max_labels = min(max(int(max_labels or 3), 1), 10)
    section_max_chars = min(max(int(section_max_chars or 500), 120), 4000)
    drug_term = _openfda_escape(drug_name)

    # Same exact-match clause against each of the three openFDA name fields.
    search = "+OR+".join(
        f'openfda.{name_field}.exact:"{drug_term}"'
        for name_field in ("generic_name", "brand_name", "substance_name")
    )
    data, error = request_json(
        "GET",
        _OPENFDA_DRUG_LABEL_URL,
        params={"search": search, "limit": str(max_labels)},
        timeout=20,
        retries=2,
    )
    if error:
        return {"error": f"openFDA label query failed: {error}", "summary": f"Label risk extraction failed: {error}"}

    results = data.get("results", [])
    if not results:
        return {
            "drug_name": drug_name,
            "labels_found": 0,
            "risk_level": "UNKNOWN",
            "summary": f"No openFDA label records found for '{drug_name}'",
            "labels": [],
        }

    whitespace_run = re.compile(r"\s+")

    def _section_text(record: dict, field: str) -> str:
        """Flatten one label section to a single whitespace-normalized, truncated string."""
        raw = record.get(field, [])
        if isinstance(raw, str):
            joined = raw.strip()
        elif isinstance(raw, list):
            pieces = (str(item).strip() for item in raw)
            joined = " ".join(piece for piece in pieces if piece)
        else:
            joined = ""
        collapsed = whitespace_run.sub(" ", joined).strip()
        if len(collapsed) <= section_max_chars:
            return collapsed
        return collapsed[: section_max_chars - 3] + "..."

    def _leading_values(meta: dict, field: str, limit: int) -> str:
        """Comma-join the first `limit` entries of a list-valued openfda field ("" if not a list)."""
        values = meta.get(field)
        if not isinstance(values, list):
            return ""
        return ", ".join(values[:limit])

    section_fields = (
        "boxed_warning",
        "contraindications",
        "warnings_and_cautions",
        "warnings",
        "adverse_reactions",
        "drug_interactions",
        "use_in_specific_populations",
    )

    label_summaries = []
    for record in results:
        meta = record.get("openfda", {})
        brand = _leading_values(meta, "brand_name", 3)
        generic = _leading_values(meta, "generic_name", 3)
        application = _leading_values(meta, "application_number", 3)
        manufacturer = _leading_values(meta, "manufacturer_name", 2)

        sections = {field: _section_text(record, field) for field in section_fields}

        boxed_present = bool(sections["boxed_warning"])
        contra_present = bool(sections["contraindications"])
        warn_present = bool(sections["warnings"] or sections["warnings_and_cautions"])

        # A boxed warning dominates; any contraindication or warning text is MODERATE.
        if boxed_present:
            level = "HIGH"
        elif contra_present or warn_present:
            level = "MODERATE"
        else:
            level = "LOW"

        present_flags = [
            flag_name
            for flag_name, is_present in (
                ("boxed_warning", boxed_present),
                ("contraindications", contra_present),
                ("warnings", warn_present),
            )
            if is_present
        ]

        label_summaries.append({
            "brand_name": brand,
            "generic_name": generic,
            "application_number": application,
            "manufacturer": manufacturer,
            "risk_level": level,
            "risk_flags": present_flags,
            "sections": sections,
        })

    # Overall risk is the most severe level seen on any inspected label.
    severity = {"HIGH": 3, "MODERATE": 2, "LOW": 1}
    overall_risk = max(
        (item["risk_level"] for item in label_summaries),
        key=lambda lvl: severity.get(lvl, 0),
    )
    boxed_count = sum(
        1 for item in label_summaries if "boxed_warning" in item.get("risk_flags", [])
    )
    contra_count = sum(
        1 for item in label_summaries if "contraindications" in item.get("risk_flags", [])
    )

    summary = (
        f"Label risk extraction for {drug_name}: {len(label_summaries)} label record(s), "
        f"overall risk={overall_risk}. Boxed warning present in {boxed_count} label(s); "
        f"contraindications present in {contra_count} label(s)."
    )

    return {
        "summary": summary,
        "drug_name": drug_name,
        "labels_found": len(label_summaries),
        "risk_level": overall_risk,
        "n_boxed_warning_labels": boxed_count,
        "n_contraindication_labels": contra_count,
        "labels": label_summaries,
    }
|
|
787
|
+
|
|
788
|
+
|
|
789
|
+
@registry.register(
    name="safety.admet_predict",
    description="Predict ADMET properties for a compound from SMILES using RDKit descriptors and heuristic rules",
    category="safety",
    parameters={
        "smiles": "SMILES string for the compound to profile",
    },
    usage_guide="You need a comprehensive ADMET (absorption, distribution, metabolism, excretion, toxicity) profile for a compound. Use early in lead optimization to flag liabilities before synthesis. Covers Lipinski, Veber, Ghose, lead-likeness, oral absorption, BBB, hERG, CYP, and solubility.",
)
def admet_predict(smiles: str, **kwargs) -> dict:
    """Predict ADMET properties from SMILES using RDKit descriptors and heuristic rules.

    Computes physicochemical properties and applies established medicinal chemistry
    filters (Lipinski Ro5, Veber, Ghose, lead-likeness) plus heuristic predictions
    for oral absorption, BBB penetration, hERG risk, CYP liability, and solubility.

    Args:
        smiles: SMILES input; it is passed through ``_extract_smiles`` before parsing.
        **kwargs: Accepted and ignored.

    Returns:
        On success a dict with "summary" (multi-line text), "verdict"
        (FAVORABLE with no flags, ACCEPTABLE with one or two, UNFAVORABLE
        otherwise), "properties", "filters", "predictions" and "flags".
        If RDKit cannot be imported or the SMILES does not parse, a dict with
        "error" and "summary" is returned instead.
    """
    from ct.tools.chemistry import _extract_smiles
    smiles = _extract_smiles(smiles)

    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, Crippen, Lipinski, rdMolDescriptors
    except ImportError:
        return {"error": "RDKit is required for ADMET prediction. Install with: pip install rdkit", "summary": "RDKit is required for ADMET prediction. Install with: pip install rdkit"}
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {"error": f"Invalid SMILES: {smiles}", "summary": f"Could not parse SMILES: {smiles}"}

    # --- Physicochemical descriptors ---
    mw = Descriptors.MolWt(mol)
    logp = Crippen.MolLogP(mol)
    tpsa = Descriptors.TPSA(mol)
    hba = Lipinski.NumHAcceptors(mol)
    hbd = Lipinski.NumHDonors(mol)
    rotatable_bonds = Descriptors.NumRotatableBonds(mol)
    aromatic_rings = Descriptors.NumAromaticRings(mol)
    num_rings = Descriptors.RingCount(mol)
    heavy_atoms = mol.GetNumHeavyAtoms()
    formula = rdMolDescriptors.CalcMolFormula(mol)
    # Computed once; used only by the Ghose filter below.
    molar_refractivity = Descriptors.MolMR(mol)
    # The Ghose atom-count criterion counts every atom, hydrogens included,
    # so count on an H-added copy rather than using the heavy-atom total.
    total_atoms = Chem.AddHs(mol).GetNumAtoms()

    properties = {
        "smiles": smiles,
        "formula": formula,
        "molecular_weight": round(mw, 2),
        "logp": round(logp, 2),
        "tpsa": round(tpsa, 2),
        "hba": hba,
        "hbd": hbd,
        "rotatable_bonds": rotatable_bonds,
        "aromatic_rings": aromatic_rings,
        "num_rings": num_rings,
        "heavy_atoms": heavy_atoms,
    }

    # --- Drug-likeness filters ---
    filters = {}

    # Lipinski Rule of Five (one violation is tolerated)
    lipinski_violations = sum([
        mw > 500,
        logp > 5,
        hbd > 5,
        hba > 10,
    ])
    filters["lipinski_ro5"] = {
        "pass": lipinski_violations <= 1,
        "violations": lipinski_violations,
        "details": {
            "MW<=500": mw <= 500,
            "LogP<=5": logp <= 5,
            "HBD<=5": hbd <= 5,
            "HBA<=10": hba <= 10,
        },
    }

    # Veber rule (oral bioavailability)
    veber_pass = tpsa <= 140 and rotatable_bonds <= 10
    filters["veber"] = {
        "pass": veber_pass,
        "details": {
            "TPSA<=140": tpsa <= 140,
            "RotBonds<=10": rotatable_bonds <= 10,
        },
    }

    # Lead-likeness (Teague/Oprea)
    lead_like = mw <= 350 and logp <= 3.5 and rotatable_bonds <= 7
    filters["lead_likeness"] = {
        "pass": lead_like,
        "details": {
            "MW<=350": mw <= 350,
            "LogP<=3.5": logp <= 3.5,
            "RotBonds<=7": rotatable_bonds <= 7,
        },
    }

    # Ghose filter: MW 160-480, LogP -0.4-5.6, total atoms 20-70, MR 40-130.
    # Every criterion is listed in the details so that "pass" is always
    # explainable from them.
    ghose_details = {
        "160<=MW<=480": 160 <= mw <= 480,
        "-0.4<=LogP<=5.6": -0.4 <= logp <= 5.6,
        "20<=Atoms<=70": 20 <= total_atoms <= 70,
        "40<=MR<=130": 40 <= molar_refractivity <= 130,
    }
    ghose_pass = all(ghose_details.values())
    filters["ghose"] = {
        "pass": ghose_pass,
        "details": ghose_details,
    }

    # --- ADMET predictions (heuristic) ---
    predictions = {}

    # Oral absorption
    oral_absorption = tpsa < 140 and rotatable_bonds <= 10
    # Start at 100 and penalize TPSA above 60 and rotatable bonds above 5.
    oral_score = max(0, 100 - (max(0, tpsa - 60) * 0.8) - (max(0, rotatable_bonds - 5) * 5))
    predictions["oral_absorption"] = {
        "prediction": "likely" if oral_absorption else "poor",
        "score": round(min(100, oral_score), 1),
        "rationale": f"TPSA={tpsa:.0f} ({'<' if tpsa < 140 else '>='} 140), "
                     f"RotBonds={rotatable_bonds} ({'<=' if rotatable_bonds <= 10 else '>'} 10)",
    }

    # BBB penetration
    bbb = tpsa < 90 and mw < 450 and 1 <= logp <= 3
    # Penalize TPSA above 40, MW above 300 and distance of LogP from 2.
    bbb_score = max(0, 100 - max(0, tpsa - 40) * 1.2 - max(0, mw - 300) * 0.3 - abs(logp - 2) * 15)
    predictions["bbb_penetration"] = {
        "prediction": "likely" if bbb else "unlikely",
        "score": round(min(100, bbb_score), 1),
        "rationale": f"TPSA={tpsa:.0f} ({'<' if tpsa < 90 else '>='} 90), "
                     f"MW={mw:.0f} ({'<' if mw < 450 else '>='} 450), "
                     f"LogP={logp:.1f} ({'in' if 1 <= logp <= 3 else 'outside'} 1-3)",
    }

    # hERG risk (rough heuristic)
    herg_risk = logp > 3.7 and mw > 400
    herg_concern = "elevated" if herg_risk else "low"
    predictions["herg_risk"] = {
        "prediction": herg_concern,
        "flag": herg_risk,
        "rationale": f"LogP={logp:.1f} ({'>' if logp > 3.7 else '<='} 3.7), "
                     f"MW={mw:.0f} ({'>' if mw > 400 else '<='} 400). "
                     "Lipophilic, large molecules more likely to block hERG channel.",
    }

    # CYP liability: one point each for >=3 aromatic rings, LogP > 3, MW > 500
    cyp_risk_factors = 0
    cyp_details = []
    if aromatic_rings >= 3:
        cyp_risk_factors += 1
        cyp_details.append(f"{aromatic_rings} aromatic rings (>=3)")
    if logp > 3:
        cyp_risk_factors += 1
        cyp_details.append(f"LogP={logp:.1f} (>3)")
    if mw > 500:
        cyp_risk_factors += 1
        cyp_details.append(f"MW={mw:.0f} (>500)")

    cyp_level = "high" if cyp_risk_factors >= 2 else "moderate" if cyp_risk_factors == 1 else "low"
    predictions["cyp_liability"] = {
        "prediction": cyp_level,
        "risk_factors": cyp_risk_factors,
        "details": cyp_details if cyp_details else ["No major CYP liability flags"],
    }

    # Solubility class (simplified Yalkowsky-based heuristic: logS ~ 0.5 - 0.01*(MP) - logP)
    # Without melting point, use MW as rough proxy: logS ~ 0.5 - 0.01*MW - logP
    log_s_est = 0.5 - 0.01 * mw - logp
    if log_s_est > -1:
        sol_class = "highly soluble"
    elif log_s_est > -3:
        sol_class = "soluble"
    elif log_s_est > -5:
        sol_class = "moderately soluble"
    elif log_s_est > -7:
        sol_class = "poorly soluble"
    else:
        sol_class = "insoluble"

    predictions["solubility"] = {
        "class": sol_class,
        "estimated_logS": round(log_s_est, 2),
        "rationale": f"Estimated logS={log_s_est:.2f} (Yalkowsky-type heuristic from MW and LogP)",
    }

    # --- Overall ADMET verdict ---
    # Ghose and lead-likeness are reported but do not contribute flags.
    flags = []
    if not filters["lipinski_ro5"]["pass"]:
        flags.append(f"Lipinski: {lipinski_violations} violations")
    if not veber_pass:
        flags.append("Fails Veber (oral bioavailability concern)")
    if herg_risk:
        flags.append("Elevated hERG risk")
    if cyp_level == "high":
        flags.append("High CYP liability")
    if sol_class in ("poorly soluble", "insoluble"):
        flags.append(f"Solubility: {sol_class}")

    if not flags:
        verdict = "FAVORABLE"
    elif len(flags) <= 2:
        verdict = "ACCEPTABLE"
    else:
        verdict = "UNFAVORABLE"

    summary_parts = [
        f"ADMET profile for {formula} (MW={mw:.0f}, LogP={logp:.1f}): {verdict}",
        f"Lipinski: {'PASS' if filters['lipinski_ro5']['pass'] else 'FAIL'} ({lipinski_violations} violations)",
        f"Oral absorption: {predictions['oral_absorption']['prediction']} (score {predictions['oral_absorption']['score']})",
        f"BBB: {predictions['bbb_penetration']['prediction']} (score {predictions['bbb_penetration']['score']})",
        f"hERG: {predictions['herg_risk']['prediction']}, CYP: {predictions['cyp_liability']['prediction']}",
        f"Solubility: {predictions['solubility']['class']} (logS~{log_s_est:.1f})",
    ]
    if flags:
        summary_parts.append(f"Flags: {'; '.join(flags)}")

    return {
        "summary": "\n".join(summary_parts),
        "verdict": verdict,
        "properties": properties,
        "filters": filters,
        "predictions": predictions,
        "flags": flags,
    }
|
|
1015
|
+
|
|
1016
|
+
|
|
1017
|
+
@registry.register(
    name="safety.ddi_predict",
    description="Predict drug-drug interaction potential based on CYP metabolism profile and molecular features",
    category="safety",
    parameters={
        "smiles": "SMILES string for the primary compound",
        "comedication_smiles": "SMILES string for a co-administered drug (optional)",
    },
    usage_guide="You need to assess drug-drug interaction risk for a compound, especially CYP-mediated interactions. Use when evaluating combination therapies or compounds likely to be co-prescribed. Identifies CYP inhibition/induction risk from structural features.",
)
def ddi_predict(smiles: str, comedication_smiles: str = None, **kwargs) -> dict:
    """Predict drug-drug interaction potential based on CYP metabolism profile.

    Uses structural features to estimate CYP inhibition risk for major isoforms
    (3A4, 2D6, 2C9, 2C19, 1A2). Optionally compares with a co-medication.

    Args:
        smiles: SMILES input for the primary compound; it is passed through
            ``_extract_smiles`` before parsing.
        comedication_smiles: Optional SMILES of a co-administered drug, parsed
            as given.
        **kwargs: Accepted and ignored.

    Returns:
        On success a dict with "summary", "overall_risk" (HIGH, MODERATE,
        LOW-MODERATE or LOW), "cyp_profile", "motif_flags" and
        "mechanism_based_inhibition"; "comedication_analysis" is added when a
        co-medication was supplied (it holds an "error" key if that SMILES does
        not parse). If RDKit cannot be imported or the primary SMILES does not
        parse, a dict with "error" and "summary" is returned instead.
    """
    from ct.tools.chemistry import _extract_smiles
    smiles = _extract_smiles(smiles)

    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, Crippen
    except ImportError:
        return {"error": "RDKit is required for DDI prediction. Install with: pip install rdkit", "summary": "RDKit is required for DDI prediction. Install with: pip install rdkit"}
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {"error": f"Invalid SMILES: {smiles}", "summary": f"Could not parse SMILES: {smiles}"}

    def _count_matches(target, pattern) -> int:
        """Number of substructure matches; 0 when the SMARTS failed to compile."""
        return len(target.GetSubstructMatches(pattern)) if pattern is not None else 0

    def _has_any(target, smarts_list) -> bool:
        """True when at least one SMARTS in the list compiles and matches the target."""
        for smarts in smarts_list:
            pattern = Chem.MolFromSmarts(smarts)
            if pattern is not None and target.GetSubstructMatches(pattern):
                return True
        return False

    mw = Descriptors.MolWt(mol)
    logp = Crippen.MolLogP(mol)
    aromatic_rings = Descriptors.NumAromaticRings(mol)
    num_rings = Descriptors.RingCount(mol)

    # Patterns shared by the primary compound and the co-medication.
    ring_nitrogen = Chem.MolFromSmarts("[nR]")  # ring nitrogen
    basic_amine = Chem.MolFromSmarts("[NX3;!$(NC=O);!$(NS=O)]")

    # --- Detect structural motifs associated with CYP inhibition ---
    motif_flags = {}

    # Nitrogen heterocycles (CYP3A4 inhibition); this is a count of matching
    # ring nitrogen atoms, not of distinct rings.
    n_heterocycles = _count_matches(mol, ring_nitrogen)
    motif_flags["nitrogen_heterocycles"] = n_heterocycles

    # Imidazole motif (strong CYP inhibition — azole antifungals)
    # Multiple SMARTS to catch both NH and N-substituted forms
    has_imidazole = _has_any(mol, [
        "c1cnc[nH]1",  # unsubstituted
        "c1nccn1",     # N-substituted imidazole
        "c1cncn1",     # alternative numbering
    ])

    # Triazole motif. The 1,2,3-isomer must be written with two ring carbons;
    # a one-carbon/four-nitrogen ring is a tetrazole and is deliberately not
    # treated as an azole CYP-inhibitor motif here.
    has_triazole = _has_any(mol, [
        "c1nncn1",  # 1,2,4-triazole
        "c1cnnn1",  # 1,2,3-triazole
    ])
    motif_flags["has_imidazole"] = has_imidazole
    motif_flags["has_triazole"] = has_triazole
    motif_flags["has_azole"] = has_imidazole or has_triazole

    # Furanyl groups (mechanism-based CYP inhibition)
    has_furan = _has_any(mol, ["c1ccoc1"])
    motif_flags["has_furan"] = has_furan

    # Amine groups (CYP2D6 substrates/inhibitors)
    n_basic_amines = _count_matches(mol, basic_amine)
    motif_flags["basic_amines"] = n_basic_amines

    # --- CYP isoform risk assessment ---
    cyp_profile = {}

    # CYP3A4 — the major drug-metabolizing enzyme
    cyp3a4_score = 0
    cyp3a4_reasons = []
    if has_imidazole or has_triazole:
        cyp3a4_score += 3
        cyp3a4_reasons.append("Azole motif (strong CYP3A4 inhibition)")
    if n_heterocycles >= 2:
        cyp3a4_score += 1
        cyp3a4_reasons.append(f"{n_heterocycles} nitrogen heterocycles")
    if mw > 400 and logp > 3:
        cyp3a4_score += 1
        cyp3a4_reasons.append(f"Large lipophilic molecule (MW={mw:.0f}, LogP={logp:.1f})")
    cyp_profile["CYP3A4"] = {
        "inhibition_risk": "high" if cyp3a4_score >= 3 else "moderate" if cyp3a4_score >= 1 else "low",
        "score": cyp3a4_score,
        "reasons": cyp3a4_reasons if cyp3a4_reasons else ["No major CYP3A4 inhibition flags"],
    }

    # CYP2D6
    cyp2d6_score = 0
    cyp2d6_reasons = []
    if n_basic_amines >= 1:
        cyp2d6_score += 1
        cyp2d6_reasons.append(f"{n_basic_amines} basic amine(s) — CYP2D6 substrate/inhibitor feature")
    if aromatic_rings >= 2 and n_basic_amines >= 1:
        cyp2d6_score += 1
        cyp2d6_reasons.append("Lipophilic amine — classic CYP2D6 inhibitor pharmacophore")
    cyp_profile["CYP2D6"] = {
        "inhibition_risk": "high" if cyp2d6_score >= 2 else "moderate" if cyp2d6_score >= 1 else "low",
        "score": cyp2d6_score,
        "reasons": cyp2d6_reasons if cyp2d6_reasons else ["No major CYP2D6 inhibition flags"],
    }

    # CYP2C9
    cyp2c9_score = 0
    cyp2c9_reasons = []
    if logp > 3 and aromatic_rings >= 2:
        cyp2c9_score += 1
        cyp2c9_reasons.append("Lipophilic aromatic compound")
    # Acidic groups — CYP2C9 substrates tend to be weak acids
    has_acid = _has_any(mol, ["[CX3](=O)[OX2H1]"])
    if has_acid:
        cyp2c9_score += 1
        cyp2c9_reasons.append("Carboxylic acid group — CYP2C9 substrate feature")
    cyp_profile["CYP2C9"] = {
        "inhibition_risk": "moderate" if cyp2c9_score >= 1 else "low",
        "score": cyp2c9_score,
        "reasons": cyp2c9_reasons if cyp2c9_reasons else ["No major CYP2C9 inhibition flags"],
    }

    # CYP2C19
    cyp2c19_score = 0
    cyp2c19_reasons = []
    if has_imidazole:
        cyp2c19_score += 2
        cyp2c19_reasons.append("Imidazole motif (CYP2C19 inhibitor feature)")
    if n_heterocycles >= 2 and mw < 500:
        cyp2c19_score += 1
        cyp2c19_reasons.append("Multiple nitrogen heterocycles")
    cyp_profile["CYP2C19"] = {
        "inhibition_risk": "high" if cyp2c19_score >= 2 else "moderate" if cyp2c19_score >= 1 else "low",
        "score": cyp2c19_score,
        "reasons": cyp2c19_reasons if cyp2c19_reasons else ["No major CYP2C19 inhibition flags"],
    }

    # CYP1A2
    cyp1a2_score = 0
    cyp1a2_reasons = []
    if aromatic_rings >= 3:
        cyp1a2_score += 1
        cyp1a2_reasons.append(f"{aromatic_rings} aromatic rings — planar aromatic CYP1A2 substrate")
    # Fused ring systems
    if num_rings >= 3 and aromatic_rings >= 2:
        cyp1a2_score += 1
        cyp1a2_reasons.append("Polycyclic aromatic system")
    cyp_profile["CYP1A2"] = {
        "inhibition_risk": "moderate" if cyp1a2_score >= 1 else "low",
        "score": cyp1a2_score,
        "reasons": cyp1a2_reasons if cyp1a2_reasons else ["No major CYP1A2 inhibition flags"],
    }

    # --- Mechanism-based inhibition (MBI) risk ---
    mbi_risk = False
    mbi_reasons = []
    if has_furan:
        mbi_risk = True
        mbi_reasons.append("Furan ring — known MBI risk (bioactivated to reactive epoxide)")
    # Terminal alkyne
    if _has_any(mol, ["[CX2]#[CX2H1]"]):
        mbi_risk = True
        mbi_reasons.append("Terminal alkyne — potential MBI via ketene intermediate")
    # Methylenedioxy
    if _has_any(mol, ["c1cc2OCOc2cc1"]):
        mbi_risk = True
        mbi_reasons.append("Methylenedioxy group — known CYP MBI risk (carbene formation)")

    # --- Overall DDI risk ---
    high_risk_cyps = [k for k, v in cyp_profile.items() if v["inhibition_risk"] == "high"]
    moderate_risk_cyps = [k for k, v in cyp_profile.items() if v["inhibition_risk"] == "moderate"]

    if high_risk_cyps or mbi_risk:
        overall_risk = "HIGH"
    elif len(moderate_risk_cyps) >= 2:
        overall_risk = "MODERATE"
    elif moderate_risk_cyps:
        overall_risk = "LOW-MODERATE"
    else:
        overall_risk = "LOW"

    # --- Co-medication analysis ---
    comedication_analysis = None
    if comedication_smiles:
        comol = Chem.MolFromSmiles(comedication_smiles)
        if comol is not None:
            co_mw = Descriptors.MolWt(comol)
            co_logp = Crippen.MolLogP(comol)

            # Check if comedication shares metabolic pathway features
            co_n_heterocycles = _count_matches(comol, ring_nitrogen)
            co_amines = _count_matches(comol, basic_amine)

            shared_pathways = []
            if (n_heterocycles >= 2 or has_imidazole) and co_n_heterocycles >= 2:
                shared_pathways.append("CYP3A4 (both contain N-heterocycles)")
            if n_basic_amines >= 1 and co_amines >= 1:
                shared_pathways.append("CYP2D6 (both contain basic amines)")
            if logp > 3 and co_logp > 3:
                shared_pathways.append("General CYP competition (both lipophilic)")

            interaction_risk = "high" if shared_pathways else "low"

            comedication_analysis = {
                "comedication_smiles": comedication_smiles,
                "comedication_mw": round(co_mw, 1),
                "comedication_logp": round(co_logp, 2),
                "shared_metabolic_pathways": shared_pathways,
                "interaction_risk": interaction_risk,
                "recommendation": (
                    f"Monitor for interactions via {', '.join(shared_pathways)}"
                    if shared_pathways
                    else "Low structural overlap in CYP-relevant features"
                ),
            }
        else:
            comedication_analysis = {"error": f"Invalid co-medication SMILES: {comedication_smiles}"}

    # --- Summary ---
    summary_lines = [
        f"DDI risk assessment: {overall_risk}",
    ]
    if high_risk_cyps:
        summary_lines.append(f"High CYP inhibition risk: {', '.join(high_risk_cyps)}")
    if moderate_risk_cyps:
        summary_lines.append(f"Moderate CYP inhibition risk: {', '.join(moderate_risk_cyps)}")
    if mbi_risk:
        summary_lines.append(f"Mechanism-based inhibition risk: {'; '.join(mbi_reasons)}")
    if motif_flags["has_azole"]:
        summary_lines.append("Contains azole motif — strong CYP inhibitor pharmacophore")
    # Only a successfully parsed co-medication carries the pathway list;
    # the error form of the analysis adds no summary line.
    if comedication_analysis and "shared_metabolic_pathways" in comedication_analysis:
        if comedication_analysis["shared_metabolic_pathways"]:
            summary_lines.append(f"Co-medication interaction via: {', '.join(comedication_analysis['shared_metabolic_pathways'])}")
        else:
            summary_lines.append("Low metabolic pathway overlap with co-medication")

    result = {
        "summary": "\n".join(summary_lines),
        "overall_risk": overall_risk,
        "cyp_profile": cyp_profile,
        "motif_flags": motif_flags,
        "mechanism_based_inhibition": {
            "risk": mbi_risk,
            "reasons": mbi_reasons if mbi_reasons else ["No MBI structural alerts"],
        },
    }

    if comedication_analysis:
        result["comedication_analysis"] = comedication_analysis

    return result
|