celltype-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/tools/clinical.py
ADDED
|
@@ -0,0 +1,1153 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Clinical translation tools: indication mapping, patient population sizing, TCGA stratification.
|
|
3
|
+
|
|
4
|
+
References crews-glue-discovery/scripts/patient_population_sizing.py and tcga_stratification.py
|
|
5
|
+
for data sources and scoring logic.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import numpy as np
|
|
10
|
+
import re
|
|
11
|
+
from ct.tools import registry
|
|
12
|
+
from ct.tools.http_client import request, request_json
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Approximate number of new US cases per year for each cancer type
# (SEER/Globocan estimates). The keys are the values used by the
# "incidence_key" field of LINEAGE_TO_CANCER.
US_INCIDENCE = {
    "Lung": 238_000,
    "Breast": 310_000,
    "Colorectal": 153_000,
    "Prostate": 288_000,
    "Lymphoma (NHL)": 80_000,
    "AML": 20_000,
    "ALL": 6_000,
    "Multiple Myeloma": 35_000,
    "Kidney": 82_000,
    "Liver": 42_000,
    "Ovarian": 20_000,
    "Pancreatic": 64_000,
    "Melanoma": 100_000,
    "Bladder": 83_000,
    "Thyroid": 44_000,
    "Glioma/Brain": 25_000,
    "Cervical": 14_000,
    "Endometrial/Uterine": 66_000,
    "Head & Neck": 66_000,
    "Gastric/Esophageal": 49_000,
    "Sarcoma": 14_000,
    "Neuroblastoma": 800,
    "Mesothelioma": 3_000,
}
|
|
41
|
+
|
|
42
|
+
# PRISM lineage -> clinical indication metadata.
# Each row below is:
#   (lineage, incidence_key, fraction, five_yr_survival, unmet_need, name)
# where incidence_key is a key of US_INCIDENCE and fraction is the share of
# that incidence attributed to the named indication.
LINEAGE_TO_CANCER = {
    lineage: {
        "incidence_key": incidence_key,
        "fraction": fraction,
        "five_yr_survival": five_yr_survival,
        "unmet_need": unmet_need,
        "name": name,
    }
    for lineage, incidence_key, fraction, five_yr_survival, unmet_need, name in (
        ("Lung", "Lung", 0.85, 0.25, True, "Non-Small Cell Lung Cancer"),
        ("CNS/Brain", "Glioma/Brain", 0.60, 0.05, True, "Diffuse Glioma"),
        ("Skin", "Melanoma", 1.0, 0.93, False, "Melanoma"),
        ("Lymphoid", "Lymphoma (NHL)", 0.85, 0.73, False, "B-Cell Lymphoma"),
        ("Head and Neck", "Head & Neck", 0.90, 0.66, False, "Head & Neck SCC"),
        ("Bowel", "Colorectal", 0.95, 0.65, False, "Colorectal Cancer"),
        ("Ovary/Fallopian Tube", "Ovarian", 0.90, 0.50, True, "Ovarian Cancer"),
        ("Pancreas", "Pancreatic", 0.85, 0.12, True, "Pancreatic Cancer"),
        ("Breast", "Breast", 0.95, 0.90, False, "Breast Cancer"),
        ("Prostate", "Prostate", 0.95, 0.97, False, "Prostate Cancer"),
        ("Myeloid", "AML", 1.0, 0.30, True, "Acute Myeloid Leukemia"),
        ("Liver", "Liver", 0.80, 0.20, True, "Hepatocellular Carcinoma"),
        ("Kidney", "Kidney", 0.85, 0.77, False, "Renal Cell Carcinoma"),
        ("Bladder/Urinary Tract", "Bladder", 0.90, 0.77, False, "Bladder Cancer"),
        ("Stomach", "Gastric/Esophageal", 0.65, 0.22, True, "Gastric/Esophageal Cancer"),
        ("Uterus", "Endometrial/Uterine", 0.90, 0.81, False, "Endometrial Cancer"),
        ("Cervix", "Cervical", 0.70, 0.66, True, "Cervical Cancer"),
        ("Bone", "Sarcoma", 0.11, 0.60, True, "Bone Sarcoma"),
        ("Soft Tissue", "Sarcoma", 0.05, 0.63, True, "Soft Tissue Sarcoma"),
        ("Pleura", "Mesothelioma", 0.80, 0.12, True, "Mesothelioma"),
    )
}
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@registry.register(
    name="clinical.indication_map",
    description="Map compound sensitivity profiles to cancer indications with response rates",
    category="clinical",
    parameters={
        "compound_id": "Compound ID to map (or 'all')",
        "min_response_rate": "Minimum response rate to include (default 0.1)",
    },
    requires_data=["prism", "depmap_model"],
    usage_guide="You want to know which cancer types a compound is active against. Maps PRISM cell line sensitivity to clinical cancer indications. Use for indication selection and clinical positioning.",
)
def indication_map(compound_id: str = "all", min_response_rate: float = 0.1, **kwargs) -> dict:
    """Translate PRISM sensitivity of one or all compounds into cancer indications.

    For every compound, only the rows measured at its highest dose are kept.
    Those rows are grouped by the cell line's Oncotree lineage, and each
    lineage with at least three cell lines gets a response rate: the fraction
    of its cell lines whose LFC is below -0.5.

    Args:
        compound_id: A PRISM ``pert_name`` value, or ``"all"`` to process every
            compound present in the PRISM table.
        min_response_rate: Lineages whose response rate is below this value
            are left out of the result.
        **kwargs: Accepted and ignored.

    Returns:
        A dict with ``summary`` (text), ``n_indications`` (int) and
        ``indications`` (list of per compound/lineage records, sorted by
        descending response rate).
    """
    from ct.data.loaders import load_prism, load_model_metadata

    prism = load_prism()
    model = load_model_metadata()

    # Cell line name -> lineage lookup; rows with a missing name or lineage
    # are ignored, and later rows win when a name repeats.
    lineage_by_cell_line = {}
    for _, meta_row in model.iterrows():
        cell_line = meta_row.get("CCLEName", "")
        lineage_label = meta_row.get("OncotreeLineage", "Unknown")
        if pd.isna(cell_line) or pd.isna(lineage_label):
            continue
        lineage_by_cell_line[cell_line] = lineage_label

    single_compound = compound_id != "all"
    if single_compound:
        compounds = [compound_id]
    else:
        compounds = prism["pert_name"].unique().tolist()

    records = []
    for compound in compounds:
        rows = prism[prism["pert_name"] == compound]
        if len(rows) < 1:
            continue

        # Restrict to the highest tested dose of this compound.
        top_dose = rows["pert_dose"].max()
        at_top_dose = rows.loc[rows["pert_dose"] == top_dose].copy()
        at_top_dose["lineage"] = at_top_dose["ccle_name"].map(lineage_by_cell_line)

        for lineage, members in at_top_dose.groupby("lineage"):
            n_cells = len(members)
            if lineage == "Unknown" or n_cells < 3:
                continue

            lfc_values = members["LFC"]
            n_sensitive = (lfc_values < -0.5).sum()
            response_rate = n_sensitive / n_cells
            if response_rate < min_response_rate:
                continue

            # Lineages without a clinical mapping fall back to the lineage name.
            cancer_info = LINEAGE_TO_CANCER.get(lineage, {})
            records.append({
                "compound": compound,
                "lineage": lineage,
                "cancer_type": cancer_info.get("name", lineage),
                "n_cell_lines": n_cells,
                "n_sensitive": int(n_sensitive),
                "response_rate": round(response_rate, 3),
                "mean_lfc": round(float(lfc_values.mean()), 3),
                "unmet_need": cancer_info.get("unmet_need"),
                "five_yr_survival": cancer_info.get("five_yr_survival"),
            })

    if not records:
        return {
            "summary": f"No indications found for {compound_id} (compound may not be in PRISM data or no lineages met criteria)",
            "n_indications": 0,
            "indications": [],
        }

    table = pd.DataFrame(records).sort_values("response_rate", ascending=False)

    if single_compound:
        best = table.head(5)
        top_names = ", ".join(best["cancer_type"].tolist()) if len(best) > 0 else "none"
        summary = f"Indication mapping for {compound_id}: {len(table)} indications (top: {top_names})"
    else:
        summary = f"Mapped {len(compounds)} compounds across {table['cancer_type'].nunique()} indications"

    return {
        "summary": summary,
        "n_indications": len(table),
        "indications": table.to_dict("records"),
    }
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
@registry.register(
    name="clinical.population_size",
    description="Estimate addressable patient population per compound and indication using SEER incidence data",
    category="clinical",
    parameters={
        "compound_id": "Compound ID to size (or 'all')",
        "clinical_adjustment": "Clinical reality factor (default 0.10 = 10% of cell-line estimate)",
    },
    requires_data=["prism", "depmap_model"],
    usage_guide="You want to estimate how many patients could benefit from a compound — combines PRISM response rates with US cancer incidence data. Use for market sizing and clinical development prioritization.",
)
def population_size(compound_id: str = "all", clinical_adjustment: float = 0.10, **kwargs) -> dict:
    """Estimate addressable patient populations per compound and indication.

    For each indication returned by ``indication_map`` (called with a 0.05
    minimum response rate)::

        base_population   = int(annual_incidence * subtype_fraction)
        addressable       = int(base_population * cell_line_response_rate)
        clinical_adjusted = int(addressable * clinical_adjustment)

    Indications whose lineage has no entry in ``LINEAGE_TO_CANCER``, or whose
    incidence key is absent from ``US_INCIDENCE``, are skipped.

    Args:
        compound_id: Compound to size, or ``"all"``.
        clinical_adjustment: Multiplier applied to the cell-line based
            estimate to obtain the clinical estimate.
        **kwargs: Accepted and ignored.

    Returns:
        A dict with ``summary``, ``clinical_adjustment``, ``per_indication``
        (records sorted by descending addressable patients) and
        ``per_compound`` (totals keyed by compound). If ``indication_map``
        returns a dict containing ``"error"``, that dict is returned as-is.
    """
    ind_result = indication_map(compound_id=compound_id, min_response_rate=0.05)
    if "error" in ind_result:
        return ind_result

    results = []
    for ind in ind_result["indications"]:
        cancer_info = LINEAGE_TO_CANCER.get(ind["lineage"])
        if not cancer_info:
            continue

        incidence_key = cancer_info["incidence_key"]
        if incidence_key not in US_INCIDENCE:
            continue

        annual_incidence = US_INCIDENCE[incidence_key]
        subtype_fraction = cancer_info["fraction"]
        base_population = int(annual_incidence * subtype_fraction)
        addressable = int(base_population * ind["response_rate"])
        clinical_est = int(addressable * clinical_adjustment)

        results.append({
            "compound": ind["compound"],
            "cancer_type": ind["cancer_type"],
            "annual_us_incidence": annual_incidence,
            "subtype_fraction": subtype_fraction,
            "base_population": base_population,
            "response_rate": ind["response_rate"],
            "addressable_patients": addressable,
            "clinical_adjusted": clinical_est,
            "mean_lfc": ind["mean_lfc"],
            "n_cell_lines": ind["n_cell_lines"],
            "unmet_need": ind.get("unmet_need"),
            "five_yr_survival": ind.get("five_yr_survival"),
        })

    if not results:
        return {
            "summary": f"No addressable populations identified for {compound_id} (compound may not be in PRISM data)",
            "clinical_adjustment": clinical_adjustment,
            "per_indication": [],
            "per_compound": {},
        }

    # From here on `results` is non-empty, so the table always has rows.
    df = pd.DataFrame(results).sort_values("addressable_patients", ascending=False)

    cpd_totals = df.groupby("compound").agg(
        total_addressable=("addressable_patients", "sum"),
        total_clinical=("clinical_adjusted", "sum"),
        n_indications=("cancer_type", "nunique"),
    ).sort_values("total_addressable", ascending=False)

    if len(cpd_totals) > 0:
        top_cpd = cpd_totals.index[0]
        top_row = cpd_totals.iloc[0]
        total = int(top_row["total_addressable"])
        # Report the same aggregated figure that is returned in
        # per_compound[...]["total_clinical"], rather than re-deriving it from
        # the addressable total (which can differ because each indication is
        # truncated to an int before summing).
        total_clinical = int(top_row["total_clinical"])
    else:
        # Defensive only: groupby drops missing compound keys.
        top_cpd, total, total_clinical = "N/A", 0, 0

    summary = (
        f"Patient population sizing ({clinical_adjustment:.0%} clinical adjustment):\n"
        f"Top compound: {top_cpd} ({total:,} addressable, "
        f"{total_clinical:,} clinical estimate)"
    )

    return {
        "summary": summary,
        "clinical_adjustment": clinical_adjustment,
        "per_indication": df.to_dict("records"),
        "per_compound": cpd_totals.to_dict("index"),
    }
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
@registry.register(
    name="clinical.tcga_stratify",
    description="Stratify patients by target expression using TCGA data from Human Protein Atlas",
    category="clinical",
    parameters={
        "gene": "Gene symbol to query (e.g. CDC25C, GATA2)",
    },
    usage_guide="You want to check if a target gene is expressed in patient tumors — queries TCGA expression data from Human Protein Atlas. Use for clinical biomarker validation and patient stratification strategy.",
)
def tcga_stratify(gene: str, **kwargs) -> dict:
    """Query the Human Protein Atlas for a gene's expression across cancer types.

    The gene symbol is resolved to an Ensembl gene ID (a small built-in table
    first, then the Ensembl REST xrefs endpoint), and the HPA per-gene JSON
    document is fetched. From it the function extracts prognostic
    associations, specificity annotations and per-cancer RNA expression.
    Each expression entry gets ``expr_score = log2(fpkm + 1)`` and a level
    (HIGH >= 10, MEDIUM >= 3, LOW >= 1, otherwise VERY_LOW).

    No compound/PRISM convergence score is computed here.

    Args:
        gene: Gene symbol. Surrounding whitespace is ignored.
        **kwargs: Accepted and ignored.

    Returns:
        On success a dict with ``summary``, ``gene``, ``gene_info``,
        ``cancer_expression`` (sorted by descending FPKM) and ``prognostics``.
        On any failure a dict with ``error`` and ``summary`` keys.
    """
    import math
    import re
    from urllib.parse import quote

    try:
        import httpx  # noqa: F401 -- availability gate only
    except ImportError:
        return {"error": "httpx required for TCGA queries (pip install httpx)", "summary": "httpx required for TCGA queries (pip install httpx)"}

    # Reject None/blank input up front instead of failing on gene.upper().
    gene = str(gene or "").strip()
    if not gene:
        return {"error": "gene is required", "summary": "No gene symbol provided"}

    # Fast-path cache for common targets (avoids API call)
    _GENE_ENSEMBL_CACHE = {
        "CDC25C": "ENSG00000158402", "GATA2": "ENSG00000179348",
        "RBCK1": "ENSG00000125826", "ZNF687": "ENSG00000143373",
        "BCOR": "ENSG00000183337", "CEP57": "ENSG00000166037",
        "BTBD1": "ENSG00000084693", "FLCN": "ENSG00000154803",
        "LYZ": "ENSG00000090382", "CRBN": "ENSG00000113851",
        "PDCD2": "ENSG00000126249",
    }

    ensembl_id = _GENE_ENSEMBL_CACHE.get(gene.upper())
    if not ensembl_id:
        # Look up via Ensembl REST API. The symbol is caller-supplied, so it
        # is percent-encoded before being placed in the URL path.
        encoded_symbol = quote(gene, safe="")
        try:
            xref_url = f"https://rest.ensembl.org/xrefs/symbol/homo_sapiens/{encoded_symbol}"
            xref_data, xref_error = request_json(
                "GET",
                xref_url,
                timeout=15,
                retries=2,
                headers={
                    "Content-Type": "application/json",
                    "User-Agent": "ct-celltype/0.1",
                },
            )
            if not xref_error and isinstance(xref_data, list):
                for xref in xref_data:
                    if not isinstance(xref, dict):
                        continue
                    xref_id = str(xref.get("id") or "")
                    if xref.get("type") == "gene" and xref_id.startswith("ENSG"):
                        ensembl_id = xref_id
                        break
        except Exception:
            # Best-effort lookup: any failure falls through to the
            # "could not resolve" error below.
            pass

    if not ensembl_id:
        return {"error": f"Could not resolve Ensembl ID for gene '{gene}'. Check the gene symbol is correct.", "summary": f"Could not resolve Ensembl ID for gene '{gene}'. Check the gene symbol is correct."}

    # Fetch gene data from HPA JSON API
    url = f"https://www.proteinatlas.org/{ensembl_id}.json"
    resp, error = request(
        "GET",
        url,
        timeout=30,
        retries=2,
        headers={"User-Agent": "ct-celltype/0.1"},
        raise_for_status=False,
    )
    if error:
        return {"error": f"Failed to fetch HPA data: {error}", "summary": f"Failed to fetch HPA data: {error}"}
    if resp.status_code != 200:
        return {"error": f"HPA API returned status {resp.status_code} for {gene}", "summary": f"HPA API error for {gene}"}

    content_type = ""
    try:
        ct_raw = resp.headers.get("content-type", "")
        if isinstance(ct_raw, str):
            content_type = ct_raw.lower()
    except Exception:
        # A missing or odd headers object just means the type is unknown.
        pass
    if content_type and "json" not in content_type:
        return {"error": f"HPA API returned {content_type} instead of JSON for {gene}", "summary": f"HPA returned non-JSON for {gene}"}

    try:
        gene_json = resp.json()
    except Exception:
        return {"error": f"HPA API returned invalid JSON for {gene}", "summary": f"HPA invalid JSON for {gene}"}
    if not isinstance(gene_json, dict):
        # Everything below reads the document as a mapping.
        return {"error": f"HPA API returned unexpected JSON structure for {gene}", "summary": f"HPA unexpected JSON for {gene}"}

    # Extract prognostic data
    prognostic_key = re.compile(r"Cancer prognostics - (.+?) \((TCGA|validation)\)")
    prognostics = []
    for key, val in gene_json.items():
        if not key.startswith("Cancer prognostics -") or not isinstance(val, dict):
            continue
        m = prognostic_key.match(key)
        if m and val.get("is_prognostic"):
            prognostics.append({
                "cancer_type": m.group(1),
                "dataset": m.group(2),
                "direction": val.get("prognostic type", ""),
                "status": val.get("prognostic", ""),
                "p_value": val.get("p_val", ""),
            })

    gene_info = {
        "cancer_specificity": gene_json.get("RNA cancer specificity", ""),
        "cancer_distribution": gene_json.get("RNA cancer distribution", ""),
        "tissue_specificity": gene_json.get("RNA tissue specificity", ""),
    }

    # Extract RNA expression by cancer type
    rna_cancer = gene_json.get("RNA cancer sample", {})
    cancer_expression = []
    if isinstance(rna_cancer, dict):
        for cancer_type, data in rna_cancer.items():
            if not isinstance(data, dict):
                continue
            raw_value = data.get("value", 0)
            try:
                fpkm = float(raw_value) if raw_value else 0
            except (TypeError, ValueError):
                # Non-numeric payload: treat as not expressed instead of crashing.
                fpkm = 0
            # log2 is undefined for fpkm <= -1; such values score 0.
            if raw_value and fpkm > -1:
                expr_score = round(math.log2(fpkm + 1), 3)
            else:
                expr_score = 0
            cancer_expression.append({
                "cancer_type": cancer_type,
                "fpkm": fpkm,
                "expr_score": expr_score,
            })

    cancer_expression.sort(key=lambda x: x["fpkm"], reverse=True)

    # Classify expression levels
    for entry in cancer_expression:
        fpkm = entry["fpkm"]
        if fpkm >= 10:
            entry["level"] = "HIGH"
        elif fpkm >= 3:
            entry["level"] = "MEDIUM"
        elif fpkm >= 1:
            entry["level"] = "LOW"
        else:
            entry["level"] = "VERY_LOW"

    high_expr = [e for e in cancer_expression if e["level"] in ("HIGH", "MEDIUM")]

    return {
        "summary": (
            f"TCGA stratification for {gene}:\n"
            f"Expressed (FPKM>=3) in {len(high_expr)}/{len(cancer_expression)} cancer types\n"
            f"Prognostic in {len(prognostics)} cancer types\n"
            f"Cancer specificity: {gene_info['cancer_specificity']}"
        ),
        "gene": gene,
        "gene_info": gene_info,
        "cancer_expression": cancer_expression,
        "prognostics": prognostics,
    }
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
@registry.register(
    name="clinical.trial_search",
    description="Search ClinicalTrials.gov for relevant clinical trials by gene, drug, or indication",
    category="clinical",
    parameters={
        "query": "Search term (gene name, drug name, indication, or free text)",
        "status": "Optional trial status filter: RECRUITING, COMPLETED, ACTIVE_NOT_RECRUITING, etc.",
    },
    usage_guide="You want to find clinical trials for a target, compound, or disease. Use to assess clinical precedent, competitive landscape, and development activity.",
)
def trial_search(query: str, status: str = "", **kwargs) -> dict:
    """Search the ClinicalTrials.gov API v2 for clinical trials.

    Requests a single page of up to 20 studies and flattens each study into
    NCT ID, title, status, phase, conditions, interventions, sponsor,
    enrollment and start date.

    Args:
        query: Free-text search term; must not be blank.
        status: Optional value for the ``filter.overallStatus`` parameter.
        **kwargs: Accepted and ignored.

    Returns:
        On success a dict with ``summary``, ``query``, ``total_count`` (number
        of studies on the returned page), ``has_more`` (whether the response
        carried a ``nextPageToken``), ``trials``, ``phase_distribution`` and
        ``status_distribution``. On failure a dict with ``error`` and
        ``summary``.
    """
    try:
        import httpx  # noqa: F401 -- availability gate only
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}

    # Same guard as trial_design_benchmark: never send a blank search term.
    if not query or not query.strip():
        return {"error": "query is required", "summary": "No query provided"}

    url = "https://clinicaltrials.gov/api/v2/studies"
    params = {
        "query.term": query.strip(),
        "pageSize": 20,
    }
    if status:
        params["filter.overallStatus"] = status

    data, error = request_json(
        "GET",
        url,
        params=params,
        timeout=15,
        retries=2,
    )
    if error:
        return {"error": f"ClinicalTrials.gov search failed: {error}", "summary": f"ClinicalTrials.gov search failed: {error}"}
    if not isinstance(data, dict):
        # The parsing below reads the payload as a JSON object.
        return {
            "error": "ClinicalTrials.gov search failed: unexpected response format",
            "summary": "ClinicalTrials.gov search failed: unexpected response format",
        }

    studies = data.get("studies") or []
    total_count = len(studies)
    has_more = data.get("nextPageToken") is not None

    trials = []
    phase_counts = {}
    status_counts = {}

    for study in studies:
        if not isinstance(study, dict):
            continue
        # `or {}` also covers keys that are present but null in the JSON,
        # which `.get(key, {})` would hand back as None.
        proto = study.get("protocolSection") or {}
        ident = proto.get("identificationModule") or {}
        status_mod = proto.get("statusModule") or {}
        design = proto.get("designModule") or {}
        arms = proto.get("armsInterventionsModule") or {}
        cond_mod = proto.get("conditionsModule") or {}
        sponsor_mod = proto.get("sponsorCollaboratorsModule") or {}

        overall_status = status_mod.get("overallStatus", "")
        phases = design.get("phases") or []
        conditions = cond_mod.get("conditions") or []
        interventions = [
            {"type": iv.get("type", ""), "name": iv.get("name", "")}
            for iv in (arms.get("interventions") or [])
            if isinstance(iv, dict)
        ]

        trials.append({
            "nct_id": ident.get("nctId", ""),
            "title": ident.get("briefTitle", ""),
            "status": overall_status,
            "phase": ", ".join(phases) if phases else "N/A",
            "conditions": conditions[:5],  # Cap to keep output manageable
            "interventions": interventions[:5],
            "sponsor": (sponsor_mod.get("leadSponsor") or {}).get("name", ""),
            "enrollment": (design.get("enrollmentInfo") or {}).get("count", ""),
            "start_date": (status_mod.get("startDateStruct") or {}).get("date", ""),
        })

        # Aggregate counts; a study listing several phases counts once per phase.
        for p in phases:
            phase_counts[p] = phase_counts.get(p, 0) + 1
        status_counts[overall_status] = status_counts.get(overall_status, 0) + 1

    # Build summary
    if trials:
        top_phases = ", ".join(f"{k}: {v}" for k, v in sorted(phase_counts.items()))
        top_statuses = ", ".join(f"{k}: {v}" for k, v in sorted(status_counts.items()))
        summary = (
            f"ClinicalTrials.gov search '{query}': {total_count}{'+ (more pages)' if has_more else ''} results\n"
            f"Phase distribution: {top_phases}\n"
            f"Status distribution: {top_statuses}"
        )
    else:
        summary = f"No clinical trials found for '{query}'"

    return {
        "summary": summary,
        "query": query,
        "total_count": total_count,
        "has_more": has_more,
        "trials": trials,
        "phase_distribution": phase_counts,
        "status_distribution": status_counts,
    }
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
def _normalize_phase_token(phase_value: str) -> str:
|
|
549
|
+
"""Normalize trial phase labels for robust filtering."""
|
|
550
|
+
return re.sub(r"[^A-Z0-9]", "", str(phase_value or "").upper())
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
@registry.register(
    name="clinical.trial_design_benchmark",
    description="Benchmark clinical trial design patterns for a query (endpoints, enrollment, randomization, biomarker criteria)",
    category="clinical",
    parameters={
        "query": "Search term for indication/target/drug",
        "phase": "Optional phase filter (e.g., 'PHASE2', 'PHASE3', 'EARLY_PHASE1')",
        "status": "Optional trial status filter (e.g., RECRUITING, COMPLETED)",
        "max_results": "Max studies to include from ClinicalTrials.gov API v2 (default 20, max 100)",
    },
    usage_guide=(
        "Use to benchmark protocol design against the current landscape. Summarizes common "
        "endpoints, intervention patterns, enrollment benchmarks, and key eligibility traits."
    ),
)
def trial_design_benchmark(
    query: str,
    phase: str = "",
    status: str = "",
    max_results: int = 20,
    **kwargs,
) -> dict:
    """Benchmark trial design characteristics from ClinicalTrials.gov API v2.

    Fetches a single page of studies matching ``query`` and aggregates
    phase/status distributions, primary endpoints, interventions, enrollment
    and simple design flags (randomized, blinded, placebo-controlled,
    biomarker or ECOG eligibility criteria).

    Args:
        query: Search term sent as ``query.term``; required.
        phase: Optional phase label. It is applied client-side after the
            fetch, so fewer than ``max_results`` trials may come back.
        status: Optional value sent as ``filter.overallStatus``.
        max_results: Page size, clamped to 1..100. Values that cannot be
            converted to ``int`` fall back to 20.
        **kwargs: Accepted and ignored.

    Returns:
        A dict that always contains ``summary``. On a request failure or a
        missing query it carries ``error``; when no study survives the
        filters it carries an empty ``trials`` list; otherwise it carries the
        aggregated statistics and the per-trial records.
    """
    del kwargs
    if not query or not query.strip():
        return {"error": "query is required", "summary": "No query provided"}

    try:
        page_size = int(max_results or 20)
    except (TypeError, ValueError):
        page_size = 20
    page_size = max(1, min(page_size, 100))

    params = {
        "query.term": query.strip(),
        "pageSize": str(page_size),
    }
    if status:
        params["filter.overallStatus"] = status

    data, error = request_json(
        "GET",
        "https://clinicaltrials.gov/api/v2/studies",
        params=params,
        timeout=20,
        retries=2,
    )
    if error:
        return {
            "error": f"ClinicalTrials.gov benchmark failed: {error}",
            "summary": f"Clinical trial design benchmark failed: {error}",
        }

    data = data or {}
    studies = data.get("studies", []) or []
    has_more = data.get("nextPageToken") is not None
    phase_filter = (phase or "").strip()
    phase_filter_norm = _normalize_phase_token(phase_filter) if phase_filter else ""

    # Substring matching is fine for the long terms, but "alk" as a substring
    # also hits "alkaline" (e.g. alkaline phosphatase lab limits), so the gene
    # symbol is matched as a whole word instead.
    biomarker_terms = ("biomarker", "mutation", "genotype", "expression", "pd-l1", "her2", "egfr")
    alk_word = re.compile(r"\balk\b")

    trials = []
    phase_counts = {}
    status_counts = {}
    endpoint_counts = {}
    intervention_counts = {}
    enrollment_values = []

    design_patterns = {
        "randomized_trials": 0,
        "blinded_trials": 0,
        "placebo_control_trials": 0,
        "biomarker_criteria_trials": 0,
        "ecog_criteria_trials": 0,
    }

    for study in studies:
        # Every module is optional in the payload; treat explicit nulls like
        # missing keys so one sparse record cannot abort the whole benchmark.
        proto = study.get("protocolSection", {}) or {}
        ident = proto.get("identificationModule", {}) or {}
        status_mod = proto.get("statusModule", {}) or {}
        design_mod = proto.get("designModule", {}) or {}
        outcomes_mod = proto.get("outcomesModule", {}) or {}
        elig_mod = proto.get("eligibilityModule", {}) or {}
        arms_mod = proto.get("armsInterventionsModule", {}) or {}
        cond_mod = proto.get("conditionsModule", {}) or {}
        sponsor_mod = proto.get("sponsorCollaboratorsModule", {}) or {}

        phases = design_mod.get("phases", []) or []
        if phase_filter_norm:
            phase_tokens = {_normalize_phase_token(p) for p in phases}
            if phase_filter_norm not in phase_tokens:
                continue

        overall_status = status_mod.get("overallStatus", "") or "UNKNOWN"
        phase_label = ", ".join(phases) if phases else "N/A"
        phase_counts[phase_label] = phase_counts.get(phase_label, 0) + 1
        status_counts[overall_status] = status_counts.get(overall_status, 0) + 1

        design_info = design_mod.get("designInfo", {}) or {}
        allocation = design_info.get("allocation", "")
        intervention_model = design_info.get("interventionModel", "")
        masking = (design_info.get("maskingInfo", {}) or {}).get("masking", "")

        interventions = []
        for iv in arms_mod.get("interventions", []) or []:
            iv_name = (iv.get("name", "") or "").strip()
            if iv_name:
                interventions.append(iv_name)
                intervention_counts[iv_name] = intervention_counts.get(iv_name, 0) + 1

        primary_endpoints = []
        for out in outcomes_mod.get("primaryOutcomes", []) or []:
            measure = (out.get("measure", "") or "").strip()
            if measure:
                primary_endpoints.append(measure)
                endpoint_counts[measure] = endpoint_counts.get(measure, 0) + 1

        # Only values that convert cleanly feed the median; anything else is
        # passed through untouched in the per-trial record.
        enrollment_raw = (design_mod.get("enrollmentInfo", {}) or {}).get("count")
        try:
            enrollment = int(enrollment_raw)
        except (TypeError, ValueError, OverflowError):
            enrollment = enrollment_raw
        else:
            enrollment_values.append(enrollment)

        eligibility_text = (elig_mod.get("eligibilityCriteria", "") or "").lower()
        biomarker_criteria = (
            any(term in eligibility_text for term in biomarker_terms)
            or alk_word.search(eligibility_text) is not None
        )
        ecog_criteria = "ecog" in eligibility_text

        # Normalise "NON-RANDOMIZED" to "NON_RANDOMIZED" so it is excluded.
        allocation_norm = str(allocation).strip().upper().replace("-", "_")
        if "RANDOMIZED" in allocation_norm and "NON_RANDOMIZED" not in allocation_norm:
            design_patterns["randomized_trials"] += 1
        if masking and str(masking).upper() not in {"NONE", "OPEN_LABEL"}:
            design_patterns["blinded_trials"] += 1
        if any("placebo" in iv.lower() for iv in interventions):
            design_patterns["placebo_control_trials"] += 1
        if biomarker_criteria:
            design_patterns["biomarker_criteria_trials"] += 1
        if ecog_criteria:
            design_patterns["ecog_criteria_trials"] += 1

        trials.append({
            "nct_id": ident.get("nctId", ""),
            "title": ident.get("briefTitle", ""),
            "phase": phase_label,
            "status": overall_status,
            "study_type": design_mod.get("studyType", ""),
            "allocation": allocation,
            "intervention_model": intervention_model,
            "masking": masking,
            "enrollment": enrollment,
            "conditions": (cond_mod.get("conditions", []) or [])[:5],
            "interventions": interventions[:8],
            "primary_endpoints": primary_endpoints[:8],
            "sponsor": (sponsor_mod.get("leadSponsor", {}) or {}).get("name", ""),
            "start_date": (status_mod.get("startDateStruct", {}) or {}).get("date", ""),
            "biomarker_criteria": biomarker_criteria,
            "ecog_criteria": ecog_criteria,
        })

    if not trials:
        phase_text = f", phase={phase_filter}" if phase_filter else ""
        status_text = f", status={status}" if status else ""
        return {
            "query": query,
            "phase_filter": phase_filter,
            "status_filter": status,
            "trials": [],
            "summary": f"No trials found for '{query}' with current filters{phase_text}{status_text}.",
        }

    median_enrollment = float(np.median(enrollment_values)) if enrollment_values else None

    # sorted() is stable, so ties keep first-seen order.
    endpoint_top = sorted(endpoint_counts.items(), key=lambda kv: kv[1], reverse=True)[:10]
    intervention_top = sorted(intervention_counts.items(), key=lambda kv: kv[1], reverse=True)[:10]

    top_endpoint_text = ", ".join(f"{name} ({count})" for name, count in endpoint_top[:3]) or "none"
    summary = (
        f"Trial design benchmark for '{query}': {len(trials)} trial(s)"
        f"{' (+ more pages)' if has_more else ''}. "
        f"Median enrollment: {int(median_enrollment) if median_enrollment is not None else 'NA'}. "
        f"Top primary endpoints: {top_endpoint_text}."
    )

    return {
        "summary": summary,
        "query": query,
        "phase_filter": phase_filter,
        "status_filter": status,
        "has_more": has_more,
        "n_trials": len(trials),
        "median_enrollment": median_enrollment,
        "phase_distribution": phase_counts,
        "status_distribution": status_counts,
        "design_patterns": design_patterns,
        "top_primary_endpoints": [{"endpoint": k, "count": v} for k, v in endpoint_top],
        "top_interventions": [{"intervention": k, "count": v} for k, v in intervention_top],
        "trials": trials,
    }
|
|
748
|
+
|
|
749
|
+
|
|
750
|
+
def _classify_endpoint_family(endpoint) -> str:
    """Map one primary-endpoint measure to an endpoint-family key.

    Families are tested in a fixed priority order and the first match wins;
    anything unmatched is ``"other"``.
    """
    text = str(endpoint).lower()
    if "overall survival" in text or text.strip() == "os":
        return "overall_survival"
    if re.search(r"progression[- ]free survival", text) or "pfs" in text:
        return "progression_free_survival"
    # "orr" must be a standalone token: as a bare substring it also matches
    # words such as "correlation" and "corresponding".
    if "response rate" in text or re.search(r"\borr\b", text):
        return "response_rate"
    if "adverse event" in text or "safety" in text or "tolerability" in text:
        return "safety_tolerability"
    if "quality of life" in text or "qol" in text or "patient-reported" in text:
        return "quality_of_life"
    if any(k in text for k in ("biomarker", "mutation", "pd-l1", "ctdna", "mrd")):
        return "biomarker_driven"
    return "other"


@registry.register(
    name="clinical.endpoint_benchmark",
    description="Benchmark endpoint usage patterns and enrollment norms for an indication/target query",
    category="clinical",
    parameters={
        "query": "Search term for indication/target/drug",
        "phase": "Optional phase filter",
        "status": "Optional status filter",
        "max_results": "Maximum studies to include (default 30, max 100)",
    },
    usage_guide=(
        "Use during protocol planning to benchmark what endpoints and enrollment levels are commonly used "
        "by competitors in similar trials."
    ),
)
def endpoint_benchmark(
    query: str,
    phase: str = "",
    status: str = "",
    max_results: int = 30,
    **kwargs,
) -> dict:
    """Summarize endpoint conventions from ClinicalTrials.gov records.

    Delegates the fetch to ``trial_design_benchmark`` and then buckets every
    primary endpoint of every returned trial into a coarse endpoint family.

    Args:
        query: Search term forwarded to ``trial_design_benchmark``.
        phase: Optional phase filter, forwarded unchanged.
        status: Optional status filter, forwarded unchanged.
        max_results: Maximum number of studies to request.
        **kwargs: Accepted and ignored.

    Returns:
        A dict with ``summary`` plus either ``error`` (propagated from the
        underlying benchmark), an empty ``trials`` list, or the family counts,
        up to five example endpoints per family, and enrollment statistics.
        Family counts are per endpoint, except that a trial listing no primary
        endpoint adds one to ``"other"``.
    """
    del kwargs
    base = trial_design_benchmark(
        query=query,
        phase=phase,
        status=status,
        max_results=max_results,
    )
    if "error" in base:
        return {
            "error": base["error"],
            "summary": base["summary"],
        }

    trials = base.get("trials", []) or []
    if not trials:
        return {
            "summary": f"No trials available for endpoint benchmark on '{query}'.",
            "query": query,
            "trials": [],
        }

    endpoint_family_counts = {
        "overall_survival": 0,
        "progression_free_survival": 0,
        "response_rate": 0,
        "safety_tolerability": 0,
        "quality_of_life": 0,
        "biomarker_driven": 0,
        "other": 0,
    }

    endpoint_examples = {k: [] for k in endpoint_family_counts}
    for trial in trials:
        endpoints = trial.get("primary_endpoints", []) or []
        if not endpoints:
            endpoint_family_counts["other"] += 1
            continue
        for endpoint in endpoints:
            key = _classify_endpoint_family(endpoint)
            endpoint_family_counts[key] += 1
            if len(endpoint_examples[key]) < 5:
                endpoint_examples[key].append(endpoint)

    # Enrollment statistics: only integer enrollments are usable; raw values
    # that could not be converted upstream are skipped.
    enrollments = [
        trial.get("enrollment")
        for trial in trials
        if isinstance(trial.get("enrollment"), int)
    ]
    enrollment_median = float(np.median(enrollments)) if enrollments else None
    enrollment_p75 = float(np.percentile(enrollments, 75)) if len(enrollments) >= 2 else None

    ranked_families = sorted(
        endpoint_family_counts.items(),
        key=lambda kv: kv[1],
        reverse=True,
    )
    top_families = [{"family": k, "count": v} for k, v in ranked_families if v > 0][:6]
    top_family_text = ", ".join(f"{x['family']} ({x['count']})" for x in top_families[:3]) or "none"

    summary = (
        f"Endpoint benchmark for '{query}': {len(trials)} trial(s). "
        f"Top endpoint families: {top_family_text}. "
        f"Median enrollment: {int(enrollment_median) if enrollment_median is not None else 'NA'}."
    )

    return {
        "summary": summary,
        "query": query,
        "phase_filter": phase,
        "status_filter": status,
        "n_trials": len(trials),
        "endpoint_families": endpoint_family_counts,
        "top_endpoint_families": top_families,
        "endpoint_examples": endpoint_examples,
        "median_enrollment": enrollment_median,
        "p75_enrollment": enrollment_p75,
        "phase_distribution": base.get("phase_distribution", {}),
        "status_distribution": base.get("status_distribution", {}),
        "trials": trials,
    }
|
|
873
|
+
|
|
874
|
+
|
|
875
|
+
@registry.register(
    name="clinical.competitive_landscape",
    description="Aggregate competitive intelligence for a target or indication from trials, ChEMBL, and Open Targets",
    category="clinical",
    parameters={
        "gene": "Target gene symbol (e.g. CRBN, BRAF, EGFR)",
        "indication": "Optional indication to focus the search (e.g. 'melanoma', 'lung cancer')",
    },
    usage_guide="You want a comprehensive view of the competitive landscape around a drug target — combines ClinicalTrials.gov, ChEMBL, and Open Targets to show active programs, phase distribution, and mechanism diversity. Use for strategic positioning and differentiation.",
)
def competitive_landscape(gene: str, indication: str = "", **kwargs) -> dict:
    """Aggregate competitive intelligence from multiple sources.

    Combines:
    1. ClinicalTrials.gov: active clinical programs
    2. ChEMBL: known compounds and bioactivities against the target
    3. Open Targets: known drugs and mechanisms via GraphQL

    Each source is queried independently and best-effort: a failure in one is
    recorded as an ``error`` entry under that source's key and does not stop
    the others.

    Args:
        gene: Target gene symbol used for all three lookups.
        indication: Optional indication. It is appended to the trial query and
            used as a case-insensitive substring filter on Open Targets
            disease names.
        **kwargs: Accepted and ignored.

    Returns:
        A dict with ``gene``, ``indication``, one sub-dict each for
        ``trials``, ``chembl`` and ``open_targets``, and a text ``summary``.
        If ``httpx`` cannot be imported, only ``error`` and ``summary``.
    """
    try:
        # Imported only as an availability check; the name is not used below.
        # NOTE(review): presumably request_json depends on httpx — confirm.
        import httpx  # noqa: F401
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}

    results = {
        "gene": gene,
        "indication": indication or "all",
    }

    # --- Source 1: ClinicalTrials.gov ---
    ct_query = f"{gene} {indication}".strip() if indication else gene
    trial_data = trial_search(query=ct_query)

    if "error" not in trial_data:
        results["trials"] = {
            "total_count": trial_data.get("total_count", 0),
            "phase_distribution": trial_data.get("phase_distribution", {}),
            "status_distribution": trial_data.get("status_distribution", {}),
            "top_trials": trial_data.get("trials", [])[:10],
        }
    else:
        results["trials"] = {"error": trial_data["error"], "total_count": 0}

    # --- Source 2: ChEMBL target search + activities ---
    chembl_compounds = []
    chembl_base = "https://www.ebi.ac.uk/chembl/api/data"
    headers = {"Accept": "application/json"}

    # Plain substring markers. "agonist" is handled separately below because
    # it is itself a substring of "antagonist".
    moa_markers = (
        ("inhibit", "Inhibitor"),
        ("degrad", "Degrader"),
        ("antagonist", "Antagonist"),
        ("allosteric", "Allosteric modulator"),
        ("antibod", "Antibody"),
        ("covalent", "Covalent binder"),
    )

    try:
        # Find target in ChEMBL
        tgt_data, error = request_json(
            "GET",
            f"{chembl_base}/target/search.json",
            params={"q": gene, "limit": 5},
            headers=headers,
            timeout=10,
            retries=2,
        )
        if error:
            raise RuntimeError(error)

        targets = (tgt_data or {}).get("targets", []) or []
        chembl_target_id = None
        for tgt in targets:
            # Prefer human SINGLE PROTEIN targets
            if (tgt.get("organism", "") == "Homo sapiens" and
                    tgt.get("target_type", "") == "SINGLE PROTEIN"):
                chembl_target_id = tgt.get("target_chembl_id")
                break

        # Otherwise fall back to the first search hit.
        if not chembl_target_id and targets:
            chembl_target_id = targets[0].get("target_chembl_id")

        if chembl_target_id:
            # Get activities for the target
            act_data, error = request_json(
                "GET",
                f"{chembl_base}/activity.json",
                params={
                    "target_chembl_id": chembl_target_id,
                    "limit": 50,
                    "standard_type__in": "IC50,Ki,Kd,EC50",
                },
                headers=headers,
                timeout=10,
                retries=2,
            )
            if error:
                raise RuntimeError(error)

            # Deduplicate by molecule: the first activity seen per molecule wins.
            seen_mols = set()
            moa_types = set()
            for act in (act_data or {}).get("activities", []) or []:
                mol_id = act.get("molecule_chembl_id", "")
                if mol_id and mol_id not in seen_mols:
                    seen_mols.add(mol_id)
                    chembl_compounds.append({
                        "chembl_id": mol_id,
                        "name": act.get("molecule_pref_name", "") or mol_id,
                        "activity_type": act.get("standard_type", ""),
                        "activity_value": act.get("standard_value"),
                        "activity_units": act.get("standard_units", ""),
                        "pchembl": act.get("pchembl_value"),
                    })
                assay_desc = act.get("assay_description", "")
                if assay_desc:
                    # Extract broad MoA categories from assay descriptions
                    desc_lower = assay_desc.lower()
                    for needle, label in moa_markers:
                        if needle in desc_lower:
                            moa_types.add(label)
                    # Strip "antagonist" first so an antagonist assay is not
                    # also reported as an agonist.
                    if "agonist" in desc_lower.replace("antagonist", ""):
                        moa_types.add("Agonist")

            results["chembl"] = {
                "target_chembl_id": chembl_target_id,
                "unique_compounds": len(chembl_compounds),
                "moa_types": sorted(moa_types),
                "top_compounds": chembl_compounds[:15],
            }
        else:
            results["chembl"] = {"error": f"No ChEMBL target found for {gene}", "unique_compounds": 0}

    except Exception as e:
        # Deliberate best-effort boundary: report the failure for this source
        # and carry on with the remaining ones.
        results["chembl"] = {"error": f"ChEMBL query failed: {e}", "unique_compounds": 0}

    # --- Source 3: Open Targets known drugs (GraphQL) ---
    ot_drugs = []
    ot_url = "https://api.platform.opentargets.org/api/v4/graphql"
    graphql_query = """
    query knownDrugs($ensemblId: String!) {
        target(ensemblId: $ensemblId) {
            id
            approvedSymbol
            knownDrugs(size: 30) {
                uniqueDrugs
                uniqueTargets
                rows {
                    drugId
                    prefName
                    drugType
                    mechanismOfAction
                    phase
                    status
                    disease {
                        id
                        name
                    }
                }
            }
        }
    }
    """

    # Map gene symbol to Ensembl ID via Open Targets search
    try:
        search_data, error = request_json(
            "POST",
            ot_url,
            json={
                "query": """
                query searchTarget($q: String!) {
                    search(queryString: $q, entityNames: ["target"], page: {size: 5, index: 0}) {
                        hits {
                            id
                            name
                            entity
                        }
                    }
                }
                """,
                "variables": {"q": gene},
            },
            timeout=10,
            retries=2,
        )
        if error:
            raise RuntimeError(error)

        # GraphQL returns explicit nulls for unresolved fields, so every level
        # is coalesced to an empty container before descending.
        search_node = ((search_data or {}).get("data") or {}).get("search") or {}
        hits = search_node.get("hits") or []
        ensembl_id = None
        for hit in hits:
            if hit.get("entity") == "target":
                ensembl_id = hit.get("id")
                break

        if ensembl_id:
            drugs_data, error = request_json(
                "POST",
                ot_url,
                json={
                    "query": graphql_query,
                    "variables": {"ensemblId": ensembl_id},
                },
                timeout=10,
                retries=2,
            )
            if error:
                raise RuntimeError(error)

            target_node = ((drugs_data or {}).get("data") or {}).get("target") or {}
            known_drugs = target_node.get("knownDrugs") or {}
            if known_drugs:
                # uniqueDrugs is the count reported by the API; it is not
                # recomputed after the indication filter below.
                unique_drugs = known_drugs.get("uniqueDrugs", 0) or 0
                phase_dist_ot = {}
                moa_set = set()

                for row in known_drugs.get("rows") or []:
                    drug_name = row.get("prefName", "") or row.get("drugId", "")
                    drug_phase = row.get("phase", 0)
                    moa = row.get("mechanismOfAction", "")
                    disease = row.get("disease") or {}
                    disease_name = disease.get("name", "") or ""

                    # Filter by indication if specified; rows without a
                    # disease name are kept.
                    if indication and disease_name:
                        if indication.lower() not in disease_name.lower():
                            continue

                    ot_drugs.append({
                        "drug": drug_name,
                        "drug_type": row.get("drugType", ""),
                        "mechanism": moa,
                        "phase": drug_phase,
                        "status": row.get("status", ""),
                        "disease": disease_name,
                    })

                    phase_key = f"Phase {drug_phase}" if drug_phase else "Unknown"
                    phase_dist_ot[phase_key] = phase_dist_ot.get(phase_key, 0) + 1
                    if moa:
                        moa_set.add(moa)

                results["open_targets"] = {
                    "ensembl_id": ensembl_id,
                    "unique_drugs": unique_drugs,
                    "phase_distribution": phase_dist_ot,
                    "mechanisms": sorted(moa_set),
                    "drugs": ot_drugs[:20],
                }
            else:
                results["open_targets"] = {"error": "No known drugs found", "unique_drugs": 0}
        else:
            results["open_targets"] = {"error": f"Could not resolve Ensembl ID for {gene}", "unique_drugs": 0}

    except Exception as e:
        # Same best-effort boundary as the ChEMBL section.
        results["open_targets"] = {"error": f"Open Targets query failed: {e}", "unique_drugs": 0}

    # --- Aggregate summary ---
    total_trials = results.get("trials", {}).get("total_count", 0)
    chembl_count = results.get("chembl", {}).get("unique_compounds", 0)
    ot_count = results.get("open_targets", {}).get("unique_drugs", 0)

    trial_phases = results.get("trials", {}).get("phase_distribution", {})
    chembl_moas = results.get("chembl", {}).get("moa_types", [])
    ot_moas = results.get("open_targets", {}).get("mechanisms", [])
    all_moas = sorted(set(chembl_moas + ot_moas))

    phase_str = ", ".join(f"{k}: {v}" for k, v in sorted(trial_phases.items())) if trial_phases else "none"
    moa_str = ", ".join(all_moas[:5]) if all_moas else "not characterized"

    ind_label = f" in {indication}" if indication else ""
    summary = (
        f"Competitive landscape for {gene}{ind_label}:\n"
        f"Clinical trials: {total_trials} ({phase_str})\n"
        f"ChEMBL compounds: {chembl_count}\n"
        f"Open Targets known drugs: {ot_count}\n"
        f"Mechanism diversity: {moa_str}"
    )

    results["summary"] = summary
    return results
|