celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/tools/clinical.py ADDED
@@ -0,0 +1,1153 @@
1
+ """
2
+ Clinical translation tools: indication mapping, patient population sizing, TCGA stratification.
3
+
4
+ References crews-glue-discovery/scripts/patient_population_sizing.py and tcga_stratification.py
5
+ for data sources and scoring logic.
6
+ """
7
+
8
+ import pandas as pd
9
+ import numpy as np
10
+ import re
11
+ from ct.tools import registry
12
+ from ct.tools.http_client import request, request_json
13
+
14
+
15
# Annual US incidence per cancer type (SEER/Globocan estimates).
# Keys are the labels referenced by the "incidence_key" field of
# LINEAGE_TO_CANCER; values are case counts per year.
US_INCIDENCE = {
    "Lung": 238_000,
    "Breast": 310_000,
    "Colorectal": 153_000,
    "Prostate": 288_000,
    "Lymphoma (NHL)": 80_000,
    "AML": 20_000,
    "ALL": 6_000,
    "Multiple Myeloma": 35_000,
    "Kidney": 82_000,
    "Liver": 42_000,
    "Ovarian": 20_000,
    "Pancreatic": 64_000,
    "Melanoma": 100_000,
    "Bladder": 83_000,
    "Thyroid": 44_000,
    "Glioma/Brain": 25_000,
    "Cervical": 14_000,
    "Endometrial/Uterine": 66_000,
    "Head & Neck": 66_000,
    "Gastric/Esophageal": 49_000,
    "Sarcoma": 14_000,
    "Neuroblastoma": 800,
    "Mesothelioma": 3_000,
}
41
+
42
# PRISM lineage -> standard cancer type mapping.
# Each row: (lineage, incidence_key, fraction, five_yr_survival, unmet_need, name)
#   incidence_key     key into US_INCIDENCE
#   fraction          multiplier applied to that incidence figure when sizing
#                     the indication (used as the subtype fraction)
#   five_yr_survival  five-year survival as a 0-1 fraction
#   unmet_need        boolean flag carried through to tool outputs
#   name              clinical indication name reported to the user
_LINEAGE_ROWS = (
    ("Lung", "Lung", 0.85, 0.25, True, "Non-Small Cell Lung Cancer"),
    ("CNS/Brain", "Glioma/Brain", 0.60, 0.05, True, "Diffuse Glioma"),
    ("Skin", "Melanoma", 1.0, 0.93, False, "Melanoma"),
    ("Lymphoid", "Lymphoma (NHL)", 0.85, 0.73, False, "B-Cell Lymphoma"),
    ("Head and Neck", "Head & Neck", 0.90, 0.66, False, "Head & Neck SCC"),
    ("Bowel", "Colorectal", 0.95, 0.65, False, "Colorectal Cancer"),
    ("Ovary/Fallopian Tube", "Ovarian", 0.90, 0.50, True, "Ovarian Cancer"),
    ("Pancreas", "Pancreatic", 0.85, 0.12, True, "Pancreatic Cancer"),
    ("Breast", "Breast", 0.95, 0.90, False, "Breast Cancer"),
    ("Prostate", "Prostate", 0.95, 0.97, False, "Prostate Cancer"),
    ("Myeloid", "AML", 1.0, 0.30, True, "Acute Myeloid Leukemia"),
    ("Liver", "Liver", 0.80, 0.20, True, "Hepatocellular Carcinoma"),
    ("Kidney", "Kidney", 0.85, 0.77, False, "Renal Cell Carcinoma"),
    ("Bladder/Urinary Tract", "Bladder", 0.90, 0.77, False, "Bladder Cancer"),
    ("Stomach", "Gastric/Esophageal", 0.65, 0.22, True, "Gastric/Esophageal Cancer"),
    ("Uterus", "Endometrial/Uterine", 0.90, 0.81, False, "Endometrial Cancer"),
    ("Cervix", "Cervical", 0.70, 0.66, True, "Cervical Cancer"),
    ("Bone", "Sarcoma", 0.11, 0.60, True, "Bone Sarcoma"),
    ("Soft Tissue", "Sarcoma", 0.05, 0.63, True, "Soft Tissue Sarcoma"),
    ("Pleura", "Mesothelioma", 0.80, 0.12, True, "Mesothelioma"),
)

LINEAGE_TO_CANCER = {
    lineage: {
        "incidence_key": incidence_key,
        "fraction": fraction,
        "five_yr_survival": five_yr_survival,
        "unmet_need": unmet_need,
        "name": name,
    }
    for lineage, incidence_key, fraction, five_yr_survival, unmet_need, name in _LINEAGE_ROWS
}
85
+
86
+
87
@registry.register(
    name="clinical.indication_map",
    description="Map compound sensitivity profiles to cancer indications with response rates",
    category="clinical",
    parameters={
        "compound_id": "Compound ID to map (or 'all')",
        "min_response_rate": "Minimum response rate to include (default 0.1)",
    },
    requires_data=["prism", "depmap_model"],
    usage_guide="You want to know which cancer types a compound is active against. Maps PRISM cell line sensitivity to clinical cancer indications. Use for indication selection and clinical positioning.",
)
def indication_map(compound_id: str = "all", min_response_rate: float = 0.1, **kwargs) -> dict:
    """Translate PRISM sensitivity into per-indication response rates.

    For each compound, only the rows at its highest tested dose are kept.
    Cell lines are grouped by lineage (looked up from the model metadata) and
    the response rate is the fraction of cell lines in the group whose LFC is
    below -0.5.

    Args:
        compound_id: A single ``pert_name`` value, or ``"all"`` to process
            every compound present in the PRISM table.
        min_response_rate: Lineages whose response rate is below this value
            are omitted.
        **kwargs: Accepted and ignored.

    Returns:
        A dict with ``summary``, ``n_indications`` and ``indications`` (a list
        of per compound/lineage records sorted by descending response rate).
        ``indications`` is empty when nothing qualified.
    """
    from ct.data.loaders import load_prism, load_model_metadata

    prism = load_prism()
    model = load_model_metadata()

    # Cell-line name -> lineage lookup, skipping rows where either is missing.
    name_lineage_pairs = (
        (rec.get("CCLEName", ""), rec.get("OncotreeLineage", "Unknown"))
        for _, rec in model.iterrows()
    )
    lineage_by_cell_line = {
        cell_line: lineage
        for cell_line, lineage in name_lineage_pairs
        if pd.notna(cell_line) and pd.notna(lineage)
    }

    if compound_id != "all":
        targets = [compound_id]
    else:
        targets = prism["pert_name"].unique().tolist()

    records = []
    for name in targets:
        rows = prism[prism["pert_name"] == name]
        if len(rows) == 0:
            continue

        # Restrict to the highest dose tested for this compound.
        top_dose = rows["pert_dose"].max()
        at_top_dose = rows[rows["pert_dose"] == top_dose].assign(
            lineage=lambda frame: frame["ccle_name"].map(lineage_by_cell_line)
        )

        for lineage, members in at_top_dose.groupby("lineage"):
            n_cells = len(members)
            # Too few cell lines (or no real lineage) gives no usable rate.
            if n_cells < 3 or lineage == "Unknown":
                continue

            lfc = members["LFC"]
            n_sensitive = (lfc < -0.5).sum()
            response_rate = n_sensitive / n_cells
            if response_rate < min_response_rate:
                continue

            # Unmapped lineages fall back to the raw lineage label.
            info = LINEAGE_TO_CANCER.get(lineage, {})
            records.append({
                "compound": name,
                "lineage": lineage,
                "cancer_type": info.get("name", lineage),
                "n_cell_lines": n_cells,
                "n_sensitive": int(n_sensitive),
                "response_rate": round(response_rate, 3),
                "mean_lfc": round(float(lfc.mean()), 3),
                "unmet_need": info.get("unmet_need"),
                "five_yr_survival": info.get("five_yr_survival"),
            })

    if not records:
        return {
            "summary": f"No indications found for {compound_id} (compound may not be in PRISM data or no lineages met criteria)",
            "n_indications": 0,
            "indications": [],
        }

    ranked = pd.DataFrame(records).sort_values("response_rate", ascending=False)

    if compound_id != "all":
        leaders = ranked.head(5)
        if len(leaders) > 0:
            leader_names = ", ".join(leaders["cancer_type"].tolist())
        else:
            leader_names = "none"
        summary = f"Indication mapping for {compound_id}: {len(ranked)} indications (top: {leader_names})"
    else:
        summary = f"Mapped {len(targets)} compounds across {ranked['cancer_type'].nunique()} indications"

    return {
        "summary": summary,
        "n_indications": len(ranked),
        "indications": ranked.to_dict("records"),
    }
178
+
179
+
180
@registry.register(
    name="clinical.population_size",
    description="Estimate addressable patient population per compound and indication using SEER incidence data",
    category="clinical",
    parameters={
        "compound_id": "Compound ID to size (or 'all')",
        "clinical_adjustment": "Clinical reality factor (default 0.10 = 10% of cell-line estimate)",
    },
    requires_data=["prism", "depmap_model"],
    usage_guide="You want to estimate how many patients could benefit from a compound — combines PRISM response rates with US cancer incidence data. Use for market sizing and clinical development prioritization.",
)
def population_size(compound_id: str = "all", clinical_adjustment: float = 0.10, **kwargs) -> dict:
    """Estimate addressable patient populations per compound and indication.

    addressable = annual_incidence x subtype_fraction x cell_line_response_rate
    clinical_adjusted = addressable x clinical_adjustment_factor

    Each intermediate value is truncated to an int, so per-compound totals are
    sums of truncated per-indication values.

    Args:
        compound_id: Compound to size, or ``"all"``.
        clinical_adjustment: Factor applied to the cell-line based estimate.
        **kwargs: Accepted and ignored.

    Returns:
        A dict with ``summary``, ``clinical_adjustment``, ``per_indication``
        (records sorted by descending addressable patients) and
        ``per_compound`` (totals keyed by compound). Both collections are
        empty when no indication could be sized.
    """
    # Indication mapping is run with a fixed 5% response-rate floor.
    ind_result = indication_map(compound_id=compound_id, min_response_rate=0.05)
    if "error" in ind_result:
        return ind_result

    results = []
    for ind in ind_result["indications"]:
        cancer_info = LINEAGE_TO_CANCER.get(ind["lineage"])
        if not cancer_info:
            continue  # lineage has no clinical mapping

        annual_incidence = US_INCIDENCE.get(cancer_info["incidence_key"])
        if annual_incidence is None:
            continue  # no incidence figure for this indication

        subtype_fraction = cancer_info["fraction"]
        base_population = int(annual_incidence * subtype_fraction)
        addressable = int(base_population * ind["response_rate"])
        clinical_est = int(addressable * clinical_adjustment)

        results.append({
            "compound": ind["compound"],
            "cancer_type": ind["cancer_type"],
            "annual_us_incidence": annual_incidence,
            "subtype_fraction": subtype_fraction,
            "base_population": base_population,
            "response_rate": ind["response_rate"],
            "addressable_patients": addressable,
            "clinical_adjusted": clinical_est,
            "mean_lfc": ind["mean_lfc"],
            "n_cell_lines": ind["n_cell_lines"],
            "unmet_need": ind.get("unmet_need"),
            "five_yr_survival": ind.get("five_yr_survival"),
        })

    if not results:
        return {
            "summary": f"No addressable populations identified for {compound_id} (compound may not be in PRISM data)",
            "clinical_adjustment": clinical_adjustment,
            "per_indication": [],
            "per_compound": {},
        }

    # From here on there is at least one row, so the frame and the
    # per-compound totals are guaranteed to be non-empty.
    df = pd.DataFrame(results).sort_values("addressable_patients", ascending=False)

    cpd_totals = df.groupby("compound").agg(
        total_addressable=("addressable_patients", "sum"),
        total_clinical=("clinical_adjusted", "sum"),
        n_indications=("cancer_type", "nunique"),
    ).sort_values("total_addressable", ascending=False)

    top_cpd = cpd_totals.index[0]
    top_row = cpd_totals.iloc[0]
    total = int(top_row["total_addressable"])
    # Report the summed per-indication clinical estimate so the summary agrees
    # with the "total_clinical" value returned in per_compound.
    total_clinical = int(top_row["total_clinical"])

    summary = (
        f"Patient population sizing ({clinical_adjustment:.0%} clinical adjustment):\n"
        f"Top compound: {top_cpd} ({total:,} addressable, "
        f"{total_clinical:,} clinical estimate)"
    )

    return {
        "summary": summary,
        "clinical_adjustment": clinical_adjustment,
        "per_indication": df.to_dict("records"),
        "per_compound": cpd_totals.to_dict("index"),
    }
272
+
273
+
274
@registry.register(
    name="clinical.tcga_stratify",
    description="Stratify patients by target expression using TCGA data from Human Protein Atlas",
    category="clinical",
    parameters={
        "gene": "Gene symbol to query (e.g. CDC25C, GATA2)",
    },
    usage_guide="You want to check if a target gene is expressed in patient tumors — queries TCGA expression data from Human Protein Atlas. Use for clinical biomarker validation and patient stratification strategy.",
)
def tcga_stratify(gene: str, **kwargs) -> dict:
    """Query Human Protein Atlas for TCGA expression data of one gene.

    The gene symbol is resolved to an Ensembl gene ID (built-in cache first,
    then the Ensembl REST xrefs endpoint) and the HPA per-gene JSON document
    is fetched. Expression per cancer type is reported as FPKM together with
    ``expr_score = log2(FPKM + 1)`` and a level (HIGH >= 10, MEDIUM >= 3,
    LOW >= 1, otherwise VERY_LOW). No compound-level convergence score is
    computed here.

    Args:
        gene: Gene symbol; surrounding whitespace is ignored.
        **kwargs: Accepted and ignored.

    Returns:
        On success a dict with ``summary``, ``gene``, ``gene_info``,
        ``cancer_expression`` (sorted by descending FPKM) and ``prognostics``.
        On failure a dict with ``error`` and ``summary``.
    """
    import math
    import re
    from urllib.parse import quote

    try:
        import httpx  # noqa: F401 -- imported only to verify it is installed
    except ImportError:
        return {"error": "httpx required for TCGA queries (pip install httpx)", "summary": "httpx required for TCGA queries (pip install httpx)"}

    gene = str(gene or "").strip()
    if not gene:
        return {"error": "gene symbol is required", "summary": "No gene symbol provided"}

    # Fast-path cache for common targets (avoids API call)
    _GENE_ENSEMBL_CACHE = {
        "CDC25C": "ENSG00000158402", "GATA2": "ENSG00000179348",
        "RBCK1": "ENSG00000125826", "ZNF687": "ENSG00000143373",
        "BCOR": "ENSG00000183337", "CEP57": "ENSG00000166037",
        "BTBD1": "ENSG00000084693", "FLCN": "ENSG00000154803",
        "LYZ": "ENSG00000090382", "CRBN": "ENSG00000113851",
        "PDCD2": "ENSG00000126249",
    }

    ensembl_id = _GENE_ENSEMBL_CACHE.get(gene.upper())
    if not ensembl_id:
        # Look up via Ensembl REST API. The symbol is caller-supplied text, so
        # it is percent-encoded before being placed in the URL path.
        encoded_symbol = quote(gene, safe="")
        try:
            xref_url = f"https://rest.ensembl.org/xrefs/symbol/homo_sapiens/{encoded_symbol}"
            xref_data, xref_error = request_json(
                "GET",
                xref_url,
                timeout=15,
                retries=2,
                headers={
                    "Content-Type": "application/json",
                    "User-Agent": "ct-celltype/0.1",
                },
            )
            if not xref_error and isinstance(xref_data, list):
                for xref in xref_data:
                    if not isinstance(xref, dict):
                        continue
                    if xref.get("type") == "gene" and str(xref.get("id", "")).startswith("ENSG"):
                        ensembl_id = xref["id"]
                        break
        except Exception:
            # Best-effort lookup: any failure falls through to the
            # "could not resolve" error below.
            pass

    if not ensembl_id:
        return {"error": f"Could not resolve Ensembl ID for gene '{gene}'. Check the gene symbol is correct.", "summary": f"Could not resolve Ensembl ID for gene '{gene}'. Check the gene symbol is correct."}

    # Fetch gene data from HPA JSON API
    url = f"https://www.proteinatlas.org/{ensembl_id}.json"
    resp, error = request(
        "GET",
        url,
        timeout=30,
        retries=2,
        headers={"User-Agent": "ct-celltype/0.1"},
        raise_for_status=False,
    )
    if error:
        return {"error": f"Failed to fetch HPA data: {error}", "summary": f"Failed to fetch HPA data: {error}"}
    if resp.status_code != 200:
        return {"error": f"HPA API returned status {resp.status_code} for {gene}", "summary": f"HPA API error for {gene}"}

    # Reject responses that declare a non-JSON content type; a missing or
    # unreadable header is tolerated and left to the JSON parser.
    content_type = ""
    try:
        ct_raw = resp.headers.get("content-type", "")
        if isinstance(ct_raw, str):
            content_type = ct_raw.lower()
    except Exception:
        pass
    if content_type and "json" not in content_type:
        return {"error": f"HPA API returned {content_type} instead of JSON for {gene}", "summary": f"HPA returned non-JSON for {gene}"}
    try:
        gene_json = resp.json()
    except Exception:
        return {"error": f"HPA API returned invalid JSON for {gene}", "summary": f"HPA invalid JSON for {gene}"}
    if not isinstance(gene_json, dict):
        return {"error": f"HPA API returned an unexpected JSON structure for {gene}", "summary": f"HPA unexpected JSON for {gene}"}

    # Extract prognostic data
    prognostic_key = re.compile(r"Cancer prognostics - (.+?) \((TCGA|validation)\)")
    prognostics = []
    for key, val in gene_json.items():
        if not key.startswith("Cancer prognostics -") or not isinstance(val, dict):
            continue
        m = prognostic_key.match(key)
        if m and val.get("is_prognostic"):
            prognostics.append({
                "cancer_type": m.group(1),
                "dataset": m.group(2),
                "direction": val.get("prognostic type", ""),
                "status": val.get("prognostic", ""),
                "p_value": val.get("p_val", ""),
            })

    gene_info = {
        "cancer_specificity": gene_json.get("RNA cancer specificity", ""),
        "cancer_distribution": gene_json.get("RNA cancer distribution", ""),
        "tissue_specificity": gene_json.get("RNA tissue specificity", ""),
    }

    def _expression_level(fpkm_value):
        """Bucket an FPKM value into the reported expression level."""
        if fpkm_value >= 10:
            return "HIGH"
        if fpkm_value >= 3:
            return "MEDIUM"
        if fpkm_value >= 1:
            return "LOW"
        return "VERY_LOW"

    # Extract RNA expression by cancer type
    rna_cancer = gene_json.get("RNA cancer sample", {})
    cancer_expression = []
    if isinstance(rna_cancer, dict):
        for cancer_type, data in rna_cancer.items():
            if not isinstance(data, dict):
                continue
            raw_value = data.get("value", 0)
            fpkm = 0
            expr_score = 0
            if raw_value:
                try:
                    fpkm = float(raw_value)
                except (TypeError, ValueError):
                    fpkm = 0  # unparsable value: treat as not expressed
                else:
                    # log2 is only defined for FPKM + 1 > 0.
                    if fpkm > -1:
                        expr_score = round(math.log2(fpkm + 1), 3)
            cancer_expression.append({
                "cancer_type": cancer_type,
                "fpkm": fpkm,
                "expr_score": expr_score,
                "level": _expression_level(fpkm),
            })

    cancer_expression.sort(key=lambda x: x["fpkm"], reverse=True)

    high_expr = [e for e in cancer_expression if e["level"] in ("HIGH", "MEDIUM")]

    return {
        "summary": (
            f"TCGA stratification for {gene}:\n"
            f"Expressed (FPKM>=3) in {len(high_expr)}/{len(cancer_expression)} cancer types\n"
            f"Prognostic in {len(prognostics)} cancer types\n"
            f"Cancer specificity: {gene_info['cancer_specificity']}"
        ),
        "gene": gene,
        "gene_info": gene_info,
        "cancer_expression": cancer_expression,
        "prognostics": prognostics,
    }
420
+
421
+
422
@registry.register(
    name="clinical.trial_search",
    description="Search ClinicalTrials.gov for relevant clinical trials by gene, drug, or indication",
    category="clinical",
    parameters={
        "query": "Search term (gene name, drug name, indication, or free text)",
        "status": "Optional trial status filter: RECRUITING, COMPLETED, ACTIVE_NOT_RECRUITING, etc.",
    },
    usage_guide="You want to find clinical trials for a target, compound, or disease. Use to assess clinical precedent, competitive landscape, and development activity.",
)
def trial_search(query: str, status: str = "", **kwargs) -> dict:
    """Search ClinicalTrials.gov API v2 for clinical trials.

    Requests a single page of up to 20 studies and extracts, per trial, the
    NCT ID, title, status, phase, conditions, interventions, sponsor,
    enrollment and start date. Missing or null sections in a study record are
    treated as empty rather than raising.

    Args:
        query: Search term sent as ``query.term``.
        status: Optional value for ``filter.overallStatus``.
        **kwargs: Accepted and ignored.

    Returns:
        On success a dict with ``summary``, ``query``, ``total_count`` (number
        of studies on the returned page), ``has_more``, ``trials``,
        ``phase_distribution`` and ``status_distribution``. On failure a dict
        with ``error`` and ``summary``.
    """
    try:
        import httpx  # noqa: F401 -- imported only to verify it is installed
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}

    url = "https://clinicaltrials.gov/api/v2/studies"
    params = {
        "query.term": query,
        "pageSize": 20,
    }
    if status:
        params["filter.overallStatus"] = status

    data, error = request_json(
        "GET",
        url,
        params=params,
        timeout=15,
        retries=2,
    )
    if error:
        return {"error": f"ClinicalTrials.gov search failed: {error}", "summary": f"ClinicalTrials.gov search failed: {error}"}

    payload = data if isinstance(data, dict) else {}
    studies = payload.get("studies") or []
    total_count = len(studies)
    has_more = payload.get("nextPageToken") is not None

    trials = []
    phase_counts = {}
    status_counts = {}

    for study in studies:
        # "or {}" guards against sections that are present but null.
        proto = study.get("protocolSection") or {}
        ident = proto.get("identificationModule") or {}
        status_mod = proto.get("statusModule") or {}
        design = proto.get("designModule") or {}
        arms = proto.get("armsInterventionsModule") or {}
        cond_mod = proto.get("conditionsModule") or {}
        sponsor_mod = proto.get("sponsorCollaboratorsModule") or {}

        # A null status would make the sorted() call in the summary fail on
        # mixed key types, so it is normalised to the empty string.
        overall_status = status_mod.get("overallStatus") or ""
        phases = design.get("phases") or []
        conditions = cond_mod.get("conditions") or []
        interventions = [
            {"type": iv.get("type", ""), "name": iv.get("name", "")}
            for iv in arms.get("interventions") or []
        ]

        trials.append({
            "nct_id": ident.get("nctId", ""),
            "title": ident.get("briefTitle", ""),
            "status": overall_status,
            "phase": ", ".join(phases) if phases else "N/A",
            "conditions": conditions[:5],  # Cap to keep output manageable
            "interventions": interventions[:5],
            "sponsor": (sponsor_mod.get("leadSponsor") or {}).get("name", ""),
            "enrollment": (design.get("enrollmentInfo") or {}).get("count", ""),
            "start_date": (status_mod.get("startDateStruct") or {}).get("date", ""),
        })

        # Aggregate counts; a multi-phase trial counts once per phase.
        for p in phases:
            phase_counts[p] = phase_counts.get(p, 0) + 1
        status_counts[overall_status] = status_counts.get(overall_status, 0) + 1

    # Build summary
    if trials:
        top_phases = ", ".join(f"{k}: {v}" for k, v in sorted(phase_counts.items()))
        top_statuses = ", ".join(f"{k}: {v}" for k, v in sorted(status_counts.items()))
        summary = (
            f"ClinicalTrials.gov search '{query}': {total_count}{'+ (more pages)' if has_more else ''} results\n"
            f"Phase distribution: {top_phases}\n"
            f"Status distribution: {top_statuses}"
        )
    else:
        summary = f"No clinical trials found for '{query}'"

    return {
        "summary": summary,
        "query": query,
        "total_count": total_count,
        "has_more": has_more,
        "trials": trials,
        "phase_distribution": phase_counts,
        "status_distribution": status_counts,
    }
546
+
547
+
548
+ def _normalize_phase_token(phase_value: str) -> str:
549
+ """Normalize trial phase labels for robust filtering."""
550
+ return re.sub(r"[^A-Z0-9]", "", str(phase_value or "").upper())
551
+
552
+
553
@registry.register(
    name="clinical.trial_design_benchmark",
    description="Benchmark clinical trial design patterns for a query (endpoints, enrollment, randomization, biomarker criteria)",
    category="clinical",
    parameters={
        "query": "Search term for indication/target/drug",
        "phase": "Optional phase filter (e.g., 'PHASE2', 'PHASE3', 'EARLY_PHASE1')",
        "status": "Optional trial status filter (e.g., RECRUITING, COMPLETED)",
        "max_results": "Max studies to include from ClinicalTrials.gov API v2 (default 20, max 100)",
    },
    usage_guide=(
        "Use to benchmark protocol design against the current landscape. Summarizes common "
        "endpoints, intervention patterns, enrollment benchmarks, and key eligibility traits."
    ),
)
def trial_design_benchmark(
    query: str,
    phase: str = "",
    status: str = "",
    max_results: int = 20,
    **kwargs,
) -> dict:
    """Benchmark trial design characteristics from ClinicalTrials.gov API v2.

    One page of studies is fetched for ``query`` (optionally filtered by
    status on the server). The phase filter is applied locally to the returned
    page, so fewer than ``max_results`` trials may remain. For the retained
    trials the function tallies phases, statuses, primary endpoints,
    interventions, enrollment and a few design traits (randomized, blinded,
    placebo-controlled, biomarker and ECOG eligibility mentions).

    Args:
        query: Search term; required and stripped before use.
        phase: Optional phase label; compared after normalisation to
            upper-case alphanumerics.
        status: Optional value for ``filter.overallStatus``.
        max_results: Page size, clamped to 1..100. Values that cannot be
            converted to an int fall back to 20.
        **kwargs: Accepted and ignored.

    Returns:
        A dict with the summary, filters, distributions, design-pattern
        counts, top endpoints/interventions and per-trial records; a reduced
        dict with an empty ``trials`` list when nothing matched; or a dict
        with ``error`` and ``summary`` on failure.
    """
    if not query or not query.strip():
        return {"error": "query is required", "summary": "No query provided"}

    try:
        requested = int(max_results or 20)
    except (TypeError, ValueError):
        requested = 20  # unusable value: fall back to the default page size
    max_results = max(1, min(requested, 100))

    params = {
        "query.term": query.strip(),
        "pageSize": str(max_results),
    }
    if status:
        params["filter.overallStatus"] = status

    data, error = request_json(
        "GET",
        "https://clinicaltrials.gov/api/v2/studies",
        params=params,
        timeout=20,
        retries=2,
    )
    if error:
        return {
            "error": f"ClinicalTrials.gov benchmark failed: {error}",
            "summary": f"Clinical trial design benchmark failed: {error}",
        }

    payload = data if isinstance(data, dict) else {}
    studies = payload.get("studies") or []
    has_more = payload.get("nextPageToken") is not None
    phase_filter = (phase or "").strip()
    phase_filter_norm = _normalize_phase_token(phase_filter) if phase_filter else ""

    # Eligibility terms matched as plain substrings of the lower-cased text.
    # NOTE(review): after lower-casing, "egfr" also matches the renal-function
    # abbreviation eGFR; left unchanged here.
    biomarker_substrings = (
        "biomarker", "mutation", "genotype", "expression", "pd-l1", "her2", "egfr",
    )
    # "alk" must be a whole word, otherwise "alkaline phosphatase", "walk",
    # etc. would flag a trial as having biomarker criteria.
    alk_word = re.compile(r"\balk\b")

    trials = []
    phase_counts = {}
    status_counts = {}
    endpoint_counts = {}
    intervention_counts = {}
    enrollment_values = []

    design_patterns = {
        "randomized_trials": 0,
        "blinded_trials": 0,
        "placebo_control_trials": 0,
        "biomarker_criteria_trials": 0,
        "ecog_criteria_trials": 0,
    }

    for study in studies:
        # "or {}" guards against sections that are present but null.
        proto = study.get("protocolSection") or {}
        ident = proto.get("identificationModule") or {}
        status_mod = proto.get("statusModule") or {}
        design_mod = proto.get("designModule") or {}
        outcomes_mod = proto.get("outcomesModule") or {}
        elig_mod = proto.get("eligibilityModule") or {}
        arms_mod = proto.get("armsInterventionsModule") or {}
        cond_mod = proto.get("conditionsModule") or {}
        sponsor_mod = proto.get("sponsorCollaboratorsModule") or {}

        phases = design_mod.get("phases", []) or []
        if phase_filter_norm:
            phase_tokens = {_normalize_phase_token(p) for p in phases}
            if phase_filter_norm not in phase_tokens:
                continue

        overall_status = status_mod.get("overallStatus", "") or "UNKNOWN"
        phase_label = ", ".join(phases) if phases else "N/A"
        phase_counts[phase_label] = phase_counts.get(phase_label, 0) + 1
        status_counts[overall_status] = status_counts.get(overall_status, 0) + 1

        design_info = design_mod.get("designInfo") or {}
        allocation = design_info.get("allocation", "")
        intervention_model = design_info.get("interventionModel", "")
        masking = (design_info.get("maskingInfo") or {}).get("masking", "")

        interventions = []
        for iv in arms_mod.get("interventions", []) or []:
            iv_name = (iv.get("name", "") or "").strip()
            if iv_name:
                interventions.append(iv_name)
                intervention_counts[iv_name] = intervention_counts.get(iv_name, 0) + 1

        primary_endpoints = []
        for out in outcomes_mod.get("primaryOutcomes", []) or []:
            measure = (out.get("measure", "") or "").strip()
            if measure:
                primary_endpoints.append(measure)
                endpoint_counts[measure] = endpoint_counts.get(measure, 0) + 1

        # Only integer-convertible enrollment counts feed the median; anything
        # else is passed through unchanged in the trial record.
        enrollment_raw = (design_mod.get("enrollmentInfo") or {}).get("count")
        try:
            enrollment = int(enrollment_raw)
        except (TypeError, ValueError):
            enrollment = enrollment_raw
        else:
            enrollment_values.append(enrollment)

        eligibility_text = (elig_mod.get("eligibilityCriteria", "") or "").lower()
        biomarker_criteria = (
            any(term in eligibility_text for term in biomarker_substrings)
            or alk_word.search(eligibility_text) is not None
        )
        ecog_criteria = "ecog" in eligibility_text

        allocation_norm = str(allocation).strip().upper().replace("-", "_")
        if allocation_norm == "RANDOMIZED" or (
            "RANDOMIZED" in allocation_norm and "NON_RANDOMIZED" not in allocation_norm
        ):
            design_patterns["randomized_trials"] += 1
        if masking and str(masking).upper() not in {"NONE", "OPEN_LABEL"}:
            design_patterns["blinded_trials"] += 1
        if any("placebo" in iv.lower() for iv in interventions):
            design_patterns["placebo_control_trials"] += 1
        if biomarker_criteria:
            design_patterns["biomarker_criteria_trials"] += 1
        if ecog_criteria:
            design_patterns["ecog_criteria_trials"] += 1

        trials.append({
            "nct_id": ident.get("nctId", ""),
            "title": ident.get("briefTitle", ""),
            "phase": phase_label,
            "status": overall_status,
            "study_type": design_mod.get("studyType", ""),
            "allocation": allocation,
            "intervention_model": intervention_model,
            "masking": masking,
            "enrollment": enrollment,
            "conditions": (cond_mod.get("conditions", []) or [])[:5],
            "interventions": interventions[:8],
            "primary_endpoints": primary_endpoints[:8],
            "sponsor": (sponsor_mod.get("leadSponsor", {}) or {}).get("name", ""),
            "start_date": (status_mod.get("startDateStruct", {}) or {}).get("date", ""),
            "biomarker_criteria": biomarker_criteria,
            "ecog_criteria": ecog_criteria,
        })

    if not trials:
        phase_text = f", phase={phase_filter}" if phase_filter else ""
        status_text = f", status={status}" if status else ""
        return {
            "query": query,
            "phase_filter": phase_filter,
            "status_filter": status,
            "trials": [],
            "summary": f"No trials found for '{query}' with current filters{phase_text}{status_text}.",
        }

    median_enrollment = float(np.median(enrollment_values)) if enrollment_values else None

    endpoint_top = sorted(endpoint_counts.items(), key=lambda kv: kv[1], reverse=True)[:10]
    intervention_top = sorted(intervention_counts.items(), key=lambda kv: kv[1], reverse=True)[:10]

    top_endpoint_text = ", ".join(f"{name} ({count})" for name, count in endpoint_top[:3]) or "none"
    summary = (
        f"Trial design benchmark for '{query}': {len(trials)} trial(s)"
        f"{' (+ more pages)' if has_more else ''}. "
        f"Median enrollment: {int(median_enrollment) if median_enrollment is not None else 'NA'}. "
        f"Top primary endpoints: {top_endpoint_text}."
    )

    return {
        "summary": summary,
        "query": query,
        "phase_filter": phase_filter,
        "status_filter": status,
        "has_more": has_more,
        "n_trials": len(trials),
        "median_enrollment": median_enrollment,
        "phase_distribution": phase_counts,
        "status_distribution": status_counts,
        "design_patterns": design_patterns,
        "top_primary_endpoints": [{"endpoint": k, "count": v} for k, v in endpoint_top],
        "top_interventions": [{"intervention": k, "count": v} for k, v in intervention_top],
        "trials": trials,
    }
748
+
749
+
750
def _classify_primary_endpoint(endpoint) -> str:
    """Return the endpoint-family key for one primary-endpoint description.

    Families are tested in a fixed priority order and the first match wins;
    anything that matches no family is reported as ``"other"``.
    """
    import re  # function-scope import keeps this block self-contained

    text = str(endpoint).lower()
    if "overall survival" in text or text.strip() == "os":
        return "overall_survival"
    if (
        "progression-free survival" in text
        or "progression free survival" in text  # unhyphenated spelling
        or "pfs" in text
    ):
        return "progression_free_survival"
    # Match the ORR acronym as a whole word only: a bare substring test also
    # fires on unrelated words such as "hemorrhage" or "correlation".
    if "response rate" in text or re.search(r"\borr\b", text):
        return "response_rate"
    if "adverse event" in text or "safety" in text or "tolerability" in text:
        return "safety_tolerability"
    if (
        "quality of life" in text
        or "qol" in text
        or "patient-reported" in text
        or "patient reported" in text
    ):
        return "quality_of_life"
    if any(k in text for k in ("biomarker", "mutation", "pd-l1", "ctdna", "mrd")):
        return "biomarker_driven"
    return "other"


@registry.register(
    name="clinical.endpoint_benchmark",
    description="Benchmark endpoint usage patterns and enrollment norms for an indication/target query",
    category="clinical",
    parameters={
        "query": "Search term for indication/target/drug",
        "phase": "Optional phase filter",
        "status": "Optional status filter",
        "max_results": "Maximum studies to include (default 30, max 100)",
    },
    usage_guide=(
        "Use during protocol planning to benchmark what endpoints and enrollment levels are commonly used "
        "by competitors in similar trials."
    ),
)
def endpoint_benchmark(
    query: str,
    phase: str = "",
    status: str = "",
    max_results: int = 30,
    **kwargs,
) -> dict:
    """Summarize endpoint conventions from ClinicalTrials.gov records.

    Delegates the search to ``trial_design_benchmark`` and then buckets every
    primary endpoint of every returned trial into a coarse endpoint family.

    Args:
        query: Search term for indication/target/drug.
        phase: Optional phase filter, forwarded unchanged.
        status: Optional status filter, forwarded unchanged.
        max_results: Maximum number of studies to include, forwarded unchanged.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        On success, a dict with ``summary``, ``query``, the applied filters,
        ``n_trials``, ``endpoint_families`` (count per family),
        ``top_endpoint_families`` (non-zero families, highest count first,
        at most six), ``endpoint_examples`` (up to five example endpoints per
        family), ``median_enrollment``, ``p75_enrollment``, the phase/status
        distributions from the underlying benchmark, and ``trials``.
        If the underlying benchmark reports an error, a dict with ``error``
        and ``summary``. If no trials come back, a dict with ``summary``,
        ``query`` and an empty ``trials`` list.
    """
    del kwargs
    base = trial_design_benchmark(
        query=query,
        phase=phase,
        status=status,
        max_results=max_results,
    )
    if "error" in base:
        return {
            "error": base["error"],
            # Fall back to the error text so a summary-less error dict from
            # the underlying tool cannot raise KeyError here.
            "summary": base.get("summary") or str(base["error"]),
        }

    trials = base.get("trials", []) or []
    if not trials:
        return {
            "summary": f"No trials available for endpoint benchmark on '{query}'.",
            "query": query,
            "trials": [],
        }

    endpoint_family_counts = {
        "overall_survival": 0,
        "progression_free_survival": 0,
        "response_rate": 0,
        "safety_tolerability": 0,
        "quality_of_life": 0,
        "biomarker_driven": 0,
        "other": 0,
    }
    endpoint_examples = {k: [] for k in endpoint_family_counts}

    for trial in trials:
        endpoints = trial.get("primary_endpoints", []) or []
        if not endpoints:
            # A trial without any primary endpoint counts once as "other".
            endpoint_family_counts["other"] += 1
            continue
        # Counts are per endpoint, so a trial with several primary endpoints
        # contributes to several families.
        for endpoint in endpoints:
            key = _classify_primary_endpoint(endpoint)
            endpoint_family_counts[key] += 1
            if len(endpoint_examples[key]) < 5:
                endpoint_examples[key].append(endpoint)

    # Enrollment statistics (only integer enrollment values are used)
    enrollments = [
        trial.get("enrollment")
        for trial in trials
        if isinstance(trial.get("enrollment"), int)
    ]
    enrollment_median = float(np.median(enrollments)) if enrollments else None
    # A 75th percentile is only reported when at least two values exist.
    enrollment_p75 = float(np.percentile(enrollments, 75)) if len(enrollments) >= 2 else None

    ranked_families = sorted(
        endpoint_family_counts.items(),
        key=lambda kv: kv[1],
        reverse=True,
    )
    top_families = [{"family": k, "count": v} for k, v in ranked_families if v > 0][:6]
    top_family_text = ", ".join(f"{x['family']} ({x['count']})" for x in top_families[:3]) or "none"

    summary = (
        f"Endpoint benchmark for '{query}': {len(trials)} trial(s). "
        f"Top endpoint families: {top_family_text}. "
        f"Median enrollment: {int(enrollment_median) if enrollment_median is not None else 'NA'}."
    )

    return {
        "summary": summary,
        "query": query,
        "phase_filter": phase,
        "status_filter": status,
        "n_trials": len(trials),
        "endpoint_families": endpoint_family_counts,
        "top_endpoint_families": top_families,
        "endpoint_examples": endpoint_examples,
        "median_enrollment": enrollment_median,
        "p75_enrollment": enrollment_p75,
        "phase_distribution": base.get("phase_distribution", {}),
        "status_distribution": base.get("status_distribution", {}),
        "trials": trials,
    }
873
+
874
+
875
def _moa_types_from_assay(description: str) -> set:
    """Return broad mechanism-of-action labels hinted at by an assay description."""
    text = description.lower()
    labels = set()
    keyword_labels = (
        ("inhibit", "Inhibitor"),
        ("degrad", "Degrader"),
        ("antagonist", "Antagonist"),
        ("allosteric", "Allosteric modulator"),
        ("antibod", "Antibody"),
        ("covalent", "Covalent binder"),
    )
    for keyword, label in keyword_labels:
        if keyword in text:
            labels.add(label)
    # "antagonist" contains "agonist": strip it before testing so that
    # antagonist-only assays are not also labelled "Agonist".
    if "agonist" in text.replace("antagonist", " "):
        labels.add("Agonist")
    return labels


def _chembl_landscape(gene: str) -> dict:
    """Look up a ChEMBL target for *gene* and summarize compounds tested against it.

    Never raises: any failure is reported through an ``error`` key alongside
    ``unique_compounds: 0``.
    """
    chembl_base = "https://www.ebi.ac.uk/chembl/api/data"
    headers = {"Accept": "application/json"}

    try:
        # Find target in ChEMBL
        tgt_data, error = request_json(
            "GET",
            f"{chembl_base}/target/search.json",
            params={"q": gene, "limit": 5},
            headers=headers,
            timeout=10,
            retries=2,
        )
        if error:
            raise RuntimeError(error)

        targets = (tgt_data or {}).get("targets") or []
        # Prefer human SINGLE PROTEIN targets, else fall back to the first hit
        chembl_target_id = next(
            (
                tgt.get("target_chembl_id")
                for tgt in targets
                if tgt.get("organism", "") == "Homo sapiens"
                and tgt.get("target_type", "") == "SINGLE PROTEIN"
            ),
            None,
        )
        if not chembl_target_id and targets:
            chembl_target_id = targets[0].get("target_chembl_id")

        if not chembl_target_id:
            return {"error": f"No ChEMBL target found for {gene}", "unique_compounds": 0}

        # Get activities for the target
        act_data, error = request_json(
            "GET",
            f"{chembl_base}/activity.json",
            params={
                "target_chembl_id": chembl_target_id,
                "limit": 50,
                "standard_type__in": "IC50,Ki,Kd,EC50",
            },
            headers=headers,
            timeout=10,
            retries=2,
        )
        if error:
            raise RuntimeError(error)

        chembl_compounds = []
        seen_mols = set()
        moa_types = set()
        for act in (act_data or {}).get("activities") or []:
            # Deduplicate by molecule: only the first activity row per
            # molecule is kept as that compound's representative record.
            mol_id = act.get("molecule_chembl_id", "")
            if mol_id and mol_id not in seen_mols:
                seen_mols.add(mol_id)
                chembl_compounds.append({
                    "chembl_id": mol_id,
                    "name": act.get("molecule_pref_name", "") or mol_id,
                    "activity_type": act.get("standard_type", ""),
                    "activity_value": act.get("standard_value"),
                    "activity_units": act.get("standard_units", ""),
                    "pchembl": act.get("pchembl_value"),
                })
            # Extract broad MoA categories from assay descriptions
            assay_desc = act.get("assay_description", "")
            if assay_desc:
                moa_types |= _moa_types_from_assay(assay_desc)

        return {
            "target_chembl_id": chembl_target_id,
            "unique_compounds": len(chembl_compounds),
            "moa_types": sorted(moa_types),
            "top_compounds": chembl_compounds[:15],
        }
    except Exception as e:  # best-effort source: report, don't propagate
        return {"error": f"ChEMBL query failed: {e}", "unique_compounds": 0}


def _open_targets_landscape(gene: str, indication: str) -> dict:
    """Resolve *gene* via Open Targets search and summarize its known drugs.

    Never raises: any failure is reported through an ``error`` key alongside
    ``unique_drugs: 0``.
    """
    ot_url = "https://api.platform.opentargets.org/api/v4/graphql"
    search_query = """
    query searchTarget($q: String!) {
      search(queryString: $q, entityNames: ["target"], page: {size: 5, index: 0}) {
        hits {
          id
          name
          entity
        }
      }
    }
    """
    graphql_query = """
    query knownDrugs($ensemblId: String!) {
      target(ensemblId: $ensemblId) {
        id
        approvedSymbol
        knownDrugs(size: 30) {
          uniqueDrugs
          uniqueTargets
          rows {
            drugId
            prefName
            drugType
            mechanismOfAction
            phase
            status
            disease {
              id
              name
            }
          }
        }
      }
    }
    """

    try:
        # Map gene symbol to Ensembl ID via Open Targets search
        search_data, error = request_json(
            "POST",
            ot_url,
            json={"query": search_query, "variables": {"q": gene}},
            timeout=10,
            retries=2,
        )
        if error:
            raise RuntimeError(error)

        # GraphQL responses may carry explicit nulls, so every level is
        # normalised with "or {}" / "or []" instead of relying on .get defaults.
        search_block = ((search_data or {}).get("data") or {}).get("search") or {}
        target_hits = [
            hit for hit in (search_block.get("hits") or [])
            if hit.get("entity") == "target"
        ]
        # Prefer a hit whose name equals the requested symbol (case-insensitive);
        # otherwise fall back to the first target hit.
        wanted = gene.strip().upper()
        exact_hits = [hit for hit in target_hits if str(hit.get("name") or "").upper() == wanted]
        chosen = (exact_hits or target_hits)[:1]
        ensembl_id = chosen[0].get("id") if chosen else None

        if not ensembl_id:
            return {"error": f"Could not resolve Ensembl ID for {gene}", "unique_drugs": 0}

        drugs_data, error = request_json(
            "POST",
            ot_url,
            json={"query": graphql_query, "variables": {"ensemblId": ensembl_id}},
            timeout=10,
            retries=2,
        )
        if error:
            raise RuntimeError(error)

        target_block = ((drugs_data or {}).get("data") or {}).get("target") or {}
        known_drugs = target_block.get("knownDrugs") or {}
        if not known_drugs:
            return {"error": "No known drugs found", "unique_drugs": 0}

        ot_drugs = []
        phase_dist_ot = {}
        moa_set = set()
        for row in known_drugs.get("rows") or []:
            disease = row.get("disease", {})
            disease_name = disease.get("name", "") if disease else ""

            # Filter by indication if specified; rows without a disease name
            # cannot be tested and are kept.
            if indication and disease_name:
                if indication.lower() not in disease_name.lower():
                    continue

            phase = row.get("phase", 0)
            moa = row.get("mechanismOfAction", "")
            ot_drugs.append({
                "drug": row.get("prefName", "") or row.get("drugId", ""),
                "drug_type": row.get("drugType", ""),
                "mechanism": moa,
                "phase": phase,
                "status": row.get("status", ""),
                "disease": disease_name,
            })

            phase_key = f"Phase {phase}" if phase else "Unknown"
            phase_dist_ot[phase_key] = phase_dist_ot.get(phase_key, 0) + 1
            if moa:
                moa_set.add(moa)

        return {
            "ensembl_id": ensembl_id,
            # NOTE: this is the API-reported total for the target and is not
            # narrowed by the indication filter applied to "drugs" above.
            "unique_drugs": known_drugs.get("uniqueDrugs") or 0,
            "phase_distribution": phase_dist_ot,
            "mechanisms": sorted(moa_set),
            "drugs": ot_drugs[:20],
        }
    except Exception as e:  # best-effort source: report, don't propagate
        return {"error": f"Open Targets query failed: {e}", "unique_drugs": 0}


@registry.register(
    name="clinical.competitive_landscape",
    description="Aggregate competitive intelligence for a target or indication from trials, ChEMBL, and Open Targets",
    category="clinical",
    parameters={
        "gene": "Target gene symbol (e.g. CRBN, BRAF, EGFR)",
        "indication": "Optional indication to focus the search (e.g. 'melanoma', 'lung cancer')",
    },
    usage_guide="You want a comprehensive view of the competitive landscape around a drug target — combines ClinicalTrials.gov, ChEMBL, and Open Targets to show active programs, phase distribution, and mechanism diversity. Use for strategic positioning and differentiation.",
)
def competitive_landscape(gene: str, indication: str = "", **kwargs) -> dict:
    """Aggregate competitive intelligence from multiple sources.

    Combines:
    1. ClinicalTrials.gov: active clinical programs
    2. ChEMBL: known compounds and bioactivities against the target
    3. Open Targets: known drugs and mechanisms via GraphQL

    Each source is queried independently; a failing source is recorded as an
    ``error`` entry under its own key and does not abort the others.

    Args:
        gene: Target gene symbol.
        indication: Optional indication; appended to the trial search query
            and used to filter Open Targets rows by disease name.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        A dict with ``gene``, ``indication`` (``"all"`` when empty),
        per-source sections ``trials``, ``chembl`` and ``open_targets``, and a
        multi-line ``summary``. If ``httpx`` is not installed, a dict with
        only ``error`` and ``summary``.
    """
    try:
        # NOTE(review): httpx is not used directly in this function; presumably
        # request_json depends on it — confirm before removing this guard.
        import httpx  # noqa: F401
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}

    results = {
        "gene": gene,
        "indication": indication or "all",
    }

    # --- Source 1: ClinicalTrials.gov ---
    ct_query = f"{gene} {indication}".strip() if indication else gene
    trial_data = trial_search(query=ct_query)
    if "error" not in trial_data:
        results["trials"] = {
            "total_count": trial_data.get("total_count", 0),
            "phase_distribution": trial_data.get("phase_distribution", {}),
            "status_distribution": trial_data.get("status_distribution", {}),
            "top_trials": trial_data.get("trials", [])[:10],
        }
    else:
        results["trials"] = {"error": trial_data["error"], "total_count": 0}

    # --- Source 2: ChEMBL target search + activities ---
    results["chembl"] = _chembl_landscape(gene)

    # --- Source 3: Open Targets known drugs (GraphQL) ---
    results["open_targets"] = _open_targets_landscape(gene, indication)

    # --- Aggregate summary ---
    total_trials = results.get("trials", {}).get("total_count", 0)
    chembl_count = results.get("chembl", {}).get("unique_compounds", 0)
    ot_count = results.get("open_targets", {}).get("unique_drugs", 0)

    trial_phases = results.get("trials", {}).get("phase_distribution", {})
    chembl_moas = results.get("chembl", {}).get("moa_types", [])
    ot_moas = results.get("open_targets", {}).get("mechanisms", [])
    all_moas = sorted(set(chembl_moas + ot_moas))

    phase_str = ", ".join(f"{k}: {v}" for k, v in sorted(trial_phases.items())) if trial_phases else "none"
    moa_str = ", ".join(all_moas[:5]) if all_moas else "not characterized"

    ind_label = f" in {indication}" if indication else ""
    summary = (
        f"Competitive landscape for {gene}{ind_label}:\n"
        f"Clinical trials: {total_trials} ({phase_str})\n"
        f"ChEMBL compounds: {chembl_count}\n"
        f"Open Targets known drugs: {ot_count}\n"
        f"Mechanism diversity: {moa_str}"
    )

    results["summary"] = summary
    return results