celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/tools/data_api.py ADDED
@@ -0,0 +1,2114 @@
1
+ """
2
+ Data API tools: rich wrappers for major biomedical data platforms.
3
+
4
+ Provides general-purpose access to DepMap, Open Targets, UniProt, PDB,
5
+ Ensembl, NCBI, ChEMBL, and DrugBank/PubChem.
6
+ """
7
+
8
+ import logging
9
+ import re
10
+
11
+ from ct.tools import registry
12
+ from ct.tools.http_client import request
13
+
14
+
15
def _http_get(url: str, *, params=None, headers=None, timeout: int = 15, retries: int = 2):
    """Issue a GET request with transient retry/backoff semantics.

    Returns the response object without checking its status code; raises
    ``httpx.HTTPError`` when the shared client reports a transport failure.
    """
    import httpx

    # Historically `retries` meant total attempts, whereas the shared client
    # counts re-attempts only — hence the max(..., 0) adjustment.
    reattempts = max(retries - 1, 0)
    response, failure = request(
        "GET",
        url,
        params=params,
        headers=headers,
        timeout=timeout,
        retries=reattempts,
        raise_for_status=False,
    )
    if failure:
        raise httpx.HTTPError(failure)
    return response
32
+
33
+
34
def _http_post(url: str, *, json=None, data=None, params=None,
               headers=None, timeout: int = 15, retries: int = 2):
    """Issue a POST request with transient retry/backoff semantics.

    Returns the response object without checking its status code; raises
    ``httpx.HTTPError`` when the shared client reports a transport failure.
    """
    import httpx

    # Historically `retries` meant total attempts, whereas the shared client
    # counts re-attempts only — hence the max(..., 0) adjustment.
    reattempts = max(retries - 1, 0)
    response, failure = request(
        "POST",
        url,
        json=json,
        data=data,
        params=params,
        headers=headers,
        timeout=timeout,
        retries=reattempts,
        raise_for_status=False,
    )
    if failure:
        raise httpx.HTTPError(failure)
    return response
54
+
55
# Module-level logger for data-API tool diagnostics (gene-name fallbacks, etc.).
_logger = logging.getLogger("ct.data_api")
56
+
57
+
58
+ def _normalize_gene_name(gene: str) -> str:
59
+ """Normalize a gene symbol: uppercase, strip whitespace, remove common prefixes."""
60
+ gene = gene.strip()
61
+ # Strip common noise prefixes that confuse APIs
62
+ for prefix in ("gene ", "Gene ", "GENE ", "human ", "Human "):
63
+ if gene.startswith(prefix):
64
+ gene = gene[len(prefix):]
65
+ gene = gene.strip()
66
+ # Gene symbols should be uppercase alphanumeric (with hyphens/dots allowed)
67
+ # If it looks like a gene symbol, uppercase it
68
+ if re.match(r'^[A-Za-z][A-Za-z0-9._-]*$', gene):
69
+ gene = gene.upper()
70
+ return gene
71
+
72
+
73
+ def _normalize_drug_query(query: str) -> str:
74
+ """Strip noise words from drug name queries that confuse APIs."""
75
+ noise_prefixes = [
76
+ "fda-approved ", "fda approved ", "approved drug ",
77
+ "drug ", "compound ", "the drug ", "the compound ",
78
+ "investigational ", "experimental ",
79
+ ]
80
+ cleaned = query.strip()
81
+ # Keep stripping prefixes (case-insensitive) until none match
82
+ changed = True
83
+ while changed:
84
+ changed = False
85
+ lower = cleaned.lower()
86
+ for prefix in noise_prefixes:
87
+ if lower.startswith(prefix):
88
+ cleaned = cleaned[len(prefix):]
89
+ changed = True
90
+ break
91
+ return cleaned.strip()
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # 1. DepMap search
96
+ # ---------------------------------------------------------------------------
97
+
98
@registry.register(
    name="data_api.depmap_search",
    description="Search DepMap for gene dependency scores across cancer cell lines",
    category="data_api",
    parameters={
        "gene": "Gene symbol (e.g. BRCA1, TP53)",
        "dataset": "Dataset to query: 'crispr', 'expression', 'mutations', or 'cn' (default 'crispr')",
    },
    requires_data=[],
    usage_guide="You want DepMap gene dependency data across cell lines. Returns dependency scores, most/least dependent lineages. Uses local DepMap data when available, or the Cell Model Passports API as fallback.",
)
def depmap_search(gene: str, dataset: str = "crispr", **kwargs) -> dict:
    """Search DepMap for gene dependency / expression / mutation data.

    Tries local DepMap data first (via ct data loaders), then falls back to
    the Cell Model Passports API (public, no key required).

    Args:
        gene: Gene symbol; normalized before lookup (prefixes stripped,
            uppercased when it looks like a plain symbol).
        dataset: One of 'crispr', 'expression', 'mutations', 'cn'. Only
            'crispr' and 'mutations' have local handling; 'expression' and
            'cn' go straight to the Cell Model Passports fallback.

    Returns:
        A dict with a human-readable "summary" plus structured fields, or a
        dict containing "error" and "summary" keys on failure.
    """
    valid_datasets = ("crispr", "expression", "mutations", "cn")
    if dataset not in valid_datasets:
        return {"error": f"Invalid dataset '{dataset}'. Choose from: {', '.join(valid_datasets)}", "summary": f"Invalid dataset '{dataset}'"}

    # Normalize gene name
    gene = _normalize_gene_name(gene)

    # --- Attempt local DepMap data ---
    if dataset == "crispr":
        try:
            from ct.data.loaders import load_crispr
            crispr = load_crispr()
            # Try exact match first, then common variations
            if gene not in crispr.columns:
                # Try with/without hyphens, dots, etc.
                found = False
                for variant in [gene.replace("-", ""), gene.replace(".", ""), gene + "A"]:
                    if variant in crispr.columns:
                        _logger.warning("Gene '%s' not found, using variant '%s'", gene, variant)
                        gene = variant
                        found = True
                        break
                if not found:
                    # Try partial match (e.g., "CD274" matches "CD274 (PD-L1)")
                    matches = [c for c in crispr.columns if c.startswith(gene + " ") or c == gene]
                    if matches:
                        gene = matches[0]
                        _logger.warning("Gene exact match not found, using '%s'", gene)
                    else:
                        return {"error": f"Gene {gene} not found in local DepMap CRISPR data", "summary": f"Gene {gene} not in DepMap CRISPR"}

            scores = crispr[gene].dropna()
            n_lines = len(scores)
            # DepMap convention: score < -0.5 indicates the line depends on the gene.
            essential = (scores < -0.5).sum()
            mean_score = float(scores.mean())
            min_score = float(scores.min())

            # Lineage info if model metadata available (best-effort: any
            # failure here simply omits the lineage breakdown).
            lineage_stats = []
            try:
                from ct.data.loaders import load_model_metadata
                model = load_model_metadata()
                merged = scores.to_frame(name="score").join(
                    model.set_index("ModelID")["OncotreeLineage"], how="left"
                )
                if "OncotreeLineage" in merged.columns:
                    for lin, grp in merged.groupby("OncotreeLineage"):
                        lineage_stats.append({
                            "lineage": lin,
                            "mean_score": round(float(grp["score"].mean()), 4),
                            "n_lines": len(grp),
                            "n_essential": int((grp["score"] < -0.5).sum()),
                        })
                    # Most negative mean score first = most dependent lineage.
                    lineage_stats.sort(key=lambda x: x["mean_score"])
            except Exception:
                pass

            most_dependent = [ls["lineage"] for ls in lineage_stats[:3]] if lineage_stats else []
            least_dependent = [ls["lineage"] for ls in lineage_stats[-3:]] if lineage_stats else []

            return {
                "summary": (
                    f"{gene} dependency (DepMap CRISPR): essential in {essential}/{n_lines} lines, "
                    f"mean score {mean_score:.3f}"
                    + (f", most dependent: {', '.join(most_dependent)}" if most_dependent else "")
                ),
                "gene": gene,
                "dataset": "crispr",
                "n_cell_lines": n_lines,
                "n_essential": int(essential),
                "mean_score": round(mean_score, 4),
                "min_score": round(min_score, 4),
                "lineage_stats": lineage_stats[:20],
                "most_dependent_lineages": most_dependent,
                "least_dependent_lineages": least_dependent,
            }
        except (ImportError, FileNotFoundError):
            pass  # Fall through to API

    if dataset == "mutations":
        try:
            from ct.data.loaders import load_mutations
            mutations = load_mutations()
            if gene not in mutations.columns:
                # Include "summary" for consistency with every other error path.
                return {"error": f"Gene {gene} not found in local DepMap mutation data", "summary": f"Gene {gene} not in DepMap mutations"}

            mutated = mutations[gene].dropna()
            n_lines = len(mutated)
            n_mutated = int((mutated > 0).sum())
            mutation_rate = n_mutated / n_lines if n_lines > 0 else 0

            return {
                "summary": (
                    f"{gene} mutations (DepMap): mutated in {n_mutated}/{n_lines} lines "
                    f"({mutation_rate:.1%})"
                ),
                "gene": gene,
                "dataset": "mutations",
                "n_cell_lines": n_lines,
                "n_mutated": n_mutated,
                "mutation_rate": round(mutation_rate, 4),
            }
        except (ImportError, FileNotFoundError):
            pass  # Fall through to API

    # --- Fallback: Cell Model Passports API ---
    try:
        resp = _http_get(
            "https://www.cellmodelpassports.sanger.ac.uk/api/v1/genes",
            params={"search": gene, "page_size": 5},
            timeout=15,
        )
        if resp.status_code != 200:
            return {
                "error": f"Cell Model Passports API returned HTTP {resp.status_code}",
                "summary": f"Could not query DepMap/CMP for {gene}",
            }
        data = resp.json()
    except Exception as e:
        import httpx
        # Timeouts get a dedicated message; any other failure (HTTPError or
        # otherwise) shares the generic one, so no separate HTTPError branch.
        if isinstance(e, httpx.TimeoutException):
            return {"error": "Cell Model Passports API timed out", "summary": f"CMP timeout for {gene}"}
        return {"error": f"CMP API error: {e}", "summary": f"CMP query failed for {gene}"}

    results = data.get("data", data.get("results", []))
    if not results:
        return {
            "error": f"Gene {gene} not found in Cell Model Passports",
            "summary": f"No results for {gene} in CMP",
        }

    gene_info = results[0] if isinstance(results, list) else results
    return {
        "summary": f"DepMap/CMP: {gene} — found in Cell Model Passports database",
        "gene": gene,
        "dataset": dataset,
        "source": "cell_model_passports",
        "gene_info": gene_info,
    }
258
+
259
+
260
+ # ---------------------------------------------------------------------------
261
+ # 2. Open Targets search
262
+ # ---------------------------------------------------------------------------
263
+
264
@registry.register(
    name="data_api.opentargets_search",
    description="Search Open Targets Platform for comprehensive target, disease, or drug profiles",
    category="data_api",
    parameters={
        "query": "Gene name, disease name, or drug name",
        "entity_type": "Entity type: 'target', 'disease', or 'drug' (default 'target')",
    },
    requires_data=[],
    usage_guide="You want a comprehensive profile from Open Targets: disease associations for a target, associated targets for a disease, or indications/mechanisms for a drug. General-purpose Open Targets access.",
)
def opentargets_search(query: str, entity_type: str = "target", **kwargs) -> dict:
    """Query Open Targets Platform GraphQL API for target/disease/drug profiles.

    Two-step flow:
      1. Resolve the free-text `query` to a platform ID via the `search`
         GraphQL operation (scoped to the requested entity type).
      2. Fetch an entity-specific detail profile for the top search hit.

    Returns a dict whose shape depends on `entity_type` (target/disease/drug
    all share "summary", "entity_type", "entity_id"), or an "error"/"summary"
    pair on validation or network failure.
    """
    ot_url = "https://api.platform.opentargets.org/api/v4/graphql"
    headers = {"Content-Type": "application/json"}

    valid_types = ("target", "disease", "drug")
    if entity_type not in valid_types:
        return {"error": f"Invalid entity_type '{entity_type}'. Choose from: {', '.join(valid_types)}", "summary": f"Invalid entity type '{entity_type}'"}

    # Normalize query based on entity type
    if entity_type == "target":
        query = _normalize_gene_name(query)
    elif entity_type == "drug":
        query = _normalize_drug_query(query)

    # Step 1: Search to resolve ID
    search_gql = """
    query search($q: String!, $entities: [String!]!) {
      search(queryString: $q, entityNames: $entities, page: {size: 5, index: 0}) {
        total
        hits { id entity name description }
      }
    }
    """
    # Restrict the search index to the one entity class the caller asked for.
    entity_names = {
        "target": ["target"],
        "disease": ["disease"],
        "drug": ["drug"],
    }

    try:
        search_resp = _http_post(
            ot_url,
            json={"query": search_gql, "variables": {"q": query, "entities": entity_names[entity_type]}},
            headers=headers,
            timeout=15,
        )
        search_resp.raise_for_status()
        search_data = search_resp.json()
    except Exception as e:
        import httpx
        # TimeoutException is checked first because it is a subclass of
        # httpx.HTTPError; remaining failures share the generic message.
        if isinstance(e, httpx.TimeoutException):
            return {"error": f"Open Targets search timed out for '{query}'", "summary": f"Open Targets timed out for '{query}'"}
        if isinstance(e, httpx.HTTPError):
            return {"error": f"Open Targets search failed: {e}", "summary": f"Open Targets search failed"}
        return {"error": f"Open Targets search failed: {e}", "summary": f"Open Targets search failed"}

    hits = search_data.get("data", {}).get("search", {}).get("hits", [])
    total = search_data.get("data", {}).get("search", {}).get("total", 0)

    if not hits:
        return {
            "error": f"No {entity_type} found for '{query}' in Open Targets",
            "summary": f"Open Targets: no {entity_type} matches for '{query}'",
        }

    # Take the top-ranked hit as the canonical entity for this query.
    top_hit = hits[0]
    entity_id = top_hit["id"]
    entity_name = top_hit.get("name", query)

    # Step 2: Fetch detailed profile (entity-specific GraphQL document)
    if entity_type == "target":
        detail_gql = """
        query targetProfile($id: String!) {
          target(ensemblId: $id) {
            id
            approvedSymbol
            approvedName
            biotype
            functionDescriptions
            subcellularLocations { location }
            tractability {
              label
              modality
              value
            }
            associatedDiseases(page: {size: 10, index: 0}) {
              count
              rows {
                disease { id name }
                score
              }
            }
            knownDrugs(size: 10) {
              uniqueDrugs
              rows {
                prefName
                drugType
                mechanismOfAction
                phase
              }
            }
          }
        }
        """
        variables = {"id": entity_id}

    elif entity_type == "disease":
        detail_gql = """
        query diseaseProfile($id: String!) {
          disease(efoId: $id) {
            id
            name
            description
            therapeuticAreas { id name }
            associatedTargets(page: {size: 10, index: 0}) {
              count
              rows {
                target { id approvedSymbol }
                score
              }
            }
            knownDrugs(size: 10) {
              uniqueDrugs
              rows {
                prefName
                drugType
                phase
                mechanismOfAction
              }
            }
          }
        }
        """
        variables = {"id": entity_id}

    else:  # drug
        detail_gql = """
        query drugProfile($id: String!) {
          drug(chemblId: $id) {
            id
            name
            drugType
            maximumClinicalTrialPhase
            hasBeenWithdrawn
            description
            mechanismsOfAction {
              rows {
                mechanismOfAction
                targets { id approvedSymbol }
              }
            }
            indications {
              count
              rows {
                disease { id name }
                maxPhaseForIndication
              }
            }
          }
        }
        """
        variables = {"id": entity_id}

    try:
        detail_resp = _http_post(
            ot_url,
            json={"query": detail_gql, "variables": variables},
            headers=headers,
            timeout=15,
        )
        detail_resp.raise_for_status()
        detail_data = detail_resp.json()
    except Exception as e:
        import httpx
        if isinstance(e, httpx.TimeoutException):
            return {"error": f"Open Targets detail query timed out for {entity_id}", "summary": f"Open Targets detail timed out"}
        if isinstance(e, httpx.HTTPError):
            return {"error": f"Open Targets detail query failed: {e}", "summary": f"Open Targets detail query failed"}
        return {"error": f"Open Targets detail query failed: {e}", "summary": f"Open Targets detail query failed"}

    data_root = detail_data.get("data", {})

    # --- Shape the result per entity type ---
    if entity_type == "target":
        # `or {}` guards against an explicit null from GraphQL.
        target = data_root.get("target") or {}
        assoc = target.get("associatedDiseases", {})
        n_diseases = assoc.get("count", 0)
        top_diseases = [
            {"disease": r["disease"]["name"], "score": round(r["score"], 3)}
            for r in assoc.get("rows", [])
        ]
        known_drugs = target.get("knownDrugs", {})
        n_drugs = known_drugs.get("uniqueDrugs", 0)
        drug_rows = known_drugs.get("rows", [])
        tractability = target.get("tractability", [])

        top_disease_str = ", ".join(
            f"{d['disease']} ({d['score']:.2f})" for d in top_diseases[:3]
        )
        return {
            "summary": (
                f"Open Targets: {target.get('approvedSymbol', query)} — "
                f"{n_diseases} disease associations, "
                f"top: {top_disease_str or 'none'}. "
                f"{n_drugs} known drug(s)."
            ),
            "entity_type": "target",
            "entity_id": entity_id,
            "approved_symbol": target.get("approvedSymbol", ""),
            "approved_name": target.get("approvedName", ""),
            "biotype": target.get("biotype", ""),
            "function": target.get("functionDescriptions", []),
            "tractability": tractability,
            "n_disease_associations": n_diseases,
            "top_diseases": top_diseases,
            "n_known_drugs": n_drugs,
            "known_drugs": [
                {
                    "name": d.get("prefName", ""),
                    "type": d.get("drugType", ""),
                    "mechanism": d.get("mechanismOfAction", ""),
                    "phase": d.get("phase", 0),
                }
                for d in drug_rows[:10]
            ],
        }

    elif entity_type == "disease":
        disease = data_root.get("disease") or {}
        assoc = disease.get("associatedTargets", {})
        n_targets = assoc.get("count", 0)
        top_targets = [
            {"gene": r["target"]["approvedSymbol"], "score": round(r["score"], 3)}
            for r in assoc.get("rows", [])
        ]
        therapeutic_areas = [ta["name"] for ta in disease.get("therapeuticAreas", [])]
        known_drugs = disease.get("knownDrugs", {})
        n_drugs = known_drugs.get("uniqueDrugs", 0)

        top_target_str = ", ".join(
            f"{t['gene']} ({t['score']:.2f})" for t in top_targets[:3]
        )
        return {
            "summary": (
                f"Open Targets: {disease.get('name', query)} — "
                f"{n_targets} associated targets, "
                f"top: {top_target_str or 'none'}. "
                f"Areas: {', '.join(therapeutic_areas[:3]) or 'N/A'}."
            ),
            "entity_type": "disease",
            "entity_id": entity_id,
            "name": disease.get("name", ""),
            "description": disease.get("description", ""),
            "therapeutic_areas": therapeutic_areas,
            "n_associated_targets": n_targets,
            "top_targets": top_targets,
            "n_known_drugs": n_drugs,
        }

    else:  # drug
        drug = data_root.get("drug") or {}
        moa_rows = drug.get("mechanismsOfAction", {}).get("rows", [])
        indications = drug.get("indications", {})
        n_indications = indications.get("count", 0)
        ind_rows = indications.get("rows", [])

        mechanisms = [m.get("mechanismOfAction", "") for m in moa_rows]
        return {
            "summary": (
                f"Open Targets: {drug.get('name', query)} — "
                f"{drug.get('drugType', 'unknown')} drug, "
                f"max phase {drug.get('maximumClinicalTrialPhase', 'N/A')}, "
                f"{n_indications} indications."
            ),
            "entity_type": "drug",
            "entity_id": entity_id,
            "name": drug.get("name", ""),
            "drug_type": drug.get("drugType", ""),
            "max_clinical_phase": drug.get("maximumClinicalTrialPhase"),
            "withdrawn": drug.get("hasBeenWithdrawn", False),
            "description": drug.get("description", ""),
            "mechanisms": mechanisms,
            "n_indications": n_indications,
            "indications": [
                {"disease": r["disease"]["name"], "max_phase": r.get("maxPhaseForIndication")}
                for r in ind_rows[:15]
            ],
        }
553
+
554
+
555
+ # ---------------------------------------------------------------------------
556
+ # 3. UniProt lookup
557
+ # ---------------------------------------------------------------------------
558
+
559
+ _UNIPROT_NON_HUMAN_HINTS = (
560
+ "helminth",
561
+ "parasite",
562
+ "schistosoma",
563
+ "fasciola",
564
+ "heligmosomoides",
565
+ "nematode",
566
+ "trematode",
567
+ "cestode",
568
+ "worm",
569
+ "brugia",
570
+ "filaria",
571
+ )
572
+
573
+ _UNIPROT_QUERY_STOPWORDS = {
574
+ "a", "an", "the", "and", "or", "for", "from", "with", "without",
575
+ "in", "on", "of", "to", "by", "via", "as", "that", "this", "these",
576
+ "those", "are", "is", "was", "were", "be", "been", "being", "it",
577
+ "its", "their", "minimal", "annotation", "annotations", "key", "keys",
578
+ "look", "lookup", "search", "find", "protein", "proteins", "immunomodulatory",
579
+ }
580
+
581
+
582
+ def _query_has_non_human_hints(query: str) -> bool:
583
+ q = (query or "").lower()
584
+ return any(hint in q for hint in _UNIPROT_NON_HUMAN_HINTS)
585
+
586
+
587
def _keyword_fallback_query(query: str, max_terms: int = 7) -> str:
    """Compact free text into up to `max_terms` distinct keyword tokens.

    Tokens are lowercased; very short tokens (<3 chars) and generic
    stopwords are dropped, duplicates are kept only once, and the survivors
    are rejoined with single spaces for use as a UniProt search string.
    """
    keywords: list[str] = []
    for token in re.findall(r"[A-Za-z0-9_-]+", (query or "").lower()):
        if len(token) >= 3 and token not in _UNIPROT_QUERY_STOPWORDS:
            if token not in keywords:
                keywords.append(token)
        if len(keywords) >= max_terms:
            break
    return " ".join(keywords)
600
+
601
+
602
def _extract_species_phrases(query: str, max_species: int = 3) -> list[str]:
    """Extract likely binomial species names (e.g. 'Brugia malayi') from text.

    Scans adjacent word pairs: the first word must either be a known
    non-human organism hint or be capitalized in the original text (a
    proper-noun genus), and the second must be a plain alphabetic epithet.
    Returns up to `max_species` unique 'Genus epithet' phrases.
    """
    words = re.findall(r"[A-Za-z][A-Za-z-]*", query or "")
    phrases: list[str] = []
    for first, second in zip(words, words[1:]):
        genus = first.lower()
        epithet = second.lower()

        # Both halves must be substantial, non-stopword tokens.
        if len(genus) < 3 or len(epithet) < 3:
            continue
        if genus in _UNIPROT_QUERY_STOPWORDS or epithet in _UNIPROT_QUERY_STOPWORDS:
            continue
        # Epithets are purely alphabetic (hyphenated fragments excluded).
        if not epithet.isalpha():
            continue
        # Genus must be a known organism hint or look like a proper noun.
        if genus not in _UNIPROT_NON_HUMAN_HINTS and not first[0].isupper():
            continue

        candidate = f"{genus.capitalize()} {epithet}"
        if candidate not in phrases:
            phrases.append(candidate)
        if len(phrases) >= max_species:
            break
    return phrases
627
+
628
+
629
def _build_uniprot_search_candidates(
    *,
    query: str,
    compact_query: str,
    org_clause: str | None,
) -> list[str]:
    """Generate ranked UniProt search candidates for robust retrieval.

    Candidate order matters: callers try candidates sequentially and may stop
    on the first good-scoring hit, so more specific queries (organism-filtered
    full text) come before progressively broader fallbacks (compact keywords,
    species-only clauses, generic parasite/helminth terms).
    """
    q = (query or "").strip()
    q_lc = q.lower()
    species = _extract_species_phrases(q)
    candidates: list[str] = []

    # De-duplicating append that preserves insertion (priority) order.
    def add(candidate: str):
        if candidate and candidate not in candidates:
            candidates.append(candidate)

    # Tier 1: the full query, organism-scoped first when a filter exists.
    if q:
        if org_clause:
            add(f"({q}) AND {org_clause}")
        add(q)

    # Tier 2: the stopword-stripped compact form of the query.
    if compact_query and compact_query != q:
        if org_clause:
            add(f"({compact_query}) AND {org_clause}")
        add(compact_query)

    # Intent flags inferred from query wording; these steer the broader
    # fallback candidates built below.
    wants_secreted = any(x in q_lc for x in ("secreted", "excretory", "extracellular", "vesicle", "ev "))
    wants_uncharacterized = any(x in q_lc for x in ("uncharacterized", "understudied", "novel", "hypothetical"))
    wants_scp_taps = any(
        x in q_lc for x in ("venom allergen", "scp", "taps", "val", "cap superfamily", "allergen-like")
    )

    # Tier 3: species-scoped clauses for each detected binomial name.
    for sp in species:
        sp_clause = f'organism_name:"{sp}"'
        add(sp_clause)
        if wants_secreted:
            add(f'{sp_clause} AND (secreted OR excretory OR extracellular)')
        if wants_uncharacterized:
            add(f'{sp_clause} AND (uncharacterized OR hypothetical)')
        if wants_scp_taps:
            add(f'{sp_clause} AND ("venom allergen" OR SCP OR TAPS OR VAL)')

    # Tier 4: generic parasite/helminth fallbacks for non-human queries.
    if _query_has_non_human_hints(q):
        add("parasite")
        add("helminth")
        add("schistosoma")
        add("fasciola")
        add("heligmosomoides")
        if wants_secreted:
            add("(parasite OR helminth) AND (secreted OR excretory OR extracellular)")
        if wants_scp_taps:
            add('(parasite OR helminth) AND ("venom allergen" OR SCP OR TAPS OR VAL)')

    # Keep search bounded to avoid excessive API calls.
    return candidates[:12]
684
+
685
+
686
+ def _entry_text_blob(entry: dict) -> str:
687
+ parts: list[str] = []
688
+ pd = entry.get("proteinDescription", {}) or {}
689
+ rec = (pd.get("recommendedName", {}) or {}).get("fullName", {}) or {}
690
+ if rec.get("value"):
691
+ parts.append(str(rec.get("value")))
692
+ for alt in (pd.get("alternativeNames", []) or []):
693
+ full = (alt.get("fullName", {}) or {}).get("value")
694
+ if full:
695
+ parts.append(str(full))
696
+ for kw in (entry.get("keywords", []) or []):
697
+ name = kw.get("name")
698
+ if name:
699
+ parts.append(str(name))
700
+ org = entry.get("organism", {}) or {}
701
+ sci = org.get("scientificName")
702
+ if sci:
703
+ parts.append(str(sci))
704
+ return " ".join(parts).lower()
705
+
706
+
707
def _entry_relevance_score(
    entry: dict,
    *,
    original_query: str,
    species_phrases: list[str],
    non_human_hints: bool,
) -> float:
    """Score how well a UniProt entry matches the user's original query.

    Species alignment dominates (+8 per matching species phrase); a human
    entry is penalized (-10) when the query is clearly non-human. Smaller
    boosts reward secreted / uncharacterized / SCP-TAPS intent matches, a
    capped lexical-overlap bonus, and unreviewed entries for
    "uncharacterized" queries.
    """
    lowered_query = (original_query or "").lower()
    blob = _entry_text_blob(entry)
    score = 0.0

    # Intent flags derived from the query wording.
    secreted_intent = any(x in lowered_query for x in ("secreted", "excretory", "extracellular", "vesicle", "ev "))
    uncharacterized_intent = any(x in lowered_query for x in ("uncharacterized", "understudied", "novel", "hypothetical"))
    scp_taps_intent = any(x in lowered_query for x in ("venom allergen", "scp", "taps", "val", "cap superfamily"))

    # Species alignment dominates ranking.
    for phrase in species_phrases:
        if phrase.lower() in blob:
            score += 8.0
    if non_human_hints and "homo sapiens" in blob:
        score -= 10.0

    if secreted_intent and any(x in blob for x in ("secreted", "excretory", "extracellular", "signal peptide")):
        score += 3.0
    if uncharacterized_intent and any(x in blob for x in ("uncharacterized", "hypothetical", "putative")):
        score += 3.0
    if scp_taps_intent and any(x in blob for x in ("venom allergen", "scp", "taps", "val", "cap")):
        score += 4.0

    # Capped bonus for long query tokens that also appear in the entry text;
    # penalizes clearly off-target entries that share no vocabulary.
    long_tokens = [t for t in re.findall(r"[a-z0-9_-]+", lowered_query) if len(t) >= 4]
    overlap_count = sum(1 for t in long_tokens[:8] if t in blob)
    score += min(overlap_count * 0.5, 2.0)

    # Mild preference for unreviewed (TrEMBL) entries on "uncharacterized" asks.
    entry_type = str(entry.get("entryType", "")).lower()
    if uncharacterized_intent and "unreviewed" in entry_type:
        score += 1.0

    return score
746
+
747
+
748
+ @registry.register(
749
+ name="data_api.uniprot_lookup",
750
+ description="Look up comprehensive protein information from UniProt by gene symbol, UniProt ID, or protein name",
751
+ category="data_api",
752
+ parameters={
753
+ "query": "Gene symbol, UniProt accession (e.g. P04637), or protein name",
754
+ "organism": "Organism filter: common name (human/mouse/...), taxonomy ID, or 'any' (default 'human')",
755
+ },
756
+ requires_data=[],
757
+ usage_guide="You need detailed protein information: function, domains, subcellular location, GO terms, PDB structures, disease involvement, tissue specificity. Comprehensive UniProt protein profile.",
758
+ )
759
+ def uniprot_lookup(query: str, organism: str = "human", **kwargs) -> dict:
760
+ """Look up comprehensive protein data from UniProt REST API."""
761
+ organism_ids = {
762
+ "human": 9606, "mouse": 10090, "rat": 10116,
763
+ "zebrafish": 7955, "drosophila": 7227, "yeast": 559292,
764
+ }
765
+ organism_clean = (organism or "human").strip()
766
+ organism_lc = organism_clean.lower()
767
+
768
+ org_clause = None
769
+ if organism_lc not in ("", "any", "all", "none"):
770
+ if organism_lc.isdigit():
771
+ org_clause = f"organism_id:{organism_lc}"
772
+ elif organism_lc in organism_ids:
773
+ org_clause = f"organism_id:{organism_ids[organism_lc]}"
774
+ else:
775
+ escaped = organism_clean.replace('"', "")
776
+ if escaped:
777
+ org_clause = f'organism_name:"{escaped}"'
778
+
779
+ # If caller left default "human" but query clearly targets non-human organisms
780
+ # (e.g., helminth parasite proteins), do not force a human-only filter.
781
+ if organism_lc == "human" and _query_has_non_human_hints(query):
782
+ org_clause = None
783
+
784
+ # Determine if query is a UniProt accession (e.g. P04637, Q9Y6K9)
785
+ is_accession = len(query) >= 6 and query[0].isalpha() and any(c.isdigit() for c in query)
786
+ species_phrases = _extract_species_phrases(query)
787
+ non_human_hints = _query_has_non_human_hints(query)
788
+
789
+ try:
790
+ if is_accession and not " " in query:
791
+ # Direct accession lookup
792
+ resp = _http_get(
793
+ f"https://rest.uniprot.org/uniprotkb/{query}",
794
+ headers={"Accept": "application/json"},
795
+ timeout=15,
796
+ retries=2,
797
+ )
798
+ if resp.status_code == 200:
799
+ entries = [resp.json()]
800
+ else:
801
+ entries = []
802
+ else:
803
+ entries = []
804
+
805
+ # If direct lookup failed, search
806
+ if not entries:
807
+ base_query = " ".join((query or "").split())
808
+ compact_query = _keyword_fallback_query(base_query)
809
+ search_candidates = _build_uniprot_search_candidates(
810
+ query=base_query,
811
+ compact_query=compact_query,
812
+ org_clause=org_clause,
813
+ )
814
+
815
+ attempted_queries = []
816
+ last_status = None
817
+ matched_query = None
818
+ best_entry = None
819
+ best_score = float("-inf")
820
+ for search_query in search_candidates:
821
+ attempted_queries.append(search_query)
822
+ resp = _http_get(
823
+ "https://rest.uniprot.org/uniprotkb/search",
824
+ params={
825
+ "query": search_query,
826
+ "format": "json",
827
+ "size": 10,
828
+ },
829
+ headers={"Accept": "application/json"},
830
+ timeout=15,
831
+ retries=2,
832
+ )
833
+ last_status = resp.status_code
834
+ if resp.status_code != 200:
835
+ continue
836
+ data = resp.json()
837
+ hits = data.get("results", [])
838
+ if not hits:
839
+ continue
840
+
841
+ for hit in hits:
842
+ s = _entry_relevance_score(
843
+ hit,
844
+ original_query=query,
845
+ species_phrases=species_phrases,
846
+ non_human_hints=non_human_hints,
847
+ )
848
+ if s > best_score:
849
+ best_score = s
850
+ best_entry = hit
851
+ matched_query = search_query
852
+
853
+ if best_score >= 4.0:
854
+ break
855
+
856
+ if best_entry is not None:
857
+ entries = [best_entry]
858
+
859
+ if not entries and last_status not in (None, 200):
860
+ return {
861
+ "error": f"UniProt search failed (HTTP {last_status})",
862
+ "summary": f"UniProt search failed for '{query}'",
863
+ "search_attempts": attempted_queries,
864
+ }
865
+
866
+ except Exception as e:
867
+ return {"error": f"UniProt API error: {e}", "summary": f"UniProt query failed for '{query}'"}
868
+
869
+ if entries and non_human_hints:
870
+ org_name = str((entries[0].get("organism", {}) or {}).get("scientificName", "")).lower()
871
+ if "homo sapiens" in org_name and (matched_query is not None):
872
+ return {
873
+ "error": (
874
+ "Only human hits were returned for a non-human/parasite query. "
875
+ "Please specify organism='any' or a concrete parasite species (taxid/scientific name)."
876
+ ),
877
+ "summary": f"UniProt: no reliable non-human match for '{query}'",
878
+ "search_attempts": attempted_queries if "attempted_queries" in locals() else [],
879
+ }
880
+
881
+ if not entries:
882
+ return {
883
+ "error": f"No UniProt entry found for '{query}' (organism: {organism_clean or 'any'})",
884
+ "summary": f"UniProt: no results for '{query}'",
885
+ "search_attempts": attempted_queries if "attempted_queries" in locals() else [],
886
+ }
887
+
888
+ entry = entries[0]
889
+
890
+ # Extract fields
891
+ accession = entry.get("primaryAccession", "")
892
+ gene_names = []
893
+ for g in entry.get("genes", []):
894
+ gn = g.get("geneName", {}).get("value")
895
+ if gn:
896
+ gene_names.append(gn)
897
+ for syn in g.get("synonyms", []):
898
+ gene_names.append(syn.get("value", ""))
899
+
900
+ protein_name = (
901
+ entry.get("proteinDescription", {})
902
+ .get("recommendedName", {})
903
+ .get("fullName", {})
904
+ .get("value", "Unknown")
905
+ )
906
+
907
+ seq_info = entry.get("sequence", {})
908
+ seq_length = seq_info.get("length", 0)
909
+
910
+ # Function
911
+ function_texts = []
912
+ for c in entry.get("comments", []):
913
+ if c.get("commentType") == "FUNCTION":
914
+ for t in c.get("texts", []):
915
+ function_texts.append(t.get("value", ""))
916
+
917
+ # Subcellular location
918
+ subcellular = []
919
+ for c in entry.get("comments", []):
920
+ if c.get("commentType") == "SUBCELLULAR LOCATION":
921
+ for sl in c.get("subcellularLocations", []):
922
+ loc = sl.get("location", {}).get("value", "")
923
+ if loc:
924
+ subcellular.append(loc)
925
+
926
+ # Tissue specificity
927
+ tissue_specificity = ""
928
+ for c in entry.get("comments", []):
929
+ if c.get("commentType") == "TISSUE SPECIFICITY":
930
+ for t in c.get("texts", []):
931
+ tissue_specificity = t.get("value", "")
932
+
933
+ # Disease involvement
934
+ diseases = []
935
+ for c in entry.get("comments", []):
936
+ if c.get("commentType") == "DISEASE":
937
+ disease = c.get("disease", {})
938
+ if disease:
939
+ diseases.append({
940
+ "name": disease.get("diseaseId", ""),
941
+ "description": disease.get("description", ""),
942
+ "acronym": disease.get("acronym", ""),
943
+ })
944
+
945
+ # Features: domains, GO terms
946
+ features = entry.get("features", [])
947
+ domains = [
948
+ {"name": f.get("description", ""), "type": f.get("type", "")}
949
+ for f in features
950
+ if f.get("type") in ("Domain", "Repeat", "Zinc finger", "Motif")
951
+ ]
952
+
953
+ # GO terms from cross-references
954
+ xrefs = entry.get("uniProtKBCrossReferences", [])
955
+ go_terms = []
956
+ pdb_ids = []
957
+ for xref in xrefs:
958
+ db = xref.get("database", "")
959
+ if db == "GO":
960
+ props = {p["key"]: p["value"] for p in xref.get("properties", [])}
961
+ go_terms.append({
962
+ "id": xref.get("id", ""),
963
+ "term": props.get("GoTerm", ""),
964
+ "evidence": props.get("GoEvidenceType", ""),
965
+ })
966
+ elif db == "PDB":
967
+ pdb_ids.append(xref.get("id", ""))
968
+
969
+ # Keywords
970
+ keywords = [kw.get("name", "") for kw in entry.get("keywords", [])]
971
+
972
+ primary_gene = gene_names[0] if gene_names else query
973
+ n_pdb = len(pdb_ids)
974
+
975
+ return {
976
+ "summary": (
977
+ f"UniProt {accession} ({primary_gene}): {protein_name}, "
978
+ f"{seq_length} aa. "
979
+ + (f"{function_texts[0][:120]}... " if function_texts else "")
980
+ + f"{n_pdb} PDB structure(s)."
981
+ ),
982
+ "matched_query": matched_query if "matched_query" in locals() else query,
983
+ "organism_filter": org_clause or "none",
984
+ "accession": accession,
985
+ "gene_names": gene_names,
986
+ "protein_name": protein_name,
987
+ "sequence_length": seq_length,
988
+ "function": function_texts,
989
+ "subcellular_location": subcellular,
990
+ "tissue_specificity": tissue_specificity,
991
+ "diseases": diseases[:10],
992
+ "domains": domains[:20],
993
+ "go_terms": go_terms[:30],
994
+ "pdb_ids": pdb_ids[:30],
995
+ "n_pdb_structures": n_pdb,
996
+ "keywords": keywords,
997
+ "uniprot_url": f"https://www.uniprot.org/uniprot/{accession}",
998
+ }
999
+
1000
+
1001
+ # ---------------------------------------------------------------------------
1002
+ # 4. PDB search
1003
+ # ---------------------------------------------------------------------------
1004
+
1005
@registry.register(
    name="data_api.pdb_search",
    description="Search RCSB PDB for protein structures by gene name, UniProt ID, or PDB ID",
    category="data_api",
    parameters={
        "query": "Gene name, UniProt accession, or 4-character PDB ID",
        "method": "Optional experimental method filter: 'X-RAY', 'EM', 'NMR'",
        "max_results": "Maximum number of structures to return (default 10)",
    },
    requires_data=[],
    usage_guide="You want to find 3D protein structures for a target — PDB IDs, resolution, method, ligands. Use for structure-based drug design and target assessment.",
)
def pdb_search(query: str, method: str = None, max_results: int = 10, **kwargs) -> dict:
    """Search RCSB PDB for structures using the search and data APIs.

    A 4-character alphanumeric query is treated as a PDB ID and fetched
    directly; anything else goes through the RCSB full-text search API.
    Multi-term queries are ANDed term-by-term; if that yields nothing, the
    search is retried with just the first term (usually the gene name).

    Returns:
        dict with a human-readable "summary", total/ returned counts, the
        best (lowest) resolution and its PDB ID, and per-structure details;
        or a dict containing an "error" key on failure.
    """
    query_clean = query.strip()

    # If query looks like a PDB ID (4 chars), fetch directly.
    if len(query_clean) == 4 and query_clean.isalnum():
        return _fetch_pdb_entry(query_clean)

    search_url = "https://search.rcsb.org/rcsbsearch/v2/query"

    # Split multi-term queries into individual full_text nodes (AND logic).
    terms = query_clean.split()
    fallback_note = ""
    text_values = terms if len(terms) > 1 else [query_clean]
    text_nodes = [
        {
            "type": "terminal",
            "service": "full_text",
            "parameters": {"value": value},
        }
        for value in text_values
    ]

    def _search_payload(nodes: list) -> dict:
        # Shared request skeleton for the initial and broadened searches.
        return {
            "query": {
                "type": "group",
                "logical_operator": "and",
                "nodes": nodes,
            },
            "return_type": "entry",
            "request_options": {
                "paginate": {"start": 0, "rows": max_results},
                "sort": [{"sort_by": "score", "direction": "desc"}],
            },
        }

    query_json = _search_payload(text_nodes)

    # Optional experimental-method filter, appended as an extra AND node.
    method_value = None
    if method:
        method_upper = method.upper()
        valid_methods = ("X-RAY DIFFRACTION", "ELECTRON MICROSCOPY", "SOLUTION NMR",
                         "X-RAY", "EM", "NMR")
        if method_upper not in valid_methods:
            return {"error": f"Invalid method '{method}'. Use 'X-RAY', 'EM', or 'NMR'", "summary": f"Invalid PDB method '{method}'"}

        method_map = {
            "X-RAY": "X-RAY DIFFRACTION",
            "EM": "ELECTRON MICROSCOPY",
            "NMR": "SOLUTION NMR",
        }
        method_value = method_map.get(method_upper, method_upper)

        def _method_node() -> dict:
            # RCSB attribute filter for the experimental method.
            return {
                "type": "terminal",
                "service": "text",
                "parameters": {
                    "attribute": "exptl.method",
                    "operator": "exact_match",
                    "value": method_value,
                },
            }

        query_json["query"]["nodes"].append(_method_node())

    try:
        resp = _http_post(search_url, json=query_json, timeout=15, retries=2)
        if resp.status_code != 200:
            return {
                "error": f"RCSB PDB search failed (HTTP {resp.status_code})",
                "summary": f"PDB search failed for '{query}'",
            }
        data = resp.json()
    except Exception as e:
        return {"error": f"PDB search error: {e}", "summary": f"PDB search failed for '{query}'"}

    total_count = data.get("total_count", 0)
    result_set = data.get("result_set", [])

    if not result_set and len(terms) > 1:
        # Retry with just the first term (likely the protein/gene name).
        fallback_json = _search_payload([
            {
                "type": "terminal",
                "service": "full_text",
                "parameters": {"value": terms[0]},
            }
        ])
        if method and method_value:
            # Re-add the method filter to the broadened search.
            fallback_json["query"]["nodes"].append(_method_node())
        try:
            resp2 = _http_post(search_url, json=fallback_json, timeout=15, retries=2)
            if resp2.status_code == 200:
                data2 = resp2.json()
                result_set = data2.get("result_set", [])
                total_count = data2.get("total_count", 0)
                if result_set:
                    fallback_note = f" (broadened from '{query}' to '{terms[0]}')"
        except Exception:
            pass  # Keep original empty result

    if not result_set:
        return {
            "summary": f"No PDB structures found for '{query}'",
            "query": query,
            "total_count": 0,
            "structures": [],
        }

    pdb_ids = [r.get("identifier", "") for r in result_set if r.get("identifier")]

    # Fetch per-entry details from the RCSB data API.
    structures = []
    for pdb_id in pdb_ids[:max_results]:
        try:
            detail_resp = _http_get(
                f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}",
                timeout=10,
                retries=2,
            )
            if detail_resp.status_code != 200:
                structures.append({"pdb_id": pdb_id, "error": "detail fetch failed"})
                continue
            detail = detail_resp.json()

            struct_info = detail.get("struct", {})
            exptl = detail.get("exptl", [{}])[0] if detail.get("exptl") else {}
            rcsb_info = detail.get("rcsb_entry_info", {})

            # Resolution: first non-null reflns.d_resolution_high, falling
            # back to rcsb_entry_info.resolution_combined.
            # (BUG FIX: the old loop kept only the LAST reflns entry, which
            # could overwrite a valid value with None.)
            resolution = None
            for refl in detail.get("reflns", []):
                res = refl.get("d_resolution_high")
                if res is not None:
                    resolution = res
                    break
            if resolution is None:
                combined = rcsb_info.get("resolution_combined", [None])
                resolution = combined[0] if isinstance(combined, list) and combined else combined

            # Non-polymer entity IDs serve as a ligand count.
            ligand_ids = detail.get("rcsb_entry_container_identifiers", {}).get("non_polymer_entity_ids", []) or []

            structures.append({
                "pdb_id": pdb_id,
                "title": struct_info.get("title", ""),
                "method": exptl.get("method", ""),
                "resolution": resolution,
                "deposition_date": detail.get("rcsb_accession_info", {}).get("deposit_date", ""),
                # BUG FIX: this field previously held deposited_model_count.
                # Entry-level RCSB data has no single organism field, so
                # report empty rather than a wrong value.
                "organism": "",
                "n_ligands": len(ligand_ids),
            })
        except Exception:
            structures.append({"pdb_id": pdb_id, "error": "detail fetch failed"})

    # Find best (lowest) resolution among returned structures.
    resolutions = [s["resolution"] for s in structures if s.get("resolution")]
    best_res = min(resolutions) if resolutions else None
    best_id = None
    if best_res is not None:
        for s in structures:
            if s.get("resolution") == best_res:
                best_id = s["pdb_id"]
                break

    method_str = f" ({method})" if method else ""
    best_str = f", best resolution {best_res:.1f}A ({best_id})" if best_res and best_id else ""

    return {
        "summary": (
            f"PDB structures for {query}{method_str}: {total_count} total"
            f"{best_str}{fallback_note}"
        ),
        "query": query,
        "total_count": total_count,
        "n_returned": len(structures),
        "best_resolution": best_res,
        "best_pdb_id": best_id,
        "structures": structures,
    }
1224
+
1225
+
1226
def _fetch_pdb_entry(pdb_id: str) -> dict:
    """Fetch a single PDB entry by ID from the RCSB data API.

    Returns a dict shaped like a one-hit pdb_search() result (summary,
    title, method, resolution, deposition date, structures list), or a
    dict with an "error" key when the entry is missing or the API fails.
    """
    try:
        resp = _http_get(
            f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}",
            timeout=10,
            retries=2,
        )
        if resp.status_code == 404:
            return {"error": f"PDB entry {pdb_id} not found", "summary": f"No PDB entry for {pdb_id}"}
        if resp.status_code != 200:
            return {"error": f"PDB API returned HTTP {resp.status_code}", "summary": f"PDB API error: HTTP {resp.status_code}"}
        detail = resp.json()
    except Exception as e:
        return {"error": f"PDB API error: {e}", "summary": f"PDB API error: {e}"}

    struct_info = detail.get("struct", {})
    exptl = detail.get("exptl", [{}])[0] if detail.get("exptl") else {}
    rcsb_info = detail.get("rcsb_entry_info", {})

    # Resolution: first non-null reflns.d_resolution_high, else the
    # rcsb_entry_info.resolution_combined fallback.
    # (BUG FIX: the old loop kept only the LAST reflns entry, which could
    # overwrite a valid value with None.)
    resolution = None
    for refl in detail.get("reflns", []):
        res = refl.get("d_resolution_high")
        if res is not None:
            resolution = res
            break
    if resolution is None:
        res_list = rcsb_info.get("resolution_combined", [])
        resolution = res_list[0] if isinstance(res_list, list) and res_list else None

    return {
        "summary": f"PDB {pdb_id}: {struct_info.get('title', 'N/A')} ({exptl.get('method', 'N/A')}, {resolution or 'N/A'}A)",
        "pdb_id": pdb_id,
        "title": struct_info.get("title", ""),
        "method": exptl.get("method", ""),
        "resolution": resolution,
        "deposition_date": detail.get("rcsb_accession_info", {}).get("deposit_date", ""),
        "total_count": 1,
        "structures": [{
            "pdb_id": pdb_id,
            "title": struct_info.get("title", ""),
            "method": exptl.get("method", ""),
            "resolution": resolution,
        }],
    }
1267
+
1268
+
1269
+ # ---------------------------------------------------------------------------
1270
+ # 5. Ensembl lookup
1271
+ # ---------------------------------------------------------------------------
1272
+
1273
@registry.register(
    name="data_api.ensembl_lookup",
    description="Look up gene information from Ensembl: genomic coordinates, transcripts, cross-references",
    category="data_api",
    parameters={
        "gene": "Gene symbol (e.g. BRCA1) or Ensembl ID (e.g. ENSG00000012048)",
        "species": "Species name (default 'human')",
    },
    requires_data=[],
    usage_guide="You need gene-level genomic information: Ensembl ID, chromosome location, transcripts, biotype, cross-references. Use for gene annotation and ID mapping.",
)
def ensembl_lookup(gene: str, species: str = "human", **kwargs) -> dict:
    """Look up gene information from the Ensembl REST API.

    Routes Ensembl stable gene IDs to /lookup/id and everything else to
    /lookup/symbol for the given species. Also fetches UniProt
    cross-references for the resolved gene.

    Returns:
        dict with coordinates, biotype, transcript list (capped at 20),
        and UniProt cross-references (capped at 10); or a dict with an
        "error" key on failure.
    """
    import re

    ensembl_base = "https://rest.ensembl.org"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}

    species_map = {
        "human": "homo_sapiens", "mouse": "mus_musculus", "rat": "rattus_norvegicus",
        "zebrafish": "danio_rerio", "drosophila": "drosophila_melanogaster",
    }
    species_name = species_map.get(species.lower(), species.lower().replace(" ", "_"))

    gene_clean = gene.strip()

    # Ensembl stable gene IDs look like ENS<species-prefix>G<digits>,
    # e.g. ENSG00000012048 (human), ENSMUSG... (mouse), ENSRNOG... (rat).
    # (FIX: only ENSG/ENSMUSG prefixes were recognized before, so rat,
    # zebrafish, etc. gene IDs fell through to a failing symbol lookup.)
    if re.fullmatch(r"ENS[A-Z]*G\d+(\.\d+)?", gene_clean.upper()):
        url = f"{ensembl_base}/lookup/id/{gene_clean}"
    else:
        url = f"{ensembl_base}/lookup/symbol/{species_name}/{gene_clean}"
    params = {"expand": 1}  # expand=1 includes the Transcript list

    try:
        resp = _http_get(url, params=params, headers=headers, timeout=15, retries=2)
        # Ensembl answers 400 for unknown symbols/IDs.
        if resp.status_code == 400:
            return {
                "error": f"Gene '{gene}' not found in Ensembl ({species})",
                "summary": f"Ensembl: gene '{gene}' not found for {species}",
            }
        if resp.status_code != 200:
            return {"error": f"Ensembl API returned HTTP {resp.status_code}", "summary": f"Ensembl API error: HTTP {resp.status_code}"}
        data = resp.json()
    except Exception as e:
        return {"error": f"Ensembl API error: {e}", "summary": f"Ensembl API error: {e}"}

    ensembl_id = data.get("id", "")
    display_name = data.get("display_name", gene)
    description = data.get("description", "")
    biotype = data.get("biotype", "")
    chromosome = data.get("seq_region_name", "")
    start = data.get("start")
    end = data.get("end")
    strand = data.get("strand")

    # Parse transcripts (present because of expand=1).
    transcripts = []
    for t in data.get("Transcript", []):
        transcripts.append({
            "transcript_id": t.get("id", ""),
            "display_name": t.get("display_name", ""),
            "biotype": t.get("biotype", ""),
            "is_canonical": t.get("is_canonical", 0) == 1,
            "length": t.get("length"),
        })

    n_transcripts = len(transcripts)

    # Fetch cross-references (UniProt mapping); best-effort only.
    # NOTE(review): "UniProt%" is a DB-name wildcard; the HTTP layer
    # percent-encodes '%' and the server decodes it back — confirm this if
    # xrefs unexpectedly come back empty.
    xrefs = []
    try:
        xref_resp = _http_get(
            f"{ensembl_base}/xrefs/id/{ensembl_id}",
            params={"external_db": "UniProt%"},
            headers=headers,
            timeout=10,
            retries=2,
        )
        if xref_resp.status_code == 200:
            for xref in xref_resp.json():
                xrefs.append({
                    "database": xref.get("dbname", ""),
                    "primary_id": xref.get("primary_id", ""),
                    "display_id": xref.get("display_id", ""),
                })
    except Exception:
        pass  # xrefs are supplementary; never fail the lookup over them

    # Ensembl uses 1 / -1 for strand.
    strand_str = "+" if strand == 1 else "-" if strand == -1 else "?"
    loc_str = f"chr{chromosome}:{start:,}-{end:,} ({strand_str})" if start and end else "unknown"

    return {
        "summary": (
            f"{display_name} ({ensembl_id}): {biotype}, "
            f"{loc_str}, {n_transcripts} transcripts"
        ),
        "ensembl_id": ensembl_id,
        "display_name": display_name,
        "description": description,
        "biotype": biotype,
        "chromosome": chromosome,
        "start": start,
        "end": end,
        "strand": strand,
        "location": loc_str,
        "n_transcripts": n_transcripts,
        "transcripts": transcripts[:20],
        "cross_references": xrefs[:10],
    }
1382
+
1383
+
1384
+ # ---------------------------------------------------------------------------
1385
+ # 6. NCBI Gene
1386
+ # ---------------------------------------------------------------------------
1387
+
1388
@registry.register(
    name="data_api.ncbi_gene",
    description="Query NCBI databases for gene information, ClinVar variants, or dbSNP data",
    category="data_api",
    parameters={
        "query": "Gene symbol (e.g. BRCA1) or NCBI Gene ID (e.g. 672)",
        "database": "Database to query: 'gene', 'clinvar', or 'dbsnp' (default 'gene')",
    },
    requires_data=[],
    usage_guide="You need NCBI gene summaries, ClinVar clinical variant data, or dbSNP information for a gene. Use for gene annotation, variant interpretation, and clinical genetics.",
)
def ncbi_gene(query: str, database: str = "gene", **kwargs) -> dict:
    """Query NCBI E-utilities for gene, ClinVar, or dbSNP data.

    Two-step E-utilities flow: esearch resolves `query` to UIDs, then
    esummary fetches details for up to 20 of them. The 'gene' search is
    restricted to Homo sapiens; 'clinvar' and 'dbsnp' search by gene name
    without an organism filter.

    Returns:
        dict with a "summary", the total hit count, and a database-specific
        results list ("genes", "variants", or "snps"); or a dict with an
        "error" key on failure.
    """
    valid_dbs = ("gene", "clinvar", "dbsnp")
    if database not in valid_dbs:
        return {"error": f"Invalid database '{database}'. Choose from: {', '.join(valid_dbs)}", "summary": f"Invalid NCBI database '{database}'"}

    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

    # Step 1: resolve the query to NCBI UIDs via esearch.
    if database == "gene":
        search_term = f"{query}[Gene Name] AND Homo sapiens[Organism]"
        db = "gene"
    elif database == "clinvar":
        search_term = f"{query}[Gene Name]"
        db = "clinvar"
    else:  # dbsnp — the E-utilities database name is 'snp'
        search_term = f"{query}[Gene Name]"
        db = "snp"

    try:
        search_resp = _http_get(
            f"{base}/esearch.fcgi",
            params={
                "db": db,
                "term": search_term,
                "retmax": 20,
                "retmode": "json",
                "sort": "relevance",
            },
            timeout=15,
            retries=2,
        )
        search_resp.raise_for_status()
        search_data = search_resp.json()
    except Exception as e:
        return {"error": f"NCBI search failed: {e}", "summary": f"NCBI query failed for '{query}'"}

    result = search_data.get("esearchresult", {})
    ids = result.get("idlist", [])
    total_count = int(result.get("count", 0))

    if not ids:
        return {
            "summary": f"No NCBI {database} results for '{query}'",
            "query": query,
            "database": database,
            "total_count": 0,
            "results": [],
        }

    # Step 2: fetch summaries for the matched UIDs.
    try:
        summary_resp = _http_get(
            f"{base}/esummary.fcgi",
            params={
                "db": db,
                "id": ",".join(ids[:20]),
                "retmode": "json",
            },
            timeout=15,
            retries=2,
        )
        summary_resp.raise_for_status()
        summary_data = summary_resp.json()
    except Exception as e:
        return {"error": f"NCBI summary failed: {e}", "summary": f"NCBI summary lookup failed for '{query}'"}

    results_dict = summary_data.get("result", {})

    if database == "gene":
        gene_results = []
        for gid in ids:
            info = results_dict.get(gid, {})
            # "uids" is esummary bookkeeping, not a record.
            if not info or gid == "uids":
                continue
            gene_results.append({
                "gene_id": gid,
                "symbol": info.get("name", ""),
                "description": info.get("description", ""),
                "chromosome": info.get("chromosome", ""),
                "organism": info.get("organism", {}).get("scientificname", ""),
                "aliases": info.get("otheraliases", ""),
                "summary": info.get("summary", ""),
                # BUG FIX: esummary JSON keys are lowercase (cf. 'maplocation',
                # 'otheraliases'); 'geneticSource' never matched anything.
                "gene_type": info.get("geneticsource", ""),
                "map_location": info.get("maplocation", ""),
            })

        top = gene_results[0] if gene_results else {}
        return {
            "summary": (
                f"NCBI Gene {top.get('gene_id', '')} ({top.get('symbol', query)}): "
                f"{top.get('description', 'N/A')}, "
                f"chr{top.get('chromosome', '?')}, "
                # BUG FIX: this count is the number of matching gene records;
                # the old text wrongly said "total ClinVar variants".
                f"{total_count} matching gene record(s)"
            ),
            "query": query,
            "database": "gene",
            "total_count": total_count,
            "genes": gene_results,
        }

    elif database == "clinvar":
        variants = []
        for vid in ids:
            info = results_dict.get(vid, {})
            if not info or vid == "uids":
                continue
            variants.append({
                "uid": vid,
                "title": info.get("title", ""),
                "clinical_significance": info.get("clinical_significance", {}).get("description", ""),
                "gene_sort": info.get("gene_sort", ""),
                "variation_set": info.get("variation_set", []),
                "obj_type": info.get("obj_type", ""),
            })

        return {
            "summary": f"ClinVar for {query}: {total_count} total variants, showing {len(variants)}",
            "query": query,
            "database": "clinvar",
            "total_count": total_count,
            "variants": variants,
        }

    else:  # dbsnp
        snps = []
        for sid in ids:
            info = results_dict.get(sid, {})
            if not info or sid == "uids":
                continue
            snps.append({
                "uid": sid,
                "snp_id": info.get("snp_id", sid),
                "snp_class": info.get("snp_class", ""),
                "global_maf": info.get("global_mafs", []),
                "genes": info.get("genes", []),
                "clinical_significance": info.get("clinical_significance", ""),
            })

        return {
            "summary": f"dbSNP for {query}: {total_count} total SNPs, showing {len(snps)}",
            "query": query,
            "database": "dbsnp",
            "total_count": total_count,
            "snps": snps,
        }
1545
+
1546
+
1547
+ # ---------------------------------------------------------------------------
1548
+ # 7. ChEMBL advanced
1549
+ # ---------------------------------------------------------------------------
1550
+
1551
@registry.register(
    name="data_api.chembl_advanced",
    description="Advanced ChEMBL queries: compound details, target activity statistics, mechanisms, drug indications",
    category="data_api",
    parameters={
        "query": "Compound name/ChEMBL ID, target gene, or drug name",
        "search_type": "Query type: 'compound', 'target_activities', 'mechanism', or 'drug_indication' (default 'compound')",
    },
    requires_data=[],
    usage_guide="You want detailed ChEMBL data: full compound properties, aggregated bioactivity statistics for a target (min/max/median IC50), drug mechanisms of action, or approved indications. More detailed than literature.chembl_query.",
)
def chembl_advanced(query: str, search_type: str = "compound", **kwargs) -> dict:
    """Advanced ChEMBL REST API queries with aggregated statistics.

    Validates `search_type` and dispatches to the matching private helper,
    passing along the shared API base URL and JSON-Accept headers.
    """
    valid_types = ("compound", "target_activities", "mechanism", "drug_indication")
    if search_type not in valid_types:
        return {"error": f"Invalid search_type '{search_type}'. Choose from: {', '.join(valid_types)}", "summary": f"Invalid ChEMBL search type '{search_type}'"}

    chembl_base = "https://www.ebi.ac.uk/chembl/api/data"
    headers = {"Accept": "application/json"}

    # Table-driven dispatch: one helper per supported query type.
    handlers = {
        "compound": _chembl_compound_search,
        "target_activities": _chembl_target_activities,
        "mechanism": _chembl_mechanism,
        "drug_indication": _chembl_drug_indication,
    }
    return handlers[search_type](query, chembl_base, headers)
1579
+
1580
+
1581
def _chembl_compound_search(query: str, base: str, headers: dict) -> dict:
    """Search ChEMBL for a compound with full property details."""
    try:
        response = _http_get(
            f"{base}/molecule/search.json",
            params={"q": query, "limit": 5},
            headers=headers,
            timeout=15,
            retries=2,
        )
        response.raise_for_status()
        payload = response.json()
    except Exception as e:
        return {"error": f"ChEMBL compound search failed: {e}", "summary": f"ChEMBL compound search failed: {e}"}

    hits = payload.get("molecules", [])
    if not hits:
        return {
            "summary": f"No ChEMBL compounds found for '{query}'",
            "query": query,
            "compounds": [],
        }

    def _flatten(mol: dict) -> dict:
        # Merge identity, structure, and physchem properties into one record.
        props = mol.get("molecule_properties", {}) or {}
        structs = mol.get("molecule_structures", {}) or {}
        return {
            "chembl_id": mol.get("molecule_chembl_id", ""),
            "pref_name": mol.get("pref_name", ""),
            "molecule_type": mol.get("molecule_type", ""),
            "max_phase": mol.get("max_phase", 0),
            "oral": mol.get("oral", False),
            "parenteral": mol.get("parenteral", False),
            "topical": mol.get("topical", False),
            "natural_product": mol.get("natural_product", -1),
            "canonical_smiles": structs.get("canonical_smiles", ""),
            "inchi_key": structs.get("standard_inchi_key", ""),
            "molecular_weight": props.get("full_mwt"),
            "alogp": props.get("alogp"),
            "hba": props.get("hba"),
            "hbd": props.get("hbd"),
            "psa": props.get("psa"),
            "rtb": props.get("rtb"),
            "ro5_violations": props.get("num_ro5_violations"),
            "aromatic_rings": props.get("aromatic_rings"),
            "heavy_atoms": props.get("heavy_atoms"),
            "qed_weighted": props.get("qed_weighted"),
        }

    compounds = [_flatten(mol) for mol in hits]
    top = compounds[0]

    return {
        "summary": (
            f"ChEMBL compound {top['chembl_id']} ({top['pref_name'] or query}): "
            f"MW={top['molecular_weight'] or 'N/A'}, ALogP={top['alogp'] or 'N/A'}, "
            f"max phase {top['max_phase']}"
        ),
        "query": query,
        "n_results": len(compounds),
        "compounds": compounds,
    }
1643
+
1644
+
1645
def _chembl_target_activities(query: str, base: str, headers: dict) -> dict:
    """Get aggregated bioactivity statistics for a ChEMBL target.

    Resolves `query` to a target (preferring human SINGLE PROTEIN hits),
    pulls up to 100 IC50/Ki/Kd/EC50 activities, and returns per-type
    count/min/max/median/mean plus the number of unique compounds tested.
    """
    import statistics  # hoisted from mid-function for readability

    # Find the target.
    try:
        tgt_resp = _http_get(
            f"{base}/target/search.json",
            params={"q": query, "limit": 5},
            headers=headers,
            timeout=15,
            retries=2,
        )
        tgt_resp.raise_for_status()
        tgt_data = tgt_resp.json()
    except Exception as e:
        return {"error": f"ChEMBL target search failed: {e}", "summary": f"ChEMBL target search failed: {e}"}

    targets = tgt_data.get("targets", [])
    if not targets:
        return {"summary": f"No ChEMBL target found for '{query}'", "query": query}

    # Prefer a human SINGLE PROTEIN target; otherwise take the top hit.
    target = next(
        (t for t in targets
         if t.get("organism") == "Homo sapiens" and t.get("target_type") == "SINGLE PROTEIN"),
        targets[0],
    )

    chembl_target_id = target.get("target_chembl_id", "")
    target_name = target.get("pref_name", query)

    # Fetch activities (first 100 only — for heavily-assayed targets the
    # statistics below are a sample, not exhaustive).
    try:
        act_resp = _http_get(
            f"{base}/activity.json",
            params={
                "target_chembl_id": chembl_target_id,
                "limit": 100,
                "standard_type__in": "IC50,Ki,Kd,EC50",
            },
            headers=headers,
            timeout=15,
            retries=2,
        )
        act_resp.raise_for_status()
        act_data = act_resp.json()
    except Exception as e:
        return {"error": f"ChEMBL activity query failed: {e}", "summary": f"ChEMBL activity query failed: {e}"}

    activities = act_data.get("activities", [])

    # Group numeric standard_values by standard_type.
    # NOTE(review): standard_units is not filtered, so the *_nM labels below
    # assume ChEMBL has standardized these activity types to nM — confirm if
    # mixed units ever appear in results.
    by_type = {}
    unique_molecules = set()
    for act in activities:
        unique_molecules.add(act.get("molecule_chembl_id", ""))
        std_type = act.get("standard_type", "")
        std_value = act.get("standard_value")
        if std_value is not None:
            try:
                by_type.setdefault(std_type, []).append(float(std_value))
            except (ValueError, TypeError):
                pass  # skip non-numeric values

    # Per-type aggregate statistics.
    # (FIX: removed a dead `sorted_vals = sorted(values)` that was computed
    # but never used.)
    stats = {}
    for activity_type, values in by_type.items():
        stats[activity_type] = {
            "count": len(values),
            "min_nM": round(min(values), 2),
            "max_nM": round(max(values), 2),
            "median_nM": round(statistics.median(values), 2),
            "mean_nM": round(statistics.mean(values), 2),
        }

    total_activities = sum(s["count"] for s in stats.values())
    median_str = ""
    if "IC50" in stats:
        median_str = f", median IC50 = {stats['IC50']['median_nM']:.0f} nM"

    return {
        "summary": (
            f"ChEMBL target {chembl_target_id} ({target_name}): "
            f"{total_activities} activities, "
            f"{len(unique_molecules)} unique compounds"
            f"{median_str}"
        ),
        "query": query,
        "target_chembl_id": chembl_target_id,
        "target_name": target_name,
        "organism": target.get("organism", ""),
        "target_type": target.get("target_type", ""),
        "n_unique_compounds": len(unique_molecules),
        "n_activities": total_activities,
        "activity_statistics": stats,
    }
1744
+
1745
+
1746
def _chembl_mechanism(query: str, base: str, headers: dict) -> dict:
    """Look up drug mechanisms of action in ChEMBL.

    Args:
        query: A ChEMBL molecule ID (e.g. 'CHEMBL941') or a drug name;
            names are resolved via the molecule search endpoint as a
            fallback when the direct lookup returns nothing.
        base: Base URL of the ChEMBL REST API.
        headers: HTTP headers to send with each request.

    Returns:
        A dict with a human-readable ``summary``, the original ``query``,
        ``n_mechanisms`` and a ``mechanisms`` list, or a dict with an
        ``error`` key if the initial request fails.
    """
    try:
        resp = _http_get(
            f"{base}/mechanism.json",
            params={"molecule_chembl_id": query, "limit": 20},
            headers=headers,
            timeout=15,
            retries=2,
        )
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        return {"error": f"ChEMBL mechanism query failed: {e}", "summary": f"ChEMBL mechanism query failed: {e}"}
    mechanisms = data.get("mechanisms", [])

    # If no results by molecule ID, try resolving the query as a drug name
    # and re-querying with the resolved molecule ID (best effort).
    if not mechanisms:
        try:
            mol_resp = _http_get(
                f"{base}/molecule/search.json",
                params={"q": query, "limit": 1},
                headers=headers,
                timeout=10,
                retries=2,
            )
            mol_resp.raise_for_status()
            mols = mol_resp.json().get("molecules", [])
            if mols:
                mol_id = mols[0].get("molecule_chembl_id", "")
                resp2 = _http_get(
                    f"{base}/mechanism.json",
                    params={"molecule_chembl_id": mol_id, "limit": 20},
                    headers=headers,
                    timeout=10,
                    retries=2,
                )
                resp2.raise_for_status()
                mechanisms = resp2.json().get("mechanisms", [])
        except Exception:
            pass  # fall through with whatever we have (likely empty)

    if not mechanisms:
        return {
            "summary": f"No mechanisms of action found in ChEMBL for '{query}'",
            "query": query,
            "mechanisms": [],
        }

    parsed = []
    for mech in mechanisms:
        parsed.append({
            "mechanism": mech.get("mechanism_of_action", ""),
            "action_type": mech.get("action_type", ""),
            # NOTE: the ChEMBL mechanism record only carries the target's
            # ChEMBL ID, not its preferred name.  "target_name" is kept for
            # backward compatibility; prefer "target_chembl_id" going forward.
            "target_name": mech.get("target_chembl_id", ""),
            "target_chembl_id": mech.get("target_chembl_id", ""),
            "molecule_chembl_id": mech.get("molecule_chembl_id", ""),
            "max_phase": mech.get("max_phase"),
            "direct_interaction": mech.get("direct_interaction"),
        })

    return {
        "summary": (
            f"ChEMBL mechanisms for {query}: {len(parsed)} mechanism(s). "
            + "; ".join(m["mechanism"] for m in parsed[:3])
        ),
        "query": query,
        "n_mechanisms": len(parsed),
        "mechanisms": parsed,
    }
1816
+
1817
+
1818
def _chembl_drug_indication(query: str, base: str, headers: dict) -> dict:
    """Look up drug indications recorded in ChEMBL.

    Args:
        query: A ChEMBL molecule ID (e.g. 'CHEMBL941') or a drug name;
            names are resolved to a molecule ID before querying.
        base: Base URL of the ChEMBL REST API.
        headers: HTTP headers to send with each request.

    Returns:
        A dict with a ``summary``, the resolved ``molecule_chembl_id``,
        indication counts and a parsed ``indications`` list, or a dict
        with an ``error`` key if the indication request fails.
    """
    # Resolve a drug name to a ChEMBL molecule ID (best effort; on any
    # failure we fall back to querying with the raw input).
    mol_id = query
    try:
        if not query.upper().startswith("CHEMBL"):
            mol_resp = _http_get(
                f"{base}/molecule/search.json",
                params={"q": query, "limit": 1},
                headers=headers,
                timeout=10,
                retries=2,
            )
            mol_resp.raise_for_status()
            mols = mol_resp.json().get("molecules", [])
            # Guard: only overwrite mol_id when the hit actually has an ID,
            # otherwise we would query with an empty string.
            if mols and mols[0].get("molecule_chembl_id"):
                mol_id = mols[0]["molecule_chembl_id"]
    except Exception:
        pass

    try:
        resp = _http_get(
            f"{base}/drug_indication.json",
            params={"molecule_chembl_id": mol_id, "limit": 30},
            headers=headers,
            timeout=15,
            retries=2,
        )
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        return {"error": f"ChEMBL indication query failed: {e}", "summary": f"ChEMBL indication query failed: {e}"}
    indications = data.get("drug_indications", [])
    if not indications:
        return {
            "summary": f"No drug indications found in ChEMBL for '{query}'",
            "query": query,
            "indications": [],
        }

    parsed = []
    for ind in indications:
        parsed.append({
            "indication": ind.get("mesh_heading", ""),
            "mesh_id": ind.get("mesh_id", ""),
            "efo_id": ind.get("efo_id", ""),
            "max_phase": ind.get("max_phase_for_ind"),
            "molecule_chembl_id": ind.get("molecule_chembl_id", ""),
        })

    def _is_approved(phase) -> bool:
        # max_phase_for_ind may arrive as int, float, or numeric string
        # depending on the API version; phase 4 means approved.
        try:
            return float(phase) == 4.0
        except (TypeError, ValueError):
            return False

    approved = [p for p in parsed if _is_approved(p.get("max_phase"))]
    return {
        "summary": (
            f"ChEMBL indications for {query} ({mol_id}): "
            f"{len(parsed)} total, {len(approved)} approved. "
            + "; ".join(p["indication"] for p in parsed[:5])
        ),
        "query": query,
        "molecule_chembl_id": mol_id,
        "n_indications": len(parsed),
        "n_approved": len(approved),
        "indications": parsed,
    }
1881
+
1882
+
1883
+ # ---------------------------------------------------------------------------
1884
+ # 8. Drug information lookup (via PubChem)
1885
+ # ---------------------------------------------------------------------------
1886
+
1887
@registry.register(
    name="data_api.drug_info",
    description="Look up comprehensive drug information: pharmacology, properties, interactions, indications",
    category="data_api",
    parameters={
        "query": "Drug name (e.g. 'imatinib') or compound name",
        "include": "Information to include: list of 'pharmacology', 'interactions', 'properties' (default ['pharmacology', 'interactions'])",
    },
    requires_data=[],
    usage_guide="You want drug pharmacology, properties, and interaction data. Uses PubChem PUG REST and PUG View APIs for comprehensive drug information.",
)
def drug_info(query: str, include: list = None, **kwargs) -> dict:
    """Look up drug information via PubChem REST API.

    Uses PubChem PUG REST and PUG View APIs to retrieve drug properties,
    pharmacology, and interaction data.

    Args:
        query: Drug or compound name.  Loosely formatted input is
            normalized and split into fallback candidates before lookup.
        include: Which optional sections to fetch; any of 'pharmacology',
            'interactions', 'properties'.  Defaults to
            ['pharmacology', 'interactions'].
        **kwargs: Ignored; accepted for registry-call compatibility.

    Returns:
        A dict with a human-readable ``summary`` plus compound properties,
        pharmacology, interactions and synonyms, or a dict with an
        ``error`` key when no compound could be resolved.
    """
    if include is None:
        include = ["pharmacology", "interactions"]

    # Normalize drug name (keep the raw form for reporting).
    raw_query = query
    query = _normalize_drug_query(query)
    if not query:
        return {
            "error": "Drug query is required",
            "summary": "PubChem: query cannot be empty",
        }

    pug_base = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
    pugview_base = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"

    def _query_candidates(text: str) -> list[str]:
        """De-duplicated lookup candidates: the full text, then
        delimiter-separated fragments, then individual word tokens."""
        candidates = []
        seen = set()

        def _add(candidate: str):
            c = " ".join((candidate or "").split()).strip()
            if not c or c.lower() in seen:
                return
            seen.add(c.lower())
            candidates.append(c)

        _add(text)
        for part in re.split(r"[;,/|()]|\bor\b|\band\b", text, flags=re.IGNORECASE):
            _add(part)
        for token in text.split():
            cleaned = token.strip(" ,;:/|()[]{}")
            if len(cleaned) >= 3 and re.search(r"[A-Za-z]", cleaned):
                _add(cleaned)
        return candidates

    def _last_string(info_list: list, limit: int) -> str:
        """Return the last non-empty StringWithMarkup value in a PUG View
        Information list, truncated to *limit* chars ('' if none)."""
        result = ""
        for info in info_list:
            val = info.get("Value", {}).get("StringWithMarkup", [])
            if val:
                result = val[0].get("String", "")[:limit]
        return result

    # Step 1: Resolve drug name to CID (with alias/fallback attempts)
    import urllib.parse

    cid = None
    resolved_query = raw_query
    lookup_errors = []
    candidates = _query_candidates(raw_query)  # computed once, reused in error paths
    for candidate in candidates:
        encoded_query = urllib.parse.quote(candidate, safe="")
        try:
            resp = _http_get(
                f"{pug_base}/compound/name/{encoded_query}/cids/JSON",
                timeout=10,
                retries=2,
            )
            if resp.status_code == 404:
                continue  # name unknown to PubChem: try the next candidate
            resp.raise_for_status()
            cid_data = resp.json()
        except Exception as e:
            lookup_errors.append(f"{candidate}: {e}")
            continue

        cids = cid_data.get("IdentifierList", {}).get("CID", [])
        if cids:
            cid = cids[0]
            resolved_query = candidate
            break

    if cid is None:
        if lookup_errors:
            return {
                "error": f"PubChem CID lookup failed: {lookup_errors[0]}",
                "summary": f"PubChem CID lookup failed for '{raw_query}'",
                "tried_queries": candidates[:5],
            }
        return {
            "error": f"Drug '{raw_query}' not found in PubChem",
            "summary": f"PubChem: no compound found for '{raw_query}'",
            "tried_queries": candidates[:5],
        }

    # Step 2: Get compound properties (best effort, optional)
    properties = {}
    try:
        props_resp = _http_get(
            f"{pug_base}/compound/cid/{cid}/property/"
            "MolecularFormula,MolecularWeight,CanonicalSMILES,IsomericSMILES,"
            "XLogP,ExactMass,TPSA,HBondDonorCount,HBondAcceptorCount,"
            "RotatableBondCount,HeavyAtomCount,Complexity,InChIKey/JSON",
            timeout=10,
            retries=2,
        )
        if props_resp.status_code == 200:
            prop_table = props_resp.json().get("PropertyTable", {}).get("Properties", [])
            if prop_table:
                properties = prop_table[0]
    except Exception:
        pass

    # Step 3: Get drug/medication information from PUG View (best effort)
    pharmacology = {}
    interactions = []
    # Renamed from "drug_info" so the local dict no longer shadows this
    # function's own name; the returned key stays "drug_info".
    drug_overview = {}

    if "pharmacology" in include or "interactions" in include:
        try:
            view_resp = _http_get(
                f"{pugview_base}/data/compound/{cid}/JSON",
                params={"heading": "Drug and Medication Information"},
                timeout=15,
                retries=2,
            )
            if view_resp.status_code == 200:
                record = view_resp.json().get("Record", {})
                for section in record.get("Section", []):
                    for subsection in section.get("Section", []):
                        sub_heading = subsection.get("TOCHeading", "")
                        info_list = subsection.get("Information", [])

                        if sub_heading == "Drug Indication":
                            s = _last_string(info_list, 500)
                            if s:
                                drug_overview["indication"] = s
                        elif sub_heading == "Mechanism of Action":
                            s = _last_string(info_list, 500)
                            if s:
                                pharmacology["mechanism_of_action"] = s
                        elif sub_heading == "Pharmacology":
                            s = _last_string(info_list, 500)
                            if s:
                                pharmacology["pharmacology"] = s
                        elif sub_heading == "Absorption":
                            s = _last_string(info_list, 300)
                            if s:
                                pharmacology["absorption"] = s
                        elif "Drug Interaction" in sub_heading or "Drug-Drug" in sub_heading:
                            # Interactions accumulate: one entry per record.
                            for info in info_list:
                                val = info.get("Value", {}).get("StringWithMarkup", [])
                                if val:
                                    interactions.append(val[0].get("String", "")[:200])
        except Exception:
            pass  # PUG View data is optional

    # Step 4: Get synonyms for the drug (best effort)
    synonyms = []
    try:
        syn_resp = _http_get(
            f"{pug_base}/compound/cid/{cid}/synonyms/JSON",
            timeout=10,
            retries=2,
        )
        if syn_resp.status_code == 200:
            syn_list = syn_resp.json().get("InformationList", {}).get("Information", [])
            if syn_list:
                synonyms = syn_list[0].get("Synonym", [])[:15]
    except Exception:
        pass

    # Find a DrugBank accession (DB + 5 digits, 7 chars) among the synonyms.
    drugbank_id = ""
    for syn in synonyms:
        if syn.upper().startswith("DB") and len(syn) == 7 and syn[2:].isdigit():
            drugbank_id = syn
            break

    mw = properties.get("MolecularWeight", "N/A")
    formula = properties.get("MolecularFormula", "N/A")
    smiles = properties.get("CanonicalSMILES", "N/A")
    mechanism = pharmacology.get("mechanism_of_action", "N/A")

    drugbank_str = f" ({drugbank_id})" if drugbank_id else ""
    mech_short = mechanism[:80] + "..." if len(mechanism) > 80 else mechanism
    resolved_note = ""
    if resolved_query.lower() != raw_query.lower():
        resolved_note = f" [resolved as '{resolved_query}']"

    return {
        "summary": (
            f"{raw_query}{resolved_note}{drugbank_str}: {mech_short}, "
            f"MW {mw}, {len(interactions)} known drug interactions."
        ),
        "query": raw_query,
        "resolved_query": resolved_query,
        "cid": cid,
        "drugbank_id": drugbank_id,
        "properties": {
            "molecular_formula": formula,
            "molecular_weight": mw,
            "canonical_smiles": smiles,
            "isomeric_smiles": properties.get("IsomericSMILES", ""),
            "xlogp": properties.get("XLogP"),
            "tpsa": properties.get("TPSA"),
            "hbd": properties.get("HBondDonorCount"),
            "hba": properties.get("HBondAcceptorCount"),
            "rotatable_bonds": properties.get("RotatableBondCount"),
            "inchi_key": properties.get("InChIKey", ""),
        },
        "pharmacology": pharmacology,
        "drug_info": drug_overview,
        "interactions": interactions[:20],
        "n_interactions": len(interactions),
        "synonyms": synonyms,
        "pubchem_url": f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}",
    }