celltype-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/tools/data_api.py
ADDED
|
@@ -0,0 +1,2114 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data API tools: rich wrappers for major biomedical data platforms.
|
|
3
|
+
|
|
4
|
+
Provides general-purpose access to DepMap, Open Targets, UniProt, PDB,
|
|
5
|
+
Ensembl, NCBI, ChEMBL, and DrugBank/PubChem.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import re
|
|
10
|
+
|
|
11
|
+
from ct.tools import registry
|
|
12
|
+
from ct.tools.http_client import request
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _http_get(url: str, *, params=None, headers=None, timeout: int = 15, retries: int = 2):
    """Issue a GET through the shared HTTP client, raising on transport failure.

    `retries` keeps the historical meaning of *total attempts*, so the
    underlying client is handed `retries - 1` actual re-tries.
    """
    import httpx

    response, failure = request(
        "GET",
        url,
        params=params,
        headers=headers,
        timeout=timeout,
        retries=max(retries - 1, 0),
        raise_for_status=False,
    )
    # The shared client reports errors as a string instead of raising;
    # convert back to an exception so callers can use try/except uniformly.
    if failure:
        raise httpx.HTTPError(failure)
    return response
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _http_post(url: str, *, json=None, data=None, params=None,
               headers=None, timeout: int = 15, retries: int = 2):
    """Issue a POST through the shared HTTP client, raising on transport failure.

    `retries` keeps the historical meaning of *total attempts*, so the
    underlying client is handed `retries - 1` actual re-tries.
    """
    import httpx

    response, failure = request(
        "POST",
        url,
        json=json,
        data=data,
        params=params,
        headers=headers,
        timeout=timeout,
        retries=max(retries - 1, 0),
        raise_for_status=False,
    )
    # The shared client reports errors as a string instead of raising;
    # convert back to an exception so callers can use try/except uniformly.
    if failure:
        raise httpx.HTTPError(failure)
    return response
|
|
54
|
+
|
|
55
|
+
_logger = logging.getLogger("ct.data_api")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _normalize_gene_name(gene: str) -> str:
|
|
59
|
+
"""Normalize a gene symbol: uppercase, strip whitespace, remove common prefixes."""
|
|
60
|
+
gene = gene.strip()
|
|
61
|
+
# Strip common noise prefixes that confuse APIs
|
|
62
|
+
for prefix in ("gene ", "Gene ", "GENE ", "human ", "Human "):
|
|
63
|
+
if gene.startswith(prefix):
|
|
64
|
+
gene = gene[len(prefix):]
|
|
65
|
+
gene = gene.strip()
|
|
66
|
+
# Gene symbols should be uppercase alphanumeric (with hyphens/dots allowed)
|
|
67
|
+
# If it looks like a gene symbol, uppercase it
|
|
68
|
+
if re.match(r'^[A-Za-z][A-Za-z0-9._-]*$', gene):
|
|
69
|
+
gene = gene.upper()
|
|
70
|
+
return gene
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _normalize_drug_query(query: str) -> str:
|
|
74
|
+
"""Strip noise words from drug name queries that confuse APIs."""
|
|
75
|
+
noise_prefixes = [
|
|
76
|
+
"fda-approved ", "fda approved ", "approved drug ",
|
|
77
|
+
"drug ", "compound ", "the drug ", "the compound ",
|
|
78
|
+
"investigational ", "experimental ",
|
|
79
|
+
]
|
|
80
|
+
cleaned = query.strip()
|
|
81
|
+
# Keep stripping prefixes (case-insensitive) until none match
|
|
82
|
+
changed = True
|
|
83
|
+
while changed:
|
|
84
|
+
changed = False
|
|
85
|
+
lower = cleaned.lower()
|
|
86
|
+
for prefix in noise_prefixes:
|
|
87
|
+
if lower.startswith(prefix):
|
|
88
|
+
cleaned = cleaned[len(prefix):]
|
|
89
|
+
changed = True
|
|
90
|
+
break
|
|
91
|
+
return cleaned.strip()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# ---------------------------------------------------------------------------
|
|
95
|
+
# 1. DepMap search
|
|
96
|
+
# ---------------------------------------------------------------------------
|
|
97
|
+
|
|
98
|
+
@registry.register(
    name="data_api.depmap_search",
    description="Search DepMap for gene dependency scores across cancer cell lines",
    category="data_api",
    parameters={
        "gene": "Gene symbol (e.g. BRCA1, TP53)",
        "dataset": "Dataset to query: 'crispr', 'expression', 'mutations', or 'cn' (default 'crispr')",
    },
    requires_data=[],
    usage_guide="You want DepMap gene dependency data across cell lines. Returns dependency scores, most/least dependent lineages. Uses local DepMap data when available, or the Cell Model Passports API as fallback.",
)
def depmap_search(gene: str, dataset: str = "crispr", **kwargs) -> dict:
    """Search DepMap for gene dependency / expression / mutation data.

    Tries local DepMap data first (via ct data loaders), then falls back to
    the Cell Model Passports API (public, no key required).

    Args:
        gene: Gene symbol; normalized before lookup (noise prefixes stripped,
            symbol-shaped tokens uppercased).
        dataset: One of 'crispr', 'expression', 'mutations', 'cn'. Only
            'crispr' and 'mutations' have local-data paths; the rest go
            straight to the Cell Model Passports fallback.

    Returns:
        dict with a human-readable "summary" on success, or an "error" key
        (plus "summary") on failure.
    """
    valid_datasets = ("crispr", "expression", "mutations", "cn")
    if dataset not in valid_datasets:
        return {"error": f"Invalid dataset '{dataset}'. Choose from: {', '.join(valid_datasets)}", "summary": f"Invalid dataset '{dataset}'"}

    # Normalize gene name
    gene = _normalize_gene_name(gene)

    # --- Attempt local DepMap data ---
    if dataset == "crispr":
        try:
            from ct.data.loaders import load_crispr
            crispr = load_crispr()
            # Try exact match first, then common variations
            if gene not in crispr.columns:
                # Try with/without hyphens, dots, etc.
                found = False
                for variant in [gene.replace("-", ""), gene.replace(".", ""), gene + "A"]:
                    if variant in crispr.columns:
                        _logger.warning("Gene '%s' not found, using variant '%s'", gene, variant)
                        gene = variant
                        found = True
                        break
                if not found:
                    # Try partial match (e.g., "CD274" matches "CD274 (PD-L1)")
                    matches = [c for c in crispr.columns if c.startswith(gene + " ") or c == gene]
                    if matches:
                        gene = matches[0]
                        _logger.warning("Gene exact match not found, using '%s'", gene)
                    else:
                        return {"error": f"Gene {gene} not found in local DepMap CRISPR data", "summary": f"Gene {gene} not in DepMap CRISPR"}

            scores = crispr[gene].dropna()
            n_lines = len(scores)
            # Score < -0.5 is the conventional DepMap essentiality cutoff.
            essential = (scores < -0.5).sum()
            mean_score = float(scores.mean())
            min_score = float(scores.min())

            # Lineage info if model metadata available
            lineage_stats = []
            try:
                from ct.data.loaders import load_model_metadata
                model = load_model_metadata()
                merged = scores.to_frame(name="score").join(
                    model.set_index("ModelID")["OncotreeLineage"], how="left"
                )
                if "OncotreeLineage" in merged.columns:
                    for lin, grp in merged.groupby("OncotreeLineage"):
                        lineage_stats.append({
                            "lineage": lin,
                            "mean_score": round(float(grp["score"].mean()), 4),
                            "n_lines": len(grp),
                            "n_essential": int((grp["score"] < -0.5).sum()),
                        })
                    lineage_stats.sort(key=lambda x: x["mean_score"])
            except Exception:
                # Lineage breakdown is optional enrichment; ignore metadata failures.
                pass

            most_dependent = [ls["lineage"] for ls in lineage_stats[:3]] if lineage_stats else []
            least_dependent = [ls["lineage"] for ls in lineage_stats[-3:]] if lineage_stats else []

            return {
                "summary": (
                    f"{gene} dependency (DepMap CRISPR): essential in {essential}/{n_lines} lines, "
                    f"mean score {mean_score:.3f}"
                    + (f", most dependent: {', '.join(most_dependent)}" if most_dependent else "")
                ),
                "gene": gene,
                "dataset": "crispr",
                "n_cell_lines": n_lines,
                "n_essential": int(essential),
                "mean_score": round(mean_score, 4),
                "min_score": round(min_score, 4),
                "lineage_stats": lineage_stats[:20],
                "most_dependent_lineages": most_dependent,
                "least_dependent_lineages": least_dependent,
            }
        except (ImportError, FileNotFoundError):
            pass  # Fall through to API

    if dataset == "mutations":
        try:
            from ct.data.loaders import load_mutations
            mutations = load_mutations()
            if gene not in mutations.columns:
                # "summary" added for consistency with the other error returns.
                return {"error": f"Gene {gene} not found in local DepMap mutation data", "summary": f"Gene {gene} not in DepMap mutations"}

            mutated = mutations[gene].dropna()
            n_lines = len(mutated)
            n_mutated = int((mutated > 0).sum())
            mutation_rate = n_mutated / n_lines if n_lines > 0 else 0

            return {
                "summary": (
                    f"{gene} mutations (DepMap): mutated in {n_mutated}/{n_lines} lines "
                    f"({mutation_rate:.1%})"
                ),
                "gene": gene,
                "dataset": "mutations",
                "n_cell_lines": n_lines,
                "n_mutated": n_mutated,
                "mutation_rate": round(mutation_rate, 4),
            }
        except (ImportError, FileNotFoundError):
            pass  # Fall through to API

    # --- Fallback: Cell Model Passports API ---
    try:
        resp = _http_get(
            "https://www.cellmodelpassports.sanger.ac.uk/api/v1/genes",
            params={"search": gene, "page_size": 5},
            timeout=15,
        )
        if resp.status_code != 200:
            return {
                "error": f"Cell Model Passports API returned HTTP {resp.status_code}",
                "summary": f"Could not query DepMap/CMP for {gene}",
            }
        data = resp.json()
    except Exception as e:
        import httpx
        if isinstance(e, httpx.TimeoutException):
            return {"error": "Cell Model Passports API timed out", "summary": f"CMP timeout for {gene}"}
        # HTTPError and anything else share the same error payload.
        return {"error": f"CMP API error: {e}", "summary": f"CMP query failed for {gene}"}

    results = data.get("data", data.get("results", []))
    if not results:
        return {
            "error": f"Gene {gene} not found in Cell Model Passports",
            "summary": f"No results for {gene} in CMP",
        }

    gene_info = results[0] if isinstance(results, list) else results
    return {
        "summary": f"DepMap/CMP: {gene} — found in Cell Model Passports database",
        "gene": gene,
        "dataset": dataset,
        "source": "cell_model_passports",
        "gene_info": gene_info,
    }
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
# ---------------------------------------------------------------------------
|
|
261
|
+
# 2. Open Targets search
|
|
262
|
+
# ---------------------------------------------------------------------------
|
|
263
|
+
|
|
264
|
+
@registry.register(
    name="data_api.opentargets_search",
    description="Search Open Targets Platform for comprehensive target, disease, or drug profiles",
    category="data_api",
    parameters={
        "query": "Gene name, disease name, or drug name",
        "entity_type": "Entity type: 'target', 'disease', or 'drug' (default 'target')",
    },
    requires_data=[],
    usage_guide="You want a comprehensive profile from Open Targets: disease associations for a target, associated targets for a disease, or indications/mechanisms for a drug. General-purpose Open Targets access.",
)
def opentargets_search(query: str, entity_type: str = "target", **kwargs) -> dict:
    """Query Open Targets Platform GraphQL API for target/disease/drug profiles.

    Two-step flow: a `search` query resolves the free-text query to a
    platform entity ID, then an entity-specific GraphQL query fetches the
    detailed profile.

    Args:
        query: Gene, disease, or drug name (normalized per entity type).
        entity_type: One of 'target', 'disease', 'drug'.

    Returns:
        dict with a "summary" key on success, or "error" + "summary" on failure.
    """
    ot_url = "https://api.platform.opentargets.org/api/v4/graphql"
    headers = {"Content-Type": "application/json"}

    valid_types = ("target", "disease", "drug")
    if entity_type not in valid_types:
        return {"error": f"Invalid entity_type '{entity_type}'. Choose from: {', '.join(valid_types)}", "summary": f"Invalid entity type '{entity_type}'"}

    # Normalize query based on entity type
    if entity_type == "target":
        query = _normalize_gene_name(query)
    elif entity_type == "drug":
        query = _normalize_drug_query(query)

    # Step 1: Search to resolve ID
    search_gql = """
    query search($q: String!, $entities: [String!]!) {
      search(queryString: $q, entityNames: $entities, page: {size: 5, index: 0}) {
        total
        hits { id entity name description }
      }
    }
    """
    entity_names = {
        "target": ["target"],
        "disease": ["disease"],
        "drug": ["drug"],
    }

    try:
        search_resp = _http_post(
            ot_url,
            json={"query": search_gql, "variables": {"q": query, "entities": entity_names[entity_type]}},
            headers=headers,
            timeout=15,
        )
        search_resp.raise_for_status()
        search_data = search_resp.json()
    except Exception as e:
        import httpx
        if isinstance(e, httpx.TimeoutException):
            return {"error": f"Open Targets search timed out for '{query}'", "summary": f"Open Targets timed out for '{query}'"}
        # HTTPError and any other failure share the same payload.
        return {"error": f"Open Targets search failed: {e}", "summary": "Open Targets search failed"}

    hits = search_data.get("data", {}).get("search", {}).get("hits", [])

    if not hits:
        return {
            "error": f"No {entity_type} found for '{query}' in Open Targets",
            "summary": f"Open Targets: no {entity_type} matches for '{query}'",
        }

    # Take the best-ranked hit; Open Targets orders hits by relevance.
    entity_id = hits[0]["id"]

    # Step 2: Fetch detailed profile (entity-specific GraphQL)
    if entity_type == "target":
        detail_gql = """
        query targetProfile($id: String!) {
          target(ensemblId: $id) {
            id
            approvedSymbol
            approvedName
            biotype
            functionDescriptions
            subcellularLocations { location }
            tractability {
              label
              modality
              value
            }
            associatedDiseases(page: {size: 10, index: 0}) {
              count
              rows {
                disease { id name }
                score
              }
            }
            knownDrugs(size: 10) {
              uniqueDrugs
              rows {
                prefName
                drugType
                mechanismOfAction
                phase
              }
            }
          }
        }
        """
        variables = {"id": entity_id}

    elif entity_type == "disease":
        detail_gql = """
        query diseaseProfile($id: String!) {
          disease(efoId: $id) {
            id
            name
            description
            therapeuticAreas { id name }
            associatedTargets(page: {size: 10, index: 0}) {
              count
              rows {
                target { id approvedSymbol }
                score
              }
            }
            knownDrugs(size: 10) {
              uniqueDrugs
              rows {
                prefName
                drugType
                phase
                mechanismOfAction
              }
            }
          }
        }
        """
        variables = {"id": entity_id}

    else:  # drug
        detail_gql = """
        query drugProfile($id: String!) {
          drug(chemblId: $id) {
            id
            name
            drugType
            maximumClinicalTrialPhase
            hasBeenWithdrawn
            description
            mechanismsOfAction {
              rows {
                mechanismOfAction
                targets { id approvedSymbol }
              }
            }
            indications {
              count
              rows {
                disease { id name }
                maxPhaseForIndication
              }
            }
          }
        }
        """
        variables = {"id": entity_id}

    try:
        detail_resp = _http_post(
            ot_url,
            json={"query": detail_gql, "variables": variables},
            headers=headers,
            timeout=15,
        )
        detail_resp.raise_for_status()
        detail_data = detail_resp.json()
    except Exception as e:
        import httpx
        if isinstance(e, httpx.TimeoutException):
            return {"error": f"Open Targets detail query timed out for {entity_id}", "summary": "Open Targets detail timed out"}
        # HTTPError and any other failure share the same payload.
        return {"error": f"Open Targets detail query failed: {e}", "summary": "Open Targets detail query failed"}

    data_root = detail_data.get("data", {})

    if entity_type == "target":
        target = data_root.get("target") or {}
        assoc = target.get("associatedDiseases", {})
        n_diseases = assoc.get("count", 0)
        top_diseases = [
            {"disease": r["disease"]["name"], "score": round(r["score"], 3)}
            for r in assoc.get("rows", [])
        ]
        known_drugs = target.get("knownDrugs", {})
        n_drugs = known_drugs.get("uniqueDrugs", 0)
        drug_rows = known_drugs.get("rows", [])
        tractability = target.get("tractability", [])

        top_disease_str = ", ".join(
            f"{d['disease']} ({d['score']:.2f})" for d in top_diseases[:3]
        )
        return {
            "summary": (
                f"Open Targets: {target.get('approvedSymbol', query)} — "
                f"{n_diseases} disease associations, "
                f"top: {top_disease_str or 'none'}. "
                f"{n_drugs} known drug(s)."
            ),
            "entity_type": "target",
            "entity_id": entity_id,
            "approved_symbol": target.get("approvedSymbol", ""),
            "approved_name": target.get("approvedName", ""),
            "biotype": target.get("biotype", ""),
            "function": target.get("functionDescriptions", []),
            "tractability": tractability,
            "n_disease_associations": n_diseases,
            "top_diseases": top_diseases,
            "n_known_drugs": n_drugs,
            "known_drugs": [
                {
                    "name": d.get("prefName", ""),
                    "type": d.get("drugType", ""),
                    "mechanism": d.get("mechanismOfAction", ""),
                    "phase": d.get("phase", 0),
                }
                for d in drug_rows[:10]
            ],
        }

    elif entity_type == "disease":
        disease = data_root.get("disease") or {}
        assoc = disease.get("associatedTargets", {})
        n_targets = assoc.get("count", 0)
        top_targets = [
            {"gene": r["target"]["approvedSymbol"], "score": round(r["score"], 3)}
            for r in assoc.get("rows", [])
        ]
        therapeutic_areas = [ta["name"] for ta in disease.get("therapeuticAreas", [])]
        known_drugs = disease.get("knownDrugs", {})
        n_drugs = known_drugs.get("uniqueDrugs", 0)

        top_target_str = ", ".join(
            f"{t['gene']} ({t['score']:.2f})" for t in top_targets[:3]
        )
        return {
            "summary": (
                f"Open Targets: {disease.get('name', query)} — "
                f"{n_targets} associated targets, "
                f"top: {top_target_str or 'none'}. "
                f"Areas: {', '.join(therapeutic_areas[:3]) or 'N/A'}."
            ),
            "entity_type": "disease",
            "entity_id": entity_id,
            "name": disease.get("name", ""),
            "description": disease.get("description", ""),
            "therapeutic_areas": therapeutic_areas,
            "n_associated_targets": n_targets,
            "top_targets": top_targets,
            "n_known_drugs": n_drugs,
        }

    else:  # drug
        drug = data_root.get("drug") or {}
        moa_rows = drug.get("mechanismsOfAction", {}).get("rows", [])
        indications = drug.get("indications", {})
        n_indications = indications.get("count", 0)
        ind_rows = indications.get("rows", [])

        mechanisms = [m.get("mechanismOfAction", "") for m in moa_rows]
        return {
            "summary": (
                f"Open Targets: {drug.get('name', query)} — "
                f"{drug.get('drugType', 'unknown')} drug, "
                f"max phase {drug.get('maximumClinicalTrialPhase', 'N/A')}, "
                f"{n_indications} indications."
            ),
            "entity_type": "drug",
            "entity_id": entity_id,
            "name": drug.get("name", ""),
            "drug_type": drug.get("drugType", ""),
            "max_clinical_phase": drug.get("maximumClinicalTrialPhase"),
            "withdrawn": drug.get("hasBeenWithdrawn", False),
            "description": drug.get("description", ""),
            "mechanisms": mechanisms,
            "n_indications": n_indications,
            "indications": [
                {"disease": r["disease"]["name"], "max_phase": r.get("maxPhaseForIndication")}
                for r in ind_rows[:15]
            ],
        }
|
|
553
|
+
|
|
554
|
+
|
|
555
|
+
# ---------------------------------------------------------------------------
|
|
556
|
+
# 3. UniProt lookup
|
|
557
|
+
# ---------------------------------------------------------------------------
|
|
558
|
+
|
|
559
|
+
# Substrings whose presence in a query suggests a non-human organism
# (parasitology vocabulary). Used to bias UniProt search candidates and
# to penalize Homo sapiens entries during relevance ranking.
_UNIPROT_NON_HUMAN_HINTS = (
    "helminth",
    "parasite",
    "schistosoma",
    "fasciola",
    "heligmosomoides",
    "nematode",
    "trematode",
    "cestode",
    "worm",
    "brugia",
    "filaria",
)

# Low-signal tokens dropped when compacting a free-text query into
# keywords (see _keyword_fallback_query). Mix of English function words
# and search-phrasing noise ("look up", "find protein", ...).
_UNIPROT_QUERY_STOPWORDS = {
    "a", "an", "the", "and", "or", "for", "from", "with", "without",
    "in", "on", "of", "to", "by", "via", "as", "that", "this", "these",
    "those", "are", "is", "was", "were", "be", "been", "being", "it",
    "its", "their", "minimal", "annotation", "annotations", "key", "keys",
    "look", "lookup", "search", "find", "protein", "proteins", "immunomodulatory",
}
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
def _query_has_non_human_hints(query: str) -> bool:
    """Return True when the query mentions any known non-human organism hint."""
    lowered = (query or "").lower()
    for hint in _UNIPROT_NON_HUMAN_HINTS:
        if hint in lowered:
            return True
    return False
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
def _keyword_fallback_query(query: str, max_terms: int = 7) -> str:
    """Compact a free-text query into up to `max_terms` informative keywords.

    Tokens are lowercased, deduplicated in order of first appearance, and
    filtered against length and the stopword list.
    """
    keywords: list[str] = []
    for token in re.findall(r"[A-Za-z0-9_-]+", (query or "").lower()):
        if len(token) < 3 or token in _UNIPROT_QUERY_STOPWORDS:
            continue
        if token not in keywords:
            keywords.append(token)
        if len(keywords) >= max_terms:
            break
    return " ".join(keywords)
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
def _extract_species_phrases(query: str, max_species: int = 3) -> list[str]:
    """Pull likely 'Genus epithet' binomial species names out of free text.

    A word pair qualifies when both halves are substantive (>= 3 chars, not
    stopwords), the epithet is purely alphabetic, and the genus either is a
    known organism hint or was capitalized in the original text.
    """
    words = re.findall(r"[A-Za-z][A-Za-z-]*", query or "")
    found: list[str] = []
    for first, second in zip(words, words[1:]):
        genus = first.lower()
        epithet = second.lower()

        if len(genus) < 3 or len(epithet) < 3:
            continue
        if genus in _UNIPROT_QUERY_STOPWORDS or epithet in _UNIPROT_QUERY_STOPWORDS:
            continue
        if not epithet.isalpha():
            continue
        # Genus must look like a proper noun or be a known organism hint.
        if genus not in _UNIPROT_NON_HUMAN_HINTS and not first[0].isupper():
            continue

        candidate = f"{genus.capitalize()} {epithet}"
        if candidate not in found:
            found.append(candidate)
        if len(found) >= max_species:
            break
    return found
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
def _build_uniprot_search_candidates(
    *,
    query: str,
    compact_query: str,
    org_clause: str | None,
) -> list[str]:
    """Generate ranked UniProt search candidates for robust retrieval.

    Candidates are ordered from most to least specific: the raw query
    (organism-restricted first), then the compact keyword form, then
    species-scoped queries, then broad parasitology fallbacks. Callers
    presumably try them in order until one returns hits — confirm against
    the caller.

    Args:
        query: The user's original free-text query.
        compact_query: Keyword-compacted form of the query (may equal `query`).
        org_clause: Optional pre-built UniProt organism filter clause, ANDed
            onto the text queries when present.

    Returns:
        Deduplicated list of at most 12 UniProt query strings, best first.
    """
    q = (query or "").strip()
    q_lc = q.lower()
    species = _extract_species_phrases(q)
    candidates: list[str] = []

    # Order-preserving dedup: first insertion wins, keeping rank meaningful.
    def add(candidate: str) -> None:
        if candidate and candidate not in candidates:
            candidates.append(candidate)

    # Tier 1: the raw query, organism-restricted variant first.
    if q:
        if org_clause:
            add(f"({q}) AND {org_clause}")
        add(q)

    # Tier 2: compacted keyword query, if it differs from the raw one.
    if compact_query and compact_query != q:
        if org_clause:
            add(f"({compact_query}) AND {org_clause}")
        add(compact_query)

    # Intent flags inferred from query wording; drive the themed candidates below.
    wants_secreted = any(x in q_lc for x in ("secreted", "excretory", "extracellular", "vesicle", "ev "))
    wants_uncharacterized = any(x in q_lc for x in ("uncharacterized", "understudied", "novel", "hypothetical"))
    wants_scp_taps = any(
        x in q_lc for x in ("venom allergen", "scp", "taps", "val", "cap superfamily", "allergen-like")
    )

    # Tier 3: species-scoped queries for each binomial detected in the text.
    for sp in species:
        sp_clause = f'organism_name:"{sp}"'
        add(sp_clause)
        if wants_secreted:
            add(f'{sp_clause} AND (secreted OR excretory OR extracellular)')
        if wants_uncharacterized:
            add(f'{sp_clause} AND (uncharacterized OR hypothetical)')
        if wants_scp_taps:
            add(f'{sp_clause} AND ("venom allergen" OR SCP OR TAPS OR VAL)')

    # Tier 4: broad parasitology fallbacks when the query sounds non-human.
    if _query_has_non_human_hints(q):
        add("parasite")
        add("helminth")
        add("schistosoma")
        add("fasciola")
        add("heligmosomoides")
        if wants_secreted:
            add("(parasite OR helminth) AND (secreted OR excretory OR extracellular)")
        if wants_scp_taps:
            add('(parasite OR helminth) AND ("venom allergen" OR SCP OR TAPS OR VAL)')

    # Keep search bounded to avoid excessive API calls.
    return candidates[:12]
|
|
684
|
+
|
|
685
|
+
|
|
686
|
+
def _entry_text_blob(entry: dict) -> str:
|
|
687
|
+
parts: list[str] = []
|
|
688
|
+
pd = entry.get("proteinDescription", {}) or {}
|
|
689
|
+
rec = (pd.get("recommendedName", {}) or {}).get("fullName", {}) or {}
|
|
690
|
+
if rec.get("value"):
|
|
691
|
+
parts.append(str(rec.get("value")))
|
|
692
|
+
for alt in (pd.get("alternativeNames", []) or []):
|
|
693
|
+
full = (alt.get("fullName", {}) or {}).get("value")
|
|
694
|
+
if full:
|
|
695
|
+
parts.append(str(full))
|
|
696
|
+
for kw in (entry.get("keywords", []) or []):
|
|
697
|
+
name = kw.get("name")
|
|
698
|
+
if name:
|
|
699
|
+
parts.append(str(name))
|
|
700
|
+
org = entry.get("organism", {}) or {}
|
|
701
|
+
sci = org.get("scientificName")
|
|
702
|
+
if sci:
|
|
703
|
+
parts.append(str(sci))
|
|
704
|
+
return " ".join(parts).lower()
|
|
705
|
+
|
|
706
|
+
|
|
707
|
+
def _entry_relevance_score(
    entry: dict,
    *,
    original_query: str,
    species_phrases: list[str],
    non_human_hints: bool,
) -> float:
    """Heuristically score how well a UniProt entry matches the user query.

    Species-phrase hits dominate (+8 each); a human entry is penalized (-10)
    when the query hints at a non-human organism. Smaller bonuses reward
    secreted / uncharacterized / SCP-TAPS signals, raw query-token overlap
    (capped at +2), and unreviewed status for "uncharacterized" queries.
    """
    query_lower = (original_query or "").lower()
    text = _entry_text_blob(entry)

    def query_mentions(*needles: str) -> bool:
        return any(needle in query_lower for needle in needles)

    def text_mentions(*needles: str) -> bool:
        return any(needle in text for needle in needles)

    want_secreted = query_mentions("secreted", "excretory", "extracellular", "vesicle", "ev ")
    want_uncharacterized = query_mentions("uncharacterized", "understudied", "novel", "hypothetical")
    want_scp_taps = query_mentions("venom allergen", "scp", "taps", "val", "cap superfamily")

    total = 0.0

    # Species alignment dominates ranking.
    total += 8.0 * sum(1 for phrase in species_phrases if phrase.lower() in text)
    if non_human_hints and "homo sapiens" in text:
        total -= 10.0

    if want_secreted and text_mentions("secreted", "excretory", "extracellular", "signal peptide"):
        total += 3.0
    if want_uncharacterized and text_mentions("uncharacterized", "hypothetical", "putative"):
        total += 3.0
    if want_scp_taps and text_mentions("venom allergen", "scp", "taps", "val", "cap"):
        total += 4.0

    # Penalize clearly off-target "query not represented in entry text":
    # only a small, capped reward for raw token overlap.
    tokens = [tok for tok in re.findall(r"[a-z0-9_-]+", query_lower) if len(tok) >= 4]
    matched = sum(1 for tok in tokens[:8] if tok in text)
    total += min(matched * 0.5, 2.0)

    if want_uncharacterized and "unreviewed" in str(entry.get("entryType", "")).lower():
        total += 1.0

    return total
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
@registry.register(
    name="data_api.uniprot_lookup",
    description="Look up comprehensive protein information from UniProt by gene symbol, UniProt ID, or protein name",
    category="data_api",
    parameters={
        "query": "Gene symbol, UniProt accession (e.g. P04637), or protein name",
        "organism": "Organism filter: common name (human/mouse/...), taxonomy ID, or 'any' (default 'human')",
    },
    requires_data=[],
    usage_guide="You need detailed protein information: function, domains, subcellular location, GO terms, PDB structures, disease involvement, tissue specificity. Comprehensive UniProt protein profile.",
)
def uniprot_lookup(query: str, organism: str = "human", **kwargs) -> dict:
    """Look up comprehensive protein data from UniProt REST API.

    Tries a direct accession fetch first; otherwise runs a ranked sequence
    of search queries and keeps the highest-scoring hit.

    Parameters
    ----------
    query:
        Gene symbol, UniProt accession (e.g. P04637), or free-text name.
    organism:
        Common name, NCBI taxonomy ID, or 'any'/'all'/'none' to disable
        organism filtering (default 'human').

    Returns
    -------
    dict with a human-readable ``summary`` plus structured fields
    (accession, gene names, function, locations, diseases, domains, GO
    terms, PDB IDs), or a dict containing ``error`` on failure.
    """
    organism_ids = {
        "human": 9606, "mouse": 10090, "rat": 10116,
        "zebrafish": 7955, "drosophila": 7227, "yeast": 559292,
    }
    organism_clean = (organism or "human").strip()
    organism_lc = organism_clean.lower()

    org_clause = None
    if organism_lc not in ("", "any", "all", "none"):
        if organism_lc.isdigit():
            org_clause = f"organism_id:{organism_lc}"
        elif organism_lc in organism_ids:
            org_clause = f"organism_id:{organism_ids[organism_lc]}"
        else:
            # Free-text organism name; strip quotes so the clause stays valid.
            escaped = organism_clean.replace('"', "")
            if escaped:
                org_clause = f'organism_name:"{escaped}"'

    # If caller left default "human" but query clearly targets non-human organisms
    # (e.g., helminth parasite proteins), do not force a human-only filter.
    if organism_lc == "human" and _query_has_non_human_hints(query):
        org_clause = None

    # Determine if query is a UniProt accession (e.g. P04637, Q9Y6K9)
    is_accession = len(query) >= 6 and query[0].isalpha() and any(c.isdigit() for c in query)
    species_phrases = _extract_species_phrases(query)
    non_human_hints = _query_has_non_human_hints(query)

    # BUG FIX: previously these names were only bound inside the search
    # branch, so the post-search "homo sapiens" guard could raise NameError
    # when a direct accession lookup succeeded. Initialize them up front and
    # drop the fragile `"name" in locals()` checks.
    attempted_queries: list[str] = []
    matched_query = None

    try:
        if is_accession and " " not in query:
            # Direct accession lookup
            resp = _http_get(
                f"https://rest.uniprot.org/uniprotkb/{query}",
                headers={"Accept": "application/json"},
                timeout=15,
                retries=2,
            )
            entries = [resp.json()] if resp.status_code == 200 else []
        else:
            entries = []

        # If direct lookup failed, search
        if not entries:
            base_query = " ".join((query or "").split())
            compact_query = _keyword_fallback_query(base_query)
            search_candidates = _build_uniprot_search_candidates(
                query=base_query,
                compact_query=compact_query,
                org_clause=org_clause,
            )

            last_status = None
            best_entry = None
            best_score = float("-inf")
            for search_query in search_candidates:
                attempted_queries.append(search_query)
                resp = _http_get(
                    "https://rest.uniprot.org/uniprotkb/search",
                    params={
                        "query": search_query,
                        "format": "json",
                        "size": 10,
                    },
                    headers={"Accept": "application/json"},
                    timeout=15,
                    retries=2,
                )
                last_status = resp.status_code
                if resp.status_code != 200:
                    continue
                data = resp.json()
                hits = data.get("results", [])
                if not hits:
                    continue

                # Keep the single best-scoring hit across all candidates.
                for hit in hits:
                    s = _entry_relevance_score(
                        hit,
                        original_query=query,
                        species_phrases=species_phrases,
                        non_human_hints=non_human_hints,
                    )
                    if s > best_score:
                        best_score = s
                        best_entry = hit
                        matched_query = search_query

                # Good-enough hit: stop issuing further API calls.
                if best_score >= 4.0:
                    break

            if best_entry is not None:
                entries = [best_entry]

            if not entries and last_status not in (None, 200):
                return {
                    "error": f"UniProt search failed (HTTP {last_status})",
                    "summary": f"UniProt search failed for '{query}'",
                    "search_attempts": attempted_queries,
                }

    except Exception as e:
        return {"error": f"UniProt API error: {e}", "summary": f"UniProt query failed for '{query}'"}

    # Reject a human-only best hit when the query clearly asked for a
    # non-human organism (matched_query is None for direct accession hits,
    # which are trusted as-is).
    if entries and non_human_hints:
        org_name = str((entries[0].get("organism", {}) or {}).get("scientificName", "")).lower()
        if "homo sapiens" in org_name and (matched_query is not None):
            return {
                "error": (
                    "Only human hits were returned for a non-human/parasite query. "
                    "Please specify organism='any' or a concrete parasite species (taxid/scientific name)."
                ),
                "summary": f"UniProt: no reliable non-human match for '{query}'",
                "search_attempts": attempted_queries,
            }

    if not entries:
        return {
            "error": f"No UniProt entry found for '{query}' (organism: {organism_clean or 'any'})",
            "summary": f"UniProt: no results for '{query}'",
            "search_attempts": attempted_queries,
        }

    entry = entries[0]

    # Extract fields
    accession = entry.get("primaryAccession", "")
    gene_names = []
    for g in entry.get("genes", []):
        gn = g.get("geneName", {}).get("value")
        if gn:
            gene_names.append(gn)
        for syn in g.get("synonyms", []):
            gene_names.append(syn.get("value", ""))

    protein_name = (
        entry.get("proteinDescription", {})
        .get("recommendedName", {})
        .get("fullName", {})
        .get("value", "Unknown")
    )

    seq_info = entry.get("sequence", {})
    seq_length = seq_info.get("length", 0)

    # Function
    function_texts = []
    for c in entry.get("comments", []):
        if c.get("commentType") == "FUNCTION":
            for t in c.get("texts", []):
                function_texts.append(t.get("value", ""))

    # Subcellular location
    subcellular = []
    for c in entry.get("comments", []):
        if c.get("commentType") == "SUBCELLULAR LOCATION":
            for sl in c.get("subcellularLocations", []):
                loc = sl.get("location", {}).get("value", "")
                if loc:
                    subcellular.append(loc)

    # Tissue specificity (last text wins, matching UniProt's single comment)
    tissue_specificity = ""
    for c in entry.get("comments", []):
        if c.get("commentType") == "TISSUE SPECIFICITY":
            for t in c.get("texts", []):
                tissue_specificity = t.get("value", "")

    # Disease involvement
    diseases = []
    for c in entry.get("comments", []):
        if c.get("commentType") == "DISEASE":
            disease = c.get("disease", {})
            if disease:
                diseases.append({
                    "name": disease.get("diseaseId", ""),
                    "description": disease.get("description", ""),
                    "acronym": disease.get("acronym", ""),
                })

    # Features: domains, GO terms
    features = entry.get("features", [])
    domains = [
        {"name": f.get("description", ""), "type": f.get("type", "")}
        for f in features
        if f.get("type") in ("Domain", "Repeat", "Zinc finger", "Motif")
    ]

    # GO terms from cross-references
    xrefs = entry.get("uniProtKBCrossReferences", [])
    go_terms = []
    pdb_ids = []
    for xref in xrefs:
        db = xref.get("database", "")
        if db == "GO":
            props = {p["key"]: p["value"] for p in xref.get("properties", [])}
            go_terms.append({
                "id": xref.get("id", ""),
                "term": props.get("GoTerm", ""),
                "evidence": props.get("GoEvidenceType", ""),
            })
        elif db == "PDB":
            pdb_ids.append(xref.get("id", ""))

    # Keywords
    keywords = [kw.get("name", "") for kw in entry.get("keywords", [])]

    primary_gene = gene_names[0] if gene_names else query
    n_pdb = len(pdb_ids)

    return {
        "summary": (
            f"UniProt {accession} ({primary_gene}): {protein_name}, "
            f"{seq_length} aa. "
            + (f"{function_texts[0][:120]}... " if function_texts else "")
            + f"{n_pdb} PDB structure(s)."
        ),
        "matched_query": matched_query if matched_query is not None else query,
        "organism_filter": org_clause or "none",
        "accession": accession,
        "gene_names": gene_names,
        "protein_name": protein_name,
        "sequence_length": seq_length,
        "function": function_texts,
        "subcellular_location": subcellular,
        "tissue_specificity": tissue_specificity,
        "diseases": diseases[:10],
        "domains": domains[:20],
        "go_terms": go_terms[:30],
        "pdb_ids": pdb_ids[:30],
        "n_pdb_structures": n_pdb,
        "keywords": keywords,
        "uniprot_url": f"https://www.uniprot.org/uniprot/{accession}",
    }
|
|
999
|
+
|
|
1000
|
+
|
|
1001
|
+
# ---------------------------------------------------------------------------
|
|
1002
|
+
# 4. PDB search
|
|
1003
|
+
# ---------------------------------------------------------------------------
|
|
1004
|
+
|
|
1005
|
+
@registry.register(
    name="data_api.pdb_search",
    description="Search RCSB PDB for protein structures by gene name, UniProt ID, or PDB ID",
    category="data_api",
    parameters={
        "query": "Gene name, UniProt accession, or 4-character PDB ID",
        "method": "Optional experimental method filter: 'X-RAY', 'EM', 'NMR'",
        "max_results": "Maximum number of structures to return (default 10)",
    },
    requires_data=[],
    usage_guide="You want to find 3D protein structures for a target — PDB IDs, resolution, method, ligands. Use for structure-based drug design and target assessment.",
)
def pdb_search(query: str, method: str = None, max_results: int = 10, **kwargs) -> dict:
    """Search RCSB PDB for structures using the search and data APIs.

    Parameters
    ----------
    query:
        Gene name, UniProt accession, or a 4-character PDB ID (fetched
        directly).
    method:
        Optional experimental method filter: 'X-RAY', 'EM', or 'NMR'
        (full RCSB method names are also accepted).
    max_results:
        Maximum number of structures to return (default 10).

    Returns
    -------
    dict with a ``summary``, total hit count, best resolution, and per-entry
    structure details, or a dict containing ``error`` on failure.
    """
    query_clean = query.strip()

    # If query looks like a PDB ID (4 chars), fetch directly
    if len(query_clean) == 4 and query_clean.isalnum():
        return _fetch_pdb_entry(query_clean)

    search_url = "https://search.rcsb.org/rcsbsearch/v2/query"
    fallback_note = ""

    def _full_text_node(value: str) -> dict:
        # One full-text terminal per term; the enclosing group ANDs them.
        return {
            "type": "terminal",
            "service": "full_text",
            "parameters": {"value": value},
        }

    def _payload(nodes: list) -> dict:
        # Shared scaffolding for the primary and the fallback search
        # (previously duplicated inline).
        return {
            "query": {
                "type": "group",
                "logical_operator": "and",
                "nodes": nodes,
            },
            "return_type": "entry",
            "request_options": {
                "paginate": {"start": 0, "rows": max_results},
                "sort": [{"sort_by": "score", "direction": "desc"}],
            },
        }

    # Split multi-term queries into individual search nodes (AND logic)
    terms = query_clean.split()
    if len(terms) > 1:
        text_nodes = [_full_text_node(term) for term in terms]
    else:
        text_nodes = [_full_text_node(query_clean)]

    # Optional experimental-method filter node.
    method_value = None
    method_node = None
    if method:
        method_upper = method.upper()
        valid_methods = ("X-RAY DIFFRACTION", "ELECTRON MICROSCOPY", "SOLUTION NMR",
                         "X-RAY", "EM", "NMR")
        if method_upper not in valid_methods:
            return {"error": f"Invalid method '{method}'. Use 'X-RAY', 'EM', or 'NMR'", "summary": f"Invalid PDB method '{method}'"}

        method_map = {
            "X-RAY": "X-RAY DIFFRACTION",
            "EM": "ELECTRON MICROSCOPY",
            "NMR": "SOLUTION NMR",
        }
        method_value = method_map.get(method_upper, method_upper)
        method_node = {
            "type": "terminal",
            "service": "text",
            "parameters": {
                "attribute": "exptl.method",
                "operator": "exact_match",
                "value": method_value,
            },
        }

    query_json = _payload(text_nodes + ([method_node] if method_node else []))

    try:
        resp = _http_post(search_url, json=query_json, timeout=15, retries=2)
        if resp.status_code != 200:
            return {
                "error": f"RCSB PDB search failed (HTTP {resp.status_code})",
                "summary": f"PDB search failed for '{query}'",
            }
        data = resp.json()
    except Exception as e:
        return {"error": f"PDB search error: {e}", "summary": f"PDB search failed for '{query}'"}

    total_count = data.get("total_count", 0)
    result_set = data.get("result_set", [])

    if not result_set and len(terms) > 1:
        # Retry with just the first term (likely the protein/gene name)
        fallback_json = _payload(
            [_full_text_node(terms[0])] + ([method_node] if method_node else [])
        )
        try:
            resp2 = _http_post(search_url, json=fallback_json, timeout=15, retries=2)
            if resp2.status_code == 200:
                data2 = resp2.json()
                result_set = data2.get("result_set", [])
                total_count = data2.get("total_count", 0)
                if result_set:
                    fallback_note = f" (broadened from '{query}' to '{terms[0]}')"
        except Exception:
            pass  # Keep original empty result

    if not result_set:
        return {
            "summary": f"No PDB structures found for '{query}'",
            "query": query,
            "total_count": 0,
            "structures": [],
        }

    pdb_ids = [r.get("identifier", "") for r in result_set if r.get("identifier")]

    # Fetch details for each PDB entry
    structures = []
    for pdb_id in pdb_ids[:max_results]:
        try:
            detail_resp = _http_get(
                f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}",
                timeout=10,
                retries=2,
            )
            if detail_resp.status_code != 200:
                structures.append({"pdb_id": pdb_id, "error": "detail fetch failed"})
                continue
            detail = detail_resp.json()

            struct_info = detail.get("struct", {})
            exptl = detail.get("exptl", [{}])[0] if detail.get("exptl") else {}
            rcsb_info = detail.get("rcsb_entry_info", {})

            # Resolution: first usable reflns value (the old loop kept only
            # the LAST entry, discarding earlier non-None values), then fall
            # back to the combined resolution field.
            resolution = next(
                (
                    refl.get("d_resolution_high")
                    for refl in detail.get("reflns", [])
                    if refl.get("d_resolution_high") is not None
                ),
                None,
            )
            if resolution is None:
                resolution = rcsb_info.get("resolution_combined", [None])
                resolution = resolution[0] if isinstance(resolution, list) and resolution else resolution

            # Get ligands from nonpolymer entities
            ligands = []
            for entity in detail.get("rcsb_entry_container_identifiers", {}).get("non_polymer_entity_ids", []):
                ligands.append(entity)

            structures.append({
                "pdb_id": pdb_id,
                "title": struct_info.get("title", ""),
                "method": exptl.get("method", ""),
                "resolution": resolution,
                "deposition_date": detail.get("rcsb_accession_info", {}).get("deposit_date", ""),
                # BUG FIX: previous code stored rcsb_entry_info.deposited_model_count
                # (an integer) under "organism". The entry-level data API does not
                # expose a single organism string, so this is left empty rather
                # than populated with wrong data.
                "organism": "",
                "n_ligands": len(ligands),
            })
        except Exception:
            structures.append({"pdb_id": pdb_id, "error": "detail fetch failed"})

    # Find best resolution
    resolutions = [s["resolution"] for s in structures if s.get("resolution")]
    best_res = min(resolutions) if resolutions else None
    best_id = None
    if best_res is not None:
        for s in structures:
            if s.get("resolution") == best_res:
                best_id = s["pdb_id"]
                break

    method_str = f" ({method})" if method else ""
    best_str = f", best resolution {best_res:.1f}A ({best_id})" if best_res and best_id else ""

    return {
        "summary": (
            f"PDB structures for {query}{method_str}: {total_count} total"
            f"{best_str}{fallback_note}"
        ),
        "query": query,
        "total_count": total_count,
        "n_returned": len(structures),
        "best_resolution": best_res,
        "best_pdb_id": best_id,
        "structures": structures,
    }
|
|
1224
|
+
|
|
1225
|
+
|
|
1226
|
+
def _fetch_pdb_entry(pdb_id: str) -> dict:
    """Fetch a single PDB entry by ID."""
    entry_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
    try:
        resp = _http_get(entry_url, timeout=10, retries=2)
        status = resp.status_code
        if status == 404:
            return {"error": f"PDB entry {pdb_id} not found", "summary": f"No PDB entry for {pdb_id}"}
        if status != 200:
            return {"error": f"PDB API returned HTTP {status}", "summary": f"PDB API error: HTTP {status}"}
        detail = resp.json()
    except Exception as e:
        return {"error": f"PDB API error: {e}", "summary": f"PDB API error: {e}"}

    struct_block = detail.get("struct", {})
    exptl_list = detail.get("exptl")
    exptl_block = exptl_list[0] if exptl_list else {}
    info_block = detail.get("rcsb_entry_info", {})

    # Resolution: value from the reflns list (last entry wins, as before),
    # falling back to the first combined-resolution value when absent.
    resolution = None
    for reflns_item in detail.get("reflns", []):
        resolution = reflns_item.get("d_resolution_high")
    if resolution is None:
        combined = info_block.get("resolution_combined", [])
        if isinstance(combined, list) and combined:
            resolution = combined[0]

    title = struct_block.get("title", "")
    exp_method = exptl_block.get("method", "")
    structure_record = {
        "pdb_id": pdb_id,
        "title": title,
        "method": exp_method,
        "resolution": resolution,
    }

    return {
        "summary": f"PDB {pdb_id}: {struct_block.get('title', 'N/A')} ({exptl_block.get('method', 'N/A')}, {resolution or 'N/A'}A)",
        "pdb_id": pdb_id,
        "title": title,
        "method": exp_method,
        "resolution": resolution,
        "deposition_date": detail.get("rcsb_accession_info", {}).get("deposit_date", ""),
        "total_count": 1,
        "structures": [structure_record],
    }
|
|
1267
|
+
|
|
1268
|
+
|
|
1269
|
+
# ---------------------------------------------------------------------------
|
|
1270
|
+
# 5. Ensembl lookup
|
|
1271
|
+
# ---------------------------------------------------------------------------
|
|
1272
|
+
|
|
1273
|
+
@registry.register(
    name="data_api.ensembl_lookup",
    description="Look up gene information from Ensembl: genomic coordinates, transcripts, cross-references",
    category="data_api",
    parameters={
        "gene": "Gene symbol (e.g. BRCA1) or Ensembl ID (e.g. ENSG00000012048)",
        "species": "Species name (default 'human')",
    },
    requires_data=[],
    usage_guide="You need gene-level genomic information: Ensembl ID, chromosome location, transcripts, biotype, cross-references. Use for gene annotation and ID mapping.",
)
def ensembl_lookup(gene: str, species: str = "human", **kwargs) -> dict:
    """Look up gene information from the Ensembl REST API.

    Parameters
    ----------
    gene:
        Gene symbol (e.g. BRCA1) or an Ensembl stable gene ID
        (e.g. ENSG00000012048, ENSMUSG..., ENSDARG...).
    species:
        Common species name or Ensembl species string (default 'human');
        used only for symbol lookups — stable-ID lookups are
        species-independent.

    Returns
    -------
    dict with a ``summary`` plus coordinates, biotype, transcripts, and
    UniProt cross-references, or a dict containing ``error`` on failure.
    """
    ensembl_base = "https://rest.ensembl.org"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}

    species_map = {
        "human": "homo_sapiens", "mouse": "mus_musculus", "rat": "rattus_norvegicus",
        "zebrafish": "danio_rerio", "drosophila": "drosophila_melanogaster",
    }
    species_name = species_map.get(species.lower(), species.lower().replace(" ", "_"))

    gene_clean = gene.strip()
    gene_upper = gene_clean.upper()

    # Determine if this is an Ensembl stable ID or a symbol.
    # GENERALIZED: previously only ENSG/ENSMUSG prefixes were recognized, so
    # rat (ENSRNOG) and zebrafish (ENSDARG) IDs — species this function
    # explicitly supports — were misrouted to symbol lookup. All Ensembl
    # stable IDs start with "ENS" and end in a long digit run; requiring
    # >= 6 digits avoids misclassifying gene symbols such as ENSA.
    if gene_upper.startswith("ENS") and sum(ch.isdigit() for ch in gene_upper) >= 6:
        # Direct ID lookup (species-independent endpoint)
        url = f"{ensembl_base}/lookup/id/{gene_clean}"
    else:
        # Symbol lookup scoped to the requested species
        url = f"{ensembl_base}/lookup/symbol/{species_name}/{gene_clean}"
    params = {"expand": 1}  # include transcript children in the response

    try:
        resp = _http_get(url, params=params, headers=headers, timeout=15, retries=2)
        if resp.status_code == 400:
            # Ensembl responds 400 for unknown symbols/IDs.
            return {
                "error": f"Gene '{gene}' not found in Ensembl ({species})",
                "summary": f"Ensembl: gene '{gene}' not found for {species}",
            }
        if resp.status_code != 200:
            return {"error": f"Ensembl API returned HTTP {resp.status_code}", "summary": f"Ensembl API error: HTTP {resp.status_code}"}
        data = resp.json()
    except Exception as e:
        return {"error": f"Ensembl API error: {e}", "summary": f"Ensembl API error: {e}"}

    ensembl_id = data.get("id", "")
    display_name = data.get("display_name", gene)
    description = data.get("description", "")
    biotype = data.get("biotype", "")
    chromosome = data.get("seq_region_name", "")
    start = data.get("start")
    end = data.get("end")
    strand = data.get("strand")

    # Parse transcripts
    transcripts = []
    for t in data.get("Transcript", []):
        transcripts.append({
            "transcript_id": t.get("id", ""),
            "display_name": t.get("display_name", ""),
            "biotype": t.get("biotype", ""),
            "is_canonical": t.get("is_canonical", 0) == 1,
            "length": t.get("length"),
        })

    n_transcripts = len(transcripts)

    # Fetch cross-references (UniProt mapping); best-effort, failures ignored.
    xrefs = []
    try:
        xref_resp = _http_get(
            f"{ensembl_base}/xrefs/id/{ensembl_id}",
            params={"external_db": "UniProt%"},
            headers=headers,
            timeout=10,
            retries=2,
        )
        if xref_resp.status_code == 200:
            for xref in xref_resp.json():
                xrefs.append({
                    "database": xref.get("dbname", ""),
                    "primary_id": xref.get("primary_id", ""),
                    "display_id": xref.get("display_id", ""),
                })
    except Exception:
        pass

    strand_str = "+" if strand == 1 else "-" if strand == -1 else "?"
    loc_str = f"chr{chromosome}:{start:,}-{end:,} ({strand_str})" if start and end else "unknown"

    return {
        "summary": (
            f"{display_name} ({ensembl_id}): {biotype}, "
            f"{loc_str}, {n_transcripts} transcripts"
        ),
        "ensembl_id": ensembl_id,
        "display_name": display_name,
        "description": description,
        "biotype": biotype,
        "chromosome": chromosome,
        "start": start,
        "end": end,
        "strand": strand,
        "location": loc_str,
        "n_transcripts": n_transcripts,
        "transcripts": transcripts[:20],
        "cross_references": xrefs[:10],
    }
|
|
1382
|
+
|
|
1383
|
+
|
|
1384
|
+
# ---------------------------------------------------------------------------
|
|
1385
|
+
# 6. NCBI Gene
|
|
1386
|
+
# ---------------------------------------------------------------------------
|
|
1387
|
+
|
|
1388
|
+
@registry.register(
    name="data_api.ncbi_gene",
    description="Query NCBI databases for gene information, ClinVar variants, or dbSNP data",
    category="data_api",
    parameters={
        "query": "Gene symbol (e.g. BRCA1) or NCBI Gene ID (e.g. 672)",
        "database": "Database to query: 'gene', 'clinvar', or 'dbsnp' (default 'gene')",
    },
    requires_data=[],
    usage_guide="You need NCBI gene summaries, ClinVar clinical variant data, or dbSNP information for a gene. Use for gene annotation, variant interpretation, and clinical genetics.",
)
def ncbi_gene(query: str, database: str = "gene", **kwargs) -> dict:
    """Query NCBI E-utilities for gene, ClinVar, or dbSNP data.

    Two-step E-utilities flow: ``esearch`` resolves the query to a list of
    UIDs in the chosen database, then ``esummary`` fetches per-UID summaries
    in one batched request.

    Args:
        query: Gene symbol (e.g. "BRCA1") or NCBI Gene ID.
        database: One of "gene", "clinvar", or "dbsnp".

    Returns:
        dict with a human-readable "summary" plus database-specific result
        lists ("genes", "variants", or "snps"); on failure a dict with an
        "error" key instead.
    """
    valid_dbs = ("gene", "clinvar", "dbsnp")
    if database not in valid_dbs:
        return {"error": f"Invalid database '{database}'. Choose from: {', '.join(valid_dbs)}", "summary": f"Invalid NCBI database '{database}'"}

    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

    # Step 1: Search for the gene/variant.  NCBI's internal E-utilities name
    # for dbSNP is "snp", so the public database name is mapped here.
    if database == "gene":
        search_term = f"{query}[Gene Name] AND Homo sapiens[Organism]"
        db = "gene"
    elif database == "clinvar":
        search_term = f"{query}[Gene Name]"
        db = "clinvar"
    else:  # dbsnp
        search_term = f"{query}[Gene Name]"
        db = "snp"

    try:
        search_resp = _http_get(
            f"{base}/esearch.fcgi",
            params={
                "db": db,
                "term": search_term,
                "retmax": 20,
                "retmode": "json",
                "sort": "relevance",
            },
            timeout=15,
            retries=2,
        )
        search_resp.raise_for_status()
        search_data = search_resp.json()
    except Exception as e:
        return {"error": f"NCBI search failed: {e}", "summary": f"NCBI query failed for '{query}'"}

    result = search_data.get("esearchresult", {})
    ids = result.get("idlist", [])
    # esearch reports "count" as a string; coerce defensively.
    total_count = int(result.get("count", 0))

    if not ids:
        return {
            "summary": f"No NCBI {database} results for '{query}'",
            "query": query,
            "database": database,
            "total_count": 0,
            "results": [],
        }

    # Step 2: Fetch summaries for the first 20 UIDs in a single call.
    try:
        summary_resp = _http_get(
            f"{base}/esummary.fcgi",
            params={
                "db": db,
                "id": ",".join(ids[:20]),
                "retmode": "json",
            },
            timeout=15,
            retries=2,
        )
        summary_resp.raise_for_status()
        summary_data = summary_resp.json()
    except Exception as e:
        return {"error": f"NCBI summary failed: {e}", "summary": f"NCBI summary lookup failed for '{query}'"}

    results_dict = summary_data.get("result", {})

    if database == "gene":
        gene_results = []
        for gid in ids:
            info = results_dict.get(gid, {})
            # esummary's "result" dict carries a bookkeeping "uids" entry
            # alongside the per-UID records; skip it.
            if not info or gid == "uids":
                continue
            gene_results.append({
                "gene_id": gid,
                "symbol": info.get("name", ""),
                "description": info.get("description", ""),
                "chromosome": info.get("chromosome", ""),
                "organism": info.get("organism", {}).get("scientificname", ""),
                "aliases": info.get("otheraliases", ""),
                "summary": info.get("summary", ""),
                "gene_type": info.get("geneticSource", ""),
                "map_location": info.get("maplocation", ""),
            })

        top = gene_results[0] if gene_results else {}
        return {
            # BUGFIX: total_count comes from the *gene* search above, not
            # from ClinVar; the previous summary mislabelled it as
            # "total ClinVar variants".
            "summary": (
                f"NCBI Gene {top.get('gene_id', '')} ({top.get('symbol', query)}): "
                f"{top.get('description', 'N/A')}, "
                f"chr{top.get('chromosome', '?')}, "
                f"{total_count} matching gene records"
            ),
            "query": query,
            "database": "gene",
            "total_count": total_count,
            "genes": gene_results,
        }

    elif database == "clinvar":
        variants = []
        for vid in ids:
            info = results_dict.get(vid, {})
            if not info or vid == "uids":
                continue
            variants.append({
                "uid": vid,
                "title": info.get("title", ""),
                "clinical_significance": info.get("clinical_significance", {}).get("description", ""),
                "gene_sort": info.get("gene_sort", ""),
                "variation_set": info.get("variation_set", []),
                "obj_type": info.get("obj_type", ""),
            })

        return {
            "summary": f"ClinVar for {query}: {total_count} total variants, showing {len(variants)}",
            "query": query,
            "database": "clinvar",
            "total_count": total_count,
            "variants": variants,
        }

    else:  # dbsnp
        snps = []
        for sid in ids:
            info = results_dict.get(sid, {})
            if not info or sid == "uids":
                continue
            snps.append({
                "uid": sid,
                "snp_id": info.get("snp_id", sid),
                "snp_class": info.get("snp_class", ""),
                "global_maf": info.get("global_mafs", []),
                "genes": info.get("genes", []),
                "clinical_significance": info.get("clinical_significance", ""),
            })

        return {
            "summary": f"dbSNP for {query}: {total_count} total SNPs, showing {len(snps)}",
            "query": query,
            "database": "dbsnp",
            "total_count": total_count,
            "snps": snps,
        }
|
|
1545
|
+
|
|
1546
|
+
|
|
1547
|
+
# ---------------------------------------------------------------------------
|
|
1548
|
+
# 7. ChEMBL advanced
|
|
1549
|
+
# ---------------------------------------------------------------------------
|
|
1550
|
+
|
|
1551
|
+
@registry.register(
    name="data_api.chembl_advanced",
    description="Advanced ChEMBL queries: compound details, target activity statistics, mechanisms, drug indications",
    category="data_api",
    parameters={
        "query": "Compound name/ChEMBL ID, target gene, or drug name",
        "search_type": "Query type: 'compound', 'target_activities', 'mechanism', or 'drug_indication' (default 'compound')",
    },
    requires_data=[],
    usage_guide="You want detailed ChEMBL data: full compound properties, aggregated bioactivity statistics for a target (min/max/median IC50), drug mechanisms of action, or approved indications. More detailed than literature.chembl_query.",
)
def chembl_advanced(query: str, search_type: str = "compound", **kwargs) -> dict:
    """Advanced ChEMBL REST API queries with aggregated statistics.

    Thin dispatcher: validates ``search_type`` and routes to the matching
    ``_chembl_*`` helper, passing the shared API base URL and JSON headers.
    """
    # Dispatch table keyed by search_type.  Insertion order matters: it is
    # reused to render the list of valid options in the error message.
    handlers = {
        "compound": _chembl_compound_search,
        "target_activities": _chembl_target_activities,
        "mechanism": _chembl_mechanism,
        "drug_indication": _chembl_drug_indication,
    }
    handler = handlers.get(search_type)
    if handler is None:
        return {"error": f"Invalid search_type '{search_type}'. Choose from: {', '.join(handlers)}", "summary": f"Invalid ChEMBL search type '{search_type}'"}

    chembl_base = "https://www.ebi.ac.uk/chembl/api/data"
    headers = {"Accept": "application/json"}
    return handler(query, chembl_base, headers)
|
|
1579
|
+
|
|
1580
|
+
|
|
1581
|
+
def _chembl_compound_search(query: str, base: str, headers: dict) -> dict:
    """Search ChEMBL for a compound with full property details."""
    # One round-trip to the free-text molecule search endpoint.
    try:
        search_resp = _http_get(
            f"{base}/molecule/search.json",
            params={"q": query, "limit": 5},
            headers=headers,
            timeout=15,
            retries=2,
        )
        search_resp.raise_for_status()
        payload = search_resp.json()
    except Exception as e:
        return {"error": f"ChEMBL compound search failed: {e}", "summary": f"ChEMBL compound search failed: {e}"}

    hits = payload.get("molecules", [])
    if not hits:
        return {
            "summary": f"No ChEMBL compounds found for '{query}'",
            "query": query,
            "compounds": [],
        }

    def _as_record(mol: dict) -> dict:
        # Flatten one API molecule into the fields we report.  The nested
        # sub-dicts can be present-but-null in the response, hence "or {}".
        props = mol.get("molecule_properties", {}) or {}
        structs = mol.get("molecule_structures", {}) or {}
        return {
            "chembl_id": mol.get("molecule_chembl_id", ""),
            "pref_name": mol.get("pref_name", ""),
            "molecule_type": mol.get("molecule_type", ""),
            "max_phase": mol.get("max_phase", 0),
            "oral": mol.get("oral", False),
            "parenteral": mol.get("parenteral", False),
            "topical": mol.get("topical", False),
            "natural_product": mol.get("natural_product", -1),
            "canonical_smiles": structs.get("canonical_smiles", ""),
            "inchi_key": structs.get("standard_inchi_key", ""),
            "molecular_weight": props.get("full_mwt"),
            "alogp": props.get("alogp"),
            "hba": props.get("hba"),
            "hbd": props.get("hbd"),
            "psa": props.get("psa"),
            "rtb": props.get("rtb"),
            "ro5_violations": props.get("num_ro5_violations"),
            "aromatic_rings": props.get("aromatic_rings"),
            "heavy_atoms": props.get("heavy_atoms"),
            "qed_weighted": props.get("qed_weighted"),
        }

    compounds = [_as_record(mol) for mol in hits]
    best = compounds[0]
    return {
        "summary": (
            f"ChEMBL compound {best['chembl_id']} ({best['pref_name'] or query}): "
            f"MW={best['molecular_weight'] or 'N/A'}, ALogP={best['alogp'] or 'N/A'}, "
            f"max phase {best['max_phase']}"
        ),
        "query": query,
        "n_results": len(compounds),
        "compounds": compounds,
    }
|
|
1643
|
+
|
|
1644
|
+
|
|
1645
|
+
def _chembl_target_activities(query: str, base: str, headers: dict) -> dict:
    """Get aggregated bioactivity statistics for a ChEMBL target.

    Resolves ``query`` to a target (preferring a human SINGLE PROTEIN
    record), fetches up to 100 IC50/Ki/Kd/EC50 activities, and returns
    per-assay-type count/min/max/median/mean statistics.

    NOTE(review): values are taken from ``standard_value`` and reported
    under *_nM keys; the API's ``standard_units`` field is not checked
    here, so non-nM records would be mixed in — confirm against callers.
    """
    # Find the target.
    try:
        tgt_resp = _http_get(
            f"{base}/target/search.json",
            params={"q": query, "limit": 5},
            headers=headers,
            timeout=15,
            retries=2,
        )
        tgt_resp.raise_for_status()
        tgt_data = tgt_resp.json()
    except Exception as e:
        return {"error": f"ChEMBL target search failed: {e}", "summary": f"ChEMBL target search failed: {e}"}
    targets = tgt_data.get("targets", [])
    if not targets:
        return {"summary": f"No ChEMBL target found for '{query}'", "query": query}

    # Prefer a human SINGLE PROTEIN record; otherwise fall back to the
    # first search hit.
    target = next(
        (
            t for t in targets
            if t.get("organism") == "Homo sapiens" and t.get("target_type") == "SINGLE PROTEIN"
        ),
        targets[0],
    )

    chembl_target_id = target.get("target_chembl_id", "")
    target_name = target.get("pref_name", query)

    # Fetch activities for the common binding/potency assay types.
    try:
        act_resp = _http_get(
            f"{base}/activity.json",
            params={
                "target_chembl_id": chembl_target_id,
                "limit": 100,
                "standard_type__in": "IC50,Ki,Kd,EC50",
            },
            headers=headers,
            timeout=15,
            retries=2,
        )
        act_resp.raise_for_status()
        act_data = act_resp.json()
    except Exception as e:
        return {"error": f"ChEMBL activity query failed: {e}", "summary": f"ChEMBL activity query failed: {e}"}
    activities = act_data.get("activities", [])

    # Aggregate statistics per assay type.
    import statistics

    by_type = {}
    unique_molecules = set()
    for act in activities:
        unique_molecules.add(act.get("molecule_chembl_id", ""))
        std_type = act.get("standard_type", "")
        std_value = act.get("standard_value")
        if std_value is None:
            continue
        try:
            by_type.setdefault(std_type, []).append(float(std_value))
        except (ValueError, TypeError):
            # Non-numeric standard_value (the API serialises it as a
            # string); skip the record.
            pass

    # FIX: the previous version also computed sorted(values) per type and
    # never used the result — a wasted O(n log n) pass; removed.
    stats = {}
    for activity_type, values in by_type.items():
        stats[activity_type] = {
            "count": len(values),
            "min_nM": round(min(values), 2),
            "max_nM": round(max(values), 2),
            "median_nM": round(statistics.median(values), 2),
            "mean_nM": round(statistics.mean(values), 2),
        }

    total_activities = sum(s["count"] for s in stats.values())
    median_str = ""
    if "IC50" in stats:
        median_str = f", median IC50 = {stats['IC50']['median_nM']:.0f} nM"

    return {
        "summary": (
            f"ChEMBL target {chembl_target_id} ({target_name}): "
            f"{total_activities} activities, "
            f"{len(unique_molecules)} unique compounds"
            f"{median_str}"
        ),
        "query": query,
        "target_chembl_id": chembl_target_id,
        "target_name": target_name,
        "organism": target.get("organism", ""),
        "target_type": target.get("target_type", ""),
        "n_unique_compounds": len(unique_molecules),
        "n_activities": total_activities,
        "activity_statistics": stats,
    }
|
|
1744
|
+
|
|
1745
|
+
|
|
1746
|
+
def _chembl_mechanism(query: str, base: str, headers: dict) -> dict:
    """Look up drug mechanisms of action in ChEMBL.

    Queries the mechanism endpoint by molecule ChEMBL ID first; if that
    returns nothing (e.g. the caller passed a drug name), resolves the name
    to a molecule ID via the search endpoint and retries.
    """
    try:
        resp = _http_get(
            f"{base}/mechanism.json",
            params={"molecule_chembl_id": query, "limit": 20},
            headers=headers,
            timeout=15,
            retries=2,
        )
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        return {"error": f"ChEMBL mechanism query failed: {e}", "summary": f"ChEMBL mechanism query failed: {e}"}
    mechanisms = data.get("mechanisms", [])

    # If no results by molecule ID, try searching by name (best-effort:
    # any failure here just leaves `mechanisms` empty).
    if not mechanisms:
        try:
            mol_resp = _http_get(
                f"{base}/molecule/search.json",
                params={"q": query, "limit": 1},
                headers=headers,
                timeout=10,
                retries=2,
            )
            mol_resp.raise_for_status()
            mol_data = mol_resp.json()
            mols = mol_data.get("molecules", [])
            if mols:
                mol_id = mols[0].get("molecule_chembl_id", "")
                resp2 = _http_get(
                    f"{base}/mechanism.json",
                    params={"molecule_chembl_id": mol_id, "limit": 20},
                    headers=headers,
                    timeout=10,
                    retries=2,
                )
                resp2.raise_for_status()
                mechanisms = resp2.json().get("mechanisms", [])
        except Exception:
            pass

    if not mechanisms:
        return {
            "summary": f"No mechanisms of action found in ChEMBL for '{query}'",
            "query": query,
            "mechanisms": [],
        }

    parsed = []
    for mech in mechanisms:
        target_id = mech.get("target_chembl_id", "")
        parsed.append({
            "mechanism": mech.get("mechanism_of_action", ""),
            "action_type": mech.get("action_type", ""),
            # BUGFIX (backward-compatible): the legacy "target_name" key
            # actually carried the ChEMBL target *ID*, not a name.  Keep it
            # so existing callers still work, and also expose the value
            # under an accurately named key.
            "target_name": target_id,
            "target_chembl_id": target_id,
            "molecule_chembl_id": mech.get("molecule_chembl_id", ""),
            "max_phase": mech.get("max_phase"),
            "direct_interaction": mech.get("direct_interaction"),
        })

    return {
        "summary": (
            f"ChEMBL mechanisms for {query}: {len(parsed)} mechanism(s). "
            + "; ".join(m["mechanism"] for m in parsed[:3])
        ),
        "query": query,
        "n_mechanisms": len(parsed),
        "mechanisms": parsed,
    }
|
|
1816
|
+
|
|
1817
|
+
|
|
1818
|
+
def _chembl_drug_indication(query: str, base: str, headers: dict) -> dict:
    """Look up approved drug indications in ChEMBL.

    Resolves ``query`` to a molecule ChEMBL ID when it is not already one,
    fetches up to 30 drug_indication records, and counts those at phase 4
    (approved).
    """

    def _is_approved_phase(phase) -> bool:
        # ROBUSTNESS FIX: max_phase_for_ind is serialised as a number by
        # older ChEMBL API versions and as a string (e.g. "4.0") by newer
        # ones; the previous strict `== 4` comparison missed the string
        # form.  Compare numerically and treat unparsable values as not
        # approved.
        try:
            return float(phase) == 4.0
        except (TypeError, ValueError):
            return False

    # Resolve molecule ID (best-effort: on failure, fall through with the
    # raw query and let the indication endpoint return nothing).
    mol_id = query
    try:
        if not query.upper().startswith("CHEMBL"):
            mol_resp = _http_get(
                f"{base}/molecule/search.json",
                params={"q": query, "limit": 1},
                headers=headers,
                timeout=10,
                retries=2,
            )
            mol_resp.raise_for_status()
            mols = mol_resp.json().get("molecules", [])
            if mols:
                mol_id = mols[0].get("molecule_chembl_id", "")
    except Exception:
        pass

    try:
        resp = _http_get(
            f"{base}/drug_indication.json",
            params={"molecule_chembl_id": mol_id, "limit": 30},
            headers=headers,
            timeout=15,
            retries=2,
        )
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        return {"error": f"ChEMBL indication query failed: {e}", "summary": f"ChEMBL indication query failed: {e}"}
    indications = data.get("drug_indications", [])
    if not indications:
        return {
            "summary": f"No drug indications found in ChEMBL for '{query}'",
            "query": query,
            "indications": [],
        }

    parsed = []
    for ind in indications:
        parsed.append({
            "indication": ind.get("mesh_heading", ""),
            "mesh_id": ind.get("mesh_id", ""),
            "efo_id": ind.get("efo_id", ""),
            "max_phase": ind.get("max_phase_for_ind"),
            "molecule_chembl_id": ind.get("molecule_chembl_id", ""),
        })

    approved = [p for p in parsed if _is_approved_phase(p.get("max_phase"))]
    return {
        "summary": (
            f"ChEMBL indications for {query} ({mol_id}): "
            f"{len(parsed)} total, {len(approved)} approved. "
            + "; ".join(p["indication"] for p in parsed[:5])
        ),
        "query": query,
        "molecule_chembl_id": mol_id,
        "n_indications": len(parsed),
        "n_approved": len(approved),
        "indications": parsed,
    }
|
|
1881
|
+
|
|
1882
|
+
|
|
1883
|
+
# ---------------------------------------------------------------------------
|
|
1884
|
+
# 8. Drug information lookup (via PubChem)
|
|
1885
|
+
# ---------------------------------------------------------------------------
|
|
1886
|
+
|
|
1887
|
+
@registry.register(
    name="data_api.drug_info",
    description="Look up comprehensive drug information: pharmacology, properties, interactions, indications",
    category="data_api",
    parameters={
        "query": "Drug name (e.g. 'imatinib') or compound name",
        "include": "Information to include: list of 'pharmacology', 'interactions', 'properties' (default ['pharmacology', 'interactions'])",
    },
    requires_data=[],
    usage_guide="You want drug pharmacology, properties, and interaction data. Uses PubChem PUG REST and PUG View APIs for comprehensive drug information.",
)
def drug_info(query: str, include: list = None, **kwargs) -> dict:
    """Look up drug information via PubChem REST API.

    Uses PubChem PUG REST and PUG View APIs to retrieve drug properties,
    pharmacology, and interaction data.

    Four sequential steps, each best-effort after step 1:
      1. Resolve the (normalized) drug name to a PubChem CID, trying
         progressively smaller fragments of the query as fallbacks.
      2. Fetch computed compound properties (PUG REST).
      3. Fetch pharmacology / indication / interaction text from the
         "Drug and Medication Information" PUG View section.
      4. Fetch synonyms and scan them for a DrugBank-style ID.

    Returns a dict with a human-readable "summary" plus structured fields;
    only step 1 failures produce an "error" key — later steps silently
    degrade to empty results.
    """
    if include is None:
        include = ["pharmacology", "interactions"]

    # Normalize drug name
    raw_query = query
    query = _normalize_drug_query(query)
    if not query:
        return {
            "error": "Drug query is required",
            "summary": "PubChem: query cannot be empty",
        }

    pug_base = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
    pugview_base = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"

    def _query_candidates(text: str) -> list[str]:
        # Build an ordered, de-duplicated list of lookup candidates:
        # the full text first, then separator-split fragments, then
        # individual tokens of >= 3 chars containing a letter.
        candidates = []
        seen = set()

        def _add(candidate: str):
            # Collapse internal whitespace; de-duplicate case-insensitively.
            c = " ".join((candidate or "").split()).strip()
            if not c or c.lower() in seen:
                return
            seen.add(c.lower())
            candidates.append(c)

        _add(text)
        for part in re.split(r"[;,/|()]|\bor\b|\band\b", text, flags=re.IGNORECASE):
            _add(part)
        for token in text.split():
            cleaned = token.strip(" ,;:/|()[]{}")
            if len(cleaned) >= 3 and re.search(r"[A-Za-z]", cleaned):
                _add(cleaned)
        return candidates

    # Step 1: Resolve drug name to CID (with alias/fallback attempts)
    import urllib.parse

    cid = None
    resolved_query = raw_query
    lookup_errors = []
    for candidate in _query_candidates(raw_query):
        encoded_query = urllib.parse.quote(candidate, safe="")
        try:
            resp = _http_get(
                f"{pug_base}/compound/name/{encoded_query}/cids/JSON",
                timeout=10,
                retries=2,
            )
            # 404 just means "this candidate isn't a known name" — try the
            # next fallback candidate rather than treating it as an error.
            if resp.status_code == 404:
                continue
            resp.raise_for_status()
            cid_data = resp.json()
        except Exception as e:
            lookup_errors.append(f"{candidate}: {e}")
            continue

        cids = cid_data.get("IdentifierList", {}).get("CID", [])
        if cids:
            cid = cids[0]
            resolved_query = candidate
            break

    if cid is None:
        # Distinguish "lookup errored" from "genuinely not found".
        if lookup_errors:
            return {
                "error": f"PubChem CID lookup failed: {lookup_errors[0]}",
                "summary": f"PubChem CID lookup failed for '{raw_query}'",
                "tried_queries": _query_candidates(raw_query)[:5],
            }
        return {
            "error": f"Drug '{raw_query}' not found in PubChem",
            "summary": f"PubChem: no compound found for '{raw_query}'",
            "tried_queries": _query_candidates(raw_query)[:5],
        }

    # Step 2: Get compound properties (best-effort; failures leave
    # `properties` empty).
    properties = {}
    try:
        props_resp = _http_get(
            f"{pug_base}/compound/cid/{cid}/property/"
            "MolecularFormula,MolecularWeight,CanonicalSMILES,IsomericSMILES,"
            "XLogP,ExactMass,TPSA,HBondDonorCount,HBondAcceptorCount,"
            "RotatableBondCount,HeavyAtomCount,Complexity,InChIKey/JSON",
            timeout=10,
            retries=2,
        )
        if props_resp.status_code == 200:
            prop_table = props_resp.json().get("PropertyTable", {}).get("Properties", [])
            if prop_table:
                properties = prop_table[0]
    except Exception:
        pass

    # Step 3: Get drug/medication information from PUG View
    pharmacology = {}
    interactions = []
    # NOTE(review): this local dict shadows the enclosing function's name
    # `drug_info`.  Harmless here (the function never calls itself), but
    # worth renaming in a behavior-changing pass.
    drug_info = {}

    if "pharmacology" in include or "interactions" in include:
        try:
            view_resp = _http_get(
                f"{pugview_base}/data/compound/{cid}/JSON",
                params={"heading": "Drug and Medication Information"},
                timeout=15,
                retries=2,
            )
            if view_resp.status_code == 200:
                view_data = view_resp.json()
                record = view_data.get("Record", {})
                sections = record.get("Section", [])

                # Walk top-level sections and their immediate subsections,
                # matching on TOCHeading.  Repeated headings overwrite, so
                # the last matching entry wins (except interactions, which
                # accumulate).
                for section in sections:
                    heading = section.get("TOCHeading", "")  # NOTE: currently unused
                    for subsection in section.get("Section", []):
                        sub_heading = subsection.get("TOCHeading", "")
                        info_list = subsection.get("Information", [])

                        if sub_heading == "Drug Indication":
                            for info in info_list:
                                val = info.get("Value", {}).get("StringWithMarkup", [])
                                if val:
                                    drug_info["indication"] = val[0].get("String", "")[:500]

                        elif sub_heading == "Mechanism of Action":
                            for info in info_list:
                                val = info.get("Value", {}).get("StringWithMarkup", [])
                                if val:
                                    pharmacology["mechanism_of_action"] = val[0].get("String", "")[:500]

                        elif sub_heading == "Pharmacology":
                            for info in info_list:
                                val = info.get("Value", {}).get("StringWithMarkup", [])
                                if val:
                                    pharmacology["pharmacology"] = val[0].get("String", "")[:500]

                        elif sub_heading == "Absorption":
                            for info in info_list:
                                val = info.get("Value", {}).get("StringWithMarkup", [])
                                if val:
                                    pharmacology["absorption"] = val[0].get("String", "")[:300]

                        elif "Drug Interaction" in sub_heading or "Drug-Drug" in sub_heading:
                            for info in info_list:
                                val = info.get("Value", {}).get("StringWithMarkup", [])
                                if val:
                                    interactions.append(val[0].get("String", "")[:200])
        except Exception:
            pass

    # Step 4: Get synonyms for the drug
    synonyms = []
    try:
        syn_resp = _http_get(
            f"{pug_base}/compound/cid/{cid}/synonyms/JSON",
            timeout=10,
            retries=2,
        )
        if syn_resp.status_code == 200:
            syn_list = syn_resp.json().get("InformationList", {}).get("Information", [])
            if syn_list:
                synonyms = syn_list[0].get("Synonym", [])[:15]
    except Exception:
        pass

    # Find DrugBank ID in synonyms (pattern: "DB" + 5 digits, 7 chars total)
    drugbank_id = ""
    for syn in synonyms:
        if syn.upper().startswith("DB") and len(syn) == 7 and syn[2:].isdigit():
            drugbank_id = syn
            break

    mw = properties.get("MolecularWeight", "N/A")
    formula = properties.get("MolecularFormula", "N/A")
    smiles = properties.get("CanonicalSMILES", "N/A")
    mechanism = pharmacology.get("mechanism_of_action", "N/A")

    drugbank_str = f" ({drugbank_id})" if drugbank_id else ""
    # Truncate the mechanism text for the one-line summary.
    mech_short = mechanism[:80] + "..." if len(mechanism) > 80 else mechanism
    resolved_note = ""
    if resolved_query.lower() != raw_query.lower():
        resolved_note = f" [resolved as '{resolved_query}']"

    return {
        "summary": (
            f"{raw_query}{resolved_note}{drugbank_str}: {mech_short}, "
            f"MW {mw}, {len(interactions)} known drug interactions."
        ),
        "query": raw_query,
        "resolved_query": resolved_query,
        "cid": cid,
        "drugbank_id": drugbank_id,
        "properties": {
            "molecular_formula": formula,
            "molecular_weight": mw,
            "canonical_smiles": smiles,
            "isomeric_smiles": properties.get("IsomericSMILES", ""),
            "xlogp": properties.get("XLogP"),
            "tpsa": properties.get("TPSA"),
            "hbd": properties.get("HBondDonorCount"),
            "hba": properties.get("HBondAcceptorCount"),
            "rotatable_bonds": properties.get("RotatableBondCount"),
            "inchi_key": properties.get("InChIKey", ""),
        },
        "pharmacology": pharmacology,
        "drug_info": drug_info,
        "interactions": interactions[:20],
        "n_interactions": len(interactions),
        "synonyms": synonyms,
        "pubchem_url": f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}",
    }
|