celltype-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/tools/target.py
ADDED
|
@@ -0,0 +1,901 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Target discovery tools: neosubstrate scoring, degron prediction, co-essentiality.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import numpy as np
|
|
7
|
+
from ct.tools import registry
|
|
8
|
+
from ct.tools.http_client import request
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@registry.register(
    name="target.neosubstrate_score",
    description="Score proteins as potential neosubstrate targets based on degradation selectivity and magnitude",
    category="target",
    parameters={"proteomics_path": "Path to proteomics LFC matrix", "top_n": "Number of top targets to return"},
    requires_data=["proteomics"],
    usage_guide="You want to discover new degradation targets from proteomics data — ranks proteins by selective, potent degradation across compounds. Use early in target discovery campaigns.",
)
def neosubstrate_score(proteomics_path: str = None, top_n: int = 50, **kwargs) -> dict:
    """Rank proteins as neosubstrate candidates from a proteomics LFC matrix.

    Every row of the matrix is treated as one protein and every column as one
    measurement (presumably one compound each -- confirm against the loader).
    A measurement counts as "degraded" when its value is below -0.5.

    score = selectivity * |mean degraded LFC| * log2(n_degraders + 1)

    where ``selectivity = 1 - n_degraders / n_measured`` (so a HIGHER value
    means FEWER columns degrade the protein).

    Args:
        proteomics_path: CSV whose first column is the protein identifier.
            When ``None`` the project loader ``load_proteomics()`` is used.
        top_n: Maximum number of ranked proteins to return.
        **kwargs: Ignored; accepted so the registry can pass extra arguments.

    Returns:
        A dict with ``summary``, ``top_targets`` (list of per-protein records,
        optionally carrying ``gene_symbol``) and ``n_proteins_scored``; or a
        dict with ``error`` and ``summary`` when the default dataset is absent.
    """
    # Load proteomics data
    if proteomics_path is None:
        try:
            from ct.data.loaders import load_proteomics
            prot = load_proteomics()
        except FileNotFoundError:
            return {
                "error": "Proteomics data not available.",
                "summary": "Proteomics data not available — skipping. Provide proteomics data for full analysis.",
            }
    else:
        prot = pd.read_csv(proteomics_path, index_col=0)

    results = []
    for protein in prot.index:
        values = prot.loc[protein].dropna()
        degraded = values[values < -0.5]
        if len(degraded) == 0:
            continue

        n_degraders = len(degraded)
        mean_deg = degraded.mean()
        # Selectivity: 1 - fraction of measurements that degrade the protein
        # (higher = degraded by fewer columns = more selective).
        selectivity = 1.0 - (n_degraders / len(values))

        score = selectivity * abs(mean_deg) * np.log2(n_degraders + 1)

        results.append({
            "protein": protein,
            "score": score,
            "n_degraders": n_degraders,
            "mean_degradation": mean_deg,
            "selectivity": selectivity,
        })

    if not results:
        return {
            "summary": f"No neosubstrate candidates found in {len(prot)} proteins (none degraded below -0.5 LFC)",
            "top_targets": [],
            "n_proteins_scored": 0,
        }

    df = pd.DataFrame(results).sort_values("score", ascending=False).head(top_n)

    def _looks_like_accession(label) -> bool:
        # Index labels are not guaranteed to be strings (e.g. integer row
        # numbers), so check the type before using string methods.
        return (
            isinstance(label, str)
            and len(label) >= 6
            and label[0].isalpha()
            and any(ch.isdigit() for ch in label)
            and " " not in label
        )

    # Map UniProt IDs to gene symbols if protein IDs look like UniProt accessions.
    # Only the first three labels are sampled to decide; this is best-effort
    # enrichment and must never fail the scoring itself.
    top_proteins = df["protein"].tolist()
    if top_proteins and all(_looks_like_accession(p) for p in top_proteins[:3]):
        accessions = [p for p in top_proteins if _looks_like_accession(p)]
        # Use the shared HTTP helper like the other tools in this module.
        resp, error = request(
            "GET",
            "https://rest.uniprot.org/uniprotkb/accessions",
            params={"accessions": ",".join(accessions), "fields": "accession,gene_primary"},
            headers={"Accept": "application/json"},
            timeout=15,
            raise_for_status=False,
        )
        id_to_gene = {}
        if not error and resp is not None and resp.status_code == 200:
            try:
                for entry in resp.json().get("results", []):
                    genes = entry.get("genes") or []
                    if not genes:
                        continue
                    gene_name = genes[0].get("geneName", {}).get("value", "")
                    if gene_name:
                        id_to_gene[entry.get("primaryAccession", "")] = gene_name
            except (ValueError, AttributeError, TypeError, IndexError, KeyError):
                # Malformed or unexpected payload: skip the enrichment.
                id_to_gene = {}
        if id_to_gene:
            df["gene_symbol"] = df["protein"].map(id_to_gene)

    return {
        "summary": f"Top {min(top_n, len(results))} neosubstrate candidates scored from {len(prot)} proteins",
        "top_targets": df.to_dict("records"),
        "n_proteins_scored": len(results),
    }
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@registry.register(
    name="target.degron_predict",
    description="Predict structural degron motifs in a protein (zinc fingers, disordered regions, surface accessibility) using UniProt features",
    category="target",
    parameters={"uniprot_id": "UniProt ID of target protein (e.g. P04637 for TP53)"},
    requires_data=[],
    usage_guide="You want to assess whether a protein has structural features (zinc fingers, disordered loops) that make it amenable to E3-mediated degradation. Use after identifying a target of interest.",
)
def degron_predict(uniprot_id: str, **kwargs) -> dict:
    """Heuristically score how degradable a protein looks from its UniProt entry.

    Fetches ``https://rest.uniprot.org/uniprotkb/<id>.json`` and sums five
    capped components into a 0-1 score:

    * zinc fingers:          0.15 each, capped at 0.30
    * disordered fraction:   0.5 * fraction, capped at 0.25
    * lysine density:        3.0 * density, capped at 0.20
    * annotated Ub sites:    0.05 each, capped at 0.15
    * protein size:          0.10 below 800 aa, 0.05 below 1500 aa, else 0

    A score >= 0.5 is classified "high", >= 0.25 "moderate", otherwise "low".

    Args:
        uniprot_id: UniProt accession; surrounding whitespace is ignored.
        **kwargs: Ignored; accepted so the registry can pass extra arguments.

    Returns:
        A dict with ``summary``, identifiers, ``degradability_score``,
        ``classification``, ``rationale``, ``score_breakdown`` and a
        ``features`` sub-dict; or a dict with ``error`` and ``summary`` when
        the ID is empty or the entry cannot be fetched or parsed.
    """
    import re

    accession = str(uniprot_id or "").strip()
    if not accession:
        # Avoid a pointless request to ".../uniprotkb/.json".
        message = "No UniProt ID provided"
        return {"error": message, "summary": message}
    uniprot_id = accession

    # Fetch protein features from UniProt API
    resp, error = request(
        "GET",
        f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json",
        timeout=30,
        headers={"Accept": "application/json"},
        raise_for_status=False,
    )
    if error:
        return {"error": f"Failed to fetch UniProt data: {error}", "summary": f"Failed to fetch UniProt data: {error}"}
    if resp.status_code != 200:
        return {"error": f"UniProt entry not found for {uniprot_id} (HTTP {resp.status_code})", "summary": f"UniProt entry not found for {uniprot_id} (HTTP {resp.status_code})"}
    try:
        data = resp.json()
    except Exception:
        return {"error": f"Invalid UniProt JSON response for {uniprot_id}", "summary": f"Invalid UniProt JSON response for {uniprot_id}"}

    protein_name = data.get("proteinDescription", {}).get("recommendedName", {}).get("fullName", {}).get("value", uniprot_id)
    gene_name = ""
    genes = data.get("genes", [])
    if genes:
        gene_name = genes[0].get("geneName", {}).get("value", "")
    sequence = data.get("sequence", {})
    seq_length = sequence.get("length", 0)

    # Extract structural features relevant to degradation
    features = data.get("features", [])
    zinc_fingers = [f for f in features if f.get("type") == "Zinc finger"]
    domains = [f for f in features if f.get("type") == "Domain"]
    disordered = [f for f in features if f.get("type") == "Region" and "Disordered" in f.get("description", "")]
    motifs = [f for f in features if f.get("type") == "Motif"]
    modifications = [f for f in features if f.get("type") in ("Modified residue", "Cross-link")]

    def _region_length(feat):
        # Inclusive residue span; 0 when either coordinate is missing or falsy.
        loc = feat.get("location", {})
        start = loc.get("start", {}).get("value", 0)
        end = loc.get("end", {}).get("value", 0)
        return max(0, end - start + 1) if start and end else 0

    disordered_residues = sum(_region_length(f) for f in disordered)
    disorder_fraction = disordered_residues / seq_length if seq_length > 0 else 0

    # Known degron-associated domain types. "ring" is matched as a whole word
    # so that names merely containing the letters (e.g. "Kringle") do not
    # count; the remaining keys are distinctive enough for substring matching.
    ring_word = re.compile(r"\bring\b")
    substring_keys = ("zinc finger", "btb", "wd40", "kelch", "socs box", "f-box")
    degron_domains = []
    for d in domains:
        desc = d.get("description", "").lower()
        if ring_word.search(desc) or any(k in desc for k in substring_keys):
            degron_domains.append(d.get("description", "unknown"))

    # Lysine count from sequence (ubiquitination sites)
    raw_seq = sequence.get("value", "")
    lysine_count = raw_seq.count("K") if raw_seq else 0
    lysine_density = lysine_count / seq_length if seq_length > 0 else 0

    # Known ubiquitination sites from modifications
    ub_sites = [m for m in modifications if "ubiquit" in m.get("description", "").lower()]

    # Compute overall degradability score (0-1 heuristic)
    score = 0.0
    score_breakdown = {}

    # Zinc fingers are strong degron features (CRBN/IKZF-type)
    zf_score = min(len(zinc_fingers) * 0.15, 0.3)
    score += zf_score
    score_breakdown["zinc_fingers"] = zf_score

    # Disordered regions expose protein to E3 engagement
    disorder_score = min(disorder_fraction * 0.5, 0.25)
    score += disorder_score
    score_breakdown["disorder"] = disorder_score

    # Lysine density enables ubiquitination
    lys_score = min(lysine_density * 3.0, 0.2)
    score += lys_score
    score_breakdown["lysine_accessibility"] = lys_score

    # Known ubiquitination sites
    ub_score = min(len(ub_sites) * 0.05, 0.15)
    score += ub_score
    score_breakdown["known_ub_sites"] = ub_score

    # Small-medium proteins degrade more easily. An unknown length (0) earns
    # no bonus: absence of sequence data is not evidence of a small protein.
    if 0 < seq_length < 800:
        size_score = 0.1
    elif 0 < seq_length < 1500:
        size_score = 0.05
    else:
        size_score = 0.0
    score += size_score
    score_breakdown["protein_size"] = size_score

    score = min(score, 1.0)

    # Classify
    if score >= 0.5:
        classification = "high"
        rationale = "Strong structural features for E3-mediated degradation"
    elif score >= 0.25:
        classification = "moderate"
        rationale = "Some favorable features; may require linker/scaffold optimization"
    else:
        classification = "low"
        rationale = "Few structural degron features identified"

    return {
        "summary": (
            f"Degron prediction for {gene_name or uniprot_id} ({protein_name}): "
            f"{classification} degradability (score={score:.2f}). "
            f"{len(zinc_fingers)} zinc finger(s), {disordered_residues} disordered residues "
            f"({disorder_fraction:.0%}), {lysine_count} lysines, {len(ub_sites)} known Ub site(s)."
        ),
        "uniprot_id": uniprot_id,
        "gene": gene_name,
        "protein_name": protein_name,
        "seq_length": seq_length,
        "degradability_score": round(score, 3),
        "classification": classification,
        "rationale": rationale,
        "score_breakdown": {k: round(v, 3) for k, v in score_breakdown.items()},
        "features": {
            "zinc_fingers": len(zinc_fingers),
            "zinc_finger_details": [
                {"description": f.get("description", ""), "start": f.get("location", {}).get("start", {}).get("value"), "end": f.get("location", {}).get("end", {}).get("value")}
                for f in zinc_fingers
            ],
            "disordered_residues": disordered_residues,
            "disorder_fraction": round(disorder_fraction, 3),
            "domains": [d.get("description", "") for d in domains],
            "degron_associated_domains": degron_domains,
            "lysine_count": lysine_count,
            "lysine_density": round(lysine_density, 3),
            "known_ub_sites": len(ub_sites),
            "motifs": [m.get("description", "") for m in motifs],
        },
    }
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
@registry.register(
    name="target.coessentiality",
    description="Find co-essential and synthetic lethal partners for a target gene using DepMap CRISPR data",
    category="target",
    parameters={"gene": "Gene symbol", "top_n": "Number of partners to return"},
    requires_data=["depmap_crispr"],
    usage_guide="You need to validate a drug target by finding functionally related genes, or identify synthetic lethal partners for combination therapy. Also useful for understanding pathway context of a gene.",
)
def coessentiality(gene: str, top_n: int = 20, **kwargs) -> dict:
    """Correlate one gene's CRISPR dependency profile against every other gene.

    For each other column of the matrix returned by ``load_crispr()`` a
    Pearson correlation is computed over the rows where both genes have a
    value; pairs sharing fewer than 50 rows are skipped, as are pairs whose
    correlation is undefined (NaN, e.g. a constant column).

    Args:
        gene: Column name (gene symbol) to use as the query.
        top_n: Number of genes to report at each end of the ranking.
        **kwargs: Ignored; accepted so the registry can pass extra arguments.

    Returns:
        A dict with ``summary``, ``gene``, ``co_essential`` (the ``top_n``
        highest-r records) and ``synthetic_lethal`` (the ``top_n`` lowest-r
        records, most negative first). Each record has ``gene``, ``r`` and
        ``p``. The two lists are the extremes of one ranking and can overlap
        when fewer than ``2 * top_n`` genes were tested. Returns a dict with
        ``error`` and ``summary`` when ``gene`` is not a column.
    """
    from ct.data.loaders import load_crispr

    crispr = load_crispr()

    if gene not in crispr.columns:
        return {"error": f"Gene {gene} not found in DepMap CRISPR data", "summary": f"Gene {gene} not found in DepMap CRISPR data"}
    target_vals = crispr[gene].dropna()

    # Imported once, after the cheap validation above, rather than on every
    # loop iteration.
    from scipy import stats

    correlations = []
    for other_gene in crispr.columns:
        if other_gene == gene:
            continue
        other_vals = crispr[other_gene].dropna()
        common = target_vals.index.intersection(other_vals.index)
        if len(common) < 50:
            continue

        r, p = stats.pearsonr(target_vals[common], other_vals[common])
        # An undefined correlation would sort to the END of the ranking and
        # be reported as the strongest "synthetic lethal" hit; drop it.
        if not np.isfinite(r):
            continue
        correlations.append({"gene": other_gene, "r": r, "p": p})

    if not correlations:
        return {
            "summary": f"Co-essentiality network for {gene}: no genes with sufficient shared cell lines (>=50)",
            "gene": gene,
            "co_essential": [],
            "synthetic_lethal": [],
        }

    df = pd.DataFrame(correlations).sort_values("r", ascending=False)

    co_essential = df.head(top_n).to_dict("records")
    synthetic_lethal = df.tail(top_n).sort_values("r").to_dict("records")

    return {
        "summary": f"Co-essentiality network for {gene}: {len(correlations)} genes tested",
        "gene": gene,
        "co_essential": co_essential,
        "synthetic_lethal": synthetic_lethal,
    }
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
@registry.register(
    name="target.druggability",
    description="Assess the druggability of a protein target using UniProt annotations (protein family, domains, ligands, structural coverage)",
    category="target",
    parameters={"gene": "Gene symbol (e.g. BRAF, EGFR)"},
    requires_data=[],
    usage_guide="You want to evaluate whether a target protein is druggable — checks protein class, known ligands, structural data, and surface accessibility. Use early in target prioritization.",
)
def druggability(gene: str, **kwargs) -> dict:
    """Score a human gene's druggability from its first UniProt search hit.

    The score is the sum (capped at 1.0) of five components: a protein-class
    weight chosen by keyword/domain text, ChEMBL cross-references (0.10 each,
    max 0.25), a surface-localisation bonus (0.15), PDB cross-references
    (0.02 each, max 0.15) and a transmembrane bonus (0.10).

    Returns a result dict (summary, identifiers, score, classification,
    evidence lists, reasoning and score breakdown), or a dict with ``error``
    and ``summary`` when UniProt cannot be queried or has no matching entry.
    """
    response, fetch_error = request(
        "GET",
        "https://rest.uniprot.org/uniprotkb/search",
        params={
            "query": f"gene_exact:{gene} AND organism_id:9606",
            "format": "json",
            "size": "1",
        },
        timeout=10,
        headers={"Accept": "application/json"},
        raise_for_status=False,
    )
    if fetch_error:
        return {"error": f"Failed to fetch UniProt data: {fetch_error}", "summary": f"UniProt API error for {gene}"}
    if response.status_code != 200:
        return {"error": f"UniProt search failed for {gene} (HTTP {response.status_code})", "summary": f"Failed to query UniProt for {gene}"}
    try:
        payload = response.json()
    except Exception:
        return {"error": f"Invalid UniProt response for {gene}", "summary": f"Failed to parse UniProt data for {gene}"}

    hits = payload.get("results", [])
    if not hits:
        return {"error": f"No UniProt entry found for {gene} in human", "summary": f"Gene {gene} not found in UniProt (human)"}
    entry = hits[0]

    # Identity of the matched entry.
    name_block = entry.get("proteinDescription", {}).get("recommendedName", {})
    protein_name = name_block.get("fullName", {}).get("value", gene)
    uniprot_id = entry.get("primaryAccession", "")

    # One pass over the sequence features: domain names and TM segments.
    domains = []
    transmembrane = []
    for feature in entry.get("features", []):
        feature_type = feature.get("type")
        if feature_type == "Domain":
            domains.append(feature.get("description", ""))
        elif feature_type == "Transmembrane":
            transmembrane.append(feature)
    keywords = [kw.get("name", "") for kw in entry.get("keywords", [])]

    # Non-empty subcellular location names.
    subcellular_locs = [
        site.get("location", {}).get("value", "")
        for comment in entry.get("comments", [])
        if comment.get("commentType") == "SUBCELLULAR LOCATION"
        for site in comment.get("subcellularLocations", [])
        if site.get("location", {}).get("value", "")
    ]

    # One pass over the cross-references: ChEMBL and PDB identifiers.
    known_drugs = []
    pdb_ids = []
    for xref in entry.get("uniProtKBCrossReferences", []):
        database = xref.get("database")
        if database == "ChEMBL":
            known_drugs.append(xref.get("id", ""))
        elif database == "PDB":
            pdb_ids.append(xref.get("id", ""))
    n_chembl = len(known_drugs)
    n_pdb = len(pdb_ids)

    # Protein class: the first rule whose needle occurs in the lower-cased
    # keyword + domain text wins; rule order therefore matters.
    all_annotations = " ".join([k.lower() for k in keywords] + [d.lower() for d in domains])
    class_rules = (
        (("kinase", "protein kinase"), "kinase", 0.35),
        (("g-protein coupled receptor", "gpcr"), "GPCR", 0.35),
        (("ion channel", "voltage-gated"), "ion_channel", 0.30),
        (("nuclear hormone receptor", "nuclear receptor"), "nuclear_receptor", 0.30),
        (("protease", "peptidase"), "protease", 0.25),
        (("phosphatase",), "phosphatase", 0.25),
        (("transferase",), "transferase", 0.20),
        (("transcription factor", "transcription"), "transcription_factor", 0.10),
        (("scaffold", "adaptor"), "scaffold_adaptor", 0.05),
    )
    protein_class = "other"
    class_score = 0.0
    for needles, label, weight in class_rules:
        if any(needle in all_annotations for needle in needles):
            protein_class = label
            class_score = weight
            break

    # Component scores.
    ligand_score = min(n_chembl * 0.10, 0.25)
    surface_keywords = ["secreted", "cell membrane", "extracellular"]
    is_surface = False
    for location in subcellular_locs:
        lowered = location.lower()
        if any(marker in lowered for marker in surface_keywords):
            is_surface = True
            break
    surface_score = 0.15 if is_surface else 0.0
    structure_score = min(n_pdb * 0.02, 0.15)
    tm_score = 0.10 if transmembrane else 0.0

    total_score = min(class_score + ligand_score + surface_score + structure_score + tm_score, 1.0)

    # Human-readable justification, one sentence per evidence type.
    reasoning_parts = [
        f"Protein class '{protein_class}' is a {'highly ' if class_score >= 0.30 else ''}tractable target class"
        if class_score > 0
        else f"Protein class '{protein_class}' has limited druggability precedent",
        f"{n_chembl} ChEMBL entry/entries indicate known small-molecule interactions"
        if n_chembl
        else "No ChEMBL cross-references found (no known small-molecule ligands)",
        f"{n_pdb} PDB structure(s) available for structure-based drug design"
        if n_pdb
        else "No PDB structures available",
    ]
    if is_surface:
        reasoning_parts.append("Surface-accessible / extracellular localization supports biologic targeting")
    reasoning = ". ".join(reasoning_parts) + "."

    # Bucket the total score.
    if total_score >= 0.6:
        classification = "highly druggable"
    elif total_score >= 0.35:
        classification = "druggable"
    elif total_score >= 0.15:
        classification = "challenging"
    else:
        classification = "undruggable (with current modalities)"

    return {
        "summary": (
            f"Druggability assessment for {gene} ({protein_name}): "
            f"{classification} (score={total_score:.2f}). "
            f"Class: {protein_class}. {n_pdb} PDB structures, "
            f"{n_chembl} ChEMBL entries."
        ),
        "gene": gene,
        "uniprot_id": uniprot_id,
        "protein_name": protein_name,
        "druggability_score": round(total_score, 3),
        "classification": classification,
        "protein_class": protein_class,
        "known_drugs": known_drugs,
        "structural_coverage": {
            "pdb_count": n_pdb,
            "pdb_ids": pdb_ids[:20],  # Cap at 20 for readability
        },
        "surface_accessible": is_surface,
        "subcellular_locations": subcellular_locs,
        "transmembrane_regions": len(transmembrane),
        "domains": domains,
        "reasoning": reasoning,
        "score_breakdown": {
            "protein_class": round(class_score, 3),
            "known_ligands": round(ligand_score, 3),
            "surface_accessibility": round(surface_score, 3),
            "structural_data": round(structure_score, 3),
            "transmembrane": round(tm_score, 3),
        },
    }
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
@registry.register(
    name="target.expression_profile",
    description="Get tissue expression profile for a gene using GTEx Portal API and Human Protein Atlas",
    category="target",
    parameters={
        "gene": "Gene symbol (e.g. TP53, EGFR, BRCA1)",
        "top_n": "Number of top tissues to return (default 10)",
    },
    requires_data=[],
    usage_guide="You want to understand where a target is expressed — tissue specificity, cancer vs normal, and cell type expression. Critical for safety assessment and indication selection.",
)
def expression_profile(gene: str, top_n: int = 10, **kwargs) -> dict:
    """Get tissue expression profile for a gene from GTEx and Human Protein Atlas.

    Resolves gene symbol to GENCODE ID via the GTEx reference API, then
    fetches median expression per tissue from the ``gtex_v8`` dataset. Also
    queries HPA for RNA, protein, cancer and single-cell expression. Computes
    a tissue specificity index (tau) from the GTEx median TPM values.

    Args:
        gene: Gene symbol to look up. A few alternative spellings are also
            tried (see ``_gene_symbol_candidates``).
        top_n: Maximum number of tissues kept in the per-tissue lists.
        **kwargs: Unused by this function.

    Returns:
        A dict with ``summary``, resolved identifiers, ``tissue_specificity_tau``
        (``None`` when it could not be computed), the GTEx and HPA expression
        lists, ``n_tissues_profiled`` and ``top_expressing_tissues``. When
        neither source yields anything, a dict with ``summary``, ``gene`` and
        ``error`` is returned instead.
    """
    # --- Step 1: Resolve gene symbol to GENCODE ID via GTEx reference API ---
    def _gene_symbol_candidates(input_gene: str) -> list[str]:
        # Build an ordered, case-insensitively de-duplicated list of symbols
        # to try: the input itself, a hard-coded alias if one exists, and the
        # input with a trailing "1" removed.
        alias_map = {
            "GBA1": "GBA",
            "PARK2": "PRKN",
        }
        token = (input_gene or "").strip()
        if not token:
            return []
        candidates = [token]
        mapped = alias_map.get(token.upper())
        if mapped:
            candidates.append(mapped)
        # NOTE(review): stripping a trailing "1" can yield the symbol of a
        # different gene for some inputs; it is only used as a later fallback.
        if token.endswith("1") and len(token) > 1:
            candidates.append(token[:-1])

        deduped = []
        seen = set()
        for c in candidates:
            k = c.upper()
            if k in seen:
                continue
            seen.add(k)
            deduped.append(c)
        return deduped

    gencode_id = None
    gene_symbol = gene
    gene_candidates = _gene_symbol_candidates(gene)

    # Try each candidate in order; the first one GTEx returns data for wins.
    # Failed requests and unparsable responses just move on to the next one.
    for gene_candidate in gene_candidates:
        ref_resp, ref_error = request(
            "GET",
            "https://gtexportal.org/api/v2/reference/gene",
            params={"geneId": gene_candidate},
            timeout=10,
            raise_for_status=False,
        )
        if ref_error or ref_resp.status_code != 200:
            continue
        try:
            ref_data = ref_resp.json()
        except Exception:
            continue
        genes_list = ref_data.get("data", [])
        if genes_list:
            gene_info = genes_list[0]
            gencode_id = gene_info.get("gencodeId", "")
            gene_symbol = gene_info.get("geneSymbol", gene_candidate)
            break

    # --- Step 2: GTEx median gene expression per tissue ---
    gtex_expression = []

    if gencode_id:
        gtex_resp, gtex_error = request(
            "GET",
            "https://gtexportal.org/api/v2/expression/medianGeneExpression",
            params={
                "gencodeId": gencode_id,
                "datasetId": "gtex_v8",
            },
            timeout=10,
            raise_for_status=False,
        )
        if not gtex_error and gtex_resp.status_code == 200:
            try:
                gtex_data = gtex_resp.json()
                for entry in gtex_data.get("data", []):
                    gtex_expression.append({
                        "tissue": entry.get("tissueSiteDetailId", ""),
                        "median_tpm": entry.get("median", 0),
                    })
                # Highest median first.
                gtex_expression.sort(key=lambda x: x.get("median_tpm", 0), reverse=True)
            except Exception:
                # Any parsing/sorting problem discards the GTEx data entirely.
                gtex_expression = []

    # --- Step 3: Compute tissue specificity index (tau) ---
    # Tau ranges from 0 (ubiquitous) to 1 (tissue-specific)
    # tau = sum(1 - x / max) / (n - 1) over the per-tissue medians; it is left
    # as None when there is a single tissue or the maximum is not positive.
    tau = None
    if gtex_expression:
        tpm_values = [t["median_tpm"] for t in gtex_expression]
        max_tpm = max(tpm_values) if tpm_values else 0
        if max_tpm > 0 and len(tpm_values) > 1:
            n = len(tpm_values)
            tau = sum(1.0 - (x / max_tpm) for x in tpm_values) / (n - 1)
            tau = round(tau, 4)

    # --- Step 4: Human Protein Atlas ---
    hpa_data = {}
    tissue_rna = []
    tissue_protein = []
    cancer_expression = []
    cell_type_expression = []
    ensembl_id = None

    # Try to extract Ensembl ID from GENCODE ID (strip version suffix)
    if gencode_id:
        ensembl_id = gencode_id.split(".")[0]

    # Query HPA using Ensembl ID if available, then gene aliases.
    hpa_queries = []
    if ensembl_id:
        hpa_queries.append(ensembl_id)
    if gene_symbol:
        hpa_queries.append(gene_symbol)
    hpa_queries.extend(gene_candidates)

    # Stable de-dup for query candidates.
    deduped_hpa_queries = []
    seen_hpa = set()
    for q in hpa_queries:
        key = str(q).upper()
        if key in seen_hpa:
            continue
        seen_hpa.add(key)
        deduped_hpa_queries.append(q)

    # First query that returns a non-empty JSON body wins.
    for hpa_query in deduped_hpa_queries:
        hpa_resp, hpa_error = request(
            "GET",
            f"https://www.proteinatlas.org/{hpa_query}.json",
            timeout=10,
            headers={"Accept": "application/json"},
            raise_for_status=False,
        )
        if hpa_error or hpa_resp is None or hpa_resp.status_code != 200:
            continue
        try:
            hpa_data = hpa_resp.json()
        except Exception:
            hpa_data = {}
        if hpa_data:
            break

    # NOTE(review): the section/field names read below ("RNATissue",
    # "ProteinTissue", "RNACancer", "RNASingleCell", "TPM", "nTPM", ...) are
    # assumed to match the HPA JSON payload -- confirm against a real
    # response. Missing sections simply leave the corresponding list empty.
    if hpa_data:
        # RNA tissue expression
        for entry in hpa_data.get("RNATissue", {}).get("data", []):
            tissue_rna.append({
                "tissue": entry.get("Tissue", ""),
                "tpm": entry.get("TPM", 0),
                "ntpm": entry.get("nTPM", 0),
            })
        tissue_rna.sort(key=lambda x: x.get("tpm", 0), reverse=True)

        # Protein tissue expression
        for entry in hpa_data.get("ProteinTissue", {}).get("data", []):
            tissue_protein.append({
                "tissue": entry.get("Tissue", ""),
                "level": entry.get("Level", ""),
                "cell_type": entry.get("CellType", ""),
            })

        # Cancer expression
        for entry in hpa_data.get("RNACancer", {}).get("data", []):
            cancer_expression.append({
                "cancer": entry.get("Cancer", ""),
                "tpm": entry.get("TPM", 0),
                "ntpm": entry.get("nTPM", 0),
            })
        cancer_expression.sort(key=lambda x: x.get("tpm", 0), reverse=True)

        # Cell type expression
        for entry in hpa_data.get("RNASingleCell", {}).get("data", []):
            cell_type_expression.append({
                "cell_type": entry.get("CellType", ""),
                "ntpm": entry.get("nTPM", 0),
            })
        cell_type_expression.sort(key=lambda x: x.get("ntpm", 0), reverse=True)

    # --- Build response ---
    # Prefer GTEx for top tissues (quantitative TPM), fall back to HPA
    top_tissues = gtex_expression[:top_n] if gtex_expression else tissue_rna[:top_n]
    # Only report failure when both sources came back empty; an HPA payload
    # without RNA tissue data still produces a (sparser) normal result.
    if not top_tissues and not hpa_data:
        return {
            "summary": f"No expression data found for {gene} from GTEx or Human Protein Atlas",
            "gene": gene,
            "error": "No data returned from GTEx or HPA APIs",
        }

    n_tissues = len(gtex_expression) if gtex_expression else len(tissue_rna)

    # Build summary line matching the spec format
    # At most five tissues are named in the summary regardless of top_n.
    if gtex_expression:
        tissue_strs = [
            f"{t['tissue']} ({t['median_tpm']:.1f} TPM)" for t in gtex_expression[:top_n]
        ]
        summary = f"{gene_symbol} expression: highest in {', '.join(tissue_strs[:5])}"
    elif tissue_rna:
        tissue_strs = [
            f"{t['tissue']} ({t['tpm']:.1f} TPM)" for t in tissue_rna[:top_n]
        ]
        summary = f"{gene_symbol} expression: highest in {', '.join(tissue_strs[:5])}"
    else:
        summary = f"Expression profile for {gene_symbol}: {n_tissues} tissues profiled"

    # Label thresholds: tau > 0.8 tissue-specific, > 0.5 tissue-enriched.
    if tau is not None:
        specificity_label = (
            "tissue-specific" if tau > 0.8
            else "tissue-enriched" if tau > 0.5
            else "broadly expressed"
        )
        summary += f". Specificity: {specificity_label} (tau={tau:.3f})"

    # RNA tissue specificity category from HPA
    rna_specificity = hpa_data.get("RNATissue", {}).get("summary", "")

    return {
        "summary": summary,
        "gene": gene_symbol,
        "gencode_id": gencode_id,
        "ensembl_id": ensembl_id,
        "tissue_specificity_tau": tau,
        "rna_specificity_hpa": rna_specificity,
        "gtex_expression": gtex_expression[:top_n],
        "tissue_rna_hpa": tissue_rna[:top_n],
        "tissue_protein": tissue_protein[:30],
        "cancer_expression": cancer_expression[:20],
        "cell_type_expression": cell_type_expression[:20],
        "n_tissues_profiled": n_tissues,
        "top_expressing_tissues": top_tissues[:top_n],
    }
|
|
727
|
+
|
|
728
|
+
|
|
729
|
+
def _parse_ot_association_row(row: dict) -> dict:
    """Flatten one Open Targets ``associatedDiseases`` row into a plain dict.

    Args:
        row: One element of ``associatedDiseases.rows`` as returned by the
            GraphQL query in ``disease_association`` (keys ``disease``,
            ``score`` and ``datasourceScores``).

    Returns:
        A dict with the disease id/name, the overall score, three evidence
        roll-ups (``genetic_association``, ``known_drug``, ``literature``)
        and the full per-datasource score map, all rounded to 4 decimals.
    """
    # Any of these fields may be an explicit JSON null, which ``.get(key,
    # default)`` does not guard against; ``or`` normalises them.
    disease = row.get("disease") or {}

    ds_scores = {}
    for ds in row.get("datasourceScores") or []:
        comp_id = ds.get("id") or ds.get("componentId", "")
        ds_scores[comp_id] = round(ds.get("score") or 0, 4)

    # Genetic evidence is the strongest score among the genetics datasources.
    genetic_sources = (
        "ot_genetics_portal",
        "gene_burden",
        "genomics_england",
        "eva",
        "uniprot_variants",
    )
    genetic_score = max(ds_scores.get(source, 0) for source in genetic_sources)

    # Known-drug evidence comes from ChEMBL only. Europe PMC is text-mining
    # evidence and is reported separately as ``literature``; folding it into
    # the drug score would make literature-only hits look like drug evidence.
    drug_score = ds_scores.get("chembl", 0)
    literature_score = ds_scores.get("europepmc", 0)

    return {
        "disease_id": disease.get("id", ""),
        "disease_name": disease.get("name", ""),
        "overall_score": round(row.get("score") or 0, 4),
        "genetic_association": round(genetic_score, 4),
        "known_drug": round(drug_score, 4),
        "literature": round(literature_score, 4),
        "all_datasource_scores": ds_scores,
    }


@registry.register(
    name="target.disease_association",
    description="Query Open Targets Platform for disease associations of a gene target",
    category="target",
    parameters={"gene": "Gene symbol (e.g. BRAF, TP53)", "min_score": "Minimum association score (default 0.1)"},
    requires_data=[],
    usage_guide="You want to know which diseases a target is associated with — genetic evidence, drug evidence, literature support. Essential for indication selection and target validation.",
)
def disease_association(gene: str, min_score: float = 0.1, **kwargs) -> dict:
    """Query Open Targets for disease associations of a gene.

    The gene symbol is first resolved to an Ensembl ID through the Ensembl
    REST API, then the Open Targets GraphQL endpoint is queried for the
    target's associated diseases.

    Args:
        gene: Gene symbol (e.g. ``BRAF``). Surrounding whitespace is ignored.
        min_score: Rows whose overall association score is below this value
            are dropped.
        **kwargs: Accepted and ignored.

    Returns:
        On success, a dict with ``summary``, ``gene``, ``ensembl_id``,
        ``approved_name``, ``total_associations``, ``filtered_associations``,
        ``min_score`` and ``associations`` (sorted by descending overall
        score). On any failure, a dict with ``error`` and ``summary`` only.
    """
    from urllib.parse import quote

    not_found = {
        "error": f"Gene {gene} not found in Ensembl (human)",
        "summary": f"Gene symbol {gene} could not be resolved to an Ensembl ID",
    }

    symbol = str(gene).strip()
    if not symbol:
        # Nothing to look up; skip the network round trip.
        return not_found

    # Step 1: Resolve gene symbol to Ensembl ID
    ens_resp, ens_error = request(
        "GET",
        # Escape the symbol so characters such as "/" or "?" cannot alter
        # the request path.
        f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{quote(symbol, safe='')}",
        params={"content-type": "application/json"},
        timeout=10,
        headers={"Content-Type": "application/json"},
        raise_for_status=False,
    )
    if ens_error:
        return {
            "error": f"Failed to resolve {gene} to Ensembl ID: {ens_error}",
            "summary": f"Could not resolve gene symbol {gene} via Ensembl REST API",
        }

    ensembl_id = None
    if ens_resp.status_code == 200:
        try:
            ensembl_id = ens_resp.json().get("id", "")
        except Exception:
            # Best effort: an unparsable or unexpected body is treated the
            # same as "gene not found" below.
            ensembl_id = None

    if not ensembl_id:
        return not_found

    # Step 2: Query Open Targets GraphQL
    query = """
    query targetDiseases($ensemblId: String!, $size: Int!) {
      target(ensemblId: $ensemblId) {
        approvedSymbol
        approvedName
        associatedDiseases(page: {index: 0, size: $size}) {
          count
          rows {
            disease {
              id
              name
            }
            score
            datasourceScores {
              id
              score
            }
          }
        }
      }
    }
    """

    # Only the first page is requested, so at most this many rows are ever
    # considered, while ``count`` is taken from the response separately.
    page_size = 50

    ot_resp, ot_error = request(
        "POST",
        "https://api.platform.opentargets.org/api/v4/graphql",
        json={
            "query": query,
            "variables": {
                "ensemblId": ensembl_id,
                "size": page_size,
            },
        },
        timeout=10,
        headers={"Content-Type": "application/json"},
        raise_for_status=False,
    )
    if ot_error:
        return {
            "error": f"Open Targets API error: {ot_error}",
            "summary": f"Failed to query Open Targets for {gene}",
        }
    if ot_resp.status_code != 200:
        return {
            "error": f"Open Targets API returned HTTP {ot_resp.status_code}",
            "summary": f"Open Targets query failed for {gene} ({ensembl_id})",
        }
    try:
        ot_data = ot_resp.json()
    except Exception:
        return {
            "error": "Open Targets returned invalid JSON",
            "summary": f"Failed to parse Open Targets response for {gene}",
        }

    # A response may carry ``"data": null``; guard against that (and against
    # a non-object payload) instead of calling ``.get`` on None.
    payload = ot_data.get("data") if isinstance(ot_data, dict) else None
    target_data = (payload or {}).get("target")
    if not target_data:
        return {
            "error": f"No target data returned from Open Targets for {ensembl_id}",
            "summary": f"Open Targets has no entry for {gene} ({ensembl_id})",
        }

    approved_symbol = target_data.get("approvedSymbol", gene)
    approved_name = target_data.get("approvedName", "")
    assoc_data = target_data.get("associatedDiseases") or {}
    total_count = assoc_data.get("count") or 0
    rows = assoc_data.get("rows") or []

    # Parse associations, dropping rows below the requested threshold.
    associations = [
        _parse_ot_association_row(row)
        for row in rows
        if (row.get("score") or 0) >= min_score
    ]
    associations.sort(key=lambda x: x["overall_score"], reverse=True)

    # Build summary
    n_filtered = len(associations)
    top_diseases = ", ".join(a["disease_name"] for a in associations[:5]) or "none"

    summary = (
        f"Disease associations for {approved_symbol} ({approved_name}): "
        f"{n_filtered} diseases above score {min_score} (out of {total_count} total). "
        f"Top: {top_diseases}."
    )

    return {
        "summary": summary,
        "gene": approved_symbol,
        "ensembl_id": ensembl_id,
        "approved_name": approved_name,
        "total_associations": total_count,
        "filtered_associations": n_filtered,
        "min_score": min_score,
        "associations": associations,
    }
|