celltype-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/tools/genomics.py
ADDED
|
@@ -0,0 +1,1387 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Genomics tools: GWAS lookup, eQTL analysis, variant annotation, Mendelian randomization.
|
|
3
|
+
|
|
4
|
+
These are REST/GraphQL API wrappers -- no local data required.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import math
|
|
8
|
+
|
|
9
|
+
from ct.tools import registry
|
|
10
|
+
from ct.tools.http_client import request, request_json
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@registry.register(
    name="genomics.gwas_lookup",
    description="Query the GWAS Catalog for genetic associations for a gene, optionally filtered by trait",
    category="genomics",
    parameters={
        "gene": "Gene symbol (e.g. 'BRCA1', 'TP53')",
        "trait": "Trait or disease name to filter (optional)",
        "p_threshold": "P-value threshold for significance (default 5e-8)",
    },
    requires_data=[],
    usage_guide="You want to find genome-wide significant genetic associations for a specific gene. Optionally add a trait filter to focus disease context.",
)
def gwas_lookup(gene: str = None, trait: str = None, p_threshold: float = 5e-8, **kwargs) -> dict:
    """Query the NHGRI-EBI GWAS Catalog REST API for genetic associations.

    Two-step lookup: (1) find SNPs mapped to the gene, then (2) fetch each
    SNP's associations via the ``associationBySnp`` projection, which embeds
    EFO trait names inline and avoids one extra API call per trait.

    Args:
        gene: Gene symbol (required; coerced to str and stripped).
        trait: Optional trait/disease name; case-insensitive substring match
            against the association's EFO trait names.
        p_threshold: Keep associations with p <= this value (default 5e-8,
            the conventional genome-wide significance level). Associations
            whose p-value cannot be parsed are kept.

    Returns:
        dict with "summary", "gene", "trait_filter", "p_threshold",
        "n_associations", and up to 30 "associations" sorted by p-value;
        or a dict with "error"/"summary" keys on failure.
    """
    try:
        import httpx  # request/request_json are httpx-backed; fail early with a clear hint
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}
    gene = str(gene or "").strip()
    trait = str(trait or "").strip() or None
    # Tool arguments often arrive as strings; tolerate that for the threshold
    # (consistent with the str() coercion of gene/trait above).
    try:
        p_threshold = float(p_threshold)
    except (TypeError, ValueError):
        p_threshold = 5e-8
    if not gene:
        detail = f" (trait='{trait}')" if trait else ""
        return {
            "error": f"Missing required parameter: gene{detail}",
            "summary": "GWAS lookup requires a non-empty gene symbol (e.g., SNCA, APOE).",
            "gene": gene,
            "trait_filter": trait,
            "suggestion": (
                "First identify candidate genes (e.g., with data_api.opentargets_search), "
                "then run genomics.gwas_lookup with one gene at a time."
            ),
        }

    base = "https://www.ebi.ac.uk/gwas/rest/api"

    # Step 1: Find SNPs associated with the gene
    snp_url = f"{base}/singleNucleotidePolymorphisms/search/findByGene"
    params = {"geneName": gene, "size": 100}

    data, error = request_json(
        "GET",
        snp_url,
        params=params,
        timeout=30,
        retries=2,
    )
    if error:
        return {"error": f"GWAS Catalog query failed: {error}", "summary": f"GWAS Catalog query failed: {error}"}
    embedded = data.get("_embedded", {})
    snps = embedded.get("singleNucleotidePolymorphisms", [])

    if not snps:
        return {
            "summary": f"No GWAS associations found for gene {gene}",
            "gene": gene,
            "associations": [],
            "n_associations": 0,
        }

    # Step 2: For each SNP, fetch associations using the summary projection
    # which embeds EFO traits inline (avoids extra per-trait API calls)
    associations = []
    seen = set()

    for snp_entry in snps[:30]:  # Cap at 30 SNPs to limit API calls
        rsid = snp_entry.get("rsId", "")
        if not rsid:
            continue

        # Use the associationBySnp projection which embeds traits inline
        assoc_url = f"{base}/singleNucleotidePolymorphisms/{rsid}/associations"
        assoc_data, assoc_error = request_json(
            "GET",
            assoc_url,
            params={"projection": "associationBySnp"},
            timeout=10,
            retries=2,
        )
        if assoc_error:
            # Best-effort: skip SNPs whose association fetch fails rather
            # than aborting the whole lookup.
            continue

        assoc_list = assoc_data.get("_embedded", {}).get("associations", [])

        for assoc in assoc_list:
            # GWAS Catalog reports p-values as mantissa + exponent.
            pval_mantissa = assoc.get("pvalueMantissa")
            pval_exponent = assoc.get("pvalueExponent")
            if pval_mantissa is not None and pval_exponent is not None:
                try:
                    pval = float(pval_mantissa) * (10 ** int(pval_exponent))
                except (ValueError, TypeError):
                    pval = None
            else:
                pval = None

            # Filter by p-value threshold (entries with no parseable p are kept)
            if pval is not None and pval > p_threshold:
                continue

            # Extract risk allele info from loci
            loci = assoc.get("loci", [])
            risk_allele_name = ""
            if loci:
                risk_alleles = loci[0].get("strongestRiskAlleles", [])
                if risk_alleles:
                    risk_allele_name = risk_alleles[0].get("riskAlleleName", "")

            # Extract traits from embedded efoTraits (no extra API call needed)
            efo_traits = assoc.get("efoTraits", [])
            trait_names = [t.get("trait", "") for t in efo_traits if t.get("trait")]
            trait_name = "; ".join(trait_names)

            # Filter by trait if specified (case-insensitive substring match)
            if trait and trait_name:
                if trait.lower() not in trait_name.lower():
                    continue

            or_value = assoc.get("orPerCopyNum")
            beta = assoc.get("betaNum")
            beta_unit = assoc.get("betaUnit", "")
            beta_direction = assoc.get("betaDirection", "")

            # De-duplicate identical (SNP, p-value, trait) rows
            assoc_id = f"{rsid}_{pval}_{trait_name}"
            if assoc_id in seen:
                continue
            seen.add(assoc_id)

            associations.append({
                "rsid": rsid,
                "risk_allele": risk_allele_name,
                "p_value": pval,
                # BUGFIX: compare against None rather than truthiness so a
                # mantissa of 0/0.0 would still produce a string, matching
                # the None-check used when computing pval above.
                "p_value_str": f"{pval_mantissa}e{pval_exponent}" if pval_mantissa is not None else None,
                "trait": trait_name,
                "or_per_copy": or_value,
                "beta": beta,
                "beta_unit": beta_unit,
                "beta_direction": beta_direction,
                "mapped_gene": gene,
            })

        # Stop early if we have enough
        if len(associations) >= 50:
            break

    # Sort by p-value (most significant first; unknown p-values sort last)
    associations.sort(key=lambda x: x["p_value"] if x["p_value"] is not None else 1.0)

    trait_str = f" for trait '{trait}'" if trait else ""
    return {
        "summary": (
            f"GWAS associations for {gene}{trait_str}: "
            f"{len(associations)} genome-wide significant hits (p < {p_threshold})"
        ),
        "gene": gene,
        "trait_filter": trait,
        "p_threshold": p_threshold,
        "n_associations": len(associations),
        "associations": associations[:30],  # Return top 30
    }
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
@registry.register(
    name="genomics.eqtl_lookup",
    description="Query GTEx for expression quantitative trait loci (eQTLs) for a gene across tissues",
    category="genomics",
    parameters={
        "gene": "Gene symbol (e.g. 'BRCA1', 'TP53')",
        "tissue": "GTEx tissue name to filter (optional, e.g. 'Liver', 'Brain_Cortex')",
    },
    requires_data=[],
    usage_guide="You want to find genetic variants that regulate gene expression in specific tissues. Use to understand tissue-specific regulation, identify regulatory variants, and connect GWAS signals to gene function.",
)
def eqtl_lookup(gene: str, tissue: str = None, **kwargs) -> dict:
    """Query the GTEx API for significant eQTLs for a gene.

    Two-step lookup: (1) resolve the gene symbol to a GENCODE ID via the
    GTEx reference endpoint, then (2) fetch significant single-tissue eQTLs
    for that GENCODE ID (GTEx v8), optionally restricted to one tissue.

    Args:
        gene: Gene symbol to resolve against GTEx's GENCODE reference.
        tissue: Optional GTEx tissueSiteDetailId (e.g. 'Liver').

    Returns:
        dict with "summary", "gene", "gencode_id", tissue counts, and up to
        50 "eqtls" sorted by absolute normalized effect size; or a dict with
        "error"/"summary" keys on failure.
    """
    try:
        import httpx  # request_json is httpx-backed; fail early with a clear hint
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}
    gtex_base = "https://gtexportal.org/api/v2"

    # Step 1: Resolve gene symbol to GENCODE ID
    gene_url = f"{gtex_base}/reference/gene"
    gene_params = {"geneId": gene}

    gene_data, error = request_json(
        "GET",
        gene_url,
        params=gene_params,
        timeout=10,
        retries=2,
    )
    if error:
        return {"error": f"GTEx gene lookup failed: {error}", "summary": f"GTEx gene lookup failed: {error}"}
    genes_list = gene_data.get("data", [])
    if not genes_list:
        # BUGFIX: include a "summary" key like every other error return in
        # this module, so callers that only read "summary" see the failure.
        return {
            "error": f"Gene '{gene}' not found in GTEx GENCODE v26 reference",
            "summary": f"Gene '{gene}' not found in GTEx GENCODE v26 reference",
            "suggestion": "Try using the official HGNC gene symbol",
        }

    # Use the first matching gene entry
    gene_info = genes_list[0]
    gencode_id = gene_info.get("gencodeId", "")
    gene_symbol = gene_info.get("geneSymbol", gene)
    description = gene_info.get("description", "")

    if not gencode_id:
        return {"error": f"Could not resolve GENCODE ID for {gene}", "summary": f"Could not resolve GENCODE ID for {gene}"}
    # Step 2: Query significant single-tissue eQTLs
    eqtl_url = f"{gtex_base}/association/singleTissueEqtl"
    eqtl_params = {
        "gencodeId": gencode_id,
        "datasetId": "gtex_v8",
    }
    if tissue:
        eqtl_params["tissueSiteDetailId"] = tissue

    eqtl_data, error = request_json(
        "GET",
        eqtl_url,
        params=eqtl_params,
        timeout=10,
        retries=2,
    )
    if error:
        return {"error": f"GTEx eQTL query failed: {error}", "summary": f"GTEx eQTL query failed: {error}"}
    eqtls_raw = eqtl_data.get("data", [])

    if not eqtls_raw:
        tissue_str = f" in {tissue}" if tissue else ""
        return {
            "summary": f"No significant eQTLs found for {gene_symbol}{tissue_str} in GTEx v8",
            "gene": gene_symbol,
            "gencode_id": gencode_id,
            "eqtls": [],
            "n_eqtls": 0,
        }

    # Parse eQTL results
    eqtls = []
    tissues_found = set()

    for eqtl in eqtls_raw:
        tissue_id = eqtl.get("tissueSiteDetailId", "")
        tissues_found.add(tissue_id)

        eqtls.append({
            "variant_id": eqtl.get("variantId", ""),
            "snp_id": eqtl.get("snpId", ""),
            "tissue": tissue_id,
            "p_value": eqtl.get("pValue"),
            "nes": eqtl.get("nes"),  # Normalized effect size
            "chromosome": eqtl.get("chromosome", ""),
            "pos": eqtl.get("pos"),
            "gene_symbol": eqtl.get("geneSymbol", gene_symbol),
        })

    # Sort by absolute NES (largest effect first; missing NES sorts last)
    eqtls.sort(key=lambda x: abs(x["nes"]) if x["nes"] is not None else 0, reverse=True)

    tissue_str = f" in {tissue}" if tissue else f" across {len(tissues_found)} tissues"
    return {
        "summary": (
            f"GTEx eQTLs for {gene_symbol} ({gencode_id}){tissue_str}: "
            f"{len(eqtls)} significant eQTLs found"
        ),
        "gene": gene_symbol,
        "gencode_id": gencode_id,
        "gene_description": description,
        "n_eqtls": len(eqtls),
        "n_tissues": len(tissues_found),
        "tissues": sorted(tissues_found),
        "eqtls": eqtls[:50],  # Return top 50 by effect size
    }
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
@registry.register(
    name="genomics.variant_annotate",
    description="Annotate a genetic variant using Ensembl VEP (Variant Effect Predictor)",
    category="genomics",
    parameters={
        "variant": "Variant identifier: rsID (e.g. 'rs1234') or HGVS notation (e.g. '17:g.41245466G>A')",
    },
    requires_data=[],
    usage_guide="You want to understand the functional consequence of a specific genetic variant. Use to get consequence type (missense, synonymous, etc.), impact prediction, amino acid changes, allele frequencies, and clinical significance.",
)
def variant_annotate(variant: str, **kwargs) -> dict:
    """Annotate a variant using the Ensembl VEP REST API.

    Accepts either an rsID (routed to /vep/human/id/) or anything else,
    which is treated as HGVS notation (routed to /vep/human/hgvs/).

    Args:
        variant: rsID (e.g. 'rs1234') or HGVS string (e.g. '17:g.41245466G>A').

    Returns:
        dict with a human-readable "summary" plus variant location, allele
        frequencies, clinical significance, and up to 10 transcript
        consequences (canonical first, then by impact severity); or a dict
        with "error"/"summary" keys on failure.
    """
    try:
        import httpx  # request() is httpx-backed; fail early with a clear hint
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}
    ensembl_base = "https://rest.ensembl.org"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}

    # rsIDs use the /id/ endpoint; anything else is treated as HGVS notation.
    variant_clean = variant.strip()
    if variant_clean.lower().startswith("rs"):
        url = f"{ensembl_base}/vep/human/id/{variant_clean}"
    else:
        url = f"{ensembl_base}/vep/human/hgvs/{variant_clean}"

    # raise_for_status=False so we can map 400 to a helpful format hint.
    resp, error = request(
        "GET",
        url,
        headers=headers,
        timeout=30,
        retries=2,
        raise_for_status=False,
    )
    if error:
        return {"error": f"Ensembl VEP query failed: {error}", "summary": f"Ensembl VEP query failed: {error}"}
    if resp.status_code == 400:
        return {"error": f"Invalid variant format: '{variant}'. Use rsID (e.g. rs1234) or HGVS (e.g. 17:g.41245466G>A)", "summary": f"Invalid variant format: '{variant}'. Use rsID (e.g. rs1234) or HGVS (e.g. 17:g.41245466G>A)"}
    if resp.status_code >= 400:
        return {"error": f"Ensembl VEP query failed: HTTP {resp.status_code}", "summary": f"Ensembl VEP query failed: HTTP {resp.status_code}"}
    try:
        data = resp.json()
    except Exception:
        return {"error": "Ensembl VEP query failed: invalid JSON response", "summary": "Ensembl VEP query failed: invalid JSON response"}
    if not data or not isinstance(data, list):
        return {"error": f"No VEP results for variant {variant}", "summary": f"No VEP results for variant {variant}"}
    # VEP returns a list; we annotate the first (and for a single input, only) result.
    vep_result = data[0]

    # Extract variant identifiers
    variant_id = vep_result.get("id", variant)
    input_str = vep_result.get("input", variant)
    most_severe = vep_result.get("most_severe_consequence", "")
    allele_string = vep_result.get("allele_string", "")
    assembly = vep_result.get("assembly_name", "")
    seq_region = vep_result.get("seq_region_name", "")
    start = vep_result.get("start")
    end = vep_result.get("end")

    # Extract colocated variants (for allele frequencies, clinical significance)
    colocated = vep_result.get("colocated_variants", [])
    allele_frequencies = {}
    clinical_significance = []
    existing_ids = []

    for cv in colocated:
        cv_id = cv.get("id", "")
        if cv_id:
            existing_ids.append(cv_id)

        # Allele frequencies from different populations, keyed "<allele>_<population>"
        freqs = cv.get("frequencies", {})
        for allele, pop_freqs in freqs.items():
            for pop, freq in pop_freqs.items():
                allele_frequencies[f"{allele}_{pop}"] = freq

        # Minor allele frequency
        maf = cv.get("minor_allele_freq")
        minor_allele = cv.get("minor_allele", "")
        if maf is not None:
            allele_frequencies["minor_allele"] = minor_allele
            allele_frequencies["minor_allele_freq"] = maf

        # Clinical significance
        clin_sig = cv.get("clin_sig", [])
        if clin_sig:
            clinical_significance.extend(clin_sig)

    # Extract transcript consequences
    transcript_consequences = []
    for tc in vep_result.get("transcript_consequences", []):
        transcript_consequences.append({
            "gene_id": tc.get("gene_id", ""),
            "gene_symbol": tc.get("gene_symbol", ""),
            "transcript_id": tc.get("transcript_id", ""),
            "biotype": tc.get("biotype", ""),
            "consequence_terms": tc.get("consequence_terms", []),
            "impact": tc.get("impact", ""),
            "amino_acids": tc.get("amino_acids", ""),
            "codons": tc.get("codons", ""),
            "protein_position": tc.get("protein_position", ""),
            "sift_prediction": tc.get("sift_prediction", ""),
            "sift_score": tc.get("sift_score"),
            "polyphen_prediction": tc.get("polyphen_prediction", ""),
            "polyphen_score": tc.get("polyphen_score"),
            "canonical": tc.get("canonical", 0) == 1,
        })

    # Sort: canonical transcripts first, then by impact severity
    impact_order = {"HIGH": 0, "MODERATE": 1, "LOW": 2, "MODIFIER": 3}
    transcript_consequences.sort(
        key=lambda x: (
            0 if x["canonical"] else 1,
            impact_order.get(x["impact"], 4),
        )
    )

    # The most impactful consequence drives the one-line summary
    top_consequence = transcript_consequences[0] if transcript_consequences else {}
    gene_symbol = top_consequence.get("gene_symbol", "")
    impact = top_consequence.get("impact", "")
    aa_change = top_consequence.get("amino_acids", "")
    protein_pos = top_consequence.get("protein_position", "")

    aa_str = ""
    if aa_change and protein_pos:
        # e.g. amino_acids "A/T" at position 123 -> "p.A123T"
        aa_str = f", p.{aa_change.replace('/', str(protein_pos))}"

    # BUGFIX: dedupe preserving first-seen order. list(set(...)) iterates in
    # hash order, which is nondeterministic across runs and made both the
    # summary string and the returned list unstable.
    unique_clin = list(dict.fromkeys(clinical_significance))
    clin_str = f" Clinical: {', '.join(unique_clin)}." if unique_clin else ""

    maf_str = ""
    maf_val = allele_frequencies.get("minor_allele_freq")
    if maf_val is not None:
        maf_str = f" MAF={maf_val:.4f} ({allele_frequencies.get('minor_allele', '')})."

    return {
        "summary": (
            f"VEP annotation for {variant_id}: {most_severe} ({impact}) "
            f"in {gene_symbol}{aa_str}.{clin_str}{maf_str}"
        ),
        "variant_id": variant_id,
        "input": input_str,
        "location": f"{seq_region}:{start}-{end}" if seq_region and start else "",
        "assembly": assembly,
        "allele_string": allele_string,
        "most_severe_consequence": most_severe,
        "existing_ids": existing_ids,
        "allele_frequencies": allele_frequencies,
        "clinical_significance": unique_clin,
        "transcript_consequences": transcript_consequences[:10],  # Top 10
        "n_transcript_consequences": len(transcript_consequences),
    }
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
@registry.register(
    name="genomics.mendelian_randomization_lookup",
    description="Look up Mendelian randomization and genetic evidence for a gene-disease pair via Open Targets",
    category="genomics",
    parameters={
        "gene": "Gene symbol (e.g. 'PCSK9', 'IL6R')",
        "disease": "Disease name or EFO ID (e.g. 'coronary artery disease' or 'EFO_0001645')",
    },
    requires_data=[],
    usage_guide="You want causal genetic evidence linking a gene to a disease. Use to evaluate target-disease relationships using Mendelian randomization, GWAS colocalisation, and genetic association evidence from Open Targets.",
)
def mendelian_randomization_lookup(gene: str, disease: str, **kwargs) -> dict:
    """Look up MR and genetic evidence from Open Targets Platform GraphQL API.

    Three-step workflow:
      1. Resolve the gene symbol to an Ensembl ID via Open Targets search
         (exact case-insensitive symbol match preferred, else first hit).
      2. Resolve the disease to an EFO/MONDO/HP ID, unless it already looks
         like one (prefix check on the input string).
      3. Query target-level association scores plus genetic evidence rows
         (gwas_credible_sets, gene_burden, eva, gene2phenotype,
         genomics_england, uniprot_literature) and split them into GWAS
         credible-set evidence vs. other genetic evidence.

    Returns a dict with a "summary" line, resolved identifiers, overall and
    per-datasource association scores, and the categorized evidence lists;
    on any failure returns a dict with "error"/"summary" keys.
    """
    try:
        import httpx  # request_json is httpx-backed; fail early with a clear hint
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}
    ot_url = "https://api.platform.opentargets.org/api/v4/graphql"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}

    # Step 1: Resolve gene symbol to Ensembl ID via Open Targets search
    search_query = """
    query searchTarget($queryString: String!) {
      search(queryString: $queryString, entityNames: ["target"], page: {size: 5, index: 0}) {
        hits {
          id
          entity
          name
          description
        }
      }
    }
    """

    search_data, error = request_json(
        "POST",
        ot_url,
        json={"query": search_query, "variables": {"queryString": gene}},
        headers=headers,
        timeout=10,
        retries=2,
    )
    if error:
        return {"error": f"Open Targets search failed: {error}", "summary": f"Open Targets search failed: {error}"}
    hits = search_data.get("data", {}).get("search", {}).get("hits", [])
    target_hits = [h for h in hits if h.get("entity") == "target"]

    if not target_hits:
        return {"error": f"Gene '{gene}' not found in Open Targets", "summary": f"Gene '{gene}' not found in Open Targets"}
    # Match by gene symbol (case-insensitive); fall back to the top-ranked hit
    # when no exact symbol match exists.
    ensembl_id = None
    target_name = ""
    for hit in target_hits:
        if hit.get("name", "").upper() == gene.upper():
            ensembl_id = hit["id"]
            target_name = hit.get("name", "")
            break
    if not ensembl_id:
        ensembl_id = target_hits[0]["id"]
        target_name = target_hits[0].get("name", "")

    # Step 2: Resolve disease to EFO ID (if not already an EFO ID)
    if disease.upper().startswith("EFO_") or disease.upper().startswith("MONDO_") or disease.upper().startswith("HP_"):
        # Input already looks like an ontology ID; use it verbatim.
        efo_id = disease
        disease_name = disease
    else:
        disease_search_query = """
        query searchDisease($queryString: String!) {
          search(queryString: $queryString, entityNames: ["disease"], page: {size: 5, index: 0}) {
            hits {
              id
              entity
              name
              description
            }
          }
        }
        """

        disease_data, error = request_json(
            "POST",
            ot_url,
            json={"query": disease_search_query, "variables": {"queryString": disease}},
            headers=headers,
            timeout=10,
            retries=2,
        )
        if error:
            return {"error": f"Open Targets disease search failed: {error}", "summary": f"Open Targets disease search failed: {error}"}
        disease_hits = disease_data.get("data", {}).get("search", {}).get("hits", [])
        disease_hits = [h for h in disease_hits if h.get("entity") == "disease"]

        if not disease_hits:
            return {"error": f"Disease '{disease}' not found in Open Targets", "summary": f"Disease '{disease}' not found in Open Targets"}
        # Take the top-ranked disease hit.
        efo_id = disease_hits[0]["id"]
        disease_name = disease_hits[0].get("name", disease)

    # Step 3: Query genetic evidence (evidences is on Target, not top-level)
    # Genetic datasources: gwas_credible_sets (L2G scores), eva, gene_burden,
    # gene2phenotype, genomics_england, uniprot_literature
    # NOTE(review): the 'associatedDiseases(BFilter: $efoId, ...)' argument
    # looks unusual for the Open Targets v4 schema (which commonly filters by
    # a list of EFO IDs, e.g. 'Bs: [...]') — verify against the current
    # GraphQL schema; a schema mismatch would surface via result_data["errors"]
    # below.
    evidence_query = """
    query targetDiseaseEvidence($ensemblId: String!, $efoId: String!) {
      target(ensemblId: $ensemblId) {
        id
        approvedSymbol
        approvedName
        associatedDiseases(BFilter: $efoId, page: {size: 1, index: 0}) {
          rows {
            score
            disease { id name }
            datasourceScores {
              id
              score
            }
          }
        }
        evidences(
          efoIds: [$efoId]
          datasourceIds: [
            "gwas_credible_sets", "gene_burden", "eva",
            "gene2phenotype", "genomics_england", "uniprot_literature"
          ]
          size: 50
        ) {
          count
          rows {
            datasourceId
            datatypeId
            score
            resourceScore
            studyId
            beta
            oddsRatio
            confidence
            studySampleSize
            publicationYear
            variantRsId
            credibleSet {
              studyLocusId
              study { id projectId studyType }
              variant { id rsIds }
              pValueMantissa
              pValueExponent
              beta
              finemappingMethod
            }
          }
        }
      }
      disease(efoId: $efoId) {
        id
        name
        description
      }
    }
    """

    result_data, error = request_json(
        "POST",
        ot_url,
        json={
            "query": evidence_query,
            "variables": {"ensemblId": ensembl_id, "efoId": efo_id},
        },
        headers=headers,
        timeout=15,
        retries=2,
    )
    if error:
        return {"error": f"Open Targets evidence query failed: {error}", "summary": f"Open Targets evidence query failed: {error}"}
    # GraphQL can return HTTP 200 with an "errors" array; surface that too.
    if result_data.get("errors"):
        error_msgs = [e.get("message", "") for e in result_data["errors"]]
        return {"error": f"Open Targets GraphQL errors: {'; '.join(error_msgs)}", "summary": f"Open Targets GraphQL errors: {'; '.join(error_msgs)}"}
    data = result_data.get("data", {})

    # Parse target and disease info ("or {}" guards null GraphQL fields)
    target_info = data.get("target") or {}
    disease_info = data.get("disease") or {}
    approved_symbol = target_info.get("approvedSymbol", gene)
    approved_name = target_info.get("approvedName", "")
    resolved_disease = disease_info.get("name", disease_name if disease_name else disease)

    # Parse overall association score
    assoc_rows = target_info.get("associatedDiseases", {}).get("rows", [])
    overall_score = assoc_rows[0].get("score") if assoc_rows else None
    datasource_scores = {}
    if assoc_rows:
        for ds in assoc_rows[0].get("datasourceScores", []):
            datasource_scores[ds["id"]] = ds["score"]

    # Parse evidence rows
    evidences_obj = target_info.get("evidences") or {}
    evidence_count = evidences_obj.get("count", 0)
    evidence_rows = evidences_obj.get("rows", [])

    # Categorize evidence by datasource
    gwas_evidence = []
    other_genetic_evidence = []

    for row in evidence_rows:
        datasource = row.get("datasourceId", "")

        # Extract variant info from credibleSet if available
        credible_set = row.get("credibleSet") or {}
        variant_info = credible_set.get("variant") or {}
        study_info = credible_set.get("study") or {}
        rs_ids = variant_info.get("rsIds", [])
        # Prefer the credible set's rsID; fall back to the row-level one.
        variant_rsid = rs_ids[0] if rs_ids else (row.get("variantRsId") or "")

        # Compute p-value from mantissa/exponent
        p_mantissa = credible_set.get("pValueMantissa")
        p_exponent = credible_set.get("pValueExponent")
        p_value = None
        if p_mantissa is not None and p_exponent is not None:
            try:
                p_value = float(p_mantissa) * (10 ** int(p_exponent))
            except (ValueError, TypeError):
                pass

        evidence_item = {
            "datasource": datasource,
            "datatype": row.get("datatypeId", ""),
            "score": row.get("score"),
            "resource_score": row.get("resourceScore"),
            "variant_id": variant_info.get("id", ""),
            "variant_rsid": variant_rsid,
            "study_id": study_info.get("id") or row.get("studyId", ""),
            "study_type": study_info.get("studyType", ""),
            "p_value": p_value,
            "beta": credible_set.get("beta") or row.get("beta"),
            "odds_ratio": row.get("oddsRatio"),
            "finemapping_method": credible_set.get("finemappingMethod", ""),
            "publication_year": row.get("publicationYear"),
        }

        if datasource == "gwas_credible_sets":
            gwas_evidence.append(evidence_item)
        else:
            other_genetic_evidence.append(evidence_item)

    # Compute summary statistics
    all_evidence = gwas_evidence + other_genetic_evidence
    max_score = max((e["score"] for e in all_evidence if e["score"] is not None), default=None)
    n_variants = len(set(e["variant_rsid"] for e in all_evidence if e["variant_rsid"]))
    n_studies = len(set(e["study_id"] for e in all_evidence if e["study_id"]))

    # Build summary
    parts = []
    if gwas_evidence:
        parts.append(f"{len(gwas_evidence)} GWAS credible set(s)")
    if other_genetic_evidence:
        parts.append(f"{len(other_genetic_evidence)} other genetic evidence(s)")
    if not parts:
        parts.append("no genetic evidence found")

    score_str = f" Overall association: {overall_score:.3f}." if overall_score is not None else ""
    max_str = f" Max L2G score: {max_score:.3f}." if max_score is not None else ""
    variant_str = f" {n_variants} unique variant(s) across {n_studies} study(ies)." if n_variants > 0 else ""

    return {
        "summary": (
            f"Genetic evidence for {approved_symbol} -> {resolved_disease}: "
            f"{', '.join(parts)}.{score_str}{max_str}{variant_str}"
        ),
        "gene": approved_symbol,
        "gene_name": approved_name,
        "ensembl_id": ensembl_id,
        "disease": resolved_disease,
        "disease_id": efo_id,
        "overall_association_score": overall_score,
        "datasource_scores": datasource_scores,
        "total_evidence_count": evidence_count,
        "gwas_credible_sets": gwas_evidence,
        "other_genetic_evidence": other_genetic_evidence,
        "max_l2g_score": max_score,
        "n_unique_variants": n_variants,
        "n_studies": n_studies,
    }
|
|
725
|
+
|
|
726
|
+
|
|
727
|
+
@registry.register(
    name="genomics.coloc",
    description="Look up GWAS-eQTL/pQTL colocalization evidence for a gene via Open Targets Platform",
    category="genomics",
    parameters={
        "gene": "Gene symbol (e.g. 'PCSK9', 'IL6R')",
        "study_id": "Specific GWAS study ID to filter (optional)",
    },
    requires_data=[],
    usage_guide="You want to assess whether a GWAS signal and an eQTL/pQTL signal share the same "
    "causal variant at a locus — the gold standard for connecting genetic associations "
    "to gene function. High H4 posterior probability (>0.8) indicates strong colocalization. "
    "Use for target validation and causal gene assignment at GWAS loci.",
)
def coloc(gene: str, study_id: str | None = None, **kwargs) -> dict:
    """Look up colocalization evidence from Open Targets Platform GraphQL API.

    Queries the Open Targets credibleSets and colocalisations data for a gene
    target, returning GWAS-QTL colocalization information including H4 posterior
    probabilities (evidence of shared causal variant), study details, and tissues.

    Args:
        gene: Gene symbol to query; a small alias map (e.g. GBA1 -> GBA) is
            tried as a fallback if the primary symbol fails.
        study_id: If given, results are restricted to this GWAS study ID.
        **kwargs: Ignored; accepted for registry call-compatibility.

    Returns:
        On success, a dict with a human-readable "summary" plus structured
        fields (counts, tissues, top-50 "colocalizations" sorted by H4).
        On hard failure, {"error": ..., "summary": ...}; when the gene resolves
        but Open Targets cannot answer, a non-fatal result with
        "data_unavailable": True and a "warning" so workflows can continue.
    """
    # httpx is the transport used by the module-level `request` helper; fail
    # early with an actionable message if it is not installed.
    try:
        import httpx
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}
    ot_url = "https://api.platform.opentargets.org/api/v4/graphql"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}

    def _gene_symbol_candidates(input_gene: str) -> list[str]:
        # Return the input symbol plus any known alias, de-duplicated while
        # preserving order so the user's spelling is always tried first.
        alias_map = {
            "GBA1": "GBA",
            "PARK2": "PRKN",
        }
        token = (input_gene or "").strip()
        if not token:
            return []
        candidates = [token]
        mapped = alias_map.get(token.upper())
        if mapped:
            candidates.append(mapped)

        # Stable de-dup preserving order (case-insensitive).
        deduped = []
        seen = set()
        for c in candidates:
            k = c.upper()
            if k in seen:
                continue
            seen.add(k)
            deduped.append(c)
        return deduped

    def _resolve_ensembl_id(symbol: str) -> tuple[str | None, str | None]:
        # Resolve a human gene symbol to an Ensembl gene ID via the Ensembl
        # REST lookup endpoint. Returns (ensembl_id, None) on success or
        # (None, error_message) on any failure.
        ens_resp, resolve_error = request(
            "GET",
            f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{symbol}",
            params={"content-type": "application/json"},
            timeout=10,
            retries=2,
            headers={"Content-Type": "application/json"},
            raise_for_status=False,
        )
        if resolve_error:
            return None, f"Failed to resolve {symbol} to Ensembl ID: {resolve_error}"
        if ens_resp.status_code != 200:
            return None, f"Gene {symbol} not found in Ensembl (human)"
        try:
            ens_data = ens_resp.json()
        except Exception:
            return None, f"Failed to parse Ensembl response for {symbol}"
        ensembl = ens_data.get("id", "")
        if not ensembl:
            return None, f"Gene {symbol} not found in Ensembl (human)"
        return ensembl, None

    # Step 2: Query Open Targets for credible sets with colocalization data.
    # We keep a full query and a lower-complexity fallback query because some
    # genes can hit Open Targets GraphQL complexity limits.
    query_full = """
    query geneColoc($ensemblId: String!, $size: Int!, $colocSize: Int!) {
      target(ensemblId: $ensemblId) {
        id
        approvedSymbol
        approvedName
        credibleSets(page: {index: 0, size: $size}) {
          count
          rows {
            studyLocusId
            studyId
            studyType
            study {
              id
              studyType
              traitFromSource
              diseases {
                id
                name
              }
              nSamples
            }
            variant {
              id
              rsIds
              chromosome
              position
            }
            pValueMantissa
            pValueExponent
            beta
            colocalisation(page: {index: 0, size: $colocSize}) {
              count
              rows {
                h4
                h3
                clpp
                colocalisationMethod
                rightStudyType
                betaRatioSignAverage
                numberColocalisingVariants
                otherStudyLocus {
                  studyLocusId
                  studyId
                  studyType
                  qtlGeneId
                  study {
                    id
                    traitFromSource
                    condition
                    biosample {
                      biosampleId
                      biosampleName
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
    """

    # Lean variant drops variant coordinates, p-values, and the extra
    # colocalisation statistics to stay under GraphQL complexity limits.
    query_lean = """
    query geneColocLean($ensemblId: String!, $size: Int!, $colocSize: Int!) {
      target(ensemblId: $ensemblId) {
        id
        approvedSymbol
        approvedName
        credibleSets(page: {index: 0, size: $size}) {
          count
          rows {
            studyLocusId
            studyId
            studyType
            study {
              id
              studyType
              traitFromSource
              diseases {
                id
                name
              }
            }
            colocalisation(page: {index: 0, size: $colocSize}) {
              count
              rows {
                h4
                h3
                clpp
                colocalisationMethod
                rightStudyType
                otherStudyLocus {
                  studyLocusId
                  studyId
                  studyType
                  qtlGeneId
                  study {
                    id
                    traitFromSource
                    condition
                    biosample {
                      biosampleId
                      biosampleName
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
    """

    def _query_target_coloc(ensembl: str) -> tuple[dict | None, str | None]:
        # Run the colocalization query against Open Targets, trying the rich
        # query first and the lean one second, each with shrinking page sizes.
        # Returns (payload, None) or (None, combined_error_message).
        def _run_query(query_text: str, page_attempts: tuple[tuple[int, int], ...]) -> tuple[dict | None, str | None]:
            last_err = None
            for size, coloc_size in page_attempts:
                resp, query_error = request(
                    "POST",
                    ot_url,
                    json={
                        "query": query_text,
                        "variables": {
                            "ensemblId": ensembl,
                            "size": size,
                            "colocSize": coloc_size,
                        },
                    },
                    headers=headers,
                    timeout=15,
                    retries=2,
                    raise_for_status=False,
                )
                if query_error:
                    last_err = f"Open Targets API error: {query_error}"
                    continue
                if resp.status_code != 200:
                    last_err = f"Open Targets API returned HTTP {resp.status_code}"
                    # Retry with smaller page sizes for likely complexity-related rejections.
                    if resp.status_code in {400, 413, 422, 429, 500, 502, 503, 504}:
                        continue
                    break

                try:
                    payload = resp.json()
                except Exception:
                    last_err = "Open Targets API returned invalid JSON"
                    continue

                gql_errors = payload.get("errors") or []
                if gql_errors:
                    msgs = "; ".join(e.get("message", "") for e in gql_errors)
                    last_err = f"Open Targets GraphQL errors: {msgs}"
                    lower = msgs.lower()
                    # Complexity/cost/timeout errors may succeed at a smaller
                    # page size; other GraphQL errors are terminal.
                    if any(tok in lower for tok in ("complex", "depth", "cost", "too many", "timeout")):
                        continue
                    break
                return payload, None
            return None, (last_err or "Open Targets colocalization query failed")

        # Try richer query first, then lower-complexity fallback.
        attempts = (
            ("full", query_full, ((60, 40), (30, 20), (15, 10))),
            ("lean", query_lean, ((40, 20), (20, 10), (10, 5))),
        )
        errors = []
        for label, query_text, page_attempts in attempts:
            payload, err = _run_query(query_text, page_attempts)
            if payload is not None:
                return payload, None
            if err:
                errors.append(f"{label} query: {err}")
        if errors:
            return None, "; ".join(errors)
        return None, "Open Targets colocalization query failed"

    # Try primary symbol first, then common aliases (e.g., GBA1 -> GBA) if needed.
    gene_candidates = _gene_symbol_candidates(gene)
    ensembl_id = None
    result_data = None  # full payload; retained though only `target` is read below
    target_data = None
    candidate_errors = []
    query_failures = []      # (symbol, ensembl_id, error) for resolved-but-failed candidates
    resolved_candidates = []  # (symbol, ensembl_id) pairs that resolved in Ensembl

    for gene_candidate in gene_candidates:
        ensembl_candidate, resolve_error = _resolve_ensembl_id(gene_candidate)
        if resolve_error:
            candidate_errors.append(resolve_error)
            continue
        resolved_candidates.append((gene_candidate, ensembl_candidate))

        payload, query_error = _query_target_coloc(ensembl_candidate)
        if query_error:
            candidate_errors.append(f"{gene_candidate}: {query_error}")
            query_failures.append((gene_candidate, ensembl_candidate, query_error))
            continue

        target_candidate = (payload or {}).get("data", {}).get("target")
        if not target_candidate:
            candidate_errors.append(
                f"{gene_candidate}: Open Targets has no entry for {ensembl_candidate}"
            )
            query_failures.append(
                (gene_candidate, ensembl_candidate, f"Open Targets has no entry for {ensembl_candidate}")
            )
            continue

        # First candidate with a usable Open Targets target wins.
        ensembl_id = ensembl_candidate
        result_data = payload
        target_data = target_candidate
        break

    if not target_data:
        last_error = candidate_errors[-1] if candidate_errors else "Open Targets colocalization query failed"
        # Every candidate failed at the Ensembl-resolution step: hard error.
        if candidate_errors and all("not found in Ensembl" in e for e in candidate_errors):
            return {
                "error": last_error,
                "summary": f"Gene symbol {gene} could not be resolved to an Ensembl ID",
            }
        # Resolved gene(s) but Open Targets could not return colocalization payload.
        # Return a non-fatal unavailable result so workflows can continue.
        if resolved_candidates:
            chosen_symbol, chosen_ensembl = resolved_candidates[0]
            warning = query_failures[0][2] if query_failures else last_error
            return {
                "summary": (
                    f"Colocalization for {chosen_symbol}: unavailable from Open Targets "
                    f"(query failed). Try genomics.eqtl_lookup for orthogonal evidence."
                ),
                "gene": chosen_symbol,
                "ensembl_id": chosen_ensembl,
                "total_gwas_loci": 0,
                "n_colocalizations": 0,
                "n_strong_coloc": 0,
                "n_moderate_coloc": 0,
                "n_tissues": 0,
                "n_studies": 0,
                "tissues": [],
                "colocalizations": [],
                "data_unavailable": True,
                "warning": warning,
            }
        if "GraphQL errors" in last_error:
            return {
                "error": last_error,
                "summary": f"GraphQL query errors for {gene} colocalization",
            }
        return {
            "error": last_error,
            "summary": f"Open Targets colocalization query failed for {gene}",
        }

    approved_symbol = target_data.get("approvedSymbol", gene)
    # Backward-compatibility: some mocked test fixtures still use legacy field names.
    credible_sets = target_data.get("credibleSets") or target_data.get("gwasCredibleSets") or {}
    rows = credible_sets.get("rows", []) if isinstance(credible_sets, dict) else []

    # Keep only GWAS credible sets for this tool.
    def _is_gwas(row: dict) -> bool:
        # studyType may live on the row (current schema) or on the nested study.
        st = (row.get("studyType") or (row.get("study") or {}).get("studyType") or "")
        return str(st).lower() == "gwas"

    if target_data.get("gwasCredibleSets") is not None:
        # Legacy field is pre-filtered to GWAS; trust its count when present.
        gwas_rows = rows
        total_loci = credible_sets.get("count", len(rows))
    else:
        gwas_rows = [row for row in rows if _is_gwas(row)]
        total_loci = len(gwas_rows)

    # Parse colocalization results
    coloc_results = []
    tissues_seen = set()
    studies_seen = set()

    for row in gwas_rows:
        study = row.get("study") or {}
        gwas_study_id = row.get("studyId") or study.get("id", "")

        # Filter by study_id if provided
        if study_id and gwas_study_id != study_id:
            continue

        variant = row.get("variant") or {}
        rs_ids = variant.get("rsIds", [])
        lead_rsid = rs_ids[0] if rs_ids else ""

        # Compute p-value from mantissa/exponent representation.
        p_mantissa = row.get("pValueMantissa")
        p_exponent = row.get("pValueExponent")
        p_value = None
        if p_mantissa is not None and p_exponent is not None:
            try:
                p_value = float(p_mantissa) * (10 ** int(p_exponent))
            except (ValueError, TypeError):
                pass

        # Extract L2G score for this gene. NOTE(review): neither GraphQL query
        # above requests l2GPredictions, so this only populates from fixture
        # data carrying that field — confirm whether the live query should add it.
        l2g_score = None
        l2g_preds_raw = row.get("l2GPredictions") or []
        if isinstance(l2g_preds_raw, dict):
            l2g_preds = l2g_preds_raw.get("rows") or []
        else:
            l2g_preds = l2g_preds_raw
        for pred in l2g_preds:
            pred_target = pred.get("target") or {}
            if pred_target.get("id") == ensembl_id:
                l2g_score = pred.get("score")
                if l2g_score is None:
                    # Older field name for the same model probability.
                    l2g_score = pred.get("yProbaModel")
                break

        trait = study.get("traitFromSource", "")
        diseases = study.get("diseases") or []
        disease_names = [d.get("name", "") for d in diseases if d.get("name")]

        # Parse current Open Targets schema: colocalisation.rows
        coloc_obj = row.get("colocalisation") or {}
        qtl_colocs = coloc_obj.get("rows", []) if isinstance(coloc_obj, dict) else []
        for qtl in qtl_colocs:
            h4 = qtl.get("h4")
            h3 = qtl.get("h3")
            right_study_type = str(qtl.get("rightStudyType") or "").lower()
            # Only QTL partners are relevant here; skip e.g. GWAS-GWAS pairs.
            if right_study_type and "qtl" not in right_study_type:
                continue

            other = qtl.get("otherStudyLocus") or {}
            other_study = other.get("study") or {}
            biosample = other_study.get("biosample") or {}

            # Best-effort tissue label: biosample name, then condition, then trait.
            tissue_name = (
                biosample.get("biosampleName")
                or other_study.get("condition")
                or other_study.get("traitFromSource")
                or ""
            )
            tissue_id = biosample.get("biosampleId", "")
            qtl_study = other.get("studyId") or other_study.get("id", "")
            phenotype = other.get("qtlGeneId", "")

            # log2(H4/H3): positive values favor a shared causal variant.
            log2_h4_h3 = None
            if h4 is not None and h3 not in (None, 0):
                try:
                    if float(h4) > 0 and float(h3) > 0:
                        log2_h4_h3 = math.log2(float(h4) / float(h3))
                except (TypeError, ValueError, ZeroDivisionError):
                    log2_h4_h3 = None

            if tissue_name:
                tissues_seen.add(tissue_name)
            studies_seen.add(gwas_study_id)

            coloc_results.append({
                "gwas_study_id": gwas_study_id,
                "trait": trait,
                "diseases": disease_names,
                "lead_variant": variant.get("id", ""),
                "lead_rsid": lead_rsid,
                "p_value": p_value,
                "l2g_score": round(l2g_score, 4) if l2g_score is not None else None,
                "qtl_study_id": qtl_study,
                "phenotype_id": phenotype,
                "tissue": tissue_name,
                "tissue_id": tissue_id,
                "h4": round(h4, 4) if h4 is not None else None,
                "h3": round(h3, 4) if h3 is not None else None,
                "log2_h4_h3": round(log2_h4_h3, 4) if log2_h4_h3 is not None else None,
                "colocalisation_method": qtl.get("colocalisationMethod"),
                "right_study_type": qtl.get("rightStudyType"),
                "clpp": round(qtl.get("clpp"), 4) if qtl.get("clpp") is not None else None,
            })

        # Backward compatibility with legacy schema field name used in old fixtures.
        legacy_qtls = row.get("colocalisationsQtl") or []
        for qtl in legacy_qtls:
            h4 = qtl.get("h4")
            tissue_info = qtl.get("tissue") or {}
            tissue_name = tissue_info.get("name", "")
            tissue_id = tissue_info.get("id", "")
            qtl_study = qtl.get("qtlStudyId", "")
            phenotype = qtl.get("phenotypeId", "")

            if tissue_name:
                tissues_seen.add(tissue_name)
            studies_seen.add(gwas_study_id)

            coloc_results.append({
                "gwas_study_id": gwas_study_id,
                "trait": trait,
                "diseases": disease_names,
                "lead_variant": variant.get("id", ""),
                "lead_rsid": lead_rsid,
                "p_value": p_value,
                "l2g_score": round(l2g_score, 4) if l2g_score is not None else None,
                "qtl_study_id": qtl_study,
                "phenotype_id": phenotype,
                "tissue": tissue_name,
                "tissue_id": tissue_id,
                "h4": round(h4, 4) if h4 is not None else None,
                "h3": round(qtl.get("h3", 0), 4) if qtl.get("h3") is not None else None,
                "log2_h4_h3": round(qtl.get("log2h4h3", 0), 4) if qtl.get("log2h4h3") is not None else None,
                "colocalisation_method": None,
                "right_study_type": None,
                "clpp": None,
            })

    # Sort by H4 (strongest colocalization first)
    coloc_results.sort(key=lambda x: x["h4"] if x["h4"] is not None else 0, reverse=True)

    # H4 > 0.8 is conventionally "strong" colocalization (see usage_guide).
    n_strong = sum(1 for c in coloc_results if c["h4"] is not None and c["h4"] > 0.8)
    n_moderate = sum(1 for c in coloc_results if c["h4"] is not None and 0.5 < c["h4"] <= 0.8)

    # Build summary
    study_filter_str = f" (study {study_id})" if study_id else ""
    if coloc_results:
        top_coloc = coloc_results[0]
        top_str = (
            f"Strongest: {top_coloc['trait']} / {top_coloc['tissue']} "
            f"(H4={top_coloc['h4']:.3f})" if top_coloc['h4'] is not None
            else f"Strongest: {top_coloc['trait']} / {top_coloc['tissue']}"
        )
        summary = (
            f"Colocalization for {approved_symbol}{study_filter_str}: "
            f"{len(coloc_results)} GWAS-QTL pairs across {len(tissues_seen)} tissues, "
            f"{len(studies_seen)} GWAS studies. "
            f"{n_strong} strong (H4>0.8), {n_moderate} moderate (0.5<H4<=0.8). "
            f"{top_str}"
        )
    else:
        summary = (
            f"Colocalization for {approved_symbol}{study_filter_str}: "
            f"no QTL colocalization data found ({total_loci} GWAS loci scanned)"
        )

    return {
        "summary": summary,
        "gene": approved_symbol,
        "ensembl_id": ensembl_id,
        "total_gwas_loci": total_loci,
        "n_colocalizations": len(coloc_results),
        "n_strong_coloc": n_strong,
        "n_moderate_coloc": n_moderate,
        "n_tissues": len(tissues_seen),
        "n_studies": len(studies_seen),
        "tissues": sorted(tissues_seen),
        "colocalizations": coloc_results[:50],  # Cap at 50
    }
|
|
1255
|
+
|
|
1256
|
+
|
|
1257
|
+
# ---------------------------------------------------------------------------
# Variant classification (code-gen tool)
# ---------------------------------------------------------------------------

# System-prompt template for genomics.variant_classify. It is a format
# template: {namespace_description} and {data_files_description} are filled in
# when the prompt is rendered, which is why literal braces inside the embedded
# example code are doubled ({{...}}) and newline escapes are written as \\n.
VARIANT_CLASSIFY_PROMPT = """You are an expert bioinformatics data analyst classifying and analyzing genomic variants.

{namespace_description}

## Available Data
{data_files_description}

## DATA LOADING
- **ZIP files**: Extract first with `zipfile.ZipFile(path, "r").extractall("/tmp/extracted")`
- **Excel .xls**: `pd.read_excel(path, engine='xlrd')`
- **Excel .xlsx**: `pd.read_excel(path, engine='openpyxl')`
- **VCF**: parse with pandas or cyvcf2; standard columns: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO

Always check `pd.ExcelFile(path).sheet_names` and try both `skiprows=0` and `skiprows=1`
(clinical variant files often have multi-row headers).

## DATA EXPLORATION (DO THIS FIRST)
```python
print("Columns:", df.columns.tolist())
print("Shape:", df.shape)
print("Head:\\n", df.head(3))
print("Dtypes:\\n", df.dtypes)
```

## VARIANT ANALYSIS

### VAF (Variant Allele Frequency) Column Discovery
VAF columns have many naming conventions. Search broadly:
```python
vaf_terms = ['variant allele freq', 'allele freq', 'allele frac', 'vaf',
             'tumor_f', 't_alt_freq', 'af', 'allelic fraction']
vaf_col = None
for col in df.columns:
    if any(term in str(col).lower() for term in vaf_terms):
        vaf_col = col
        break
# Fallback: find float column with values in [0, 1]
if vaf_col is None:
    for col in df.columns:
        if df[col].dtype in [float, np.float64]:
            vals = df[col].dropna()
            if len(vals) > 0 and vals.min() >= 0 and vals.max() <= 1:
                vaf_col = col
                break
```

### Effect/Consequence Annotation
Variant files often have multiple annotation columns at different granularity levels.
Always use the most granular (e.g., Sequence Ontology terms over broad "Effect" categories).
```python
effect_cols = [c for c in df.columns if any(k in str(c).lower()
               for k in ['effect', 'consequence', 'ontology', 'classification'])]
for col in effect_cols:
    print(f" {{col}}: {{sorted(df[col].dropna().unique())}}")
```

### Coding vs Noncoding Classification
**Coding** (affect protein sequence): synonymous_variant, missense_variant, frameshift_variant,
stop_gained, stop_lost, start_lost, inframe_insertion, inframe_deletion,
splice_donor_variant, splice_acceptor_variant.

**Noncoding**: intron_variant, intergenic_variant, 3_prime_UTR_variant, 5_prime_UTR_variant,
splice_region_variant, upstream_gene_variant, downstream_gene_variant.

### Ts/Tv Ratio (Transition/Transversion)
Only count SNPs using REF and the first ALT allele (`ALT.split(',')[0]`) so multi-allelic
records with SNP first-alleles are not discarded.
For raw bacterial VCFs, apply a high-confidence depth filter using the sample FORMAT depth
(`FORMAT` field DP, not INFO-level DP): keep SNPs with FORMAT/DP >= 12 before final Ts/Tv
reporting unless the question explicitly requests unfiltered raw calls.
```python
transitions = {{'AG', 'GA', 'CT', 'TC'}}
transversions = {{'AC', 'CA', 'AT', 'TA', 'GC', 'CG', 'GT', 'TG'}}
ts = tv = 0
for _, row in df.iterrows():
    ref = str(row['REF']).upper()
    alt = str(row['ALT']).split(',')[0].upper()
    if len(ref) == 1 and len(alt) == 1:
        pair = ref + alt
        if pair in transitions: ts += 1
        elif pair in transversions: tv += 1
tstv = ts / tv if tv > 0 else 0
```

### Carrier/Cohort Analysis
When analyzing multiple samples:
1. Explore directory to find all variant files and any metadata/annotation files
2. Read metadata to identify sample groups (carriers vs controls, etc.)
3. Match variant files to samples by ID patterns in filenames
4. Filter variants per sample (e.g., non-reference zygosity, VAF thresholds)

## Rules
1. Do NOT import libraries already in the namespace (pd, np, plt, sns, scipy_stats, etc.)
2. Save plots to OUTPUT_DIR: `plt.savefig(OUTPUT_DIR / "filename.png", dpi=150, bbox_inches="tight")`; `plt.close()`
3. Assign result: `result = {{"summary": "...", "answer": "PRECISE_ANSWER"}}`
4. Use print() for intermediate output to verify correctness.
5. If 0 results from a filter: print the column values and debug — do not return "N/A".

Write ONLY the Python code. No explanation, no markdown fences.
"""
|
|
1361
|
+
|
|
1362
|
+
|
|
1363
|
+
@registry.register(
    name="genomics.variant_classify",
    description=(
        "Classify and analyze genomic variants from VCF, Excel, or clinical variant files "
        "(VAF filtering, coding/noncoding classification, ClinVar annotation, carrier analysis)"
    ),
    category="genomics",
    parameters={"goal": "Variant analysis to perform"},
    usage_guide=(
        "Use for variant classification tasks: VAF filtering, Ts/Tv ratios, coding vs noncoding, "
        "CHIP analysis, carrier genotype analysis, ClinVar classification lookups. "
        "Handles multi-row Excel headers, various VAF column naming conventions. "
        "Do NOT use for GWAS, eQTL, or Mendelian randomization — use genomics.gwas_lookup for those."
    ),
)
def variant_classify(goal: str, _session=None, _prior_results=None, **kwargs) -> dict:
    """Run a variant-classification analysis via sandboxed generated code.

    Delegates to the shared code-generation engine with the
    VARIANT_CLASSIFY_PROMPT system prompt; extra keyword arguments are ignored.
    """
    # Imported lazily to avoid a module-level dependency on the code tool.
    from ct.tools.code import _generate_and_execute_code

    call_kwargs = {
        "goal": goal,
        "system_prompt_template": VARIANT_CLASSIFY_PROMPT,
        "session": _session,
        "prior_results": _prior_results,
    }
    return _generate_and_execute_code(**call_kwargs)
|