celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/tools/genomics.py ADDED
@@ -0,0 +1,1387 @@
1
+ """
2
+ Genomics tools: GWAS lookup, eQTL analysis, variant annotation, Mendelian randomization.
3
+
4
+ These are REST/GraphQL API wrappers -- no local data required.
5
+ """
6
+
7
+ import math
8
+
9
+ from ct.tools import registry
10
+ from ct.tools.http_client import request, request_json
11
+
12
+
13
@registry.register(
    name="genomics.gwas_lookup",
    description="Query the GWAS Catalog for genetic associations for a gene, optionally filtered by trait",
    category="genomics",
    parameters={
        "gene": "Gene symbol (e.g. 'BRCA1', 'TP53')",
        "trait": "Trait or disease name to filter (optional)",
        "p_threshold": "P-value threshold for significance (default 5e-8)",
    },
    requires_data=[],
    usage_guide="You want to find genome-wide significant genetic associations for a specific gene. Optionally add a trait filter to focus disease context.",
)
def gwas_lookup(gene: str = None, trait: str = None, p_threshold: float = 5e-8, **kwargs) -> dict:
    """Query the NHGRI-EBI GWAS Catalog REST API for genetic associations.

    Two-step lookup: (1) find SNPs mapped to ``gene`` via the catalog's
    ``findByGene`` search, then (2) fetch each SNP's associations using the
    ``associationBySnp`` projection (which embeds EFO traits inline), keeping
    only hits at or below ``p_threshold`` and, optionally, matching ``trait``
    as a case-insensitive substring of the EFO trait name.

    Args:
        gene: Gene symbol (e.g. 'BRCA1'). Required; coerced to str and stripped.
        trait: Optional trait/disease name filter (substring match).
        p_threshold: Significance cutoff (default 5e-8, the conventional
            genome-wide threshold). Coerced to float so string-typed tool
            arguments are tolerated.

    Returns:
        dict with 'summary', 'gene', 'trait_filter', 'p_threshold',
        'n_associations', and 'associations' (top 30, most significant
        first), or a dict with 'error' + 'summary' keys on failure.
    """
    # httpx is the transport used by ct.tools.http_client; probe for it up
    # front so the tool fails with an actionable message instead of deep
    # inside the request helpers.
    try:
        import httpx  # noqa: F401
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}

    gene = str(gene or "").strip()
    trait = str(trait or "").strip() or None
    # Fix: agent-supplied tool arguments may arrive as strings; coerce the
    # threshold once here rather than comparing float-to-str below.
    try:
        p_threshold = float(p_threshold)
    except (TypeError, ValueError):
        p_threshold = 5e-8

    if not gene:
        detail = f" (trait='{trait}')" if trait else ""
        return {
            "error": f"Missing required parameter: gene{detail}",
            "summary": "GWAS lookup requires a non-empty gene symbol (e.g., SNCA, APOE).",
            "gene": gene,
            "trait_filter": trait,
            "suggestion": (
                "First identify candidate genes (e.g., with data_api.opentargets_search), "
                "then run genomics.gwas_lookup with one gene at a time."
            ),
        }

    base = "https://www.ebi.ac.uk/gwas/rest/api"

    # Step 1: Find SNPs associated with the gene
    snp_url = f"{base}/singleNucleotidePolymorphisms/search/findByGene"
    params = {"geneName": gene, "size": 100}

    data, error = request_json(
        "GET",
        snp_url,
        params=params,
        timeout=30,
        retries=2,
    )
    if error:
        return {"error": f"GWAS Catalog query failed: {error}", "summary": f"GWAS Catalog query failed: {error}"}
    embedded = data.get("_embedded", {})
    snps = embedded.get("singleNucleotidePolymorphisms", [])

    if not snps:
        return {
            "summary": f"No GWAS associations found for gene {gene}",
            "gene": gene,
            "associations": [],
            "n_associations": 0,
        }

    # Step 2: For each SNP, fetch associations using the summary projection
    # which embeds EFO traits inline (avoids extra per-trait API calls)
    associations = []
    seen = set()

    for snp_entry in snps[:30]:  # Cap at 30 SNPs to limit API calls
        rsid = snp_entry.get("rsId", "")
        if not rsid:
            continue

        # Use the associationBySnp projection which embeds traits inline
        assoc_url = f"{base}/singleNucleotidePolymorphisms/{rsid}/associations"
        assoc_data, assoc_error = request_json(
            "GET",
            assoc_url,
            params={"projection": "associationBySnp"},
            timeout=10,
            retries=2,
        )
        if assoc_error:
            # Best-effort: skip SNPs whose association fetch failed.
            continue

        assoc_list = assoc_data.get("_embedded", {}).get("associations", [])

        for assoc in assoc_list:
            # The catalog reports p-values as mantissa/exponent pairs.
            pval_mantissa = assoc.get("pvalueMantissa")
            pval_exponent = assoc.get("pvalueExponent")
            if pval_mantissa is not None and pval_exponent is not None:
                try:
                    pval = float(pval_mantissa) * (10 ** int(pval_exponent))
                except (ValueError, TypeError):
                    pval = None
            else:
                pval = None

            # Filter by p-value threshold (unknown p-values pass through)
            if pval is not None and pval > p_threshold:
                continue

            # Extract risk allele info from loci
            loci = assoc.get("loci", [])
            risk_allele_name = ""
            if loci:
                risk_alleles = loci[0].get("strongestRiskAlleles", [])
                if risk_alleles:
                    risk_allele_name = risk_alleles[0].get("riskAlleleName", "")

            # Extract traits from embedded efoTraits (no extra API call needed)
            efo_traits = assoc.get("efoTraits", [])
            trait_names = [t.get("trait", "") for t in efo_traits if t.get("trait")]
            trait_name = "; ".join(trait_names)

            # Filter by trait if specified (associations with no trait names
            # are intentionally not excluded by the filter)
            if trait and trait_name:
                if trait.lower() not in trait_name.lower():
                    continue

            or_value = assoc.get("orPerCopyNum")
            beta = assoc.get("betaNum")
            beta_unit = assoc.get("betaUnit", "")
            beta_direction = assoc.get("betaDirection", "")

            # De-duplicate: projections can repeat identical rows
            assoc_id = f"{rsid}_{pval}_{trait_name}"
            if assoc_id in seen:
                continue
            seen.add(assoc_id)

            associations.append({
                "rsid": rsid,
                "risk_allele": risk_allele_name,
                "p_value": pval,
                # Fix: require BOTH mantissa and exponent. The old truthiness
                # check on the mantissa alone could emit e.g. '5eNone' when the
                # exponent was missing, and dropped a legitimate 0 mantissa.
                "p_value_str": (
                    f"{pval_mantissa}e{pval_exponent}"
                    if pval_mantissa is not None and pval_exponent is not None
                    else None
                ),
                "trait": trait_name,
                "or_per_copy": or_value,
                "beta": beta,
                "beta_unit": beta_unit,
                "beta_direction": beta_direction,
                "mapped_gene": gene,
            })

        # Stop early if we have enough
        if len(associations) >= 50:
            break

    # Sort by p-value (most significant first); unknown p-values sort last
    associations.sort(key=lambda x: x["p_value"] if x["p_value"] is not None else 1.0)

    trait_str = f" for trait '{trait}'" if trait else ""
    return {
        "summary": (
            f"GWAS associations for {gene}{trait_str}: "
            f"{len(associations)} genome-wide significant hits (p < {p_threshold})"
        ),
        "gene": gene,
        "trait_filter": trait,
        "p_threshold": p_threshold,
        "n_associations": len(associations),
        "associations": associations[:30],  # Return top 30
    }
171
+
172
+
173
@registry.register(
    name="genomics.eqtl_lookup",
    description="Query GTEx for expression quantitative trait loci (eQTLs) for a gene across tissues",
    category="genomics",
    parameters={
        "gene": "Gene symbol (e.g. 'BRCA1', 'TP53')",
        "tissue": "GTEx tissue name to filter (optional, e.g. 'Liver', 'Brain_Cortex')",
    },
    requires_data=[],
    usage_guide="You want to find genetic variants that regulate gene expression in specific tissues. Use to understand tissue-specific regulation, identify regulatory variants, and connect GWAS signals to gene function.",
)
def eqtl_lookup(gene: str, tissue: str = None, **kwargs) -> dict:
    """Query the GTEx API for significant eQTLs for a gene.

    Resolves ``gene`` to its GENCODE ID via the GTEx reference endpoint,
    then queries significant single-tissue eQTLs (GTEx v8), optionally
    restricted to one ``tissue``.

    Args:
        gene: Gene symbol (e.g. 'BRCA1'). Required; coerced to str and stripped.
        tissue: Optional GTEx tissueSiteDetailId (e.g. 'Liver', 'Brain_Cortex').

    Returns:
        dict with 'summary', 'gene', 'gencode_id', 'gene_description',
        'n_eqtls', 'n_tissues', 'tissues', and 'eqtls' (top 50 by |NES|),
        or a dict with 'error' + 'summary' keys on failure.
    """
    # httpx is the transport used by ct.tools.http_client; probe up front.
    try:
        import httpx  # noqa: F401
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}

    # Fix: guard against empty gene (consistent with genomics.gwas_lookup)
    # instead of issuing a doomed GTEx query.
    gene = str(gene or "").strip()
    if not gene:
        return {
            "error": "Missing required parameter: gene",
            "summary": "GTEx eQTL lookup requires a non-empty gene symbol (e.g., BRCA1, TP53).",
        }

    gtex_base = "https://gtexportal.org/api/v2"

    # Step 1: Resolve gene symbol to GENCODE ID
    gene_url = f"{gtex_base}/reference/gene"
    gene_params = {"geneId": gene}

    gene_data, error = request_json(
        "GET",
        gene_url,
        params=gene_params,
        timeout=10,
        retries=2,
    )
    if error:
        return {"error": f"GTEx gene lookup failed: {error}", "summary": f"GTEx gene lookup failed: {error}"}
    genes_list = gene_data.get("data", [])
    if not genes_list:
        # Fix: include a 'summary' key like every other error return in this
        # module so downstream consumers get a uniform tool-output shape.
        return {
            "error": f"Gene '{gene}' not found in GTEx GENCODE v26 reference",
            "summary": f"Gene '{gene}' not found in GTEx GENCODE v26 reference",
            "suggestion": "Try using the official HGNC gene symbol",
        }

    # Use the first matching gene entry
    gene_info = genes_list[0]
    gencode_id = gene_info.get("gencodeId", "")
    gene_symbol = gene_info.get("geneSymbol", gene)
    description = gene_info.get("description", "")

    if not gencode_id:
        return {"error": f"Could not resolve GENCODE ID for {gene}", "summary": f"Could not resolve GENCODE ID for {gene}"}

    # Step 2: Query significant single-tissue eQTLs
    eqtl_url = f"{gtex_base}/association/singleTissueEqtl"
    eqtl_params = {
        "gencodeId": gencode_id,
        "datasetId": "gtex_v8",
    }
    if tissue:
        eqtl_params["tissueSiteDetailId"] = tissue

    eqtl_data, error = request_json(
        "GET",
        eqtl_url,
        params=eqtl_params,
        timeout=10,
        retries=2,
    )
    if error:
        return {"error": f"GTEx eQTL query failed: {error}", "summary": f"GTEx eQTL query failed: {error}"}
    eqtls_raw = eqtl_data.get("data", [])

    if not eqtls_raw:
        tissue_str = f" in {tissue}" if tissue else ""
        return {
            "summary": f"No significant eQTLs found for {gene_symbol}{tissue_str} in GTEx v8",
            "gene": gene_symbol,
            "gencode_id": gencode_id,
            "eqtls": [],
            "n_eqtls": 0,
        }

    # Parse eQTL results
    eqtls = []
    tissues_found = set()

    for eqtl in eqtls_raw:
        tissue_id = eqtl.get("tissueSiteDetailId", "")
        tissues_found.add(tissue_id)

        eqtls.append({
            "variant_id": eqtl.get("variantId", ""),
            "snp_id": eqtl.get("snpId", ""),
            "tissue": tissue_id,
            "p_value": eqtl.get("pValue"),
            "nes": eqtl.get("nes"),  # Normalized effect size
            "chromosome": eqtl.get("chromosome", ""),
            "pos": eqtl.get("pos"),
            "gene_symbol": eqtl.get("geneSymbol", gene_symbol),
        })

    # Sort by absolute NES (largest effect first); missing NES sorts last
    eqtls.sort(key=lambda x: abs(x["nes"]) if x["nes"] is not None else 0, reverse=True)

    tissue_str = f" in {tissue}" if tissue else f" across {len(tissues_found)} tissues"
    return {
        "summary": (
            f"GTEx eQTLs for {gene_symbol} ({gencode_id}){tissue_str}: "
            f"{len(eqtls)} significant eQTLs found"
        ),
        "gene": gene_symbol,
        "gencode_id": gencode_id,
        "gene_description": description,
        "n_eqtls": len(eqtls),
        "n_tissues": len(tissues_found),
        "tissues": sorted(tissues_found),
        "eqtls": eqtls[:50],  # Return top 50 by effect size
    }
286
+
287
+
288
@registry.register(
    name="genomics.variant_annotate",
    description="Annotate a genetic variant using Ensembl VEP (Variant Effect Predictor)",
    category="genomics",
    parameters={
        "variant": "Variant identifier: rsID (e.g. 'rs1234') or HGVS notation (e.g. '17:g.41245466G>A')",
    },
    requires_data=[],
    usage_guide="You want to understand the functional consequence of a specific genetic variant. Use to get consequence type (missense, synonymous, etc.), impact prediction, amino acid changes, allele frequencies, and clinical significance.",
)
def variant_annotate(variant: str, **kwargs) -> dict:
    """Annotate a variant using the Ensembl VEP REST API.

    Accepts either an rsID ('rs...') or an HGVS string and returns the most
    severe consequence, per-transcript consequences (canonical first, then
    by impact severity), population allele frequencies, and any clinical
    significance terms from colocated known variants.

    Returns a dict with 'summary' plus structured annotation fields, or a
    dict with 'error' + 'summary' keys on failure.
    """
    # httpx is the transport used by ct.tools.http_client; probe for it up
    # front so missing-dependency failures are actionable.
    try:
        import httpx
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}
    ensembl_base = "https://rest.ensembl.org"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}

    # Determine if this is an rsID or HGVS notation; VEP exposes separate
    # endpoints for the two identifier styles.
    variant_clean = variant.strip()
    if variant_clean.lower().startswith("rs"):
        url = f"{ensembl_base}/vep/human/id/{variant_clean}"
    else:
        url = f"{ensembl_base}/vep/human/hgvs/{variant_clean}"

    # raise_for_status=False: HTTP status is inspected manually below so a
    # 400 can be translated into a format-specific error message.
    resp, error = request(
        "GET",
        url,
        headers=headers,
        timeout=30,
        retries=2,
        raise_for_status=False,
    )
    if error:
        return {"error": f"Ensembl VEP query failed: {error}", "summary": f"Ensembl VEP query failed: {error}"}
    if resp.status_code == 400:
        return {"error": f"Invalid variant format: '{variant}'. Use rsID (e.g. rs1234) or HGVS (e.g. 17:g.41245466G>A)", "summary": f"Invalid variant format: '{variant}'. Use rsID (e.g. rs1234) or HGVS (e.g. 17:g.41245466G>A)"}
    if resp.status_code >= 400:
        return {"error": f"Ensembl VEP query failed: HTTP {resp.status_code}", "summary": f"Ensembl VEP query failed: HTTP {resp.status_code}"}
    try:
        data = resp.json()
    except Exception:
        return {"error": f"Ensembl VEP query failed: invalid JSON response", "summary": f"Ensembl VEP query failed: invalid JSON response"}
    # VEP returns a JSON list; only the first result is annotated here.
    if not data or not isinstance(data, list):
        return {"error": f"No VEP results for variant {variant}", "summary": f"No VEP results for variant {variant}"}
    vep_result = data[0]

    # Extract variant identifiers and top-level location metadata
    variant_id = vep_result.get("id", variant)
    input_str = vep_result.get("input", variant)
    most_severe = vep_result.get("most_severe_consequence", "")
    allele_string = vep_result.get("allele_string", "")
    strand = vep_result.get("strand")
    assembly = vep_result.get("assembly_name", "")
    seq_region = vep_result.get("seq_region_name", "")
    start = vep_result.get("start")
    end = vep_result.get("end")

    # Extract colocated variants (for allele frequencies, clinical significance)
    colocated = vep_result.get("colocated_variants", [])
    allele_frequencies: dict = {}
    clinical_significance: list = []
    existing_ids: list = []

    for cv in colocated:
        cv_id = cv.get("id", "")
        if cv_id:
            existing_ids.append(cv_id)

        # Allele frequencies from different populations, flattened to
        # '<allele>_<population>' keys
        freqs = cv.get("frequencies", {})
        for allele, pop_freqs in freqs.items():
            for pop, freq in pop_freqs.items():
                key = f"{allele}_{pop}"
                allele_frequencies[key] = freq

        # Minor allele frequency (stored alongside the per-population keys;
        # NOTE(review): a later colocated variant overwrites an earlier one's
        # MAF here — presumably only one carries it, but unverified)
        maf = cv.get("minor_allele_freq")
        minor_allele = cv.get("minor_allele", "")
        if maf is not None:
            allele_frequencies["minor_allele"] = minor_allele
            allele_frequencies["minor_allele_freq"] = maf

        # Clinical significance terms accumulate across colocated variants
        clin_sig = cv.get("clin_sig", [])
        if clin_sig:
            clinical_significance.extend(clin_sig)

    # Extract per-transcript consequences, including SIFT/PolyPhen scores
    transcript_consequences = []
    for tc in vep_result.get("transcript_consequences", []):
        consequence_terms = tc.get("consequence_terms", [])
        transcript_consequences.append({
            "gene_id": tc.get("gene_id", ""),
            "gene_symbol": tc.get("gene_symbol", ""),
            "transcript_id": tc.get("transcript_id", ""),
            "biotype": tc.get("biotype", ""),
            "consequence_terms": consequence_terms,
            "impact": tc.get("impact", ""),
            "amino_acids": tc.get("amino_acids", ""),
            "codons": tc.get("codons", ""),
            "protein_position": tc.get("protein_position", ""),
            "sift_prediction": tc.get("sift_prediction", ""),
            "sift_score": tc.get("sift_score"),
            "polyphen_prediction": tc.get("polyphen_prediction", ""),
            "polyphen_score": tc.get("polyphen_score"),
            # VEP flags the canonical transcript with integer 1
            "canonical": tc.get("canonical", 0) == 1,
        })

    # Sort: canonical transcripts first, then by impact severity
    impact_order = {"HIGH": 0, "MODERATE": 1, "LOW": 2, "MODIFIER": 3}
    transcript_consequences.sort(
        key=lambda x: (
            0 if x["canonical"] else 1,
            impact_order.get(x["impact"], 4),  # unknown impacts sort last
        )
    )

    # Find the most impactful consequence for the summary line
    top_consequence = transcript_consequences[0] if transcript_consequences else {}
    gene_symbol = top_consequence.get("gene_symbol", "")
    impact = top_consequence.get("impact", "")
    aa_change = top_consequence.get("amino_acids", "")
    protein_pos = top_consequence.get("protein_position", "")

    # Build a protein-change label like 'p.A123T' by replacing the '/' in
    # VEP's 'A/T' amino-acid pair with the protein position
    aa_str = ""
    if aa_change and protein_pos:
        aa_str = f", p.{aa_change.replace('/', str(protein_pos))}"

    clin_str = ""
    if clinical_significance:
        unique_clin = list(set(clinical_significance))
        clin_str = f" Clinical: {', '.join(unique_clin)}."

    maf_str = ""
    maf_val = allele_frequencies.get("minor_allele_freq")
    if maf_val is not None:
        maf_str = f" MAF={maf_val:.4f} ({allele_frequencies.get('minor_allele', '')})."

    return {
        "summary": (
            f"VEP annotation for {variant_id}: {most_severe} ({impact}) "
            f"in {gene_symbol}{aa_str}.{clin_str}{maf_str}"
        ),
        "variant_id": variant_id,
        "input": input_str,
        "location": f"{seq_region}:{start}-{end}" if seq_region and start else "",
        "assembly": assembly,
        "allele_string": allele_string,
        "most_severe_consequence": most_severe,
        "existing_ids": existing_ids,
        "allele_frequencies": allele_frequencies,
        "clinical_significance": list(set(clinical_significance)),
        "transcript_consequences": transcript_consequences[:10],  # Top 10
        "n_transcript_consequences": len(transcript_consequences),
    }
445
+
446
+
447
@registry.register(
    name="genomics.mendelian_randomization_lookup",
    description="Look up Mendelian randomization and genetic evidence for a gene-disease pair via Open Targets",
    category="genomics",
    parameters={
        "gene": "Gene symbol (e.g. 'PCSK9', 'IL6R')",
        "disease": "Disease name or EFO ID (e.g. 'coronary artery disease' or 'EFO_0001645')",
    },
    requires_data=[],
    usage_guide="You want causal genetic evidence linking a gene to a disease. Use to evaluate target-disease relationships using Mendelian randomization, GWAS colocalisation, and genetic association evidence from Open Targets.",
)
def mendelian_randomization_lookup(gene: str, disease: str, **kwargs) -> dict:
    """Look up MR and genetic evidence from Open Targets Platform GraphQL API.

    Three steps: (1) resolve ``gene`` to an Ensembl ID via Open Targets
    search, (2) resolve ``disease`` to an EFO/MONDO/HP ID (unless it already
    looks like one), (3) fetch genetic-datasource evidence rows plus the
    overall target-disease association score, and summarise GWAS credible
    sets vs other genetic evidence.

    Returns a dict with 'summary' plus structured evidence fields, or a
    dict with 'error' + 'summary' keys on failure.
    """
    # httpx is the transport used by ct.tools.http_client; probe up front.
    try:
        import httpx
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}
    ot_url = "https://api.platform.opentargets.org/api/v4/graphql"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}

    # Step 1: Resolve gene symbol to Ensembl ID via Open Targets search
    search_query = """
    query searchTarget($queryString: String!) {
      search(queryString: $queryString, entityNames: ["target"], page: {size: 5, index: 0}) {
        hits {
          id
          entity
          name
          description
        }
      }
    }
    """

    search_data, error = request_json(
        "POST",
        ot_url,
        json={"query": search_query, "variables": {"queryString": gene}},
        headers=headers,
        timeout=10,
        retries=2,
    )
    if error:
        return {"error": f"Open Targets search failed: {error}", "summary": f"Open Targets search failed: {error}"}
    hits = search_data.get("data", {}).get("search", {}).get("hits", [])
    target_hits = [h for h in hits if h.get("entity") == "target"]

    if not target_hits:
        return {"error": f"Gene '{gene}' not found in Open Targets", "summary": f"Gene '{gene}' not found in Open Targets"}
    # Prefer an exact (case-insensitive) symbol match; otherwise fall back
    # to the top-ranked search hit.
    ensembl_id = None
    target_name = ""
    for hit in target_hits:
        if hit.get("name", "").upper() == gene.upper():
            ensembl_id = hit["id"]
            target_name = hit.get("name", "")
            break
    if not ensembl_id:
        ensembl_id = target_hits[0]["id"]
        target_name = target_hits[0].get("name", "")

    # Step 2: Resolve disease to EFO ID (if not already an ontology ID)
    if disease.upper().startswith("EFO_") or disease.upper().startswith("MONDO_") or disease.upper().startswith("HP_"):
        efo_id = disease
        disease_name = disease
    else:
        disease_search_query = """
        query searchDisease($queryString: String!) {
          search(queryString: $queryString, entityNames: ["disease"], page: {size: 5, index: 0}) {
            hits {
              id
              entity
              name
              description
            }
          }
        }
        """

        disease_data, error = request_json(
            "POST",
            ot_url,
            json={"query": disease_search_query, "variables": {"queryString": disease}},
            headers=headers,
            timeout=10,
            retries=2,
        )
        if error:
            return {"error": f"Open Targets disease search failed: {error}", "summary": f"Open Targets disease search failed: {error}"}
        disease_hits = disease_data.get("data", {}).get("search", {}).get("hits", [])
        disease_hits = [h for h in disease_hits if h.get("entity") == "disease"]

        if not disease_hits:
            return {"error": f"Disease '{disease}' not found in Open Targets", "summary": f"Disease '{disease}' not found in Open Targets"}
        efo_id = disease_hits[0]["id"]
        disease_name = disease_hits[0].get("name", disease)

    # Step 3: Query genetic evidence (evidences is on Target, not top-level)
    # Genetic datasources: gwas_credible_sets (L2G scores), eva, gene_burden,
    # gene2phenotype, genomics_england, uniprot_literature
    # NOTE(review): the 'BFilter' argument on associatedDiseases does not
    # match the documented Open Targets schema (which uses 'Bs' for disease
    # ID filtering) — confirm against the current GraphQL schema.
    evidence_query = """
    query targetDiseaseEvidence($ensemblId: String!, $efoId: String!) {
      target(ensemblId: $ensemblId) {
        id
        approvedSymbol
        approvedName
        associatedDiseases(BFilter: $efoId, page: {size: 1, index: 0}) {
          rows {
            score
            disease { id name }
            datasourceScores {
              id
              score
            }
          }
        }
        evidences(
          efoIds: [$efoId]
          datasourceIds: [
            "gwas_credible_sets", "gene_burden", "eva",
            "gene2phenotype", "genomics_england", "uniprot_literature"
          ]
          size: 50
        ) {
          count
          rows {
            datasourceId
            datatypeId
            score
            resourceScore
            studyId
            beta
            oddsRatio
            confidence
            studySampleSize
            publicationYear
            variantRsId
            credibleSet {
              studyLocusId
              study { id projectId studyType }
              variant { id rsIds }
              pValueMantissa
              pValueExponent
              beta
              finemappingMethod
            }
          }
        }
      }
      disease(efoId: $efoId) {
        id
        name
        description
      }
    }
    """

    result_data, error = request_json(
        "POST",
        ot_url,
        json={
            "query": evidence_query,
            "variables": {"ensemblId": ensembl_id, "efoId": efo_id},
        },
        headers=headers,
        timeout=15,
        retries=2,
    )
    if error:
        return {"error": f"Open Targets evidence query failed: {error}", "summary": f"Open Targets evidence query failed: {error}"}
    # GraphQL can return 200 with an 'errors' payload; surface those too.
    if result_data.get("errors"):
        error_msgs = [e.get("message", "") for e in result_data["errors"]]
        return {"error": f"Open Targets GraphQL errors: {'; '.join(error_msgs)}", "summary": f"Open Targets GraphQL errors: {'; '.join(error_msgs)}"}
    data = result_data.get("data", {})

    # Parse target and disease info ('or {}' guards against explicit nulls)
    target_info = data.get("target") or {}
    disease_info = data.get("disease") or {}
    approved_symbol = target_info.get("approvedSymbol", gene)
    approved_name = target_info.get("approvedName", "")
    resolved_disease = disease_info.get("name", disease_name if disease_name else disease)

    # Parse overall association score and per-datasource scores
    assoc_rows = target_info.get("associatedDiseases", {}).get("rows", [])
    overall_score = assoc_rows[0].get("score") if assoc_rows else None
    datasource_scores = {}
    if assoc_rows:
        for ds in assoc_rows[0].get("datasourceScores", []):
            datasource_scores[ds["id"]] = ds["score"]

    # Parse evidence rows
    evidences_obj = target_info.get("evidences") or {}
    evidence_count = evidences_obj.get("count", 0)
    evidence_rows = evidences_obj.get("rows", [])

    # Categorize evidence by datasource: GWAS credible sets vs everything else
    gwas_evidence = []
    other_genetic_evidence = []

    for row in evidence_rows:
        datasource = row.get("datasourceId", "")

        # Extract variant info from credibleSet if available
        credible_set = row.get("credibleSet") or {}
        variant_info = credible_set.get("variant") or {}
        study_info = credible_set.get("study") or {}
        rs_ids = variant_info.get("rsIds", [])
        variant_rsid = rs_ids[0] if rs_ids else (row.get("variantRsId") or "")

        # Compute p-value from mantissa/exponent
        p_mantissa = credible_set.get("pValueMantissa")
        p_exponent = credible_set.get("pValueExponent")
        p_value = None
        if p_mantissa is not None and p_exponent is not None:
            try:
                p_value = float(p_mantissa) * (10 ** int(p_exponent))
            except (ValueError, TypeError):
                pass

        evidence_item = {
            "datasource": datasource,
            "datatype": row.get("datatypeId", ""),
            "score": row.get("score"),
            "resource_score": row.get("resourceScore"),
            "variant_id": variant_info.get("id", ""),
            "variant_rsid": variant_rsid,
            # Prefer the credible-set study id; fall back to the row's own
            "study_id": study_info.get("id") or row.get("studyId", ""),
            "study_type": study_info.get("studyType", ""),
            "p_value": p_value,
            "beta": credible_set.get("beta") or row.get("beta"),
            "odds_ratio": row.get("oddsRatio"),
            "finemapping_method": credible_set.get("finemappingMethod", ""),
            "publication_year": row.get("publicationYear"),
        }

        if datasource == "gwas_credible_sets":
            gwas_evidence.append(evidence_item)
        else:
            other_genetic_evidence.append(evidence_item)

    # Compute summary statistics across all evidence
    all_evidence = gwas_evidence + other_genetic_evidence
    max_score = max((e["score"] for e in all_evidence if e["score"] is not None), default=None)
    n_variants = len(set(e["variant_rsid"] for e in all_evidence if e["variant_rsid"]))
    n_studies = len(set(e["study_id"] for e in all_evidence if e["study_id"]))

    # Build a human-readable summary sentence
    parts = []
    if gwas_evidence:
        parts.append(f"{len(gwas_evidence)} GWAS credible set(s)")
    if other_genetic_evidence:
        parts.append(f"{len(other_genetic_evidence)} other genetic evidence(s)")
    if not parts:
        parts.append("no genetic evidence found")

    score_str = f" Overall association: {overall_score:.3f}." if overall_score is not None else ""
    max_str = f" Max L2G score: {max_score:.3f}." if max_score is not None else ""
    variant_str = f" {n_variants} unique variant(s) across {n_studies} study(ies)." if n_variants > 0 else ""

    return {
        "summary": (
            f"Genetic evidence for {approved_symbol} -> {resolved_disease}: "
            f"{', '.join(parts)}.{score_str}{max_str}{variant_str}"
        ),
        "gene": approved_symbol,
        "gene_name": approved_name,
        "ensembl_id": ensembl_id,
        "disease": resolved_disease,
        "disease_id": efo_id,
        "overall_association_score": overall_score,
        "datasource_scores": datasource_scores,
        "total_evidence_count": evidence_count,
        "gwas_credible_sets": gwas_evidence,
        "other_genetic_evidence": other_genetic_evidence,
        "max_l2g_score": max_score,
        "n_unique_variants": n_variants,
        "n_studies": n_studies,
    }
725
+
726
+
727
@registry.register(
    name="genomics.coloc",
    description="Look up GWAS-eQTL/pQTL colocalization evidence for a gene via Open Targets Platform",
    category="genomics",
    parameters={
        "gene": "Gene symbol (e.g. 'PCSK9', 'IL6R')",
        "study_id": "Specific GWAS study ID to filter (optional)",
    },
    requires_data=[],
    usage_guide="You want to assess whether a GWAS signal and an eQTL/pQTL signal share the same "
    "causal variant at a locus — the gold standard for connecting genetic associations "
    "to gene function. High H4 posterior probability (>0.8) indicates strong colocalization. "
    "Use for target validation and causal gene assignment at GWAS loci.",
)
def coloc(gene: str, study_id: str | None = None, **kwargs) -> dict:
    """Look up colocalization evidence from Open Targets Platform GraphQL API.

    Queries the Open Targets credibleSets and colocalisations data for a gene
    target, returning GWAS-QTL colocalization information including H4 posterior
    probabilities (evidence of shared causal variant), study details, and tissues.

    Args:
        gene: Gene symbol. A small alias table (e.g. GBA1 -> GBA) is also tried
            when the primary symbol fails to resolve or query.
        study_id: Optional GWAS study ID; when given, only credible sets from
            that study are reported.
        **kwargs: Ignored; accepted for uniform registry call signatures.

    Returns:
        dict with a human-readable "summary" plus structured fields
        (colocalizations list, tissue/study counts), or an "error" payload, or a
        non-fatal "data_unavailable" payload when the gene resolved but the
        Open Targets query could not be completed.
    """
    # httpx is an optional dependency of the shared `request` helper; fail
    # early with an actionable message rather than deep inside a request.
    try:
        import httpx
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}
    ot_url = "https://api.platform.opentargets.org/api/v4/graphql"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}

    def _gene_symbol_candidates(input_gene: str) -> list[str]:
        """Return the input symbol plus any known alias, de-duplicated in order."""
        # Legacy/alias symbols mapped to the name Ensembl/Open Targets expect.
        alias_map = {
            "GBA1": "GBA",
            "PARK2": "PRKN",
        }
        token = (input_gene or "").strip()
        if not token:
            return []
        candidates = [token]
        mapped = alias_map.get(token.upper())
        if mapped:
            candidates.append(mapped)

        # Stable de-dup preserving order (case-insensitive).
        deduped = []
        seen = set()
        for c in candidates:
            k = c.upper()
            if k in seen:
                continue
            seen.add(k)
            deduped.append(c)
        return deduped

    def _resolve_ensembl_id(symbol: str) -> tuple[str | None, str | None]:
        """Resolve a human gene symbol to an Ensembl gene ID.

        Returns (ensembl_id, None) on success, (None, error_message) otherwise.
        """
        ens_resp, resolve_error = request(
            "GET",
            f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{symbol}",
            params={"content-type": "application/json"},
            timeout=10,
            retries=2,
            headers={"Content-Type": "application/json"},
            raise_for_status=False,
        )
        if resolve_error:
            return None, f"Failed to resolve {symbol} to Ensembl ID: {resolve_error}"
        if ens_resp.status_code != 200:
            return None, f"Gene {symbol} not found in Ensembl (human)"
        try:
            ens_data = ens_resp.json()
        except Exception:
            return None, f"Failed to parse Ensembl response for {symbol}"
        ensembl = ens_data.get("id", "")
        if not ensembl:
            return None, f"Gene {symbol} not found in Ensembl (human)"
        return ensembl, None

    # Step 2: Query Open Targets for credible sets with colocalization data.
    # We keep a full query and a lower-complexity fallback query because some
    # genes can hit Open Targets GraphQL complexity limits.
    query_full = """
    query geneColoc($ensemblId: String!, $size: Int!, $colocSize: Int!) {
      target(ensemblId: $ensemblId) {
        id
        approvedSymbol
        approvedName
        credibleSets(page: {index: 0, size: $size}) {
          count
          rows {
            studyLocusId
            studyId
            studyType
            study {
              id
              studyType
              traitFromSource
              diseases {
                id
                name
              }
              nSamples
            }
            variant {
              id
              rsIds
              chromosome
              position
            }
            pValueMantissa
            pValueExponent
            beta
            colocalisation(page: {index: 0, size: $colocSize}) {
              count
              rows {
                h4
                h3
                clpp
                colocalisationMethod
                rightStudyType
                betaRatioSignAverage
                numberColocalisingVariants
                otherStudyLocus {
                  studyLocusId
                  studyId
                  studyType
                  qtlGeneId
                  study {
                    id
                    traitFromSource
                    condition
                    biosample {
                      biosampleId
                      biosampleName
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
    """

    query_lean = """
    query geneColocLean($ensemblId: String!, $size: Int!, $colocSize: Int!) {
      target(ensemblId: $ensemblId) {
        id
        approvedSymbol
        approvedName
        credibleSets(page: {index: 0, size: $size}) {
          count
          rows {
            studyLocusId
            studyId
            studyType
            study {
              id
              studyType
              traitFromSource
              diseases {
                id
                name
              }
            }
            colocalisation(page: {index: 0, size: $colocSize}) {
              count
              rows {
                h4
                h3
                clpp
                colocalisationMethod
                rightStudyType
                otherStudyLocus {
                  studyLocusId
                  studyId
                  studyType
                  qtlGeneId
                  study {
                    id
                    traitFromSource
                    condition
                    biosample {
                      biosampleId
                      biosampleName
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
    """

    def _query_target_coloc(ensembl: str) -> tuple[dict | None, str | None]:
        """Run the coloc GraphQL query with complexity fallbacks.

        Tries the full query with shrinking page sizes, then the lean query,
        returning (payload, None) on the first success or (None, error) after
        all attempts fail.
        """

        def _run_query(query_text: str, page_attempts: tuple[tuple[int, int], ...]) -> tuple[dict | None, str | None]:
            # Each attempt pairs a credibleSets page size with a per-locus
            # colocalisation page size; smaller pages lower query complexity.
            last_err = None
            for size, coloc_size in page_attempts:
                resp, query_error = request(
                    "POST",
                    ot_url,
                    json={
                        "query": query_text,
                        "variables": {
                            "ensemblId": ensembl,
                            "size": size,
                            "colocSize": coloc_size,
                        },
                    },
                    headers=headers,
                    timeout=15,
                    retries=2,
                    raise_for_status=False,
                )
                if query_error:
                    last_err = f"Open Targets API error: {query_error}"
                    continue
                if resp.status_code != 200:
                    last_err = f"Open Targets API returned HTTP {resp.status_code}"
                    # Retry with smaller page sizes for likely complexity-related rejections.
                    if resp.status_code in {400, 413, 422, 429, 500, 502, 503, 504}:
                        continue
                    break

                try:
                    payload = resp.json()
                except Exception:
                    last_err = "Open Targets API returned invalid JSON"
                    continue

                gql_errors = payload.get("errors") or []
                if gql_errors:
                    msgs = "; ".join(e.get("message", "") for e in gql_errors)
                    last_err = f"Open Targets GraphQL errors: {msgs}"
                    lower = msgs.lower()
                    # Only complexity/timeout-flavored GraphQL errors are worth
                    # retrying at a smaller page size; anything else is final.
                    if any(tok in lower for tok in ("complex", "depth", "cost", "too many", "timeout")):
                        continue
                    break
                return payload, None
            return None, (last_err or "Open Targets colocalization query failed")

        # Try richer query first, then lower-complexity fallback.
        attempts = (
            ("full", query_full, ((60, 40), (30, 20), (15, 10))),
            ("lean", query_lean, ((40, 20), (20, 10), (10, 5))),
        )
        errors = []
        for label, query_text, page_attempts in attempts:
            payload, err = _run_query(query_text, page_attempts)
            if payload is not None:
                return payload, None
            if err:
                errors.append(f"{label} query: {err}")
        if errors:
            return None, "; ".join(errors)
        return None, "Open Targets colocalization query failed"

    # Try primary symbol first, then common aliases (e.g., GBA1 -> GBA) if needed.
    gene_candidates = _gene_symbol_candidates(gene)
    ensembl_id = None
    result_data = None
    target_data = None
    candidate_errors = []
    query_failures = []
    resolved_candidates = []

    for gene_candidate in gene_candidates:
        ensembl_candidate, resolve_error = _resolve_ensembl_id(gene_candidate)
        if resolve_error:
            candidate_errors.append(resolve_error)
            continue
        resolved_candidates.append((gene_candidate, ensembl_candidate))

        payload, query_error = _query_target_coloc(ensembl_candidate)
        if query_error:
            candidate_errors.append(f"{gene_candidate}: {query_error}")
            query_failures.append((gene_candidate, ensembl_candidate, query_error))
            continue

        target_candidate = (payload or {}).get("data", {}).get("target")
        if not target_candidate:
            candidate_errors.append(
                f"{gene_candidate}: Open Targets has no entry for {ensembl_candidate}"
            )
            query_failures.append(
                (gene_candidate, ensembl_candidate, f"Open Targets has no entry for {ensembl_candidate}")
            )
            continue

        # First candidate that resolves AND yields a target wins.
        ensembl_id = ensembl_candidate
        result_data = payload
        target_data = target_candidate
        break

    if not target_data:
        last_error = candidate_errors[-1] if candidate_errors else "Open Targets colocalization query failed"
        # Every candidate failed symbol resolution -> hard error.
        if candidate_errors and all("not found in Ensembl" in e for e in candidate_errors):
            return {
                "error": last_error,
                "summary": f"Gene symbol {gene} could not be resolved to an Ensembl ID",
            }
        # Resolved gene(s) but Open Targets could not return colocalization payload.
        # Return a non-fatal unavailable result so workflows can continue.
        if resolved_candidates:
            chosen_symbol, chosen_ensembl = resolved_candidates[0]
            warning = query_failures[0][2] if query_failures else last_error
            return {
                "summary": (
                    f"Colocalization for {chosen_symbol}: unavailable from Open Targets "
                    f"(query failed). Try genomics.eqtl_lookup for orthogonal evidence."
                ),
                "gene": chosen_symbol,
                "ensembl_id": chosen_ensembl,
                "total_gwas_loci": 0,
                "n_colocalizations": 0,
                "n_strong_coloc": 0,
                "n_moderate_coloc": 0,
                "n_tissues": 0,
                "n_studies": 0,
                "tissues": [],
                "colocalizations": [],
                "data_unavailable": True,
                "warning": warning,
            }
        if "GraphQL errors" in last_error:
            return {
                "error": last_error,
                "summary": f"GraphQL query errors for {gene} colocalization",
            }
        return {
            "error": last_error,
            "summary": f"Open Targets colocalization query failed for {gene}",
        }

    approved_symbol = target_data.get("approvedSymbol", gene)
    # Backward-compatibility: some mocked test fixtures still use legacy field names.
    credible_sets = target_data.get("credibleSets") or target_data.get("gwasCredibleSets") or {}
    rows = credible_sets.get("rows", []) if isinstance(credible_sets, dict) else []

    # Keep only GWAS credible sets for this tool.
    def _is_gwas(row: dict) -> bool:
        # studyType may live on the row itself or on the nested study object.
        st = (row.get("studyType") or (row.get("study") or {}).get("studyType") or "")
        return str(st).lower() == "gwas"

    if target_data.get("gwasCredibleSets") is not None:
        # Legacy fixture shape: the field is already GWAS-only.
        gwas_rows = rows
        total_loci = credible_sets.get("count", len(rows))
    else:
        gwas_rows = [row for row in rows if _is_gwas(row)]
        total_loci = len(gwas_rows)

    # Parse colocalization results
    coloc_results = []
    tissues_seen = set()
    studies_seen = set()

    for row in gwas_rows:
        study = row.get("study") or {}
        gwas_study_id = row.get("studyId") or study.get("id", "")

        # Filter by study_id if provided
        if study_id and gwas_study_id != study_id:
            continue

        variant = row.get("variant") or {}
        rs_ids = variant.get("rsIds", [])
        lead_rsid = rs_ids[0] if rs_ids else ""

        # Compute p-value from the mantissa/exponent pair Open Targets returns.
        p_mantissa = row.get("pValueMantissa")
        p_exponent = row.get("pValueExponent")
        p_value = None
        if p_mantissa is not None and p_exponent is not None:
            try:
                p_value = float(p_mantissa) * (10 ** int(p_exponent))
            except (ValueError, TypeError):
                pass

        # Extract L2G score for this gene (tolerates list or {rows: [...]} shape;
        # NOTE(review): l2GPredictions is not requested by the queries above, so
        # this path presumably only fires for mocked fixtures — confirm.
        l2g_score = None
        l2g_preds_raw = row.get("l2GPredictions") or []
        if isinstance(l2g_preds_raw, dict):
            l2g_preds = l2g_preds_raw.get("rows") or []
        else:
            l2g_preds = l2g_preds_raw
        for pred in l2g_preds:
            pred_target = pred.get("target") or {}
            if pred_target.get("id") == ensembl_id:
                l2g_score = pred.get("score")
                if l2g_score is None:
                    l2g_score = pred.get("yProbaModel")
                break

        trait = study.get("traitFromSource", "")
        diseases = study.get("diseases") or []
        disease_names = [d.get("name", "") for d in diseases if d.get("name")]

        # Parse current Open Targets schema: colocalisation.rows
        coloc_obj = row.get("colocalisation") or {}
        qtl_colocs = coloc_obj.get("rows", []) if isinstance(coloc_obj, dict) else []
        for qtl in qtl_colocs:
            h4 = qtl.get("h4")
            h3 = qtl.get("h3")
            # Keep only QTL-type partners (eQTL/pQTL/...); skip e.g. GWAS-GWAS.
            right_study_type = str(qtl.get("rightStudyType") or "").lower()
            if right_study_type and "qtl" not in right_study_type:
                continue

            other = qtl.get("otherStudyLocus") or {}
            other_study = other.get("study") or {}
            biosample = other_study.get("biosample") or {}

            # Best-available tissue label: biosample name, then condition,
            # then the QTL study trait.
            tissue_name = (
                biosample.get("biosampleName")
                or other_study.get("condition")
                or other_study.get("traitFromSource")
                or ""
            )
            tissue_id = biosample.get("biosampleId", "")
            qtl_study = other.get("studyId") or other_study.get("id", "")
            phenotype = other.get("qtlGeneId", "")

            # log2(H4/H3): evidence ratio of shared vs distinct causal variant.
            log2_h4_h3 = None
            if h4 is not None and h3 not in (None, 0):
                try:
                    if float(h4) > 0 and float(h3) > 0:
                        log2_h4_h3 = math.log2(float(h4) / float(h3))
                except (TypeError, ValueError, ZeroDivisionError):
                    log2_h4_h3 = None

            if tissue_name:
                tissues_seen.add(tissue_name)
            studies_seen.add(gwas_study_id)

            coloc_results.append({
                "gwas_study_id": gwas_study_id,
                "trait": trait,
                "diseases": disease_names,
                "lead_variant": variant.get("id", ""),
                "lead_rsid": lead_rsid,
                "p_value": p_value,
                "l2g_score": round(l2g_score, 4) if l2g_score is not None else None,
                "qtl_study_id": qtl_study,
                "phenotype_id": phenotype,
                "tissue": tissue_name,
                "tissue_id": tissue_id,
                "h4": round(h4, 4) if h4 is not None else None,
                "h3": round(h3, 4) if h3 is not None else None,
                "log2_h4_h3": round(log2_h4_h3, 4) if log2_h4_h3 is not None else None,
                "colocalisation_method": qtl.get("colocalisationMethod"),
                "right_study_type": qtl.get("rightStudyType"),
                "clpp": round(qtl.get("clpp"), 4) if qtl.get("clpp") is not None else None,
            })

        # Backward compatibility with legacy schema field name used in old fixtures.
        legacy_qtls = row.get("colocalisationsQtl") or []
        for qtl in legacy_qtls:
            h4 = qtl.get("h4")
            tissue_info = qtl.get("tissue") or {}
            tissue_name = tissue_info.get("name", "")
            tissue_id = tissue_info.get("id", "")
            qtl_study = qtl.get("qtlStudyId", "")
            phenotype = qtl.get("phenotypeId", "")

            if tissue_name:
                tissues_seen.add(tissue_name)
            studies_seen.add(gwas_study_id)

            # Fields absent from the legacy shape are reported as None.
            coloc_results.append({
                "gwas_study_id": gwas_study_id,
                "trait": trait,
                "diseases": disease_names,
                "lead_variant": variant.get("id", ""),
                "lead_rsid": lead_rsid,
                "p_value": p_value,
                "l2g_score": round(l2g_score, 4) if l2g_score is not None else None,
                "qtl_study_id": qtl_study,
                "phenotype_id": phenotype,
                "tissue": tissue_name,
                "tissue_id": tissue_id,
                "h4": round(h4, 4) if h4 is not None else None,
                "h3": round(qtl.get("h3", 0), 4) if qtl.get("h3") is not None else None,
                "log2_h4_h3": round(qtl.get("log2h4h3", 0), 4) if qtl.get("log2h4h3") is not None else None,
                "colocalisation_method": None,
                "right_study_type": None,
                "clpp": None,
            })

    # Sort by H4 (strongest colocalization first)
    coloc_results.sort(key=lambda x: x["h4"] if x["h4"] is not None else 0, reverse=True)

    # Conventional COLOC evidence tiers: H4 > 0.8 strong, 0.5 < H4 <= 0.8 moderate.
    n_strong = sum(1 for c in coloc_results if c["h4"] is not None and c["h4"] > 0.8)
    n_moderate = sum(1 for c in coloc_results if c["h4"] is not None and 0.5 < c["h4"] <= 0.8)

    # Build summary
    study_filter_str = f" (study {study_id})" if study_id else ""
    if coloc_results:
        top_coloc = coloc_results[0]
        top_str = (
            f"Strongest: {top_coloc['trait']} / {top_coloc['tissue']} "
            f"(H4={top_coloc['h4']:.3f})" if top_coloc['h4'] is not None
            else f"Strongest: {top_coloc['trait']} / {top_coloc['tissue']}"
        )
        summary = (
            f"Colocalization for {approved_symbol}{study_filter_str}: "
            f"{len(coloc_results)} GWAS-QTL pairs across {len(tissues_seen)} tissues, "
            f"{len(studies_seen)} GWAS studies. "
            f"{n_strong} strong (H4>0.8), {n_moderate} moderate (0.5<H4<=0.8). "
            f"{top_str}"
        )
    else:
        summary = (
            f"Colocalization for {approved_symbol}{study_filter_str}: "
            f"no QTL colocalization data found ({total_loci} GWAS loci scanned)"
        )

    return {
        "summary": summary,
        "gene": approved_symbol,
        "ensembl_id": ensembl_id,
        "total_gwas_loci": total_loci,
        "n_colocalizations": len(coloc_results),
        "n_strong_coloc": n_strong,
        "n_moderate_coloc": n_moderate,
        "n_tissues": len(tissues_seen),
        "n_studies": len(studies_seen),
        "tissues": sorted(tissues_seen),
        "colocalizations": coloc_results[:50],  # Cap at 50
    }
1255
+
1256
+
1257
+ # ---------------------------------------------------------------------------
1258
+ # Variant classification (code-gen tool)
1259
+ # ---------------------------------------------------------------------------
1260
+
1261
# System prompt template for the `genomics.variant_classify` code-gen tool.
# `{namespace_description}` and `{data_files_description}` appear to be filled
# in via str.format() by the code-gen executor (which is why literal braces in
# the embedded code examples are doubled as `{{...}}`) — confirm in
# ct.tools.code._generate_and_execute_code. The value is runtime prompt text;
# do not edit casually, as wording changes alter generated-code behavior.
VARIANT_CLASSIFY_PROMPT = """You are an expert bioinformatics data analyst classifying and analyzing genomic variants.

{namespace_description}

## Available Data
{data_files_description}

## DATA LOADING
- **ZIP files**: Extract first with `zipfile.ZipFile(path, "r").extractall("/tmp/extracted")`
- **Excel .xls**: `pd.read_excel(path, engine='xlrd')`
- **Excel .xlsx**: `pd.read_excel(path, engine='openpyxl')`
- **VCF**: parse with pandas or cyvcf2; standard columns: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO

Always check `pd.ExcelFile(path).sheet_names` and try both `skiprows=0` and `skiprows=1`
(clinical variant files often have multi-row headers).

## DATA EXPLORATION (DO THIS FIRST)
```python
print("Columns:", df.columns.tolist())
print("Shape:", df.shape)
print("Head:\\n", df.head(3))
print("Dtypes:\\n", df.dtypes)
```

## VARIANT ANALYSIS

### VAF (Variant Allele Frequency) Column Discovery
VAF columns have many naming conventions. Search broadly:
```python
vaf_terms = ['variant allele freq', 'allele freq', 'allele frac', 'vaf',
             'tumor_f', 't_alt_freq', 'af', 'allelic fraction']
vaf_col = None
for col in df.columns:
    if any(term in str(col).lower() for term in vaf_terms):
        vaf_col = col
        break
# Fallback: find float column with values in [0, 1]
if vaf_col is None:
    for col in df.columns:
        if df[col].dtype in [float, np.float64]:
            vals = df[col].dropna()
            if len(vals) > 0 and vals.min() >= 0 and vals.max() <= 1:
                vaf_col = col
                break
```

### Effect/Consequence Annotation
Variant files often have multiple annotation columns at different granularity levels.
Always use the most granular (e.g., Sequence Ontology terms over broad "Effect" categories).
```python
effect_cols = [c for c in df.columns if any(k in str(c).lower()
               for k in ['effect', 'consequence', 'ontology', 'classification'])]
for col in effect_cols:
    print(f" {{col}}: {{sorted(df[col].dropna().unique())}}")
```

### Coding vs Noncoding Classification
**Coding** (affect protein sequence): synonymous_variant, missense_variant, frameshift_variant,
stop_gained, stop_lost, start_lost, inframe_insertion, inframe_deletion,
splice_donor_variant, splice_acceptor_variant.

**Noncoding**: intron_variant, intergenic_variant, 3_prime_UTR_variant, 5_prime_UTR_variant,
splice_region_variant, upstream_gene_variant, downstream_gene_variant.

### Ts/Tv Ratio (Transition/Transversion)
Only count SNPs using REF and the first ALT allele (`ALT.split(',')[0]`) so multi-allelic
records with SNP first-alleles are not discarded.
For raw bacterial VCFs, apply a high-confidence depth filter using the sample FORMAT depth
(`FORMAT` field DP, not INFO-level DP): keep SNPs with FORMAT/DP >= 12 before final Ts/Tv
reporting unless the question explicitly requests unfiltered raw calls.
```python
transitions = {{'AG', 'GA', 'CT', 'TC'}}
transversions = {{'AC', 'CA', 'AT', 'TA', 'GC', 'CG', 'GT', 'TG'}}
ts = tv = 0
for _, row in df.iterrows():
    ref = str(row['REF']).upper()
    alt = str(row['ALT']).split(',')[0].upper()
    if len(ref) == 1 and len(alt) == 1:
        pair = ref + alt
        if pair in transitions: ts += 1
        elif pair in transversions: tv += 1
tstv = ts / tv if tv > 0 else 0
```

### Carrier/Cohort Analysis
When analyzing multiple samples:
1. Explore directory to find all variant files and any metadata/annotation files
2. Read metadata to identify sample groups (carriers vs controls, etc.)
3. Match variant files to samples by ID patterns in filenames
4. Filter variants per sample (e.g., non-reference zygosity, VAF thresholds)

## Rules
1. Do NOT import libraries already in the namespace (pd, np, plt, sns, scipy_stats, etc.)
2. Save plots to OUTPUT_DIR: `plt.savefig(OUTPUT_DIR / "filename.png", dpi=150, bbox_inches="tight")`; `plt.close()`
3. Assign result: `result = {{"summary": "...", "answer": "PRECISE_ANSWER"}}`
4. Use print() for intermediate output to verify correctness.
5. If 0 results from a filter: print the column values and debug — do not return "N/A".

Write ONLY the Python code. No explanation, no markdown fences.
"""
1361
+
1362
+
1363
@registry.register(
    name="genomics.variant_classify",
    description=(
        "Classify and analyze genomic variants from VCF, Excel, or clinical variant files "
        "(VAF filtering, coding/noncoding classification, ClinVar annotation, carrier analysis)"
    ),
    category="genomics",
    parameters={"goal": "Variant analysis to perform"},
    usage_guide=(
        "Use for variant classification tasks: VAF filtering, Ts/Tv ratios, coding vs noncoding, "
        "CHIP analysis, carrier genotype analysis, ClinVar classification lookups. "
        "Handles multi-row Excel headers, various VAF column naming conventions. "
        "Do NOT use for GWAS, eQTL, or Mendelian randomization — use genomics.gwas_lookup for those."
    ),
)
def variant_classify(goal: str, _session=None, _prior_results=None, **kwargs) -> dict:
    """Run sandboxed code generation to classify/analyze genomic variants.

    Delegates to the shared code-generation executor with the variant-analysis
    system prompt; any extra keyword arguments are accepted and ignored.
    """
    # Imported lazily so registering the tool never pulls in the code-gen stack.
    from ct.tools.code import _generate_and_execute_code

    run_codegen = _generate_and_execute_code
    return run_codegen(
        system_prompt_template=VARIANT_CLASSIFY_PROMPT,
        goal=goal,
        prior_results=_prior_results,
        session=_session,
    )