celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/tools/target.py ADDED
@@ -0,0 +1,901 @@
1
+ """
2
+ Target discovery tools: neosubstrate scoring, degron prediction, co-essentiality.
3
+ """
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ from ct.tools import registry
8
+ from ct.tools.http_client import request
9
+
10
+
11
@registry.register(
    name="target.neosubstrate_score",
    description="Score proteins as potential neosubstrate targets based on degradation selectivity and magnitude",
    category="target",
    parameters={"proteomics_path": "Path to proteomics LFC matrix", "top_n": "Number of top targets to return"},
    requires_data=["proteomics"],
    usage_guide="You want to discover new degradation targets from proteomics data — ranks proteins by selective, potent degradation across compounds. Use early in target discovery campaigns.",
)
def neosubstrate_score(proteomics_path: str = None, top_n: int = 50, **kwargs) -> dict:
    """Rank proteins by neosubstrate potential from a proteomics LFC matrix.

    Every row of the matrix is one protein; every column is one measurement
    (presumably one compound/condition -- confirm against the data loader).
    A value below -0.5 counts as "degraded". Each protein with at least one
    degraded value is scored as::

        selectivity * |mean of degraded values| * log2(n_degraders + 1)

    where ``selectivity = 1 - n_degraders / n_non_missing_values``, so a
    HIGHER selectivity means fewer columns degrade the protein.

    Args:
        proteomics_path: CSV file whose first column is the protein index.
            When ``None`` the matrix comes from ``ct.data.loaders.load_proteomics``.
        top_n: Maximum number of ranked proteins to return.
        **kwargs: Accepted and ignored.

    Returns:
        A dict with ``summary``, ``top_targets`` (list of per-protein records,
        best score first) and ``n_proteins_scored``. If the default dataset is
        missing, a dict with ``error`` and ``summary`` is returned instead.
    """
    # Load proteomics data
    if proteomics_path is None:
        try:
            from ct.data.loaders import load_proteomics
            prot = load_proteomics()
        except FileNotFoundError:
            return {
                "error": "Proteomics data not available.",
                "summary": "Proteomics data not available — skipping. Provide proteomics data for full analysis.",
            }
    else:
        prot = pd.read_csv(proteomics_path, index_col=0)

    # Vectorised scoring. Working positionally (NumPy arrays / boolean row
    # masks) instead of `prot.loc[label]` keeps rows with duplicated index
    # labels independent of each other.
    degraded_mask = prot < -0.5  # NaN compares False, so missing values never count
    degrader_counts = degraded_mask.sum(axis=1).to_numpy()
    hit = degrader_counts > 0

    if not hit.any():
        return {
            "summary": f"No neosubstrate candidates found in {len(prot)} proteins (none degraded below -0.5 LFC)",
            "top_targets": [],
            "n_proteins_scored": 0,
        }

    n_degraders = degrader_counts[hit]
    n_measured = prot.notna().sum(axis=1).to_numpy()[hit]
    # Mean over the degraded values only; everything else is masked to NaN.
    mean_deg = prot[hit].where(degraded_mask[hit]).mean(axis=1).to_numpy()
    # Selectivity: 1 - fraction of measurements that degrade the protein
    # (higher = more selective).
    selectivity = 1.0 - (n_degraders / n_measured)
    score = selectivity * np.abs(mean_deg) * np.log2(n_degraders + 1)

    n_scored = int(hit.sum())
    scored = pd.DataFrame({
        "protein": prot.index[hit],
        "score": score,
        "n_degraders": n_degraders,
        "mean_degradation": mean_deg,
        "selectivity": selectivity,
    })
    df = scored.sort_values("score", ascending=False).head(top_n)

    # Map UniProt IDs to gene symbols if protein IDs look like UniProt accessions.
    # Only the first three IDs are sniffed; non-string labels never qualify.
    top_proteins = df["protein"].tolist()
    looks_like_accessions = bool(top_proteins) and all(
        isinstance(p, str)
        and len(p) >= 6
        and p[0].isalpha()
        and any(c.isdigit() for c in p)
        and " " not in p
        for p in top_proteins[:3]
    )
    if looks_like_accessions:
        try:
            import httpx
            # Batch lookup via UniProt ID mapping
            ids_str = ",".join(top_proteins)
            resp = httpx.get(
                "https://rest.uniprot.org/uniprotkb/accessions",
                params={"accessions": ids_str, "fields": "accession,gene_primary"},
                headers={"Accept": "application/json"},
                timeout=15,
            )
            if resp.status_code == 200:
                id_to_gene = {}
                for entry in resp.json().get("results", []):
                    genes = entry.get("genes", [])
                    if not genes:
                        continue
                    gene_name = genes[0].get("geneName", {}).get("value", "")
                    if gene_name:
                        id_to_gene[entry.get("primaryAccession", "")] = gene_name
                if id_to_gene:
                    df["gene_symbol"] = df["protein"].map(id_to_gene)
        except Exception:
            # Deliberately best-effort: gene symbols are an optional
            # enrichment, so a missing httpx install, a network failure or an
            # unexpected payload must not discard the computed ranking.
            pass

    return {
        "summary": f"Top {len(df)} neosubstrate candidates scored from {len(prot)} proteins",
        "top_targets": df.to_dict("records"),
        "n_proteins_scored": n_scored,
    }
99
+
100
+
101
@registry.register(
    name="target.degron_predict",
    description="Predict structural degron motifs in a protein (zinc fingers, disordered regions, surface accessibility) using UniProt features",
    category="target",
    parameters={"uniprot_id": "UniProt ID of target protein (e.g. P04637 for TP53)"},
    requires_data=[],
    usage_guide="You want to assess whether a protein has structural features (zinc fingers, disordered loops) that make it amenable to E3-mediated degradation. Use after identifying a target of interest.",
)
def degron_predict(uniprot_id: str, **kwargs) -> dict:
    """Predict degron features for a target protein using UniProt feature analysis.

    Fetches the UniProtKB JSON entry for ``uniprot_id`` and combines five
    capped heuristic components into a 0-1 "degradability" score: zinc-finger
    count, disordered fraction, lysine density, annotated ubiquitination
    sites and protein size. Scores >= 0.5 are labelled "high", >= 0.25
    "moderate", anything else "low".

    Args:
        uniprot_id: UniProt accession; surrounding whitespace is ignored.
        **kwargs: Accepted and ignored.

    Returns:
        On success a dict with ``summary``, identification fields,
        ``degradability_score``, ``classification``, ``rationale``,
        ``score_breakdown`` and a ``features`` sub-dict. On failure (blank ID,
        request error, non-200 status, unparsable JSON) a dict with ``error``
        and ``summary``.
    """
    import re

    accession = str(uniprot_id or "").strip()
    if not accession:
        # Without this guard the request would go to ".../uniprotkb/.json".
        message = "No UniProt ID provided"
        return {"error": message, "summary": message}

    # Fetch protein features from UniProt API
    resp, error = request(
        "GET",
        f"https://rest.uniprot.org/uniprotkb/{accession}.json",
        timeout=30,
        headers={"Accept": "application/json"},
        raise_for_status=False,
    )
    if error:
        return {"error": f"Failed to fetch UniProt data: {error}", "summary": f"Failed to fetch UniProt data: {error}"}
    if resp.status_code != 200:
        return {"error": f"UniProt entry not found for {accession} (HTTP {resp.status_code})", "summary": f"UniProt entry not found for {accession} (HTTP {resp.status_code})"}
    try:
        data = resp.json()
    except Exception:
        return {"error": f"Invalid UniProt JSON response for {accession}", "summary": f"Invalid UniProt JSON response for {accession}"}

    protein_name = data.get("proteinDescription", {}).get("recommendedName", {}).get("fullName", {}).get("value", accession)
    gene_name = ""
    genes = data.get("genes", [])
    if genes:
        gene_name = genes[0].get("geneName", {}).get("value", "")
    sequence = data.get("sequence", {})
    seq_length = sequence.get("length") or 0

    def _description(feat):
        # A present-but-null description must behave like a missing one.
        return feat.get("description") or ""

    def _position(feat, edge):
        return feat.get("location", {}).get(edge, {}).get("value")

    def _region_length(feat):
        # Unknown (missing/null) endpoints contribute no residues.
        start = _position(feat, "start")
        end = _position(feat, "end")
        return max(0, end - start + 1) if start and end else 0

    # Extract structural features relevant to degradation
    features = data.get("features") or []
    zinc_fingers = [f for f in features if f.get("type") == "Zinc finger"]
    domains = [f for f in features if f.get("type") == "Domain"]
    disordered = [f for f in features if f.get("type") == "Region" and "Disordered" in _description(f)]
    motifs = [f for f in features if f.get("type") == "Motif"]
    modifications = [f for f in features if f.get("type") in ("Modified residue", "Cross-link")]

    disordered_residues = sum(_region_length(f) for f in disordered)
    disorder_fraction = disordered_residues / seq_length if seq_length > 0 else 0

    # Known degron-associated domain types. Whole-word matching is required:
    # a plain substring test for "ring" also fires on words such as
    # "anchoring" or "string".
    degron_pattern = re.compile(r"\b(?:zinc finger|ring|btb|wd40|kelch|socs box|f-box)\b")
    degron_domains = [
        _description(d) for d in domains if degron_pattern.search(_description(d).lower())
    ]

    # Lysine count from sequence (ubiquitination sites)
    raw_seq = sequence.get("value") or ""
    lysine_count = raw_seq.count("K")
    lysine_density = lysine_count / seq_length if seq_length > 0 else 0

    # Known ubiquitination sites from modifications
    ub_sites = [m for m in modifications if "ubiquit" in _description(m).lower()]

    # Overall degradability score (0-1 heuristic). Each component is capped
    # so that no single feature class can dominate the total.
    score_breakdown = {
        # Zinc fingers are strong degron features (CRBN/IKZF-type)
        "zinc_fingers": min(len(zinc_fingers) * 0.15, 0.3),
        # Disordered regions expose protein to E3 engagement
        "disorder": min(disorder_fraction * 0.5, 0.25),
        # Lysine density enables ubiquitination
        "lysine_accessibility": min(lysine_density * 3.0, 0.2),
        # Known ubiquitination sites
        "known_ub_sites": min(len(ub_sites) * 0.05, 0.15),
        # Small-medium proteins degrade more easily
        "protein_size": 0.1 if seq_length < 800 else 0.05 if seq_length < 1500 else 0.0,
    }
    score = min(sum(score_breakdown.values()), 1.0)

    # Classify
    if score >= 0.5:
        classification = "high"
        rationale = "Strong structural features for E3-mediated degradation"
    elif score >= 0.25:
        classification = "moderate"
        rationale = "Some favorable features; may require linker/scaffold optimization"
    else:
        classification = "low"
        rationale = "Few structural degron features identified"

    return {
        "summary": (
            f"Degron prediction for {gene_name or accession} ({protein_name}): "
            f"{classification} degradability (score={score:.2f}). "
            f"{len(zinc_fingers)} zinc finger(s), {disordered_residues} disordered residues "
            f"({disorder_fraction:.0%}), {lysine_count} lysines, {len(ub_sites)} known Ub site(s)."
        ),
        "uniprot_id": accession,
        "gene": gene_name,
        "protein_name": protein_name,
        "seq_length": seq_length,
        "degradability_score": round(score, 3),
        "classification": classification,
        "rationale": rationale,
        "score_breakdown": {k: round(v, 3) for k, v in score_breakdown.items()},
        "features": {
            "zinc_fingers": len(zinc_fingers),
            "zinc_finger_details": [
                {"description": _description(f), "start": _position(f, "start"), "end": _position(f, "end")}
                for f in zinc_fingers
            ],
            "disordered_residues": disordered_residues,
            "disorder_fraction": round(disorder_fraction, 3),
            "domains": [_description(d) for d in domains],
            "degron_associated_domains": degron_domains,
            "lysine_count": lysine_count,
            "lysine_density": round(lysine_density, 3),
            "known_ub_sites": len(ub_sites),
            "motifs": [_description(m) for m in motifs],
        },
    }
241
+
242
+
243
@registry.register(
    name="target.coessentiality",
    description="Find co-essential and synthetic lethal partners for a target gene using DepMap CRISPR data",
    category="target",
    parameters={"gene": "Gene symbol", "top_n": "Number of partners to return"},
    requires_data=["depmap_crispr"],
    usage_guide="You need to validate a drug target by finding functionally related genes, or identify synthetic lethal partners for combination therapy. Also useful for understanding pathway context of a gene.",
)
def coessentiality(gene: str, top_n: int = 20, **kwargs) -> dict:
    """Compute co-essentiality network for a gene.

    Correlates the ``gene`` column of the CRISPR matrix returned by
    ``ct.data.loaders.load_crispr`` with every other column (Pearson r over
    the rows where both columns are non-missing; at least 50 shared rows are
    required). The ``top_n`` highest correlations are reported as
    ``co_essential`` and the ``top_n`` lowest as ``synthetic_lethal``.

    Args:
        gene: Column name of the target gene in the CRISPR matrix.
        top_n: Number of partners per list; negative values are treated as 0.
        **kwargs: Accepted and ignored.

    Returns:
        A dict with ``summary``, ``gene``, ``co_essential`` and
        ``synthetic_lethal`` (lists of ``{"gene", "r", "p"}`` records), or a
        dict with ``error`` and ``summary`` when ``gene`` is not a column.
    """
    from ct.data.loaders import load_crispr

    crispr = load_crispr()

    if gene not in crispr.columns:
        return {"error": f"Gene {gene} not found in DepMap CRISPR data", "summary": f"Gene {gene} not found in DepMap CRISPR data"}

    # Imported once here rather than on every loop iteration.
    from scipy import stats

    target_vals = crispr[gene].dropna()

    correlations = []
    for other_gene, column in crispr.items():
        if other_gene == gene:
            continue
        other_vals = column.dropna()
        common = target_vals.index.intersection(other_vals.index)
        if len(common) < 50:
            continue

        r, p = stats.pearsonr(target_vals[common], other_vals[common])
        # pearsonr yields NaN when either input is constant. NaN sorts to the
        # end of the ranking, i.e. straight into the "most negative" tail, so
        # undefined correlations must be dropped before ranking.
        if not np.isfinite(r):
            continue
        correlations.append({"gene": other_gene, "r": float(r), "p": float(p)})

    if not correlations:
        return {
            "summary": f"Co-essentiality network for {gene}: no genes with sufficient shared cell lines (>=50)",
            "gene": gene,
            "co_essential": [],
            "synthetic_lethal": [],
        }

    limit = max(top_n, 0)
    ranked = pd.DataFrame(correlations).sort_values("r", ascending=False)

    co_essential = ranked.head(limit).to_dict("records")
    # Lowest correlations, reported most-negative first.
    synthetic_lethal = ranked.tail(limit).sort_values("r").to_dict("records")

    return {
        "summary": f"Co-essentiality network for {gene}: {len(correlations)} genes tested",
        "gene": gene,
        "co_essential": co_essential,
        "synthetic_lethal": synthetic_lethal,
    }
293
+
294
+
295
@registry.register(
    name="target.druggability",
    description="Assess the druggability of a protein target using UniProt annotations (protein family, domains, ligands, structural coverage)",
    category="target",
    parameters={"gene": "Gene symbol (e.g. BRAF, EGFR)"},
    requires_data=[],
    usage_guide="You want to evaluate whether a target protein is druggable — checks protein class, known ligands, structural data, and surface accessibility. Use early in target prioritization.",
)
def druggability(gene: str, **kwargs) -> dict:
    """Assess druggability of a protein target via UniProt annotations.

    Looks up the first human (organism 9606) UniProtKB hit for ``gene`` and
    sums five capped heuristic components: protein class (from keywords and
    domain descriptions), ChEMBL cross-references, surface localisation, PDB
    cross-references and presence of transmembrane features.

    Args:
        gene: Gene symbol; surrounding whitespace is ignored.
        **kwargs: Accepted and ignored.

    Returns:
        On success a dict with ``summary``, ``druggability_score`` (0-1),
        ``classification``, ``protein_class``, ``score_breakdown`` and the
        supporting annotations. On failure (blank symbol, request error,
        non-200 status, unparsable JSON, no hit) a dict with ``error`` and
        ``summary``.
    """
    symbol = str(gene or "").strip()
    if not symbol:
        # A blank symbol would produce the malformed query "gene_exact: AND ...".
        message = "No gene symbol provided"
        return {"error": message, "summary": message}

    # Query UniProt for the gene
    resp, error = request(
        "GET",
        "https://rest.uniprot.org/uniprotkb/search",
        params={
            "query": f"gene_exact:{symbol} AND organism_id:9606",
            "format": "json",
            "size": "1",
        },
        timeout=10,
        headers={"Accept": "application/json"},
        raise_for_status=False,
    )
    if error:
        return {"error": f"Failed to fetch UniProt data: {error}", "summary": f"UniProt API error for {symbol}"}
    if resp.status_code != 200:
        return {"error": f"UniProt search failed for {symbol} (HTTP {resp.status_code})", "summary": f"Failed to query UniProt for {symbol}"}
    try:
        data = resp.json()
    except Exception:
        return {"error": f"Invalid UniProt response for {symbol}", "summary": f"Failed to parse UniProt data for {symbol}"}

    results = data.get("results", [])
    if not results:
        return {"error": f"No UniProt entry found for {symbol} in human", "summary": f"Gene {symbol} not found in UniProt (human)"}

    entry = results[0]

    # Extract protein info
    protein_name = (
        entry.get("proteinDescription", {})
        .get("recommendedName", {})
        .get("fullName", {})
        .get("value", symbol)
    )
    uniprot_id = entry.get("primaryAccession", "")

    # Extract features (null descriptions/names are treated as empty strings)
    features = entry.get("features") or []
    domains = [f.get("description") or "" for f in features if f.get("type") == "Domain"]
    keywords = [kw.get("name") or "" for kw in entry.get("keywords") or []]

    # Subcellular location
    subcellular_locs = []
    for comment in entry.get("comments") or []:
        if comment.get("commentType") != "SUBCELLULAR LOCATION":
            continue
        for sl in comment.get("subcellularLocations", []):
            loc_val = sl.get("location", {}).get("value", "")
            if loc_val:
                subcellular_locs.append(loc_val)

    # Transmembrane regions
    transmembrane = [f for f in features if f.get("type") == "Transmembrane"]

    # Cross-references
    xrefs = entry.get("uniProtKBCrossReferences") or []

    # ChEMBL cross-refs are used as the signal for known small-molecule ligands.
    # NOTE(review): these IDs come straight from the UniProt cross-reference
    # list; whether they identify drugs or the ChEMBL target record should be
    # confirmed before relying on the "known_drugs" field name.
    chembl_refs = [x for x in xrefs if x.get("database") == "ChEMBL"]
    known_drugs = [x.get("id", "") for x in chembl_refs]

    # Check for PDB cross-refs (structural coverage)
    pdb_refs = [x for x in xrefs if x.get("database") == "PDB"]
    pdb_ids = [x.get("id", "") for x in pdb_refs]

    # Determine protein class from keywords and domains. Rules are tried in
    # order and the first match wins, so the more tractable classes take
    # precedence (e.g. a kinase is never reported as a plain transferase).
    class_rules = (
        ("kinase", 0.35, ("kinase",)),
        ("GPCR", 0.35, ("g-protein coupled receptor", "gpcr")),
        ("ion_channel", 0.30, ("ion channel", "voltage-gated")),
        ("nuclear_receptor", 0.30, ("nuclear hormone receptor", "nuclear receptor")),
        ("protease", 0.25, ("protease", "peptidase")),
        ("phosphatase", 0.25, ("phosphatase",)),
        ("transferase", 0.20, ("transferase",)),
        ("transcription_factor", 0.10, ("transcription",)),
        ("scaffold_adaptor", 0.05, ("scaffold", "adaptor")),
    )
    # Each annotation is matched on its own, and annotations that describe an
    # inhibitor OF a class ("Protein kinase inhibitor", "Serine protease
    # inhibitor") are skipped: they mark the protein as an inhibitor, not as
    # a member of that enzyme class.
    annotations = [
        text.lower() for text in keywords + domains if "inhibitor" not in text.lower()
    ]
    protein_class = "other"
    class_score = 0.0
    for label, weight, needles in class_rules:
        if any(needle in text for text in annotations for needle in needles):
            protein_class = label
            class_score = weight
            break

    # Score: known ligands
    ligand_score = min(len(chembl_refs) * 0.10, 0.25)

    # Score: surface accessibility (extracellular / secreted / membrane)
    surface_keywords = ("secreted", "cell membrane", "extracellular")
    is_surface = any(
        sk in loc.lower() for loc in subcellular_locs for sk in surface_keywords
    )
    surface_score = 0.15 if is_surface else 0.0

    # Score: structural coverage (PDB entries)
    structure_score = min(len(pdb_refs) * 0.02, 0.15)

    # Score: has transmembrane (often druggable for membrane targets)
    tm_score = 0.10 if transmembrane else 0.0

    total_score = min(class_score + ligand_score + surface_score + structure_score + tm_score, 1.0)

    # Reasoning
    reasoning_parts = []
    if class_score > 0:
        reasoning_parts.append(f"Protein class '{protein_class}' is a {'highly ' if class_score >= 0.30 else ''}tractable target class")
    else:
        reasoning_parts.append(f"Protein class '{protein_class}' has limited druggability precedent")
    if chembl_refs:
        reasoning_parts.append(f"{len(chembl_refs)} ChEMBL entry/entries indicate known small-molecule interactions")
    else:
        reasoning_parts.append("No ChEMBL cross-references found (no known small-molecule ligands)")
    if pdb_refs:
        reasoning_parts.append(f"{len(pdb_refs)} PDB structure(s) available for structure-based drug design")
    else:
        reasoning_parts.append("No PDB structures available")
    if is_surface:
        reasoning_parts.append("Surface-accessible / extracellular localization supports biologic targeting")
    reasoning = ". ".join(reasoning_parts) + "."

    # Classify
    if total_score >= 0.6:
        classification = "highly druggable"
    elif total_score >= 0.35:
        classification = "druggable"
    elif total_score >= 0.15:
        classification = "challenging"
    else:
        classification = "undruggable (with current modalities)"

    return {
        "summary": (
            f"Druggability assessment for {symbol} ({protein_name}): "
            f"{classification} (score={total_score:.2f}). "
            f"Class: {protein_class}. {len(pdb_ids)} PDB structures, "
            f"{len(known_drugs)} ChEMBL entries."
        ),
        "gene": symbol,
        "uniprot_id": uniprot_id,
        "protein_name": protein_name,
        "druggability_score": round(total_score, 3),
        "classification": classification,
        "protein_class": protein_class,
        "known_drugs": known_drugs,
        "structural_coverage": {
            "pdb_count": len(pdb_ids),
            "pdb_ids": pdb_ids[:20],  # Cap at 20 for readability
        },
        "surface_accessible": is_surface,
        "subcellular_locations": subcellular_locs,
        "transmembrane_regions": len(transmembrane),
        "domains": domains,
        "reasoning": reasoning,
        "score_breakdown": {
            "protein_class": round(class_score, 3),
            "known_ligands": round(ligand_score, 3),
            "surface_accessibility": round(surface_score, 3),
            "structural_data": round(structure_score, 3),
            "transmembrane": round(tm_score, 3),
        },
    }
484
+
485
+
486
@registry.register(
    name="target.expression_profile",
    description="Get tissue expression profile for a gene using GTEx Portal API and Human Protein Atlas",
    category="target",
    parameters={
        "gene": "Gene symbol (e.g. TP53, EGFR, BRCA1)",
        "top_n": "Number of top tissues to return (default 10)",
    },
    requires_data=[],
    usage_guide="You want to understand where a target is expressed — tissue specificity, cancer vs normal, and cell type expression. Critical for safety assessment and indication selection.",
)
def expression_profile(gene: str, top_n: int = 10, **kwargs) -> dict:
    """Get tissue expression profile for a gene from GTEx and Human Protein Atlas.

    Resolves gene symbol to GENCODE ID via the GTEx reference API, then
    fetches median expression per tissue from GTEx v8. Also queries HPA
    for protein-level and single-cell expression. Computes a tissue
    specificity index (tau) from the GTEx TPM values.

    Args:
        gene: Gene symbol. A small alias table and a "drop trailing 1"
            fallback are tried when the symbol itself does not resolve.
        top_n: Number of top tissues to return.
        **kwargs: Accepted and ignored.

    Returns:
        A dict with ``summary``, identifiers, ``tissue_specificity_tau``
        (``None`` when it cannot be computed) and the GTEx / HPA expression
        lists, each sorted highest first. When neither source returns
        anything, a dict with ``summary``, ``gene`` and ``error``.
    """
    def _gene_symbol_candidates(input_gene: str) -> list[str]:
        """Return the symbols to try, in order, without case-insensitive duplicates."""
        alias_map = {
            "GBA1": "GBA",
            "PARK2": "PRKN",
        }
        token = (input_gene or "").strip()
        if not token:
            return []
        candidates = [token]
        mapped = alias_map.get(token.upper())
        if mapped:
            candidates.append(mapped)
        if token.endswith("1") and len(token) > 1:
            candidates.append(token[:-1])

        deduped = []
        seen = set()
        for c in candidates:
            k = c.upper()
            if k in seen:
                continue
            seen.add(k)
            deduped.append(c)
        return deduped

    def _as_float(value, default: float = 0.0) -> float:
        """Coerce an API value (number, numeric string, None) to a finite float."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return default
        return number if np.isfinite(number) else default

    def _rows(container, key: str = "data") -> list:
        """Return ``container[key]`` as a list of dicts, or [] for any other shape."""
        rows = container.get(key) if isinstance(container, dict) else None
        if not isinstance(rows, list):
            return []
        return [row for row in rows if isinstance(row, dict)]

    # --- Step 1: Resolve gene symbol to GENCODE ID via GTEx reference API ---
    gencode_id = None
    gene_symbol = gene
    gene_candidates = _gene_symbol_candidates(gene)

    for gene_candidate in gene_candidates:
        ref_resp, ref_error = request(
            "GET",
            "https://gtexportal.org/api/v2/reference/gene",
            params={"geneId": gene_candidate},
            timeout=10,
            raise_for_status=False,
        )
        if ref_error or ref_resp is None or ref_resp.status_code != 200:
            continue
        try:
            ref_data = ref_resp.json()
        except Exception:
            continue
        genes_list = _rows(ref_data)
        if genes_list:
            gene_info = genes_list[0]
            gencode_id = gene_info.get("gencodeId", "")
            gene_symbol = gene_info.get("geneSymbol", gene_candidate)
            break

    # --- Step 2: GTEx median gene expression per tissue ---
    gtex_expression = []

    if gencode_id:
        gtex_resp, gtex_error = request(
            "GET",
            "https://gtexportal.org/api/v2/expression/medianGeneExpression",
            params={
                "gencodeId": gencode_id,
                "datasetId": "gtex_v8",
            },
            timeout=10,
            raise_for_status=False,
        )
        if not gtex_error and gtex_resp is not None and gtex_resp.status_code == 200:
            try:
                gtex_data = gtex_resp.json()
            except Exception:
                gtex_data = {}
            # Medians are coerced to float so that one null/non-numeric value
            # cannot break the sort and discard every other tissue.
            gtex_expression = [
                {
                    "tissue": row.get("tissueSiteDetailId", ""),
                    "median_tpm": _as_float(row.get("median")),
                }
                for row in _rows(gtex_data)
            ]
            gtex_expression.sort(key=lambda x: x["median_tpm"], reverse=True)

    # --- Step 3: Compute tissue specificity index (tau) ---
    # Tau ranges from 0 (ubiquitous) to 1 (tissue-specific)
    tau = None
    if gtex_expression:
        tpm_values = [t["median_tpm"] for t in gtex_expression]
        max_tpm = max(tpm_values)
        if max_tpm > 0 and len(tpm_values) > 1:
            n = len(tpm_values)
            tau = sum(1.0 - (x / max_tpm) for x in tpm_values) / (n - 1)
            tau = round(tau, 4)

    # --- Step 4: Human Protein Atlas ---
    hpa_data = {}
    ensembl_id = None

    # Try to extract Ensembl ID from GENCODE ID (strip version suffix)
    if gencode_id:
        ensembl_id = gencode_id.split(".")[0]

    # Query HPA using Ensembl ID if available, then gene aliases.
    hpa_queries = []
    if ensembl_id:
        hpa_queries.append(ensembl_id)
    if gene_symbol:
        hpa_queries.append(gene_symbol)
    hpa_queries.extend(gene_candidates)

    # Stable de-dup for query candidates.
    deduped_hpa_queries = []
    seen_hpa = set()
    for q in hpa_queries:
        key = str(q).upper()
        if key in seen_hpa:
            continue
        seen_hpa.add(key)
        deduped_hpa_queries.append(q)

    for hpa_query in deduped_hpa_queries:
        hpa_resp, hpa_error = request(
            "GET",
            f"https://www.proteinatlas.org/{hpa_query}.json",
            timeout=10,
            headers={"Accept": "application/json"},
            raise_for_status=False,
        )
        if hpa_error or hpa_resp is None or hpa_resp.status_code != 200:
            continue
        try:
            payload = hpa_resp.json()
        except Exception:
            payload = None
        # Only a non-empty JSON object is usable; lists/scalars are skipped so
        # that the `.get` lookups below cannot fail.
        if isinstance(payload, dict) and payload:
            hpa_data = payload
            break

    # RNA tissue expression
    tissue_rna = [
        {
            "tissue": row.get("Tissue", ""),
            "tpm": _as_float(row.get("TPM")),
            "ntpm": _as_float(row.get("nTPM")),
        }
        for row in _rows(hpa_data.get("RNATissue"))
    ]
    tissue_rna.sort(key=lambda x: x["tpm"], reverse=True)

    # Protein tissue expression
    tissue_protein = [
        {
            "tissue": row.get("Tissue", ""),
            "level": row.get("Level", ""),
            "cell_type": row.get("CellType", ""),
        }
        for row in _rows(hpa_data.get("ProteinTissue"))
    ]

    # Cancer expression
    cancer_expression = [
        {
            "cancer": row.get("Cancer", ""),
            "tpm": _as_float(row.get("TPM")),
            "ntpm": _as_float(row.get("nTPM")),
        }
        for row in _rows(hpa_data.get("RNACancer"))
    ]
    cancer_expression.sort(key=lambda x: x["tpm"], reverse=True)

    # Cell type expression
    cell_type_expression = [
        {
            "cell_type": row.get("CellType", ""),
            "ntpm": _as_float(row.get("nTPM")),
        }
        for row in _rows(hpa_data.get("RNASingleCell"))
    ]
    cell_type_expression.sort(key=lambda x: x["ntpm"], reverse=True)

    # --- Build response ---
    # Prefer GTEx for top tissues (quantitative TPM), fall back to HPA
    top_tissues = gtex_expression[:top_n] if gtex_expression else tissue_rna[:top_n]
    if not top_tissues and not hpa_data:
        return {
            "summary": f"No expression data found for {gene} from GTEx or Human Protein Atlas",
            "gene": gene,
            "error": "No data returned from GTEx or HPA APIs",
        }

    n_tissues = len(gtex_expression) if gtex_expression else len(tissue_rna)

    # Build summary line matching the spec format
    if gtex_expression:
        tissue_strs = [
            f"{t['tissue']} ({t['median_tpm']:.1f} TPM)" for t in gtex_expression[:top_n]
        ]
        summary = f"{gene_symbol} expression: highest in {', '.join(tissue_strs[:5])}"
    elif tissue_rna:
        tissue_strs = [
            f"{t['tissue']} ({t['tpm']:.1f} TPM)" for t in tissue_rna[:top_n]
        ]
        summary = f"{gene_symbol} expression: highest in {', '.join(tissue_strs[:5])}"
    else:
        summary = f"Expression profile for {gene_symbol}: {n_tissues} tissues profiled"

    if tau is not None:
        specificity_label = (
            "tissue-specific" if tau > 0.8
            else "tissue-enriched" if tau > 0.5
            else "broadly expressed"
        )
        summary += f". Specificity: {specificity_label} (tau={tau:.3f})"

    # RNA tissue specificity category from HPA
    rna_section = hpa_data.get("RNATissue")
    rna_specificity = rna_section.get("summary", "") if isinstance(rna_section, dict) else ""

    return {
        "summary": summary,
        "gene": gene_symbol,
        "gencode_id": gencode_id,
        "ensembl_id": ensembl_id,
        "tissue_specificity_tau": tau,
        "rna_specificity_hpa": rna_specificity,
        "gtex_expression": gtex_expression[:top_n],
        "tissue_rna_hpa": tissue_rna[:top_n],
        "tissue_protein": tissue_protein[:30],
        "cancer_expression": cancer_expression[:20],
        "cell_type_expression": cell_type_expression[:20],
        "n_tissues_profiled": n_tissues,
        "top_expressing_tissues": top_tissues[:top_n],
    }
727
+
728
+
729
@registry.register(
    name="target.disease_association",
    description="Query Open Targets Platform for disease associations of a gene target",
    category="target",
    parameters={"gene": "Gene symbol (e.g. BRAF, TP53)", "min_score": "Minimum association score (default 0.1)"},
    requires_data=[],
    usage_guide="You want to know which diseases a target is associated with — genetic evidence, drug evidence, literature support. Essential for indication selection and target validation.",
)
def disease_association(gene: str, min_score: float = 0.1, **kwargs) -> dict:
    """Query Open Targets for disease associations of a gene.

    The gene symbol is first resolved to an Ensembl gene ID through the
    Ensembl REST ``lookup/symbol`` endpoint; that ID is then used to query the
    Open Targets Platform GraphQL API for the first page (50 rows) of
    associated diseases.

    Args:
        gene: Gene symbol (e.g. ``"BRAF"``). Surrounding whitespace is ignored.
        min_score: Rows whose overall association score is below this value
            are dropped. Numeric strings are accepted; anything that cannot be
            interpreted as a number falls back to the default of 0.1.
        **kwargs: Accepted for compatibility with the tool registry; unused.

    Returns:
        On success, a dict with ``summary``, ``gene``, ``ensembl_id``,
        ``approved_name``, ``total_associations``, ``filtered_associations``,
        ``min_score`` and ``associations`` (sorted by descending overall
        score). On any failure, a dict with only ``error`` and ``summary``.
        This function reports failures through the returned dict rather than
        by raising.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    from urllib.parse import quote

    gene = str(gene or "").strip()
    if not gene:
        return {
            "error": "No gene symbol provided",
            "summary": "A gene symbol is required to query disease associations",
        }

    # Tool arguments may arrive as strings (e.g. "0.1"); a non-numeric value
    # would otherwise blow up in the score comparison further down.
    if not isinstance(min_score, (int, float)):
        try:
            min_score = float(min_score)
        except (TypeError, ValueError):
            min_score = 0.1

    # Step 1: Resolve gene symbol to Ensembl ID
    ensembl_id = None
    ens_resp, ens_error = request(
        "GET",
        # Percent-encode the symbol so characters such as "/", "?" or spaces
        # cannot alter the request path.
        f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{quote(gene, safe='')}",
        params={"content-type": "application/json"},
        timeout=10,
        headers={"Content-Type": "application/json"},
        raise_for_status=False,
    )
    if ens_error:
        return {
            "error": f"Failed to resolve {gene} to Ensembl ID: {ens_error}",
            "summary": f"Could not resolve gene symbol {gene} via Ensembl REST API",
        }
    if ens_resp.status_code == 200:
        try:
            ens_data = ens_resp.json()
        except Exception:
            # Best effort: an unparseable body is treated like "not found".
            ens_data = None
        if isinstance(ens_data, dict):
            ensembl_id = ens_data.get("id") or None

    if not ensembl_id:
        return {
            "error": f"Gene {gene} not found in Ensembl (human)",
            "summary": f"Gene symbol {gene} could not be resolved to an Ensembl ID",
        }

    # Step 2: Query Open Targets GraphQL
    query = """
    query targetDiseases($ensemblId: String!, $size: Int!) {
      target(ensemblId: $ensemblId) {
        approvedSymbol
        approvedName
        associatedDiseases(page: {index: 0, size: $size}) {
          count
          rows {
            disease {
              id
              name
            }
            score
            datasourceScores {
              id
              score
            }
          }
        }
      }
    }
    """

    ot_resp, ot_error = request(
        "POST",
        "https://api.platform.opentargets.org/api/v4/graphql",
        json={
            "query": query,
            "variables": {
                "ensemblId": ensembl_id,
                # Only the first page is fetched, so min_score filtering is
                # applied to at most 50 rows even when the total is larger.
                "size": 50,
            },
        },
        timeout=10,
        headers={"Content-Type": "application/json"},
        raise_for_status=False,
    )
    if ot_error:
        return {
            "error": f"Open Targets API error: {ot_error}",
            "summary": f"Failed to query Open Targets for {gene}",
        }
    if ot_resp.status_code != 200:
        return {
            "error": f"Open Targets API returned HTTP {ot_resp.status_code}",
            "summary": f"Open Targets query failed for {gene} ({ensembl_id})",
        }
    try:
        ot_data = ot_resp.json()
    except Exception:
        ot_data = None
    if not isinstance(ot_data, dict):
        return {
            "error": "Open Targets returned invalid JSON",
            "summary": f"Failed to parse Open Targets response for {gene}",
        }

    # GraphQL error responses carry `"data": null`; dict.get's default does
    # not apply to an explicit null, hence the `or {}` guards below.
    target_data = (ot_data.get("data") or {}).get("target")
    if not target_data:
        return {
            "error": f"No target data returned from Open Targets for {ensembl_id}",
            "summary": f"Open Targets has no entry for {gene} ({ensembl_id})",
        }

    approved_symbol = target_data.get("approvedSymbol") or gene
    approved_name = target_data.get("approvedName") or ""
    assoc_data = target_data.get("associatedDiseases") or {}
    total_count = assoc_data.get("count") or 0
    rows = assoc_data.get("rows") or []

    # Datasource ids rolled up into the "genetic_association" score.
    # NOTE(review): "gwas_credible_sets" is the id newer Open Targets releases
    # use in place of "ot_genetics_portal"; both are kept so either API
    # version is handled. Confirm against the current datasource list.
    genetic_sources = (
        "ot_genetics_portal",
        "gwas_credible_sets",
        "gene_burden",
        "genomics_england",
        "eva",
        "uniprot_variants",
    )

    # Parse associations
    associations = []
    for row in rows:
        if not isinstance(row, dict):
            continue
        overall_score = row.get("score") or 0
        if overall_score < min_score:
            continue

        disease = row.get("disease") or {}

        # Parse datasource scores into an id -> score mapping
        ds_scores = {}
        for ds in row.get("datasourceScores") or []:
            comp_id = ds.get("id") or ds.get("componentId", "")
            ds_scores[comp_id] = round(ds.get("score") or 0, 4)

        # Extract key evidence categories
        genetic_score = max(ds_scores.get(src, 0) for src in genetic_sources)
        # Known-drug evidence comes from ChEMBL only. Europe PMC is text-mined
        # literature and is reported separately below; folding it in here made
        # "known_drug" non-zero for any disease with literature support.
        drug_score = ds_scores.get("chembl", 0)
        literature_score = ds_scores.get("europepmc", 0)

        associations.append({
            "disease_id": disease.get("id") or "",
            "disease_name": disease.get("name") or "",
            "overall_score": round(overall_score, 4),
            "genetic_association": round(genetic_score, 4),
            "known_drug": round(drug_score, 4),
            "literature": round(literature_score, 4),
            "all_datasource_scores": ds_scores,
        })

    associations.sort(key=lambda x: x["overall_score"], reverse=True)

    # Build summary
    n_filtered = len(associations)
    top_diseases = ", ".join(a["disease_name"] for a in associations[:5])
    if not top_diseases:
        top_diseases = "none"

    summary = (
        f"Disease associations for {approved_symbol} ({approved_name}): "
        f"{n_filtered} diseases above score {min_score} (out of {total_count} total). "
        f"Top: {top_diseases}."
    )

    return {
        "summary": summary,
        "gene": approved_symbol,
        "ensembl_id": ensembl_id,
        "approved_name": approved_name,
        "total_associations": total_count,
        "filtered_associations": n_filtered,
        "min_score": min_score,
        "associations": associations,
    }