celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/tools/protein.py ADDED
@@ -0,0 +1,678 @@
1
+ """
2
+ Protein analysis tools: embedding generation, function prediction, domain annotation.
3
+
4
+ Uses ESM-2 for embeddings (optional), UniProt API for function data, and InterPro API for domains.
5
+ """
6
+
7
+ import re
8
+
9
+ from ct.tools import registry
10
+ from ct.tools.http_client import request
11
+
12
+
13
@registry.register(
    name="protein.embed",
    description="Generate protein sequence embeddings using ESM-2 (local) or ESMFold API",
    category="protein",
    parameters={
        "sequence": "Amino acid sequence (single-letter code, e.g. 'MKTL...')",
        "model": "Embedding model: 'esm2' (default) or 'esm2_small'",
    },
    usage_guide="You have a protein sequence and need a numerical representation for downstream analysis (similarity, clustering, property prediction). ESM-2 embeddings capture evolutionary and structural information. Use for comparing proteins, predicting function, or as features for ML models.",
)
def embed(sequence: str, model: str = "esm2", **kwargs) -> dict:
    """Compute summary statistics of an ESM-2 embedding for one protein sequence.

    The sequence is stripped and upper-cased, then validated (allowed letters,
    non-empty, at most 2048 residues). When ``torch`` and ``esm`` can be
    imported, the embedding is computed locally with ``esm2_t6_8M_UR50D`` for
    ``model == "esm2_small"`` and ``esm2_t33_650M_UR50D`` for any other value.

    Args:
        sequence: Amino acid sequence in single-letter code.
        model: ``"esm2_small"`` selects the 6-layer model; anything else
            selects the 33-layer model.
        **kwargs: Accepted and ignored.

    Returns:
        A dict with a ``summary`` plus shape/statistics of the per-residue and
        mean-pooled embeddings, or a dict with ``error`` and ``summary`` when
        validation fails or the optional dependencies are missing. The raw
        embedding vectors themselves are not returned.
    """
    import numpy as np

    standard_residues = set("ACDEFGHIKLMNPQRSTVWY")
    tolerated_codes = {"X", "U", "B", "Z", "O", "J"}

    sequence = sequence.strip().upper()
    n_residues = len(sequence)

    # Guard 1: anything outside the standard + tolerated alphabets is rejected.
    invalid_chars = set(sequence) - standard_residues - tolerated_codes
    if invalid_chars:
        return {
            "error": f"Invalid amino acid characters: {invalid_chars}",
            "summary": f"Sequence contains invalid characters: {invalid_chars}",
        }

    # Guard 2: nothing left after stripping.
    if not sequence:
        return {"error": "Empty sequence provided", "summary": "No sequence to embed"}

    # Guard 3: length cap.
    if n_residues > 2048:
        return {
            "error": f"Sequence too long ({n_residues} aa). Max 2048 for ESM-2 t33.",
            "summary": f"Sequence length {n_residues} exceeds limit of 2048 residues",
        }

    def _rounded(value, digits: int) -> float:
        # Convert numpy scalars to plain floats before rounding.
        return round(float(value), digits)

    try:
        import torch
        import esm

        if model == "esm2_small":
            load_pretrained, repr_layer, embed_dim = esm.pretrained.esm2_t6_8M_UR50D, 6, 320
        else:
            load_pretrained, repr_layer, embed_dim = esm.pretrained.esm2_t33_650M_UR50D, 33, 1280

        esm_model, alphabet = load_pretrained()
        esm_model.eval()

        to_batch = alphabet.get_batch_converter()
        _labels, _strs, tokens = to_batch([("protein", sequence)])

        with torch.no_grad():
            output = esm_model(tokens, repr_layers=[repr_layer], return_contacts=False)

        layer_repr = output["representations"][repr_layer]
        # Position 0 is skipped and exactly n_residues positions are kept, so
        # only the residue tokens contribute.
        per_residue = layer_repr[0, 1:n_residues + 1].numpy()
        mean_pooled = per_residue.mean(axis=0)
        residue_norms = np.linalg.norm(per_residue, axis=1)

        return {
            "summary": (
                f"ESM-2 embedding for sequence ({n_residues} aa): "
                f"{embed_dim}-dim representation generated"
            ),
            "sequence_length": n_residues,
            "embedding_dim": embed_dim,
            "model": model,
            "embedding_shape": list(per_residue.shape),
            "mean_embedding_stats": {
                "mean": _rounded(np.mean(mean_pooled), 6),
                "std": _rounded(np.std(mean_pooled), 6),
                "min": _rounded(np.min(mean_pooled), 6),
                "max": _rounded(np.max(mean_pooled), 6),
                "norm": _rounded(np.linalg.norm(mean_pooled), 4),
            },
            "per_residue_stats": {
                "mean_norm": _rounded(np.mean(residue_norms), 4),
                "shape": list(per_residue.shape),
            },
            "computed_locally": True,
        }

    except ImportError:
        # Optional dependencies are missing: report how to install them.
        return {
            "error": (
                "ESM-2 requires torch and fair-esm. Install with:\n"
                " pip install torch fair-esm\n"
                "For GPU support: pip install torch --index-url https://download.pytorch.org/whl/cu118"
            ),
            "summary": (
                f"Cannot generate embedding for {n_residues} aa sequence — "
                "torch and fair-esm not installed"
            ),
            "sequence_length": n_residues,
            "computed_locally": False,
        }
117
+
118
+
119
# UniProtKB accession format (6 or 10 characters). Used instead of a plain
# "six alphanumeric characters" check, which also matches six-letter gene
# symbols such as CDKN2A, NOTCH1 or PIK3CA.
_UNIPROT_ACCESSION_RE = re.compile(
    r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}"
)


@registry.register(
    name="protein.function_predict",
    description="Predict protein function, localization, domains, PTMs, and disease associations from UniProt",
    category="protein",
    parameters={
        "gene": "Gene symbol (e.g. BRCA1) or UniProt ID (e.g. P38398)",
        "sequence": "Amino acid sequence (optional, used for basic analysis if API fails)",
    },
    usage_guide="You need comprehensive protein function information — GO terms, subcellular location, domains, PTMs, disease associations, and tissue specificity. Use for target characterization and understanding protein biology.",
)
def function_predict(gene: str, sequence: str = None, **kwargs) -> dict:
    """Query UniProt for protein function data and flatten it into a dict.

    If ``gene`` matches the UniProtKB accession format, the entry is fetched
    directly; otherwise UniProtKB is searched for ``gene`` restricted to
    human (``organism_id:9606``) and the first hit is used.

    Args:
        gene: Gene symbol or UniProt accession. Surrounding whitespace is
            ignored; accession matching is case-insensitive.
        sequence: Accepted for interface compatibility; currently not read
            by this function.
        **kwargs: Accepted and ignored.

    Returns:
        On success, a dict with ``summary``, ``uniprot_id``, ``gene``,
        ``protein_name``, ``sequence_length``, ``function``,
        ``subcellular_locations``, ``tissue_specificity``, ``go_terms``,
        ``domains``, ``ptms`` (at most 30), ``active_sites``,
        ``binding_sites``, ``disease_associations``, ``catalytic_activity``
        and ``keywords``. On failure, a dict with ``error`` and ``summary``.
    """
    gene = gene.strip()
    accession_candidate = gene.upper()
    is_uniprot_id = _UNIPROT_ACCESSION_RE.fullmatch(accession_candidate) is not None

    if is_uniprot_id:
        resp, error = request(
            "GET",
            f"https://rest.uniprot.org/uniprotkb/{accession_candidate}.json",
            timeout=15,
            headers={"Accept": "application/json"},
            raise_for_status=False,
        )
        if error:
            return {"error": f"UniProt API error: {error}", "summary": f"Failed to query UniProt for {gene}"}
        if resp.status_code != 200:
            return {
                "error": f"UniProt entry not found for {gene} (HTTP {resp.status_code})",
                "summary": f"No UniProt entry for {gene}",
            }
        try:
            entry = resp.json()
        except Exception:
            return {"error": f"Invalid UniProt response for {gene}", "summary": f"Failed to parse UniProt response for {gene}"}
    else:
        resp, error = request(
            "GET",
            "https://rest.uniprot.org/uniprotkb/search",
            params={
                "query": f"{gene} AND organism_id:9606",
                "format": "json",
                "size": "1",
            },
            timeout=15,
            headers={"Accept": "application/json"},
            raise_for_status=False,
        )
        if error:
            return {"error": f"UniProt API error: {error}", "summary": f"Failed to query UniProt for {gene}"}
        if resp.status_code != 200:
            return {
                "error": f"UniProt search failed (HTTP {resp.status_code})",
                "summary": f"UniProt query failed for {gene}",
            }
        try:
            data = resp.json()
        except Exception:
            return {"error": f"Invalid UniProt response for {gene}", "summary": f"Failed to parse UniProt response for {gene}"}
        results = data.get("results", [])
        if not results:
            return {
                "error": f"No UniProt entry found for gene {gene} in human",
                "summary": f"Gene {gene} not found in UniProt (human)",
            }
        entry = results[0]

    # --- Basic identifiers; fall back to the caller's input when absent ---
    uniprot_id = entry.get("primaryAccession", "")
    rec_name = entry.get("proteinDescription", {}).get("recommendedName", {})
    protein_name = rec_name.get("fullName", {}).get("value", gene)

    gene_names = entry.get("genes", [])
    gene_symbol = gene_names[0].get("geneName", {}).get("value", gene) if gene_names else gene

    seq_length = entry.get("sequence", {}).get("length", 0)

    # --- Comments: function, location, tissue specificity, disease, catalysis ---
    function_text = ""
    subcellular_locations = []
    tissue_specificity = ""
    disease_associations = []
    catalytic_activity = []

    for comment in entry.get("comments", []):
        comment_type = comment.get("commentType", "")

        if comment_type == "FUNCTION":
            texts = comment.get("texts", [])
            if texts:
                function_text = texts[0].get("value", "")

        elif comment_type == "SUBCELLULAR LOCATION":
            for sl in comment.get("subcellularLocations", []):
                loc = sl.get("location", {}).get("value", "")
                if loc:
                    subcellular_locations.append(loc)

        elif comment_type == "TISSUE SPECIFICITY":
            texts = comment.get("texts", [])
            if texts:
                tissue_specificity = texts[0].get("value", "")

        elif comment_type == "DISEASE":
            disease = comment.get("disease", {})
            if disease:
                disease_associations.append({
                    "name": disease.get("diseaseId", ""),
                    "description": disease.get("description", ""),
                    "acronym": disease.get("acronym", ""),
                })

        elif comment_type == "CATALYTIC ACTIVITY":
            reaction = comment.get("reaction", {})
            if reaction:
                catalytic_activity.append(reaction.get("name", ""))

    # --- Sequence features: domains, PTMs, active and binding sites ---
    domains = []
    ptms = []
    active_sites = []
    binding_sites = []
    ptm_feature_types = ("Modified residue", "Glycosylation", "Disulfide bond", "Cross-link", "Lipidation")

    for feat in entry.get("features", []):
        ftype = feat.get("type", "")
        desc = feat.get("description", "")
        loc = feat.get("location", {})
        start = loc.get("start", {}).get("value")
        end = loc.get("end", {}).get("value")

        if ftype == "Domain":
            domains.append({"name": desc, "start": start, "end": end})
        elif ftype in ptm_feature_types:
            ptms.append({"type": ftype, "description": desc, "position": start})
        elif ftype == "Active site":
            active_sites.append({"description": desc, "position": start})
        elif ftype == "Binding site":
            binding_sites.append({"description": desc, "start": start, "end": end})

    # --- GO terms: the "GoTerm" property is prefixed with P:/F:/C: ---
    go_prefix_to_aspect = {
        "P:": "biological_process",
        "F:": "molecular_function",
        "C:": "cellular_component",
    }
    go_terms = {"biological_process": [], "molecular_function": [], "cellular_component": []}
    for xref in entry.get("uniProtKBCrossReferences", []):
        if xref.get("database") != "GO":
            continue
        go_id = xref.get("id", "")
        term_name = ""
        term_type = ""
        for prop in xref.get("properties", []):
            if prop.get("key") != "GoTerm":
                continue
            val = prop.get("value", "")
            aspect = go_prefix_to_aspect.get(val[:2])
            if aspect:
                term_type = aspect
                term_name = val[2:]
        if term_type and term_name:
            go_terms[term_type].append({"id": go_id, "name": term_name})

    keywords = [kw.get("name", "") for kw in entry.get("keywords", [])]

    # --- One-line summary ---
    location_str = ", ".join(subcellular_locations[:3]) if subcellular_locations else "Unknown"
    domain_str = f"{len(domains)} {'domain' if len(domains) == 1 else 'domains'}"
    if domains:
        domain_names = ", ".join(d["name"] for d in domains[:4])
        domain_str += f" ({domain_names})"

    disease_str = ""
    if disease_associations:
        disease_names = ", ".join(d["name"] for d in disease_associations[:3])
        disease_str = f" Associated with {disease_names}."

    func_short = function_text[:150] + "..." if len(function_text) > 150 else function_text

    summary = (
        f"{gene_symbol} ({uniprot_id}): {func_short} "
        f"{location_str}. {domain_str}.{disease_str}"
    )

    return {
        "summary": summary,
        "uniprot_id": uniprot_id,
        "gene": gene_symbol,
        "protein_name": protein_name,
        "sequence_length": seq_length,
        "function": function_text,
        "subcellular_locations": subcellular_locations,
        "tissue_specificity": tissue_specificity,
        "go_terms": go_terms,
        "domains": domains,
        "ptms": ptms[:30],
        "active_sites": active_sites,
        "binding_sites": binding_sites,
        "disease_associations": disease_associations,
        "catalytic_activity": catalytic_activity,
        "keywords": keywords,
    }
335
+
336
+
337
# InterPro entry accessions look like "IPR014044".
_INTERPRO_ACCESSION_RE = re.compile(r"IPR\d{6,}")

# Substrings that suggest the query is about a non-human organism, in which
# case the UniProt search is not restricted to human first.
_NON_HUMAN_HINTS = (
    "helminth", "parasite", "schistosoma", "fasciola", "heligmosomoides",
    "nematode", "trematode", "cestode", "worm", "brugia", "filaria",
)

_INTERPRO_SITE_TYPES = ("active_site", "binding_site", "conserved_site", "ptm")


def _looks_non_human(text: str) -> bool:
    """Return True if *text* contains one of the non-human organism hints."""
    lowered = (text or "").lower()
    return any(hint in lowered for hint in _NON_HUMAN_HINTS)


def _resolve_uniprot_accession(gene_query: str) -> tuple[str, list[str]]:
    """Search UniProtKB for *gene_query*; return (accession or "", queries tried)."""
    if _looks_non_human(gene_query):
        search_terms = [gene_query, f"{gene_query} AND reviewed:true"]
    else:
        search_terms = [f"{gene_query} AND organism_id:9606", gene_query]

    attempts: list[str] = []
    for term in search_terms:
        if term in attempts:
            continue
        attempts.append(term)
        resp, error = request(
            "GET",
            "https://rest.uniprot.org/uniprotkb/search",
            params={
                "query": term,
                "format": "json",
                "size": "1",
                "fields": "accession,gene_names",
            },
            timeout=15,
            headers={"Accept": "application/json"},
            raise_for_status=False,
        )
        if error or resp.status_code != 200:
            continue
        try:
            results = resp.json().get("results", [])
        except Exception:
            results = []
        if results:
            accession = results[0].get("primaryAccession", "")
            if accession:
                return accession, attempts
    return "", attempts


def _interpro_name(metadata: dict) -> str:
    """Return the entry name, accepting either a plain string or a name dict."""
    name = metadata.get("name", "")
    if isinstance(name, dict):
        return name.get("name") or name.get("short") or ""
    return name or ""


def _interpro_description(metadata: dict) -> str:
    """Return the first description text (max 200 chars); items may be dicts or strings."""
    description = metadata.get("description")
    if not isinstance(description, list) or not description:
        return ""
    first = description[0]
    if isinstance(first, dict):
        text = first.get("text", "")
    elif isinstance(first, str):
        text = first
    else:
        text = ""
    return (text or "")[:200]


def _interpro_locations(entry: dict) -> list[dict]:
    """Collect start/end of every fragment listed under the entry's proteins."""
    locations = []
    for protein in entry.get("proteins") or []:
        for loc_group in protein.get("entry_protein_locations") or []:
            for fragment in loc_group.get("fragments") or []:
                locations.append({
                    "start": fragment.get("start"),
                    "end": fragment.get("end"),
                })
    return locations


def _classify_interpro_entries(entries: list, with_locations: bool = False) -> dict:
    """Turn raw InterPro results into annotation dicts grouped by entry type."""
    grouped = {"domains": [], "families": [], "sites": [], "homologous_superfamilies": []}
    for entry in entries:
        metadata = entry.get("metadata") or {}
        entry_type = metadata.get("type", "")
        annotation = {
            "accession": metadata.get("accession", ""),
            "name": _interpro_name(metadata),
            "type": entry_type,
            "source_database": metadata.get("source_database", ""),
            "description": _interpro_description(metadata),
            "locations": _interpro_locations(entry) if with_locations else [],
        }
        if entry_type == "domain":
            grouped["domains"].append(annotation)
        elif entry_type == "family":
            grouped["families"].append(annotation)
        elif entry_type in _INTERPRO_SITE_TYPES:
            grouped["sites"].append(annotation)
        elif entry_type == "homologous_superfamily":
            grouped["homologous_superfamilies"].append(annotation)
    return grouped


def _interpro_keyword_search(term: str, gene: str = None) -> dict | None:
    """Free-text search of InterPro entries; return a result dict or None if nothing matched."""
    cleaned = " ".join((term or "").split())
    if not cleaned:
        return None

    endpoints = (
        "https://www.ebi.ac.uk/interpro/api/entry/interpro/",
        "https://www.ebi.ac.uk/interpro/api/entry/all/",
    )
    for endpoint in endpoints:
        resp, error = request(
            "GET",
            endpoint,
            params={"search": cleaned, "page_size": "20"},
            timeout=15,
            headers={"Accept": "application/json"},
            raise_for_status=False,
        )
        if error or resp.status_code != 200:
            continue
        try:
            data = resp.json()
        except Exception:
            continue
        results = data.get("results", []) if isinstance(data, dict) else []
        if not results:
            continue

        grouped = _classify_interpro_entries(results)
        domains = grouped["domains"]
        families = grouped["families"]
        return {
            "summary": (
                f"InterPro keyword search '{cleaned}': "
                f"{len(domains)} domains, {len(families)} families (no single UniProt mapping)."
            ),
            "gene": gene,
            "uniprot_id": None,
            "n_domains": len(domains),
            "n_families": len(families),
            "n_sites": len(grouped["sites"]),
            "domains": domains[:30],
            "families": families[:30],
            "sites": grouped["sites"][:30],
            "homologous_superfamilies": grouped["homologous_superfamilies"][:30],
            "total_annotations": len(results),
            "mode": "interpro_keyword_search",
        }
    return None


@registry.register(
    name="protein.domain_annotate",
    description="Annotate protein domains, families, and functional sites using InterPro",
    category="protein",
    parameters={
        "gene": "Gene symbol (e.g. TP53) or domain/family keyword (e.g. CAP superfamily)",
        "uniprot_id": "UniProt accession (e.g. P04637) — used directly if provided",
    },
    usage_guide="You need detailed domain architecture for a protein — domain boundaries, family classifications, active sites, binding sites. Can also search InterPro by domain/family keyword when no UniProt accession can be resolved.",
)
def domain_annotate(gene: str = None, uniprot_id: str = None, **kwargs) -> dict:
    """Annotate a protein (or look up an InterPro entry) via the InterPro API.

    Resolution order:

    1. If ``uniprot_id`` or ``gene`` is an InterPro accession (``IPR`` plus at
       least six digits), that entry is looked up directly.
    2. Otherwise, if no ``uniprot_id`` is given, ``gene`` is resolved to a
       UniProt accession through a UniProtKB search.
    3. InterPro is queried for all entries matching that protein.

    Whenever a step fails and a ``gene`` term is available, an InterPro
    keyword search on that term is tried before an error is returned.

    Args:
        gene: Gene symbol, domain/family keyword, or InterPro accession.
        uniprot_id: UniProt accession (or InterPro accession).
        **kwargs: Accepted and ignored.

    Returns:
        A dict with ``summary``, ``gene``, ``uniprot_id``, ``n_domains``,
        ``n_families``, ``n_sites``, ``domains``, ``families``, ``sites``,
        ``homologous_superfamilies`` and ``total_annotations`` (plus ``mode``
        for the keyword-search and accession-lookup paths), or a dict with
        ``error`` and ``summary`` on failure.
    """
    if not gene and not uniprot_id:
        return {
            "error": "Provide either gene symbol or uniprot_id",
            "summary": "No gene or UniProt ID specified",
        }

    if isinstance(uniprot_id, str):
        uniprot_id = uniprot_id.strip()

    def _keyword_fallback() -> dict | None:
        # Last resort shared by every failure path below.
        return _interpro_keyword_search(gene, gene) if gene else None

    # --- InterPro entry accession mode (e.g. IPR014044) ---
    interpro_accession = None
    for candidate in (uniprot_id, gene):
        if isinstance(candidate, str) and _INTERPRO_ACCESSION_RE.fullmatch(candidate.strip().upper()):
            interpro_accession = candidate.strip().upper()
            break

    if interpro_accession:
        resp, error = request(
            "GET",
            f"https://www.ebi.ac.uk/interpro/api/entry/interpro/{interpro_accession}",
            timeout=15,
            headers={"Accept": "application/json"},
            raise_for_status=False,
        )
        if error or resp.status_code != 200:
            keyword_result = _interpro_keyword_search(interpro_accession, gene)
            if keyword_result is not None:
                return keyword_result
            return {
                "error": f"InterPro entry lookup failed for {interpro_accession}",
                "summary": f"No InterPro entry found for {interpro_accession}",
            }

        try:
            data = resp.json()
        except Exception:
            data = None
        if not isinstance(data, dict):
            return {
                "error": f"Invalid InterPro response for {interpro_accession}",
                "summary": f"Failed to parse InterPro response for {interpro_accession}",
            }

        # Accept both response shapes: a {"results": [...]} listing, or a
        # single entry object carrying its own top-level "metadata".
        results = data.get("results") or []
        if not results and isinstance(data.get("metadata"), dict):
            results = [data]
        if not results:
            return {
                "error": f"No InterPro entry results for {interpro_accession}",
                "summary": f"No InterPro data for {interpro_accession}",
            }

        grouped = _classify_interpro_entries(results)
        domains = grouped["domains"]
        families = grouped["families"]
        return {
            "summary": (
                f"InterPro {interpro_accession}: {len(domains)} domains, {len(families)} families."
            ),
            "gene": gene,
            "uniprot_id": None,
            "n_domains": len(domains),
            "n_families": len(families),
            "n_sites": len(grouped["sites"]),
            "domains": domains,
            "families": families,
            "sites": grouped["sites"],
            "homologous_superfamilies": grouped["homologous_superfamilies"],
            "total_annotations": len(results),
            "mode": "interpro_accession_lookup",
        }

    # --- Resolve gene to a UniProt accession if none was supplied ---
    if not uniprot_id and gene:
        uniprot_id, attempts = _resolve_uniprot_accession(gene)

        if not uniprot_id:
            keyword_result = _keyword_fallback()
            if keyword_result is not None:
                return keyword_result
            attempted = "; ".join(attempts[:4])
            return {
                "error": f"Could not resolve gene {gene} to UniProt ID",
                "summary": f"Gene {gene} not found in UniProt search",
                "resolution_attempts": attempts,
                "attempted_query_preview": attempted,
            }

    # --- Query InterPro for every entry matching the protein ---
    resp, error = request(
        "GET",
        f"https://www.ebi.ac.uk/interpro/api/entry/all/protein/uniprot/{uniprot_id}",
        # Ask for a large page so proteins with many annotations are not
        # cut off at the API's default page size.
        params={"page_size": "200"},
        timeout=15,
        headers={"Accept": "application/json"},
        raise_for_status=False,
    )
    if error:
        keyword_result = _keyword_fallback()
        if keyword_result is not None:
            return keyword_result
        return {"error": f"InterPro API error: {error}", "summary": f"Failed to query InterPro for {uniprot_id}"}

    if resp.status_code == 204:
        # No content: the protein is known but has no entries.
        data = {"results": []}
    elif resp.status_code != 200:
        keyword_result = _keyword_fallback()
        if keyword_result is not None:
            return keyword_result
        return {
            "error": f"InterPro query failed for {uniprot_id} (HTTP {resp.status_code})",
            "summary": f"No InterPro data for {uniprot_id}",
        }
    else:
        try:
            data = resp.json()
        except Exception:
            data = None
        if not isinstance(data, dict):
            return {"error": f"Invalid InterPro response for {uniprot_id}", "summary": f"Failed to parse InterPro response for {uniprot_id}"}

    entries = data.get("results") or []
    grouped = _classify_interpro_entries(entries, with_locations=True)
    domains = grouped["domains"]
    families = grouped["families"]
    sites = grouped["sites"]
    homologous_superfamilies = grouped["homologous_superfamilies"]

    # --- Summary: domain names with the first location of each ---
    gene_label = gene or uniprot_id
    domain_strs = []
    for d in domains:
        loc_str = ""
        if d["locations"]:
            first_loc = d["locations"][0]
            loc_str = f" ({first_loc['start']}-{first_loc['end']})"
        domain_strs.append(f"{d['name']}{loc_str}")

    summary = (
        f"{gene_label}: {len(domains)} domain{'s' if len(domains) != 1 else ''}"
    )
    if domain_strs:
        summary += f" — {', '.join(domain_strs[:6])}"

    return {
        "summary": summary,
        "gene": gene,
        "uniprot_id": uniprot_id,
        "n_domains": len(domains),
        "n_families": len(families),
        "n_sites": len(sites),
        "domains": domains,
        "families": families,
        "sites": sites,
        "homologous_superfamilies": homologous_superfamilies,
        "total_annotations": len(entries),
    }