celltype-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/tools/protein.py
ADDED
|
@@ -0,0 +1,678 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Protein analysis tools: embedding generation, function prediction, domain annotation.
|
|
3
|
+
|
|
4
|
+
Uses ESM-2 for embeddings (optional), UniProt API for function data, and InterPro API for domains.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
from ct.tools import registry
|
|
10
|
+
from ct.tools.http_client import request
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@registry.register(
    name="protein.embed",
    description="Generate protein sequence embeddings using ESM-2 (local) or ESMFold API",
    category="protein",
    parameters={
        "sequence": "Amino acid sequence (single-letter code, e.g. 'MKTL...')",
        "model": "Embedding model: 'esm2' (default) or 'esm2_small'",
    },
    usage_guide="You have a protein sequence and need a numerical representation for downstream analysis (similarity, clustering, property prediction). ESM-2 embeddings capture evolutionary and structural information. Use for comparing proteins, predicting function, or as features for ML models.",
)
def embed(sequence: str, model: str = "esm2", **kwargs) -> dict:
    """Generate ESM-2 protein embeddings and return summary statistics.

    The sequence is normalised (all whitespace removed, upper-cased) and
    validated before any heavy import happens. If torch + fair-esm are
    installed, the embedding is computed locally with esm2_t33_650M_UR50D
    (or esm2_t6_8M_UR50D when ``model == "esm2_small"``); any other ``model``
    value falls back to the 650M model. If the imports fail, an error dict
    with install instructions is returned instead.

    Args:
        sequence: Amino acid sequence in single-letter code. Internal
            whitespace and line breaks (e.g. from a pasted FASTA body) are
            ignored.
        model: ``"esm2"`` (default) or ``"esm2_small"``.
        **kwargs: Accepted for registry compatibility; ignored.

    Returns:
        A dict that always contains ``"summary"``. On failure it also contains
        ``"error"``; on success it contains the embedding shape and statistics
        of the mean-pooled and per-residue representations (the raw vectors
        are not returned).
    """
    # Guard against None / non-string input instead of crashing on .strip().
    if sequence is None:
        return {"error": "Empty sequence provided", "summary": "No sequence to embed"}
    if not isinstance(sequence, str):
        return {
            "error": f"Sequence must be a string, got {type(sequence).__name__}",
            "summary": "Invalid sequence input",
        }

    # Drop every whitespace character (superset of .strip()) so multi-line
    # sequences are accepted, then normalise case.
    sequence = "".join(sequence.split()).upper()

    if not sequence:
        return {"error": "Empty sequence provided", "summary": "No sequence to embed"}

    # Standard residues plus the ambiguity / rare codes the original accepted.
    allowed = set("ACDEFGHIKLMNPQRSTVWY") | {"X", "U", "B", "Z", "O", "J"}
    invalid_chars = set(sequence) - allowed
    if invalid_chars:
        return {
            "error": f"Invalid amino acid characters: {invalid_chars}",
            "summary": f"Sequence contains invalid characters: {invalid_chars}",
        }

    if len(sequence) > 2048:
        return {
            "error": f"Sequence too long ({len(sequence)} aa). Max 2048 for ESM-2 t33.",
            "summary": f"Sequence length {len(sequence)} exceeds limit of 2048 residues",
        }

    # Try local ESM-2. All optional imports live inside the try so that a
    # missing dependency yields the install-instructions response.
    try:
        import numpy as np
        import torch
        import esm

        if model == "esm2_small":
            esm_model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
            repr_layer = 6
            embed_dim = 320
        else:
            esm_model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
            repr_layer = 33
            embed_dim = 1280

        esm_model.eval()
        batch_converter = alphabet.get_batch_converter()
        _labels, _strs, batch_tokens = batch_converter([("protein", sequence)])

        with torch.no_grad():
            results = esm_model(batch_tokens, repr_layers=[repr_layer], return_contacts=False)

        token_repr = results["representations"][repr_layer]
        # Skip the leading special token and keep exactly len(sequence)
        # positions, which also leaves out the trailing special token.
        per_residue = token_repr[0, 1:len(sequence) + 1].numpy()
        mean_pooled = per_residue.mean(axis=0)

        return {
            "summary": (
                f"ESM-2 embedding for sequence ({len(sequence)} aa): "
                f"{embed_dim}-dim representation generated"
            ),
            "sequence_length": len(sequence),
            "embedding_dim": embed_dim,
            "model": model,
            "embedding_shape": list(per_residue.shape),
            "mean_embedding_stats": {
                "mean": round(float(np.mean(mean_pooled)), 6),
                "std": round(float(np.std(mean_pooled)), 6),
                "min": round(float(np.min(mean_pooled)), 6),
                "max": round(float(np.max(mean_pooled)), 6),
                "norm": round(float(np.linalg.norm(mean_pooled)), 4),
            },
            "per_residue_stats": {
                "mean_norm": round(float(np.mean(np.linalg.norm(per_residue, axis=1))), 4),
                "shape": list(per_residue.shape),
            },
            "computed_locally": True,
        }

    except ImportError:
        return {
            "error": (
                "ESM-2 requires torch and fair-esm. Install with:\n"
                "  pip install torch fair-esm\n"
                "For GPU support: pip install torch --index-url https://download.pytorch.org/whl/cu118"
            ),
            "summary": (
                f"Cannot generate embedding for {len(sequence)} aa sequence — "
                "torch and fair-esm not installed"
            ),
            "sequence_length": len(sequence),
            "computed_locally": False,
        }
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# UniProtKB accession format (6 or 10 characters), as published by UniProt.
_UNIPROT_ACCESSION_RE = re.compile(
    r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}"
)


@registry.register(
    name="protein.function_predict",
    description="Predict protein function, localization, domains, PTMs, and disease associations from UniProt",
    category="protein",
    parameters={
        "gene": "Gene symbol (e.g. BRCA1) or UniProt ID (e.g. P38398)",
        "sequence": "Amino acid sequence (optional, used for basic analysis if API fails)",
    },
    usage_guide="You need comprehensive protein function information — GO terms, subcellular location, domains, PTMs, disease associations, and tissue specificity. Use for target characterization and understanding protein biology.",
)
def function_predict(gene: str, sequence: str = None, **kwargs) -> dict:
    """Query UniProt for comprehensive protein function data.

    Input that matches the UniProtKB accession format is fetched directly;
    if that lookup does not return HTTP 200 (some gene symbols, e.g. histone
    genes, are accession-shaped) the function falls back to a human
    (organism_id 9606) search, which is also used for ordinary gene symbols.
    From the entry it extracts the function description, subcellular
    location, tissue specificity, disease associations, catalytic activity,
    sequence features (domains, PTMs, active/binding sites), GO terms and
    keywords.

    Args:
        gene: Gene symbol or UniProt accession.
        sequence: Currently unused by this function; kept for interface
            compatibility.
        **kwargs: Accepted for registry compatibility; ignored.

    Returns:
        A dict with a ``"summary"`` string. On failure it carries ``"error"``;
        on success it carries the extracted annotation fields.
    """
    if not isinstance(gene, str) or not gene.strip():
        return {
            "error": "No gene symbol or UniProt ID provided",
            "summary": "No gene or UniProt ID specified",
        }
    gene = gene.strip()

    entry = None

    # A strict accession pattern avoids treating 6-character gene symbols
    # (NOTCH1, PIK3CA, MAP2K1, ...) as UniProt IDs.
    accession_candidate = gene.upper()
    if _UNIPROT_ACCESSION_RE.fullmatch(accession_candidate):
        resp, error = request(
            "GET",
            f"https://rest.uniprot.org/uniprotkb/{accession_candidate}.json",
            timeout=15,
            headers={"Accept": "application/json"},
            raise_for_status=False,
        )
        if error:
            return {"error": f"UniProt API error: {error}", "summary": f"Failed to query UniProt for {gene}"}
        if resp.status_code == 200:
            try:
                entry = resp.json()
            except Exception:
                return {"error": f"Invalid UniProt response for {gene}", "summary": f"Failed to parse UniProt response for {gene}"}
        # Any other status: fall through to the gene search below.

    if entry is None:
        resp, error = request(
            "GET",
            "https://rest.uniprot.org/uniprotkb/search",
            params={
                "query": f"{gene} AND organism_id:9606",
                "format": "json",
                "size": "1",
            },
            timeout=15,
            headers={"Accept": "application/json"},
            raise_for_status=False,
        )
        if error:
            return {"error": f"UniProt API error: {error}", "summary": f"Failed to query UniProt for {gene}"}
        if resp.status_code != 200:
            return {
                "error": f"UniProt search failed (HTTP {resp.status_code})",
                "summary": f"UniProt query failed for {gene}",
            }
        try:
            data = resp.json()
        except Exception:
            return {"error": f"Invalid UniProt response for {gene}", "summary": f"Failed to parse UniProt response for {gene}"}
        results = data.get("results", [])
        if not results:
            return {
                "error": f"No UniProt entry found for gene {gene} in human",
                "summary": f"Gene {gene} not found in UniProt (human)",
            }
        entry = results[0]

    # Basic identifiers
    uniprot_id = entry.get("primaryAccession", "")
    protein_desc = entry.get("proteinDescription", {})
    rec_name = protein_desc.get("recommendedName", {})
    protein_name = rec_name.get("fullName", {}).get("value", gene)

    gene_names = entry.get("genes", [])
    gene_symbol = gene_names[0].get("geneName", {}).get("value", gene) if gene_names else gene

    seq_length = entry.get("sequence", {}).get("length", 0)

    # Comments (function, location, tissue specificity, disease, catalysis)
    function_text = ""
    subcellular_locations = []
    tissue_specificity = ""
    disease_associations = []
    catalytic_activity = []

    for comment in entry.get("comments", []):
        comment_type = comment.get("commentType", "")

        if comment_type == "FUNCTION":
            texts = comment.get("texts", [])
            # Keep the first FUNCTION comment; later ones must not overwrite it.
            if texts and not function_text:
                function_text = texts[0].get("value", "")

        elif comment_type == "SUBCELLULAR LOCATION":
            for sl in comment.get("subcellularLocations", []):
                loc = sl.get("location", {}).get("value", "")
                if loc:
                    subcellular_locations.append(loc)

        elif comment_type == "TISSUE SPECIFICITY":
            texts = comment.get("texts", [])
            if texts and not tissue_specificity:
                tissue_specificity = texts[0].get("value", "")

        elif comment_type == "DISEASE":
            disease = comment.get("disease", {})
            if disease:
                disease_associations.append({
                    "name": disease.get("diseaseId", ""),
                    "description": disease.get("description", ""),
                    "acronym": disease.get("acronym", ""),
                })

        elif comment_type == "CATALYTIC ACTIVITY":
            reaction = comment.get("reaction", {})
            if reaction:
                catalytic_activity.append(reaction.get("name", ""))

    # Sequence features
    domains = []
    ptms = []
    active_sites = []
    binding_sites = []
    ptm_types = ("Modified residue", "Glycosylation", "Disulfide bond", "Cross-link", "Lipidation")

    for feat in entry.get("features", []):
        ftype = feat.get("type", "")
        desc = feat.get("description", "")
        loc = feat.get("location", {})
        start = loc.get("start", {}).get("value")
        end = loc.get("end", {}).get("value")

        if ftype == "Domain":
            domains.append({"name": desc, "start": start, "end": end})
        elif ftype in ptm_types:
            ptms.append({"type": ftype, "description": desc, "position": start})
        elif ftype == "Active site":
            active_sites.append({"description": desc, "position": start})
        elif ftype == "Binding site":
            binding_sites.append({"description": desc, "start": start, "end": end})

    # GO terms from cross-references; the "GoTerm" property value is
    # prefixed with P:, F: or C: to indicate the GO aspect.
    go_aspects = {
        "P:": "biological_process",
        "F:": "molecular_function",
        "C:": "cellular_component",
    }
    go_terms = {"biological_process": [], "molecular_function": [], "cellular_component": []}
    for xref in entry.get("uniProtKBCrossReferences", []):
        if xref.get("database") != "GO":
            continue
        go_id = xref.get("id", "")
        term_name = ""
        term_type = ""
        for prop in xref.get("properties", []):
            if prop.get("key") != "GoTerm":
                continue
            val = prop.get("value", "")
            aspect = go_aspects.get(val[:2])
            if aspect:
                term_type = aspect
                term_name = val[2:]
        if term_type and term_name:
            go_terms[term_type].append({"id": go_id, "name": term_name})

    keywords = [kw.get("name", "") for kw in entry.get("keywords", [])]

    # Build summary
    location_str = ", ".join(subcellular_locations[:3]) if subcellular_locations else "Unknown"
    domain_str = f"{len(domains)} {'domain' if len(domains) == 1 else 'domains'}"
    if domains:
        domain_names = ", ".join(d["name"] for d in domains[:4])
        domain_str += f" ({domain_names})"

    disease_str = ""
    if disease_associations:
        disease_names = ", ".join(d["name"] for d in disease_associations[:3])
        disease_str = f" Associated with {disease_names}."

    func_short = function_text[:150] + "..." if len(function_text) > 150 else function_text

    summary = (
        f"{gene_symbol} ({uniprot_id}): {func_short} "
        f"{location_str}. {domain_str}.{disease_str}"
    )

    return {
        "summary": summary,
        "uniprot_id": uniprot_id,
        "gene": gene_symbol,
        "protein_name": protein_name,
        "sequence_length": seq_length,
        "function": function_text,
        "subcellular_locations": subcellular_locations,
        "tissue_specificity": tissue_specificity,
        "go_terms": go_terms,
        "domains": domains,
        "ptms": ptms[:30],
        "active_sites": active_sites,
        "binding_sites": binding_sites,
        "disease_associations": disease_associations,
        "catalytic_activity": catalytic_activity,
        "keywords": keywords,
    }
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
@registry.register(
    name="protein.domain_annotate",
    description="Annotate protein domains, families, and functional sites using InterPro",
    category="protein",
    parameters={
        "gene": "Gene symbol (e.g. TP53) or domain/family keyword (e.g. CAP superfamily)",
        "uniprot_id": "UniProt accession (e.g. P04637) — used directly if provided",
    },
    usage_guide="You need detailed domain architecture for a protein — domain boundaries, family classifications, active sites, binding sites. Can also search InterPro by domain/family keyword when no UniProt accession can be resolved.",
)
def domain_annotate(gene: str = None, uniprot_id: str = None, **kwargs) -> dict:
    """Annotate domains using the InterPro API.

    Three modes, tried in this order:

    1. If ``uniprot_id`` or ``gene`` looks like an InterPro accession
       (``IPR`` followed by at least six digits), that entry is looked up
       directly.
    2. Otherwise ``gene`` is resolved to a UniProt accession via a UniProt
       search (unless ``uniprot_id`` was given) and InterPro is queried for
       every entry matching that protein.
    3. Whenever resolution or the protein query fails, an InterPro keyword
       search on ``gene`` is used as a fallback.

    Args:
        gene: Gene symbol, domain/family keyword, or InterPro accession.
        uniprot_id: UniProt accession (or InterPro accession).
        **kwargs: Accepted for registry compatibility; ignored.

    Returns:
        A dict with ``"summary"`` plus either ``"error"`` or the annotation
        lists (``domains``, ``families``, ``sites``,
        ``homologous_superfamilies``) and their counts.
    """
    # Normalise surrounding whitespace so blank strings count as "not given".
    if isinstance(gene, str):
        gene = gene.strip()
    if isinstance(uniprot_id, str):
        uniprot_id = uniprot_id.strip()

    if not gene and not uniprot_id:
        return {
            "error": "Provide either gene symbol or uniprot_id",
            "summary": "No gene or UniProt ID specified",
        }

    non_human_hints = (
        "helminth", "parasite", "schistosoma", "fasciola", "heligmosomoides",
        "nematode", "trematode", "cestode", "worm", "brugia", "filaria",
    )

    def _looks_non_human(text: str) -> bool:
        """Return True when the query mentions one of the non-human hint words."""
        t = (text or "").lower()
        return any(h in t for h in non_human_hints)

    def _description_text(metadata: dict) -> str:
        """Return up to 200 chars of the first description item, or ''."""
        description = metadata.get("description")
        if not isinstance(description, list) or not description:
            return ""
        first = description[0]
        # Items may be {"text": ...} objects or plain strings.
        text = first.get("text", "") if isinstance(first, dict) else first
        return text[:200] if isinstance(text, str) else ""

    def _entry_name(metadata: dict):
        """Return the entry name; detail responses may nest it in an object."""
        name = metadata.get("name", "")
        if isinstance(name, dict):
            return name.get("name") or name.get("short") or ""
        return name

    def _annotation(entry: dict, locations: list | None = None) -> dict:
        """Build the normalised annotation record for one InterPro result."""
        md = entry.get("metadata") or {}
        return {
            "accession": md.get("accession", ""),
            "name": _entry_name(md),
            "type": md.get("type", ""),
            "source_database": md.get("source_database", ""),
            "description": _description_text(md),
            "locations": locations if locations is not None else [],
        }

    def _split_domains_families(results: list) -> tuple[list, list]:
        """Partition results into (domains, families); other types are dropped."""
        found_domains = []
        found_families = []
        for entry in results:
            annotation = _annotation(entry)
            if annotation["type"] == "domain":
                found_domains.append(annotation)
            elif annotation["type"] == "family":
                found_families.append(annotation)
        return found_domains, found_families

    def _resolve_uniprot(gene_query: str) -> tuple[str, list[str]]:
        """Search UniProt for an accession; return (accession or '', queries tried)."""
        attempts: list[str] = []
        if _looks_non_human(gene_query):
            search_terms = [gene_query, f"{gene_query} AND reviewed:true"]
        else:
            search_terms = [f"{gene_query} AND organism_id:9606", gene_query]

        for term in search_terms:
            if term in attempts:
                continue
            attempts.append(term)
            resp, error = request(
                "GET",
                "https://rest.uniprot.org/uniprotkb/search",
                params={
                    "query": term,
                    "format": "json",
                    "size": "1",
                    "fields": "accession,gene_names",
                },
                timeout=15,
                headers={"Accept": "application/json"},
                raise_for_status=False,
            )
            if error or resp.status_code != 200:
                continue
            try:
                results = resp.json().get("results", [])
            except Exception:
                results = []
            if results:
                accession = results[0].get("primaryAccession", "")
                if accession:
                    return accession, attempts
        return "", attempts

    def _interpro_keyword_search(term: str) -> dict | None:
        """Free-text InterPro search; returns a result dict or None if nothing matched."""
        cleaned = " ".join((term or "").split())
        if not cleaned:
            return None

        endpoints = (
            "https://www.ebi.ac.uk/interpro/api/entry/interpro/",
            "https://www.ebi.ac.uk/interpro/api/entry/all/",
        )
        for endpoint in endpoints:
            resp, error = request(
                "GET",
                endpoint,
                params={"search": cleaned, "page_size": "20"},
                timeout=15,
                headers={"Accept": "application/json"},
                raise_for_status=False,
            )
            if error or resp.status_code != 200:
                continue
            try:
                data = resp.json()
            except Exception:
                continue
            results = data.get("results", [])
            if not results:
                continue

            kw_domains, kw_families = _split_domains_families(results)
            return {
                "summary": (
                    f"InterPro keyword search '{cleaned}': "
                    f"{len(kw_domains)} domains, {len(kw_families)} families (no single UniProt mapping)."
                ),
                "gene": gene,
                "uniprot_id": None,
                "n_domains": len(kw_domains),
                "n_families": len(kw_families),
                "n_sites": 0,
                "domains": kw_domains[:30],
                "families": kw_families[:30],
                "sites": [],
                "homologous_superfamilies": [],
                "total_annotations": len(results),
                "mode": "interpro_keyword_search",
            }
        return None

    # InterPro entry accession mode (e.g. IPR014044) for domain-family lookup.
    interpro_accession = None
    for candidate in (uniprot_id, gene):
        if isinstance(candidate, str) and re.fullmatch(r"IPR\d{6,}", candidate.upper()):
            interpro_accession = candidate.upper()
            break

    if interpro_accession:
        resp, error = request(
            "GET",
            f"https://www.ebi.ac.uk/interpro/api/entry/interpro/{interpro_accession}",
            timeout=15,
            headers={"Accept": "application/json"},
            raise_for_status=False,
        )
        if error or resp.status_code != 200:
            # Fallback through keyword search path
            keyword_result = _interpro_keyword_search(interpro_accession)
            if keyword_result is not None:
                return keyword_result
            return {
                "error": f"InterPro entry lookup failed for {interpro_accession}",
                "summary": f"No InterPro entry found for {interpro_accession}",
            }

        try:
            data = resp.json()
        except Exception:
            return {
                "error": f"Invalid InterPro response for {interpro_accession}",
                "summary": f"Failed to parse InterPro response for {interpro_accession}",
            }

        results = []
        if isinstance(data, dict):
            results = data.get("results") or []
            # A single-entry lookup may return one {"metadata": ...} object
            # rather than a paginated "results" list; accept both shapes.
            if not results and data.get("metadata"):
                results = [data]
        if not results:
            return {
                "error": f"No InterPro entry results for {interpro_accession}",
                "summary": f"No InterPro data for {interpro_accession}",
            }

        domains, families = _split_domains_families(results)
        return {
            "summary": (
                f"InterPro {interpro_accession}: {len(domains)} domains, {len(families)} families."
            ),
            "gene": gene,
            "uniprot_id": None,
            "n_domains": len(domains),
            "n_families": len(families),
            "n_sites": 0,
            "domains": domains,
            "families": families,
            "sites": [],
            "homologous_superfamilies": [],
            "total_annotations": len(results),
            "mode": "interpro_accession_lookup",
        }

    # Resolve gene to UniProt ID if needed
    attempts: list[str] = []
    if not uniprot_id and gene:
        uniprot_id, attempts = _resolve_uniprot(gene)

    if not uniprot_id:
        keyword_result = _interpro_keyword_search(gene)
        if keyword_result is not None:
            return keyword_result
        attempted = "; ".join(attempts[:4])
        return {
            "error": f"Could not resolve gene {gene} to UniProt ID",
            "summary": f"Gene {gene} not found in UniProt search",
            "resolution_attempts": attempts,
            "attempted_query_preview": attempted,
        }

    # Query InterPro for protein domain annotations. Ask for a large page so
    # well-annotated proteins are not cut off at the API's default page size.
    resp, error = request(
        "GET",
        f"https://www.ebi.ac.uk/interpro/api/entry/all/protein/uniprot/{uniprot_id}",
        params={"page_size": "200"},
        timeout=15,
        headers={"Accept": "application/json"},
        raise_for_status=False,
    )
    if error:
        # Final fallback: keyword search if a gene/domain term is available.
        if gene:
            keyword_result = _interpro_keyword_search(gene)
            if keyword_result is not None:
                return keyword_result
        return {"error": f"InterPro API error: {error}", "summary": f"Failed to query InterPro for {uniprot_id}"}

    if resp.status_code == 204:
        # No content: treated as a protein with zero annotations.
        data = {"results": []}
    elif resp.status_code != 200:
        if gene:
            keyword_result = _interpro_keyword_search(gene)
            if keyword_result is not None:
                return keyword_result
        return {
            "error": f"InterPro query failed for {uniprot_id} (HTTP {resp.status_code})",
            "summary": f"No InterPro data for {uniprot_id}",
        }
    else:
        try:
            data = resp.json()
        except Exception:
            return {"error": f"Invalid InterPro response for {uniprot_id}", "summary": f"Failed to parse InterPro response for {uniprot_id}"}

    # Parse InterPro results
    entries = data.get("results", [])

    domains = []
    families = []
    sites = []
    homologous_superfamilies = []
    site_types = ("active_site", "binding_site", "conserved_site", "ptm")

    for entry in entries:
        # Collect every fragment's start/end as the entry's positions.
        locations = [
            {"start": fragment.get("start"), "end": fragment.get("end")}
            for protein in entry.get("proteins") or []
            for loc_group in protein.get("entry_protein_locations") or []
            for fragment in loc_group.get("fragments") or []
        ]
        annotation = _annotation(entry, locations)
        entry_type = annotation["type"]

        if entry_type == "domain":
            domains.append(annotation)
        elif entry_type == "family":
            families.append(annotation)
        elif entry_type in site_types:
            sites.append(annotation)
        elif entry_type == "homologous_superfamily":
            homologous_superfamilies.append(annotation)

    # Build summary
    gene_label = gene or uniprot_id
    domain_strs = []
    for d in domains:
        loc_str = ""
        if d["locations"]:
            first_loc = d["locations"][0]
            loc_str = f" ({first_loc['start']}-{first_loc['end']})"
        domain_strs.append(f"{d['name']}{loc_str}")

    summary = (
        f"{gene_label}: {len(domains)} domain{'s' if len(domains) != 1 else ''}"
    )
    if domain_strs:
        summary += f" — {', '.join(domain_strs[:6])}"

    return {
        "summary": summary,
        "gene": gene,
        "uniprot_id": uniprot_id,
        "n_domains": len(domains),
        "n_families": len(families),
        "n_sites": len(sites),
        "domains": domains,
        "families": families,
        "sites": sites,
        "homologous_superfamilies": homologous_superfamilies,
        "total_annotations": len(entries),
    }
|