celltype-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/tools/parity.py
ADDED
|
@@ -0,0 +1,649 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Clean-room parity tools built from public/open APIs.
|
|
3
|
+
|
|
4
|
+
Adds practical connectors and utilities inspired by commonly requested platform
|
|
5
|
+
capabilities while staying implementation-original inside ct.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from datetime import date
|
|
11
|
+
import re
|
|
12
|
+
import xml.etree.ElementTree as ET
|
|
13
|
+
|
|
14
|
+
from ct.tools import registry
|
|
15
|
+
from ct.tools.http_client import request, request_json
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _clip(value, n: int = 240) -> str:
|
|
19
|
+
text = str(value or "").replace("\n", " ").strip()
|
|
20
|
+
if len(text) <= n:
|
|
21
|
+
return text
|
|
22
|
+
return text[: n - 3] + "..."
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _coerce_int(value, default: int, minimum: int = 1, maximum: int = 100) -> int:
|
|
26
|
+
try:
|
|
27
|
+
out = int(value)
|
|
28
|
+
except Exception:
|
|
29
|
+
out = default
|
|
30
|
+
return min(max(out, minimum), maximum)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _flatten_hits(payload, key: str = "hits") -> list[dict]:
|
|
34
|
+
hits = payload.get(key, []) if isinstance(payload, dict) else []
|
|
35
|
+
if isinstance(hits, dict):
|
|
36
|
+
# my* APIs sometimes return dict hits keyed by ids
|
|
37
|
+
hits = list(hits.values())
|
|
38
|
+
return [h for h in hits if isinstance(h, dict)]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
_MYGENE_SPECIES_MAP = {
|
|
42
|
+
"human": "human",
|
|
43
|
+
"mouse": "mouse",
|
|
44
|
+
"rat": "rat",
|
|
45
|
+
"zebrafish": "zebrafish",
|
|
46
|
+
"drosophila": "fly",
|
|
47
|
+
"yeast": "yeast",
|
|
48
|
+
"schistosoma mansoni": "6183",
|
|
49
|
+
"fasciola hepatica": "6192",
|
|
50
|
+
"heligmosomoides polygyrus": "6337",
|
|
51
|
+
"nippostrongylus brasiliensis": "27835",
|
|
52
|
+
"trichuris muris": "70415",
|
|
53
|
+
"brugia malayi": "6279",
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _normalize_mygene_species(species: str) -> str:
|
|
58
|
+
s = (species or "human").strip().lower()
|
|
59
|
+
if not s:
|
|
60
|
+
return "human"
|
|
61
|
+
if s.isdigit():
|
|
62
|
+
return s
|
|
63
|
+
if s in _MYGENE_SPECIES_MAP:
|
|
64
|
+
return _MYGENE_SPECIES_MAP[s]
|
|
65
|
+
|
|
66
|
+
# Extract likely binomial species from noisy planner text.
|
|
67
|
+
m = re.search(r"([A-Za-z][a-z]+)\s+([a-z][a-z]+)", s)
|
|
68
|
+
if m:
|
|
69
|
+
candidate = f"{m.group(1)} {m.group(2)}".lower()
|
|
70
|
+
if candidate in _MYGENE_SPECIES_MAP:
|
|
71
|
+
return _MYGENE_SPECIES_MAP[candidate]
|
|
72
|
+
|
|
73
|
+
# Fallback to original value; caller can still get API errors surfaced.
|
|
74
|
+
return species
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@registry.register(
    name="data_api.mygene_lookup",
    description="Lookup genes via MyGene.info",
    category="data_api",
    parameters={
        "query": "Gene symbol/name/identifier (e.g., TP53, ENSG00000141510)",
        "species": "Species filter (default human)",
        "size": "Maximum hits (default 10)",
    },
    usage_guide="Use for rapid gene identifier normalization and annotation via MyGene.info.",
)
def mygene_lookup(query: str, species: str = "human", size: int = 10, **kwargs) -> dict:
    """Query MyGene.info for gene-level metadata.

    Args:
        query: Gene symbol, name, or identifier.
        species: Species hint, normalized via ``_normalize_mygene_species``.
        size: Maximum number of hits (clamped to 1-100).

    Returns:
        Dict with a summary, normalized hit rows, count, and source, or an
        error payload (``{"error": ...}``) on missing input / API failure.
    """
    q = (query or "").strip()
    if not q:
        return {"summary": "query is required.", "error": "missing_query"}

    species_norm = _normalize_mygene_species(species)

    data, error = request_json(
        "GET",
        "https://mygene.info/v3/query",
        params={
            "q": q,
            "species": species_norm,
            "size": _coerce_int(size, 10),
            "fields": "symbol,name,entrezgene,ensembl.gene,taxid,type_of_gene",
        },
        timeout=20,
        retries=2,
    )
    if error:
        return {"summary": f"MyGene lookup failed: {error}", "error": "api_error"}

    rows = []
    for hit in _flatten_hits(data):
        # "ensembl" may be a single dict or a list of dicts; previously a list
        # whose first element was not a dict raised AttributeError. Guard it.
        ens = hit.get("ensembl")
        ensembl_gene = None
        if isinstance(ens, list) and ens and isinstance(ens[0], dict):
            ensembl_gene = ens[0].get("gene")
        elif isinstance(ens, dict):
            ensembl_gene = ens.get("gene")
        rows.append(
            {
                "symbol": hit.get("symbol"),
                "name": hit.get("name"),
                "entrezgene": hit.get("entrezgene"),
                "ensembl_gene": ensembl_gene,
                "taxid": hit.get("taxid"),
                "type_of_gene": hit.get("type_of_gene"),
                "score": hit.get("_score"),
            }
        )

    return {
        "summary": f"MyGene: found {len(rows)} hits for '{q}'.",
        "query": q,
        "species": species_norm,
        "requested_species": species,
        "hits": rows,
        "count": len(rows),
        "source": "mygene.info",
    }
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
@registry.register(
    name="data_api.mydisease_lookup",
    description="Lookup diseases via MyDisease.info",
    category="data_api",
    parameters={
        "query": "Disease keyword/identifier",
        "size": "Maximum hits (default 10)",
    },
    usage_guide="Use for disease identifier mapping and cross-source disease metadata.",
)
def mydisease_lookup(query: str, size: int = 10, **kwargs) -> dict:
    """Query MyDisease.info for disease metadata.

    Returns a summary plus rows with name, DOID, MONDO id, score, and raw id,
    or an error payload on missing input / API failure.
    """
    term = (query or "").strip()
    if not term:
        return {"summary": "query is required.", "error": "missing_query"}

    payload, err = request_json(
        "GET",
        "https://mydisease.info/v1/query",
        params={"q": term, "size": _coerce_int(size, 10)},
        timeout=20,
        retries=2,
    )
    if err:
        return {"summary": f"MyDisease lookup failed: {err}", "error": "api_error"}

    records = []
    for hit in _flatten_hits(payload):
        name = hit.get("name")
        doid = None
        # Prefer the disease_ontology block for DOID and as a name fallback.
        do_block = hit.get("disease_ontology")
        if isinstance(do_block, dict):
            doid = do_block.get("doid") or do_block.get("id")
            name = name or do_block.get("name")
        records.append(
            {
                "name": name,
                "doid": doid,
                "mondo": hit.get("mondo"),
                "score": hit.get("_score"),
                "id": hit.get("_id"),
            }
        )

    return {
        "summary": f"MyDisease: found {len(records)} hits for '{term}'.",
        "query": term,
        "hits": records,
        "count": len(records),
        "source": "mydisease.info",
    }
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
@registry.register(
    name="data_api.myvariant_lookup",
    description="Lookup variants via MyVariant.info",
    category="data_api",
    parameters={
        "query": "Variant keyword/identifier (e.g., rs121913529, chr17:g.7673803G>A)",
        "size": "Maximum hits (default 10)",
    },
    usage_guide="Use for quick variant annotation triage from aggregated public sources.",
)
def myvariant_lookup(query: str, size: int = 10, **kwargs) -> dict:
    """Query MyVariant.info for variant annotations.

    Returns a summary plus rows with rsid, ClinVar HGVS/significance, and VCF
    gene/position, or an error payload on missing input / API failure.
    """
    term = (query or "").strip()
    if not term:
        return {"summary": "query is required.", "error": "missing_query"}

    payload, err = request_json(
        "GET",
        "https://myvariant.info/v1/query",
        params={
            "q": term,
            "size": _coerce_int(size, 10),
            "fields": "dbsnp.rsid,clinvar.hgvs,clinvar.clinsig,vcf.gene,vcf.position",
        },
        timeout=20,
        retries=2,
    )
    if err:
        return {"summary": f"MyVariant lookup failed: {err}", "error": "api_error"}

    records = []
    for hit in _flatten_hits(payload):
        # Sub-blocks may be absent or malformed; normalize to dicts first.
        dbsnp = hit.get("dbsnp")
        clinvar = hit.get("clinvar")
        vcf = hit.get("vcf")
        if not isinstance(clinvar, dict):
            clinvar = {}
        if not isinstance(vcf, dict):
            vcf = {}
        records.append(
            {
                "id": hit.get("_id"),
                "rsid": dbsnp.get("rsid") if isinstance(dbsnp, dict) else None,
                "hgvs": clinvar.get("hgvs"),
                "clinical_significance": clinvar.get("clinsig"),
                "gene": vcf.get("gene"),
                "position": vcf.get("position"),
                "score": hit.get("_score"),
            }
        )

    return {
        "summary": f"MyVariant: found {len(records)} hits for '{term}'.",
        "query": term,
        "hits": records,
        "count": len(records),
        "source": "myvariant.info",
    }
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
@registry.register(
    name="data_api.mytaxon_lookup",
    description="Lookup taxonomy records via MyTaxon.info",
    category="data_api",
    parameters={
        "query": "Species/taxon keyword or taxonomy ID",
        "size": "Maximum hits (default 10)",
    },
    usage_guide="Use for organism/taxonomy normalization in multi-species analyses.",
)
def mytaxon_lookup(query: str, size: int = 10, **kwargs) -> dict:
    """Query MyTaxon.info for taxonomy metadata.

    Returns a summary plus rows with taxid, scientific/common name, rank, and
    parent taxid, or an error payload on missing input / API failure.
    """
    term = (query or "").strip()
    if not term:
        return {"summary": "query is required.", "error": "missing_query"}

    payload, err = request_json(
        "GET",
        "https://mytaxon.info/v1/query",
        params={"q": term, "size": _coerce_int(size, 10)},
        timeout=20,
        retries=2,
    )
    if err:
        return {"summary": f"MyTaxon lookup failed: {err}", "error": "api_error"}

    records = [
        {
            "taxid": hit.get("_id") or hit.get("taxid"),
            "scientific_name": hit.get("scientific_name"),
            "common_name": hit.get("common_name"),
            "rank": hit.get("rank"),
            "parent_taxid": hit.get("parent_taxid"),
            "score": hit.get("_score"),
        }
        for hit in _flatten_hits(payload)
    ]

    return {
        "summary": f"MyTaxon: found {len(records)} hits for '{term}'.",
        "query": term,
        "hits": records,
        "count": len(records),
        "source": "mytaxon.info",
    }
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
@registry.register(
    name="data_api.mychem_lookup",
    description="Lookup compounds/drugs via MyChem.info",
    category="data_api",
    parameters={
        "query": "Compound/drug keyword or identifier",
        "size": "Maximum hits (default 10)",
    },
    usage_guide="Use for rapid integrated compound metadata lookup across public sources.",
)
def mychem_lookup(query: str, size: int = 10, **kwargs) -> dict:
    """Query MyChem.info for compound metadata.

    Args:
        query: Compound/drug keyword or identifier.
        size: Maximum number of hits (clamped to 1-100).

    Returns:
        Dict with a summary, hit rows (id, name, ChEMBL/DrugBank ids, InChIKey,
        SMILES, score), count, and source, or an error payload.
    """
    q = (query or "").strip()
    if not q:
        return {"summary": "query is required.", "error": "missing_query"}

    data, error = request_json(
        "GET",
        "https://mychem.info/v1/query",
        params={"q": q, "size": _coerce_int(size, 10)},
        timeout=20,
        retries=2,
    )
    if error:
        return {"summary": f"MyChem lookup failed: {error}", "error": "api_error"}

    rows = []
    for hit in _flatten_hits(data):
        chembl = hit.get("chembl") if isinstance(hit.get("chembl"), dict) else {}
        drugbank = hit.get("drugbank") if isinstance(hit.get("drugbank"), dict) else {}
        # BUG FIX: the old one-liner parsed as
        #   (hit.smiles or canonical_smiles) if isinstance(structures, dict) else None
        # so a missing/non-dict molecule_structures block discarded the top-level
        # SMILES even when present. Resolve stepwise instead.
        smiles = hit.get("smiles")
        if not smiles:
            structures = chembl.get("molecule_structures")
            if isinstance(structures, dict):
                smiles = structures.get("canonical_smiles")
        rows.append(
            {
                "id": hit.get("_id"),
                "name": hit.get("name") or hit.get("pref_name") or chembl.get("pref_name"),
                "chembl_id": chembl.get("molecule_chembl_id"),
                "drugbank_id": drugbank.get("id"),
                "inchi_key": hit.get("inchi_key"),
                "smiles": smiles,
                "score": hit.get("_score"),
            }
        )

    return {
        "summary": f"MyChem: found {len(rows)} hits for '{q}'.",
        "query": q,
        "hits": rows,
        "count": len(rows),
        "source": "mychem.info",
    }
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
@registry.register(
    name="data_api.pdbe_search",
    description="Search PDBe entries by keyword",
    category="data_api",
    parameters={
        "query": "PDBe keyword query (protein, ligand, organism, etc.)",
        "size": "Maximum hits (default 10)",
    },
    usage_guide="Use when you need PDBe-centric structure records and metadata.",
)
def pdbe_search(query: str, size: int = 10, **kwargs) -> dict:
    """Search PDBe Solr endpoint for structure entries.

    Returns a summary plus entries with PDB id, title, experimental method,
    resolution, and organism, or an error payload on missing input / API failure.
    """
    term = (query or "").strip()
    if not term:
        return {"summary": "query is required.", "error": "missing_query"}

    payload, err = request_json(
        "GET",
        "https://www.ebi.ac.uk/pdbe/search/pdb/select",
        params={
            "q": term,
            "wt": "json",
            "rows": _coerce_int(size, 10),
            "fl": "pdb_id,title,experimental_method,resolution,organism_scientific_name",
        },
        timeout=20,
        retries=2,
    )
    if err:
        return {"summary": f"PDBe search failed: {err}", "error": "api_error"}

    # Solr responses nest documents under response.docs.
    docs = []
    if isinstance(payload, dict):
        docs = (payload.get("response") or {}).get("docs") or []
    entries = [
        {
            "pdb_id": doc.get("pdb_id"),
            "title": doc.get("title"),
            "experimental_method": doc.get("experimental_method"),
            "resolution": doc.get("resolution"),
            "organism": doc.get("organism_scientific_name"),
        }
        for doc in docs
        if isinstance(doc, dict)
    ]

    return {
        "summary": f"PDBe: found {len(entries)} entries for '{term}'.",
        "query": term,
        "entries": entries,
        "count": len(entries),
        "source": "pdbe",
    }
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
@registry.register(
    name="data_api.reactome_pathway_search",
    description="Search Reactome pathways by keyword",
    category="data_api",
    parameters={
        "query": "Pathway or gene keyword",
        "size": "Maximum hits (default 10)",
    },
    usage_guide="Use to identify curated Reactome pathways related to a query.",
)
def reactome_pathway_search(query: str, size: int = 10, **kwargs) -> dict:
    """Search Reactome content service for pathways.

    Returns a summary plus pathway rows (stable id, name, species, type, and a
    detail-page URL), or an error payload on missing input / API failure.
    """
    term = (query or "").strip()
    if not term:
        return {"summary": "query is required.", "error": "missing_query"}

    payload, err = request_json(
        "GET",
        "https://reactome.org/ContentService/search/query",
        params={"query": term, "types": "Pathway", "cluster": "true"},
        timeout=20,
        retries=2,
    )
    if err:
        return {"summary": f"Reactome search failed: {err}", "error": "api_error"}

    # The endpoint may return a bare list or a dict keyed by results/entries.
    if isinstance(payload, list):
        raw = payload
    elif isinstance(payload, dict):
        raw = payload.get("results") or payload.get("entries") or []
    else:
        raw = []

    pathways = []
    for item in raw[: _coerce_int(size, 10)]:
        if not isinstance(item, dict):
            continue
        st_id = item.get("stId")
        pathways.append(
            {
                "st_id": st_id or item.get("id"),
                "name": item.get("name") or item.get("displayName"),
                "species": item.get("species") or item.get("speciesName"),
                "type": item.get("type"),
                "url": f"https://reactome.org/content/detail/{st_id}" if st_id else None,
            }
        )

    return {
        "summary": f"Reactome: found {len(pathways)} pathway hits for '{term}'.",
        "query": term,
        "pathways": pathways,
        "count": len(pathways),
        "source": "reactome",
    }
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def _europepmc_preprint_rows(q: str, limit: int) -> list[dict]:
    """Fetch preprint rows (SRC:PPR) from the Europe PMC REST search API; [] on failure."""
    data, error = request_json(
        "GET",
        "https://www.ebi.ac.uk/europepmc/webservices/rest/search",
        params={
            "query": f"({q}) AND SRC:PPR",
            "format": "json",
            "pageSize": limit,
        },
        timeout=20,
        retries=2,
    )
    if error is not None or not isinstance(data, dict):
        return []
    rows = []
    for item in ((data.get("resultList") or {}).get("result")) or []:
        if not isinstance(item, dict):
            continue
        # Prefer the top-level title; fall back to the book/report block.
        # (The old one-line ternary here was precedence-ambiguous.)
        title = item.get("title")
        book = item.get("bookOrReportDetails")
        if not title and isinstance(book, dict):
            title = book.get("title")
        rows.append(
            {
                "source": "europepmc",
                "id": item.get("id") or item.get("pmid") or item.get("doi"),
                "title": _clip(title, 220),
                "authors": _clip(item.get("authorString"), 180),
                "journal": item.get("journalTitle") or item.get("pubType"),
                "year": item.get("pubYear"),
                "doi": item.get("doi"),
                "url": item.get("fullTextUrl") or item.get("pmcid") or item.get("doi"),
            }
        )
    return rows


def _arxiv_preprint_rows(q: str, limit: int) -> list[dict]:
    """Fetch preprint rows from the arXiv Atom API; best-effort, [] on failure."""
    resp, err = request(
        "GET",
        "https://export.arxiv.org/api/query",
        params={
            "search_query": f"all:{q}",
            "start": 0,
            "max_results": limit,
        },
        timeout=20,
        retries=2,
        raise_for_status=False,
    )
    if err is not None or resp is None or int(resp.status_code) != 200:
        return []
    rows = []
    try:
        root = ET.fromstring(resp.text)
        ns = {"a": "http://www.w3.org/2005/Atom"}
        for entry in root.findall("a:entry", ns):
            title = (entry.findtext("a:title", default="", namespaces=ns) or "").strip()
            published = entry.findtext("a:published", default="", namespaces=ns) or ""
            authors = [
                author.findtext("a:name", default="", namespaces=ns)
                for author in entry.findall("a:author", ns)
            ]
            arxiv_id = (entry.findtext("a:id", default="", namespaces=ns) or "").strip()
            rows.append(
                {
                    "source": "arxiv",
                    "id": arxiv_id,
                    "title": _clip(title, 220),
                    "authors": _clip(", ".join([a for a in authors if a]), 180),
                    "journal": "arXiv",
                    "year": published[:4] if published else None,
                    "doi": None,
                    "url": arxiv_id,
                }
            )
    except Exception:
        # Malformed feed: keep whatever rows parsed before the failure.
        pass
    return rows


@registry.register(
    name="literature.preprint_search",
    description="Search preprints across Europe PMC (bioRxiv/medRxiv) and arXiv",
    category="literature",
    parameters={
        "query": "Search query",
        "source": "Data source: europepmc, arxiv, or both (default both)",
        "max_results": "Maximum results (default 10)",
    },
    usage_guide="Use when you need latest preprint evidence that may not yet appear in PubMed.",
)
def preprint_search(query: str, source: str = "both", max_results: int = 10, **kwargs) -> dict:
    """Search preprints using EuropePMC and/or arXiv.

    Args:
        query: Free-text search query.
        source: "europepmc", "arxiv", or "both" (default).
        max_results: Maximum articles returned (clamped to 1-50).

    Returns:
        Dict with summary, normalized article rows (deduplicated by
        title/url, Europe PMC first), count, and the normalized source,
        or an error payload for missing/invalid input.
    """
    q = (query or "").strip()
    if not q:
        return {"summary": "query is required.", "error": "missing_query"}

    source_norm = str(source or "both").strip().lower()
    if source_norm not in {"both", "europepmc", "arxiv"}:
        return {"summary": "Invalid source. Use europepmc, arxiv, or both.", "error": "invalid_source"}

    limit = _coerce_int(max_results, 10, minimum=1, maximum=50)
    rows: list[dict] = []
    if source_norm in {"both", "europepmc"}:
        rows.extend(_europepmc_preprint_rows(q, limit))
    if source_norm in {"both", "arxiv"}:
        rows.extend(_arxiv_preprint_rows(q, limit))

    # Deduplicate by title/url, preserving first occurrence, then cap to limit.
    seen = set()
    deduped = []
    for row in rows:
        key = (row.get("title"), row.get("url"))
        if key in seen:
            continue
        seen.add(key)
        deduped.append(row)
    deduped = deduped[:limit]

    return {
        "summary": f"Found {len(deduped)} preprints for '{q}'.",
        "query": q,
        "source": source_norm,
        "articles": deduped,
        "count": len(deduped),
    }
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
def _count_aromatic_rings(mol) -> int:
|
|
584
|
+
try:
|
|
585
|
+
return sum(1 for ring in mol.GetRingInfo().AtomRings() if any(mol.GetAtomWithIdx(i).GetIsAromatic() for i in ring))
|
|
586
|
+
except Exception:
|
|
587
|
+
return 0
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
@registry.register(
    name="chemistry.sa_score",
    description="Estimate synthetic accessibility score (1 easy – 10 hard)",
    category="chemistry",
    parameters={"smiles": "Input SMILES"},
    usage_guide="Use during hit triage to reject compounds likely to be difficult to synthesize.",
)
def sa_score(smiles: str, **kwargs) -> dict:
    """Heuristic synthetic accessibility estimate based on molecular complexity."""
    # RDKit is an optional dependency; degrade gracefully when absent.
    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, Lipinski
    except Exception:
        return {
            "summary": "RDKit is required for chemistry.sa_score.",
            "error": "missing_dependency",
        }

    smi = (smiles or "").strip()
    if not smi:
        return {"summary": "smiles is required.", "error": "missing_smiles"}

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return {"summary": f"Invalid SMILES: {smi}", "error": "invalid_smiles"}

    # Descriptor inputs for the complexity model.
    heavy = Descriptors.HeavyAtomCount(mol)
    rings = Lipinski.RingCount(mol)
    aromatic_rings = _count_aromatic_rings(mol)
    sp3 = Lipinski.FractionCSP3(mol)
    rot_bonds = Lipinski.NumRotatableBonds(mol)
    stereo = len(Chem.FindMolChiralCenters(mol, includeUnassigned=True))

    # Lightweight complexity model mapped into [1,10]: each capped term adds
    # complexity; a low sp3 fraction (flat molecules) adds a small penalty.
    complexity = sum(
        (
            min(heavy / 20.0, 2.5),
            min(rings * 0.35, 2.0),
            min(aromatic_rings * 0.25, 1.5),
            min(rot_bonds * 0.12, 1.2),
            min(stereo * 0.3, 1.8),
            max(0.0, (0.4 - float(sp3)) * 2.0),
        )
    )

    score = max(1.0, min(10.0, 1.5 + complexity))
    if score <= 3.5:
        band = "easy"
    elif score <= 6.0:
        band = "moderate"
    else:
        band = "hard"

    return {
        "summary": f"Estimated synthetic accessibility score: {score:.2f}/10 ({band}).",
        "smiles": smi,
        "sa_score": round(score, 2),
        "difficulty": band,
        "features": {
            "heavy_atoms": int(heavy),
            "ring_count": int(rings),
            "aromatic_rings": int(aromatic_rings),
            "fraction_csp3": round(float(sp3), 3),
            "rotatable_bonds": int(rot_bonds),
            "stereocenters": int(stereo),
        },
        "note": "Heuristic estimate for prioritization; not a replacement for route planning.",
    }
|