celltype-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/tools/literature.py
ADDED
|
@@ -0,0 +1,743 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Literature and database tools: PubMed, OpenAlex, ChEMBL API queries.
|
|
3
|
+
|
|
4
|
+
These are REST API wrappers -- no local data required.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re as _re
|
|
8
|
+
|
|
9
|
+
from ct.tools import registry
|
|
10
|
+
from ct.tools.http_client import request, request_json
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _normalize_pubmed_query(query: str) -> str:
|
|
14
|
+
"""Normalize a PubMed query for NCBI E-utilities.
|
|
15
|
+
|
|
16
|
+
- Uppercase standalone boolean operators (and→AND, or→OR, not→NOT)
|
|
17
|
+
- Preserve text inside quoted phrases
|
|
18
|
+
- Normalize whitespace
|
|
19
|
+
"""
|
|
20
|
+
# Split on quoted phrases to preserve them
|
|
21
|
+
parts = _re.split(r'(".*?")', query)
|
|
22
|
+
normalized = []
|
|
23
|
+
for i, part in enumerate(parts):
|
|
24
|
+
if part.startswith('"'):
|
|
25
|
+
# Quoted phrase — keep as-is
|
|
26
|
+
normalized.append(part)
|
|
27
|
+
else:
|
|
28
|
+
# Uppercase standalone boolean operators
|
|
29
|
+
part = _re.sub(r'\b(and)\b', 'AND', part, flags=_re.IGNORECASE)
|
|
30
|
+
part = _re.sub(r'\b(or)\b', 'OR', part, flags=_re.IGNORECASE)
|
|
31
|
+
part = _re.sub(r'\b(not)\b', 'NOT', part, flags=_re.IGNORECASE)
|
|
32
|
+
normalized.append(part)
|
|
33
|
+
result = "".join(normalized)
|
|
34
|
+
# Normalize whitespace
|
|
35
|
+
return " ".join(result.split())
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _simplify_query(query: str) -> list[str]:
|
|
39
|
+
"""Generate progressively simpler queries by dropping terms.
|
|
40
|
+
|
|
41
|
+
PubMed ANDs all terms by default, so long queries (8+ terms) often return
|
|
42
|
+
zero results. We try shorter versions as fallbacks.
|
|
43
|
+
"""
|
|
44
|
+
# Remove parenthesized groups and quoted phrases for counting
|
|
45
|
+
clean = _re.sub(r'\([^)]*\)', '', query)
|
|
46
|
+
clean = _re.sub(r'"[^"]*"', '', clean)
|
|
47
|
+
# Split on whitespace, ignoring boolean operators
|
|
48
|
+
words = [w for w in query.split() if w.upper() not in ("AND", "OR", "NOT")]
|
|
49
|
+
|
|
50
|
+
if len(words) <= 4:
|
|
51
|
+
return [] # Already short enough
|
|
52
|
+
|
|
53
|
+
# Try keeping just the most distinctive terms (drop common qualifiers)
|
|
54
|
+
# Strategy: take first N words from the original query
|
|
55
|
+
shorter = []
|
|
56
|
+
if len(words) > 6:
|
|
57
|
+
shorter.append(" ".join(words[:5]))
|
|
58
|
+
if len(words) > 4:
|
|
59
|
+
shorter.append(" ".join(words[:3]))
|
|
60
|
+
return shorter
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@registry.register(
    name="literature.pubmed_search",
    description="Search PubMed for publications via NCBI E-utilities API",
    category="literature",
    parameters={
        "query": "Search query (e.g. 'molecular glue degrader CRBN')",
        "max_results": "Maximum number of results (default 20)",
    },
    usage_guide="You need recent publications on a target, compound, or mechanism. Use to support or challenge computational findings with published evidence.",
)
def pubmed_search(query: str, max_results: int = 20, **kwargs) -> dict:
    """Search PubMed using NCBI E-utilities (ESearch + ESummary).

    Two-step flow: ESearch resolves the query to a list of PMIDs, then
    ESummary fetches metadata for those PMIDs. When the initial query
    returns no hits, progressively simplified versions of the query are
    retried (see _simplify_query).

    Args:
        query: Free-text PubMed query; boolean operators are uppercased
            via _normalize_pubmed_query before submission.
        max_results: Maximum number of PMIDs requested (default 20).
        **kwargs: Ignored; accepted for tool-registry call compatibility.

    Returns:
        On success: dict with "summary", "query" (the query actually used),
        "original_query", "total_count", and "articles" (each entry has
        pmid, title, first_author, journal, pub_date, doi).
        On failure: dict with "error" and "summary".
    """
    # Fail fast with a clear message when httpx is unavailable.
    try:
        import httpx
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}
    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

    # Step 1: ESearch to get PMIDs
    search_url = f"{base}/esearch.fcgi"
    normalized = _normalize_pubmed_query(query)
    params = {
        "db": "pubmed",
        "term": normalized,
        "retmax": max_results,
        "retmode": "json",
        "sort": "relevance",
    }

    search_data, error = request_json(
        "GET",
        search_url,
        params=params,
        timeout=30,
        retries=2,
    )
    if error:
        return {"error": f"PubMed search failed: {error}", "summary": f"PubMed search failed: {error}"}
    result = search_data.get("esearchresult", {})
    pmids = result.get("idlist", [])
    total_count = int(result.get("count", 0))

    # If no results with a long query, retry with progressively simpler versions
    used_query = query
    if not pmids:
        for simpler in _simplify_query(query):
            params["term"] = _normalize_pubmed_query(simpler)
            search_data, fallback_error = request_json(
                "GET",
                search_url,
                params=params,
                timeout=30,
                retries=2,
            )
            if fallback_error:
                # Best-effort fallback: a failed retry just moves on to the
                # next, even simpler candidate.
                continue
            result = search_data.get("esearchresult", {})
            pmids = result.get("idlist", [])
            total_count = int(result.get("count", 0))
            if pmids:
                used_query = simpler
                break

    if not pmids:
        return {"summary": f"No results for '{query}'", "total_count": 0, "articles": []}

    # Step 2: ESummary for article details
    summary_url = f"{base}/esummary.fcgi"
    params = {
        "db": "pubmed",
        "id": ",".join(pmids),
        "retmode": "json",
    }

    summary_data, error = request_json(
        "GET",
        summary_url,
        params=params,
        timeout=30,
        retries=2,
    )
    if error:
        return {"error": f"PubMed summary failed: {error}", "summary": f"PubMed summary failed: {error}"}
    articles = []
    for pmid in pmids:
        info = summary_data.get("result", {}).get(pmid, {})
        # ESummary's "result" dict carries a bookkeeping "uids" key alongside
        # the per-PMID entries; skip it and any missing entries.
        if not info or pmid == "uids":
            continue

        authors = info.get("authors", [])
        first_author = authors[0].get("name", "") if authors else ""

        articles.append({
            "pmid": pmid,
            "title": info.get("title", ""),
            "first_author": first_author,
            "journal": info.get("source", ""),
            "pub_date": info.get("pubdate", ""),
            # The DOI is one of several identifiers in "articleids";
            # pick the doi-typed entry, defaulting to empty string.
            "doi": next((a.get("value", "") for a in info.get("articleids", [])
                         if a.get("idtype") == "doi"), ""),
        })

    summary = f"PubMed search '{used_query}': {total_count} total, showing {len(articles)}"
    if used_query != query:
        summary += f" (simplified from: '{query}')"

    return {
        "summary": summary,
        "query": used_query,
        "original_query": query,
        "total_count": total_count,
        "articles": articles,
    }
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
@registry.register(
    name="literature.chembl_query",
    description="Query ChEMBL for compound bioactivity, targets, and SAR data",
    category="literature",
    parameters={
        "query": "Compound name, SMILES, or ChEMBL ID",
        "query_type": "'molecule', 'target', 'activity', or 'similarity'",
        "max_results": "Maximum results (default 20)",
    },
    usage_guide="You want to look up known bioactivity data, find related compounds, or check if a target has known ligands. Use ChEMBL for chemical and pharmacological context.",
)
def chembl_query(query: str, query_type: str = "molecule", max_results: int = 20, **kwargs) -> dict:
    """Query ChEMBL database for compound/target/activity data.

    Args:
        query: Compound name, SMILES, or ChEMBL ID.
        query_type: One of 'molecule', 'target', 'activity', 'similarity'
            (common aliases like 'compound', 'drug', 'gene' are accepted).
        max_results: Maximum number of records to return (default 20).
        **kwargs: Ignored; accepted for tool-registry call compatibility.

    Returns:
        A dict with a "summary" key plus type-specific result lists
        ("molecules", "targets", "activities", or "hits"), or a dict with
        "error"/"summary" on failure.
    """
    # Fail fast with a clear message when httpx is unavailable.
    try:
        import httpx
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}
    query = str(query or "").strip()
    query_type_raw = str(query_type or "molecule").strip().lower()
    # Map common aliases onto the four canonical query types. (A later
    # redundant "compound" -> "molecule" check was removed: this dict
    # already performs that mapping, so the old branch was unreachable.)
    query_type_aliases = {
        "compound": "molecule",
        "drug": "molecule",
        "molecules": "molecule",
        "compounds": "molecule",
        "protein": "target",
        "gene": "target",
        "bioactivity": "activity",
        "activities": "activity",
        "similar": "similarity",
    }
    query_type = query_type_aliases.get(query_type_raw, query_type_raw)
    base = "https://www.ebi.ac.uk/chembl/api/data"
    headers = {"Accept": "application/json"}

    def _activity_record(act: dict) -> dict:
        # Flatten one ChEMBL activity payload into the fields we report.
        # (Deduplicates the dict construction previously repeated in both
        # the molecule-id and target-id activity branches.)
        return {
            "molecule_chembl_id": act.get("molecule_chembl_id", ""),
            "molecule_name": act.get("molecule_pref_name", ""),
            "target_chembl_id": act.get("target_chembl_id", ""),
            "target_name": act.get("target_pref_name", ""),
            "standard_type": act.get("standard_type", ""),
            "standard_value": act.get("standard_value"),
            "standard_units": act.get("standard_units", ""),
            "pchembl_value": act.get("pchembl_value"),
            "assay_type": act.get("assay_type", ""),
            "assay_description": (act.get("assay_description", "") or "")[:200],
        }

    try:
        if query_type == "molecule":
            url = f"{base}/molecule/search.json"
            params = {"q": query, "limit": max_results}
            data, error = request_json(
                "GET",
                url,
                params=params,
                headers=headers,
                timeout=30,
                retries=2,
            )
            if error:
                return {"error": f"ChEMBL query failed: {error}", "summary": f"ChEMBL query failed: {error}"}
            molecules = []
            for mol in data.get("molecules", []):
                # molecule_properties may be present but null
                props = mol.get("molecule_properties", {}) or {}
                molecules.append({
                    "chembl_id": mol.get("molecule_chembl_id", ""),
                    "pref_name": mol.get("pref_name", ""),
                    "molecule_type": mol.get("molecule_type", ""),
                    "max_phase": mol.get("max_phase", 0),
                    "mw": props.get("full_mwt"),
                    "logp": props.get("alogp"),
                    "smiles": (mol.get("molecule_structures", {}) or {}).get("canonical_smiles", ""),
                })

            return {
                "summary": f"ChEMBL molecule search '{query}': {len(molecules)} hits",
                "query": query,
                "molecules": molecules,
            }

        elif query_type == "target":
            url = f"{base}/target/search.json"
            params = {"q": query, "limit": max_results}
            data, error = request_json(
                "GET",
                url,
                params=params,
                headers=headers,
                timeout=30,
                retries=2,
            )
            if error:
                return {"error": f"ChEMBL query failed: {error}", "summary": f"ChEMBL query failed: {error}"}
            targets = []
            for tgt in data.get("targets", []):
                targets.append({
                    "chembl_id": tgt.get("target_chembl_id", ""),
                    "pref_name": tgt.get("pref_name", ""),
                    "organism": tgt.get("organism", ""),
                    "target_type": tgt.get("target_type", ""),
                })

            return {
                "summary": f"ChEMBL target search '{query}': {len(targets)} hits",
                "query": query,
                "targets": targets,
            }

        elif query_type == "activity":
            # Support both target and molecule ChEMBL IDs.
            # If query starts with CHEMBL it may be either; try molecule first.
            # Compound names are resolved to a molecule ChEMBL ID via search.
            molecule_id = None
            target_id = None

            if query.startswith("CHEMBL"):
                # Could be target or molecule — try molecule activity first
                molecule_id = query
            else:
                # Resolve compound name to a ChEMBL molecule ID
                search_url = f"{base}/molecule/search.json"
                search_params = {"q": query, "limit": 5}
                search_data, search_error = request_json(
                    "GET",
                    search_url,
                    params=search_params,
                    headers=headers,
                    timeout=30,
                    retries=2,
                )
                if not search_error:
                    mols = search_data.get("molecules", [])
                    if mols:
                        molecule_id = mols[0].get("molecule_chembl_id", "")

            # Query activities by molecule ChEMBL ID
            activities = []
            if molecule_id:
                url = f"{base}/activity.json"
                params = {
                    "molecule_chembl_id": molecule_id,
                    "limit": max_results,
                }
                data, error = request_json(
                    "GET",
                    url,
                    params=params,
                    headers=headers,
                    timeout=30,
                    retries=2,
                )
                if not error:
                    activities.extend(_activity_record(act) for act in data.get("activities", []))

            # If no results from molecule lookup, try the ID as a target
            if not activities:
                target_id = query if query.startswith("CHEMBL") else None
                if target_id:
                    url = f"{base}/activity.json"
                    params = {
                        "target_chembl_id": target_id,
                        "limit": max_results,
                        # Restrict to the common potency measures
                        "standard_type__in": "IC50,Ki,Kd,EC50",
                    }
                    data, error = request_json(
                        "GET",
                        url,
                        params=params,
                        headers=headers,
                        timeout=30,
                        retries=2,
                    )
                    if not error:
                        activities.extend(_activity_record(act) for act in data.get("activities", []))

            resolved_id = molecule_id or target_id or query
            return {
                "summary": f"ChEMBL activities for {query} ({resolved_id}): {len(activities)} results",
                "query": query,
                "chembl_id": resolved_id,
                "activities": activities,
            }

        elif query_type == "similarity":
            # The similarity threshold (70%) is a fixed path segment of the
            # ChEMBL similarity endpoint.
            url = f"{base}/similarity/{query}/70.json"
            params = {"limit": max_results}
            data, error = request_json(
                "GET",
                url,
                params=params,
                headers=headers,
                timeout=30,
                retries=2,
            )
            if error:
                return {"error": f"ChEMBL query failed: {error}", "summary": f"ChEMBL query failed: {error}"}
            hits = []
            for mol in data.get("molecules", []):
                hits.append({
                    "chembl_id": mol.get("molecule_chembl_id", ""),
                    "pref_name": mol.get("pref_name", ""),
                    "similarity": mol.get("similarity", 0),
                    "smiles": (mol.get("molecule_structures", {}) or {}).get("canonical_smiles", ""),
                })

            return {
                "summary": f"ChEMBL similarity search: {len(hits)} hits (>70% similar)",
                "query": query,
                "hits": hits,
            }

        else:
            return {"error": f"Unknown query_type: {query_type_raw}. Use 'molecule', 'target', 'activity', or 'similarity'", "summary": f"Unknown query_type: {query_type_raw}. Use 'molecule', 'target', 'activity', or 'similarity'"}
    except Exception as e:
        return {"error": f"ChEMBL query failed: {e}", "summary": f"ChEMBL query failed: {e}"}
|
|
408
|
+
@registry.register(
    name="literature.openalex_search",
    description="Search OpenAlex for academic publications with citation data and open access links",
    category="literature",
    parameters={
        "query": "Search query",
        "max_results": "Maximum results (default 20)",
    },
    usage_guide="You want academic publications with citation metrics and open access links. Broader than PubMed — covers all scientific literature. Use for comprehensive literature reviews.",
)
def openalex_search(query: str, max_results: int = 20, **kwargs) -> dict:
    """Query the OpenAlex works endpoint and flatten hits with citation counts."""
    try:
        import httpx
    except ImportError:
        return {"error": "httpx required (pip install httpx)", "summary": "httpx required (pip install httpx)"}
    endpoint = "https://api.openalex.org/works"
    query_params = {
        "search": query,
        "per_page": max_results,
        "sort": "relevance_score:desc",
        "mailto": "ct@celltype.bio",
    }

    payload, error = request_json(
        "GET",
        endpoint,
        params=query_params,
        timeout=30,
        retries=2,
    )
    if error:
        failure = f"OpenAlex search failed: {error}"
        return {"error": failure, "summary": failure}

    works = payload.get("results", [])
    total = payload.get("meta", {}).get("count", 0)

    articles = []
    for work in works:
        # First author, when an authorship list is present.
        authorships = work.get("authorships", [])
        lead_author = (
            authorships[0].get("author", {}).get("display_name", "")
            if authorships
            else ""
        )

        # Venue name lives under primary_location -> source; both may be null.
        venue = (work.get("primary_location") or {}).get("source") or {}

        articles.append({
            "title": work.get("title", ""),
            "first_author": lead_author,
            "publication_year": work.get("publication_year"),
            "cited_by_count": work.get("cited_by_count", 0),
            "doi": work.get("doi", ""),
            "open_access": (work.get("open_access") or {}).get("is_oa", False),
            "source": venue.get("display_name", ""),
            "type": work.get("type", ""),
        })

    return {
        "summary": f"OpenAlex search '{query}': {total} total, showing {len(articles)}",
        "query": query,
        "total_count": total,
        "articles": articles,
    }
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
@registry.register(
    name="literature.patent_search",
    description="Search patent databases for drug discovery-relevant patents (Lens.org, EPO OPS, or PubMed fallback)",
    category="literature",
    parameters={
        "query": "Patent search query (e.g. 'CRBN molecular glue degrader')",
        "max_results": "Maximum number of results (default 20)",
    },
    usage_guide="You need to find relevant patents for a target, compound class, or technology. Use to assess patent landscape, freedom to operate, or find prior art. Tries Lens.org API first (if api.lens_key configured), then EPO Open Patent Services, then falls back to PubMed patent-related literature.",
)
def patent_search(query: str, max_results: int = 20, **kwargs) -> dict:
    """Tiered patent search for drug discovery-relevant patents.

    Tier order:
    1. Lens.org Patent API (when an api.lens_key is configured on the session)
    2. EPO Open Patent Services (free, no key required for basic search)
    3. PubMed fallback (patent-related publications)
    """
    # Pull the optional Lens.org API key from the session config, if any.
    session = kwargs.get("_session", None)
    lens_key = (
        session.config.get("api.lens_key", None)
        if session and hasattr(session, "config")
        else None
    )

    # Tier 1: Lens.org, only when a key is configured.
    if lens_key:
        lens_result = _patent_search_lens(query, max_results, lens_key)
        if lens_result and "error" not in lens_result:
            return lens_result

    # Tier 2: EPO Open Patent Services (no key required).
    epo_result = _patent_search_epo(query, max_results)
    if epo_result and "error" not in epo_result:
        return epo_result

    # Tier 3: patent-flavored PubMed search.
    return _patent_search_pubmed_fallback(query, max_results)
|
|
510
|
+
|
|
511
|
+
|
|
512
|
+
def _patent_search_lens(query: str, max_results: int, api_key: str) -> dict:
    """Run a Lens.org patent search and normalize the hits into plain dicts."""
    endpoint = "https://api.lens.org/patent/search"
    req_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    body = {
        "query": {
            "match": query,
        },
        "size": max_results,
        "sort": [{"relevance": "desc"}],
        "include": [
            "lens_id", "title", "abstract", "applicant",
            "publication_date", "publication_key", "jurisdiction",
            "doc_number", "kind",
        ],
    }

    resp, error = request(
        "POST",
        endpoint,
        json=body,
        headers=req_headers,
        timeout=30,
        retries=2,
        raise_for_status=False,
    )
    if error:
        failure = f"Lens.org API request failed: {error}"
        return {"error": failure, "summary": failure}
    if resp.status_code != 200:
        failure = f"Lens.org API returned status {resp.status_code}"
        return {"error": failure, "summary": failure}
    try:
        data = resp.json()
    except Exception:
        return {"error": "Lens.org API returned invalid JSON", "summary": "Lens.org API returned invalid JSON"}

    hits = data.get("data", [])
    total = data.get("total", 0)

    patents = []
    for hit in hits:
        # Title and abstract arrive as lists of localized text entries;
        # take the first of each when present.
        titles = hit.get("title", [])
        abstracts = hit.get("abstract", [])
        applicant_list = hit.get("applicant", [])

        patents.append({
            "lens_id": hit.get("lens_id", ""),
            "title": titles[0].get("text", "") if titles else "",
            "abstract": abstracts[0].get("text", "")[:300] if abstracts else "",
            "applicants": [a.get("name", "") for a in applicant_list[:3]] if applicant_list else [],
            "publication_date": hit.get("publication_date", ""),
            "doc_number": hit.get("doc_number", ""),
            "jurisdiction": hit.get("jurisdiction", ""),
            "kind": hit.get("kind", ""),
        })

    # Compute a year span of the returned documents for the summary line.
    pub_dates = [p["publication_date"] for p in patents if p["publication_date"]]
    date_range = ""
    if pub_dates:
        years = sorted(set(d[:4] for d in pub_dates if len(d) >= 4))
        if years:
            date_range = f" ({years[0]}-{years[-1]})"

    return {
        "summary": f"Patent search '{query}': {total} total, showing {len(patents)}{date_range}",
        "source": "lens.org",
        "query": query,
        "total_count": total,
        "patents": patents,
    }
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
def _patent_search_epo(query: str, max_results: int) -> dict:
    """Search EPO Open Patent Services (Espacenet OPS) — free, no key required.

    Fetches bibliographic search results as XML and flattens each
    exchange-document into a plain dict (patent number, title, abstract,
    applicants, publication date).

    Args:
        query: Patent search query, passed directly as the OPS "q" parameter.
        max_results: Result cap; the requested OPS range is clamped to 100.

    Returns:
        On success: dict with "summary", "source" ("epo_ops"), "query",
        "total_count", and "patents". On any failure (HTTP error, non-XML
        response, unparseable XML, zero parseable documents): dict with
        "error" and "summary".
    """
    import xml.etree.ElementTree as ET

    # EPO OPS biblio search endpoint
    url = "https://ops.epo.org/3.2/rest-services/published-data/search/biblio"
    params = {
        "q": query,
        # OPS ranges are 1-based and capped at 100 entries per request.
        "Range": f"1-{min(max_results, 100)}",
    }
    headers = {
        "Accept": "application/xml",
    }

    resp, error = request(
        "GET",
        url,
        params=params,
        headers=headers,
        timeout=30,
        retries=0,
        raise_for_status=False,
    )
    if error:
        return {"error": f"EPO OPS request failed: {error}", "summary": f"EPO OPS request failed: {error}"}
    if resp.status_code == 404:
        # OPS signals an empty result set with 404 rather than an empty body.
        return {"error": "No patents found via EPO OPS", "summary": "No patents found via EPO OPS"}
    if resp.status_code == 403:
        # Rate limited or auth required
        return {"error": "EPO OPS rate limited or requires authentication", "summary": "EPO OPS rate limited or requires authentication"}
    if resp.status_code != 200:
        return {"error": f"EPO OPS returned status {resp.status_code}", "summary": f"EPO OPS returned status {resp.status_code}"}
    # Validate Content-Type before XML parsing
    content_type = ""
    try:
        ct_raw = resp.headers.get("content-type", "")
        if isinstance(ct_raw, str):
            content_type = ct_raw.lower()
    except Exception:
        # Header access is best-effort; fall through with an empty type.
        pass
    if content_type and "xml" not in content_type and "text/plain" not in content_type:
        return {"error": f"EPO OPS returned {content_type}, expected XML", "summary": "EPO OPS returned non-XML response"}

    # Parse XML response
    try:
        root = ET.fromstring(resp.text)
    except ET.ParseError as e:
        return {"error": f"Failed to parse EPO OPS XML: {e}", "summary": "Failed to parse EPO patent XML"}

    # EPO OPS XML namespaces ("epo" and "exch" alias the same exchange schema)
    ns = {
        "ops": "http://ops.epo.org",
        "epo": "http://www.epo.org/exchange",
        "exch": "http://www.epo.org/exchange",
    }

    patents = []
    total_count = 0

    # Try to get total count
    total_elem = root.find(".//ops:biblio-search", ns)
    if total_elem is not None:
        total_count = int(total_elem.get("total-result-count", 0))

    # Extract patent documents
    for doc in root.findall(".//exch:exchange-document", ns):
        doc_id = doc.get("doc-number", "")
        country = doc.get("country", "")
        kind = doc.get("kind", "")

        # Title: prefer an English entry; any non-English title only fills
        # in when no title has been captured yet.
        title = ""
        for title_elem in doc.findall(".//exch:invention-title", ns):
            if title_elem.get("lang", "") == "en" or not title:
                title = title_elem.text or ""

        # Applicants
        applicants = []
        for app in doc.findall(".//exch:applicant/exch:applicant-name/exch:name", ns):
            if app.text:
                applicants.append(app.text)

        # Publication date: first non-empty date under publication-reference
        pub_date = ""
        for pub_ref in doc.findall(".//exch:publication-reference//exch:date", ns):
            if pub_ref.text:
                pub_date = pub_ref.text
                break

        # Abstract: same English-preference rule as the title; paragraphs
        # are joined and truncated to 300 characters.
        abstract = ""
        for abs_elem in doc.findall(".//exch:abstract", ns):
            if abs_elem.get("lang", "") == "en" or not abstract:
                parts = []
                for p in abs_elem.findall(".//exch:p", ns):
                    if p.text:
                        parts.append(p.text)
                if parts:
                    abstract = " ".join(parts)[:300]

        # Compose a display number like "EP1234567A1" when country is known.
        patent_number = f"{country}{doc_id}{kind}" if country else doc_id

        patents.append({
            "patent_number": patent_number,
            "title": title,
            "abstract": abstract,
            "applicants": applicants[:3],
            "publication_date": pub_date,
            "country": country,
            "kind": kind,
        })

    if not patents:
        return {"error": "EPO OPS returned no parseable patents", "summary": "EPO OPS returned no parseable patents"}
    # Date range for summary
    dates = [p["publication_date"] for p in patents if p["publication_date"]]
    date_range = ""
    if dates:
        years = sorted(set(d[:4] for d in dates if len(d) >= 4))
        if years:
            date_range = f" ({years[0]}-{years[-1]})"

    return {
        "summary": f"Patent search '{query}': {total_count} patents found via EPO{date_range}",
        "source": "epo_ops",
        "query": query,
        "total_count": total_count,
        "patents": patents,
    }
|
|
721
|
+
|
|
722
|
+
|
|
723
|
+
def _patent_search_pubmed_fallback(query: str, max_results: int) -> dict:
    """Approximate a patent search by querying PubMed for patent-related papers."""
    # Augment the user query with patent-related vocabulary.
    patent_query = f"({query}) AND (patent OR intellectual property OR claims OR USPTO OR EPO)"

    result = pubmed_search(query=patent_query, max_results=max_results)
    if "error" in result:
        return result

    total = result.get("total_count", 0)

    # Re-label the PubMed payload so callers see it as a patent-search result.
    return {
        "summary": f"Patent search '{query}' (PubMed fallback): {total} "
                   f"patent-related publications found",
        "source": "pubmed_fallback",
        "query": query,
        "note": "No patent API available — showing patent-related PubMed publications. "
                "Configure api.lens_key for direct patent search via Lens.org.",
        "total_count": total,
        "articles": result.get("articles", []),
    }
|