labmate_mcp-7.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- labmate_mcp/__init__.py +4 -0
- labmate_mcp/__main__.py +3 -0
- labmate_mcp/apis.py +1744 -0
- labmate_mcp/bench.py +3392 -0
- labmate_mcp/chemistry.py +572 -0
- labmate_mcp/peptide.py +384 -0
- labmate_mcp/server.py +5116 -0
- labmate_mcp/writing.py +1488 -0
- labmate_mcp-7.0.0.dist-info/METADATA +495 -0
- labmate_mcp-7.0.0.dist-info/RECORD +14 -0
- labmate_mcp-7.0.0.dist-info/WHEEL +5 -0
- labmate_mcp-7.0.0.dist-info/entry_points.txt +2 -0
- labmate_mcp-7.0.0.dist-info/licenses/LICENSE +21 -0
- labmate_mcp-7.0.0.dist-info/top_level.txt +1 -0
labmate_mcp/apis.py
ADDED
@@ -0,0 +1,1744 @@

"""
API clients for scholarly-mcp.

All functions are async, return parsed dicts/lists or None on failure.
Each API module is self-contained with its own headers and error handling.
"""

from __future__ import annotations

import asyncio
import contextlib
import logging
import os
import re
from typing import Any

import httpx

logger = logging.getLogger("scholarly-mcp")

# =============================================================================
# Configuration (from environment variables)
# =============================================================================

VERSION = "7.0.0"
USER_AGENT = (
    f"scholarly-mcp/{VERSION} "
    "(https://github.com/JonasRackl/chemrxiv-mcp; "
    "mailto:scholarly-mcp@users.noreply.github.com)"
)
TIMEOUT = 30

# Optional credentials — features activate when set
S2_API_KEY: str | None = os.environ.get("S2_API_KEY")
OPENALEX_EMAIL: str | None = os.environ.get("OPENALEX_EMAIL")
UNPAYWALL_EMAIL: str | None = os.environ.get(
    "UNPAYWALL_EMAIL",
    os.environ.get("OPENALEX_EMAIL", "scholarly-mcp@users.noreply.github.com"),
)
WOS_API_KEY: str | None = os.environ.get("WOS_API_KEY")
MP_API_KEY: str | None = os.environ.get("MP_API_KEY")
RXN_API_KEY: str | None = os.environ.get("RXN_API_KEY")
COMPTOX_API_KEY: str | None = os.environ.get("COMPTOX_API_KEY")

# Base URLs
CROSSREF_BASE = "https://api.crossref.org"
OPENALEX_BASE = "https://api.openalex.org"
S2_BASE = "https://api.semanticscholar.org"
UNPAYWALL_BASE = "https://api.unpaywall.org/v2"
PUBCHEM_BASE = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
CAS_BASE = "https://commonchemistry.cas.org/api"
WOS_BASE = "https://wos-api.clarivate.com/api/wos"
NIST_BASE = "https://webbook.nist.gov/cgi/cbook.cgi"
MP_BASE = "https://api.materialsproject.org"
RXN_BASE = "https://rxn.res.ibm.com/rxn/api/api/v1"
UNICHEM_BASE = "https://www.ebi.ac.uk/unichem/api/v1"
COD_BASE = "https://www.crystallography.net/cod"
COMPTOX_BASE = "https://api-ccte.epa.gov"
MASSBANK_BASE = "https://massbank.eu/MassBank/api"
BINDINGDB_BASE = "https://bindingdb.org/axis2/services/BDBService"
PDB_DATA_BASE = "https://data.rcsb.org/rest/v1/core"
PDB_SEARCH_BASE = "https://search.rcsb.org/rcsbsearch/v2/query"
NPCLASSIFIER_BASE = "https://npclassifier.gnps2.org"

# Semantic Scholar field sets
S2_SEARCH_FIELDS = (
    "paperId,externalIds,title,abstract,year,venue,citationCount,"
    "influentialCitationCount,isOpenAccess,openAccessPdf,tldr,authors"
)
S2_DETAIL_FIELDS = (
    "paperId,externalIds,url,title,abstract,venue,publicationVenue,year,"
    "referenceCount,citationCount,influentialCitationCount,isOpenAccess,"
    "openAccessPdf,fieldsOfStudy,s2FieldsOfStudy,publicationTypes,"
    "publicationDate,journal,authors,tldr"
)
S2_CITATION_FIELDS = (
    "paperId,title,year,citationCount,authors,intents,isInfluential,contexts"
)
S2_AUTHOR_FIELDS = (
    "authorId,name,affiliations,paperCount,citationCount,hIndex"
)
S2_AUTHOR_DETAIL_FIELDS = (
    "authorId,name,affiliations,paperCount,citationCount,hIndex,"
    "papers,papers.paperId,papers.title,papers.year,papers.citationCount,"
    "papers.venue,papers.externalIds"
)

# ChemRxiv DOI prefix (for ChemRxiv-specific searches via Crossref)
CHEMRXIV_DOI_PREFIX = "10.26434"

# ChemRxiv subject categories (hardcoded from Atypon platform facets)
CHEMRXIV_CATEGORIES: dict[int, str] = {
    502556: "Analytical Chemistry",
    502557: "Biological and Medicinal Chemistry",
    502558: "Catalysis",
    502559: "Chemical Biology",
    502560: "Chemical Engineering and Industrial Chemistry",
    502561: "Earth, Space, and Environmental Chemistry",
    502562: "Education",
    502563: "Inorganic Chemistry",
    502564: "Materials Chemistry",
    502565: "Materials Science",
    502566: "Nanoscience",
    502567: "Organic Chemistry",
    502568: "Organometallic Chemistry",
    502569: "Physical Chemistry",
    502570: "Polymer Chemistry",
    502571: "Supramolecular Chemistry",
    502572: "Theoretical and Computational Chemistry",
    502573: "Other",
}

# =============================================================================
# Shared HTTP helpers
# =============================================================================


@contextlib.asynccontextmanager
async def _http(**kwargs):
    """Shared async HTTP client context manager."""
    defaults = {
        "timeout": TIMEOUT,
        "follow_redirects": True,
        "headers": {"User-Agent": USER_AGENT, "Accept": "application/json"},
    }
    defaults.update(kwargs)
    async with httpx.AsyncClient(**defaults) as client:
        yield client


async def _get(
    url: str,
    params: dict | None = None,
    headers: dict | None = None,
) -> dict | None:
    """HTTP GET returning parsed JSON or None on failure."""
    try:
        async with _http() as client:
            resp = await client.get(url, params=params, headers=headers or {})
            resp.raise_for_status()
            return resp.json()
    except httpx.HTTPStatusError as e:
        logger.warning(f"HTTP {e.response.status_code} for GET {url}")
        return None
    except Exception as e:
        logger.warning(f"GET {url} failed: {e}")
        return None


async def _post(
    url: str,
    json_data: dict | None = None,
    params: dict | None = None,
    headers: dict | None = None,
) -> dict | None:
    """HTTP POST returning parsed JSON or None on failure."""
    try:
        async with _http() as client:
            resp = await client.post(
                url, json=json_data, params=params, headers=headers or {}
            )
            resp.raise_for_status()
            return resp.json()
    except httpx.HTTPStatusError as e:
        logger.warning(f"HTTP {e.response.status_code} for POST {url}")
        return None
    except Exception as e:
        logger.warning(f"POST {url} failed: {e}")
        return None

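These two wrappers swallow errors and normalize every response to `dict | None`, so the clients below can chain lookups without try/except. For orientation, a minimal usage sketch — an editor's illustration, not shipped in the wheel; it assumes the module is importable as `labmate_mcp.apis` (per the wheel's RECORD) and that network access is available:

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    # _get returns parsed JSON on 2xx, None on any failure (logged as a warning)
    data = await apis._get(f"{apis.CROSSREF_BASE}/works", params={"rows": 1})
    print("ok" if data else "request failed")


asyncio.run(main())
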
# =============================================================================
# Crossref API
# =============================================================================


async def crossref_search(
    query: str,
    rows: int = 10,
    offset: int = 0,
    sort: str = "relevance",
    filters: dict[str, str] | None = None,
) -> dict | None:
    """Search Crossref works. Returns full API response dict."""
    params: dict[str, Any] = {"query": query, "rows": rows, "offset": offset}
    if sort == "date":
        params["sort"] = "deposited"
        params["order"] = "desc"
    if filters:
        params["filter"] = ",".join(f"{k}:{v}" for k, v in filters.items())
    headers = {}
    if OPENALEX_EMAIL:
        headers["mailto"] = OPENALEX_EMAIL  # Crossref polite pool
    return await _get(f"{CROSSREF_BASE}/works", params=params, headers=headers)


async def crossref_search_chemrxiv(
    query: str = "",
    rows: int = 10,
    offset: int = 0,
    sort: str = "relevance",
    date_from: str | None = None,
    date_to: str | None = None,
) -> dict | None:
    """Search specifically within ChemRxiv preprints via Crossref DOI prefix."""
    filters: dict[str, str] = {"prefix": CHEMRXIV_DOI_PREFIX}
    if date_from:
        filters["from-posted-date"] = date_from
    if date_to:
        filters["until-posted-date"] = date_to
    return await crossref_search(
        query=query, rows=rows, offset=offset, sort=sort, filters=filters
    )


async def crossref_get_work(doi: str) -> dict | None:
    """Get a single work by DOI. Returns the message object."""
    headers = {}
    if OPENALEX_EMAIL:
        headers["mailto"] = OPENALEX_EMAIL
    data = await _get(f"{CROSSREF_BASE}/works/{doi}", headers=headers)
    if data:
        return data.get("message")
    return None

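The ChemRxiv search is just a Crossref works query constrained to DOI prefix 10.26434. A sketch of how it might be driven — editor's illustration, not in the wheel; it assumes Crossref's usual `message.items` response envelope:

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    resp = await apis.crossref_search_chemrxiv("photocatalysis", rows=5)
    items = (resp or {}).get("message", {}).get("items", [])
    for item in items:
        # Crossref titles arrive as lists of strings
        print(item.get("DOI"), (item.get("title") or ["(untitled)"])[0])


asyncio.run(main())
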
# =============================================================================
# OpenAlex API
# =============================================================================


def _oa_params(**extra) -> dict:
    """Build OpenAlex params with polite pool email."""
    params = dict(extra)
    if OPENALEX_EMAIL:
        params["mailto"] = OPENALEX_EMAIL
    return params


async def openalex_search(
    query: str,
    filters: str | None = None,
    per_page: int = 10,
    page: int = 1,
    sort: str | None = None,
) -> dict | None:
    """Search OpenAlex works. Supports advanced filters and sorting."""
    params = _oa_params(search=query, per_page=per_page, page=page)
    if filters:
        params["filter"] = filters
    if sort:
        params["sort"] = sort
    return await _get(f"{OPENALEX_BASE}/works", params=params)


async def openalex_get_work(identifier: str) -> dict | None:
    """Get work by DOI or OpenAlex ID (W-prefixed)."""
    if identifier.startswith("W") or identifier.startswith("https://"):
        url = f"{OPENALEX_BASE}/works/{identifier}"
    else:
        url = f"{OPENALEX_BASE}/works/doi:{identifier}"
    return await _get(url, params=_oa_params())


async def openalex_get_author(identifier: str) -> dict | None:
    """Get author by OpenAlex ID or search by name."""
    if identifier.startswith("A") or identifier.startswith("https://"):
        return await _get(
            f"{OPENALEX_BASE}/authors/{identifier}", params=_oa_params()
        )
    # Name search — return first result
    data = await _get(
        f"{OPENALEX_BASE}/authors",
        params=_oa_params(search=identifier, per_page=1),
    )
    if data and data.get("results"):
        return data["results"][0]
    return data


async def openalex_search_authors(
    query: str, per_page: int = 10
) -> dict | None:
    """Search authors by name."""
    return await _get(
        f"{OPENALEX_BASE}/authors",
        params=_oa_params(search=query, per_page=per_page),
    )


async def openalex_group_by(
    group_by: str,
    filters: str | None = None,
    search: str | None = None,
    per_page: int = 200,
) -> dict | None:
    """Aggregate works by a field for bibliometric analysis.

    group_by options: publication_year, authorships.author.id,
    primary_location.source.id, topics.id, open_access.oa_status, etc.
    """
    params = _oa_params(group_by=group_by, per_page=per_page)
    if filters:
        params["filter"] = filters
    if search:
        params["search"] = search
    return await _get(f"{OPENALEX_BASE}/works", params=params)


async def openalex_get_topic(identifier: str) -> dict | None:
    """Get topic by OpenAlex ID or search by name."""
    if identifier.startswith("T") or identifier.startswith("https://"):
        return await _get(
            f"{OPENALEX_BASE}/topics/{identifier}", params=_oa_params()
        )
    data = await _get(
        f"{OPENALEX_BASE}/topics",
        params=_oa_params(search=identifier, per_page=1),
    )
    if data and data.get("results"):
        return data["results"][0]
    return data


async def openalex_get_works(
    filters: str,
    per_page: int = 10,
    page: int = 1,
    sort: str | None = None,
) -> dict | None:
    """Get works by filter (no search term). For citation network queries."""
    params = _oa_params(filter=filters, per_page=per_page, page=page)
    if sort:
        params["sort"] = sort
    return await _get(f"{OPENALEX_BASE}/works", params=params)


def openalex_reconstruct_abstract(inverted_index: dict) -> str:
    """Reconstruct abstract text from OpenAlex inverted index."""
    if not inverted_index:
        return ""
    words: dict[int, str] = {}
    for word, positions in inverted_index.items():
        for pos in positions:
            words[pos] = word
    return " ".join(words[i] for i in sorted(words)) if words else ""

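OpenAlex ships abstracts as an inverted index (word → list of positions) rather than plain text; `openalex_reconstruct_abstract` inverts that mapping and joins words in position order. A self-contained, offline sketch (editor's illustration, not part of the wheel):

# --- usage sketch: editor's illustration, not part of the package ---
from labmate_mcp.apis import openalex_reconstruct_abstract

inverted = {"index.": [4], "an": [2], "This": [0], "inverted": [3], "is": [1]}
assert openalex_reconstruct_abstract(inverted) == "This is an inverted index."
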
# =============================================================================
# Semantic Scholar API
# =============================================================================


def _s2_headers() -> dict:
    """Semantic Scholar headers with optional API key."""
    h: dict[str, str] = {}
    if S2_API_KEY:
        h["x-api-key"] = S2_API_KEY
    return h


async def s2_search(
    query: str,
    limit: int = 10,
    offset: int = 0,
    fields: str = S2_SEARCH_FIELDS,
    fields_of_study: str | None = None,
    year: str | None = None,
    publication_types: str | None = None,
    open_access_pdf: bool | None = None,
) -> dict | None:
    """Search Semantic Scholar papers.

    Supports boolean queries, exact phrases ("..."), and filters.
    year format: "2020" or "2020-2025" or "2020-"
    """
    params: dict[str, Any] = {
        "query": query,
        "fields": fields,
        "limit": limit,
        "offset": offset,
    }
    if fields_of_study:
        params["fieldsOfStudy"] = fields_of_study
    if year:
        params["year"] = year
    if publication_types:
        params["publicationTypes"] = publication_types
    if open_access_pdf:
        params["openAccessPdf"] = ""
    return await _get(
        f"{S2_BASE}/graph/v1/paper/search",
        params=params,
        headers=_s2_headers(),
    )


async def s2_get_paper(
    paper_id: str, fields: str = S2_DETAIL_FIELDS
) -> dict | None:
    """Get paper by Semantic Scholar ID or external ID.

    paper_id formats:
    - S2 paper ID (40-char hex)
    - DOI:10.1234/xxx
    - ARXIV:2101.12345
    - PMID:12345678
    - CorpusId:12345678
    """
    return await _get(
        f"{S2_BASE}/graph/v1/paper/{paper_id}",
        params={"fields": fields},
        headers=_s2_headers(),
    )


async def s2_get_citations(
    paper_id: str,
    limit: int = 50,
    offset: int = 0,
    fields: str = S2_CITATION_FIELDS,
) -> dict | None:
    """Get papers that cite the given paper.

    Includes isInfluential flag and citation intents.
    """
    return await _get(
        f"{S2_BASE}/graph/v1/paper/{paper_id}/citations",
        params={"fields": fields, "limit": limit, "offset": offset},
        headers=_s2_headers(),
    )


async def s2_get_references(
    paper_id: str,
    limit: int = 50,
    offset: int = 0,
    fields: str = S2_CITATION_FIELDS,
) -> dict | None:
    """Get papers referenced by the given paper."""
    return await _get(
        f"{S2_BASE}/graph/v1/paper/{paper_id}/references",
        params={"fields": fields, "limit": limit, "offset": offset},
        headers=_s2_headers(),
    )


async def s2_get_recommendations(
    positive_ids: list[str],
    negative_ids: list[str] | None = None,
    limit: int = 10,
    fields: str = S2_SEARCH_FIELDS,
) -> dict | None:
    """Get paper recommendations based on positive/negative examples.

    IDs can be S2 paper IDs, DOI:xxx, ARXIV:xxx, etc.
    For single-paper similarity, pass one positive ID.
    """
    body: dict[str, Any] = {"positivePaperIds": positive_ids}
    if negative_ids:
        body["negativePaperIds"] = negative_ids
    return await _post(
        f"{S2_BASE}/recommendations/v1/papers/",
        json_data=body,
        params={"fields": fields, "limit": limit},
        headers=_s2_headers(),
    )


async def s2_search_author(
    query: str, limit: int = 5, fields: str = S2_AUTHOR_FIELDS
) -> dict | None:
    """Search for authors by name."""
    return await _get(
        f"{S2_BASE}/graph/v1/author/search",
        params={"query": query, "limit": limit, "fields": fields},
        headers=_s2_headers(),
    )


async def s2_get_author(
    author_id: str, fields: str = S2_AUTHOR_DETAIL_FIELDS
) -> dict | None:
    """Get author details including recent papers."""
    return await _get(
        f"{S2_BASE}/graph/v1/author/{author_id}",
        params={"fields": fields},
        headers=_s2_headers(),
    )

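A sketch chaining a search with the citations endpoint — editor's illustration, not in the wheel; it assumes the Graph API's `data` envelope and the top-level `isInfluential` flag on citation entries:

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    hits = await apis.s2_search("asymmetric organocatalysis", limit=3, year="2020-")
    for paper in (hits or {}).get("data", []):  # assumed response shape
        print(paper.get("year"), paper.get("title"))
        cites = await apis.s2_get_citations(paper["paperId"], limit=5)
        influential = [
            c for c in (cites or {}).get("data", []) if c.get("isInfluential")
        ]
        print(f"  {len(influential)} influential citation(s) in first page")


asyncio.run(main())
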
# =============================================================================
# Unpaywall API
# =============================================================================


async def unpaywall_get(doi: str) -> dict | None:
    """Find open access PDF location for a DOI."""
    email = UNPAYWALL_EMAIL or "scholarly-mcp@users.noreply.github.com"
    return await _get(f"{UNPAYWALL_BASE}/{doi}", params={"email": email})

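A sketch pulling the best open-access PDF URL — editor's illustration, not in the wheel; the DOI below is hypothetical and the `best_oa_location` key is the assumed Unpaywall schema:

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    record = await apis.unpaywall_get("10.1021/acs.joc.0c00000")  # hypothetical DOI
    best = (record or {}).get("best_oa_location") or {}
    print(best.get("url_for_pdf") or "no OA copy found")


asyncio.run(main())
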
# =============================================================================
# PubChem API
# =============================================================================


async def pubchem_search_by_name(name: str) -> dict | None:
    """Search PubChem compounds by name."""
    return await _get(f"{PUBCHEM_BASE}/compound/name/{name}/JSON")


async def pubchem_search_by_smiles(smiles: str) -> dict | None:
    """Search PubChem compounds by SMILES string."""
    # Use POST for SMILES to handle special characters
    try:
        async with _http() as client:
            resp = await client.post(
                f"{PUBCHEM_BASE}/compound/smiles/JSON",
                data={"smiles": smiles},
            )
            resp.raise_for_status()
            return resp.json()
    except Exception as e:
        logger.warning(f"PubChem SMILES search failed: {e}")
        return None


async def pubchem_search_by_formula(formula: str) -> dict | None:
    """Search PubChem compounds by molecular formula."""
    return await _get(
        f"{PUBCHEM_BASE}/compound/fastformula/{formula}/JSON",
        params={"MaxRecords": 10},
    )


async def pubchem_get_compound(cid: int | str) -> dict | None:
    """Get compound record by PubChem CID."""
    return await _get(f"{PUBCHEM_BASE}/compound/cid/{cid}/JSON")


async def pubchem_get_properties(
    cid: int | str,
    properties: str | None = None,
) -> dict | None:
    """Get computed molecular properties for a compound.

    Default properties cover Lipinski rule-of-5 and common descriptors.
    """
    if properties is None:
        properties = (
            "MolecularFormula,MolecularWeight,CanonicalSMILES,"
            "IsomericSMILES,InChI,InChIKey,IUPACName,XLogP,"
            "ExactMass,MonoisotopicMass,TPSA,Complexity,"
            "HBondDonorCount,HBondAcceptorCount,RotatableBondCount,"
            "HeavyAtomCount,CovalentUnitCount"
        )
    return await _get(
        f"{PUBCHEM_BASE}/compound/cid/{cid}/property/{properties}/JSON"
    )


async def pubchem_get_synonyms(cid: int | str) -> dict | None:
    """Get all known names/synonyms for a compound."""
    return await _get(f"{PUBCHEM_BASE}/compound/cid/{cid}/synonyms/JSON")

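A sketch of the typical name → CID → properties chain — editor's illustration, not in the wheel; the `PC_Compounds` and `PropertyTable` keys are the assumed PUG REST response shapes:

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    hit = await apis.pubchem_search_by_name("caffeine")
    compounds = (hit or {}).get("PC_Compounds", [])  # assumed PUG REST shape
    if not compounds:
        return
    cid = compounds[0]["id"]["id"]["cid"]
    props = await apis.pubchem_get_properties(cid)
    table = (props or {}).get("PropertyTable", {}).get("Properties", [{}])
    print(cid, table[0].get("MolecularFormula"), table[0].get("XLogP"))


asyncio.run(main())
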
# =============================================================================
# Common Chemistry (CAS) API — free, no auth, ~500k compounds
# =============================================================================


async def cas_search(query: str, size: int = 10) -> dict | None:
    """Search Common Chemistry by name, CAS number, InChI, InChIKey, or SMILES."""
    return await _get(f"{CAS_BASE}/search", params={"q": query, "size": size})


async def cas_detail(cas_rn: str) -> dict | None:
    """Get full compound details by CAS Registry Number.

    Returns: name, CAS RN, molecular formula, molecular mass, InChI,
    InChIKey, SMILES, canonical SMILES, and experimental properties.
    """
    return await _get(f"{CAS_BASE}/detail", params={"cas_rn": cas_rn})

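A search-then-detail sketch — editor's illustration, not in the wheel; the `results`/`rn`/`molecularFormula` keys are assumed Common Chemistry response fields:

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    found = await apis.cas_search("benzaldehyde", size=1)
    results = (found or {}).get("results", [])  # assumed response shape
    if results:
        detail = await apis.cas_detail(results[0]["rn"])
        print((detail or {}).get("rn"), (detail or {}).get("molecularFormula"))


asyncio.run(main())
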
# =============================================================================
# Web of Science Starter API (optional — requires WOS_API_KEY)
# =============================================================================


def wos_available() -> bool:
    """Check if Web of Science API credentials are configured."""
    return bool(WOS_API_KEY)


async def wos_search(
    query: str,
    limit: int = 10,
    first_record: int = 1,
    sort_field: str = "RS",
    database_id: str = "WOS",
) -> dict | None:
    """Search Web of Science.

    Requires WOS_API_KEY environment variable.

    query: Web of Science advanced search syntax, e.g.:
    - TS=(catalysis AND asymmetric) — topic search
    - AU=(Smith) — author search
    - SO=(Nature) — source/journal search
    - DO=(10.1234/xxx) — DOI search

    sort_field: RS (relevance), PY (year), TC (times cited), LD (load date)
    database_id: WOS, BCI, CCC, DCI, DIIDW, KJD, MEDLINE, RSCI, SCIELO
    """
    if not WOS_API_KEY:
        return None
    return await _get(
        WOS_BASE,
        params={
            "databaseId": database_id,
            "usrQuery": query,
            "count": limit,
            "firstRecord": first_record,
            "sortField": sort_field,
        },
        headers={"X-ApiKey": WOS_API_KEY},
    )

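Because the key is optional, callers are expected to gate on `wos_available()` first. A sketch (editor's illustration, not in the wheel):

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    if not apis.wos_available():
        print("WOS_API_KEY not set; skipping")
        return
    resp = await apis.wos_search("TS=(flow chemistry AND photoredox)", limit=5)
    print(resp)  # raw Starter API payload


asyncio.run(main())
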
# =============================================================================
# NIST Chemistry WebBook (scraping — no official API)
# =============================================================================


def _strip_html(html: str) -> str:
    """Remove HTML tags and decode common entities."""
    text = re.sub(r"<[^>]+>", "", html)
    for ent, char in [("&amp;", "&"), ("&lt;", "<"), ("&gt;", ">"),
                      ("&plusmn;", "±"), ("&deg;", "°"), ("&nbsp;", " "),
                      ("&#176;", "°")]:
        text = text.replace(ent, char)
    return text.strip()


async def nist_fetch(params: dict[str, str]) -> str | None:
    """Fetch a NIST WebBook page. Returns raw HTML."""
    params.setdefault("Units", "SI")
    try:
        async with _http() as client:
            resp = await client.get(NIST_BASE, params=params)
            if resp.status_code == 200:
                return resp.text
    except Exception as e:
        logger.warning(f"NIST fetch failed: {e}")
    return None


async def nist_search(query: str, search_type: str = "name") -> str | None:
    """Search NIST WebBook by name, CAS, formula, or InChI."""
    params: dict[str, str] = {}
    if search_type == "cas":
        cas_clean = "C" + query.replace("-", "")
        params["ID"] = cas_clean
        params["Mask"] = "FFF"
    elif search_type == "formula":
        params["Formula"] = query
        params["NoIon"] = "on"
    elif search_type == "inchi":
        params["InChI"] = query
    else:
        params["Name"] = query
    return await nist_fetch(params)


def nist_is_compound_page(html: str) -> bool:
    """Check if HTML is a single compound page (vs. search results)."""
    return bool(re.search(r'<h1[^>]*id="Top"', html))


def nist_parse_search_results(html: str) -> list[dict]:
    """Parse NIST search results page into list of matches."""
    results = []
    for m in re.finditer(
        r'<a\s+href="(/cgi/cbook\.cgi\?ID=(C\d+)[^"]*)"[^>]*>(.*?)</a>',
        html,
    ):
        results.append({
            "name": _strip_html(m.group(3)),
            "nist_id": m.group(2),
            "url": f"https://webbook.nist.gov{m.group(1)}",
        })
    return results


def nist_parse_compound(html: str) -> dict:
    """Parse NIST compound page into structured data."""
    info: dict[str, Any] = {}

    # Name
    m = re.search(r'<h1[^>]*>(.*?)</h1>', html, re.S)
    if m:
        info["name"] = _strip_html(m.group(1))

    # Key-value pairs from <li><strong>Key:</strong> Value</li>
    kv_map = {
        "Formula": "formula",
        "Molecular weight": "molecular_weight",
        "CAS Registry Number": "cas_rn",
        "IUPAC Standard InChI": "inchi",
        "IUPAC Standard InChIKey": "inchi_key",
        "Chemical structure": None,  # skip image
    }
    for m in re.finditer(
        r"<li>\s*<strong>(.*?):</strong>\s*(.*?)</li>", html, re.S
    ):
        key = _strip_html(m.group(1))
        val = _strip_html(m.group(2))
        mapped = kv_map.get(key, key.lower().replace(" ", "_"))
        if mapped and val:
            info[mapped] = val

    # Other names
    m = re.search(r"<strong>Other names:</strong>\s*(.*?)</li>", html, re.S)
    if m:
        raw = _strip_html(m.group(1))
        info["other_names"] = [n.strip() for n in raw.split(";") if n.strip()]

    # NIST ID from page URLs
    m = re.search(r"ID=(C\d+)", html)
    if m:
        info["nist_id"] = m.group(1)

    # Detect available data sections
    avail: list[str] = []
    section_names = [
        ("Thermochemistry", "thermo"), ("Phase change", "phase_change"),
        ("Reaction thermochemistry", "reaction_thermo"),
        ("Henry", "henrys_law"), ("Gas phase ion", "ion_energetics"),
        ("IR Spec", "ir_spectrum"), ("Mass Spec", "mass_spectrum"),
        ("UV/Vis", "uv_vis_spectrum"), ("Vibrational", "vibrational"),
        ("Electronic", "electronic"), ("Constants of diatomic", "diatomic"),
    ]
    for label, key in section_names:
        if label.lower() in html.lower():
            avail.append(key)
    info["available_data"] = avail

    # --- Inline thermochemistry data ---
    # Gas phase ΔfH°
    m = re.search(
        r"f</sub>H.*?gas.*?(-?[\d.]+)\s*(?:±|&plusmn;)\s*([\d.]+)\s*kJ/mol",
        html, re.S,
    )
    if m:
        info["delta_fH_gas_kJ_mol"] = f"{m.group(1)} ± {m.group(2)}"

    # Standard entropy S°
    m = re.search(r"S°.*?gas.*?([\d.]+)\s*(?:±|&plusmn;)?\s*[\d.]*\s*J/mol", html, re.S)
    if m:
        info["S_gas_J_mol_K"] = m.group(1)

    # Cp gas
    m = re.search(r"C\s*p.*?gas.*?([\d.]+)\s*(?:±|&plusmn;)?\s*[\d.]*\s*J/mol", html, re.S)
    if m:
        info["Cp_gas_J_mol_K"] = m.group(1)

    # Phase change: boiling point
    for pat in [
        r"T<sub>boil</sub>\s*=?\s*([\d.]+)\s*(?:±\s*[\d.]+\s*)?K",
        r"boil.*?([\d.]+)\s*K",
    ]:
        m = re.search(pat, html, re.S)
        if m:
            info["boiling_point_K"] = m.group(1)
            break

    # Phase change: melting point
    for pat in [
        r"T<sub>fus</sub>\s*=?\s*([\d.]+)\s*(?:±\s*[\d.]+\s*)?K",
        r"fus.*?([\d.]+)\s*K",
    ]:
        m = re.search(pat, html, re.S)
        if m:
            info["melting_point_K"] = m.group(1)
            break

    # ΔvapH (enthalpy of vaporization)
    m = re.search(r"vap</sub>H.*?([\d.]+)\s*(?:±|&plusmn;)\s*([\d.]+)\s*kJ/mol", html, re.S)
    if m:
        info["delta_vapH_kJ_mol"] = f"{m.group(1)} ± {m.group(2)}"

    # ΔfusH (enthalpy of fusion)
    m = re.search(r"fus</sub>H.*?([\d.]+)\s*(?:±|&plusmn;)\s*([\d.]+)\s*kJ/mol", html, re.S)
    if m:
        info["delta_fusH_kJ_mol"] = f"{m.group(1)} ± {m.group(2)}"

    return info

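The WebBook flow is: fetch HTML, branch on whether a single compound page or a results list came back, then parse. A sketch (editor's illustration, not in the wheel; 64-17-5 is ethanol's CAS number):

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    html = await apis.nist_search("64-17-5", search_type="cas")  # ethanol
    if html is None:
        return
    if apis.nist_is_compound_page(html):
        info = apis.nist_parse_compound(html)
        print(info.get("name"), "Tb/K:", info.get("boiling_point_K"))
    else:
        for hit in apis.nist_parse_search_results(html)[:5]:
            print(hit["nist_id"], hit["name"])


asyncio.run(main())
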
# =============================================================================
# Materials Project API (optional — requires MP_API_KEY)
# =============================================================================


def mp_available() -> bool:
    """Check if Materials Project API key is configured."""
    return bool(MP_API_KEY)


async def mp_search(
    formula: str | None = None,
    elements: list[str] | None = None,
    band_gap_min: float | None = None,
    band_gap_max: float | None = None,
    limit: int = 10,
) -> dict | None:
    """Search Materials Project for inorganic materials."""
    if not MP_API_KEY:
        return None
    params: dict[str, Any] = {
        "_limit": limit,
        "_fields": (
            "material_id,formula_pretty,structure,symmetry,"
            "band_gap,formation_energy_per_atom,energy_above_hull,"
            "is_stable,theoretical,nsites"
        ),
    }
    if formula:
        params["formula"] = formula
    if elements:
        params["elements"] = ",".join(elements)
    if band_gap_min is not None:
        params["band_gap_min"] = band_gap_min
    if band_gap_max is not None:
        params["band_gap_max"] = band_gap_max
    return await _get(
        f"{MP_BASE}/materials/summary/",
        params=params,
        headers={"X-API-KEY": MP_API_KEY},
    )


async def mp_get_material(material_id: str) -> dict | None:
    """Get full material details by Materials Project ID (e.g., 'mp-149')."""
    if not MP_API_KEY:
        return None
    return await _get(
        f"{MP_BASE}/materials/summary/{material_id}",
        params={
            "_fields": (
                "material_id,formula_pretty,structure,symmetry,"
                "band_gap,formation_energy_per_atom,energy_above_hull,"
                "is_stable,theoretical,nsites,volume,density,"
                "efermi,total_magnetization,ordering,is_metal,"
                "database_IDs,deprecated,uncorrected_energy_per_atom"
            ),
        },
        headers={"X-API-KEY": MP_API_KEY},
    )

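A wide-band-gap nitride screen as a sketch — editor's illustration, not in the wheel; the `data` envelope on the summary endpoint is an assumption:

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    if not apis.mp_available():
        print("MP_API_KEY not set; skipping")
        return
    resp = await apis.mp_search(elements=["Ga", "N"], band_gap_min=3.0, limit=5)
    for mat in (resp or {}).get("data", []):  # assumed response envelope
        print(mat.get("material_id"), mat.get("formula_pretty"), mat.get("band_gap"))


asyncio.run(main())
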
# =============================================================================
# RXN4Chemistry — IBM AI reaction prediction (optional — requires RXN_API_KEY)
# =============================================================================


def rxn_available() -> bool:
    """Check if IBM RXN API key is configured."""
    return bool(RXN_API_KEY)


_rxn_project_id: str | None = None


async def _rxn_headers() -> dict[str, str]:
    """Get RXN auth headers."""
    return {
        "Authorization": f"apikey {RXN_API_KEY}",
        "Content-Type": "application/json",
    }


async def _rxn_ensure_project() -> str | None:
    """Create or return cached RXN project ID."""
    global _rxn_project_id
    if _rxn_project_id:
        return _rxn_project_id
    if not RXN_API_KEY:
        return None
    data = await _post(
        f"{RXN_BASE}/projects",
        json_data={"name": "scholarly-mcp"},
        headers=await _rxn_headers(),
    )
    if data and data.get("payload"):
        _rxn_project_id = data["payload"].get("id")
    return _rxn_project_id


async def _rxn_poll(url: str, max_wait: int = 60, interval: int = 3) -> dict | None:
    """Poll an RXN endpoint until result is ready."""
    headers = await _rxn_headers()
    for _ in range(max_wait // interval):
        data = await _get(url, headers=headers)
        if data:
            payload = data.get("payload", {})
            status = payload.get("status", data.get("status", ""))
            if status in ("SUCCESS", "success"):
                return data
            if status in ("FAILED", "failed", "ERROR", "error"):
                return data
        await asyncio.sleep(interval)
    return None


async def rxn_predict_reaction(rxn_smiles: str) -> dict | None:
    """Predict reaction product(s) from reactants.

    rxn_smiles: reactants.reagents>>products (e.g., 'CC.OC>>')
    """
    if not RXN_API_KEY:
        return None
    project_id = await _rxn_ensure_project()
    if not project_id:
        return None
    data = await _post(
        f"{RXN_BASE}/predictions",
        json_data={
            "projectId": project_id,
            "name": "mcp-prediction",
            "inputs": [{"rxnSmiles": rxn_smiles}],
        },
        headers=await _rxn_headers(),
    )
    if not data or not data.get("payload"):
        return data
    pred_id = data["payload"].get("id")
    if not pred_id:
        return data
    return await _rxn_poll(
        f"{RXN_BASE}/predictions/{pred_id}",
    )


async def rxn_retrosynthesis(product_smiles: str, max_steps: int = 3) -> dict | None:
    """Plan retrosynthetic route to a target molecule.

    product_smiles: target product SMILES
    max_steps: maximum retrosynthetic steps (1-10)
    """
    if not RXN_API_KEY:
        return None
    project_id = await _rxn_ensure_project()
    if not project_id:
        return None
    data = await _post(
        f"{RXN_BASE}/retrosynthesis",
        json_data={
            "projectId": project_id,
            "fap": 0.6,
            "maxSteps": max_steps,
            "nBeams": 10,
            "pruneThreshold": 0.2,
            "isAutomatic": True,
            "content": {"smiles": product_smiles},
        },
        headers=await _rxn_headers(),
    )
    if not data or not data.get("payload"):
        return data
    pred_id = data["payload"].get("id")
    if not pred_id:
        return data
    # Retrosynthesis takes longer — poll with longer timeout
    return await _rxn_poll(
        f"{RXN_BASE}/retrosynthesis/{pred_id}?projectId={project_id}",
        max_wait=120,
        interval=5,
    )


async def rxn_paragraph_to_actions(paragraph: str) -> dict | None:
    """Convert experimental paragraph to structured action steps.

    Uses IBM RXN NLP model to parse free-text experimental procedures
    into machine-readable action steps (MAKESOLUTION, ADD, STIR, etc.).
    """
    if not RXN_API_KEY:
        return None
    data = await _post(
        f"{RXN_BASE}/paragraph-actions",
        json_data={"paragraph": paragraph},
        headers=await _rxn_headers(),
    )
    return data


async def rxn_predict_atom_mapping(rxn_smiles: str) -> dict | None:
    """Predict atom-to-atom mapping for a reaction SMILES.

    rxn_smiles: full reaction SMILES (e.g., 'CC(=O)O.OCC>>CC(=O)OCC.O')
    Uses the atom-mapping-2020 AI model.
    """
    if not RXN_API_KEY:
        return None
    project_id = await _rxn_ensure_project()
    if not project_id:
        return None
    data = await _post(
        f"{RXN_BASE}/predictions",
        json_data={
            "projectId": project_id,
            "name": "mcp-atom-mapping",
            "inputs": [{"rxnSmiles": rxn_smiles}],
            "aiModel": "atom-mapping-2020",
        },
        headers=await _rxn_headers(),
    )
    if not data or not data.get("payload"):
        return data
    pred_id = data["payload"].get("id")
    if not pred_id:
        return data
    return await _rxn_poll(
        f"{RXN_BASE}/predictions/{pred_id}",
    )


async def rxn_synthesis_plan(
    prediction_id: str,
    sequence_index: int = 0,
) -> dict | None:
    """Create and retrieve a synthesis plan from a retrosynthesis result.

    Takes a retrosynthesis prediction ID and sequence index, creates a
    synthesis, then retrieves the step-by-step procedure with actions.
    """
    if not RXN_API_KEY:
        return None
    project_id = await _rxn_ensure_project()
    if not project_id:
        return None
    headers = await _rxn_headers()

    # Step 1: Get the retrosynthesis result to find sequence IDs
    retro_data = await _get(
        f"{RXN_BASE}/retrosynthesis/{prediction_id}?projectId={project_id}",
        headers=headers,
    )
    if not retro_data:
        return {"error": "Could not retrieve retrosynthesis result"}

    payload = retro_data.get("payload", {})
    sequences = payload.get("sequences", [])
    if not sequences:
        return {"error": "No synthesis sequences found in retrosynthesis result"}
    if sequence_index >= len(sequences):
        return {"error": f"Sequence index {sequence_index} out of range (have {len(sequences)})"}

    sequence = sequences[sequence_index]
    sequence_id = sequence.get("sequenceId", sequence.get("id", ""))
    if not sequence_id:
        return {"error": "Could not extract sequence ID from retrosynthesis result", "sequence": sequence}

    # Step 2: Create synthesis from sequence
    synth_data = await _post(
        f"{RXN_BASE}/syntheses",
        json_data={
            "sequenceId": sequence_id,
            "projectId": project_id,
        },
        headers=headers,
    )
    if not synth_data or not synth_data.get("payload"):
        return {"error": "Failed to create synthesis", "detail": synth_data}
    synthesis_id = synth_data["payload"].get("id", "")
    if not synthesis_id:
        return {"error": "No synthesis ID returned"}

    # Step 3: Poll for synthesis completion, then get procedure
    synth_result = await _rxn_poll(
        f"{RXN_BASE}/syntheses/{synthesis_id}?projectId={project_id}",
        max_wait=90,
        interval=5,
    )
    if not synth_result:
        return {"error": "Synthesis planning timed out"}

    # Step 4: Get detailed procedure
    procedure = await _get(
        f"{RXN_BASE}/syntheses/{synthesis_id}/procedure?projectId={project_id}",
        headers=headers,
    )

    return {
        "synthesis_id": synthesis_id,
        "plan": synth_result,
        "procedure": procedure,
        "sequence_index": sequence_index,
        "total_sequences": len(sequences),
    }

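A sketch chaining retrosynthesis into a synthesis plan — editor's illustration, not in the wheel; it assumes the polled retrosynthesis payload echoes the prediction id under `payload.id`, which may not hold for every RXN deployment:

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    if not apis.rxn_available():
        print("RXN_API_KEY not set; skipping")
        return
    retro = await apis.rxn_retrosynthesis("CC(=O)Oc1ccccc1C(=O)O", max_steps=2)  # aspirin
    pred_id = ((retro or {}).get("payload") or {}).get("id")  # assumed echo of the id
    if pred_id:
        plan = await apis.rxn_synthesis_plan(pred_id, sequence_index=0)
        print((plan or {}).get("total_sequences"), "route(s) found")


asyncio.run(main())
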
# =============================================================================
# Rowan Science — Cloud computational chemistry (optional)
# Requires: ROWAN_API_KEY + rowan-python package
# =============================================================================

ROWAN_API_KEY: str | None = os.environ.get("ROWAN_API_KEY")

try:
    import rowan as _rowan_sdk
    import stjames as _stjames
    _ROWAN_SDK = True
except ImportError:
    _ROWAN_SDK = False


def rowan_available() -> bool:
    """Check if Rowan Science API key AND SDK are configured."""
    return bool(ROWAN_API_KEY) and _ROWAN_SDK


async def _rowan_run_workflow(submit_fn, **kwargs) -> dict | None:
    """Submit a Rowan workflow, wait for result, return data dict.

    Runs the synchronous Rowan SDK in a thread pool to avoid blocking
    the async event loop. Returns workflow.data or error info.
    """
    if not rowan_available():
        return None

    def _run():
        _rowan_sdk.api_key = ROWAN_API_KEY
        result = submit_fn(**kwargs)
        result.wait_for_result()
        result.fetch_latest(in_place=True)
        return {
            "status": str(result.status),
            "data": result.data,
            "credits_charged": result.credits_charged,
            "workflow_uuid": result.uuid,
            "workflow_type": result.workflow_type,
            "elapsed": result.elapsed,
        }

    try:
        return await asyncio.to_thread(_run)
    except Exception as e:
        logger.warning(f"Rowan workflow failed: {e}")
        return {"error": str(e)}


async def rowan_predict_pka(
    smiles: str,
    pka_range: tuple[int, int] = (2, 12),
    method: str = "aimnet2_wagen2024",
) -> dict | None:
    """Predict pKa values for a molecule using Rowan Science."""
    return await _rowan_run_workflow(
        _rowan_sdk.submit_pka_workflow,
        initial_molecule=smiles,
        pka_range=pka_range,
        method=method,
        name="labmate-pka",
    )


async def rowan_predict_solubility(
    smiles: str,
    method: str = "fastsolv",
    solvents: list[str] | None = None,
    temperatures: list[float] | None = None,
) -> dict | None:
    """Predict solubility using Rowan Science."""
    return await _rowan_run_workflow(
        _rowan_sdk.submit_solubility_workflow,
        initial_smiles=smiles,
        solubility_method=method,
        solvents=solvents,
        temperatures=temperatures,
        name="labmate-solubility",
    )


async def rowan_predict_admet(smiles: str) -> dict | None:
    """Predict ADMET properties using Rowan Science."""
    return await _rowan_run_workflow(
        _rowan_sdk.submit_admet_workflow,
        initial_smiles=smiles,
        name="labmate-admet",
    )


async def rowan_search_tautomers(smiles: str) -> dict | None:
    """Enumerate and rank tautomers using Rowan Science."""
    mol = _stjames.Molecule.from_smiles(smiles)
    return await _rowan_run_workflow(
        _rowan_sdk.submit_tautomer_search_workflow,
        initial_molecule=mol,
        name="labmate-tautomers",
    )


async def rowan_compute_descriptors(smiles: str) -> dict | None:
    """Compute molecular descriptors using Rowan Science."""
    mol = _stjames.Molecule.from_smiles(smiles)
    return await _rowan_run_workflow(
        _rowan_sdk.submit_descriptors_workflow,
        initial_molecule=mol,
        name="labmate-descriptors",
    )


async def rowan_predict_nmr(
    smiles: str,
    solvent: str = "chloroform",
) -> dict | None:
    """Predict NMR chemical shifts using Rowan Science."""
    mol = _stjames.Molecule.from_smiles(smiles)
    return await _rowan_run_workflow(
        _rowan_sdk.submit_nmr_workflow,
        initial_molecule=mol,
        solvent=solvent,
        name="labmate-nmr",
    )

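Rowan calls are billable cloud jobs, so the wrappers only run when both the key and SDK are present. A sketch (editor's illustration, not in the wheel; `c1ccncc1` is pyridine):

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    if not apis.rowan_available():
        print("ROWAN_API_KEY or rowan-python missing; skipping")
        return
    result = await apis.rowan_predict_pka("c1ccncc1")  # pyridine
    if result and "error" not in result:
        print(result["status"], "credits:", result["credits_charged"])


asyncio.run(main())
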
# =============================================================================
# UniChem — Universal chemical identifier cross-reference (no auth)
# =============================================================================


async def unichem_lookup(inchikey: str) -> dict | None:
    """Cross-reference a compound across 40+ databases by InChIKey.

    Returns source IDs from ChEMBL, PubChem, DrugBank, ZINC, etc.
    """
    return await _post(
        f"{UNICHEM_BASE}/compounds",
        json_data={"type": "inchikey", "compound": inchikey},
    )


async def unichem_sources() -> dict | None:
    """List all available UniChem data sources."""
    return await _get(f"{UNICHEM_BASE}/sources")

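A one-call sketch (editor's illustration, not in the wheel; the InChIKey below is aspirin's, quoted from memory — verify before relying on it):

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    resp = await apis.unichem_lookup("BSYNRYMUTXBXSQ-UHFFFAOYSA-N")  # aspirin
    print(resp)  # raw cross-reference payload


asyncio.run(main())
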
# =============================================================================
# Crystallography Open Database (COD) — open crystal structures (no auth)
# =============================================================================


async def cod_search(
    formula: str | None = None,
    elements: list[str] | None = None,
    text: str | None = None,
    limit: int = 20,
) -> list | None:
    """Search COD for crystal structures.

    formula: Hill notation (e.g., 'C6 H6', 'Fe2 O3')
    elements: required elements (e.g., ['Fe', 'O'])
    text: free text search in compound names
    """
    params: dict[str, str] = {"format": "json"}
    if formula:
        params["formula"] = formula
    if elements:
        for i, el in enumerate(elements[:8], 1):
            params[f"el{i}"] = el
    if text:
        params["text"] = text
    try:
        async with _http() as client:
            resp = await client.get(f"{COD_BASE}/result", params=params)
            if resp.status_code == 200:
                data = resp.json()
                if isinstance(data, list):
                    return data[:limit]
                return data
    except Exception as e:
        logger.warning(f"COD search failed: {e}")
    return None


async def cod_get_cif(cod_id: int | str) -> str | None:
    """Download CIF file for a COD entry."""
    try:
        async with _http() as client:
            resp = await client.get(f"{COD_BASE}/{cod_id}.cif")
            if resp.status_code == 200:
                return resp.text
    except Exception as e:
        logger.warning(f"COD CIF download failed: {e}")
    return None

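A search-then-download sketch — editor's illustration, not in the wheel; the `file` key used to carry the COD entry number is an assumption about the JSON result schema:

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    entries = await apis.cod_search(formula="C6 H6", limit=3) or []
    for entry in entries:
        cod_id = entry.get("file")  # assumed key for the COD entry number
        if cod_id:
            cif = await apis.cod_get_cif(cod_id)
            print(cod_id, "CIF length:", len(cif or ""))


asyncio.run(main())
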
# =============================================================================
# EPA CompTox Dashboard (optional — requires COMPTOX_API_KEY)
# =============================================================================


def comptox_available() -> bool:
    """Check if EPA CompTox API key is configured."""
    return bool(COMPTOX_API_KEY)


async def comptox_search(query: str) -> dict | None:
    """Search CompTox by chemical name, CAS, or DTXSID."""
    if not COMPTOX_API_KEY:
        return None
    # Try name search
    return await _get(
        f"{COMPTOX_BASE}/chemical/search/by-name/{query}",
        headers={"x-api-key": COMPTOX_API_KEY, "Accept": "application/json"},
    )


async def comptox_get_details(dtxsid: str) -> dict | None:
    """Get full chemical details by DTXSID identifier."""
    if not COMPTOX_API_KEY:
        return None
    return await _get(
        f"{COMPTOX_BASE}/chemical/detail/search/by-dtxsid/{dtxsid}",
        headers={"x-api-key": COMPTOX_API_KEY, "Accept": "application/json"},
    )


async def comptox_get_properties(dtxsid: str) -> dict | None:
    """Get physicochemical and fate properties for a chemical."""
    if not COMPTOX_API_KEY:
        return None
    return await _get(
        f"{COMPTOX_BASE}/chemical/property/search/by-dtxsid/{dtxsid}",
        headers={"x-api-key": COMPTOX_API_KEY, "Accept": "application/json"},
    )


async def comptox_get_hazard(dtxsid: str) -> dict | None:
    """Get hazard data for a chemical."""
    if not COMPTOX_API_KEY:
        return None
    return await _get(
        f"{COMPTOX_BASE}/hazard/search/by-dtxsid/{dtxsid}",
        headers={"x-api-key": COMPTOX_API_KEY, "Accept": "application/json"},
    )

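A name → DTXSID → hazard sketch — editor's illustration, not in the wheel; the search endpoint may return a list of hits, and the `dtxsid` field name in those hits is an assumption:

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    if not apis.comptox_available():
        print("COMPTOX_API_KEY not set; skipping")
        return
    hits = await apis.comptox_search("bisphenol A")
    first = (hits or [{}])[0] if isinstance(hits, list) else hits or {}
    dtxsid = first.get("dtxsid")  # assumed field name in search hits
    if dtxsid:
        print(await apis.comptox_get_hazard(dtxsid))


asyncio.run(main())
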
# =============================================================================
# MassBank EU — Mass spectrometry reference spectra (no auth)
# =============================================================================


async def massbank_search(
    compound_name: str | None = None,
    formula: str | None = None,
    inchikey: str | None = None,
    exact_mass_min: float | None = None,
    exact_mass_max: float | None = None,
    instrument_type: str | None = None,
    limit: int = 20,
) -> list | None:
    """Search MassBank for reference mass spectra."""
    params: dict[str, Any] = {"limit": limit}
    if compound_name:
        params["compound_name"] = compound_name
    if formula:
        params["formula"] = formula
    if inchikey:
        params["inchi_key"] = inchikey
    if exact_mass_min is not None:
        params["exact_mass_from"] = exact_mass_min
    if exact_mass_max is not None:
        params["exact_mass_to"] = exact_mass_max
    if instrument_type:
        params["instrument_type"] = instrument_type
    try:
        async with _http() as client:
            resp = await client.get(f"{MASSBANK_BASE}/records", params=params)
            if resp.status_code == 200:
                data = resp.json()
                return data if isinstance(data, list) else data.get("data", [])
    except Exception as e:
        logger.warning(f"MassBank search failed: {e}")
    return None


async def massbank_get_record(accession: str) -> dict | None:
    """Get a specific MassBank spectrum record."""
    return await _get(f"{MASSBANK_BASE}/records/{accession}")

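An exact-mass window sketch — editor's illustration, not in the wheel; caffeine's monoisotopic mass is about 194.0804 Da, and the `accession` field on records is an assumption:

# --- usage sketch: editor's illustration, not part of the package ---
import asyncio

from labmate_mcp import apis


async def main() -> None:
    # ±0.01 Da window around caffeine's monoisotopic mass (~194.0804)
    records = await apis.massbank_search(
        exact_mass_min=194.07, exact_mass_max=194.09, limit=5
    ) or []
    for rec in records:
        print(rec.get("accession") if isinstance(rec, dict) else rec)


asyncio.run(main())
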


# =============================================================================
# BindingDB — Protein-ligand binding affinities (no auth)
# =============================================================================


async def bindingdb_by_target(
    uniprot_id: str,
    cutoff_nm: int = 10000,
) -> list[dict] | None:
    """Get ligands for a protein target by UniProt ID.

    cutoff_nm: binding affinity cutoff in nM (default 10 µM)
    Returns list of dicts with compound SMILES, Ki, IC50, Kd, EC50.
    """
    try:
        async with _http() as client:
            resp = await client.get(
                f"{BINDINGDB_BASE}/getLigandsByUniprots",
                params={
                    "uniprot": uniprot_id,
                    "cutoff": cutoff_nm,
                    "response": "application/json",
                },
                timeout=60,
            )
            if resp.status_code == 200:
                # BindingDB may return JSON or TSV depending on version
                try:
                    return resp.json()
                except Exception:
                    # Parse TSV fallback
                    return _parse_bindingdb_tsv(resp.text)
    except Exception as e:
        logger.warning(f"BindingDB target search failed: {e}")
    return None


async def bindingdb_by_smiles(
    smiles: str,
    cutoff: float = 0.8,
) -> list[dict] | None:
    """Find similar compounds in BindingDB by SMILES.

    cutoff: Tanimoto similarity threshold (0-1, default 0.8)
    """
    try:
        async with _http() as client:
            resp = await client.get(
                f"{BINDINGDB_BASE}/getTargetByCompound",
                params={
                    "smiles": smiles,
                    "cutoff": cutoff,
                    "response": "application/json",
                },
                timeout=60,
            )
            if resp.status_code == 200:
                try:
                    return resp.json()
                except Exception:
                    return _parse_bindingdb_tsv(resp.text)
    except Exception as e:
        logger.warning(f"BindingDB SMILES search failed: {e}")
    return None


def _parse_bindingdb_tsv(text: str) -> list[dict]:
    """Parse BindingDB tab-separated response into list of dicts."""
    lines = text.strip().split("\n")
    if len(lines) < 2:
        return []
    headers = lines[0].split("\t")
    results = []
    for line in lines[1:]:
        vals = line.split("\t")
        row = {}
        for i, h in enumerate(headers):
            if i < len(vals):
                row[h.strip()] = vals[i].strip()
        results.append(row)
    return results
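

# Worked example (illustrative) of the TSV fallback: a header row plus one
# data row become a single dict.
#
#     _parse_bindingdb_tsv("SMILES\tKi (nM)\nCCO\t42")
#     -> [{"SMILES": "CCO", "Ki (nM)": "42"}]
#
# Usage sketch (not part of the original module): pull sub-micromolar ligands
# for a target; P00533 (EGFR) is used purely as a familiar UniProt example.
async def _example_bindingdb_ligands(uniprot_id: str = "P00533") -> list[dict]:
    return await bindingdb_by_target(uniprot_id, cutoff_nm=1000) or []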


# =============================================================================
# Crossref BibTeX (content negotiation — no extra API)
# =============================================================================


async def crossref_get_bibtex(doi: str) -> str | None:
    """Get BibTeX entry for a DOI via Crossref content negotiation."""
    doi = doi.strip().removeprefix("https://doi.org/").removeprefix("http://doi.org/")
    try:
        async with _http() as client:
            resp = await client.get(
                f"https://doi.org/{doi}",
                headers={"Accept": "application/x-bibtex"},
                follow_redirects=True,
            )
            if resp.status_code == 200 and "@" in resp.text:
                return resp.text.strip()
    except Exception as e:
        logger.warning(f"BibTeX fetch failed for {doi}: {e}")
    return None
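

# Content negotiation means the citation format is selected purely by the
# Accept header on the plain doi.org URL; the request above is equivalent to
# (DOI shown is a placeholder):
#
#     GET https://doi.org/10.xxxx/example
#     Accept: application/x-bibtex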


async def crossref_get_bibtex_batch(dois: list[str]) -> list[tuple[str, str | None]]:
    """Get BibTeX entries for multiple DOIs. Returns list of (doi, bibtex)."""
    results = []
    for doi in dois:
        bib = await crossref_get_bibtex(doi)
        results.append((doi, bib))
    return results
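

# Usage sketch (illustrative, not part of the original module): assemble a
# .bib file body from whatever entries resolved, silently skipping failures.
async def _example_bib_export(dois: list[str]) -> str:
    entries = await crossref_get_bibtex_batch(dois)
    return "\n\n".join(bib for _, bib in entries if bib)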


# =============================================================================
# RCSB PDB — Protein Data Bank (no auth)
# =============================================================================


async def pdb_search(
    query: str,
    search_type: str = "full_text",
    limit: int = 10,
) -> dict | None:
    """Search RCSB PDB for protein/nucleic acid structures.

    search_type: 'full_text', 'structure_title', 'structure_author'
    """
    service_map = {
        "full_text": "full_text",
        "structure_title": "text",
        "structure_author": "text",
    }
    service = service_map.get(search_type, "full_text")

    json_body: dict[str, Any] = {
        "query": {
            "type": "terminal",
            "service": service,
            "parameters": {"value": query},
        },
        "return_type": "entry",
        "request_options": {
            "results_content_type": ["experimental"],
            "paginate": {"start": 0, "rows": limit},
            "sort": [{"sort_by": "score", "direction": "desc"}],
        },
    }

    # For author/title, use the text service with a specific attribute
    if search_type == "structure_title":
        json_body["query"]["parameters"] = {
            "attribute": "struct.title",
            "operator": "contains_phrase",
            "value": query,
        }
    elif search_type == "structure_author":
        json_body["query"]["parameters"] = {
            "attribute": "rcsb_primary_citation.rcsb_authors",
            "operator": "contains_phrase",
            "value": query,
        }

    try:
        async with _http() as client:
            resp = await client.post(
                PDB_SEARCH_BASE,
                json=json_body,
                timeout=30,
            )
            if resp.status_code == 200:
                return resp.json()
    except Exception as e:
        logger.warning(f"PDB search failed: {e}")
    return None
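

# For reference (sketch): pdb_search("insulin", "structure_title") posts a
# terminal text query of this shape to the RCSB Search API:
#
#     {
#       "query": {
#         "type": "terminal",
#         "service": "text",
#         "parameters": {
#           "attribute": "struct.title",
#           "operator": "contains_phrase",
#           "value": "insulin"
#         }
#       },
#       "return_type": "entry",
#       "request_options": {...}
#     }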


async def pdb_get_entry(pdb_id: str) -> dict | None:
    """Get full entry details from RCSB PDB."""
    pdb_id = pdb_id.strip().upper()
    return await _get(f"{PDB_DATA_BASE}/entry/{pdb_id}")


async def pdb_get_entity(pdb_id: str, entity_id: int = 1) -> dict | None:
    """Get polymer entity details (protein/nucleic acid chain)."""
    pdb_id = pdb_id.strip().upper()
    return await _get(f"{PDB_DATA_BASE}/polymer_entity/{pdb_id}/{entity_id}")


async def pdb_get_ligands(pdb_id: str) -> list[dict]:
    """Get all non-polymer (ligand) entities in a PDB structure."""
    pdb_id = pdb_id.strip().upper()
    ligands = []
    # PDB structures can have multiple non-polymer entities
    for entity_id in range(1, 20):  # usually < 10
        data = await _get(
            f"{PDB_DATA_BASE}/nonpolymer_entity/{pdb_id}/{entity_id}"
        )
        if data:
            ligands.append(data)
        else:
            break
    return ligands
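

# Usage sketch (illustrative, not part of the original module): fetch entry
# metadata and bound ligands together. "4HHB" (human deoxyhaemoglobin) is
# used only as a well-known example ID.
async def _example_pdb_overview(pdb_id: str = "4HHB") -> dict:
    entry = await pdb_get_entry(pdb_id)
    ligands = await pdb_get_ligands(pdb_id)
    return {"entry": entry, "ligand_count": len(ligands)}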


# =============================================================================
# PubChem GHS Hazard Data (extends existing PubChem client)
# =============================================================================


async def pubchem_get_ghs(cid: int) -> dict | None:
    """Get GHS Classification data (hazard pictograms, H/P statements) for a compound.

    Uses the PubChem PUG-View API to get the GHS section.
    """
    try:
        async with _http() as client:
            resp = await client.get(
                f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON",
                params={"heading": "GHS Classification"},
                timeout=20,
            )
            if resp.status_code == 200:
                return resp.json()
    except Exception as e:
        logger.warning(f"PubChem GHS fetch failed for CID {cid}: {e}")
    return None


def parse_ghs_data(pug_view_data: dict) -> dict:
    """Parse PubChem PUG-View GHS response into structured hazard data."""
    result: dict[str, Any] = {
        "pictograms": [],
        "signal_word": "",
        "hazard_statements": [],
        "precautionary_statements": [],
    }

    if not pug_view_data:
        return result

    # Navigate the nested PUG-View structure
    record = pug_view_data.get("Record", {})
    sections = record.get("Section", [])

    for section in sections:
        for subsec in section.get("Section", []):
            heading = subsec.get("TOCHeading", "")

            for info in subsec.get("Information", []):
                val = info.get("Value", {})

                if "Pictogram" in heading or "Pictogram" in info.get("Name", ""):
                    # Extract pictogram names
                    for sv in val.get("StringWithMarkup", []):
                        text = sv.get("String", "")
                        if text:
                            result["pictograms"].append(text)
                        # Also check for markup references
                        for mu in sv.get("Markup", []):
                            extra = mu.get("Extra", "")
                            if extra:
                                result["pictograms"].append(extra)

                elif "Signal" in heading or "Signal" in info.get("Name", ""):
                    for sv in val.get("StringWithMarkup", []):
                        text = sv.get("String", "")
                        if text and text.lower() in ("danger", "warning"):
                            result["signal_word"] = text

                elif "Hazard Statement" in heading or "H Statement" in info.get("Name", ""):
                    for sv in val.get("StringWithMarkup", []):
                        text = sv.get("String", "")
                        if text:
                            result["hazard_statements"].append(text)

                elif "Precautionary" in heading or "P Statement" in info.get("Name", ""):
                    for sv in val.get("StringWithMarkup", []):
                        text = sv.get("String", "")
                        if text:
                            result["precautionary_statements"].append(text)

    # Deduplicate
    result["pictograms"] = list(dict.fromkeys(result["pictograms"]))
    result["hazard_statements"] = list(dict.fromkeys(result["hazard_statements"]))
    result["precautionary_statements"] = list(dict.fromkeys(result["precautionary_statements"]))

    return result
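

# Usage sketch (illustrative, not part of the original module): fetch and
# parse in one step. CID 702 is PubChem's record for ethanol; it is used
# here only as a familiar example.
async def _example_ghs_summary(cid: int = 702) -> dict:
    raw = await pubchem_get_ghs(cid)
    return parse_ghs_data(raw or {})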


# =============================================================================
# GNPS NPClassifier — Natural product classification (no auth)
# =============================================================================


async def gnps_classify_compound(smiles: str) -> dict | None:
    """Classify a compound into natural product classes using GNPS NPClassifier.

    Returns pathway, superclass, class, and isglycoside prediction.
    """
    try:
        async with _http() as client:
            resp = await client.get(
                f"{NPCLASSIFIER_BASE}/classify",
                params={"smiles": smiles},
                timeout=30,
            )
            if resp.status_code == 200:
                return resp.json()
    except Exception as e:
        logger.warning(f"NPClassifier failed: {e}")
    return None
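

# Usage sketch (illustrative, not part of the original module): classify a
# small molecule by SMILES. The string below is caffeine.
async def _example_np_classify() -> dict | None:
    return await gnps_classify_compound("CN1C=NC2=C1C(=O)N(C)C(=O)N2C")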


# =============================================================================
# OpenAlex Sources — Journal metrics (extends existing OpenAlex client)
# =============================================================================


async def openalex_get_source(source_id: str) -> dict | None:
    """Get journal/source details from OpenAlex.

    source_id: OpenAlex source ID (e.g., 'S137773608') or ISSN
    """
    headers = _oa_headers()
    # Try direct ID lookup
    if source_id.startswith("S") or source_id.startswith("https://"):
        return await _get(
            f"{OPENALEX_BASE}/sources/{source_id}",
            headers=headers,
        )
    # Try ISSN lookup
    return await _get(
        f"{OPENALEX_BASE}/sources/issn:{source_id}",
        headers=headers,
    )


async def openalex_search_sources(
    query: str,
    limit: int = 10,
) -> dict | None:
    """Search for journals/sources by name in OpenAlex."""
    return await _get(
        f"{OPENALEX_BASE}/sources",
        params={"search": query, "per_page": limit},
        headers=_oa_headers(),
    )
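

# Usage sketch (illustrative, not part of the original module): resolve a
# journal name to its full OpenAlex source record. Assumes the standard
# OpenAlex list-response shape, where matches sit under "results" and each
# carries an "id" URL accepted by openalex_get_source above.
async def _example_journal_lookup(name: str) -> dict | None:
    hits = await openalex_search_sources(name, limit=1)
    results = (hits or {}).get("results", [])
    if results:
        return await openalex_get_source(results[0]["id"])
    return None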