labmate-mcp 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
labmate_mcp/apis.py ADDED
@@ -0,0 +1,1744 @@
1
+ """
2
+ API clients for scholarly-mcp.
3
+
4
+ All functions are async, return parsed dicts/lists or None on failure.
5
+ Each API module is self-contained with its own headers and error handling.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import asyncio
11
+ import contextlib
12
+ import logging
13
+ import os
14
+ import re
15
+ from typing import Any
16
+
17
+ import httpx
18
+
19
+ logger = logging.getLogger("scholarly-mcp")
20
+
21
+ # =============================================================================
22
+ # Configuration (from environment variables)
23
+ # =============================================================================
24
+
25
+ VERSION = "7.0.0"
26
+ USER_AGENT = (
27
+ f"scholarly-mcp/{VERSION} "
28
+ "(https://github.com/JonasRackl/chemrxiv-mcp; "
29
+ "mailto:scholarly-mcp@users.noreply.github.com)"
30
+ )
31
+ TIMEOUT = 30
32
+
33
+ # Optional credentials — features activate when set
34
+ S2_API_KEY: str | None = os.environ.get("S2_API_KEY")
35
+ OPENALEX_EMAIL: str | None = os.environ.get("OPENALEX_EMAIL")
36
+ UNPAYWALL_EMAIL: str = os.environ.get(
37
+ "UNPAYWALL_EMAIL",
38
+ os.environ.get("OPENALEX_EMAIL", "scholarly-mcp@users.noreply.github.com"),
39
+ )
40
+ WOS_API_KEY: str | None = os.environ.get("WOS_API_KEY")
41
+ MP_API_KEY: str | None = os.environ.get("MP_API_KEY")
42
+ RXN_API_KEY: str | None = os.environ.get("RXN_API_KEY")
43
+ COMPTOX_API_KEY: str | None = os.environ.get("COMPTOX_API_KEY")
44
+
45
+ # Base URLs
46
+ CROSSREF_BASE = "https://api.crossref.org"
47
+ OPENALEX_BASE = "https://api.openalex.org"
48
+ S2_BASE = "https://api.semanticscholar.org"
49
+ UNPAYWALL_BASE = "https://api.unpaywall.org/v2"
50
+ PUBCHEM_BASE = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
51
+ CAS_BASE = "https://commonchemistry.cas.org/api"
52
+ WOS_BASE = "https://wos-api.clarivate.com/api/wos"
53
+ NIST_BASE = "https://webbook.nist.gov/cgi/cbook.cgi"
54
+ MP_BASE = "https://api.materialsproject.org"
55
+ RXN_BASE = "https://rxn.res.ibm.com/rxn/api/api/v1"
56
+ UNICHEM_BASE = "https://www.ebi.ac.uk/unichem/api/v1"
57
+ COD_BASE = "https://www.crystallography.net/cod"
58
+ COMPTOX_BASE = "https://api-ccte.epa.gov"
59
+ MASSBANK_BASE = "https://massbank.eu/MassBank/api"
60
+ BINDINGDB_BASE = "https://bindingdb.org/axis2/services/BDBService"
61
+ PDB_DATA_BASE = "https://data.rcsb.org/rest/v1/core"
62
+ PDB_SEARCH_BASE = "https://search.rcsb.org/rcsbsearch/v2/query"
63
+ NPCLASSIFIER_BASE = "https://npclassifier.gnps2.org"
64
+
65
+ # Semantic Scholar field sets
66
+ S2_SEARCH_FIELDS = (
67
+ "paperId,externalIds,title,abstract,year,venue,citationCount,"
68
+ "influentialCitationCount,isOpenAccess,openAccessPdf,tldr,authors"
69
+ )
70
+ S2_DETAIL_FIELDS = (
71
+ "paperId,externalIds,url,title,abstract,venue,publicationVenue,year,"
72
+ "referenceCount,citationCount,influentialCitationCount,isOpenAccess,"
73
+ "openAccessPdf,fieldsOfStudy,s2FieldsOfStudy,publicationTypes,"
74
+ "publicationDate,journal,authors,tldr"
75
+ )
76
+ S2_CITATION_FIELDS = (
77
+ "paperId,title,year,citationCount,authors,intents,isInfluential,contexts"
78
+ )
79
+ S2_AUTHOR_FIELDS = (
80
+ "authorId,name,affiliations,paperCount,citationCount,hIndex"
81
+ )
82
+ S2_AUTHOR_DETAIL_FIELDS = (
83
+ "authorId,name,affiliations,paperCount,citationCount,hIndex,"
84
+ "papers,papers.paperId,papers.title,papers.year,papers.citationCount,"
85
+ "papers.venue,papers.externalIds"
86
+ )
87
+
88
+ # ChemRxiv DOI prefix (for ChemRxiv-specific searches via Crossref)
89
+ CHEMRXIV_DOI_PREFIX = "10.26434"
90
+
91
+ # ChemRxiv subject categories (hardcoded from Atypon platform facets)
92
+ CHEMRXIV_CATEGORIES: dict[int, str] = {
93
+ 502556: "Analytical Chemistry",
94
+ 502557: "Biological and Medicinal Chemistry",
95
+ 502558: "Catalysis",
96
+ 502559: "Chemical Biology",
97
+ 502560: "Chemical Engineering and Industrial Chemistry",
98
+ 502561: "Earth, Space, and Environmental Chemistry",
99
+ 502562: "Education",
100
+ 502563: "Inorganic Chemistry",
101
+ 502564: "Materials Chemistry",
102
+ 502565: "Materials Science",
103
+ 502566: "Nanoscience",
104
+ 502567: "Organic Chemistry",
105
+ 502568: "Organometallic Chemistry",
106
+ 502569: "Physical Chemistry",
107
+ 502570: "Polymer Chemistry",
108
+ 502571: "Supramolecular Chemistry",
109
+ 502572: "Theoretical and Computational Chemistry",
110
+ 502573: "Other",
111
+ }
112
+
113
+
114
+ # =============================================================================
115
+ # Shared HTTP helpers
116
+ # =============================================================================
117
+
118
+
119
+ @contextlib.asynccontextmanager
120
+ async def _http(**kwargs):
121
+ """Shared async HTTP client context manager."""
122
+ defaults = {
123
+ "timeout": TIMEOUT,
124
+ "follow_redirects": True,
125
+ "headers": {"User-Agent": USER_AGENT, "Accept": "application/json"},
126
+ }
127
+ defaults.update(kwargs)
128
+ async with httpx.AsyncClient(**defaults) as client:
129
+ yield client
130
+
131
+
132
+ async def _get(
133
+ url: str,
134
+ params: dict | None = None,
135
+ headers: dict | None = None,
136
+ ) -> dict | None:
137
+ """HTTP GET returning parsed JSON or None on failure."""
138
+ try:
139
+ async with _http() as client:
140
+ resp = await client.get(url, params=params, headers=headers or {})
141
+ resp.raise_for_status()
142
+ return resp.json()
143
+ except httpx.HTTPStatusError as e:
144
+ logger.warning(f"HTTP {e.response.status_code} for GET {url}")
145
+ return None
146
+ except Exception as e:
147
+ logger.warning(f"GET {url} failed: {e}")
148
+ return None
149
+
150
+
151
+ async def _post(
152
+ url: str,
153
+ json_data: dict | None = None,
154
+ params: dict | None = None,
155
+ headers: dict | None = None,
156
+ ) -> dict | None:
157
+ """HTTP POST returning parsed JSON or None on failure."""
158
+ try:
159
+ async with _http() as client:
160
+ resp = await client.post(
161
+ url, json=json_data, params=params, headers=headers or {}
162
+ )
163
+ resp.raise_for_status()
164
+ return resp.json()
165
+ except httpx.HTTPStatusError as e:
166
+ logger.warning(f"HTTP {e.response.status_code} for POST {url}")
167
+ return None
168
+ except Exception as e:
169
+ logger.warning(f"POST {url} failed: {e}")
170
+ return None
171
+
172
+
173
+ # =============================================================================
174
+ # Crossref API
175
+ # =============================================================================
176
+
177
+
178
+ async def crossref_search(
179
+ query: str,
180
+ rows: int = 10,
181
+ offset: int = 0,
182
+ sort: str = "relevance",
183
+ filters: dict[str, str] | None = None,
184
+ ) -> dict | None:
185
+ """Search Crossref works. Returns full API response dict."""
186
+ params: dict[str, Any] = {"query": query, "rows": rows, "offset": offset}
187
+ if sort == "date":
188
+ params["sort"] = "deposited"
189
+ params["order"] = "desc"
190
+ if filters:
191
+ params["filter"] = ",".join(f"{k}:{v}" for k, v in filters.items())
192
+     if OPENALEX_EMAIL:
193
+         params["mailto"] = OPENALEX_EMAIL  # Crossref polite pool (query param)
194
+     return await _get(f"{CROSSREF_BASE}/works", params=params)
196
+
197
+
198
+ async def crossref_search_chemrxiv(
199
+ query: str = "",
200
+ rows: int = 10,
201
+ offset: int = 0,
202
+ sort: str = "relevance",
203
+ date_from: str | None = None,
204
+ date_to: str | None = None,
205
+ ) -> dict | None:
206
+ """Search specifically within ChemRxiv preprints via Crossref DOI prefix."""
207
+ filters: dict[str, str] = {"prefix": CHEMRXIV_DOI_PREFIX}
208
+ if date_from:
209
+ filters["from-posted-date"] = date_from
210
+ if date_to:
211
+ filters["until-posted-date"] = date_to
212
+ return await crossref_search(
213
+ query=query, rows=rows, offset=offset, sort=sort, filters=filters
214
+ )
215
+
216
+
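+ # Usage sketch (illustrative; the _example_* helpers in this file are not
+ # part of the published module): fetch recent ChemRxiv preprints on a topic
+ # and collect their DOIs from the standard Crossref message envelope.
+ async def _example_chemrxiv_recent(topic: str) -> list[str]:
+     data = await crossref_search_chemrxiv(
+         query=topic, rows=5, sort="date", date_from="2024-01-01"
+     )
+     if not data:
+         return []
+     items = data.get("message", {}).get("items", [])
+     return [item.get("DOI", "") for item in items]
+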
217
+ async def crossref_get_work(doi: str) -> dict | None:
218
+ """Get a single work by DOI. Returns the message object."""
219
+     params: dict[str, str] = {}
220
+     if OPENALEX_EMAIL:
221
+         params["mailto"] = OPENALEX_EMAIL  # Crossref polite pool (query param)
222
+     data = await _get(f"{CROSSREF_BASE}/works/{doi}", params=params)
223
+ if data:
224
+ return data.get("message")
225
+ return None
226
+
227
+
228
+ # =============================================================================
229
+ # OpenAlex API
230
+ # =============================================================================
231
+
232
+
233
+ def _oa_params(**extra) -> dict:
234
+ """Build OpenAlex params with polite pool email."""
235
+ params = dict(extra)
236
+ if OPENALEX_EMAIL:
237
+ params["mailto"] = OPENALEX_EMAIL
238
+ return params
239
+
240
+
241
+ async def openalex_search(
242
+ query: str,
243
+ filters: str | None = None,
244
+ per_page: int = 10,
245
+ page: int = 1,
246
+ sort: str | None = None,
247
+ ) -> dict | None:
248
+ """Search OpenAlex works. Supports advanced filters and sorting."""
249
+ params = _oa_params(search=query, per_page=per_page, page=page)
250
+ if filters:
251
+ params["filter"] = filters
252
+ if sort:
253
+ params["sort"] = sort
254
+ return await _get(f"{OPENALEX_BASE}/works", params=params)
255
+
256
+
257
+ async def openalex_get_work(identifier: str) -> dict | None:
258
+ """Get work by DOI or OpenAlex ID (W-prefixed)."""
259
+ if identifier.startswith("W") or identifier.startswith("https://"):
260
+ url = f"{OPENALEX_BASE}/works/{identifier}"
261
+ else:
262
+ url = f"{OPENALEX_BASE}/works/doi:{identifier}"
263
+ return await _get(url, params=_oa_params())
264
+
265
+
266
+ async def openalex_get_author(identifier: str) -> dict | None:
267
+ """Get author by OpenAlex ID or search by name."""
268
+ if identifier.startswith("A") or identifier.startswith("https://"):
269
+ return await _get(
270
+ f"{OPENALEX_BASE}/authors/{identifier}", params=_oa_params()
271
+ )
272
+ # Name search — return first result
273
+ data = await _get(
274
+ f"{OPENALEX_BASE}/authors",
275
+ params=_oa_params(search=identifier, per_page=1),
276
+ )
277
+ if data and data.get("results"):
278
+ return data["results"][0]
279
+ return data
280
+
281
+
282
+ async def openalex_search_authors(
283
+ query: str, per_page: int = 10
284
+ ) -> dict | None:
285
+ """Search authors by name."""
286
+ return await _get(
287
+ f"{OPENALEX_BASE}/authors",
288
+ params=_oa_params(search=query, per_page=per_page),
289
+ )
290
+
291
+
292
+ async def openalex_group_by(
293
+ group_by: str,
294
+ filters: str | None = None,
295
+ search: str | None = None,
296
+ per_page: int = 200,
297
+ ) -> dict | None:
298
+ """Aggregate works by a field for bibliometric analysis.
299
+
300
+ group_by options: publication_year, authorships.author.id,
301
+ primary_location.source.id, topics.id, open_access.oa_status, etc.
302
+ """
303
+ params = _oa_params(group_by=group_by, per_page=per_page)
304
+ if filters:
305
+ params["filter"] = filters
306
+ if search:
307
+ params["search"] = search
308
+ return await _get(f"{OPENALEX_BASE}/works", params=params)
309
+
310
+
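+ # Usage sketch (illustrative; helper name is hypothetical): count works per
+ # publication year for a search term. OpenAlex returns aggregates under
+ # "group_by" as {"key", "key_display_name", "count"} entries.
+ async def _example_works_per_year(term: str) -> dict[str, int]:
+     data = await openalex_group_by(group_by="publication_year", search=term)
+     if not data:
+         return {}
+     return {g["key"]: g["count"] for g in data.get("group_by", [])}
+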
311
+ async def openalex_get_topic(identifier: str) -> dict | None:
312
+ """Get topic by OpenAlex ID or search by name."""
313
+ if identifier.startswith("T") or identifier.startswith("https://"):
314
+ return await _get(
315
+ f"{OPENALEX_BASE}/topics/{identifier}", params=_oa_params()
316
+ )
317
+ data = await _get(
318
+ f"{OPENALEX_BASE}/topics",
319
+ params=_oa_params(search=identifier, per_page=1),
320
+ )
321
+ if data and data.get("results"):
322
+ return data["results"][0]
323
+ return data
324
+
325
+
326
+ async def openalex_get_works(
327
+ filters: str,
328
+ per_page: int = 10,
329
+ page: int = 1,
330
+ sort: str | None = None,
331
+ ) -> dict | None:
332
+ """Get works by filter (no search term). For citation network queries."""
333
+ params = _oa_params(filter=filters, per_page=per_page, page=page)
334
+ if sort:
335
+ params["sort"] = sort
336
+ return await _get(f"{OPENALEX_BASE}/works", params=params)
337
+
338
+
339
+ def openalex_reconstruct_abstract(inverted_index: dict) -> str:
340
+ """Reconstruct abstract text from OpenAlex inverted index."""
341
+ if not inverted_index:
342
+ return ""
343
+ words: dict[int, str] = {}
344
+ for word, positions in inverted_index.items():
345
+ for pos in positions:
346
+ words[pos] = word
347
+ return " ".join(words[i] for i in sorted(words)) if words else ""
348
+
349
+
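+ # Worked example (illustrative): OpenAlex stores abstracts as an inverted
+ # index of {word: [positions]}; sorting positions restores the prose, e.g.
+ #   openalex_reconstruct_abstract({"Deep": [0], "learning": [1], "wins": [2]})
+ #   -> "Deep learning wins"
+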
350
+ # =============================================================================
351
+ # Semantic Scholar API
352
+ # =============================================================================
353
+
354
+
355
+ def _s2_headers() -> dict:
356
+ """Semantic Scholar headers with optional API key."""
357
+ h: dict[str, str] = {}
358
+ if S2_API_KEY:
359
+ h["x-api-key"] = S2_API_KEY
360
+ return h
361
+
362
+
363
+ async def s2_search(
364
+ query: str,
365
+ limit: int = 10,
366
+ offset: int = 0,
367
+ fields: str = S2_SEARCH_FIELDS,
368
+ fields_of_study: str | None = None,
369
+ year: str | None = None,
370
+ publication_types: str | None = None,
371
+ open_access_pdf: bool | None = None,
372
+ ) -> dict | None:
373
+ """Search Semantic Scholar papers.
374
+
375
+ Supports boolean queries, exact phrases ("..."), and filters.
376
+ year format: "2020" or "2020-2025" or "2020-"
377
+ """
378
+ params: dict[str, Any] = {
379
+ "query": query,
380
+ "fields": fields,
381
+ "limit": limit,
382
+ "offset": offset,
383
+ }
384
+ if fields_of_study:
385
+ params["fieldsOfStudy"] = fields_of_study
386
+ if year:
387
+ params["year"] = year
388
+ if publication_types:
389
+ params["publicationTypes"] = publication_types
390
+ if open_access_pdf:
391
+ params["openAccessPdf"] = ""
392
+ return await _get(
393
+ f"{S2_BASE}/graph/v1/paper/search",
394
+ params=params,
395
+ headers=_s2_headers(),
396
+ )
397
+
398
+
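+ # Usage sketch (illustrative; helper name is hypothetical): a filtered
+ # search combining an exact phrase, an open-ended year range, and the
+ # open-access flag documented above. Results sit under "data".
+ async def _example_s2_filtered(phrase: str) -> list[dict]:
+     data = await s2_search(
+         f'"{phrase}"',
+         limit=5,
+         year="2020-",
+         publication_types="JournalArticle",
+         open_access_pdf=True,
+     )
+     return (data or {}).get("data", [])
+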
399
+ async def s2_get_paper(
400
+ paper_id: str, fields: str = S2_DETAIL_FIELDS
401
+ ) -> dict | None:
402
+ """Get paper by Semantic Scholar ID or external ID.
403
+
404
+ paper_id formats:
405
+ - S2 paper ID (40-char hex)
406
+ - DOI:10.1234/xxx
407
+ - ARXIV:2101.12345
408
+ - PMID:12345678
409
+ - CorpusId:12345678
410
+ """
411
+ return await _get(
412
+ f"{S2_BASE}/graph/v1/paper/{paper_id}",
413
+ params={"fields": fields},
414
+ headers=_s2_headers(),
415
+ )
416
+
417
+
418
+ async def s2_get_citations(
419
+ paper_id: str,
420
+ limit: int = 50,
421
+ offset: int = 0,
422
+ fields: str = S2_CITATION_FIELDS,
423
+ ) -> dict | None:
424
+ """Get papers that cite the given paper.
425
+
426
+ Includes isInfluential flag and citation intents.
427
+ """
428
+ return await _get(
429
+ f"{S2_BASE}/graph/v1/paper/{paper_id}/citations",
430
+ params={"fields": fields, "limit": limit, "offset": offset},
431
+ headers=_s2_headers(),
432
+ )
433
+
434
+
435
+ async def s2_get_references(
436
+ paper_id: str,
437
+ limit: int = 50,
438
+ offset: int = 0,
439
+ fields: str = S2_CITATION_FIELDS,
440
+ ) -> dict | None:
441
+ """Get papers referenced by the given paper."""
442
+ return await _get(
443
+ f"{S2_BASE}/graph/v1/paper/{paper_id}/references",
444
+ params={"fields": fields, "limit": limit, "offset": offset},
445
+ headers=_s2_headers(),
446
+ )
447
+
448
+
449
+ async def s2_get_recommendations(
450
+ positive_ids: list[str],
451
+ negative_ids: list[str] | None = None,
452
+ limit: int = 10,
453
+ fields: str = S2_SEARCH_FIELDS,
454
+ ) -> dict | None:
455
+ """Get paper recommendations based on positive/negative examples.
456
+
457
+ IDs can be S2 paper IDs, DOI:xxx, ARXIV:xxx, etc.
458
+ For single-paper similarity, pass one positive ID.
459
+ """
460
+ body: dict[str, Any] = {"positivePaperIds": positive_ids}
461
+ if negative_ids:
462
+ body["negativePaperIds"] = negative_ids
463
+ return await _post(
464
+ f"{S2_BASE}/recommendations/v1/papers/",
465
+ json_data=body,
466
+ params={"fields": fields, "limit": limit},
467
+ headers=_s2_headers(),
468
+ )
469
+
470
+
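+ # Usage sketch (illustrative; helper name is hypothetical): similar-paper
+ # lookup seeded by a single DOI. The recommendations endpoint returns its
+ # hits under "recommendedPapers".
+ async def _example_similar_papers(doi: str) -> list[dict]:
+     data = await s2_get_recommendations([f"DOI:{doi}"], limit=5)
+     return (data or {}).get("recommendedPapers", [])
+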
471
+ async def s2_search_author(
472
+ query: str, limit: int = 5, fields: str = S2_AUTHOR_FIELDS
473
+ ) -> dict | None:
474
+ """Search for authors by name."""
475
+ return await _get(
476
+ f"{S2_BASE}/graph/v1/author/search",
477
+ params={"query": query, "limit": limit, "fields": fields},
478
+ headers=_s2_headers(),
479
+ )
480
+
481
+
482
+ async def s2_get_author(
483
+ author_id: str, fields: str = S2_AUTHOR_DETAIL_FIELDS
484
+ ) -> dict | None:
485
+ """Get author details including recent papers."""
486
+ return await _get(
487
+ f"{S2_BASE}/graph/v1/author/{author_id}",
488
+ params={"fields": fields},
489
+ headers=_s2_headers(),
490
+ )
491
+
492
+
493
+ # =============================================================================
494
+ # Unpaywall API
495
+ # =============================================================================
496
+
497
+
498
+ async def unpaywall_get(doi: str) -> dict | None:
499
+ """Find open access PDF location for a DOI."""
500
+ email = UNPAYWALL_EMAIL or "scholarly-mcp@users.noreply.github.com"
501
+ return await _get(f"{UNPAYWALL_BASE}/{doi}", params={"email": email})
502
+
503
+
504
+ # =============================================================================
505
+ # PubChem API
506
+ # =============================================================================
507
+
508
+
509
+ async def pubchem_search_by_name(name: str) -> dict | None:
510
+ """Search PubChem compounds by name."""
511
+ return await _get(f"{PUBCHEM_BASE}/compound/name/{name}/JSON")
512
+
513
+
514
+ async def pubchem_search_by_smiles(smiles: str) -> dict | None:
515
+ """Search PubChem compounds by SMILES string."""
516
+ # Use POST for SMILES to handle special characters
517
+ try:
518
+ async with _http() as client:
519
+ resp = await client.post(
520
+ f"{PUBCHEM_BASE}/compound/smiles/JSON",
521
+ data={"smiles": smiles},
522
+ )
523
+ resp.raise_for_status()
524
+ return resp.json()
525
+ except Exception as e:
526
+ logger.warning(f"PubChem SMILES search failed: {e}")
527
+ return None
528
+
529
+
530
+ async def pubchem_search_by_formula(formula: str) -> dict | None:
531
+ """Search PubChem compounds by molecular formula."""
532
+ return await _get(
533
+ f"{PUBCHEM_BASE}/compound/fastformula/{formula}/JSON",
534
+ params={"MaxRecords": 10},
535
+ )
536
+
537
+
538
+ async def pubchem_get_compound(cid: int | str) -> dict | None:
539
+ """Get compound record by PubChem CID."""
540
+ return await _get(f"{PUBCHEM_BASE}/compound/cid/{cid}/JSON")
541
+
542
+
543
+ async def pubchem_get_properties(
544
+ cid: int | str,
545
+ properties: str | None = None,
546
+ ) -> dict | None:
547
+ """Get computed molecular properties for a compound.
548
+
549
+ Default properties cover Lipinski rule-of-5 and common descriptors.
550
+ """
551
+ if properties is None:
552
+ properties = (
553
+ "MolecularFormula,MolecularWeight,CanonicalSMILES,"
554
+ "IsomericSMILES,InChI,InChIKey,IUPACName,XLogP,"
555
+ "ExactMass,MonoisotopicMass,TPSA,Complexity,"
556
+ "HBondDonorCount,HBondAcceptorCount,RotatableBondCount,"
557
+ "HeavyAtomCount,CovalentUnitCount"
558
+ )
559
+ return await _get(
560
+ f"{PUBCHEM_BASE}/compound/cid/{cid}/property/{properties}/JSON"
561
+ )
562
+
563
+
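+ # Usage sketch (illustrative; helper name is hypothetical): a rough Lipinski
+ # rule-of-5 check over the default property set above. PubChem serializes
+ # MolecularWeight as a string, hence the float() casts.
+ async def _example_lipinski(cid: int) -> dict[str, bool] | None:
+     data = await pubchem_get_properties(cid)
+     if not data:
+         return None
+     props = data["PropertyTable"]["Properties"][0]
+     return {
+         "mw_le_500": float(props.get("MolecularWeight", 0)) <= 500,
+         "xlogp_le_5": float(props.get("XLogP", 0)) <= 5,
+         "hbd_le_5": props.get("HBondDonorCount", 0) <= 5,
+         "hba_le_10": props.get("HBondAcceptorCount", 0) <= 10,
+     }
+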
564
+ async def pubchem_get_synonyms(cid: int | str) -> dict | None:
565
+ """Get all known names/synonyms for a compound."""
566
+ return await _get(f"{PUBCHEM_BASE}/compound/cid/{cid}/synonyms/JSON")
567
+
568
+
569
+ # =============================================================================
570
+ # Common Chemistry (CAS) API — free, no auth, ~500k compounds
571
+ # =============================================================================
572
+
573
+
574
+ async def cas_search(query: str, size: int = 10) -> dict | None:
575
+ """Search Common Chemistry by name, CAS number, InChI, InChIKey, or SMILES."""
576
+ return await _get(f"{CAS_BASE}/search", params={"q": query, "size": size})
577
+
578
+
579
+ async def cas_detail(cas_rn: str) -> dict | None:
580
+ """Get full compound details by CAS Registry Number.
581
+
582
+ Returns: name, CAS RN, molecular formula, molecular mass, InChI,
583
+ InChIKey, SMILES, canonical SMILES, and experimental properties.
584
+ """
585
+ return await _get(f"{CAS_BASE}/detail", params={"cas_rn": cas_rn})
586
+
587
+
588
+ # =============================================================================
589
+ # Web of Science Starter API (optional — requires WOS_API_KEY)
590
+ # =============================================================================
591
+
592
+
593
+ def wos_available() -> bool:
594
+ """Check if Web of Science API credentials are configured."""
595
+ return bool(WOS_API_KEY)
596
+
597
+
598
+ async def wos_search(
599
+ query: str,
600
+ limit: int = 10,
601
+ first_record: int = 1,
602
+ sort_field: str = "RS",
603
+ database_id: str = "WOS",
604
+ ) -> dict | None:
605
+ """Search Web of Science.
606
+
607
+ Requires WOS_API_KEY environment variable.
608
+
609
+ query: Web of Science advanced search syntax, e.g.:
610
+ - TS=(catalysis AND asymmetric) — topic search
611
+ - AU=(Smith) — author search
612
+ - SO=(Nature) — source/journal search
613
+ - DO=(10.1234/xxx) — DOI search
614
+
615
+ sort_field: RS (relevance), PY (year), TC (times cited), LD (load date)
616
+ database_id: WOS, BCI, CCC, DCI, DIIDW, KJD, MEDLINE, RSCI, SCIELO
617
+ """
618
+ if not WOS_API_KEY:
619
+ return None
620
+ return await _get(
621
+ WOS_BASE,
622
+ params={
623
+ "databaseId": database_id,
624
+ "usrQuery": query,
625
+ "count": limit,
626
+ "firstRecord": first_record,
627
+ "sortField": sort_field,
628
+ },
629
+ headers={"X-ApiKey": WOS_API_KEY},
630
+ )
631
+
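+ # Usage sketch (illustrative; helper name is hypothetical): a combined
+ # topic + publication-year query in the advanced syntax documented above,
+ # sorted by times cited.
+ async def _example_wos_topic() -> dict | None:
+     if not wos_available():
+         return None
+     return await wos_search(
+         "TS=(asymmetric catalysis) AND PY=(2020-2025)",
+         limit=5,
+         sort_field="TC",
+     )
+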
632
+ # =============================================================================
633
+ # NIST Chemistry WebBook (scraping — no official API)
634
+ # =============================================================================
635
+
636
+
637
+ def _strip_html(html: str) -> str:
638
+ """Remove HTML tags and decode common entities."""
639
+ text = re.sub(r"<[^>]+>", "", html)
640
+ for ent, char in [("&amp;", "&"), ("&lt;", "<"), ("&gt;", ">"),
641
+ ("&plusmn;", "±"), ("&deg;", "°"), ("&nbsp;", " "),
642
+ ("&#176;", "°")]:
643
+ text = text.replace(ent, char)
644
+ return text.strip()
645
+
646
+
647
+ async def nist_fetch(params: dict[str, str]) -> str | None:
648
+ """Fetch a NIST WebBook page. Returns raw HTML."""
649
+ params.setdefault("Units", "SI")
650
+ try:
651
+ async with _http() as client:
652
+ resp = await client.get(NIST_BASE, params=params)
653
+ if resp.status_code == 200:
654
+ return resp.text
655
+ except Exception as e:
656
+ logger.warning(f"NIST fetch failed: {e}")
657
+ return None
658
+
659
+
660
+ async def nist_search(query: str, search_type: str = "name") -> str | None:
661
+ """Search NIST WebBook by name, CAS, formula, or InChI."""
662
+ params: dict[str, str] = {}
663
+ if search_type == "cas":
664
+ cas_clean = "C" + query.replace("-", "")
665
+ params["ID"] = cas_clean
666
+ params["Mask"] = "FFF"
667
+ elif search_type == "formula":
668
+ params["Formula"] = query
669
+ params["NoIon"] = "on"
670
+ elif search_type == "inchi":
671
+ params["InChI"] = query
672
+ else:
673
+ params["Name"] = query
674
+ return await nist_fetch(params)
675
+
676
+
677
+ def nist_is_compound_page(html: str) -> bool:
678
+ """Check if HTML is a single compound page (vs. search results)."""
679
+ return bool(re.search(r'<h1[^>]*id="Top"', html))
680
+
681
+
682
+ def nist_parse_search_results(html: str) -> list[dict]:
683
+ """Parse NIST search results page into list of matches."""
684
+ results = []
685
+ for m in re.finditer(
686
+ r'<a\s+href="(/cgi/cbook\.cgi\?ID=(C\d+)[^"]*)"[^>]*>(.*?)</a>',
687
+ html,
688
+ ):
689
+ results.append({
690
+ "name": _strip_html(m.group(3)),
691
+ "nist_id": m.group(2),
692
+ "url": f"https://webbook.nist.gov{m.group(1)}",
693
+ })
694
+ return results
695
+
696
+
697
+ def nist_parse_compound(html: str) -> dict:
698
+ """Parse NIST compound page into structured data."""
699
+ info: dict[str, Any] = {}
700
+
701
+ # Name
702
+ m = re.search(r'<h1[^>]*>(.*?)</h1>', html, re.S)
703
+ if m:
704
+ info["name"] = _strip_html(m.group(1))
705
+
706
+ # Key-value pairs from <li><strong>Key:</strong> Value</li>
707
+ kv_map = {
708
+ "Formula": "formula",
709
+ "Molecular weight": "molecular_weight",
710
+ "CAS Registry Number": "cas_rn",
711
+ "IUPAC Standard InChI": "inchi",
712
+ "IUPAC Standard InChIKey": "inchi_key",
713
+ "Chemical structure": None, # skip image
714
+ }
715
+ for m in re.finditer(
716
+ r"<li>\s*<strong>(.*?):</strong>\s*(.*?)</li>", html, re.S
717
+ ):
718
+ key = _strip_html(m.group(1))
719
+ val = _strip_html(m.group(2))
720
+ mapped = kv_map.get(key, key.lower().replace(" ", "_"))
721
+ if mapped and val:
722
+ info[mapped] = val
723
+
724
+ # Other names
725
+ m = re.search(r"<strong>Other names:</strong>\s*(.*?)</li>", html, re.S)
726
+ if m:
727
+ raw = _strip_html(m.group(1))
728
+ info["other_names"] = [n.strip() for n in raw.split(";") if n.strip()]
729
+
730
+ # NIST ID from page URLs
731
+ m = re.search(r"ID=(C\d+)", html)
732
+ if m:
733
+ info["nist_id"] = m.group(1)
734
+
735
+ # Detect available data sections
736
+ avail: list[str] = []
737
+ section_names = [
738
+ ("Thermochemistry", "thermo"), ("Phase change", "phase_change"),
739
+ ("Reaction thermochemistry", "reaction_thermo"),
740
+ ("Henry", "henrys_law"), ("Gas phase ion", "ion_energetics"),
741
+ ("IR Spec", "ir_spectrum"), ("Mass Spec", "mass_spectrum"),
742
+ ("UV/Vis", "uv_vis_spectrum"), ("Vibrational", "vibrational"),
743
+ ("Electronic", "electronic"), ("Constants of diatomic", "diatomic"),
744
+ ]
745
+ for label, key in section_names:
746
+ if label.lower() in html.lower():
747
+ avail.append(key)
748
+ info["available_data"] = avail
749
+
750
+ # --- Inline thermochemistry data ---
751
+ # Gas phase ΔfH°
752
+ m = re.search(
753
+ r"f</sub>H.*?gas.*?(-?[\d.]+)\s*(?:±|&plusmn;)\s*([\d.]+)\s*kJ/mol",
754
+ html, re.S,
755
+ )
756
+ if m:
757
+ info["delta_fH_gas_kJ_mol"] = f"{m.group(1)} ± {m.group(2)}"
758
+
759
+ # Standard entropy S°
760
+ m = re.search(r"S°.*?gas.*?([\d.]+)\s*(?:±|&plusmn;)?\s*[\d.]*\s*J/mol", html, re.S)
761
+ if m:
762
+ info["S_gas_J_mol_K"] = m.group(1)
763
+
764
+ # Cp gas
765
+ m = re.search(r"C\s*p.*?gas.*?([\d.]+)\s*(?:±|&plusmn;)?\s*[\d.]*\s*J/mol", html, re.S)
766
+ if m:
767
+ info["Cp_gas_J_mol_K"] = m.group(1)
768
+
769
+ # Phase change: boiling point
770
+ for pat in [
771
+ r"T<sub>boil</sub>\s*=?\s*([\d.]+)\s*(?:±\s*[\d.]+\s*)?K",
772
+ r"boil.*?([\d.]+)\s*K",
773
+ ]:
774
+ m = re.search(pat, html, re.S)
775
+ if m:
776
+ info["boiling_point_K"] = m.group(1)
777
+ break
778
+
779
+ # Phase change: melting point
780
+ for pat in [
781
+ r"T<sub>fus</sub>\s*=?\s*([\d.]+)\s*(?:±\s*[\d.]+\s*)?K",
782
+ r"fus.*?([\d.]+)\s*K",
783
+ ]:
784
+ m = re.search(pat, html, re.S)
785
+ if m:
786
+ info["melting_point_K"] = m.group(1)
787
+ break
788
+
789
+ # ΔvapH (enthalpy of vaporization)
790
+ m = re.search(r"vap</sub>H.*?([\d.]+)\s*(?:±|&plusmn;)\s*([\d.]+)\s*kJ/mol", html, re.S)
791
+ if m:
792
+ info["delta_vapH_kJ_mol"] = f"{m.group(1)} ± {m.group(2)}"
793
+
794
+ # ΔfusH (enthalpy of fusion)
795
+ m = re.search(r"fus</sub>H.*?([\d.]+)\s*(?:±|&plusmn;)\s*([\d.]+)\s*kJ/mol", html, re.S)
796
+ if m:
797
+ info["delta_fusH_kJ_mol"] = f"{m.group(1)} ± {m.group(2)}"
798
+
799
+ return info
800
+
801
+
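+ # Usage sketch (illustrative; helper name is hypothetical): the intended
+ # search -> parse pipeline. A name query may land directly on a compound
+ # page or on a results page, so branch on nist_is_compound_page.
+ async def _example_nist_lookup(name: str) -> dict | list[dict] | None:
+     html = await nist_search(name, search_type="name")
+     if html is None:
+         return None
+     if nist_is_compound_page(html):
+         return nist_parse_compound(html)
+     return nist_parse_search_results(html)
+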
802
+ # =============================================================================
803
+ # Materials Project API (optional — requires MP_API_KEY)
804
+ # =============================================================================
805
+
806
+
807
+ def mp_available() -> bool:
808
+ """Check if Materials Project API key is configured."""
809
+ return bool(MP_API_KEY)
810
+
811
+
812
+ async def mp_search(
813
+ formula: str | None = None,
814
+ elements: list[str] | None = None,
815
+ band_gap_min: float | None = None,
816
+ band_gap_max: float | None = None,
817
+ limit: int = 10,
818
+ ) -> dict | None:
819
+ """Search Materials Project for inorganic materials."""
820
+ if not MP_API_KEY:
821
+ return None
822
+ params: dict[str, Any] = {
823
+ "_limit": limit,
824
+ "_fields": (
825
+ "material_id,formula_pretty,structure,symmetry,"
826
+ "band_gap,formation_energy_per_atom,energy_above_hull,"
827
+ "is_stable,theoretical,nsites"
828
+ ),
829
+ }
830
+ if formula:
831
+ params["formula"] = formula
832
+ if elements:
833
+ params["elements"] = ",".join(elements)
834
+ if band_gap_min is not None:
835
+ params["band_gap_min"] = band_gap_min
836
+ if band_gap_max is not None:
837
+ params["band_gap_max"] = band_gap_max
838
+ return await _get(
839
+ f"{MP_BASE}/materials/summary/",
840
+ params=params,
841
+ headers={"X-API-KEY": MP_API_KEY},
842
+ )
843
+
844
+
845
+ async def mp_get_material(material_id: str) -> dict | None:
846
+ """Get full material details by Materials Project ID (e.g., 'mp-149')."""
847
+ if not MP_API_KEY:
848
+ return None
849
+ return await _get(
850
+ f"{MP_BASE}/materials/summary/{material_id}",
851
+ params={
852
+ "_fields": (
853
+ "material_id,formula_pretty,structure,symmetry,"
854
+ "band_gap,formation_energy_per_atom,energy_above_hull,"
855
+ "is_stable,theoretical,nsites,volume,density,"
856
+ "efermi,total_magnetization,ordering,is_metal,"
857
+ "database_IDs,deprecated,uncorrected_energy_per_atom"
858
+ ),
859
+ },
860
+ headers={"X-API-KEY": MP_API_KEY},
861
+ )
862
+
863
+
864
+ # =============================================================================
865
+ # RXN4Chemistry — IBM AI reaction prediction (optional — requires RXN_API_KEY)
866
+ # =============================================================================
867
+
868
+
869
+ def rxn_available() -> bool:
870
+ """Check if IBM RXN API key is configured."""
871
+ return bool(RXN_API_KEY)
872
+
873
+
874
+ _rxn_project_id: str | None = None
875
+
876
+
877
+ async def _rxn_headers() -> dict[str, str]:
878
+ """Get RXN auth headers."""
879
+ return {
880
+ "Authorization": f"apikey {RXN_API_KEY}",
881
+ "Content-Type": "application/json",
882
+ }
883
+
884
+
885
+ async def _rxn_ensure_project() -> str | None:
886
+ """Create or return cached RXN project ID."""
887
+ global _rxn_project_id
888
+ if _rxn_project_id:
889
+ return _rxn_project_id
890
+ if not RXN_API_KEY:
891
+ return None
892
+ data = await _post(
893
+ f"{RXN_BASE}/projects",
894
+ json_data={"name": "scholarly-mcp"},
895
+ headers=await _rxn_headers(),
896
+ )
897
+ if data and data.get("payload"):
898
+ _rxn_project_id = data["payload"].get("id")
899
+ return _rxn_project_id
900
+
901
+
902
+ async def _rxn_poll(url: str, max_wait: int = 60, interval: int = 3) -> dict | None:
903
+ """Poll an RXN endpoint until result is ready."""
904
+ headers = await _rxn_headers()
905
+ for _ in range(max_wait // interval):
906
+ data = await _get(url, headers=headers)
907
+ if data:
908
+ payload = data.get("payload", {})
909
+ status = payload.get("status", data.get("status", ""))
910
+ if status in ("SUCCESS", "success"):
911
+ return data
912
+ if status in ("FAILED", "failed", "ERROR", "error"):
913
+ return data
914
+ await asyncio.sleep(interval)
915
+ return None
916
+
917
+
918
+ async def rxn_predict_reaction(rxn_smiles: str) -> dict | None:
919
+ """Predict reaction product(s) from reactants.
920
+
921
+ rxn_smiles: reactants.reagents>>products (e.g., 'CC.OC>>')
922
+ """
923
+ if not RXN_API_KEY:
924
+ return None
925
+ project_id = await _rxn_ensure_project()
926
+ if not project_id:
927
+ return None
928
+ data = await _post(
929
+ f"{RXN_BASE}/predictions",
930
+ json_data={
931
+ "projectId": project_id,
932
+ "name": "mcp-prediction",
933
+ "inputs": [{"rxnSmiles": rxn_smiles}],
934
+ },
935
+ headers=await _rxn_headers(),
936
+ )
937
+ if not data or not data.get("payload"):
938
+ return data
939
+ pred_id = data["payload"].get("id")
940
+ if not pred_id:
941
+ return data
942
+ return await _rxn_poll(
943
+ f"{RXN_BASE}/predictions/{pred_id}",
944
+ )
945
+
946
+
947
+ async def rxn_retrosynthesis(product_smiles: str, max_steps: int = 3) -> dict | None:
948
+ """Plan retrosynthetic route to a target molecule.
949
+
950
+ product_smiles: target product SMILES
951
+ max_steps: maximum retrosynthetic steps (1-10)
952
+ """
953
+ if not RXN_API_KEY:
954
+ return None
955
+ project_id = await _rxn_ensure_project()
956
+ if not project_id:
957
+ return None
958
+ data = await _post(
959
+ f"{RXN_BASE}/retrosynthesis",
960
+ json_data={
961
+ "projectId": project_id,
962
+ "fap": 0.6,
963
+ "maxSteps": max_steps,
964
+ "nBeams": 10,
965
+ "pruneThreshold": 0.2,
966
+ "isAutomatic": True,
967
+ "content": {"smiles": product_smiles},
968
+ },
969
+ headers=await _rxn_headers(),
970
+ )
971
+ if not data or not data.get("payload"):
972
+ return data
973
+ pred_id = data["payload"].get("id")
974
+ if not pred_id:
975
+ return data
976
+ # Retrosynthesis takes longer — poll with longer timeout
977
+ return await _rxn_poll(
978
+ f"{RXN_BASE}/retrosynthesis/{pred_id}?projectId={project_id}",
979
+ max_wait=120,
980
+ interval=5,
981
+ )
982
+
983
+
984
+ async def rxn_paragraph_to_actions(paragraph: str) -> dict | None:
985
+ """Convert experimental paragraph to structured action steps.
986
+
987
+ Uses IBM RXN NLP model to parse free-text experimental procedures
988
+ into machine-readable action steps (MAKESOLUTION, ADD, STIR, etc.).
989
+ """
990
+ if not RXN_API_KEY:
991
+ return None
992
+ data = await _post(
993
+ f"{RXN_BASE}/paragraph-actions",
994
+ json_data={"paragraph": paragraph},
995
+ headers=await _rxn_headers(),
996
+ )
997
+ return data
998
+
999
+
1000
+ async def rxn_predict_atom_mapping(rxn_smiles: str) -> dict | None:
1001
+ """Predict atom-to-atom mapping for a reaction SMILES.
1002
+
1003
+ rxn_smiles: full reaction SMILES (e.g., 'CC(=O)O.OCC>>CC(=O)OCC.O')
1004
+ Uses the atom-mapping-2020 AI model.
1005
+ """
1006
+ if not RXN_API_KEY:
1007
+ return None
1008
+ project_id = await _rxn_ensure_project()
1009
+ if not project_id:
1010
+ return None
1011
+ data = await _post(
1012
+ f"{RXN_BASE}/predictions",
1013
+ json_data={
1014
+ "projectId": project_id,
1015
+ "name": "mcp-atom-mapping",
1016
+ "inputs": [{"rxnSmiles": rxn_smiles}],
1017
+ "aiModel": "atom-mapping-2020",
1018
+ },
1019
+ headers=await _rxn_headers(),
1020
+ )
1021
+ if not data or not data.get("payload"):
1022
+ return data
1023
+ pred_id = data["payload"].get("id")
1024
+ if not pred_id:
1025
+ return data
1026
+ return await _rxn_poll(
1027
+ f"{RXN_BASE}/predictions/{pred_id}",
1028
+ )
1029
+
1030
+
1031
+ async def rxn_synthesis_plan(
1032
+ prediction_id: str,
1033
+ sequence_index: int = 0,
1034
+ ) -> dict | None:
1035
+ """Create and retrieve a synthesis plan from a retrosynthesis result.
1036
+
1037
+ Takes a retrosynthesis prediction ID and sequence index, creates a
1038
+ synthesis, then retrieves the step-by-step procedure with actions.
1039
+ """
1040
+ if not RXN_API_KEY:
1041
+ return None
1042
+ project_id = await _rxn_ensure_project()
1043
+ if not project_id:
1044
+ return None
1045
+ headers = await _rxn_headers()
1046
+
1047
+ # Step 1: Get the retrosynthesis result to find sequence IDs
1048
+ retro_data = await _get(
1049
+ f"{RXN_BASE}/retrosynthesis/{prediction_id}?projectId={project_id}",
1050
+ headers=headers,
1051
+ )
1052
+ if not retro_data:
1053
+ return {"error": "Could not retrieve retrosynthesis result"}
1054
+
1055
+ payload = retro_data.get("payload", {})
1056
+ sequences = payload.get("sequences", [])
1057
+ if not sequences:
1058
+ return {"error": "No synthesis sequences found in retrosynthesis result"}
1059
+ if sequence_index >= len(sequences):
1060
+ return {"error": f"Sequence index {sequence_index} out of range (have {len(sequences)})"}
1061
+
1062
+ sequence = sequences[sequence_index]
1063
+ sequence_id = sequence.get("sequenceId", sequence.get("id", ""))
1064
+ if not sequence_id:
1065
+ return {"error": "Could not extract sequence ID from retrosynthesis result", "sequence": sequence}
1066
+
1067
+ # Step 2: Create synthesis from sequence
1068
+ synth_data = await _post(
1069
+ f"{RXN_BASE}/syntheses",
1070
+ json_data={
1071
+ "sequenceId": sequence_id,
1072
+ "projectId": project_id,
1073
+ },
1074
+ headers=headers,
1075
+ )
1076
+ if not synth_data or not synth_data.get("payload"):
1077
+ return {"error": "Failed to create synthesis", "detail": synth_data}
1078
+ synthesis_id = synth_data["payload"].get("id", "")
1079
+ if not synthesis_id:
1080
+ return {"error": "No synthesis ID returned"}
1081
+
1082
+ # Step 3: Poll for synthesis completion, then get procedure
1083
+ synth_result = await _rxn_poll(
1084
+ f"{RXN_BASE}/syntheses/{synthesis_id}?projectId={project_id}",
1085
+ max_wait=90,
1086
+ interval=5,
1087
+ )
1088
+ if not synth_result:
1089
+ return {"error": "Synthesis planning timed out"}
1090
+
1091
+ # Step 4: Get detailed procedure
1092
+ procedure = await _get(
1093
+ f"{RXN_BASE}/syntheses/{synthesis_id}/procedure?projectId={project_id}",
1094
+ headers=headers,
1095
+ )
1096
+
1097
+ return {
1098
+ "synthesis_id": synthesis_id,
1099
+ "plan": synth_result,
1100
+ "procedure": procedure,
1101
+ "sequence_index": sequence_index,
1102
+ "total_sequences": len(sequences),
1103
+ }
1104
+
1105
+
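+ # Usage sketch (illustrative; helper name is hypothetical, and it assumes
+ # the polled retrosynthesis payload echoes the prediction "id"): chain
+ # rxn_retrosynthesis into rxn_synthesis_plan, picking the top-ranked route.
+ async def _example_rxn_route(target_smiles: str) -> dict | None:
+     retro = await rxn_retrosynthesis(target_smiles, max_steps=3)
+     if not retro or not retro.get("payload"):
+         return None
+     pred_id = retro["payload"].get("id")
+     if not pred_id:
+         return None
+     return await rxn_synthesis_plan(pred_id, sequence_index=0)
+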
1106
+ # =============================================================================
1107
+ # Rowan Science — Cloud computational chemistry (optional)
1108
+ # Requires: ROWAN_API_KEY + rowan-python package
1109
+ # =============================================================================
1110
+
1111
+ ROWAN_API_KEY: str | None = os.environ.get("ROWAN_API_KEY")
1112
+
1113
+ try:
1114
+ import rowan as _rowan_sdk
1115
+ import stjames as _stjames
1116
+ _ROWAN_SDK = True
1117
+ except ImportError:
1118
+ _ROWAN_SDK = False
1119
+
1120
+
1121
+ def rowan_available() -> bool:
1122
+ """Check if Rowan Science API key AND SDK are configured."""
1123
+ return bool(ROWAN_API_KEY) and _ROWAN_SDK
1124
+
1125
+
1126
+ async def _rowan_run_workflow(submit_fn, **kwargs) -> dict | None:
1127
+ """Submit a Rowan workflow, wait for result, return data dict.
1128
+
1129
+ Runs the synchronous Rowan SDK in a thread pool to avoid blocking
1130
+ the async event loop. Returns workflow.data or error info.
1131
+ """
1132
+ if not rowan_available():
1133
+ return None
1134
+
1135
+ def _run():
1136
+ _rowan_sdk.api_key = ROWAN_API_KEY
1137
+ result = submit_fn(**kwargs)
1138
+ result.wait_for_result()
1139
+ result.fetch_latest(in_place=True)
1140
+ return {
1141
+ "status": str(result.status),
1142
+ "data": result.data,
1143
+ "credits_charged": result.credits_charged,
1144
+ "workflow_uuid": result.uuid,
1145
+ "workflow_type": result.workflow_type,
1146
+ "elapsed": result.elapsed,
1147
+ }
1148
+
1149
+ try:
1150
+ return await asyncio.to_thread(_run)
1151
+ except Exception as e:
1152
+ logger.warning(f"Rowan workflow failed: {e}")
1153
+ return {"error": str(e)}
1154
+
1155
+
1156
+ async def rowan_predict_pka(
1157
+ smiles: str,
1158
+ pka_range: tuple[int, int] = (2, 12),
1159
+ method: str = "aimnet2_wagen2024",
1160
+ ) -> dict | None:
1161
+ """Predict pKa values for a molecule using Rowan Science."""
1162
+     if not rowan_available():
+         return None
+     return await _rowan_run_workflow(
1163
+ _rowan_sdk.submit_pka_workflow,
1164
+ initial_molecule=smiles,
1165
+ pka_range=pka_range,
1166
+ method=method,
1167
+ name="labmate-pka",
1168
+ )
1169
+
1170
+
1171
+ async def rowan_predict_solubility(
1172
+ smiles: str,
1173
+ method: str = "fastsolv",
1174
+ solvents: list[str] | None = None,
1175
+ temperatures: list[float] | None = None,
1176
+ ) -> dict | None:
1177
+ """Predict solubility using Rowan Science."""
1178
+     if not rowan_available():
+         return None
+     return await _rowan_run_workflow(
1179
+ _rowan_sdk.submit_solubility_workflow,
1180
+ initial_smiles=smiles,
1181
+ solubility_method=method,
1182
+ solvents=solvents,
1183
+ temperatures=temperatures,
1184
+ name="labmate-solubility",
1185
+ )
1186
+
1187
+
1188
+ async def rowan_predict_admet(smiles: str) -> dict | None:
1189
+ """Predict ADMET properties using Rowan Science."""
1190
+     if not rowan_available():
+         return None
+     return await _rowan_run_workflow(
1191
+ _rowan_sdk.submit_admet_workflow,
1192
+ initial_smiles=smiles,
1193
+ name="labmate-admet",
1194
+ )
1195
+
1196
+
1197
+ async def rowan_search_tautomers(smiles: str) -> dict | None:
1198
+ """Enumerate and rank tautomers using Rowan Science."""
1199
+     if not rowan_available():
+         return None
+     mol = _stjames.Molecule.from_smiles(smiles)
1200
+ return await _rowan_run_workflow(
1201
+ _rowan_sdk.submit_tautomer_search_workflow,
1202
+ initial_molecule=mol,
1203
+ name="labmate-tautomers",
1204
+ )
1205
+
1206
+
1207
+ async def rowan_compute_descriptors(smiles: str) -> dict | None:
1208
+ """Compute molecular descriptors using Rowan Science."""
1209
+     if not rowan_available():
+         return None
+     mol = _stjames.Molecule.from_smiles(smiles)
1210
+ return await _rowan_run_workflow(
1211
+ _rowan_sdk.submit_descriptors_workflow,
1212
+ initial_molecule=mol,
1213
+ name="labmate-descriptors",
1214
+ )
1215
+
1216
+
1217
+ async def rowan_predict_nmr(
1218
+ smiles: str,
1219
+ solvent: str = "chloroform",
1220
+ ) -> dict | None:
1221
+ """Predict NMR chemical shifts using Rowan Science."""
1222
+     if not rowan_available():
+         return None
+     mol = _stjames.Molecule.from_smiles(smiles)
1223
+ return await _rowan_run_workflow(
1224
+ _rowan_sdk.submit_nmr_workflow,
1225
+ initial_molecule=mol,
1226
+ solvent=solvent,
1227
+ name="labmate-nmr",
1228
+ )
1229
+
1230
+
1231
+ # =============================================================================
1232
+ # UniChem — Universal chemical identifier cross-reference (no auth)
1233
+ # =============================================================================
1234
+
1235
+
1236
+ async def unichem_lookup(inchikey: str) -> dict | None:
1237
+ """Cross-reference a compound across 40+ databases by InChIKey.
1238
+
1239
+ Returns source IDs from ChEMBL, PubChem, DrugBank, ZINC, etc.
1240
+ """
1241
+ return await _post(
1242
+ f"{UNICHEM_BASE}/compounds",
1243
+ json_data={"type": "inchikey", "compound": inchikey},
1244
+ )
1245
+
1246
+
1247
+ async def unichem_sources() -> dict | None:
1248
+ """List all available UniChem data sources."""
1249
+ return await _get(f"{UNICHEM_BASE}/sources")
1250
+
1251
+
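+ # Usage sketch (illustrative; helper name is hypothetical, and the
+ # "compounds"/"sources" keys are assumptions against the v1 response):
+ # map an InChIKey to its per-database identifiers.
+ async def _example_unichem_ids(inchikey: str) -> list[dict]:
+     data = await unichem_lookup(inchikey)
+     if not data:
+         return []
+     compounds = data.get("compounds", [])
+     return compounds[0].get("sources", []) if compounds else []
+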
1252
+ # =============================================================================
1253
+ # Crystallography Open Database (COD) — open crystal structures (no auth)
1254
+ # =============================================================================
1255
+
1256
+
1257
+ async def cod_search(
1258
+ formula: str | None = None,
1259
+ elements: list[str] | None = None,
1260
+ text: str | None = None,
1261
+ limit: int = 20,
1262
+ ) -> list | None:
1263
+ """Search COD for crystal structures.
1264
+
1265
+ formula: Hill notation (e.g., 'C6 H6', 'Fe2 O3')
1266
+ elements: required elements (e.g., ['Fe', 'O'])
1267
+ text: free text search in compound names
1268
+ """
1269
+ params: dict[str, str] = {"format": "json"}
1270
+ if formula:
1271
+ params["formula"] = formula
1272
+ if elements:
1273
+ for i, el in enumerate(elements[:8], 1):
1274
+ params[f"el{i}"] = el
1275
+ if text:
1276
+ params["text"] = text
1277
+ try:
1278
+ async with _http() as client:
1279
+ resp = await client.get(f"{COD_BASE}/result", params=params)
1280
+ if resp.status_code == 200:
1281
+ data = resp.json()
1282
+ if isinstance(data, list):
1283
+ return data[:limit]
1284
+ return data
1285
+ except Exception as e:
1286
+ logger.warning(f"COD search failed: {e}")
1287
+ return None
1288
+
1289
+
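+ # Usage sketch (illustrative; helper name is hypothetical, and the "file"
+ # key holding the COD entry ID is an assumption to verify): Hill-notation
+ # search, then download the CIF for the first hit.
+ async def _example_cod_cif() -> str | None:
+     hits = await cod_search(formula="Fe2 O3", limit=1)
+     if not hits:
+         return None
+     cod_id = hits[0].get("file")
+     return await cod_get_cif(cod_id) if cod_id else None
+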
1290
+ async def cod_get_cif(cod_id: int | str) -> str | None:
1291
+ """Download CIF file for a COD entry."""
1292
+ try:
1293
+ async with _http() as client:
1294
+ resp = await client.get(f"{COD_BASE}/{cod_id}.cif")
1295
+ if resp.status_code == 200:
1296
+ return resp.text
1297
+ except Exception as e:
1298
+ logger.warning(f"COD CIF download failed: {e}")
1299
+ return None
1300
+
1301
+
1302
+ # =============================================================================
1303
+ # EPA CompTox Dashboard (optional — requires COMPTOX_API_KEY)
1304
+ # =============================================================================
1305
+
1306
+
1307
+ def comptox_available() -> bool:
1308
+ """Check if EPA CompTox API key is configured."""
1309
+ return bool(COMPTOX_API_KEY)
1310
+
1311
+
1312
+ async def comptox_search(query: str) -> dict | None:
1313
+ """Search CompTox by chemical name, CAS, or DTXSID."""
1314
+ if not COMPTOX_API_KEY:
1315
+ return None
1316
+ # Try name search
1317
+ return await _get(
1318
+ f"{COMPTOX_BASE}/chemical/search/by-name/{query}",
1319
+ headers={"x-api-key": COMPTOX_API_KEY, "Accept": "application/json"},
1320
+ )
1321
+
1322
+
1323
+ async def comptox_get_details(dtxsid: str) -> dict | None:
1324
+ """Get full chemical details by DTXSID identifier."""
1325
+ if not COMPTOX_API_KEY:
1326
+ return None
1327
+ return await _get(
1328
+ f"{COMPTOX_BASE}/chemical/detail/search/by-dtxsid/{dtxsid}",
1329
+ headers={"x-api-key": COMPTOX_API_KEY, "Accept": "application/json"},
1330
+ )
1331
+
1332
+
1333
+ async def comptox_get_properties(dtxsid: str) -> dict | None:
1334
+ """Get physicochemical and fate properties for a chemical."""
1335
+ if not COMPTOX_API_KEY:
1336
+ return None
1337
+ return await _get(
1338
+ f"{COMPTOX_BASE}/chemical/property/search/by-dtxsid/{dtxsid}",
1339
+ headers={"x-api-key": COMPTOX_API_KEY, "Accept": "application/json"},
1340
+ )
1341
+
1342
+
1343
+ async def comptox_get_hazard(dtxsid: str) -> dict | None:
1344
+ """Get hazard data for a chemical."""
1345
+ if not COMPTOX_API_KEY:
1346
+ return None
1347
+ return await _get(
1348
+ f"{COMPTOX_BASE}/hazard/search/by-dtxsid/{dtxsid}",
1349
+ headers={"x-api-key": COMPTOX_API_KEY, "Accept": "application/json"},
1350
+ )
1351
+
1352
+
1353
+ # =============================================================================
1354
+ # MassBank EU — Mass spectrometry reference spectra (no auth)
1355
+ # =============================================================================
1356
+
1357
+
1358
+ async def massbank_search(
1359
+ compound_name: str | None = None,
1360
+ formula: str | None = None,
1361
+ inchikey: str | None = None,
1362
+ exact_mass_min: float | None = None,
1363
+ exact_mass_max: float | None = None,
1364
+ instrument_type: str | None = None,
1365
+ limit: int = 20,
1366
+ ) -> list | None:
1367
+ """Search MassBank for reference mass spectra."""
1368
+ params: dict[str, Any] = {"limit": limit}
1369
+ if compound_name:
1370
+ params["compound_name"] = compound_name
1371
+ if formula:
1372
+ params["formula"] = formula
1373
+ if inchikey:
1374
+ params["inchi_key"] = inchikey
1375
+ if exact_mass_min is not None:
1376
+ params["exact_mass_from"] = exact_mass_min
1377
+ if exact_mass_max is not None:
1378
+ params["exact_mass_to"] = exact_mass_max
1379
+ if instrument_type:
1380
+ params["instrument_type"] = instrument_type
1381
+ try:
1382
+ async with _http() as client:
1383
+ resp = await client.get(f"{MASSBANK_BASE}/records", params=params)
1384
+ if resp.status_code == 200:
1385
+ data = resp.json()
1386
+ return data if isinstance(data, list) else data.get("data", [])
1387
+ except Exception as e:
1388
+ logger.warning(f"MassBank search failed: {e}")
1389
+ return None
1390
+
1391
+
1392
+ async def massbank_get_record(accession: str) -> dict | None:
1393
+ """Get a specific MassBank spectrum record."""
1394
+ return await _get(f"{MASSBANK_BASE}/records/{accession}")
1395
+
1396
+
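+ # Usage sketch (illustrative; helper name is hypothetical): search a
+ # +/- 5 mDa window around a measured exact mass, restricted to one
+ # instrument class.
+ async def _example_massbank_window(mass: float) -> list | None:
+     return await massbank_search(
+         exact_mass_min=mass - 0.005,
+         exact_mass_max=mass + 0.005,
+         instrument_type="LC-ESI-QTOF",
+         limit=10,
+     )
+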
1397
+ # =============================================================================
1398
+ # BindingDB — Protein-ligand binding affinities (no auth)
1399
+ # =============================================================================
1400
+
1401
+
1402
+ async def bindingdb_by_target(
1403
+ uniprot_id: str,
1404
+ cutoff_nm: int = 10000,
1405
+ ) -> list[dict] | None:
1406
+ """Get ligands for a protein target by UniProt ID.
1407
+
1408
+ cutoff_nm: binding affinity cutoff in nM (default 10 µM)
1409
+ Returns list of dicts with compound SMILES, Ki, IC50, Kd, EC50.
1410
+ """
1411
+ try:
1412
+ async with _http() as client:
1413
+ resp = await client.get(
1414
+ f"{BINDINGDB_BASE}/getLigandsByUniprots",
1415
+ params={
1416
+ "uniprot": uniprot_id,
1417
+ "cutoff": cutoff_nm,
1418
+ "response": "application/json",
1419
+ },
1420
+ timeout=60,
1421
+ )
1422
+ if resp.status_code == 200:
1423
+ # BindingDB may return JSON or TSV depending on version
1424
+ try:
1425
+ return resp.json()
1426
+ except Exception:
1427
+ # Parse TSV fallback
1428
+ return _parse_bindingdb_tsv(resp.text)
1429
+ except Exception as e:
1430
+ logger.warning(f"BindingDB target search failed: {e}")
1431
+ return None
1432
+
1433
+
1434
+ async def bindingdb_by_smiles(
1435
+ smiles: str,
1436
+ cutoff: float = 0.8,
1437
+ ) -> list[dict] | None:
1438
+ """Find similar compounds in BindingDB by SMILES.
1439
+
1440
+ cutoff: Tanimoto similarity threshold (0-1, default 0.8)
1441
+ """
1442
+ try:
1443
+ async with _http() as client:
1444
+ resp = await client.get(
1445
+ f"{BINDINGDB_BASE}/getTargetByCompound",
1446
+ params={
1447
+ "smiles": smiles,
1448
+ "cutoff": cutoff,
1449
+ "response": "application/json",
1450
+ },
1451
+ timeout=60,
1452
+ )
1453
+ if resp.status_code == 200:
1454
+ try:
1455
+ return resp.json()
1456
+ except Exception:
1457
+ return _parse_bindingdb_tsv(resp.text)
1458
+ except Exception as e:
1459
+ logger.warning(f"BindingDB SMILES search failed: {e}")
1460
+ return None
1461
+
1462
+
1463
+ def _parse_bindingdb_tsv(text: str) -> list[dict]:
1464
+ """Parse BindingDB tab-separated response into list of dicts."""
1465
+ lines = text.strip().split("\n")
1466
+ if len(lines) < 2:
1467
+ return []
1468
+ headers = lines[0].split("\t")
1469
+ results = []
1470
+ for line in lines[1:]:
1471
+ vals = line.split("\t")
1472
+ row = {}
1473
+ for i, h in enumerate(headers):
1474
+ if i < len(vals):
1475
+ row[h.strip()] = vals[i].strip()
1476
+ results.append(row)
1477
+ return results
1478
+
1479
+
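+ # Usage sketch (illustrative; helper name is hypothetical): ligands binding
+ # a target tighter than 1 uM (the cutoff is in nM, as documented above).
+ async def _example_potent_ligands(uniprot_id: str) -> list[dict] | None:
+     return await bindingdb_by_target(uniprot_id, cutoff_nm=1000)
+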
1480
+ # =============================================================================
1481
+ # Crossref BibTeX (content negotiation — no extra API)
1482
+ # =============================================================================
1483
+
1484
+
1485
+ async def crossref_get_bibtex(doi: str) -> str | None:
1486
+ """Get BibTeX entry for a DOI via Crossref content negotiation."""
1487
+ doi = doi.strip().removeprefix("https://doi.org/").removeprefix("http://doi.org/")
1488
+ try:
1489
+ async with _http() as client:
1490
+ resp = await client.get(
1491
+ f"https://doi.org/{doi}",
1492
+ headers={"Accept": "application/x-bibtex"},
1493
+ follow_redirects=True,
1494
+ )
1495
+ if resp.status_code == 200 and "@" in resp.text:
1496
+ return resp.text.strip()
1497
+ except Exception as e:
1498
+ logger.warning(f"BibTeX fetch failed for {doi}: {e}")
1499
+ return None
1500
+
1501
+
1502
+ async def crossref_get_bibtex_batch(dois: list[str]) -> list[tuple[str, str | None]]:
1503
+ """Get BibTeX entries for multiple DOIs. Returns list of (doi, bibtex)."""
1504
+ results = []
1505
+ for doi in dois:
1506
+ bib = await crossref_get_bibtex(doi)
1507
+ results.append((doi, bib))
1508
+ return results
1509
+
1510
+
1511
+ # =============================================================================
1512
+ # RCSB PDB — Protein Data Bank (no auth)
1513
+ # =============================================================================
1514
+
1515
+
1516
+ async def pdb_search(
1517
+ query: str,
1518
+ search_type: str = "full_text",
1519
+ limit: int = 10,
1520
+ ) -> dict | None:
1521
+ """Search RCSB PDB for protein/nucleic acid structures.
1522
+
1523
+ search_type: 'full_text', 'structure_title', 'structure_author'
1524
+ """
1525
+ service_map = {
1526
+ "full_text": "full_text",
1527
+ "structure_title": "text",
1528
+ "structure_author": "text",
1529
+ }
1530
+ service = service_map.get(search_type, "full_text")
1531
+
1532
+ json_body: dict[str, Any] = {
1533
+ "query": {
1534
+ "type": "terminal",
1535
+ "service": service,
1536
+ "parameters": {"value": query},
1537
+ },
1538
+ "return_type": "entry",
1539
+ "request_options": {
1540
+ "results_content_type": ["experimental"],
1541
+ "paginate": {"start": 0, "rows": limit},
1542
+ "sort": [{"sort_by": "score", "direction": "desc"}],
1543
+ },
1544
+ }
1545
+
1546
+ # For author/title, use the text service with specific attribute
1547
+ if search_type == "structure_title":
1548
+ json_body["query"]["parameters"] = {
1549
+ "attribute": "struct.title",
1550
+ "operator": "contains_phrase",
1551
+ "value": query,
1552
+ }
1553
+ elif search_type == "structure_author":
1554
+ json_body["query"]["parameters"] = {
1555
+ "attribute": "rcsb_primary_citation.rcsb_authors",
1556
+ "operator": "contains_phrase",
1557
+ "value": query,
1558
+ }
1559
+
1560
+ try:
1561
+ async with _http() as client:
1562
+ resp = await client.post(
1563
+ PDB_SEARCH_BASE,
1564
+ json=json_body,
1565
+ timeout=30,
1566
+ )
1567
+ if resp.status_code == 200:
1568
+ return resp.json()
1569
+ except Exception as e:
1570
+ logger.warning(f"PDB search failed: {e}")
1571
+ return None
1572
+
1573
+
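+ # Usage sketch (illustrative; helper name is hypothetical): the search
+ # response lists hits under result_set[].identifier; feed those IDs into
+ # the entry and ligand helpers below.
+ async def _example_pdb_lookup(query: str) -> dict | None:
+     hits = await pdb_search(query, limit=1)
+     if not hits or not hits.get("result_set"):
+         return None
+     pdb_id = hits["result_set"][0]["identifier"]
+     return {
+         "entry": await pdb_get_entry(pdb_id),
+         "ligands": await pdb_get_ligands(pdb_id),
+     }
+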
1574
+ async def pdb_get_entry(pdb_id: str) -> dict | None:
1575
+ """Get full entry details from RCSB PDB."""
1576
+ pdb_id = pdb_id.strip().upper()
1577
+ return await _get(f"{PDB_DATA_BASE}/entry/{pdb_id}")
1578
+
1579
+
1580
+ async def pdb_get_entity(pdb_id: str, entity_id: int = 1) -> dict | None:
1581
+ """Get polymer entity details (protein/nucleic acid chain)."""
1582
+ pdb_id = pdb_id.strip().upper()
1583
+ return await _get(f"{PDB_DATA_BASE}/polymer_entity/{pdb_id}/{entity_id}")
1584
+
1585
+
1586
+ async def pdb_get_ligands(pdb_id: str) -> list[dict]:
1587
+ """Get all non-polymer (ligand) entities in a PDB structure."""
1588
+ pdb_id = pdb_id.strip().upper()
1589
+ ligands = []
1590
+ # PDB structures can have multiple non-polymer entities
1591
+ for entity_id in range(1, 20): # usually < 10
1592
+ data = await _get(
1593
+ f"{PDB_DATA_BASE}/nonpolymer_entity/{pdb_id}/{entity_id}"
1594
+ )
1595
+ if data:
1596
+ ligands.append(data)
1597
+ else:
1598
+ break
1599
+ return ligands
1600
+
1601
+
1602
+ # =============================================================================
1603
+ # PubChem GHS Hazard Data (extends existing PubChem client)
1604
+ # =============================================================================
1605
+
1606
+
1607
+ async def pubchem_get_ghs(cid: int) -> dict | None:
1608
+ """Get GHS Classification data (hazard pictograms, H/P statements) for a compound.
1609
+
1610
+ Uses PubChem PUG-View API to get the GHS section.
1611
+ """
1612
+ try:
1613
+ async with _http() as client:
1614
+ resp = await client.get(
1615
+ f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON",
1616
+ params={"heading": "GHS Classification"},
1617
+ timeout=20,
1618
+ )
1619
+ if resp.status_code == 200:
1620
+ return resp.json()
1621
+ except Exception as e:
1622
+ logger.warning(f"PubChem GHS fetch failed for CID {cid}: {e}")
1623
+ return None
1624
+
1625
+
1626
+ def parse_ghs_data(pug_view_data: dict) -> dict:
1627
+ """Parse PubChem PUG-View GHS response into structured hazard data."""
1628
+ result: dict[str, Any] = {
1629
+ "pictograms": [],
1630
+ "signal_word": "",
1631
+ "hazard_statements": [],
1632
+ "precautionary_statements": [],
1633
+ }
1634
+
1635
+ if not pug_view_data:
1636
+ return result
1637
+
1638
+ # Navigate the nested PUG-View structure
1639
+ record = pug_view_data.get("Record", {})
1640
+ sections = record.get("Section", [])
1641
+
1642
+ for section in sections:
1643
+ for subsec in section.get("Section", []):
1644
+ heading = subsec.get("TOCHeading", "")
1645
+
1646
+ for info in subsec.get("Information", []):
1647
+ val = info.get("Value", {})
1648
+
1649
+ if "Pictogram" in heading or "Pictogram" in info.get("Name", ""):
1650
+ # Extract pictogram names
1651
+ for sv in val.get("StringWithMarkup", []):
1652
+ text = sv.get("String", "")
1653
+ if text:
1654
+ result["pictograms"].append(text)
1655
+ # Also check for markup references
1656
+ for mu in sv.get("Markup", []):
1657
+ extra = mu.get("Extra", "")
1658
+ if extra:
1659
+ result["pictograms"].append(extra)
1660
+
1661
+ elif "Signal" in heading or "Signal" in info.get("Name", ""):
1662
+ for sv in val.get("StringWithMarkup", []):
1663
+ text = sv.get("String", "")
1664
+ if text and text.lower() in ("danger", "warning"):
1665
+ result["signal_word"] = text
1666
+
1667
+ elif "Hazard Statement" in heading or "H Statement" in info.get("Name", ""):
1668
+ for sv in val.get("StringWithMarkup", []):
1669
+ text = sv.get("String", "")
1670
+ if text:
1671
+ result["hazard_statements"].append(text)
1672
+
1673
+ elif "Precautionary" in heading or "P Statement" in info.get("Name", ""):
1674
+ for sv in val.get("StringWithMarkup", []):
1675
+ text = sv.get("String", "")
1676
+ if text:
1677
+ result["precautionary_statements"].append(text)
1678
+
1679
+ # Deduplicate
1680
+ result["pictograms"] = list(dict.fromkeys(result["pictograms"]))
1681
+ result["hazard_statements"] = list(dict.fromkeys(result["hazard_statements"]))
1682
+ result["precautionary_statements"] = list(dict.fromkeys(result["precautionary_statements"]))
1683
+
1684
+ return result
1685
+
1686
+
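+ # Usage sketch (illustrative; helper name is hypothetical): the intended
+ # fetch -> parse pipeline for GHS hazard data.
+ async def _example_ghs(cid: int) -> dict | None:
+     raw = await pubchem_get_ghs(cid)
+     return parse_ghs_data(raw) if raw else None
+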
1687
+ # =============================================================================
1688
+ # GNPS NPClassifier — Natural product classification (no auth)
1689
+ # =============================================================================
1690
+
1691
+
1692
+ async def gnps_classify_compound(smiles: str) -> dict | None:
1693
+ """Classify a compound into natural product classes using GNPS NPClassifier.
1694
+
1695
+ Returns pathway, superclass, class, and isglycoside prediction.
1696
+ """
1697
+ try:
1698
+ async with _http() as client:
1699
+ resp = await client.get(
1700
+ f"{NPCLASSIFIER_BASE}/classify",
1701
+ params={"smiles": smiles},
1702
+ timeout=30,
1703
+ )
1704
+ if resp.status_code == 200:
1705
+ return resp.json()
1706
+ except Exception as e:
1707
+ logger.warning(f"NPClassifier failed: {e}")
1708
+ return None
1709
+
1710
+
1711
+ # =============================================================================
1712
+ # OpenAlex Sources — Journal metrics (extends existing OpenAlex client)
1713
+ # =============================================================================
1714
+
1715
+
1716
+ async def openalex_get_source(source_id: str) -> dict | None:
1717
+ """Get journal/source details from OpenAlex.
1718
+
1719
+ source_id: OpenAlex source ID (e.g., 'S137773608') or ISSN
1720
+ """
1721
+     if source_id.startswith("S") or source_id.startswith("https://"):
1722
+         return await _get(
1723
+             f"{OPENALEX_BASE}/sources/{source_id}", params=_oa_params()
1724
+         )
1725
+     # ISSN lookup
1726
+     return await _get(
1727
+         f"{OPENALEX_BASE}/sources/issn:{source_id}", params=_oa_params()
1728
+     )
1733
+
1734
+
1735
+ async def openalex_search_sources(
1736
+ query: str,
1737
+ limit: int = 10,
1738
+ ) -> dict | None:
1739
+ """Search for journals/sources by name in OpenAlex."""
1740
+     return await _get(
1741
+         f"{OPENALEX_BASE}/sources",
1742
+         params=_oa_params(search=query, per_page=limit),
1743
+     )