crossref-local 0.3.0-py3-none-any.whl → 0.4.0-py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,413 @@
+ """MCP server for CrossRef Local - Claude integration.
+
+ This server exposes crossref-local functionality as MCP tools,
+ enabling Claude Desktop and other MCP clients to search academic papers.
+
+ Usage:
+     crossref-local serve                      # stdio (Claude Desktop)
+     crossref-local serve -t http --port 8082  # HTTP transport
+     crossref-local-mcp                        # Direct entry point
+ """
+
+ import json
+
+ from fastmcp import FastMCP
+
+
+ from . import (
+     get as _get,
+     info as _info,
+     search as _search,
+ )
+
+ # Initialize MCP server
+ mcp = FastMCP(
+     name="crossref-local",
+     instructions="Local CrossRef database with 167M+ works and full-text search. "
+     "Use search to find papers, search_by_doi for DOI lookup, enrich_dois to add "
+     "citation counts and references, and status for stats.",
+ )
+
+
+ @mcp.tool()
+ def search(
+     query: str,
+     limit: int = 10,
+     offset: int = 0,
+     with_abstracts: bool = False,
+ ) -> str:
+     """Search for academic works by title, abstract, or authors.
+
+     Uses the FTS5 full-text search index for fast searching across 167M+ papers.
+     Supports FTS5 query syntax: AND, OR, NOT, "exact phrases".
+
+     Args:
+         query: Search query (e.g., "machine learning", "CRISPR", "neural network AND hippocampus")
+         limit: Maximum number of results to return (default: 10, max: 100)
+         offset: Skip first N results for pagination (default: 0)
+         with_abstracts: Include abstracts in results (default: False)
+
+     Returns:
+         JSON string with search results including total count and matching works.
+
+     Examples:
+         search("machine learning")
+         search("CRISPR", limit=20)
+         search("neural network AND memory", with_abstracts=True)
+     """
+     results = _search(query, limit=min(limit, 100), offset=offset)
+
+     works_data = []
+     for work in results.works:
+         work_dict = {
+             "doi": work.doi,
+             "title": work.title,
+             "authors": work.authors,
+             "year": work.year,
+             "journal": work.journal,
+         }
+         if with_abstracts and work.abstract:
+             work_dict["abstract"] = work.abstract
+         works_data.append(work_dict)
+
+     return json.dumps(
+         {
+             "query": results.query,
+             "total": results.total,
+             "returned": len(works_data),
+             "elapsed_ms": round(results.elapsed_ms, 2),
+             "works": works_data,
+         },
+         indent=2,
+     )
+
+
+ @mcp.tool()
+ def search_by_doi(doi: str, as_citation: bool = False) -> str:
+     """Get detailed information about a work by DOI.
+
+     Args:
+         doi: Digital Object Identifier (e.g., "10.1038/nature12373")
+         as_citation: Return a formatted citation instead of full metadata
+
+     Returns:
+         JSON string with work metadata, or a formatted citation string.
+
+     Examples:
+         search_by_doi("10.1038/nature12373")
+         search_by_doi("10.1126/science.aax0758", as_citation=True)
+     """
+     work = _get(doi)
+
+     if work is None:
+         return json.dumps({"error": f"DOI not found: {doi}"})
+
+     if as_citation:
+         return work.citation()
+
+     return json.dumps(work.to_dict(), indent=2)
+
+
+ @mcp.tool()
+ def status() -> str:
+     """Get database statistics and status.
+
+     Returns:
+         JSON string with database path, work count, FTS index count, and citation count.
+     """
+     db_info = _info()
+     return json.dumps(db_info, indent=2)
+
+
+ @mcp.tool()
+ def enrich_dois(dois: list[str]) -> str:
+     """Enrich DOIs with full metadata including citation counts and references.
+
+     Use this after search() to get detailed metadata for papers.
+     The search() tool returns basic info (title, authors, year, journal).
+     This tool adds: citation_count, references, volume, issue, publisher, etc.
+
+     Typical workflow:
+         1. search("epilepsy seizure prediction") -> get DOIs
+         2. enrich_dois([doi1, doi2, ...]) -> get full metadata
+
+     Args:
+         dois: List of DOIs to enrich (e.g., ["10.1038/nature12373", "10.1126/science.aax0758"])
+
+     Returns:
+         JSON string with enriched works including citation_count and references.
+
+     Examples:
+         enrich_dois(["10.1038/nature12373"])
+         enrich_dois(["10.1038/s41467-017-02577-y", "10.1093/brain/aww019"])
+     """
+     from . import get_many as _get_many
+
+     works = _get_many(dois)
+
+     works_data = []
+     for work in works:
+         works_data.append(work.to_dict())
+
+     return json.dumps(
+         {
+             "requested": len(dois),
+             "found": len(works_data),
+             "works": works_data,
+         },
+         indent=2,
+     )
+
+
+ @mcp.tool()
+ def cache_create(
+     name: str,
+     query: str,
+     limit: int = 1000,
+ ) -> str:
+     """Create a paper cache from a search query.
+
+     Fetches full metadata for papers matching the query and saves it to a disk cache.
+     Use this to build a reusable paper collection for a research topic.
+
+     Args:
+         name: Cache name (e.g., "epilepsy", "alzheimers")
+         query: FTS search query
+         limit: Max papers to cache (default: 1000)
+
+     Returns:
+         JSON with cache info (path, paper count, size)
+
+     Example:
+         cache_create("epilepsy", "epilepsy seizure prediction", limit=500)
+     """
+     from . import cache
+
+     info = cache.create(name, query=query, limit=limit)
+     return json.dumps(info.to_dict(), indent=2)
+
+
+ @mcp.tool()
+ def cache_query(
+     name: str,
+     fields: list[str] | None = None,
+     include_abstract: bool = False,
+     include_references: bool = False,
+     include_citations: bool = False,
+     year_min: int | None = None,
+     year_max: int | None = None,
+     journal: str | None = None,
+     limit: int | None = None,
+ ) -> str:
+     """Query cached papers with field filtering.
+
+     Returns minimal data to reduce context usage. Specify only the fields you need.
+
+     Args:
+         name: Cache name
+         fields: Explicit field list (e.g., ["doi", "title", "year"])
+         include_abstract: Include abstract (default: False)
+         include_references: Include references list (default: False)
+         include_citations: Include citation_count (default: False)
+         year_min: Filter by minimum year
+         year_max: Filter by maximum year
+         journal: Filter by journal name (substring match)
+         limit: Max results to return
+
+     Returns:
+         JSON array of filtered papers
+
+     Examples:
+         cache_query("epilepsy", fields=["doi", "title", "year"])
+         cache_query("epilepsy", year_min=2020, include_citations=True, limit=50)
+     """
+     from . import cache
+
+     papers = cache.query(
+         name,
+         fields=fields,
+         include_abstract=include_abstract,
+         include_references=include_references,
+         include_citations=include_citations,
+         year_min=year_min,
+         year_max=year_max,
+         journal=journal,
+         limit=limit,
+     )
+     return json.dumps({"count": len(papers), "papers": papers}, indent=2)
+
+
+ @mcp.tool()
+ def cache_stats(name: str) -> str:
+     """Get cache statistics.
+
+     Returns the year distribution, top journals, and citation stats without loading the full data.
+
+     Args:
+         name: Cache name
+
+     Returns:
+         JSON with statistics (paper_count, year_range, top_journals, etc.)
+     """
+     from . import cache
+
+     stats = cache.stats(name)
+     return json.dumps(stats, indent=2)
+
+
+ @mcp.tool()
+ def cache_list() -> str:
+     """List all available caches.
+
+     Returns:
+         JSON array of cache info (name, path, paper_count, size)
+     """
+     from . import cache
+
+     caches = cache.list_caches()
+     return json.dumps([c.to_dict() for c in caches], indent=2)
+
+
+ @mcp.tool()
+ def cache_top_cited(
+     name: str,
+     n: int = 20,
+     year_min: int | None = None,
+     year_max: int | None = None,
+ ) -> str:
+     """Get the top cited papers from a cache.
+
+     Args:
+         name: Cache name
+         n: Number of papers to return
+         year_min: Filter by minimum year
+         year_max: Filter by maximum year
+
+     Returns:
+         JSON array of top cited papers
+     """
+     from .cache_viz import get_top_cited
+
+     papers = get_top_cited(name, n=n, year_min=year_min, year_max=year_max)
+     return json.dumps(papers, indent=2)
+
+
+ @mcp.tool()
+ def cache_citation_summary(name: str) -> str:
+     """Get citation statistics for cached papers.
+
+     Returns the mean, median, and max citations, plus counts of highly cited papers.
+
+     Args:
+         name: Cache name
+
+     Returns:
+         JSON with citation statistics
+     """
+     from .cache_viz import get_citation_summary
+
+     summary = get_citation_summary(name)
+     return json.dumps(summary, indent=2)
+
+
+ @mcp.tool()
+ def cache_plot_scatter(
+     name: str,
+     output: str,
+     top_n: int = 10,
+ ) -> str:
+     """Generate a year vs. citations scatter plot.
+
+     Saves the plot to a file and returns the top cited papers.
+
+     Args:
+         name: Cache name
+         output: Output file path (png/pdf/svg)
+         top_n: Number of top papers to label on the plot
+
+     Returns:
+         JSON with output path and top papers list
+     """
+     from .cache_viz import plot_year_citations
+
+     result = plot_year_citations(name, output=output, top_n=top_n)
+     return json.dumps(result, indent=2)
+
+
+ @mcp.tool()
+ def cache_plot_network(
+     name: str,
+     output: str,
+     max_nodes: int = 100,
+ ) -> str:
+     """Generate a citation network visualization.
+
+     Creates an interactive HTML graph showing citation relationships.
+
+     Args:
+         name: Cache name
+         output: Output HTML file path
+         max_nodes: Maximum papers to include
+
+     Returns:
+         JSON with network stats
+     """
+     from .cache_viz import plot_citation_network
+
+     result = plot_citation_network(name, output=output, max_nodes=max_nodes)
+     return json.dumps(result, indent=2)
+
+
+ @mcp.tool()
+ def cache_export(
+     name: str,
+     output_path: str,
+     format: str = "json",
+     fields: list[str] | None = None,
+ ) -> str:
+     """Export a cache to a file.
+
+     Args:
+         name: Cache name
+         output_path: Output file path
+         format: Export format (json, csv, bibtex, dois)
+         fields: Fields to include (for json/csv)
+
+     Returns:
+         JSON with output path
+     """
+     from . import cache
+
+     path = cache.export(name, output_path, format=format, fields=fields)
+     return json.dumps({"exported": path, "format": format})
+
+
+ def run_server(
+     transport: str = "stdio",
+     host: str = "localhost",
+     port: int = 8082,
+ ) -> None:
+     """Run the MCP server.
+
+     Args:
+         transport: Transport protocol ("stdio", "sse", or "http")
+         host: Host for HTTP/SSE transport
+         port: Port for HTTP/SSE transport
+     """
+     if transport == "stdio":
+         mcp.run(transport="stdio")
+     elif transport == "sse":
+         mcp.run(transport="sse", host=host, port=port)
+     elif transport == "http":
+         mcp.run(transport="streamable-http", host=host, port=port)
+     else:
+         raise ValueError(f"Unknown transport: {transport}")
+
+
+ def main() -> None:
+     """Entry point for the crossref-local-mcp command."""
+     run_server(transport="stdio")
+
+
+ if __name__ == "__main__":
+     main()
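
The file above registers every tool on the FastMCP instance and leaves transport selection to run_server. As a quick sanity check, a minimal launch sketch; note the module path crossref_local.mcp_server is an assumption, since the diff does not name the new file:

    # Minimal sketch: start the MCP server over HTTP, mirroring
    # `crossref-local serve -t http --port 8082`.
    # ASSUMPTION: the new module is importable as crossref_local.mcp_server;
    # the diff does not show the file's path.
    from crossref_local.mcp_server import run_server

    run_server(transport="http", host="localhost", port=8082)

Calling run_server() with no arguments gives the stdio transport that Claude Desktop expects; the crossref-local-mcp console script does exactly that via main().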
crossref_local/models.py CHANGED
File without changes
@@ -0,0 +1,269 @@
+ """Remote API client for crossref_local.
+
+ Connects to a CrossRef Local API server instead of direct database access.
+ Use this when the database is on a remote server accessible via HTTP.
+ """
+
+ import json
+ import urllib.error
+ import urllib.parse
+ import urllib.request
+ from typing import Any, Dict, List, Optional
+
+ from .models import Work, SearchResult
+
+
+ class RemoteClient:
+     """
+     HTTP client for the CrossRef Local API server.
+
+     Provides the same interface as the local API but connects
+     to a remote server via HTTP.
+
+     Example:
+         >>> client = RemoteClient("http://localhost:3333")
+         >>> results = client.search(title="machine learning", limit=10)
+         >>> work = client.get("10.1038/nature12373")
+     """
+
+     def __init__(self, base_url: str = "http://localhost:3333", timeout: int = 30):
+         """
+         Initialize the remote client.
+
+         Args:
+             base_url: API server URL (default: http://localhost:3333)
+             timeout: Request timeout in seconds
+         """
+         self.base_url = base_url.rstrip("/")
+         self.timeout = timeout
+
+     def _request(self, endpoint: str, params: Optional[Dict[str, Any]] = None) -> Optional[Dict]:
+         """Make an HTTP GET request to the API; returns None on 404."""
+         url = f"{self.base_url}{endpoint}"
+         if params:
+             # Filter out None values
+             params = {k: v for k, v in params.items() if v is not None}
+             if params:
+                 url = f"{url}?{urllib.parse.urlencode(params)}"
+
+         try:
+             req = urllib.request.Request(url)
+             req.add_header("Accept", "application/json")
+             with urllib.request.urlopen(req, timeout=self.timeout) as response:
+                 return json.loads(response.read().decode("utf-8"))
+         except urllib.error.HTTPError as e:
+             if e.code == 404:
+                 return None
+             raise ConnectionError(f"API request failed: {e.code} {e.reason}") from e
+         except urllib.error.URLError as e:
+             raise ConnectionError(
+                 f"Cannot connect to API at {self.base_url}: {e.reason}"
+             ) from e
+
+     def health(self) -> Optional[Dict]:
+         """Check API server health."""
+         return self._request("/health")
+
+     def info(self) -> Dict:
+         """Get database/API information."""
+         root = self._request("/") or {}
+         info_data = self._request("/info")
+         return {
+             "api_url": self.base_url,
+             "api_version": root.get("version", "unknown"),
+             "status": root.get("status", "unknown"),
+             "mode": "remote",
+             "works": info_data.get("total_papers", 0) if info_data else 0,
+             "fts_indexed": info_data.get("fts_indexed", 0) if info_data else 0,
+             "citations": info_data.get("citations", 0) if info_data else 0,
+         }
+
+     def search(
+         self,
+         query: Optional[str] = None,
+         doi: Optional[str] = None,
+         title: Optional[str] = None,
+         authors: Optional[str] = None,
+         year: Optional[int] = None,
+         limit: int = 10,
+         offset: int = 0,
+     ) -> SearchResult:
+         """
+         Search for papers.
+
+         Args:
+             query: Full-text search query (searches title by default)
+             doi: Search by DOI
+             title: Search by title (explicit)
+             authors: Search by author name
+             year: Filter by publication year
+             limit: Maximum results (default: 10, max: 100)
+             offset: Skip first N results for pagination
+
+         Returns:
+             SearchResult with matching works
+         """
+         # The /works endpoint takes an FTS5 query; doi/authors/year are not yet forwarded
+         search_query = query or title
+
+         params = {
+             "q": search_query,
+             "limit": min(limit, 100),
+             "offset": offset,
+         }
+
+         data = self._request("/works", params)
+
+         if not data:
+             return SearchResult(works=[], total=0, query=query or "", elapsed_ms=0.0)
+
+         works = []
+         for item in data.get("results", []):
+             work = Work(
+                 doi=item.get("doi", ""),
+                 title=item.get("title", ""),
+                 authors=item.get("authors", []),
+                 year=item.get("year"),
+                 journal=item.get("journal"),
+                 volume=item.get("volume"),
+                 issue=item.get("issue"),
+                 page=item.get("page") or item.get("pages"),
+                 abstract=item.get("abstract"),
+                 citation_count=item.get("citation_count"),
+             )
+             works.append(work)
+
+         return SearchResult(
+             works=works,
+             total=data.get("total", len(works)),
+             query=query or title or doi or "",
+             elapsed_ms=data.get("elapsed_ms", 0.0),
+         )
+
+     def get(self, doi: str) -> Optional[Work]:
+         """
+         Get a work by DOI.
+
+         Args:
+             doi: Digital Object Identifier
+
+         Returns:
+             Work object or None if not found
+         """
+         # Use the /works/{doi} endpoint directly
+         data = self._request(f"/works/{doi}")
+         if not data or "error" in data:
+             return None
+
+         return Work(
+             doi=data.get("doi", doi),
+             title=data.get("title", ""),
+             authors=data.get("authors", []),
+             year=data.get("year"),
+             journal=data.get("journal"),
+             volume=data.get("volume"),
+             issue=data.get("issue"),
+             page=data.get("page"),
+             abstract=data.get("abstract"),
+             citation_count=data.get("citation_count"),
+         )
+
+     def get_many(self, dois: List[str]) -> List[Work]:
+         """
+         Get multiple works by DOI using the batch endpoint.
+
+         Args:
+             dois: List of DOIs
+
+         Returns:
+             List of Work objects
+         """
+         # Use the batch endpoint if available
+         try:
+             data = {"dois": dois}
+             req_data = json.dumps(data).encode("utf-8")
+             req = urllib.request.Request(
+                 f"{self.base_url}/works/batch", data=req_data, method="POST"
+             )
+             req.add_header("Content-Type", "application/json")
+             req.add_header("Accept", "application/json")
+
+             with urllib.request.urlopen(req, timeout=self.timeout) as response:
+                 result = json.loads(response.read().decode("utf-8"))
+
+             works = []
+             for item in result.get("results", []):
+                 work = Work(
+                     doi=item.get("doi", ""),
+                     title=item.get("title", ""),
+                     authors=item.get("authors", []),
+                     year=item.get("year"),
+                     journal=item.get("journal"),
+                     volume=item.get("volume"),
+                     issue=item.get("issue"),
+                     page=item.get("page"),
+                     abstract=item.get("abstract"),
+                     citation_count=item.get("citation_count"),
+                 )
+                 works.append(work)
+             return works
+         except Exception:
+             # Fall back to individual lookups
+             works = []
+             for doi in dois:
+                 work = self.get(doi)
+                 if work:
+                     works.append(work)
+             return works
+
+     def exists(self, doi: str) -> bool:
+         """Check if a DOI exists."""
+         return self.get(doi) is not None
+
+     def get_citations(self, doi: str, direction: str = "both") -> Dict:
+         """
+         Get citations for a paper.
+
+         Args:
+             doi: Paper DOI
+             direction: 'citing', 'cited_by', or 'both'
+
+         Returns:
+             Dict with citation information
+         """
+         params = {"doi": doi, "direction": direction}
+         return self._request("/api/citations/", params) or {}
+
+     def get_journal(
+         self, issn: Optional[str] = None, name: Optional[str] = None
+     ) -> Dict:
+         """
+         Get journal information.
+
+         Args:
+             issn: Journal ISSN
+             name: Journal name
+
+         Returns:
+             Dict with journal information
+         """
+         params = {"issn": issn, "name": name}
+         return self._request("/api/journal/", params) or {}
+
+
+ # Module-level client for convenience
+ _client: Optional[RemoteClient] = None
+
+
+ def get_client(base_url: str = "http://localhost:3333") -> RemoteClient:
+     """Get or create the singleton remote client."""
+     global _client
+     if _client is None or _client.base_url != base_url.rstrip("/"):
+         _client = RemoteClient(base_url)
+     return _client
+
+
+ def reset_client() -> None:
+     """Reset the singleton client."""
+     global _client
+     _client = None
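
Together with get_client, the client reads naturally in a few lines. A usage sketch, assuming the new module is importable as crossref_local.remote (the diff omits the file name):

    # Usage sketch for RemoteClient; the module path is an assumption.
    from crossref_local.remote import get_client

    client = get_client("http://localhost:3333")
    results = client.search(query="seizure prediction", limit=5)
    print(results.total, "matches in", results.elapsed_ms, "ms")
    for work in results.works:
        print(work.doi, "-", work.title)

    work = client.get("10.1038/nature12373")  # returns None if the DOI 404s
    if work is not None:
        print(work.citation_count)

Because get_client caches a singleton keyed on base_url, repeated calls reuse the same client; reset_client() clears it, which is handy in tests.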