crossref-local 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,264 @@
1
+ """Remote API client for crossref_local.
2
+
3
+ Connects to a CrossRef Local API server instead of direct database access.
4
+ Use this when the database is on a remote server accessible via HTTP.
5
+ """
6
+
7
+ import json
8
+ import urllib.request
9
+ import urllib.parse
10
+ import urllib.error
11
+ from typing import List, Optional, Dict, Any
12
+
13
+ from .models import Work, SearchResult
14
+
15
+
16
class RemoteClient:
    """
    HTTP client for CrossRef Local API server.

    Provides the same interface as the local API but connects
    to a remote server via HTTP.

    Example:
        >>> client = RemoteClient("http://localhost:3333")
        >>> results = client.search(title="machine learning", limit=10)
        >>> work = client.get("10.1038/nature12373")
    """

    def __init__(self, base_url: str = "http://localhost:3333", timeout: int = 30):
        """
        Initialize remote client.

        Args:
            base_url: API server URL (default: http://localhost:3333).
                Trailing slashes are stripped so endpoints can be appended.
            timeout: Request timeout in seconds
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    @staticmethod
    def _to_work(item: Dict[str, Any], fallback_doi: str = "") -> "Work":
        """Build a Work from one JSON record returned by the server.

        Args:
            item: Decoded JSON object describing a single work.
            fallback_doi: DOI to use when the record carries no "doi" key.

        Returns:
            Work populated from the record; missing keys fall back to the
            same defaults for every endpoint.
        """
        return Work(
            doi=item.get("doi", fallback_doi),
            title=item.get("title", ""),
            authors=item.get("authors", []),
            year=item.get("year"),
            journal=item.get("journal"),
            volume=item.get("volume"),
            issue=item.get("issue"),
            # Accept either spelling of the page field.
            page=item.get("page") or item.get("pages"),
            abstract=item.get("abstract"),
            citation_count=item.get("citation_count"),
        )

    def _request(
        self, endpoint: str, params: Optional[Dict[str, Any]] = None
    ) -> Optional[Dict]:
        """Make HTTP GET request to API.

        Args:
            endpoint: Path appended to ``base_url`` (must start with "/").
            params: Query parameters; entries whose value is None are dropped.

        Returns:
            Decoded JSON body, or None when the server answers 404.

        Raises:
            ConnectionError: On any other HTTP error status, or when the
                server cannot be reached.
        """
        url = f"{self.base_url}{endpoint}"
        # Drop unset parameters so they are not sent as the string "None".
        query = {k: v for k, v in (params or {}).items() if v is not None}
        if query:
            url = f"{url}?{urllib.parse.urlencode(query)}"

        try:
            req = urllib.request.Request(url)
            req.add_header("Accept", "application/json")
            with urllib.request.urlopen(req, timeout=self.timeout) as response:
                return json.loads(response.read().decode("utf-8"))
        except urllib.error.HTTPError as e:
            # 404 is reported as "no data" rather than as a failure.
            if e.code == 404:
                return None
            raise ConnectionError(f"API request failed: {e.code} {e.reason}") from e
        except urllib.error.URLError as e:
            raise ConnectionError(
                f"Cannot connect to API at {self.base_url}: {e.reason}"
            ) from e

    def health(self) -> Optional[Dict]:
        """Check API server health.

        Returns:
            The server's /health payload, or None on a 404 response.
        """
        return self._request("/health")

    def info(self) -> Dict:
        """Get database/API information.

        Returns:
            Dict combining the root endpoint (version, status) with the
            /info counters. Missing endpoints yield "unknown" / 0 values.
        """
        # _request returns None on 404; normalise to {} so .get() is safe.
        root = self._request("/") or {}
        info_data = self._request("/info") or {}
        return {
            "api_url": self.base_url,
            "api_version": root.get("version", "unknown"),
            "status": root.get("status", "unknown"),
            "mode": "remote",
            "works": info_data.get("total_papers", 0),
            "fts_indexed": info_data.get("fts_indexed", 0),
            "citations": info_data.get("citations", 0),
        }

    def search(
        self,
        query: Optional[str] = None,
        doi: Optional[str] = None,
        title: Optional[str] = None,
        authors: Optional[str] = None,
        year: Optional[int] = None,
        limit: int = 10,
        offset: int = 0,
    ) -> "SearchResult":
        """
        Search for papers.

        Args:
            query: Full-text search query (searches title by default)
            doi: Search by DOI. Used for a direct lookup when neither
                ``query`` nor ``title`` is given.
            title: Search by title (explicit); used when ``query`` is empty
            authors: Search by author name. Accepted for interface
                compatibility; not sent to the /works endpoint.
            year: Filter by publication year. Accepted for interface
                compatibility; not sent to the /works endpoint.
            limit: Maximum results (default: 10, max: 100)
            offset: Skip first N results for pagination

        Returns:
            SearchResult with matching works

        Raises:
            ConnectionError: If the server is unreachable or rejects the
                request.
        """
        search_query = query or title
        # Label echoed back in the result, identical on every return path.
        label = query or title or doi or ""

        # A DOI-only search has no text for /works?q=...; resolve it directly.
        if not search_query and doi:
            found = self.get(doi)
            works = [found] if found else []
            return SearchResult(
                works=works, total=len(works), query=label, elapsed_ms=0.0
            )

        params = {
            "q": search_query,
            "limit": min(limit, 100),
            "offset": offset,
        }

        data = self._request("/works", params)

        if not data:
            return SearchResult(works=[], total=0, query=label, elapsed_ms=0.0)

        works = [self._to_work(item) for item in data.get("results", [])]

        return SearchResult(
            works=works,
            total=data.get("total", len(works)),
            query=label,
            elapsed_ms=data.get("elapsed_ms", 0.0),
        )

    def get(self, doi: str) -> Optional["Work"]:
        """
        Get a work by DOI.

        Args:
            doi: Digital Object Identifier

        Returns:
            Work object or None if not found
        """
        # Percent-encode the DOI so characters such as '#', '?' or spaces
        # stay part of the path; '/' is kept because DOIs contain slashes.
        path = urllib.parse.quote(doi, safe="/")
        data = self._request(f"/works/{path}")
        if not data or "error" in data:
            return None

        return self._to_work(data, fallback_doi=doi)

    def get_many(self, dois: List[str]) -> List["Work"]:
        """
        Get multiple works by DOI using batch endpoint.

        Falls back to one request per DOI when the batch call fails.

        Args:
            dois: List of DOIs

        Returns:
            List of Work objects (DOIs that are not found are omitted)
        """
        if not dois:
            # Nothing to look up; skip the network round trip.
            return []

        try:
            req_data = json.dumps({"dois": dois}).encode("utf-8")
            req = urllib.request.Request(
                f"{self.base_url}/works/batch", data=req_data, method="POST"
            )
            req.add_header("Content-Type", "application/json")
            req.add_header("Accept", "application/json")

            with urllib.request.urlopen(req, timeout=self.timeout) as response:
                result = json.loads(response.read().decode("utf-8"))

            return [self._to_work(item) for item in result.get("results", [])]
        except Exception:
            # Deliberately best-effort: any batch failure (endpoint missing,
            # bad payload, ...) falls back to individual lookups, which
            # raise ConnectionError themselves if the server is unreachable.
            works = []
            for doi in dois:
                work = self.get(doi)
                if work:
                    works.append(work)
            return works

    def exists(self, doi: str) -> bool:
        """Check if a DOI exists."""
        return self.get(doi) is not None

    def get_citations(self, doi: str, direction: str = "both") -> Dict:
        """
        Get citations for a paper.

        Args:
            doi: Paper DOI
            direction: 'citing', 'cited_by', or 'both'

        Returns:
            Dict with citation information; empty when the server answers
            404 or returns an empty payload.
        """
        params = {"doi": doi, "direction": direction}
        return self._request("/api/citations/", params) or {}

    def get_journal(
        self, issn: Optional[str] = None, name: Optional[str] = None
    ) -> Dict:
        """
        Get journal information.

        Args:
            issn: Journal ISSN
            name: Journal name

        Returns:
            Dict with journal information; empty when the server answers
            404 or returns an empty payload.
        """
        params = {"issn": issn, "name": name}
        return self._request("/api/journal/", params) or {}
247
+
248
+
249
+ # Module-level client for convenience
250
+ _client: Optional[RemoteClient] = None
251
+
252
+
253
def get_client(base_url: str = "http://localhost:3333") -> RemoteClient:
    """Get or create singleton remote client.

    Args:
        base_url: API server URL. A trailing slash is ignored when deciding
            whether the cached client can be reused.

    Returns:
        The cached RemoteClient, replaced by a new one when ``base_url``
        points at a different server than the cached client.
    """
    global _client
    # RemoteClient stores base_url without trailing slashes; compare the
    # normalised form so "http://host/" does not rebuild the client each call.
    normalized = base_url.rstrip("/")
    if _client is None or _client.base_url != normalized:
        _client = RemoteClient(base_url)
    return _client
259
+
260
+
261
def reset_client() -> None:
    """Discard the cached module-level client.

    The next call to ``get_client`` will construct a fresh instance.
    """
    global _client
    _client = None
@@ -0,0 +1,352 @@
1
+ """FastAPI server for CrossRef Local with FTS5 search.
2
+
3
+ This server provides proper full-text search using FTS5 index,
4
+ unlike the Django API which only scans a limited subset.
5
+
6
+ Usage:
7
+ crossref-local api # Run on default port 3333
8
+ crossref-local api --port 8080 # Custom port
9
+
10
+ # Or directly:
11
+ uvicorn crossref_local.server:app --host 0.0.0.0 --port 3333
12
+ """
13
+
14
+ import time
15
+ from typing import Optional, List
16
+
17
+ from fastapi import FastAPI, Query, HTTPException
18
+ from fastapi.middleware.cors import CORSMiddleware
19
+ from pydantic import BaseModel
20
+
21
+ from . import fts
22
+ from .db import get_db
23
+ from .models import Work
24
+
25
# ASGI application object; all routes in this module are registered on it.
app = FastAPI(
    title="CrossRef Local API",
    description="Fast full-text search across 167M+ scholarly works",
    version="1.1.0",
)

# CORS middleware: every origin, method, and header is allowed.
# NOTE(review): wide-open CORS is presumably meant for local/trusted
# deployments — confirm before exposing the server publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
38
+
39
+
40
# Response schema for a single scholarly work.
# Only `doi` is required; every other field is optional and defaults to
# None (or an empty list for `authors`).
class WorkResponse(BaseModel):
    doi: str
    title: Optional[str] = None
    authors: List[str] = []
    year: Optional[int] = None
    journal: Optional[str] = None
    issn: Optional[str] = None
    volume: Optional[str] = None
    issue: Optional[str] = None
    page: Optional[str] = None
    abstract: Optional[str] = None
    citation_count: Optional[int] = None
52
+
53
+
54
# Response schema for GET /works.
class SearchResponse(BaseModel):
    query: str  # the search string as received
    total: int  # total reported by the search backend
    returned: int  # number of entries in `results`
    elapsed_ms: float  # server-side search time in milliseconds
    results: List[WorkResponse]
60
+
61
+
62
# Response schema for GET /info: static service identity plus row counts.
class InfoResponse(BaseModel):
    name: str = "CrossRef Local API"
    version: str = "1.1.0"
    status: str = "running"
    mode: str = "local"
    total_papers: int  # row count of the `works` table
    fts_indexed: int  # row count of `works_fts` (0 if the query fails)
    citations: int  # row count of `citations` (0 if the query fails)
    database_path: str
71
+
72
+
73
@app.get("/")
def root():
    """Describe the service and list its main endpoints."""
    endpoint_map = {
        "health": "/health",
        "info": "/info",
        "search": "/works?q=<query>",
        "get_by_doi": "/works/{doi}",
        "batch": "/works/batch",
    }
    payload = {
        "name": "CrossRef Local API",
        "version": "1.1.0",
        "status": "running",
    }
    payload["endpoints"] = endpoint_map
    return payload
88
+
89
+
90
@app.get("/health")
def health():
    """Report service liveness and whether a database handle is available."""
    database = get_db()
    connected = database is not None
    # Path is only reported when the handle is truthy.
    path = str(database.db_path) if database else None
    return {
        "status": "healthy",
        "database_connected": connected,
        "database_path": path,
    }
99
+
100
+
101
@app.get("/info", response_model=InfoResponse)
def info():
    """Return row counts for the main tables together with the database path."""
    database = get_db()

    def _count(sql):
        # Run a COUNT(*) query; a missing row counts as zero.
        record = database.fetchone(sql)
        return record["count"] if record else 0

    def _count_or_zero(sql):
        # Optional tables: any failure is reported as a zero count.
        try:
            return _count(sql)
        except Exception:
            return 0

    total_works = _count("SELECT COUNT(*) as count FROM works")
    indexed = _count_or_zero("SELECT COUNT(*) as count FROM works_fts")
    cited = _count_or_zero("SELECT COUNT(*) as count FROM citations")

    return InfoResponse(
        total_papers=total_works,
        fts_indexed=indexed,
        citations=cited,
        database_path=str(database.db_path),
    )
127
+
128
+
129
@app.get("/works", response_model=SearchResponse)
def search_works(
    q: str = Query(..., description="Search query (FTS5 syntax supported)"),
    limit: int = Query(10, ge=1, le=100, description="Max results"),
    offset: int = Query(0, ge=0, description="Skip first N results"),
):
    """
    Full-text search across works.

    Uses FTS5 index for fast searching across titles, abstracts, and authors.
    Supports FTS5 query syntax like AND, OR, NOT, "exact phrases".

    Examples:
        /works?q=machine learning
        /works?q="neural network" AND hippocampus
        /works?q=CRISPR&limit=20

    Raises:
        HTTPException: 400 when the search backend raises for the query.
    """
    start = time.perf_counter()

    try:
        results = fts.search(q, limit=limit, offset=offset)
    except Exception as e:
        # Chain the original error so the root cause stays in the traceback.
        raise HTTPException(status_code=400, detail=f"Search error: {e}") from e

    # Only the search call itself is timed, not response serialisation.
    elapsed_ms = (time.perf_counter() - start) * 1000

    return SearchResponse(
        query=q,
        total=results.total,
        returned=len(results.works),
        elapsed_ms=round(elapsed_ms, 2),
        results=[
            WorkResponse(
                doi=w.doi,
                title=w.title,
                authors=w.authors,
                year=w.year,
                journal=w.journal,
                issn=w.issn,
                volume=w.volume,
                issue=w.issue,
                page=w.page,
                abstract=w.abstract,
                citation_count=w.citation_count,
            )
            for w in results.works
        ],
    )
177
+
178
+
179
@app.get("/works/{doi:path}", response_model=Optional[WorkResponse])
def get_work(doi: str):
    """
    Look up a single work by its DOI.

    Examples:
        /works/10.1038/nature12373
        /works/10.1016/j.cell.2020.01.001

    Raises:
        HTTPException: 404 when the database has no metadata for the DOI.
    """
    record = get_db().get_metadata(doi)
    if record is None:
        raise HTTPException(status_code=404, detail=f"DOI not found: {doi}")

    found = Work.from_metadata(doi, record)

    # Copy the Work attributes that the response schema exposes.
    exposed = (
        "doi",
        "title",
        "authors",
        "year",
        "journal",
        "issn",
        "volume",
        "issue",
        "page",
        "abstract",
        "citation_count",
    )
    return WorkResponse(**{name: getattr(found, name) for name in exposed})
209
+
210
+
211
# Request body for POST /works/batch, e.g. {"dois": ["10.1038/...", ...]}.
class BatchRequest(BaseModel):
    dois: List[str]
213
+
214
+
215
# Response schema for POST /works/batch.
class BatchResponse(BaseModel):
    requested: int  # number of DOIs in the request
    found: int  # number of DOIs that resolved to a work
    results: List[WorkResponse]
219
+
220
+
221
@app.post("/works/batch", response_model=BatchResponse)
def get_works_batch(request: BatchRequest):
    """
    Get multiple works by DOI.

    Request body: {"dois": ["10.1038/...", "10.1016/..."]}

    DOIs without metadata are skipped, so ``found`` may be smaller than
    ``requested``. Each result carries the same fields as /works/{doi}.
    """
    db = get_db()
    results = []

    for doi in request.dois:
        metadata = db.get_metadata(doi)
        if not metadata:
            continue
        work = Work.from_metadata(doi, metadata)
        results.append(
            WorkResponse(
                doi=work.doi,
                title=work.title,
                authors=work.authors,
                year=work.year,
                journal=work.journal,
                # issn/volume/issue/page are included so batch results
                # match the single-DOI and search endpoints.
                issn=work.issn,
                volume=work.volume,
                issue=work.issue,
                page=work.page,
                abstract=work.abstract,
                citation_count=work.citation_count,
            )
        )

    return BatchResponse(
        requested=len(request.dois),
        found=len(results),
        results=results,
    )
252
+
253
+
254
+ # For backwards compatibility with existing API endpoints
255
@app.get("/api/search/")
def api_search_compat(
    title: Optional[str] = None,
    q: Optional[str] = None,
    doi: Optional[str] = None,
    limit: int = 10,
):
    """Backwards-compatible search endpoint.

    Args:
        title: Search text; takes precedence over ``q``.
        q: Search text used when ``title`` is not given.
        doi: When present, performs a DOI lookup instead of a text search.
        limit: Maximum number of results; clamped to the 1..100 range that
            the /works endpoint enforces.

    Returns:
        Dict with "query", "results", "total" and "returned" keys.

    Raises:
        HTTPException: 400 when no search criterion is given or the search
            backend raises for the query.
    """
    query = title or q

    if doi:
        # DOI lookup; a 404 from get_work becomes an empty result set.
        try:
            work = get_work(doi)
            return {
                "query": {"doi": doi},
                "results": [work.model_dump()],
                "total": 1,
                "returned": 1,
            }
        except HTTPException:
            return {"query": {"doi": doi}, "results": [], "total": 0, "returned": 0}

    if not query:
        raise HTTPException(
            status_code=400, detail="Specify q, title, or doi parameter"
        )

    # Keep the page size within the bounds /works uses so a non-positive or
    # very large value is never handed to the search backend.
    limit = max(1, min(limit, 100))

    # Call fts.search directly (not the endpoint function)
    try:
        results = fts.search(query, limit=limit, offset=0)
    except Exception as e:
        # Same error mapping as /works: search failures are a 400.
        raise HTTPException(status_code=400, detail=f"Search error: {e}") from e

    return {
        "query": {
            "title": query,
            "doi": None,
            "year": None,
            "authors": None,
            "limit": limit,
        },
        "results": [
            WorkResponse(
                doi=w.doi,
                title=w.title,
                authors=w.authors,
                year=w.year,
                journal=w.journal,
                issn=w.issn,
                volume=w.volume,
                issue=w.issue,
                page=w.page,
                abstract=w.abstract,
                citation_count=w.citation_count,
            ).model_dump()
            for w in results.works
        ],
        "total": results.total,
        "returned": len(results.works),
    }
312
+
313
+
314
@app.get("/api/stats/")
def api_stats_compat():
    """Legacy stats endpoint: work count plus table and index names."""
    database = get_db()

    count_row = database.fetchone("SELECT COUNT(*) as count FROM works")
    total = count_row["count"] if count_row else 0

    table_names = [
        entry["name"]
        for entry in database.fetchall(
            "SELECT name FROM sqlite_master WHERE type='table'"
        )
    ]

    # Entries with an empty name are left out of the index list.
    index_names = [
        entry["name"]
        for entry in database.fetchall(
            "SELECT name FROM sqlite_master WHERE type='index'"
        )
        if entry["name"]
    ]

    # The remaining keys are fixed placeholders.
    return {
        "total_papers": total,
        "database_size_mb": None,
        "year_range": None,
        "total_journals": 0,
        "total_citations": None,
        "tables": table_names,
        "indices": index_names,
    }
342
+
343
+
344
def run_server(host: str = "0.0.0.0", port: int = 3333):
    """Serve ``app`` with uvicorn on the given host and port.

    Args:
        host: Interface to bind (the default listens on all interfaces).
        port: TCP port to listen on.
    """
    # Imported here so uvicorn is only needed when the server is started.
    from uvicorn import run as serve

    serve(app, host=host, port=port)
349
+
350
+
351
+ if __name__ == "__main__":
352
+ run_server()