crossref-local 0.3.0-py3-none-any.whl → 0.4.0-py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
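In practical terms, 0.4.0 adds two new modules: a `cache` module that persists full paper metadata to disk so follow-up queries can be answered locally with only the fields needed, and a `cache_export` helper that writes a cache out as JSON, CSV, BibTeX, or a plain DOI list. A minimal sketch of the intended workflow, pieced together from the docstrings in the diff below (the cache name "epilepsy" and the query string are illustrative only):

    from crossref_local import cache

    # One-time: run the FTS search and persist full metadata to
    # ~/.cache/crossref-local/epilepsy.json (the location can be
    # overridden via the CROSSREF_LOCAL_CACHE_DIR environment variable).
    cache.create("epilepsy", query="epilepsy seizure prediction", limit=100)

    # Later: re-query from disk, projecting only the fields needed.
    papers = cache.query("epilepsy", fields=["doi", "title", "year"])

    # Summary statistics: year/journal distributions, abstract coverage.
    print(cache.stats("epilepsy")["paper_count"])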
--- /dev/null
+++ b/crossref_local/cache.py
@@ -0,0 +1,466 @@
+ """Cache module for crossref-local.
+
+ Provides disk-based caching of paper metadata to reduce context usage
+ and enable efficient re-querying with field filtering.
+
+ Architecture:
+     1. FTS search -> DOIs (fast, minimal)
+     2. Cache DOIs -> full metadata saved to disk
+     3. Query cache -> filtered fields based on need
+
+ Usage:
+     >>> from crossref_local import cache
+     >>> # Create cache from search
+     >>> cache.create("epilepsy", query="epilepsy seizure prediction", limit=100)
+     >>> # Query with minimal fields
+     >>> papers = cache.query("epilepsy", fields=["doi", "title", "year"])
+     >>> # Get statistics
+     >>> stats = cache.stats("epilepsy")
+ """
+
+ import json
+ import os
+ import time
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ from .api import get_many, search
+
+
+ def _get_cache_dir() -> Path:
+     """Get cache directory, creating if needed."""
+     cache_dir = Path(
+         os.environ.get(
+             "CROSSREF_LOCAL_CACHE_DIR", Path.home() / ".cache" / "crossref-local"
+         )
+     )
+     cache_dir.mkdir(parents=True, exist_ok=True)
+     return cache_dir
+
+
+ def _cache_path(name: str) -> Path:
+     """Get path for a named cache."""
+     return _get_cache_dir() / f"{name}.json"
+
+
+ def _meta_path(name: str) -> Path:
+     """Get path for cache metadata."""
+     return _get_cache_dir() / f"{name}.meta.json"
+
+
+ @dataclass
+ class CacheInfo:
+     """Information about a cache."""
+
+     name: str
+     path: str
+     size_bytes: int
+     paper_count: int
+     created_at: str
+     query: Optional[str] = None
+
+     def to_dict(self) -> dict:
+         return {
+             "name": self.name,
+             "path": self.path,
+             "size_bytes": self.size_bytes,
+             "size_mb": round(self.size_bytes / 1024 / 1024, 2),
+             "paper_count": self.paper_count,
+             "created_at": self.created_at,
+             "query": self.query,
+         }
+
+
+ def create(
+     name: str,
+     query: Optional[str] = None,
+     dois: Optional[List[str]] = None,
+     papers: Optional[List[Dict[str, Any]]] = None,
+     limit: int = 1000,
+     offset: int = 0,
+ ) -> CacheInfo:
+     """Create a cache from search query, DOI list, or pre-fetched papers.
+
+     Args:
+         name: Cache name (used as filename)
+         query: FTS search query (if dois/papers not provided)
+         dois: Explicit list of DOIs to cache
+         papers: Pre-fetched paper dicts (skips API calls)
+         limit: Max papers to fetch (for query mode)
+         offset: Offset for pagination (for query mode)
+
+     Returns:
+         CacheInfo with cache details
+
+     Example:
+         >>> create("epilepsy", query="epilepsy seizure", limit=500)
+         >>> create("my_papers", dois=["10.1038/nature12373", ...])
+         >>> create("imported", papers=[{"doi": "...", "title": "..."}])
+     """
+     if papers is not None:
+         # Use pre-fetched papers directly
+         pass
+     elif dois is None and query is None:
+         raise ValueError("Must provide 'query', 'dois', or 'papers'")
+     elif dois is None:
+         # Get DOIs from search
+         results = search(query, limit=limit, offset=offset)
+         dois = [w.doi for w in results.works]
+         # Fetch full metadata
+         works = get_many(dois)
+         papers = [w.to_dict() for w in works]
+     else:
+         # Fetch full metadata for DOIs
+         works = get_many(dois)
+         papers = [w.to_dict() for w in works]
+
+     # Save cache
+     cache_file = _cache_path(name)
+     with open(cache_file, "w") as f:
+         json.dump(papers, f)
+
+     # Save metadata
+     meta = {
+         "name": name,
+         "query": query,
+         "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),
+         "paper_count": len(papers),
+         "dois_requested": len(dois) if dois else len(papers),
+     }
+     with open(_meta_path(name), "w") as f:
+         json.dump(meta, f, indent=2)
+
+     return CacheInfo(
+         name=name,
+         path=str(cache_file),
+         size_bytes=cache_file.stat().st_size,
+         paper_count=len(papers),
+         created_at=meta["created_at"],
+         query=query,
+     )
+
+
+ def append(
+     name: str,
+     query: Optional[str] = None,
+     dois: Optional[List[str]] = None,
+     limit: int = 1000,
+     offset: int = 0,
+ ) -> CacheInfo:
+     """Append papers to existing cache.
+
+     Args:
+         name: Existing cache name
+         query: FTS search query (if dois not provided)
+         dois: Explicit list of DOIs to add
+         limit: Max papers to fetch (for query mode)
+         offset: Offset for pagination (for query mode)
+
+     Returns:
+         Updated CacheInfo
+     """
+     if not exists(name):
+         return create(name, query=query, dois=dois, limit=limit, offset=offset)
+
+     # Load existing
+     existing = load(name)
+     existing_dois = {p["doi"] for p in existing}
+
+     # Get new DOIs
+     if dois is None and query is not None:
+         results = search(query, limit=limit, offset=offset)
+         dois = [w.doi for w in results.works]
+     elif dois is None:
+         raise ValueError("Must provide either 'query' or 'dois'")
+
+     # Filter out already cached
+     new_dois = [d for d in dois if d not in existing_dois]
+
+     if new_dois:
+         # Fetch new metadata
+         new_works = get_many(new_dois)
+         new_papers = [w.to_dict() for w in new_works]
+
+         # Combine and save
+         all_papers = existing + new_papers
+         cache_file = _cache_path(name)
+         with open(cache_file, "w") as f:
+             json.dump(all_papers, f)
+
+         # Update metadata
+         meta_file = _meta_path(name)
+         if meta_file.exists():
+             with open(meta_file) as f:
+                 meta = json.load(f)
+         else:
+             meta = {"name": name}
+
+         meta["updated_at"] = time.strftime("%Y-%m-%d %H:%M:%S")
+         meta["paper_count"] = len(all_papers)
+
+         with open(meta_file, "w") as f:
+             json.dump(meta, f, indent=2)
+
+         return info(name)
+
+     return info(name)
+
+
+ def load(name: str) -> List[Dict[str, Any]]:
+     """Load raw cache data.
+
+     Args:
+         name: Cache name
+
+     Returns:
+         List of paper dictionaries with full metadata
+     """
+     cache_file = _cache_path(name)
+     if not cache_file.exists():
+         raise FileNotFoundError(f"Cache not found: {name}")
+
+     with open(cache_file) as f:
+         return json.load(f)
+
+
+ def query(
+     name: str,
+     fields: Optional[List[str]] = None,
+     include_abstract: bool = False,
+     include_references: bool = False,
+     include_citations: bool = False,
+     year_min: Optional[int] = None,
+     year_max: Optional[int] = None,
+     journal: Optional[str] = None,
+     limit: Optional[int] = None,
+ ) -> List[Dict[str, Any]]:
+     """Query cache with field filtering.
+
+     Args:
+         name: Cache name
+         fields: Explicit field list (overrides include_* flags)
+         include_abstract: Include abstract field
+         include_references: Include references list
+         include_citations: Include citation_count
+         year_min: Filter by minimum year
+         year_max: Filter by maximum year
+         journal: Filter by journal name (substring match)
+         limit: Max results to return
+
+     Returns:
+         Filtered list of paper dictionaries
+
+     Example:
+         >>> # Minimal query
+         >>> papers = query("epilepsy", fields=["doi", "title", "year"])
+         >>> # With filters
+         >>> papers = query("epilepsy", year_min=2020, include_citations=True)
+     """
+     papers = load(name)
+
+     # Apply filters
+     if year_min is not None:
+         papers = [p for p in papers if p.get("year") and p["year"] >= year_min]
+     if year_max is not None:
+         papers = [p for p in papers if p.get("year") and p["year"] <= year_max]
+     if journal is not None:
+         journal_lower = journal.lower()
+         papers = [
+             p
+             for p in papers
+             if p.get("journal") and journal_lower in p["journal"].lower()
+         ]
+
+     # Apply limit
+     if limit is not None:
+         papers = papers[:limit]
+
+     # Field projection
+     if fields is not None:
+         # Explicit field list
+         papers = [{k: p.get(k) for k in fields if k in p} for p in papers]
+     else:
+         # Build field list from flags
+         base_fields = {"doi", "title", "authors", "year", "journal"}
+         if include_abstract:
+             base_fields.add("abstract")
+         if include_references:
+             base_fields.add("references")
+         if include_citations:
+             base_fields.add("citation_count")
+
+         papers = [{k: p.get(k) for k in base_fields if k in p} for p in papers]
+
+     return papers
+
+
+ def query_dois(name: str) -> List[str]:
+     """Get just DOIs from cache.
+
+     Args:
+         name: Cache name
+
+     Returns:
+         List of DOIs
+     """
+     papers = load(name)
+     return [p["doi"] for p in papers if p.get("doi")]
+
+
+ def stats(name: str) -> Dict[str, Any]:
+     """Get cache statistics.
+
+     Args:
+         name: Cache name
+
+     Returns:
+         Dictionary with statistics
+     """
+     papers = load(name)
+
+     # Year distribution
+     years = [p.get("year") for p in papers if p.get("year")]
+     year_dist = {}
+     for y in years:
+         year_dist[y] = year_dist.get(y, 0) + 1
+
+     # Journal distribution
+     journals = [p.get("journal") for p in papers if p.get("journal")]
+     journal_dist = {}
+     for j in journals:
+         journal_dist[j] = journal_dist.get(j, 0) + 1
+     top_journals = sorted(journal_dist.items(), key=lambda x: -x[1])[:20]
+
+     # Abstract coverage
+     with_abstract = sum(1 for p in papers if p.get("abstract"))
+
+     # Citation stats
+     citations = [p.get("citation_count", 0) for p in papers if p.get("citation_count")]
+
+     return {
+         "paper_count": len(papers),
+         "year_range": {
+             "min": min(years) if years else None,
+             "max": max(years) if years else None,
+         },
+         "year_distribution": dict(sorted(year_dist.items())),
+         "with_abstract": with_abstract,
+         "abstract_coverage": round(with_abstract / len(papers) * 100, 1)
+         if papers
+         else 0,
+         "top_journals": [{"journal": j, "count": c} for j, c in top_journals],
+         "citation_stats": {
+             "total": sum(citations),
+             "mean": round(sum(citations) / len(citations), 1) if citations else 0,
+             "max": max(citations) if citations else 0,
+         }
+         if citations
+         else None,
+     }
+
+
+ def info(name: str) -> CacheInfo:
+     """Get cache information.
+
+     Args:
+         name: Cache name
+
+     Returns:
+         CacheInfo object
+     """
+     cache_file = _cache_path(name)
+     if not cache_file.exists():
+         raise FileNotFoundError(f"Cache not found: {name}")
+
+     meta_file = _meta_path(name)
+     meta = {}
+     if meta_file.exists():
+         with open(meta_file) as f:
+             meta = json.load(f)
+
+     papers = load(name)
+
+     return CacheInfo(
+         name=name,
+         path=str(cache_file),
+         size_bytes=cache_file.stat().st_size,
+         paper_count=len(papers),
+         created_at=meta.get("created_at", "unknown"),
+         query=meta.get("query"),
+     )
+
+
+ def exists(name: str) -> bool:
+     """Check if cache exists.
+
+     Args:
+         name: Cache name
+
+     Returns:
+         True if cache exists
+     """
+     return _cache_path(name).exists()
+
+
+ def list_caches() -> List[CacheInfo]:
+     """List all available caches.
+
+     Returns:
+         List of CacheInfo objects
+     """
+     cache_dir = _get_cache_dir()
+     caches = []
+
+     for f in cache_dir.glob("*.json"):
+         if f.name.endswith(".meta.json"):
+             continue
+         name = f.stem
+         try:
+             caches.append(info(name))
+         except Exception:
+             pass
+
+     return sorted(caches, key=lambda c: c.name)
+
+
+ def delete(name: str) -> bool:
+     """Delete a cache.
+
+     Args:
+         name: Cache name
+
+     Returns:
+         True if deleted
+     """
+     cache_file = _cache_path(name)
+     meta_file = _meta_path(name)
+
+     deleted = False
+     if cache_file.exists():
+         cache_file.unlink()
+         deleted = True
+     if meta_file.exists():
+         meta_file.unlink()
+
+     return deleted
+
+
+
+ # Re-export from cache_export for backwards compatibility
+ from .cache_export import export
+
+ __all__ = [
+     "CacheInfo",
+     "create",
+     "append",
+     "load",
+     "query",
+     "query_dois",
+     "stats",
+     "info",
+     "exists",
+     "list_caches",
+     "delete",
+     "export",
+ ]
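Two behaviors in the module above are easy to miss: `query` applies the year/journal filters and the limit before field projection, and `append` deduplicates against DOIs already on disk, so re-running an overlapping search only fetches genuinely new papers. A short sketch (the cache name is again illustrative):

    from crossref_local import cache

    # Default projection keeps doi/title/authors/year/journal;
    # include_* flags opt in to heavier fields such as citation counts.
    recent = cache.query("epilepsy", year_min=2020, include_citations=True, limit=25)

    # Appending with an overlapping query fetches only unseen DOIs.
    info = cache.append("epilepsy", query="epilepsy seizure prediction", limit=200)
    print(info.paper_count)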
--- /dev/null
+++ b/crossref_local/cache_export.py
@@ -0,0 +1,83 @@
+ """Export functionality for cache module."""
+
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ from .cache import load
+
+
+ def export(
+     name: str,
+     output_path: str,
+     format: str = "json",
+     fields: Optional[List[str]] = None,
+ ) -> str:
+     """Export cache to file.
+
+     Args:
+         name: Cache name
+         output_path: Output file path
+         format: Export format (json, csv, bibtex, dois)
+         fields: Fields to include (for json/csv)
+
+     Returns:
+         Output file path
+     """
+     papers = load(name)
+     output = Path(output_path)
+
+     if format == "json":
+         if fields:
+             papers = [{k: p.get(k) for k in fields} for p in papers]
+         with open(output, "w") as f:
+             json.dump(papers, f, indent=2)
+
+     elif format == "csv":
+         import csv
+
+         if fields is None:
+             fields = ["doi", "title", "authors", "year", "journal"]
+         with open(output, "w", newline="") as f:
+             writer = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
+             writer.writeheader()
+             for p in papers:
+                 row = dict(p)
+                 if "authors" in row and isinstance(row["authors"], list):
+                     row["authors"] = "; ".join(row["authors"])
+                 writer.writerow(row)
+
+     elif format == "bibtex":
+         lines = []
+         for p in papers:
+             doi = p.get("doi", "").replace("/", "_").replace(".", "_")
+             entry = f"@article{{{doi},\n"
+             if p.get("title"):
+                 entry += f" title = {{{p['title']}}},\n"
+             if p.get("authors"):
+                 authors = (
+                     " and ".join(p["authors"])
+                     if isinstance(p["authors"], list)
+                     else p["authors"]
+                 )
+                 entry += f" author = {{{authors}}},\n"
+             if p.get("year"):
+                 entry += f" year = {{{p['year']}}},\n"
+             if p.get("journal"):
+                 entry += f" journal = {{{p['journal']}}},\n"
+             if p.get("doi"):
+                 entry += f" doi = {{{p['doi']}}},\n"
+             entry += "}\n"
+             lines.append(entry)
+         with open(output, "w") as f:
+             f.write("\n".join(lines))
+
+     elif format == "dois":
+         dois = [p["doi"] for p in papers if p.get("doi")]
+         with open(output, "w") as f:
+             f.write("\n".join(dois))
+
+     else:
+         raise ValueError(f"Unknown format: {format}")
+
+     return str(output)
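For completeness, a sketch of the four export formats the new helper accepts (output paths are illustrative); note that `export` is also re-exported as `cache.export` for backwards compatibility:

    from crossref_local.cache_export import export

    export("epilepsy", "papers.json", format="json", fields=["doi", "title", "year"])
    export("epilepsy", "papers.csv", format="csv")      # authors joined with "; "
    export("epilepsy", "papers.bib", format="bibtex")   # one @article entry per paper
    export("epilepsy", "papers.txt", format="dois")     # newline-separated DOI list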