crossref-local 0.3.0-py3-none-any.whl → 0.4.0-py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- crossref_local/__init__.py +86 -22
- crossref_local/__main__.py +6 -0
- crossref_local/aio.py +0 -0
- crossref_local/api.py +148 -5
- crossref_local/cache.py +466 -0
- crossref_local/cache_export.py +83 -0
- crossref_local/cache_viz.py +296 -0
- crossref_local/citations.py +0 -0
- crossref_local/cli.py +358 -97
- crossref_local/cli_cache.py +179 -0
- crossref_local/cli_completion.py +245 -0
- crossref_local/cli_main.py +20 -0
- crossref_local/cli_mcp.py +275 -0
- crossref_local/config.py +99 -3
- crossref_local/db.py +3 -1
- crossref_local/fts.py +38 -4
- crossref_local/impact_factor/__init__.py +0 -0
- crossref_local/impact_factor/calculator.py +0 -0
- crossref_local/impact_factor/journal_lookup.py +0 -0
- crossref_local/mcp_server.py +413 -0
- crossref_local/models.py +0 -0
- crossref_local/remote.py +269 -0
- crossref_local/server.py +352 -0
- {crossref_local-0.3.0.dist-info → crossref_local-0.4.0.dist-info}/METADATA +152 -7
- crossref_local-0.4.0.dist-info/RECORD +27 -0
- crossref_local-0.4.0.dist-info/entry_points.txt +3 -0
- crossref_local-0.3.0.dist-info/RECORD +0 -16
- crossref_local-0.3.0.dist-info/entry_points.txt +0 -2
- {crossref_local-0.3.0.dist-info → crossref_local-0.4.0.dist-info}/WHEEL +0 -0
crossref_local/cache.py
ADDED
@@ -0,0 +1,466 @@
"""Cache module for crossref-local.

Provides disk-based caching of paper metadata to reduce context usage
and enable efficient re-querying with field filtering.

Architecture:
    1. FTS search -> DOIs (fast, minimal)
    2. Cache DOIs -> full metadata saved to disk
    3. Query cache -> filtered fields based on need

Usage:
    >>> from crossref_local import cache
    >>> # Create cache from search
    >>> cache.create("epilepsy", query="epilepsy seizure prediction", limit=100)
    >>> # Query with minimal fields
    >>> papers = cache.query("epilepsy", fields=["doi", "title", "year"])
    >>> # Get statistics
    >>> stats = cache.stats("epilepsy")
"""

import json
import os
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional

from .api import get_many, search


def _get_cache_dir() -> Path:
    """Get cache directory, creating if needed."""
    cache_dir = Path(
        os.environ.get(
            "CROSSREF_LOCAL_CACHE_DIR", Path.home() / ".cache" / "crossref-local"
        )
    )
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir


def _cache_path(name: str) -> Path:
    """Get path for a named cache."""
    return _get_cache_dir() / f"{name}.json"


def _meta_path(name: str) -> Path:
    """Get path for cache metadata."""
    return _get_cache_dir() / f"{name}.meta.json"


@dataclass
class CacheInfo:
    """Information about a cache."""

    name: str
    path: str
    size_bytes: int
    paper_count: int
    created_at: str
    query: Optional[str] = None

    def to_dict(self) -> dict:
        return {
            "name": self.name,
            "path": self.path,
            "size_bytes": self.size_bytes,
            "size_mb": round(self.size_bytes / 1024 / 1024, 2),
            "paper_count": self.paper_count,
            "created_at": self.created_at,
            "query": self.query,
        }


def create(
    name: str,
    query: Optional[str] = None,
    dois: Optional[List[str]] = None,
    papers: Optional[List[Dict[str, Any]]] = None,
    limit: int = 1000,
    offset: int = 0,
) -> CacheInfo:
    """Create a cache from search query, DOI list, or pre-fetched papers.

    Args:
        name: Cache name (used as filename)
        query: FTS search query (if dois/papers not provided)
        dois: Explicit list of DOIs to cache
        papers: Pre-fetched paper dicts (skips API calls)
        limit: Max papers to fetch (for query mode)
        offset: Offset for pagination (for query mode)

    Returns:
        CacheInfo with cache details

    Example:
        >>> create("epilepsy", query="epilepsy seizure", limit=500)
        >>> create("my_papers", dois=["10.1038/nature12373", ...])
        >>> create("imported", papers=[{"doi": "...", "title": "..."}])
    """
    if papers is not None:
        # Use pre-fetched papers directly
        pass
    elif dois is None and query is None:
        raise ValueError("Must provide 'query', 'dois', or 'papers'")
    elif dois is None:
        # Get DOIs from search
        results = search(query, limit=limit, offset=offset)
        dois = [w.doi for w in results.works]
        # Fetch full metadata
        works = get_many(dois)
        papers = [w.to_dict() for w in works]
    else:
        # Fetch full metadata for DOIs
        works = get_many(dois)
        papers = [w.to_dict() for w in works]

    # Save cache
    cache_file = _cache_path(name)
    with open(cache_file, "w") as f:
        json.dump(papers, f)

    # Save metadata
    meta = {
        "name": name,
        "query": query,
        "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),
        "paper_count": len(papers),
        "dois_requested": len(dois) if dois else len(papers),
    }
    with open(_meta_path(name), "w") as f:
        json.dump(meta, f, indent=2)

    return CacheInfo(
        name=name,
        path=str(cache_file),
        size_bytes=cache_file.stat().st_size,
        paper_count=len(papers),
        created_at=meta["created_at"],
        query=query,
    )


def append(
    name: str,
    query: Optional[str] = None,
    dois: Optional[List[str]] = None,
    limit: int = 1000,
    offset: int = 0,
) -> CacheInfo:
    """Append papers to existing cache.

    Args:
        name: Existing cache name
        query: FTS search query (if dois not provided)
        dois: Explicit list of DOIs to add
        limit: Max papers to fetch (for query mode)
        offset: Offset for pagination (for query mode)

    Returns:
        Updated CacheInfo
    """
    if not exists(name):
        return create(name, query=query, dois=dois, limit=limit, offset=offset)

    # Load existing
    existing = load(name)
    existing_dois = {p["doi"] for p in existing}

    # Get new DOIs
    if dois is None and query is not None:
        results = search(query, limit=limit, offset=offset)
        dois = [w.doi for w in results.works]
    elif dois is None:
        raise ValueError("Must provide either 'query' or 'dois'")

    # Filter out already cached
    new_dois = [d for d in dois if d not in existing_dois]

    if new_dois:
        # Fetch new metadata
        new_works = get_many(new_dois)
        new_papers = [w.to_dict() for w in new_works]

        # Combine and save
        all_papers = existing + new_papers
        cache_file = _cache_path(name)
        with open(cache_file, "w") as f:
            json.dump(all_papers, f)

        # Update metadata
        meta_file = _meta_path(name)
        if meta_file.exists():
            with open(meta_file) as f:
                meta = json.load(f)
        else:
            meta = {"name": name}

        meta["updated_at"] = time.strftime("%Y-%m-%d %H:%M:%S")
        meta["paper_count"] = len(all_papers)

        with open(meta_file, "w") as f:
            json.dump(meta, f, indent=2)

        return info(name)

    return info(name)


def load(name: str) -> List[Dict[str, Any]]:
    """Load raw cache data.

    Args:
        name: Cache name

    Returns:
        List of paper dictionaries with full metadata
    """
    cache_file = _cache_path(name)
    if not cache_file.exists():
        raise FileNotFoundError(f"Cache not found: {name}")

    with open(cache_file) as f:
        return json.load(f)


def query(
    name: str,
    fields: Optional[List[str]] = None,
    include_abstract: bool = False,
    include_references: bool = False,
    include_citations: bool = False,
    year_min: Optional[int] = None,
    year_max: Optional[int] = None,
    journal: Optional[str] = None,
    limit: Optional[int] = None,
) -> List[Dict[str, Any]]:
    """Query cache with field filtering.

    Args:
        name: Cache name
        fields: Explicit field list (overrides include_* flags)
        include_abstract: Include abstract field
        include_references: Include references list
        include_citations: Include citation_count
        year_min: Filter by minimum year
        year_max: Filter by maximum year
        journal: Filter by journal name (substring match)
        limit: Max results to return

    Returns:
        Filtered list of paper dictionaries

    Example:
        >>> # Minimal query
        >>> papers = query("epilepsy", fields=["doi", "title", "year"])
        >>> # With filters
        >>> papers = query("epilepsy", year_min=2020, include_citations=True)
    """
    papers = load(name)

    # Apply filters
    if year_min is not None:
        papers = [p for p in papers if p.get("year") and p["year"] >= year_min]
    if year_max is not None:
        papers = [p for p in papers if p.get("year") and p["year"] <= year_max]
    if journal is not None:
        journal_lower = journal.lower()
        papers = [
            p
            for p in papers
            if p.get("journal") and journal_lower in p["journal"].lower()
        ]

    # Apply limit
    if limit is not None:
        papers = papers[:limit]

    # Field projection
    if fields is not None:
        # Explicit field list
        papers = [{k: p.get(k) for k in fields if k in p} for p in papers]
    else:
        # Build field list from flags
        base_fields = {"doi", "title", "authors", "year", "journal"}
        if include_abstract:
            base_fields.add("abstract")
        if include_references:
            base_fields.add("references")
        if include_citations:
            base_fields.add("citation_count")

        papers = [{k: p.get(k) for k in base_fields if k in p} for p in papers]

    return papers


def query_dois(name: str) -> List[str]:
    """Get just DOIs from cache.

    Args:
        name: Cache name

    Returns:
        List of DOIs
    """
    papers = load(name)
    return [p["doi"] for p in papers if p.get("doi")]


def stats(name: str) -> Dict[str, Any]:
    """Get cache statistics.

    Args:
        name: Cache name

    Returns:
        Dictionary with statistics
    """
    papers = load(name)

    # Year distribution
    years = [p.get("year") for p in papers if p.get("year")]
    year_dist = {}
    for y in years:
        year_dist[y] = year_dist.get(y, 0) + 1

    # Journal distribution
    journals = [p.get("journal") for p in papers if p.get("journal")]
    journal_dist = {}
    for j in journals:
        journal_dist[j] = journal_dist.get(j, 0) + 1
    top_journals = sorted(journal_dist.items(), key=lambda x: -x[1])[:20]

    # Abstract coverage
    with_abstract = sum(1 for p in papers if p.get("abstract"))

    # Citation stats
    citations = [p.get("citation_count", 0) for p in papers if p.get("citation_count")]

    return {
        "paper_count": len(papers),
        "year_range": {
            "min": min(years) if years else None,
            "max": max(years) if years else None,
        },
        "year_distribution": dict(sorted(year_dist.items())),
        "with_abstract": with_abstract,
        "abstract_coverage": round(with_abstract / len(papers) * 100, 1)
        if papers
        else 0,
        "top_journals": [{"journal": j, "count": c} for j, c in top_journals],
        "citation_stats": {
            "total": sum(citations),
            "mean": round(sum(citations) / len(citations), 1) if citations else 0,
            "max": max(citations) if citations else 0,
        }
        if citations
        else None,
    }


def info(name: str) -> CacheInfo:
    """Get cache information.

    Args:
        name: Cache name

    Returns:
        CacheInfo object
    """
    cache_file = _cache_path(name)
    if not cache_file.exists():
        raise FileNotFoundError(f"Cache not found: {name}")

    meta_file = _meta_path(name)
    meta = {}
    if meta_file.exists():
        with open(meta_file) as f:
            meta = json.load(f)

    papers = load(name)

    return CacheInfo(
        name=name,
        path=str(cache_file),
        size_bytes=cache_file.stat().st_size,
        paper_count=len(papers),
        created_at=meta.get("created_at", "unknown"),
        query=meta.get("query"),
    )


def exists(name: str) -> bool:
    """Check if cache exists.

    Args:
        name: Cache name

    Returns:
        True if cache exists
    """
    return _cache_path(name).exists()


def list_caches() -> List[CacheInfo]:
    """List all available caches.

    Returns:
        List of CacheInfo objects
    """
    cache_dir = _get_cache_dir()
    caches = []

    for f in cache_dir.glob("*.json"):
        if f.name.endswith(".meta.json"):
            continue
        name = f.stem
        try:
            caches.append(info(name))
        except Exception:
            pass

    return sorted(caches, key=lambda c: c.name)


def delete(name: str) -> bool:
    """Delete a cache.

    Args:
        name: Cache name

    Returns:
        True if deleted
    """
    cache_file = _cache_path(name)
    meta_file = _meta_path(name)

    deleted = False
    if cache_file.exists():
        cache_file.unlink()
        deleted = True
    if meta_file.exists():
        meta_file.unlink()

    return deleted


# Re-export from cache_export for backwards compatibility
from .cache_export import export

__all__ = [
    "CacheInfo",
    "create",
    "append",
    "load",
    "query",
    "query_dois",
    "stats",
    "info",
    "exists",
    "list_caches",
    "delete",
    "export",
]
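The module above gives the cache a full lifecycle: build a named cache once, then re-read it from disk with progressively narrower field projections. A minimal end-to-end sketch against the API shown in this diff (the cache name and query strings are illustrative, not from the release):

from crossref_local import cache

# One-time build: FTS search -> DOIs -> full metadata, persisted by default
# to ~/.cache/crossref-local/seizures.json plus a .meta.json sidecar.
created = cache.create("seizures", query="epilepsy seizure prediction", limit=100)
print(created.to_dict())  # name, path, size_mb, paper_count, created_at, query

# Later reads hit only the disk cache: filter by year, project three fields.
recent = cache.query("seizures", fields=["doi", "title", "year"], year_min=2020)

# Growing the cache is deduplicated: append() skips DOIs already present.
cache.append("seizures", query="seizure forecasting", limit=50)

# Aggregate statistics over the cached set, then clean up.
print(cache.stats("seizures")["year_range"])
cache.delete("seizures")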
crossref_local/cache_export.py
ADDED
@@ -0,0 +1,83 @@
"""Export functionality for cache module."""

import json
from pathlib import Path
from typing import Any, Dict, List, Optional

from .cache import load


def export(
    name: str,
    output_path: str,
    format: str = "json",
    fields: Optional[List[str]] = None,
) -> str:
    """Export cache to file.

    Args:
        name: Cache name
        output_path: Output file path
        format: Export format (json, csv, bibtex, dois)
        fields: Fields to include (for json/csv)

    Returns:
        Output file path
    """
    papers = load(name)
    output = Path(output_path)

    if format == "json":
        if fields:
            papers = [{k: p.get(k) for k in fields} for p in papers]
        with open(output, "w") as f:
            json.dump(papers, f, indent=2)

    elif format == "csv":
        import csv

        if fields is None:
            fields = ["doi", "title", "authors", "year", "journal"]
        with open(output, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
            writer.writeheader()
            for p in papers:
                row = dict(p)
                if "authors" in row and isinstance(row["authors"], list):
                    row["authors"] = "; ".join(row["authors"])
                writer.writerow(row)

    elif format == "bibtex":
        lines = []
        for p in papers:
            doi = p.get("doi", "").replace("/", "_").replace(".", "_")
            entry = f"@article{{{doi},\n"
            if p.get("title"):
                entry += f"  title = {{{p['title']}}},\n"
            if p.get("authors"):
                authors = (
                    " and ".join(p["authors"])
                    if isinstance(p["authors"], list)
                    else p["authors"]
                )
                entry += f"  author = {{{authors}}},\n"
            if p.get("year"):
                entry += f"  year = {{{p['year']}}},\n"
            if p.get("journal"):
                entry += f"  journal = {{{p['journal']}}},\n"
            if p.get("doi"):
                entry += f"  doi = {{{p['doi']}}},\n"
            entry += "}\n"
            lines.append(entry)
        with open(output, "w") as f:
            f.write("\n".join(lines))

    elif format == "dois":
        dois = [p["doi"] for p in papers if p.get("doi")]
        with open(output, "w") as f:
            f.write("\n".join(dois))

    else:
        raise ValueError(f"Unknown format: {format}")

    return str(output)
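A short sketch of the four output formats accepted by the new export function (file names are illustrative; for CSV, list-valued author fields are joined with "; ", and BibTeX keys are DOIs with "/" and "." replaced by "_"):

from crossref_local.cache import export  # re-exported from cache_export

export("seizures", "papers.json", format="json", fields=["doi", "title", "year"])
export("seizures", "papers.csv", format="csv")  # default fields: doi, title, authors, year, journal
export("seizures", "refs.bib", format="bibtex")  # one @article entry per paper
export("seizures", "dois.txt", format="dois")  # newline-separated DOI list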