crossref-local 0.3.1-py3-none-any.whl → 0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crossref_local/__init__.py +38 -16
- crossref_local/__main__.py +0 -0
- crossref_local/_aio/__init__.py +30 -0
- crossref_local/_aio/_impl.py +238 -0
- crossref_local/_cache/__init__.py +15 -0
- crossref_local/_cache/export.py +100 -0
- crossref_local/_cache/utils.py +93 -0
- crossref_local/_cache/viz.py +296 -0
- crossref_local/_cli/__init__.py +9 -0
- crossref_local/_cli/cache.py +179 -0
- crossref_local/_cli/cli.py +512 -0
- crossref_local/_cli/completion.py +245 -0
- crossref_local/_cli/main.py +20 -0
- crossref_local/_cli/mcp.py +351 -0
- crossref_local/_cli/mcp_server.py +413 -0
- crossref_local/_core/__init__.py +58 -0
- crossref_local/{api.py → _core/api.py} +130 -36
- crossref_local/{citations.py → _core/citations.py} +55 -26
- crossref_local/{config.py → _core/config.py} +57 -42
- crossref_local/{db.py → _core/db.py} +32 -26
- crossref_local/{fts.py → _core/fts.py} +18 -14
- crossref_local/{models.py → _core/models.py} +11 -6
- crossref_local/{impact_factor → _impact_factor}/__init__.py +0 -0
- crossref_local/{impact_factor → _impact_factor}/calculator.py +0 -0
- crossref_local/{impact_factor → _impact_factor}/journal_lookup.py +0 -0
- crossref_local/_remote/__init__.py +56 -0
- crossref_local/_remote/base.py +356 -0
- crossref_local/_remote/collections.py +175 -0
- crossref_local/_server/__init__.py +140 -0
- crossref_local/_server/middleware.py +25 -0
- crossref_local/_server/models.py +129 -0
- crossref_local/_server/routes_citations.py +98 -0
- crossref_local/_server/routes_collections.py +282 -0
- crossref_local/_server/routes_compat.py +102 -0
- crossref_local/_server/routes_works.py +128 -0
- crossref_local/_server/server.py +19 -0
- crossref_local/aio.py +30 -206
- crossref_local/cache.py +466 -0
- crossref_local/cli.py +5 -447
- crossref_local/jobs.py +169 -0
- crossref_local/mcp_server.py +5 -199
- crossref_local/remote.py +5 -261
- crossref_local/server.py +5 -349
- {crossref_local-0.3.1.dist-info → crossref_local-0.5.0.dist-info}/METADATA +88 -24
- crossref_local-0.5.0.dist-info/RECORD +47 -0
- crossref_local-0.3.1.dist-info/RECORD +0 -20
- {crossref_local-0.3.1.dist-info → crossref_local-0.5.0.dist-info}/WHEEL +0 -0
- {crossref_local-0.3.1.dist-info → crossref_local-0.5.0.dist-info}/entry_points.txt +0 -0

crossref_local/__init__.py
CHANGED
@@ -20,19 +20,19 @@ Async usage:
 Configuration
 -------------
 
-
+DB mode (direct database access):
     >>> from crossref_local import configure
     >>> configure("/path/to/crossref.db")
 Or set CROSSREF_LOCAL_DB environment variable.
 
-
-    >>> from crossref_local import
-    >>>
-Or set
+HTTP mode (API access via HTTP):
+    >>> from crossref_local import configure_http
+    >>> configure_http("http://localhost:8333")
+Or set CROSSREF_LOCAL_API_URL environment variable.
 
 Typical setup with SSH tunnel:
-    $ ssh -L
-    >>>
+    $ ssh -L 8333:127.0.0.1:8333 your-server  # In terminal
+    >>> configure_http()  # Uses default localhost:8333
 
 Public API
 ----------
@@ -43,6 +43,8 @@ Functions:
     get(doi) -> Work | None
     get_many(dois) -> list[Work]
     exists(doi) -> bool
+    enrich(results) -> SearchResult
+    enrich_dois(dois) -> list[Work]
     configure(db_path) -> None
     configure_remote(api_url) -> None
     get_mode() -> str
@@ -62,29 +64,41 @@ Modules:
     aio - Async versions of all API functions
 """
 
-__version__ = "0.3.1"
+__version__ = "0.5.0"
 
-# Core API (
-from .
+# Core API (from _core package)
+from ._core import (
+    # Functions
     search,
     count,
     get,
     get_many,
     exists,
+    enrich,
+    enrich_dois,
     configure,
+    configure_http,
     configure_remote,
     get_mode,
     info,
+    # Models
+    Work,
+    SearchResult,
+    # Citations
+    get_citing,
+    get_cited,
+    get_citation_count,
+    CitationNetwork,
 )
 
-# Models (public classes)
-from .models import Work, SearchResult
-
 # Async API (public module)
 from . import aio
 
-#
-from .
+# Cache module (public)
+from . import cache
+
+# Jobs module (public)
+from . import jobs
 
 
 # Public API - what users should import
@@ -97,9 +111,13 @@ __all__ = [
     "get",
     "get_many",
     "exists",
+    # Enrichment (add citations/references to search results)
+    "enrich",
+    "enrich_dois",
     # Configuration
     "configure",
-    "
+    "configure_http",
+    "configure_remote",  # Backward compatibility alias
     "get_mode",
     "info",
     # Data models
@@ -107,6 +125,10 @@ __all__ = [
     "SearchResult",
     # Async API
     "aio",
+    # Cache module
+    "cache",
+    # Jobs module
+    "jobs",
     # Citation network
     "get_citing",
     "get_cited",
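
Taken together, the reworked `__init__.py` puts both transport modes and the new enrichment helpers at top level. Below is a minimal usage sketch assuming only what the docstring and `__all__` above state; the DOI and URL are placeholder values, and the exact shape of `enrich`'s output beyond "SearchResult" is not shown in this diff:

from crossref_local import configure_http, search, enrich, get

configure_http("http://localhost:8333")  # or configure("/path/to/crossref.db") for DB mode

results = search("machine learning")     # SearchResult of matching works
enriched = enrich(results)               # attach citations/references to the results
work = get("10.1038/nature12373")        # Work | None
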

crossref_local/__main__.py
CHANGED
File without changes

crossref_local/_aio/__init__.py
ADDED
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+"""Async API module."""
+
+from ._impl import (
+    SearchResult,
+    Work,
+    count,
+    count_many,
+    exists,
+    get,
+    get_many,
+    info,
+    search,
+    search_many,
+)
+
+__all__ = [
+    "search",
+    "count",
+    "get",
+    "get_many",
+    "exists",
+    "info",
+    "search_many",
+    "count_many",
+    "SearchResult",
+    "Work",
+]
+
+# EOF

crossref_local/_aio/_impl.py
ADDED
@@ -0,0 +1,238 @@
+"""
+Async API for crossref_local.
+
+Provides async versions of all API functions. Uses thread pool execution
+with per-thread database connections for thread safety.
+
+Usage:
+    from crossref_local import aio
+
+    async def main():
+        results = await aio.search("machine learning")
+        work = await aio.get("10.1038/nature12373")
+        n = await aio.count("CRISPR")
+
+    # Or import individual functions
+    from crossref_local.aio import search, get, count
+
+    # Concurrent operations
+    counts = await aio.count_many(["CRISPR", "machine learning"])
+"""
+
+import asyncio as _asyncio
+import threading as _threading
+from typing import List, Optional
+
+from .._core.config import Config as _Config
+from .._core.db import Database as _Database
+from .._core.models import SearchResult, Work
+
+__all__ = [
+    "search",
+    "count",
+    "get",
+    "get_many",
+    "exists",
+    "info",
+    "search_many",
+    "count_many",
+    # Public types for type hints
+    "SearchResult",
+    "Work",
+]
+
+# Thread-local storage for database connections
+_thread_local = _threading.local()
+
+
+def _get_thread_db() -> _Database:
+    """Get thread-local database connection."""
+    if not hasattr(_thread_local, "db"):
+        _thread_local.db = _Database(_Config.get_db_path())
+    return _thread_local.db
+
+
+def _search_sync(query: str, limit: int, offset: int) -> SearchResult:
+    """Thread-safe sync search."""
+    from .._core import fts
+
+    db = _get_thread_db()
+    return fts._search_with_db(db, query, limit, offset)
+
+
+def _count_sync(query: str) -> int:
+    """Thread-safe sync count."""
+    from .._core import fts
+
+    db = _get_thread_db()
+    return fts._count_with_db(db, query)
+
+
+def _get_sync(doi: str) -> Optional[Work]:
+    """Thread-safe sync get."""
+    db = _get_thread_db()
+    metadata = db.get_metadata(doi)
+    if metadata:
+        return Work.from_metadata(doi, metadata)
+    return None
+
+
+def _get_many_sync(dois: List[str]) -> List[Work]:
+    """Thread-safe sync get_many."""
+    db = _get_thread_db()
+    works = []
+    for doi in dois:
+        metadata = db.get_metadata(doi)
+        if metadata:
+            works.append(Work.from_metadata(doi, metadata))
+    return works
+
+
+def _exists_sync(doi: str) -> bool:
+    """Thread-safe sync exists."""
+    db = _get_thread_db()
+    row = db.fetchone("SELECT 1 FROM works WHERE doi = ?", (doi,))
+    return row is not None
+
+
+def _info_sync() -> dict:
+    """Thread-safe sync info."""
+    db = _get_thread_db()
+
+    row = db.fetchone("SELECT COUNT(*) as count FROM works")
+    work_count = row["count"] if row else 0
+
+    try:
+        row = db.fetchone("SELECT COUNT(*) as count FROM works_fts")
+        fts_count = row["count"] if row else 0
+    except Exception:
+        fts_count = 0
+
+    try:
+        row = db.fetchone("SELECT COUNT(*) as count FROM citations")
+        citation_count = row["count"] if row else 0
+    except Exception:
+        citation_count = 0
+
+    return {
+        "db_path": str(_Config.get_db_path()),
+        "works": work_count,
+        "fts_indexed": fts_count,
+        "citations": citation_count,
+    }
+
+
+async def search(
+    query: str,
+    limit: int = 10,
+    offset: int = 0,
+) -> SearchResult:
+    """
+    Async full-text search across works.
+
+    Args:
+        query: Search query (supports FTS5 syntax)
+        limit: Maximum results to return
+        offset: Skip first N results (for pagination)
+
+    Returns:
+        SearchResult with matching works
+    """
+    return await _asyncio.to_thread(_search_sync, query, limit, offset)
+
+
+async def count(query: str) -> int:
+    """
+    Async count matching works without fetching results.
+
+    Args:
+        query: FTS5 search query
+
+    Returns:
+        Number of matching works
+    """
+    return await _asyncio.to_thread(_count_sync, query)
+
+
+async def get(doi: str) -> Optional[Work]:
+    """
+    Async get a work by DOI.
+
+    Args:
+        doi: Digital Object Identifier
+
+    Returns:
+        Work object or None if not found
+    """
+    return await _asyncio.to_thread(_get_sync, doi)
+
+
+async def get_many(dois: List[str]) -> List[Work]:
+    """
+    Async get multiple works by DOI.
+
+    Args:
+        dois: List of DOIs
+
+    Returns:
+        List of Work objects (missing DOIs are skipped)
+    """
+    return await _asyncio.to_thread(_get_many_sync, dois)
+
+
+async def exists(doi: str) -> bool:
+    """
+    Async check if a DOI exists in the database.
+
+    Args:
+        doi: Digital Object Identifier
+
+    Returns:
+        True if DOI exists
+    """
+    return await _asyncio.to_thread(_exists_sync, doi)
+
+
+async def info() -> dict:
+    """
+    Async get database information.
+
+    Returns:
+        Dictionary with database stats
+    """
+    return await _asyncio.to_thread(_info_sync)
+
+
+async def search_many(queries: List[str], limit: int = 10) -> List[SearchResult]:
+    """
+    Run multiple searches concurrently.
+
+    Args:
+        queries: List of search queries
+        limit: Maximum results per query
+
+    Returns:
+        List of SearchResult objects
+    """
+    tasks = [search(q, limit=limit) for q in queries]
+    return await _asyncio.gather(*tasks)
+
+
+async def count_many(queries: List[str]) -> dict:
+    """
+    Count matches for multiple queries concurrently.
+
+    Args:
+        queries: List of search queries
+
+    Returns:
+        Dict mapping query -> count
+
+    Example:
+        >>> counts = await count_many(["CRISPR", "machine learning"])
+        >>> print(counts)
+        {'CRISPR': 45000, 'machine learning': 477922}
+    """
+    tasks = [count(q) for q in queries]
+    results = await _asyncio.gather(*tasks)
+    return dict(zip(queries, results))
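
The pattern throughout `_impl.py` is uniform: each public coroutine delegates to a `_*_sync` helper via `asyncio.to_thread`, and every worker thread lazily opens its own `Database` handle through the `threading.local()` store, so no SQLite connection is ever shared across threads. A runnable sketch following the module's own docstring (the DOI is the example used there):

import asyncio
from crossref_local import aio

async def main():
    results = await aio.search("machine learning", limit=5)
    work = await aio.get("10.1038/nature12373")
    counts = await aio.count_many(["CRISPR", "machine learning"])
    print(results, work, counts)

asyncio.run(main())
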

crossref_local/_cache/__init__.py
ADDED
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+"""Internal cache helper modules."""
+
+from .export import export
+from .utils import cache_path, get_cache_dir, meta_path, sanitize_name
+
+__all__ = [
+    "export",
+    "cache_path",
+    "get_cache_dir",
+    "meta_path",
+    "sanitize_name",
+]
+
+# EOF

crossref_local/_cache/export.py
ADDED
@@ -0,0 +1,100 @@
+"""Export functionality for cache module."""
+
+import csv as _csv
+import json as _json
+from pathlib import Path as _Path
+from typing import List, Optional
+
+from .utils import sanitize_name as _sanitize_name
+
+__all__ = [
+    "export",
+]
+
+
+def _load_cache(name: str, user_id: Optional[str] = None):
+    """Load cache data (lazy import to avoid circular dependency)."""
+    from ..cache import load
+
+    return load(name, user_id=user_id)
+
+
+def export(
+    name: str,
+    output_path: str,
+    format: str = "json",
+    fields: Optional[List[str]] = None,
+    user_id: Optional[str] = None,
+) -> str:
+    """Export cache to file.
+
+    Args:
+        name: Cache name
+        output_path: Output file path
+        format: Export format (json, csv, bibtex, dois)
+        fields: Fields to include (for json/csv)
+        user_id: Optional user ID for multi-tenant scoping
+
+    Returns:
+        Output file path
+
+    Raises:
+        ValueError: If cache name contains invalid characters
+    """
+    # Validate cache name
+    _sanitize_name(name)
+    papers = _load_cache(name, user_id=user_id)
+    output = _Path(output_path)
+
+    if format == "json":
+        if fields:
+            papers = [{k: p.get(k) for k in fields} for p in papers]
+        with open(output, "w") as f:
+            _json.dump(papers, f, indent=2)
+
+    elif format == "csv":
+        if fields is None:
+            fields = ["doi", "title", "authors", "year", "journal"]
+        with open(output, "w", newline="") as f:
+            writer = _csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
+            writer.writeheader()
+            for p in papers:
+                row = dict(p)
+                if "authors" in row and isinstance(row["authors"], list):
+                    row["authors"] = "; ".join(row["authors"])
+                writer.writerow(row)
+
+    elif format == "bibtex":
+        lines = []
+        for p in papers:
+            doi = p.get("doi", "").replace("/", "_").replace(".", "_")
+            entry = f"@article{{{doi},\n"
+            if p.get("title"):
+                entry += f"  title = {{{p['title']}}},\n"
+            if p.get("authors"):
+                authors = (
+                    " and ".join(p["authors"])
+                    if isinstance(p["authors"], list)
+                    else p["authors"]
+                )
+                entry += f"  author = {{{authors}}},\n"
+            if p.get("year"):
+                entry += f"  year = {{{p['year']}}},\n"
+            if p.get("journal"):
+                entry += f"  journal = {{{p['journal']}}},\n"
+            if p.get("doi"):
+                entry += f"  doi = {{{p['doi']}}},\n"
+            entry += "}\n"
+            lines.append(entry)
+        with open(output, "w") as f:
+            f.write("\n".join(lines))
+
+    elif format == "dois":
+        dois = [p["doi"] for p in papers if p.get("doi")]
+        with open(output, "w") as f:
+            f.write("\n".join(dois))
+
+    else:
+        raise ValueError(f"Unknown format: {format}")
+
+    return str(output)
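
The exporter validates the cache name, loads the stored papers, then branches on format. A hedged usage sketch, importing from the `_cache` package as its `__init__.py` re-exports it; "my-papers" is a hypothetical cache name that must already exist on disk for the underlying `load()` to find it:

from crossref_local._cache import export

# CSV export with a field subset; keys outside `fields` are dropped
# by the writer's extrasaction="ignore"
path = export("my-papers", "papers.csv", format="csv",
              fields=["doi", "title", "year"])
print(path)  # "papers.csv"
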

crossref_local/_cache/utils.py
ADDED
@@ -0,0 +1,93 @@
+"""Cache utility functions for crossref-local.
+
+Provides path handling and validation utilities for the cache module.
+"""
+
+import os as _os
+import re as _re
+from pathlib import Path as _Path
+from typing import Optional
+
+__all__ = [
+    "sanitize_name",
+    "get_cache_dir",
+    "cache_path",
+    "meta_path",
+]
+
+
+# Valid cache name pattern: alphanumeric, underscores, hyphens only
+_CACHE_NAME_PATTERN = _re.compile(r"^[a-zA-Z0-9_-]+$")
+
+
+def sanitize_name(name: str) -> str:
+    """Sanitize cache name to prevent path traversal.
+
+    Args:
+        name: Cache name to sanitize
+
+    Returns:
+        Sanitized name
+
+    Raises:
+        ValueError: If name contains invalid characters
+    """
+    if not name:
+        raise ValueError("Cache name cannot be empty")
+    if not _CACHE_NAME_PATTERN.match(name):
+        raise ValueError(
+            f"Invalid cache name '{name}': only alphanumeric, underscores, and hyphens allowed"
+        )
+    if len(name) > 64:
+        raise ValueError(f"Cache name too long: {len(name)} chars (max 64)")
+    return name
+
+
+def get_cache_dir(user_id: Optional[str] = None) -> _Path:
+    """Get cache directory, creating if needed.
+
+    Args:
+        user_id: Optional user ID for multi-tenant scoping.
+            If provided, creates a user-specific subdirectory.
+    """
+    cache_dir = _Path(
+        _os.environ.get(
+            "CROSSREF_LOCAL_CACHE_DIR", _Path.home() / ".cache" / "crossref-local"
+        )
+    )
+    # Add user subdirectory for multi-tenant support
+    if user_id:
+        # Sanitize user_id as well
+        safe_user_id = _re.sub(r"[^a-zA-Z0-9_-]", "", user_id[:16])
+        if safe_user_id:
+            cache_dir = cache_dir / safe_user_id
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    return cache_dir
+
+
+def cache_path(name: str, user_id: Optional[str] = None) -> _Path:
+    """Get path for a named cache.
+
+    Args:
+        name: Cache name (will be sanitized)
+        user_id: Optional user ID for multi-tenant scoping
+
+    Returns:
+        Path to cache file
+    """
+    safe_name = sanitize_name(name)
+    return get_cache_dir(user_id) / f"{safe_name}.json"
+
+
+def meta_path(name: str, user_id: Optional[str] = None) -> _Path:
+    """Get path for cache metadata.
+
+    Args:
+        name: Cache name (will be sanitized)
+        user_id: Optional user ID for multi-tenant scoping
+
+    Returns:
+        Path to metadata file
+    """
+    safe_name = sanitize_name(name)
+    return get_cache_dir(user_id) / f"{safe_name}.meta.json"