openalex-local 0.1.0-py3-none-any.whl → 0.3.1-py3-none-any.whl
- openalex_local/__init__.py +54 -3
- openalex_local/__main__.py +6 -0
- openalex_local/_cache/__init__.py +45 -0
- openalex_local/_cache/core.py +298 -0
- openalex_local/_cache/export.py +100 -0
- openalex_local/_cache/models.py +17 -0
- openalex_local/_cache/utils.py +85 -0
- openalex_local/_cli/__init__.py +9 -0
- openalex_local/_cli/cli.py +409 -0
- openalex_local/_cli/cli_cache.py +220 -0
- openalex_local/_cli/mcp.py +210 -0
- openalex_local/_cli/mcp_server.py +235 -0
- openalex_local/_core/__init__.py +42 -0
- openalex_local/_core/api.py +376 -0
- openalex_local/_core/config.py +120 -0
- openalex_local/_core/db.py +214 -0
- openalex_local/_core/export.py +252 -0
- openalex_local/_core/fts.py +165 -0
- openalex_local/_core/models.py +432 -0
- openalex_local/_remote/__init__.py +34 -0
- openalex_local/_remote/base.py +256 -0
- openalex_local/_server/__init__.py +117 -0
- openalex_local/_server/routes.py +175 -0
- openalex_local/aio.py +259 -0
- openalex_local/cache.py +31 -0
- openalex_local/cli.py +8 -0
- openalex_local/jobs.py +169 -0
- openalex_local/remote.py +8 -0
- openalex_local/server.py +8 -0
- openalex_local-0.3.1.dist-info/METADATA +288 -0
- openalex_local-0.3.1.dist-info/RECORD +34 -0
- {openalex_local-0.1.0.dist-info → openalex_local-0.3.1.dist-info}/WHEEL +1 -1
- openalex_local-0.3.1.dist-info/entry_points.txt +2 -0
- openalex_local/config.py +0 -73
- openalex_local/models.py +0 -187
- openalex_local-0.1.0.dist-info/METADATA +0 -152
- openalex_local-0.1.0.dist-info/RECORD +0 -8
- openalex_local-0.1.0.dist-info/entry_points.txt +0 -2
- {openalex_local-0.1.0.dist-info → openalex_local-0.3.1.dist-info}/top_level.txt +0 -0
openalex_local/aio.py
ADDED
@@ -0,0 +1,259 @@
"""Async API for openalex_local.

Provides async versions of all core API functions using thread-local
database connections and asyncio.to_thread() for non-blocking execution.

Example:
    >>> import asyncio
    >>> from openalex_local import aio
    >>>
    >>> async def main():
    ...     results = await aio.search("machine learning", limit=10)
    ...     work = await aio.get("W2741809807")
    ...     print(f"Found {results.total} matches")
    >>>
    >>> asyncio.run(main())
"""

import asyncio
import threading
from typing import Dict, List, Optional

from ._core.config import Config
from ._core.db import Database
from ._core.fts import _search_with_db, _count_with_db
from ._core.models import SearchResult, Work

__all__ = [
    "search",
    "search_many",
    "count",
    "count_many",
    "get",
    "get_many",
    "exists",
    "info",
]

# Thread-local storage for database connections
_thread_local = threading.local()


def _get_thread_db() -> Database:
    """Get or create thread-local database connection."""
    if not hasattr(_thread_local, "db"):
        _thread_local.db = Database(Config.get_db_path())
    return _thread_local.db


def _search_sync(query: str, limit: int, offset: int) -> SearchResult:
    """Synchronous search with thread-local database."""
    db = _get_thread_db()
    return _search_with_db(db, query, limit, offset)


def _count_sync(query: str) -> int:
    """Synchronous count with thread-local database."""
    db = _get_thread_db()
    return _count_with_db(db, query)


def _get_sync(id_or_doi: str) -> Optional[Work]:
    """Synchronous get with thread-local database."""
    db = _get_thread_db()

    # Try as OpenAlex ID first
    if id_or_doi.startswith("W") or id_or_doi.startswith("w"):
        data = db.get_work(id_or_doi.upper())
        if data:
            return Work.from_db_row(data)

    # Try as DOI
    data = db.get_work_by_doi(id_or_doi)
    if data:
        return Work.from_db_row(data)

    return None


def _get_many_sync(ids: List[str]) -> List[Work]:
    """Synchronous get_many with thread-local database."""
    works = []
    for id_or_doi in ids:
        work = _get_sync(id_or_doi)
        if work:
            works.append(work)
    return works


def _exists_sync(id_or_doi: str) -> bool:
    """Synchronous exists check with thread-local database."""
    db = _get_thread_db()

    # Try as OpenAlex ID first
    if id_or_doi.startswith("W") or id_or_doi.startswith("w"):
        row = db.fetchone(
            "SELECT 1 FROM works WHERE openalex_id = ?", (id_or_doi.upper(),)
        )
        if row:
            return True

    # Try as DOI
    row = db.fetchone("SELECT 1 FROM works WHERE doi = ?", (id_or_doi,))
    return row is not None


def _info_sync() -> dict:
    """Synchronous info with thread-local database."""
    db = _get_thread_db()

    row = db.fetchone("SELECT COUNT(*) as count FROM works")
    work_count = row["count"] if row else 0

    try:
        row = db.fetchone("SELECT COUNT(*) as count FROM works_fts")
        fts_count = row["count"] if row else 0
    except Exception:
        fts_count = 0

    return {
        "status": "ok",
        "mode": "db",
        "db_path": str(Config.get_db_path()),
        "work_count": work_count,
        "fts_indexed": fts_count,
    }


async def search(
    query: str,
    limit: int = 20,
    offset: int = 0,
) -> SearchResult:
    """
    Async full-text search across works.

    Args:
        query: Search query (supports FTS5 syntax)
        limit: Maximum results to return
        offset: Skip first N results (for pagination)

    Returns:
        SearchResult with matching works

    Example:
        >>> results = await aio.search("machine learning", limit=10)
        >>> print(f"Found {results.total} matches")
    """
    return await asyncio.to_thread(_search_sync, query, limit, offset)


async def count(query: str) -> int:
    """
    Async count of matching works.

    Args:
        query: FTS5 search query

    Returns:
        Number of matching works
    """
    return await asyncio.to_thread(_count_sync, query)


async def get(id_or_doi: str) -> Optional[Work]:
    """
    Async get work by OpenAlex ID or DOI.

    Args:
        id_or_doi: OpenAlex ID (e.g., W2741809807) or DOI

    Returns:
        Work object or None if not found

    Example:
        >>> work = await aio.get("W2741809807")
        >>> work = await aio.get("10.1038/nature12373")
    """
    return await asyncio.to_thread(_get_sync, id_or_doi)


async def get_many(ids: List[str]) -> List[Work]:
    """
    Async get multiple works by OpenAlex ID or DOI.

    Args:
        ids: List of OpenAlex IDs or DOIs

    Returns:
        List of Work objects (missing IDs are skipped)
    """
    return await asyncio.to_thread(_get_many_sync, ids)


async def exists(id_or_doi: str) -> bool:
    """
    Async check if a work exists in the database.

    Args:
        id_or_doi: OpenAlex ID or DOI

    Returns:
        True if work exists
    """
    return await asyncio.to_thread(_exists_sync, id_or_doi)


async def info() -> dict:
    """
    Async get database information.

    Returns:
        Dictionary with database stats
    """
    return await asyncio.to_thread(_info_sync)


async def search_many(
    queries: List[str],
    limit: int = 10,
) -> List[SearchResult]:
    """
    Execute multiple searches concurrently.

    Args:
        queries: List of search queries
        limit: Maximum results per query

    Returns:
        List of SearchResult objects

    Example:
        >>> queries = ["machine learning", "neural networks", "deep learning"]
        >>> results = await aio.search_many(queries, limit=5)
        >>> for r in results:
        ...     print(f"{r.query}: {r.total} matches")
    """
    tasks = [search(q, limit=limit) for q in queries]
    return await asyncio.gather(*tasks)


async def count_many(queries: List[str]) -> Dict[str, int]:
    """
    Count matches for multiple queries concurrently.

    Args:
        queries: List of search queries

    Returns:
        Dictionary mapping queries to counts

    Example:
        >>> queries = ["machine learning", "neural networks"]
        >>> counts = await aio.count_many(queries)
        >>> print(counts)
        {'machine learning': 5000, 'neural networks': 3000}
    """
    tasks = [count(q) for q in queries]
    counts = await asyncio.gather(*tasks)
    return dict(zip(queries, counts))
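
For orientation, a minimal usage sketch of the new async module, built only from the calls and attributes shown in the docstrings above (the OpenAlex ID is the one from the module's own examples); it assumes a populated local database at the configured path:

import asyncio

from openalex_local import aio


async def main() -> None:
    # Each await hands the blocking SQLite call to a worker thread via
    # asyncio.to_thread(); connections are thread-local, so concurrent
    # queries never share a connection.
    counts = await aio.count_many(["machine learning", "neural networks"])
    for query, n in counts.items():
        print(f"{query}: {n} matches")

    results = await aio.search("machine learning", limit=10)
    print(f"Found {results.total} matches")

    work = await aio.get("W2741809807")  # also accepts a DOI
    print("found" if work else "not in local database")


asyncio.run(main())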
openalex_local/cache.py
ADDED
@@ -0,0 +1,31 @@
"""Cache module - re-exports from _cache package."""

from ._cache import (
    CacheInfo,
    create,
    append,
    load,
    query,
    query_ids,
    stats,
    info,
    exists,
    list_caches,
    delete,
    export,
)

__all__ = [
    "CacheInfo",
    "create",
    "append",
    "load",
    "query",
    "query_ids",
    "stats",
    "info",
    "exists",
    "list_caches",
    "delete",
    "export",
]
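
openalex_local/cache.py is a thin facade; a short illustration of the two equivalent import paths it enables (names taken from the __all__ above; the signatures live in the private _cache package and are not shown in this diff):

# Both paths resolve to the same functions re-exported from the private
# _cache package, so the public import surface stays stable even if the
# internals move.
from openalex_local import cache
from openalex_local.cache import list_caches, stats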
openalex_local/cli.py
ADDED
openalex_local/jobs.py
ADDED
@@ -0,0 +1,169 @@
#!/usr/bin/env python3
# Timestamp: 2026-01-29
"""Simple job/queue system for batch operations."""

import json as _json
import time as _time
import uuid as _uuid
from dataclasses import dataclass as _dataclass
from dataclasses import field as _field
from pathlib import Path as _Path
from typing import Any as _Any
from typing import Callable as _Callable
from typing import Optional as _Optional

__all__ = ["create", "get", "list_jobs", "run"]

# Default jobs directory
_JOBS_DIR = _Path.home() / ".openalex_local" / "jobs"


@_dataclass
class _Job:
    """A batch job with progress tracking (internal)."""

    id: str
    items: list[str]  # e.g., DOIs or OpenAlex IDs to process
    completed: list[str] = _field(default_factory=list)
    failed: dict[str, str] = _field(default_factory=dict)  # item -> error
    status: str = "pending"  # pending, running, completed, failed
    created_at: float = _field(default_factory=_time.time)
    updated_at: float = _field(default_factory=_time.time)
    metadata: dict[str, _Any] = _field(default_factory=dict)

    @property
    def pending(self) -> list[str]:
        """Items not yet processed."""
        done = set(self.completed) | set(self.failed.keys())
        return [i for i in self.items if i not in done]

    @property
    def progress(self) -> float:
        """Progress as percentage (0-100)."""
        if not self.items:
            return 100.0
        return len(self.completed) / len(self.items) * 100

    def to_dict(self) -> dict:
        return {
            "id": self.id,
            "items": self.items,
            "completed": self.completed,
            "failed": self.failed,
            "status": self.status,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "_Job":
        return cls(**data)


class _JobQueue:
    """Manages job persistence and execution (internal)."""

    def __init__(self, jobs_dir: _Optional[_Path] = None):
        self.jobs_dir = _Path(jobs_dir) if jobs_dir else _JOBS_DIR
        self.jobs_dir.mkdir(parents=True, exist_ok=True)

    def _job_path(self, job_id: str) -> _Path:
        return self.jobs_dir / f"{job_id}.json"

    def save(self, job: _Job) -> None:
        """Save job to disk."""
        job.updated_at = _time.time()
        self._job_path(job.id).write_text(_json.dumps(job.to_dict(), indent=2))

    def load(self, job_id: str) -> _Optional[_Job]:
        """Load job from disk."""
        path = self._job_path(job_id)
        if not path.exists():
            return None
        return _Job.from_dict(_json.loads(path.read_text()))

    def create(self, items: list[str], **metadata) -> _Job:
        """Create a new job."""
        job = _Job(id=str(_uuid.uuid4())[:8], items=items, metadata=metadata)
        self.save(job)
        return job

    def list(self) -> list[_Job]:
        """List all jobs."""
        jobs = []
        for path in self.jobs_dir.glob("*.json"):
            try:
                jobs.append(_Job.from_dict(_json.loads(path.read_text())))
            except Exception:
                continue
        return sorted(jobs, key=lambda j: j.created_at, reverse=True)

    def delete(self, job_id: str) -> bool:
        """Delete a job."""
        path = self._job_path(job_id)
        if path.exists():
            path.unlink()
            return True
        return False

    def run(
        self,
        job: _Job,
        processor: _Callable[[str], _Any],
        on_progress: _Optional[_Callable[[_Job], None]] = None,
    ) -> _Job:
        """Run a job with a processor function."""
        job.status = "running"
        self.save(job)

        for item in job.pending:
            try:
                processor(item)
                job.completed.append(item)
            except Exception as e:
                job.failed[item] = str(e)
            self.save(job)
            if on_progress:
                on_progress(job)

        job.status = "completed" if not job.failed else "failed"
        self.save(job)
        return job


# Module-level convenience functions
_queue = None


def _get_queue() -> _JobQueue:
    global _queue
    if _queue is None:
        _queue = _JobQueue()
    return _queue


def create(items: list[str], **metadata) -> _Job:
    """Create a new job."""
    return _get_queue().create(items, **metadata)


def get(job_id: str) -> _Optional[_Job]:
    """Get a job by ID."""
    return _get_queue().load(job_id)


def list_jobs() -> list[_Job]:
    """List all jobs."""
    return _get_queue().list()


def run(job_id: str, processor: _Callable[[str], _Any]) -> _Job:
    """Run or resume a job."""
    job = get(job_id)
    if not job:
        raise ValueError(f"Job not found: {job_id}")
    return _get_queue().run(job, processor)


# EOF
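
A brief usage sketch of the job queue's module-level API, grounded in the functions above; the failing item and the processor body are illustrative only:

from openalex_local import jobs


def process(item: str) -> None:
    # Illustrative processor: any exception is recorded per item in
    # job.failed instead of aborting the whole batch.
    if item == "bad-id":
        raise ValueError("unresolvable identifier")
    print(f"processed {item}")


job = jobs.create(["W2741809807", "bad-id", "10.1038/nature12373"], source="demo")
done = jobs.run(job.id, process)

print(done.status)    # "failed" -- any failed item marks the job failed
print(done.progress)  # ~66.7 -- only completed items count toward progress
print(done.failed)    # {"bad-id": "unresolvable identifier"}

# State is persisted to ~/.openalex_local/jobs after every item, so if
# the process dies mid-run, jobs.run() with the same ID resumes from the
# remaining pending items; completed and failed ones are skipped.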
openalex_local/remote.py
ADDED