openalex-local 0.1.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. openalex_local/__init__.py +54 -3
  2. openalex_local/__main__.py +6 -0
  3. openalex_local/_cache/__init__.py +45 -0
  4. openalex_local/_cache/core.py +298 -0
  5. openalex_local/_cache/export.py +100 -0
  6. openalex_local/_cache/models.py +17 -0
  7. openalex_local/_cache/utils.py +85 -0
  8. openalex_local/_cli/__init__.py +9 -0
  9. openalex_local/_cli/cli.py +409 -0
  10. openalex_local/_cli/cli_cache.py +220 -0
  11. openalex_local/_cli/mcp.py +210 -0
  12. openalex_local/_cli/mcp_server.py +235 -0
  13. openalex_local/_core/__init__.py +42 -0
  14. openalex_local/_core/api.py +376 -0
  15. openalex_local/_core/config.py +120 -0
  16. openalex_local/_core/db.py +214 -0
  17. openalex_local/_core/export.py +252 -0
  18. openalex_local/_core/fts.py +165 -0
  19. openalex_local/_core/models.py +432 -0
  20. openalex_local/_remote/__init__.py +34 -0
  21. openalex_local/_remote/base.py +256 -0
  22. openalex_local/_server/__init__.py +117 -0
  23. openalex_local/_server/routes.py +175 -0
  24. openalex_local/aio.py +259 -0
  25. openalex_local/cache.py +31 -0
  26. openalex_local/cli.py +8 -0
  27. openalex_local/jobs.py +169 -0
  28. openalex_local/remote.py +8 -0
  29. openalex_local/server.py +8 -0
  30. openalex_local-0.3.1.dist-info/METADATA +288 -0
  31. openalex_local-0.3.1.dist-info/RECORD +34 -0
  32. {openalex_local-0.1.0.dist-info → openalex_local-0.3.1.dist-info}/WHEEL +1 -1
  33. openalex_local-0.3.1.dist-info/entry_points.txt +2 -0
  34. openalex_local/config.py +0 -73
  35. openalex_local/models.py +0 -187
  36. openalex_local-0.1.0.dist-info/METADATA +0 -152
  37. openalex_local-0.1.0.dist-info/RECORD +0 -8
  38. openalex_local-0.1.0.dist-info/entry_points.txt +0 -2
  39. {openalex_local-0.1.0.dist-info → openalex_local-0.3.1.dist-info}/top_level.txt +0 -0

openalex_local/__init__.py
@@ -8,7 +8,58 @@ Example:
     >>> work = get("10.1038/nature12373") # or DOI
 """

-__version__ = "0.1.0"
+__version__ = "0.3.1"

-# API will be exposed here after implementation
-# from .api import search, get, count, info
+from ._core import (
+    SUPPORTED_FORMATS,
+    SearchResult,
+    Work,
+    configure,
+    count,
+    enrich,
+    enrich_ids,
+    exists,
+    get,
+    get_many,
+    get_mode,
+    info,
+    save,
+    search,
+)
+
+# Jobs module (public functions only)
+from . import jobs
+
+# Async module
+from . import aio
+
+# Cache module
+from . import cache
+
+__all__ = [
+    # Core functions
+    "search",
+    "count",
+    "get",
+    "get_many",
+    "exists",
+    "info",
+    # Enrich functions
+    "enrich",
+    "enrich_ids",
+    # Configuration
+    "configure",
+    "get_mode",
+    # Models
+    "Work",
+    "SearchResult",
+    # Export
+    "save",
+    "SUPPORTED_FORMATS",
+    # Jobs
+    "jobs",
+    # Async
+    "aio",
+    # Cache
+    "cache",
+]
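
For orientation, the expanded top-level surface above can be exercised roughly as follows. This is a minimal sketch, not documented usage: search(query, limit=...) and results.works match how _cache/core.py calls them further down, get() by DOI comes from the module docstring, and the count() and save() calls are assumptions about signatures this hunk does not show.

    from openalex_local import count, get, save, search

    results = search("machine learning", limit=10)   # SearchResult; .works holds Work objects
    work = get("10.1038/nature12373")                # lookup by DOI, per the docstring example
    n = count("machine learning")                    # assumed signature: same query string
    save(results.works, "ml_sample.json")            # assumed signature; format from SUPPORTED_FORMATS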

openalex_local/__main__.py
@@ -0,0 +1,6 @@
+"""Allow running as python -m openalex_local."""
+
+from .cli import main
+
+if __name__ == "__main__":
+    main()

openalex_local/_cache/__init__.py
@@ -0,0 +1,45 @@
+"""Cache module for openalex_local.
+
+Provides local caching of search results and works for offline analysis.
+
+Example:
+    >>> from openalex_local import cache
+    >>> # Create a cache from search
+    >>> info = cache.create("ml_papers", query="machine learning", limit=1000)
+    >>> print(f"Cached {info.count} papers")
+    >>>
+    >>> # Query the cache
+    >>> papers = cache.query("ml_papers", year_min=2020)
+    >>> # Get IDs for further processing
+    >>> ids = cache.query_ids("ml_papers")
+"""
+
+from .models import CacheInfo
+from .core import (
+    create,
+    append,
+    load,
+    query,
+    query_ids,
+    stats,
+    info,
+    exists,
+    list_caches,
+    delete,
+)
+from .export import export
+
+__all__ = [
+    "CacheInfo",
+    "create",
+    "append",
+    "load",
+    "query",
+    "query_ids",
+    "stats",
+    "info",
+    "exists",
+    "list_caches",
+    "delete",
+    "export",
+]

openalex_local/_cache/core.py
@@ -0,0 +1,298 @@
+"""Core cache operations."""
+
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from .models import CacheInfo
+from .utils import (
+    ensure_cache_dir,
+    get_cache_dir,
+    get_cache_path,
+    validate_cache_name,
+)
+
+
+def _load_cache_raw(name: str) -> Dict[str, Any]:
+    """Load raw cache data."""
+    path = get_cache_path(name)
+    if not path.exists():
+        raise FileNotFoundError(f"Cache not found: {name}")
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def _save_cache_raw(name: str, data: Dict[str, Any]) -> Path:
+    """Save raw cache data."""
+    ensure_cache_dir()
+    path = get_cache_path(name)
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
+    return path
+
+
+def create(
+    name: str,
+    query: Optional[str] = None,
+    ids: Optional[List[str]] = None,
+    papers: Optional[List[Dict]] = None,
+    limit: int = 1000,
+) -> CacheInfo:
+    """
+    Create a new cache.
+
+    Args:
+        name: Cache name (will be sanitized for filesystem)
+        query: Search query to populate cache
+        ids: List of OpenAlex IDs or DOIs to cache
+        papers: Pre-fetched paper dictionaries to cache
+        limit: Maximum papers to cache from query
+
+    Returns:
+        CacheInfo with cache details
+    """
+    from .. import search, get_many
+
+    error = validate_cache_name(name)
+    if error:
+        raise ValueError(error)
+
+    works_data = []
+    queries = []
+
+    if query:
+        results = search(query, limit=limit)
+        works_data.extend([w.to_dict() for w in results.works])
+        queries.append(query)
+
+    if ids:
+        works = get_many(ids)
+        works_data.extend([w.to_dict() for w in works])
+
+    if papers:
+        works_data.extend(papers)
+
+    # Remove duplicates by openalex_id
+    seen = set()
+    unique_works = []
+    for w in works_data:
+        oid = w.get("openalex_id")
+        if oid and oid not in seen:
+            seen.add(oid)
+            unique_works.append(w)
+
+    now = datetime.utcnow().isoformat()
+    cache_data = {
+        "name": name,
+        "created_at": now,
+        "updated_at": now,
+        "queries": queries,
+        "works": unique_works,
+    }
+
+    path = _save_cache_raw(name, cache_data)
+
+    return CacheInfo(
+        name=name,
+        path=str(path),
+        count=len(unique_works),
+        created_at=now,
+        updated_at=now,
+        queries=queries,
+        size_bytes=path.stat().st_size,
+    )
+
+
+def append(
+    name: str,
+    query: Optional[str] = None,
+    ids: Optional[List[str]] = None,
+    limit: int = 1000,
+) -> CacheInfo:
+    """Append works to an existing cache."""
+    from .. import search, get_many
+
+    cache_data = _load_cache_raw(name)
+    existing_ids = {w.get("openalex_id") for w in cache_data.get("works", [])}
+
+    new_works = []
+    queries = cache_data.get("queries", [])
+
+    if query:
+        results = search(query, limit=limit)
+        for w in results.works:
+            if w.openalex_id not in existing_ids:
+                new_works.append(w.to_dict())
+                existing_ids.add(w.openalex_id)
+        if query not in queries:
+            queries.append(query)
+
+    if ids:
+        works = get_many(ids)
+        for w in works:
+            if w.openalex_id not in existing_ids:
+                new_works.append(w.to_dict())
+                existing_ids.add(w.openalex_id)
+
+    cache_data["works"].extend(new_works)
+    cache_data["queries"] = queries
+    cache_data["updated_at"] = datetime.utcnow().isoformat()
+
+    path = _save_cache_raw(name, cache_data)
+
+    return CacheInfo(
+        name=name,
+        path=str(path),
+        count=len(cache_data["works"]),
+        created_at=cache_data.get("created_at", ""),
+        updated_at=cache_data["updated_at"],
+        queries=queries,
+        size_bytes=path.stat().st_size,
+    )
+
+
+def load(name: str) -> List[Dict]:
+    """Load all works from a cache."""
+    cache_data = _load_cache_raw(name)
+    return cache_data.get("works", [])
+
+
+def query(
+    name: str,
+    fields: Optional[List[str]] = None,
+    year_min: Optional[int] = None,
+    year_max: Optional[int] = None,
+    cited_min: Optional[int] = None,
+    has_abstract: Optional[bool] = None,
+    is_oa: Optional[bool] = None,
+    source: Optional[str] = None,
+    limit: Optional[int] = None,
+) -> List[Dict]:
+    """Query a cache with filters."""
+    works = load(name)
+    results = []
+
+    for w in works:
+        if year_min and (w.get("year") or 0) < year_min:
+            continue
+        if year_max and (w.get("year") or 9999) > year_max:
+            continue
+        if cited_min and (w.get("cited_by_count") or 0) < cited_min:
+            continue
+        if has_abstract is not None:
+            has_abs = bool(w.get("abstract"))
+            if has_abstract != has_abs:
+                continue
+        if is_oa is not None and w.get("is_oa") != is_oa:
+            continue
+        if source and source.lower() not in (w.get("source") or "").lower():
+            continue
+
+        if fields:
+            w = {k: w.get(k) for k in fields}
+
+        results.append(w)
+
+        if limit and len(results) >= limit:
+            break
+
+    return results
+
+
+def query_ids(name: str) -> List[str]:
+    """Get all OpenAlex IDs from a cache."""
+    works = load(name)
+    return [w.get("openalex_id") for w in works if w.get("openalex_id")]
+
+
+def stats(name: str) -> Dict[str, Any]:
+    """Get statistics for a cache."""
+    cache_data = _load_cache_raw(name)
+    works = cache_data.get("works", [])
+
+    if not works:
+        return {
+            "name": name, "total": 0, "year_min": None, "year_max": None,
+            "citations_total": 0, "citations_mean": 0,
+            "with_abstract": 0, "open_access": 0, "sources": [],
+        }
+
+    years = [w.get("year") for w in works if w.get("year")]
+    citations = [w.get("cited_by_count") or 0 for w in works]
+    abstracts = sum(1 for w in works if w.get("abstract"))
+    oa_count = sum(1 for w in works if w.get("is_oa"))
+
+    source_counts: Dict[str, int] = {}
+    for w in works:
+        src = w.get("source")
+        if src:
+            source_counts[src] = source_counts.get(src, 0) + 1
+    top_sources = sorted(source_counts.items(), key=lambda x: -x[1])[:10]
+
+    return {
+        "name": name,
+        "total": len(works),
+        "year_min": min(years) if years else None,
+        "year_max": max(years) if years else None,
+        "citations_total": sum(citations),
+        "citations_mean": sum(citations) / len(works) if works else 0,
+        "with_abstract": abstracts,
+        "with_abstract_pct": round(100 * abstracts / len(works), 1) if works else 0,
+        "open_access": oa_count,
+        "open_access_pct": round(100 * oa_count / len(works), 1) if works else 0,
+        "sources": top_sources,
+        "queries": cache_data.get("queries", []),
+        "created_at": cache_data.get("created_at"),
+        "updated_at": cache_data.get("updated_at"),
+    }
+
+
+def info(name: str) -> CacheInfo:
+    """Get cache info."""
+    path = get_cache_path(name)
+    if not path.exists():
+        raise FileNotFoundError(f"Cache not found: {name}")
+
+    cache_data = _load_cache_raw(name)
+
+    return CacheInfo(
+        name=name,
+        path=str(path),
+        count=len(cache_data.get("works", [])),
+        created_at=cache_data.get("created_at", ""),
+        updated_at=cache_data.get("updated_at", ""),
+        queries=cache_data.get("queries", []),
+        size_bytes=path.stat().st_size,
+    )
+
+
+def exists(name: str) -> bool:
+    """Check if a cache exists."""
+    return get_cache_path(name).exists()
+
+
+def list_caches() -> List[CacheInfo]:
+    """List all caches."""
+    cache_dir = get_cache_dir()
+    if not cache_dir.exists():
+        return []
+
+    caches = []
+    for path in cache_dir.glob("*.json"):
+        try:
+            cache_info = info(path.stem)
+            caches.append(cache_info)
+        except (json.JSONDecodeError, KeyError):
+            continue
+
+    return sorted(caches, key=lambda c: c.updated_at, reverse=True)
+
+
+def delete(name: str) -> bool:
+    """Delete a cache."""
+    path = get_cache_path(name)
+    if path.exists():
+        path.unlink()
+        return True
+    return False
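
Taken together, these functions give a create / filter / inspect / delete workflow over a single JSON file per cache. A small sketch of that flow, using only the signatures defined above (the cache name and query string are placeholders):

    from openalex_local import cache

    ci = cache.create("llm_eval", query="language model evaluation", limit=500)
    print(ci.count, ci.path)

    # Filter to recent, cited, open-access works and project a few fields
    recent = cache.query(
        "llm_eval",
        year_min=2021,
        cited_min=10,
        is_oa=True,
        fields=["openalex_id", "title", "year"],
    )

    summary = cache.stats("llm_eval")       # totals, year range, top sources
    ids = cache.query_ids("llm_eval")       # OpenAlex IDs for further processing
    cache.append("llm_eval", ids=ids[:10])  # already-cached IDs are skipped
    cache.delete("llm_eval")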

openalex_local/_cache/export.py
@@ -0,0 +1,100 @@
+"""Cache export functionality."""
+
+import csv
+import json
+from pathlib import Path
+from typing import List, Dict
+
+from .core import load
+
+
+def export(
+    name: str,
+    output_path: str,
+    format: str = "json",
+) -> str:
+    """
+    Export a cache to a file.
+
+    Args:
+        name: Cache name
+        output_path: Output file path
+        format: Export format ("json", "csv", "bibtex")
+
+    Returns:
+        Path to exported file
+    """
+    works = load(name)
+    output = Path(output_path)
+
+    if format == "json":
+        _export_json(works, output)
+    elif format == "csv":
+        _export_csv(works, output)
+    elif format == "bibtex":
+        _export_bibtex(works, output)
+    else:
+        raise ValueError(f"Unknown format: {format}. Use 'json', 'csv', or 'bibtex'")
+
+    return str(output)
+
+
+def _export_json(works: List[Dict], output: Path) -> None:
+    """Export to JSON format."""
+    with open(output, "w", encoding="utf-8") as f:
+        json.dump(works, f, ensure_ascii=False, indent=2)
+
+
+def _export_csv(works: List[Dict], output: Path) -> None:
+    """Export to CSV format."""
+    if not works:
+        output.write_text("")
+        return
+
+    # Get all unique keys
+    keys = set()
+    for w in works:
+        keys.update(w.keys())
+
+    # Prioritize common fields
+    priority = ["openalex_id", "doi", "title", "authors", "year", "source", "cited_by_count"]
+    fieldnames = [k for k in priority if k in keys]
+    fieldnames.extend(sorted(k for k in keys if k not in priority))
+
+    with open(output, "w", encoding="utf-8", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
+        writer.writeheader()
+        for w in works:
+            row = {}
+            for k, v in w.items():
+                if isinstance(v, list):
+                    row[k] = "; ".join(str(x) for x in v)
+                else:
+                    row[k] = v
+            writer.writerow(row)
+
+
+def _export_bibtex(works: List[Dict], output: Path) -> None:
+    """Export to BibTeX format."""
+    from .._core.models import Work
+
+    lines = []
+    for w in works:
+        work = Work(
+            openalex_id=w.get("openalex_id", ""),
+            doi=w.get("doi"),
+            title=w.get("title"),
+            authors=w.get("authors", []),
+            year=w.get("year"),
+            source=w.get("source"),
+            volume=w.get("volume"),
+            issue=w.get("issue"),
+            pages=w.get("pages"),
+            publisher=w.get("publisher"),
+            type=w.get("type"),
+            oa_url=w.get("oa_url"),
+        )
+        lines.append(work.citation("bibtex"))
+        lines.append("")
+
+    output.write_text("\n".join(lines), encoding="utf-8")
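
A short sketch of driving the exporter; the cache name and output paths are placeholders, and the BibTeX branch relies on Work.citation("bibtex") from _core/models.py behaving as it is used above:

    from openalex_local import cache

    cache.export("llm_eval", "llm_eval.json")                  # default format="json"
    cache.export("llm_eval", "llm_eval.csv", format="csv")     # list values joined with "; "
    cache.export("llm_eval", "llm_eval.bib", format="bibtex")  # one BibTeX entry per work
    # Any other format string raises ValueError.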

openalex_local/_cache/models.py
@@ -0,0 +1,17 @@
+"""Cache data models."""
+
+from dataclasses import dataclass, field
+from typing import List
+
+
+@dataclass
+class CacheInfo:
+    """Information about a cache."""
+
+    name: str
+    path: str
+    count: int
+    created_at: str
+    updated_at: str
+    queries: List[str] = field(default_factory=list)
+    size_bytes: int = 0
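
CacheInfo is the value returned by create(), append(), and info() in core.py; reading it looks like this (the cache name is a placeholder):

    from openalex_local import cache

    ci = cache.info("llm_eval")
    print(f"{ci.name}: {ci.count} works, {ci.size_bytes} bytes at {ci.path}")
    print("built from queries:", ci.queries)
    print("created", ci.created_at, "updated", ci.updated_at)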

openalex_local/_cache/utils.py
@@ -0,0 +1,85 @@
+"""Cache utilities for openalex_local."""
+
+import os
+import re
+from pathlib import Path
+from typing import Optional
+
+# Default cache directory
+DEFAULT_CACHE_DIR = Path.home() / ".openalex_local" / "caches"
+
+
+def get_cache_dir() -> Path:
+    """Get cache directory from environment or default."""
+    env_dir = os.environ.get("OPENALEX_LOCAL_CACHE_DIR")
+    if env_dir:
+        return Path(env_dir)
+    return DEFAULT_CACHE_DIR
+
+
+def sanitize_cache_name(name: str) -> str:
+    """
+    Sanitize cache name for filesystem safety.
+
+    Args:
+        name: Raw cache name
+
+    Returns:
+        Sanitized cache name
+
+    Example:
+        >>> sanitize_cache_name("my cache/name!")
+        'my_cache_name'
+    """
+    # Replace non-alphanumeric characters (except - and _) with underscore
+    sanitized = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
+    # Remove leading/trailing underscores
+    sanitized = sanitized.strip("_")
+    # Limit length
+    if len(sanitized) > 100:
+        sanitized = sanitized[:100]
+    # Ensure not empty
+    if not sanitized:
+        sanitized = "cache"
+    return sanitized
+
+
+def get_cache_path(name: str) -> Path:
+    """
+    Get full path to cache file.
+
+    Args:
+        name: Cache name
+
+    Returns:
+        Path to cache JSON file
+    """
+    cache_dir = get_cache_dir()
+    safe_name = sanitize_cache_name(name)
+    return cache_dir / f"{safe_name}.json"
+
+
+def ensure_cache_dir() -> Path:
+    """Ensure cache directory exists."""
+    cache_dir = get_cache_dir()
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    return cache_dir
+
+
+def validate_cache_name(name: str) -> Optional[str]:
+    """
+    Validate cache name and return error message if invalid.
+
+    Args:
+        name: Cache name to validate
+
+    Returns:
+        Error message if invalid, None if valid
+    """
+    if not name:
+        return "Cache name cannot be empty"
+    if len(name) > 100:
+        return "Cache name too long (max 100 characters)"
+    if name.startswith("."):
+        return "Cache name cannot start with '.'"
+    return None
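
These helpers decide where cache files live and what they are called. A quick sketch of the behaviour, importing from the private module purely for illustration (the environment value is an example):

    import os
    from openalex_local._cache.utils import get_cache_path, sanitize_cache_name

    sanitize_cache_name("my cache/name!")   # "my_cache_name" (trailing "_" stripped)
    get_cache_path("my cache/name!")        # ~/.openalex_local/caches/my_cache_name.json

    os.environ["OPENALEX_LOCAL_CACHE_DIR"] = "./data/oa_caches"  # example override
    get_cache_path("demo")                  # data/oa_caches/demo.json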

openalex_local/_cli/__init__.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python3
+"""Internal CLI modules."""
+
+from .cli import cli, main
+from .mcp import mcp, run_mcp_server
+
+__all__ = ["cli", "main", "mcp", "run_mcp_server"]
+
+# EOF