openalex-local 0.1.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openalex_local/__init__.py +54 -3
- openalex_local/__main__.py +6 -0
- openalex_local/_cache/__init__.py +45 -0
- openalex_local/_cache/core.py +298 -0
- openalex_local/_cache/export.py +100 -0
- openalex_local/_cache/models.py +17 -0
- openalex_local/_cache/utils.py +85 -0
- openalex_local/_cli/__init__.py +9 -0
- openalex_local/_cli/cli.py +409 -0
- openalex_local/_cli/cli_cache.py +220 -0
- openalex_local/_cli/mcp.py +210 -0
- openalex_local/_cli/mcp_server.py +235 -0
- openalex_local/_core/__init__.py +42 -0
- openalex_local/_core/api.py +376 -0
- openalex_local/_core/config.py +120 -0
- openalex_local/_core/db.py +214 -0
- openalex_local/_core/export.py +252 -0
- openalex_local/_core/fts.py +165 -0
- openalex_local/_core/models.py +432 -0
- openalex_local/_remote/__init__.py +34 -0
- openalex_local/_remote/base.py +256 -0
- openalex_local/_server/__init__.py +117 -0
- openalex_local/_server/routes.py +175 -0
- openalex_local/aio.py +259 -0
- openalex_local/cache.py +31 -0
- openalex_local/cli.py +8 -0
- openalex_local/jobs.py +169 -0
- openalex_local/remote.py +8 -0
- openalex_local/server.py +8 -0
- openalex_local-0.3.1.dist-info/METADATA +288 -0
- openalex_local-0.3.1.dist-info/RECORD +34 -0
- {openalex_local-0.1.0.dist-info → openalex_local-0.3.1.dist-info}/WHEEL +1 -1
- openalex_local-0.3.1.dist-info/entry_points.txt +2 -0
- openalex_local/config.py +0 -73
- openalex_local/models.py +0 -187
- openalex_local-0.1.0.dist-info/METADATA +0 -152
- openalex_local-0.1.0.dist-info/RECORD +0 -8
- openalex_local-0.1.0.dist-info/entry_points.txt +0 -2
- {openalex_local-0.1.0.dist-info → openalex_local-0.3.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""Export functionality for Work and SearchResult objects.
|
|
2
|
+
|
|
3
|
+
Supports multiple output formats:
|
|
4
|
+
- text: Human-readable formatted text
|
|
5
|
+
- json: JSON format with all fields
|
|
6
|
+
- bibtex: BibTeX bibliography format
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import json as _json
|
|
10
|
+
from pathlib import Path as _Path
|
|
11
|
+
from typing import TYPE_CHECKING, List, Optional, Union
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from .models import SearchResult, Work
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"save",
|
|
18
|
+
"export_text",
|
|
19
|
+
"export_json",
|
|
20
|
+
"export_bibtex",
|
|
21
|
+
"SUPPORTED_FORMATS",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
SUPPORTED_FORMATS = ["text", "json", "bibtex"]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def work_to_text(work: "Work", include_abstract: bool = False) -> str:
    """Render a single Work as a human-readable, multi-line text card.

    Args:
        work: Work object to render (read attribute-wise; never mutated).
        include_abstract: When True, append the abstract if one exists.

    Returns:
        Formatted text string, one field per line.
    """
    # Title line: "<title> (<year>)"; strip() drops the trailing space
    # when no year is available.
    year_tag = f"({work.year})" if work.year else ""
    parts = [f"{work.title or 'Untitled'} {year_tag}".strip()]

    # Author list is capped at five names; larger lists get an
    # "et al." suffix carrying the full count.
    if work.authors:
        shown = ", ".join(work.authors[:5])
        if len(work.authors) > 5:
            shown += f" et al. ({len(work.authors)} authors)"
        parts.append(f"Authors: {shown}")

    # Venue line: journal name plus volume/issue/pages when present.
    if work.source:
        venue = f"Journal: {work.source}"
        if work.volume:
            venue += f", {work.volume}"
        if work.issue:
            venue += f"({work.issue})"
        if work.pages:
            venue += f", {work.pages}"
        parts.append(venue)

    if work.doi:
        parts.append(f"DOI: {work.doi}")

    # OpenAlex ID is always shown — it is the primary key of the record.
    parts.append(f"OpenAlex ID: {work.openalex_id}")

    if work.cited_by_count is not None:
        parts.append(f"Citations: {work.cited_by_count}")

    # Show the OA URL when known, otherwise just acknowledge OA status.
    if work.is_oa:
        parts.append(f"Open Access: {work.oa_url or 'Yes'}")

    if include_abstract and work.abstract:
        parts.append(f"Abstract: {work.abstract}")

    return "\n".join(parts)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def export_text(
    works: List["Work"],
    include_abstract: bool = False,
    query: Optional[str] = None,
    total: Optional[int] = None,
    elapsed_ms: Optional[float] = None,
) -> str:
    """Export a list of works as a human-readable text report.

    Args:
        works: List of Work objects to render.
        include_abstract: Whether to include abstracts per work.
        query: Original search query shown in the header, if any.
        total: Total number of matches shown in the header, if known.
        elapsed_ms: Search time in milliseconds shown in the header, if known.

    Returns:
        Formatted text string with an optional header followed by one
        numbered entry per work.
    """
    out = []

    # Header — each metadata field is emitted only when supplied.
    if query is not None:
        out.append(f"Search: {query}")
    if total is not None:
        out.append(f"Found: {total:,} matches")
    if elapsed_ms is not None:
        out.append(f"Time: {elapsed_ms:.1f}ms")
    # Blank line, rule, blank line separating the header from the entries.
    out.extend(["", "=" * 60, ""])

    # One numbered card per work, each followed by a short divider.
    for index, item in enumerate(works, start=1):
        out.append(f"[{index}]")
        out.append(work_to_text(item, include_abstract=include_abstract))
        out.extend(["", "-" * 40, ""])

    return "\n".join(out)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def export_json(
    works: List["Work"],
    query: Optional[str] = None,
    total: Optional[int] = None,
    elapsed_ms: Optional[float] = None,
    indent: int = 2,
) -> str:
    """Export works as a JSON document.

    Args:
        works: List of Work objects; each is serialized via ``to_dict()``.
        query: Original search query (included only when provided).
        total: Total number of matches (included only when provided).
        elapsed_ms: Search time in milliseconds (included only when provided).
        indent: JSON indentation width.

    Returns:
        JSON string; non-ASCII characters are emitted verbatim
        (``ensure_ascii=False``).
    """
    payload = {"works": [item.to_dict() for item in works]}

    # Optional metadata keys are added only when the caller supplied them,
    # so the output never carries nulls for absent header fields.
    for key, value in (("query", query), ("total", total), ("elapsed_ms", elapsed_ms)):
        if value is not None:
            payload[key] = value

    return _json.dumps(payload, indent=indent, ensure_ascii=False)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def export_bibtex(works: List["Work"]) -> str:
    """Export works as a BibTeX bibliography.

    Args:
        works: List of Work objects; each must provide ``citation("bibtex")``.

    Returns:
        All BibTeX entries joined by blank lines (empty string for no works).
    """
    return "\n\n".join(item.citation("bibtex") for item in works)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def save(
    data: Union["Work", "SearchResult", List["Work"]],
    path: Union[str, _Path],
    format: str = "json",
    include_abstract: bool = True,
) -> str:
    """Save Work(s) or a SearchResult to a file in the chosen format.

    Args:
        data: Work, SearchResult, or list of Works to save.
        path: Output file path (parent directories are created as needed).
        format: Output format — one of "text", "json", "bibtex".
        include_abstract: Include abstracts when ``format="text"``.

    Returns:
        Path to the saved file, as a string.

    Raises:
        ValueError: If ``format`` is not one of SUPPORTED_FORMATS.
        TypeError: If ``data`` is not a Work, SearchResult, or list.

    Examples:
        >>> from openalex_local import search, save
        >>> results = search("machine learning", limit=10)
        >>> save(results, "results.json")
        >>> save(results, "results.bib", format="bibtex")
        >>> save(results, "results.txt", format="text")
    """
    # Imported lazily to avoid a circular import with .models.
    from .models import SearchResult, Work

    # Validate the format up front so we fail before touching the filesystem.
    if format not in SUPPORTED_FORMATS:
        raise ValueError(
            f"Unsupported format: {format}. "
            f"Supported formats: {', '.join(SUPPORTED_FORMATS)}"
        )

    target = _Path(path)

    # Normalize the input into a list of works plus optional header metadata.
    if isinstance(data, Work):
        works, query, total, elapsed_ms = [data], None, None, None
    elif isinstance(data, SearchResult):
        works, query, total, elapsed_ms = (
            data.works,
            data.query,
            data.total,
            data.elapsed_ms,
        )
    elif isinstance(data, list):
        works, query, total, elapsed_ms = data, None, len(data), None
    else:
        raise TypeError(f"Unsupported data type: {type(data)}")

    # Render the content in the requested format.
    if format == "text":
        content = export_text(
            works,
            include_abstract=include_abstract,
            query=query,
            total=total,
            elapsed_ms=elapsed_ms,
        )
    elif format == "json":
        content = export_json(
            works,
            query=query,
            total=total,
            elapsed_ms=elapsed_ms,
        )
    elif format == "bibtex":
        content = export_bibtex(works)
    else:
        # Defensive: unreachable given the SUPPORTED_FORMATS check above.
        raise ValueError(f"Unsupported format: {format}")

    # Write out, creating parent directories as needed.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(content, encoding="utf-8")

    return str(target)
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""Full-text search using FTS5."""
|
|
2
|
+
|
|
3
|
+
import re as _re
|
|
4
|
+
import time as _time
|
|
5
|
+
from typing import List, Optional
|
|
6
|
+
|
|
7
|
+
from .db import Database, get_db
|
|
8
|
+
from .models import SearchResult, Work
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"search",
|
|
12
|
+
"count",
|
|
13
|
+
"search_ids",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _sanitize_query(query: str) -> str:
|
|
18
|
+
"""
|
|
19
|
+
Sanitize query for FTS5.
|
|
20
|
+
|
|
21
|
+
Handles special characters that FTS5 interprets as operators.
|
|
22
|
+
"""
|
|
23
|
+
if query.startswith('"') and query.endswith('"'):
|
|
24
|
+
return query
|
|
25
|
+
|
|
26
|
+
has_hyphenated_word = _re.search(r"\w+-\w+", query)
|
|
27
|
+
has_special = _re.search(r"[/\\@#$%^&]", query)
|
|
28
|
+
|
|
29
|
+
if has_hyphenated_word or has_special:
|
|
30
|
+
words = query.split()
|
|
31
|
+
quoted = " ".join(f'"{w}"' for w in words)
|
|
32
|
+
return quoted
|
|
33
|
+
|
|
34
|
+
return query
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def search(
    query: str,
    limit: int = 20,
    offset: int = 0,
    db: Optional[Database] = None,
) -> SearchResult:
    """Full-text search across works.

    Uses the FTS5 index for fast searching across titles and abstracts.

    Args:
        query: Search query (supports FTS5 syntax like AND, OR, NOT, "phrases").
        limit: Maximum results to return.
        offset: Skip first N results (for pagination).
        db: Database connection (uses singleton if not provided).

    Returns:
        SearchResult with matching works, total match count, and elapsed time.

    Example:
        >>> results = search("machine learning neural networks")
        >>> print(f"Found {results.total} matches in {results.elapsed_ms:.1f}ms")
    """
    database = db if db is not None else get_db()

    # Timing covers sanitization plus both queries, not row conversion.
    started = _time.perf_counter()
    fts_query = _sanitize_query(query)

    # Total match count first, independent of pagination.
    header = database.fetchone(
        "SELECT COUNT(*) as total FROM works_fts WHERE works_fts MATCH ?",
        (fts_query,),
    )
    matched = header["total"] if header else 0

    # Fetch the requested page of matching works.
    rows = database.fetchall(
        """
        SELECT w.*
        FROM works_fts f
        JOIN works w ON f.rowid = w.rowid
        WHERE works_fts MATCH ?
        LIMIT ? OFFSET ?
        """,
        (fts_query, limit, offset),
    )

    duration_ms = (_time.perf_counter() - started) * 1000

    # Convert rows to Work objects; uses Database's private row converter,
    # presumably the package-internal convention — confirm against db.py.
    hits = [Work.from_db_row(database._row_to_dict(row)) for row in rows]

    return SearchResult(
        works=hits,
        total=matched,
        query=query,
        elapsed_ms=duration_ms,
    )
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def count(query: str, db: Optional[Database] = None) -> int:
    """Count matching works without fetching any result rows.

    Args:
        query: FTS5 search query.
        db: Database connection (uses singleton if not provided).

    Returns:
        Number of matching works (0 when nothing matches).
    """
    database = get_db() if db is None else db

    result = database.fetchone(
        "SELECT COUNT(*) as total FROM works_fts WHERE works_fts MATCH ?",
        (_sanitize_query(query),),
    )
    return result["total"] if result else 0
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def search_ids(
    query: str,
    limit: int = 1000,
    db: Optional[Database] = None,
) -> List[str]:
    """Search and return only OpenAlex IDs (cheaper than a full search).

    Args:
        query: FTS5 search query.
        limit: Maximum number of IDs to return.
        db: Database connection (uses singleton if not provided).

    Returns:
        List of matching OpenAlex IDs.
    """
    database = get_db() if db is None else db

    matches = database.fetchall(
        """
        SELECT w.openalex_id
        FROM works_fts f
        JOIN works w ON f.rowid = w.rowid
        WHERE works_fts MATCH ?
        LIMIT ?
        """,
        (_sanitize_query(query), limit),
    )

    return [match["openalex_id"] for match in matches]
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _search_with_db(db: Database, query: str, limit: int, offset: int) -> SearchResult:
    """Delegate to search() with an explicit connection (for thread-safe async)."""
    return search(query, limit=limit, offset=offset, db=db)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _count_with_db(db: Database, query: str) -> int:
    """Delegate to count() with an explicit connection (for thread-safe async)."""
    return count(query, db=db)
|