pyzotero 1.7.5__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyzotero/__init__.py +60 -0
- pyzotero/_client.py +1402 -0
- pyzotero/_decorators.py +195 -0
- pyzotero/_search.py +190 -0
- pyzotero/_upload.py +241 -0
- pyzotero/_utils.py +86 -0
- pyzotero/cli.py +420 -1
- pyzotero/errors.py +185 -0
- pyzotero/filetransport.py +2 -2
- pyzotero/semantic_scholar.py +441 -0
- pyzotero/zotero.py +62 -2029
- pyzotero/zotero_errors.py +53 -136
- {pyzotero-1.7.5.dist-info → pyzotero-1.8.0.dist-info}/METADATA +3 -3
- pyzotero-1.8.0.dist-info/RECORD +16 -0
- pyzotero-1.7.5.dist-info/RECORD +0 -9
- {pyzotero-1.7.5.dist-info → pyzotero-1.8.0.dist-info}/WHEEL +0 -0
- {pyzotero-1.7.5.dist-info → pyzotero-1.8.0.dist-info}/entry_points.txt +0 -0
pyzotero/filetransport.py
CHANGED
|
@@ -60,8 +60,8 @@ def is_absolute_url(self):
|
|
|
60
60
|
return not self.is_relative_url
|
|
61
61
|
|
|
62
62
|
|
|
63
|
-
httpx.URL.is_relative_url = property(is_relative_url)
|
|
64
|
-
httpx.URL.is_absolute_url = property(is_absolute_url)
|
|
63
|
+
httpx.URL.is_relative_url = property(is_relative_url)
|
|
64
|
+
httpx.URL.is_absolute_url = property(is_absolute_url)
|
|
65
65
|
|
|
66
66
|
|
|
67
67
|
class FileTransport(AsyncBaseTransport, BaseTransport):
|
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
"""Semantic Scholar API client for pyzotero.
|
|
2
|
+
|
|
3
|
+
This module provides functions to interact with the Semantic Scholar Graph API
|
|
4
|
+
for fetching paper metadata, citations, references, and recommendations.
|
|
5
|
+
|
|
6
|
+
API Documentation: https://api.semanticscholar.org/api-docs
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import httpx
|
|
10
|
+
from httpx import codes as http
|
|
11
|
+
|
|
12
|
+
BASE_URL = "https://api.semanticscholar.org/graph/v1"
|
|
13
|
+
RECOMMENDATIONS_URL = "https://api.semanticscholar.org/recommendations/v1"
|
|
14
|
+
|
|
15
|
+
# Fields to request from the Semantic Scholar API
|
|
16
|
+
DEFAULT_FIELDS = [
|
|
17
|
+
"paperId",
|
|
18
|
+
"externalIds",
|
|
19
|
+
"title",
|
|
20
|
+
"abstract",
|
|
21
|
+
"venue",
|
|
22
|
+
"year",
|
|
23
|
+
"referenceCount",
|
|
24
|
+
"citationCount",
|
|
25
|
+
"influentialCitationCount",
|
|
26
|
+
"isOpenAccess",
|
|
27
|
+
"openAccessPdf",
|
|
28
|
+
"authors",
|
|
29
|
+
"publicationTypes",
|
|
30
|
+
"publicationDate",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
# Timeout for API requests (seconds)
|
|
34
|
+
REQUEST_TIMEOUT = 30.0
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class SemanticScholarError(Exception):
    """Base exception for Semantic Scholar API errors.

    All exceptions raised by this module (RateLimitError,
    PaperNotFoundError) derive from this class, so callers can catch it
    to handle any Semantic Scholar failure.
    """
|
40
|
+
|
|
41
|
+
class RateLimitError(SemanticScholarError):
    """Raised when the API rate limit is exceeded (HTTP 429)."""

    def __init__(self, msg="Rate limit exceeded. Please wait and try again."):
        super().__init__(msg)
46
|
+
|
|
47
|
+
|
|
48
|
+
class PaperNotFoundError(SemanticScholarError):
    """Raised when a paper cannot be found (HTTP 404)."""

    def __init__(self, msg="Paper not found."):
        super().__init__(msg)
53
|
+
|
|
54
|
+
|
|
55
|
+
def _make_request(url, params=None):
    """Perform a GET request against the Semantic Scholar API.

    Args:
        url: Absolute URL of the endpoint to call
        params: Query parameters to append, if any

    Returns:
        The decoded JSON body as a dict

    Raises:
        RateLimitError: If rate limit is exceeded (HTTP 429)
        PaperNotFoundError: If paper is not found (HTTP 404)
        SemanticScholarError: For other API errors

    """
    # A short-lived client per call keeps this module stateless.
    with httpx.Client(timeout=REQUEST_TIMEOUT) as session:
        result = session.get(url, params=params)
        _check_response(result)
        return result.json()
76
|
+
|
|
77
|
+
|
|
78
|
+
def _check_response(response):
    """Translate HTTP error statuses into module-specific exceptions.

    Args:
        response: httpx Response object to inspect

    Raises:
        RateLimitError: If rate limit is exceeded (HTTP 429)
        PaperNotFoundError: If paper is not found (HTTP 404)
        SemanticScholarError: For any other non-200 status

    """
    status = response.status_code

    # Statuses with a dedicated exception type.
    dedicated = {
        http.TOO_MANY_REQUESTS: RateLimitError,
        http.NOT_FOUND: PaperNotFoundError,
    }
    if status in dedicated:
        raise dedicated[status]

    if status != http.OK:
        msg = f"Semantic Scholar API error: {status} - {response.text}"
        raise SemanticScholarError(msg)
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _format_paper_id(identifier, id_type=None):  # noqa: PLR0911
    """Normalise a paper identifier into the form the API expects.

    Semantic Scholar accepts several prefixed identifier schemes:

    - DOI: DOI:10.1234/example
    - arXiv: ARXIV:1234.5678
    - Semantic Scholar ID: used directly, no prefix
    - PMID: PMID:12345678
    - MAG: MAG:12345678
    - ACL: ACL:P19-1234
    - CorpusID: CorpusId:12345678

    Args:
        identifier: The paper identifier
        id_type: Optional type hint ("doi", "arxiv", "pmid", "mag", "acl", "corpus")

    Returns:
        Formatted identifier string for the API

    """
    if not identifier:
        return identifier

    candidate = identifier.strip()

    # Already carries a recognised scheme prefix (any case): leave untouched.
    if candidate.upper().startswith(
        ("DOI:", "ARXIV:", "PMID:", "MAG:", "ACL:", "CORPUSID:")
    ):
        return candidate

    # DOI supplied as a resolver URL (or lowercase scheme): strip and re-prefix.
    lowered = candidate.lower()
    for marker in ("https://doi.org/", "http://doi.org/", "doi:"):
        if lowered.startswith(marker):
            return f"DOI:{candidate[len(marker):]}"

    # An explicit type hint wins over heuristics.
    if id_type:
        hinted = {
            "doi": "DOI:",
            "arxiv": "ARXIV:",
            "pmid": "PMID:",
            "mag": "MAG:",
            "acl": "ACL:",
            "corpus": "CorpusId:",
        }.get(id_type.lower())
        if hinted:
            return f"{hinted}{candidate}"

    # Heuristic: bare DOIs look like "10.xxxx/yyyy".
    if candidate.startswith("10.") and "/" in candidate:
        return f"DOI:{candidate}"

    # Heuristic: modern arXiv IDs look like "YYMM.NNNNN".
    if "." in candidate and candidate.split(".")[0].isdigit():
        return f"ARXIV:{candidate}"

    # Otherwise assume it is already a native Semantic Scholar ID.
    return candidate
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _normalise_paper(paper_data):
    """Map a raw Semantic Scholar paper record onto a flat, stable schema.

    Args:
        paper_data: Raw paper dict as returned by the API (may be falsy)

    Returns:
        Normalised paper dict with consistent field names, or None when
        paper_data is falsy

    """
    if not paper_data:
        return None

    # Nested sub-objects may be absent or explicitly null.
    ids = paper_data.get("externalIds") or {}
    pdf = paper_data.get("openAccessPdf") or {}
    author_entries = [
        {"authorId": author.get("authorId"), "name": author.get("name")}
        for author in (paper_data.get("authors") or [])
    ]

    return {
        "paperId": paper_data.get("paperId"),
        "doi": ids.get("DOI"),
        "arxivId": ids.get("ArXiv"),
        "pmid": ids.get("PubMed"),
        "title": paper_data.get("title"),
        "abstract": paper_data.get("abstract"),
        "venue": paper_data.get("venue"),
        "year": paper_data.get("year"),
        "authors": author_entries,
        "citationCount": paper_data.get("citationCount"),
        "referenceCount": paper_data.get("referenceCount"),
        "influentialCitationCount": paper_data.get("influentialCitationCount"),
        "isOpenAccess": paper_data.get("isOpenAccess"),
        "openAccessPdfUrl": pdf.get("url"),
        "publicationTypes": paper_data.get("publicationTypes"),
        "publicationDate": paper_data.get("publicationDate"),
    }
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def get_paper(identifier, id_type=None):
    """Fetch metadata for a single paper.

    Args:
        identifier: Paper identifier (DOI, arXiv ID, S2 ID, etc.)
        id_type: Optional type hint for the identifier

    Returns:
        Normalised paper dict

    Raises:
        PaperNotFoundError: If paper is not found
        SemanticScholarError: For API errors

    """
    formatted = _format_paper_id(identifier, id_type)
    payload = _make_request(
        f"{BASE_URL}/paper/{formatted}",
        {"fields": ",".join(DEFAULT_FIELDS)},
    )
    return _normalise_paper(payload)
230
|
+
|
|
231
|
+
|
|
232
|
+
def get_citations(identifier, id_type=None, limit=100, offset=0):
    """Get papers that cite a given paper.

    Args:
        identifier: Paper identifier (DOI, arXiv ID, S2 ID, etc.)
        id_type: Optional type hint for the identifier
        limit: Maximum number of results (default 100, max 1000)
        offset: Offset for pagination

    Returns:
        Dict with 'total' count and 'papers' list

    Raises:
        PaperNotFoundError: If paper is not found
        SemanticScholarError: For API errors

    """
    paper_id = _format_paper_id(identifier, id_type)
    query = {
        "fields": ",".join(DEFAULT_FIELDS),
        "limit": min(limit, 1000),  # API caps page size at 1000
        "offset": offset,
    }
    payload = _make_request(f"{BASE_URL}/paper/{paper_id}/citations", query)

    # Response shape: {"data": [{"citingPaper": {...}}, ...], "offset": N, "next": N}
    papers = [
        _normalise_paper(entry["citingPaper"])
        for entry in payload.get("data", [])
        if entry.get("citingPaper")
    ]

    return {
        "total": len(papers),
        "offset": payload.get("offset", 0),
        "papers": papers,
    }
271
|
+
|
|
272
|
+
|
|
273
|
+
def get_references(identifier, id_type=None, limit=100, offset=0):
    """Get papers that a given paper references.

    Args:
        identifier: Paper identifier (DOI, arXiv ID, S2 ID, etc.)
        id_type: Optional type hint for the identifier
        limit: Maximum number of results (default 100, max 1000)
        offset: Offset for pagination

    Returns:
        Dict with 'total' count and 'papers' list

    Raises:
        PaperNotFoundError: If paper is not found
        SemanticScholarError: For API errors

    """
    paper_id = _format_paper_id(identifier, id_type)
    query = {
        "fields": ",".join(DEFAULT_FIELDS),
        "limit": min(limit, 1000),  # API caps page size at 1000
        "offset": offset,
    }
    payload = _make_request(f"{BASE_URL}/paper/{paper_id}/references", query)

    # Response shape: {"data": [{"citedPaper": {...}}, ...], "offset": N, "next": N}
    papers = [
        _normalise_paper(entry["citedPaper"])
        for entry in payload.get("data", [])
        if entry.get("citedPaper")
    ]

    return {
        "total": len(papers),
        "offset": payload.get("offset", 0),
        "papers": papers,
    }
312
|
+
|
|
313
|
+
|
|
314
|
+
def get_recommendations(identifier, id_type=None, limit=100):
    """Get recommended papers based on a seed paper.

    Uses Semantic Scholar's recommendation API which returns papers
    similar to the input based on SPECTER2 embeddings.

    Args:
        identifier: Paper identifier (DOI, arXiv ID, S2 ID, etc.)
        id_type: Optional type hint for the identifier
        limit: Maximum number of recommendations (default 100, max 500)

    Returns:
        Dict with 'papers' list of recommended papers

    Raises:
        PaperNotFoundError: If paper is not found or has no S2 paper ID
        SemanticScholarError: For API errors

    """
    # First, resolve the identifier to a canonical Semantic Scholar ID.
    paper = get_paper(identifier, id_type)

    # get_paper returns None when the API hands back an empty payload;
    # treat that as "not found" instead of crashing on None.get(...).
    paper_id = (paper or {}).get("paperId")
    if not paper_id:
        raise PaperNotFoundError

    url = f"{RECOMMENDATIONS_URL}/papers"
    params = {
        "fields": ",".join(DEFAULT_FIELDS),
        "limit": min(limit, 500),  # recommendations endpoint caps at 500
    }

    # The recommendations endpoint takes seed paper IDs via a POST body.
    with httpx.Client(timeout=REQUEST_TIMEOUT) as client:
        response = client.post(
            url,
            params=params,
            json={"positivePaperIds": [paper_id]},
        )
        _check_response(response)
        data = response.json()

    papers = [_normalise_paper(p) for p in data.get("recommendedPapers", [])]

    return {"papers": papers}
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def search_papers(
    query,
    limit=100,
    offset=0,
    year=None,
    open_access_only=False,
    sort=None,
    min_citations=None,
):
    """Search for papers by keyword query.

    Args:
        query: Search query string
        limit: Maximum number of results (default 100, max 100)
        offset: Offset for pagination
        year: Optional year filter (e.g., "2020", "2018-2022", "2020-")
        open_access_only: If True, only return open access papers
        sort: Sort order - "citationCount" (descending) or "year" (descending)
        min_citations: Minimum citation count filter (applied client-side)

    Returns:
        Dict with 'total' count, 'offset', and 'papers' list

    Raises:
        SemanticScholarError: For API errors

    """
    request_params = {
        "query": query,
        "fields": ",".join(DEFAULT_FIELDS),
        "limit": min(limit, 100),  # API max is 100 per request
        "offset": offset,
    }

    if year:
        request_params["year"] = year

    if open_access_only:
        # Presence of the (empty) openAccessPdf parameter enables the filter.
        request_params["openAccessPdf"] = ""

    if sort:
        # Accepted aliases map onto the API's descending sort keys.
        sort_value = {
            "citationCount": "citationCount:desc",
            "citations": "citationCount:desc",
            "year": "publicationDate:desc",
            "date": "publicationDate:desc",
        }.get(sort)
        if sort_value is not None:
            request_params["sort"] = sort_value

    payload = _make_request(f"{BASE_URL}/paper/search", request_params)
    results = [_normalise_paper(p) for p in payload.get("data", [])]

    # The search endpoint cannot filter on citation count, so do it here.
    if min_citations is not None and min_citations > 0:
        results = [
            r for r in results if (r.get("citationCount") or 0) >= min_citations
        ]

    return {
        "total": payload.get("total", len(results)),
        "offset": payload.get("offset", 0),
        "papers": results,
    }
426
|
+
|
|
427
|
+
|
|
428
|
+
def filter_by_citations(papers, min_citations):
    """Return only the papers with at least *min_citations* citations.

    Args:
        papers: List of normalised paper dicts
        min_citations: Minimum citation count; None or <= 0 disables filtering

    Returns:
        Filtered list of papers (the input list itself when no filter applies)

    """
    if min_citations is None or min_citations <= 0:
        return papers
    # A missing or null citationCount counts as zero.
    return [
        paper
        for paper in papers
        if (paper.get("citationCount") or 0) >= min_citations
    ]