pyzotero 1.7.5__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pyzotero/filetransport.py CHANGED
@@ -60,8 +60,8 @@ def is_absolute_url(self):
     return not self.is_relative_url
 
 
-httpx.URL.is_relative_url = property(is_relative_url)  # type: ignore
-httpx.URL.is_absolute_url = property(is_absolute_url)  # type: ignore
+httpx.URL.is_relative_url = property(is_relative_url)
+httpx.URL.is_absolute_url = property(is_absolute_url)
 
 
 class FileTransport(AsyncBaseTransport, BaseTransport):
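
For orientation: the two assignments above attach the module-level is_relative_url / is_absolute_url functions to httpx.URL as properties, presumably so the rest of the library can keep relying on those attribute names; the 1.8.0 change only drops the trailing "# type: ignore" comments. A minimal, hypothetical sketch of the patched attributes in use (not code from the wheel), assuming pyzotero and httpx are installed:

    import httpx

    from pyzotero import filetransport  # noqa: F401 -- importing runs the monkey-patch

    api_url = httpx.URL("https://api.zotero.org/users/12345/items")
    relative = httpx.URL("items?format=json")

    print(api_url.is_absolute_url)   # expected: True for a fully qualified URL
    print(relative.is_relative_url)  # expected: True for a relative reference
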
@@ -0,0 +1,441 @@
+"""Semantic Scholar API client for pyzotero.
+
+This module provides functions to interact with the Semantic Scholar Graph API
+for fetching paper metadata, citations, references, and recommendations.
+
+API Documentation: https://api.semanticscholar.org/api-docs
+"""
+
+import httpx
+from httpx import codes as http
+
+BASE_URL = "https://api.semanticscholar.org/graph/v1"
+RECOMMENDATIONS_URL = "https://api.semanticscholar.org/recommendations/v1"
+
+# Fields to request from the Semantic Scholar API
+DEFAULT_FIELDS = [
+    "paperId",
+    "externalIds",
+    "title",
+    "abstract",
+    "venue",
+    "year",
+    "referenceCount",
+    "citationCount",
+    "influentialCitationCount",
+    "isOpenAccess",
+    "openAccessPdf",
+    "authors",
+    "publicationTypes",
+    "publicationDate",
+]
+
+# Timeout for API requests (seconds)
+REQUEST_TIMEOUT = 30.0
+
+
+class SemanticScholarError(Exception):
+    """Base exception for Semantic Scholar API errors."""
+
+
+class RateLimitError(SemanticScholarError):
+    """Raised when API rate limit is exceeded."""
+
+    def __init__(self, msg="Rate limit exceeded. Please wait and try again."):
+        super().__init__(msg)
+
+
+class PaperNotFoundError(SemanticScholarError):
+    """Raised when a paper is not found."""
+
+    def __init__(self, msg="Paper not found."):
+        super().__init__(msg)
+
+
+def _make_request(url, params=None):
+    """Make an HTTP GET request to the Semantic Scholar API.
+
+    Args:
+        url: The full URL to request
+        params: Optional dict of query parameters
+
+    Returns:
+        The JSON response as a dict
+
+    Raises:
+        RateLimitError: If rate limit is exceeded (HTTP 429)
+        PaperNotFoundError: If paper is not found (HTTP 404)
+        SemanticScholarError: For other API errors
+
+    """
+    with httpx.Client(timeout=REQUEST_TIMEOUT) as client:
+        response = client.get(url, params=params)
+
+    _check_response(response)
+    return response.json()
+
+
+def _check_response(response):
+    """Check HTTP response and raise appropriate exceptions.
+
+    Args:
+        response: httpx Response object
+
+    Raises:
+        RateLimitError: If rate limit is exceeded (HTTP 429)
+        PaperNotFoundError: If paper is not found (HTTP 404)
+        SemanticScholarError: For other API errors
+
+    """
+    if response.status_code == http.TOO_MANY_REQUESTS:
+        raise RateLimitError
+
+    if response.status_code == http.NOT_FOUND:
+        raise PaperNotFoundError
+
+    if response.status_code != http.OK:
+        msg = f"Semantic Scholar API error: {response.status_code} - {response.text}"
+        raise SemanticScholarError(msg)
+
+
+def _format_paper_id(identifier, id_type=None):  # noqa: PLR0911
+    """Format a paper identifier for the Semantic Scholar API.
+
+    Semantic Scholar accepts various identifier formats:
+    - DOI: DOI:10.1234/example
+    - arXiv: ARXIV:1234.5678
+    - Semantic Scholar ID: direct use
+    - PMID: PMID:12345678
+    - MAG: MAG:12345678
+    - ACL: ACL:P19-1234
+    - CorpusID: CorpusId:12345678
+
+    Args:
+        identifier: The paper identifier
+        id_type: Optional type hint ("doi", "arxiv", "pmid", "mag", "acl", "corpus")
+
+    Returns:
+        Formatted identifier string for the API
+
+    """
+    if not identifier:
+        return identifier
+
+    identifier = identifier.strip()
+
+    # If already prefixed, return as-is
+    known_prefixes = ["DOI:", "ARXIV:", "PMID:", "MAG:", "ACL:", "CorpusId:"]
+    for prefix in known_prefixes:
+        if identifier.upper().startswith(prefix.upper()):
+            return identifier
+
+    # Strip common DOI URL prefixes
+    doi_prefixes = ["https://doi.org/", "http://doi.org/", "doi:"]
+    for prefix in doi_prefixes:
+        if identifier.lower().startswith(prefix.lower()):
+            identifier = identifier[len(prefix) :]
+            return f"DOI:{identifier}"
+
+    # If type hint provided, add appropriate prefix
+    if id_type:
+        type_map = {
+            "doi": "DOI:",
+            "arxiv": "ARXIV:",
+            "pmid": "PMID:",
+            "mag": "MAG:",
+            "acl": "ACL:",
+            "corpus": "CorpusId:",
+        }
+        prefix = type_map.get(id_type.lower())
+        if prefix:
+            return f"{prefix}{identifier}"
+
+    # Heuristic detection
+    # DOIs typically contain a slash and start with 10.
+    if "/" in identifier and identifier.startswith("10."):
+        return f"DOI:{identifier}"
+
+    # arXiv IDs have a specific format (YYMM.NNNNN or category/YYMMNNN)
+    if "." in identifier and identifier.split(".")[0].isdigit():
+        return f"ARXIV:{identifier}"
+
+    # If all else fails, assume it's a Semantic Scholar ID
+    return identifier
+
+
+def _normalise_paper(paper_data):
+    """Normalise paper data from Semantic Scholar to a consistent format.
+
+    Args:
+        paper_data: Raw paper data from the API
+
+    Returns:
+        Normalised paper dict with consistent field names
+
+    """
+    if not paper_data:
+        return None
+
+    external_ids = paper_data.get("externalIds") or {}
+    authors = paper_data.get("authors") or []
+    open_access_pdf = paper_data.get("openAccessPdf") or {}
+
+    return {
+        "paperId": paper_data.get("paperId"),
+        "doi": external_ids.get("DOI"),
+        "arxivId": external_ids.get("ArXiv"),
+        "pmid": external_ids.get("PubMed"),
+        "title": paper_data.get("title"),
+        "abstract": paper_data.get("abstract"),
+        "venue": paper_data.get("venue"),
+        "year": paper_data.get("year"),
+        "authors": [
+            {
+                "authorId": a.get("authorId"),
+                "name": a.get("name"),
+            }
+            for a in authors
+        ],
+        "citationCount": paper_data.get("citationCount"),
+        "referenceCount": paper_data.get("referenceCount"),
+        "influentialCitationCount": paper_data.get("influentialCitationCount"),
+        "isOpenAccess": paper_data.get("isOpenAccess"),
+        "openAccessPdfUrl": open_access_pdf.get("url"),
+        "publicationTypes": paper_data.get("publicationTypes"),
+        "publicationDate": paper_data.get("publicationDate"),
+    }
+
+
+def get_paper(identifier, id_type=None):
+    """Get details for a single paper.
+
+    Args:
+        identifier: Paper identifier (DOI, arXiv ID, S2 ID, etc.)
+        id_type: Optional type hint for the identifier
+
+    Returns:
+        Normalised paper dict
+
+    Raises:
+        PaperNotFoundError: If paper is not found
+        SemanticScholarError: For API errors
+
+    """
+    paper_id = _format_paper_id(identifier, id_type)
+    url = f"{BASE_URL}/paper/{paper_id}"
+    params = {"fields": ",".join(DEFAULT_FIELDS)}
+
+    data = _make_request(url, params)
+    return _normalise_paper(data)
+
+
+def get_citations(identifier, id_type=None, limit=100, offset=0):
+    """Get papers that cite a given paper.
+
+    Args:
+        identifier: Paper identifier (DOI, arXiv ID, S2 ID, etc.)
+        id_type: Optional type hint for the identifier
+        limit: Maximum number of results (default 100, max 1000)
+        offset: Offset for pagination
+
+    Returns:
+        Dict with 'total' count and 'papers' list
+
+    Raises:
+        PaperNotFoundError: If paper is not found
+        SemanticScholarError: For API errors
+
+    """
+    paper_id = _format_paper_id(identifier, id_type)
+    url = f"{BASE_URL}/paper/{paper_id}/citations"
+    params = {
+        "fields": ",".join(DEFAULT_FIELDS),
+        "limit": min(limit, 1000),
+        "offset": offset,
+    }
+
+    data = _make_request(url, params)
+
+    # Citations API returns {"data": [...], "offset": N, "next": N}
+    papers = []
+    for item in data.get("data", []):
+        citing_paper = item.get("citingPaper")
+        if citing_paper:
+            papers.append(_normalise_paper(citing_paper))
+
+    return {
+        "total": len(papers),
+        "offset": data.get("offset", 0),
+        "papers": papers,
+    }
+
+
+def get_references(identifier, id_type=None, limit=100, offset=0):
+    """Get papers that a given paper references.
+
+    Args:
+        identifier: Paper identifier (DOI, arXiv ID, S2 ID, etc.)
+        id_type: Optional type hint for the identifier
+        limit: Maximum number of results (default 100, max 1000)
+        offset: Offset for pagination
+
+    Returns:
+        Dict with 'total' count and 'papers' list
+
+    Raises:
+        PaperNotFoundError: If paper is not found
+        SemanticScholarError: For API errors
+
+    """
+    paper_id = _format_paper_id(identifier, id_type)
+    url = f"{BASE_URL}/paper/{paper_id}/references"
+    params = {
+        "fields": ",".join(DEFAULT_FIELDS),
+        "limit": min(limit, 1000),
+        "offset": offset,
+    }
+
+    data = _make_request(url, params)
+
+    # References API returns {"data": [...], "offset": N, "next": N}
+    papers = []
+    for item in data.get("data", []):
+        cited_paper = item.get("citedPaper")
+        if cited_paper:
+            papers.append(_normalise_paper(cited_paper))
+
+    return {
+        "total": len(papers),
+        "offset": data.get("offset", 0),
+        "papers": papers,
+    }
+
+
+def get_recommendations(identifier, id_type=None, limit=100):
+    """Get recommended papers based on a seed paper.
+
+    Uses Semantic Scholar's recommendation API which returns papers
+    similar to the input based on SPECTER2 embeddings.
+
+    Args:
+        identifier: Paper identifier (DOI, arXiv ID, S2 ID, etc.)
+        id_type: Optional type hint for the identifier
+        limit: Maximum number of recommendations (default 100, max 500)
+
+    Returns:
+        Dict with 'papers' list of recommended papers
+
+    Raises:
+        PaperNotFoundError: If paper is not found
+        SemanticScholarError: For API errors
+
+    """
+    # First, get the paper to obtain its Semantic Scholar ID
+    paper = get_paper(identifier, id_type)
+    paper_id = paper.get("paperId")
+
+    if not paper_id:
+        raise PaperNotFoundError
+
+    url = f"{RECOMMENDATIONS_URL}/papers"
+    params = {
+        "fields": ",".join(DEFAULT_FIELDS),
+        "limit": min(limit, 500),
+    }
+
+    # POST request with paper IDs in body
+    with httpx.Client(timeout=REQUEST_TIMEOUT) as client:
+        response = client.post(
+            url,
+            params=params,
+            json={"positivePaperIds": [paper_id]},
+        )
+        _check_response(response)
+        data = response.json()
+
+    papers = [_normalise_paper(p) for p in data.get("recommendedPapers", [])]
+
+    return {"papers": papers}
+
+
+def search_papers(
+    query,
+    limit=100,
+    offset=0,
+    year=None,
+    open_access_only=False,
+    sort=None,
+    min_citations=None,
+):
+    """Search for papers by keyword query.
+
+    Args:
+        query: Search query string
+        limit: Maximum number of results (default 100, max 100)
+        offset: Offset for pagination
+        year: Optional year filter (e.g., "2020", "2018-2022", "2020-")
+        open_access_only: If True, only return open access papers
+        sort: Sort order - "citationCount" (descending) or "year" (descending)
+        min_citations: Minimum citation count filter (applied client-side)
+
+    Returns:
+        Dict with 'total' count, 'offset', and 'papers' list
+
+    Raises:
+        SemanticScholarError: For API errors
+
+    """
+    url = f"{BASE_URL}/paper/search"
+    params = {
+        "query": query,
+        "fields": ",".join(DEFAULT_FIELDS),
+        "limit": min(limit, 100),  # API max is 100 per request
+        "offset": offset,
+    }
+
+    if year:
+        params["year"] = year
+
+    if open_access_only:
+        params["openAccessPdf"] = ""
+
+    if sort:
+        # Semantic Scholar supports sorting by citationCount:desc or publicationDate:desc
+        sort_map = {
+            "citationCount": "citationCount:desc",
+            "citations": "citationCount:desc",
+            "year": "publicationDate:desc",
+            "date": "publicationDate:desc",
+        }
+        if sort in sort_map:
+            params["sort"] = sort_map[sort]
+
+    data = _make_request(url, params)
+
+    papers = [_normalise_paper(p) for p in data.get("data", [])]
+
+    # Apply client-side citation filter if specified
+    if min_citations is not None and min_citations > 0:
+        papers = [p for p in papers if (p.get("citationCount") or 0) >= min_citations]
+
+    return {
+        "total": data.get("total", len(papers)),
+        "offset": data.get("offset", 0),
+        "papers": papers,
+    }
+
+
+def filter_by_citations(papers, min_citations):
+    """Filter a list of papers by minimum citation count.
+
+    Args:
+        papers: List of normalised paper dicts
+        min_citations: Minimum citation count
+
+    Returns:
+        Filtered list of papers
+
+    """
+    if min_citations is None or min_citations <= 0:
+        return papers
+    return [p for p in papers if (p.get("citationCount") or 0) >= min_citations]
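
To make the new module's surface concrete, here is a hedged usage sketch. It is an editorial illustration rather than code from the wheel: the import path is a guess (this excerpt never names the new file), and the DOI and query strings are placeholders.

    # Hypothetical usage -- the module name "semantic_scholar" is assumed.
    from pyzotero import semantic_scholar as s2

    # Identifier handling follows _format_paper_id's documented heuristics:
    #   "10.1038/nature14539"                  -> "DOI:10.1038/nature14539"
    #   "https://doi.org/10.1038/nature14539"  -> "DOI:10.1038/nature14539"
    #   "1706.03762"                           -> "ARXIV:1706.03762"
    #   "ARXIV:1706.03762"                     -> returned unchanged

    paper = s2.get_paper("10.1038/nature14539", id_type="doi")
    print(paper["title"], paper["citationCount"])

    # First page of papers citing the seed paper
    citing = s2.get_citations("10.1038/nature14539", limit=100)
    print(citing["total"], "citing papers returned")

    # Keyword search: open-access papers from 2020 onwards, then filtered
    # client-side to those with at least 50 citations
    results = s2.search_papers(
        "large language models",
        limit=50,
        year="2020-",
        open_access_only=True,
        min_citations=50,
    )
    for p in results["papers"][:5]:
        print(p["year"], p["title"], p["openAccessPdfUrl"])

Failures surface as RateLimitError, PaperNotFoundError, or the base SemanticScholarError, so callers can wrap the calls above in a try/except on those classes.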