@fbraza/pi-cite 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,40 +0,0 @@
1
- # Sci-Hub PDF Resolver — Routine Quick-Reference
2
-
3
- Resolves DOIs to direct PDF URLs via Sci-Hub mirrors. **Always check institutional access and open-access sources first** (PubMed Central, publisher OA). Use Sci-Hub only as a last resort.
4
-
5
- **Script:** `scripts/scihub_pdf_resolver.py` — zero-dependency Python script.
6
-
7
- ## CLI Usage
8
-
9
- ```bash
10
- python scripts/scihub_pdf_resolver.py "10.1038/s41586-024-07000-0"
11
- ```
12
-
13
- ## Output Codes
14
-
15
- | Output | Meaning |
16
- |---|---|
17
- | Prints a URL | Direct PDF link, ready to download |
18
- | `NOT_FOUND` | Sci-Hub does not have this paper. Check for `OA_LINK <url>` for open-access alternatives. |
19
- | `MIRROR_ERROR` | Sci-Hub mirrors could not be reached reliably |
20
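- | `INVALID_INPUT` | The DOI is empty once whitespace and any `doi:` / `https://doi.org/` prefix are stripped (no further validation is performed) |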
21
-
22
- ## Exit Codes
23
-
24
- `0` = found, `1` = not found, `2` = mirror error, `3` = invalid input.
25
-
26
- ## Python API
27
-
28
- ```python
29
- from scripts.scihub_pdf_resolver import resolve_pdf
30
-
31
- status, url = resolve_pdf("10.1038/s41586-024-07000-0")
32
- if status == "FOUND":
33
- print(f"PDF available at: {url}")
34
- elif status == "NOT_FOUND" and url:
35
- print(f"Open-access link: {url}")
36
- ```
37
-
38
- ## Mirror Configuration
39
-
40
- Set the `SCIHUB_MIRRORS` environment variable (comma-separated URLs, each including the scheme) to override the defaults. Defaults: `https://sci-hub.st`, `https://sci-hub.ru`, `https://sci-hub.se`.
@@ -1,50 +0,0 @@
1
- # Semantic Scholar — Routine Quick-Reference
2
-
3
- Quick-start for paper search, paper lookup, and author search. For citation network analysis or bulk queries → consult the full API documentation.
4
-
5
- ## Paper Search
6
-
7
- ```
8
- GET https://api.semanticscholar.org/graph/v1/paper/search
9
- ```
10
-
11
- **Parameters:**
12
- | Parameter | Value |
13
- |---|---|
14
- | `query` | Search terms |
15
- | `limit` | Max results (default 10, max 100) |
16
- | `offset` | Pagination offset |
17
- | `fields` | Comma-separated fields to return |
18
- | `year` | `<YYYY>` or `<YYYY-YYYY>` range |
19
- | `fieldsOfStudy` | Field of study filter |
20
-
21
- **Useful fields:** `paperId`, `title`, `abstract`, `year`, `referenceCount`, `citationCount`, `authors`, `journal`, `publicationTypes`, `tldr`, `openAccessPdf`, `externalIds`
22
-
23
- **Example:**
24
- ```
25
- https://api.semanticscholar.org/graph/v1/paper/search?query=CRISPR+off-target&limit=20&fields=title,abstract,year,citationCount,openAccessPdf
26
- ```
27
-
28
- ## Paper Details (by ID)
29
-
30
- ```
31
- GET https://api.semanticscholar.org/graph/v1/paper/{paper_id}
32
- ```
33
-
34
- `paper_id` accepts: S2 ID, DOI (`DOI:10.xxx`), PMID (`PMID:12345`), ArXiv ID.
35
-
36
- **Example:**
37
- ```
38
- https://api.semanticscholar.org/graph/v1/paper/DOI:10.1038/s41586-024-07000-0?fields=title,abstract,year,citationCount,references,citations
39
- ```
40
-
41
- ## Author Search
42
-
43
- ```
44
- GET https://api.semanticscholar.org/graph/v1/author/search?query=<name>
45
- GET https://api.semanticscholar.org/graph/v1/author/{author_id}/papers
46
- ```
47
-
48
- ## Rate Limits
49
-
50
- 100 requests / 5 min (unauthenticated). Higher limits available with an API key.
@@ -1,289 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Resolve a DOI to a direct PDF URL through Sci-Hub.
3
-
4
- Zero dependencies. Python standard library only.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
- import base64
10
- import hashlib
11
- import http.client
12
- import http.cookiejar
13
- import json
14
- import os
15
- import re
16
- import sys
17
- from typing import Iterable
18
- from urllib.error import HTTPError, URLError
19
- from urllib.parse import quote, urljoin, urlsplit, urlunsplit
20
- from urllib.request import (
21
- HTTPCookieProcessor,
22
- HTTPRedirectHandler,
23
- Request,
24
- build_opener,
25
- )
26
-
27
# Timeout handed to every opener.open() call made through Browser.open().
TIMEOUT = 20

# Status strings returned as the first element of resolve_pdf()'s result.
STATUS_FOUND = "FOUND"
STATUS_NOT_FOUND = "NOT_FOUND"
STATUS_MIRROR_ERROR = "MIRROR_ERROR"
STATUS_INVALID_INPUT = "INVALID_INPUT"

# Mirrors used, in this order, when SCIHUB_MIRRORS is unset or blank.
DEFAULT_MIRRORS = (
    "https://sci-hub.st",
    "https://sci-hub.ru",
    "https://sci-hub.se",
)

# Browser-like User-Agent value sent with every request (see _headers()).
UA = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/133.0.0.0 Safari/537.36"
)

# Patterns applied in order by _iter_pdf_candidates(); each has one capture
# group holding a candidate URL:
#   1. the src/data attribute of an <iframe>, <embed> or <object> tag;
#   2. any quoted absolute or protocol-relative URL containing ".pdf" or "/pdf".
PDF_PATTERNS = (
    re.compile(r'<(?:iframe|embed|object)[^>]+(?:src|data)=["\']([^"\']+)["\']', re.I),
    re.compile(r'["\']((?:https?:)?//[^"\']+?(?:\.pdf|/pdf)[^"\']*)["\']', re.I),
)

# Captures the href of the first <a> found inside a <block-rounded> element
# whose class attribute contains the word "openaccess".
OA_HINT_PATTERN = re.compile(
    r'<block-rounded[^>]+class\s*=\s*["\'][^"\']*\bopenaccess\b[^"\']*["\'][^>]*>(?:(?!</block-rounded>).)*?<a[^>]+href\s*=\s*["\']([^"\']+)["\']',
    re.I | re.S,
)
50
-
51
-
52
class Browser:
    """Small HTTP client whose requests share one cookie jar.

    Every instance builds its own opener with a fresh cookie jar and a
    redirect handler, so cookies received through one call to ``open`` are
    available to later calls on the same instance.
    """

    def __init__(self) -> None:
        cookie_handler = HTTPCookieProcessor(http.cookiejar.CookieJar())
        self.opener = build_opener(cookie_handler, HTTPRedirectHandler())

    def open(
        self,
        url: str,
        *,
        data: bytes | None = None,
        headers: dict[str, str] | None = None,
    ) -> http.client.HTTPResponse:
        """Open *url* through the shared opener and return the response.

        *data*, when given, is passed to the request as its body; *headers*
        defaults to no extra headers. The module-level ``TIMEOUT`` applies.
        """
        request = Request(url, data=data, headers=headers if headers else {})
        return self.opener.open(request, timeout=TIMEOUT)
66
-
67
-
68
def _headers(extra: dict[str, str] | None = None) -> dict[str, str]:
    """Return the default request headers, overridden by *extra* if given.

    A new dict is built on every call, so callers may mutate the result.
    """
    merged = {
        "User-Agent": UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    # Entries in *extra* win over the defaults that share their key.
    merged.update(extra or {})
    return merged
77
-
78
-
79
def _canonicalize(url: str) -> str:
    """Return *url* trimmed, with ``\\/`` unescaped and the fragment removed."""
    unescaped = url.strip().replace("\\/", "/")
    # Scheme, host, path and query are kept as parsed; only the fragment goes.
    return urlunsplit(urlsplit(unescaped)._replace(fragment=""))
82
-
83
-
84
def _normalize_doi(raw: str) -> str:
    """Strip whitespace and a leading ``doi:`` / doi.org URL prefix from *raw*.

    The prefixes are removed case-insensitively, ``doi:`` first and the
    ``http(s)://(dx.)doi.org/`` form second. The remainder is not validated.
    """
    cleaned = raw.strip()
    for prefix in (r"^(?:doi:\s*)", r"^https?://(?:dx\.)?doi\.org/"):
        cleaned = re.sub(prefix, "", cleaned, flags=re.I)
    return cleaned.strip()
89
-
90
-
91
def _extract_title(html: str) -> str:
    """Return the text of the first ``<title>`` element, or "" if there is none.

    Runs of whitespace (including newlines) inside the title collapse to
    single spaces, and leading/trailing whitespace is dropped.
    """
    found = re.search(r"<title[^>]*>(.*?)</title>", html, re.I | re.S)
    return " ".join(found.group(1).split()) if found else ""
96
-
97
-
98
def _iter_pdf_candidates(html: str, page_url: str) -> Iterable[str]:
    """Yield unique, absolute candidate PDF URLs found in *html*.

    Candidates come from each pattern in ``PDF_PATTERNS`` in turn, in document
    order within a pattern. Protocol-relative URLs (``//host/...``) are given
    the ``https`` scheme; everything else is resolved against *page_url*.
    Each URL is canonicalized and yielded at most once.

    Candidates that cannot be parsed as URLs are skipped, so a single broken
    link on the page does not hide the remaining candidates.
    """
    seen: set[str] = set()
    for pattern in PDF_PATTERNS:
        for raw in pattern.findall(html):
            candidate = raw.strip()
            if not candidate:
                continue
            try:
                if candidate.startswith("//"):
                    candidate = f"https:{candidate}"
                else:
                    candidate = urljoin(page_url, candidate)
                candidate = _canonicalize(candidate)
            except ValueError:
                # urljoin/urlsplit reject malformed hosts (e.g. an unbalanced
                # "[" in the netloc); such a link can never be fetched.
                continue
            if candidate in seen:
                continue
            seen.add(candidate)
            yield candidate
114
-
115
-
116
def _has_altcha(html: str) -> bool:
    """Tell whether *html* references a ``/captcha/challenge/<digits>`` URL."""
    challenge_ref = re.search(r"/captcha/challenge/\d+", html)
    return challenge_ref is not None
118
-
119
-
120
def _hexdigest(data: str, algorithm: str) -> str:
    """Return the hex digest of UTF-8 encoded *data* under *algorithm*.

    *algorithm* is trimmed, lower-cased and stripped of hyphens before being
    handed to ``hashlib.new``, so ``"SHA-256"`` selects ``sha256``.
    """
    hash_name = algorithm.strip().lower().replace("-", "")
    return hashlib.new(hash_name, data.encode("utf-8")).hexdigest()
124
-
125
-
126
def _solve_altcha(browser: Browser, page_url: str, html: str) -> bool:
    """Solve the proof-of-work captcha referenced by *html*.

    The challenge id is read from the ``/captcha/challenge/<id>`` reference in
    *html*. The challenge JSON is fetched from the same host as *page_url*,
    the number whose ``salt + number`` digest equals the target is searched
    for, and the base64-encoded solution is posted to
    ``/captcha/solution/<id>``.

    Args:
        browser: Client used for both requests, so cookies are shared.
        page_url: URL of the page that showed the captcha; used to build the
            endpoint URLs and sent as ``Referer``.
        html: Body of that page.

    Returns:
        True only when the server's JSON reply has a truthy ``"success"``
        entry. Any network error, malformed reply, unsupported hash algorithm
        or unsolved challenge yields False; this function does not raise for
        those cases.
    """
    # The search range comes from the remote server; cap it so a hostile or
    # broken mirror cannot keep this process hashing indefinitely.
    max_attempts = 5_000_000
    # http.client.HTTPException covers truncated or garbled responses, which
    # urllib does not wrap in URLError/OSError.
    network_errors = (HTTPError, URLError, OSError, http.client.HTTPException)

    found = re.search(r"/captcha/challenge/(\d+)", html)
    if not found:
        return False
    challenge_id = found.group(1)
    parts = urlsplit(page_url)
    base_url = f"{parts.scheme}://{parts.netloc}"
    challenge_url = urljoin(base_url, f"/captcha/challenge/{challenge_id}")
    solution_url = urljoin(base_url, f"/captcha/solution/{challenge_id}")

    try:
        with browser.open(challenge_url, headers=_headers({"Accept": "application/json"})) as resp:
            challenge = json.loads(resp.read().decode("utf-8", errors="replace"))
        # A non-object JSON value fails here with TypeError (handled below),
        # so `challenge` is known to be a dict afterwards.
        algorithm = str(challenge["algorithm"])
        salt = str(challenge["salt"])
        target = str(challenge["challenge"])
        max_number = int(challenge["maxNumber"])
    except network_errors + (KeyError, TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass.
        return False

    number = None
    try:
        for value in range(min(max_number, max_attempts) + 1):
            if _hexdigest(f"{salt}{value}", algorithm) == target:
                number = value
                break
    except (ValueError, TypeError):
        # ValueError: unknown algorithm. TypeError: variable-length digests
        # (e.g. shake) need an explicit length and cannot be used here.
        return False
    if number is None:
        return False

    solution = {
        "algorithm": algorithm,
        "challenge": target,
        "number": number,
        "salt": salt,
        "signature": challenge.get("signature", ""),
        "took": 0,
    }
    payload = base64.b64encode(
        json.dumps(solution, separators=(",", ":")).encode("utf-8")
    ).decode("ascii")
    body = json.dumps({"captcha": payload}).encode("utf-8")
    try:
        with browser.open(
            solution_url,
            data=body,
            headers=_headers(
                {
                    "Content-Type": "application/json",
                    "Accept": "application/json",
                    "Origin": base_url,
                    "Referer": page_url,
                }
            ),
        ) as resp:
            response = json.loads(resp.read().decode("utf-8", errors="replace"))
    except network_errors + (TypeError, ValueError):
        return False
    # The reply may be any JSON value; only an object can report success.
    return isinstance(response, dict) and bool(response.get("success"))
184
-
185
-
186
def _fetch_page(browser: Browser, doi_url: str) -> tuple[str, str]:
    """Fetch *doi_url*, solving a captcha if one is shown.

    The page is requested up to three times. A response without a captcha is
    returned at once as ``(final_url, html)``, where ``final_url`` is the URL
    reported by the response. When a captcha is present it is solved and the
    same *doi_url* is requested again.

    Returns ``("", "")`` if a captcha cannot be solved or is still shown after
    the third request. Errors raised while opening the URL are not caught.
    """
    attempts_left = 3
    while attempts_left > 0:
        attempts_left -= 1
        with browser.open(doi_url, headers=_headers()) as resp:
            landed_url = resp.geturl()
            body = resp.read().decode("utf-8", errors="replace")
        if not _has_altcha(body):
            return landed_url, body
        if not _solve_altcha(browser, landed_url, body):
            return "", ""
    return "", ""
198
-
199
-
200
def _is_pdf(browser: Browser, url: str) -> bool:
    """Report whether *url* appears to serve a PDF.

    Only the first eight bytes are requested (``Range: bytes=0-7``). The
    answer is True when the response's Content-Type mentions
    ``application/pdf`` or the body starts with the ``%PDF-`` signature.

    Any failure to fetch *url* yields False rather than an exception. That
    includes URLs the HTTP layer refuses to send (http.client.InvalidURL,
    e.g. a space in the path), truncated responses (http.client.IncompleteRead)
    and URLs urllib cannot parse (ValueError).
    """
    probe_headers = _headers(
        {
            "Accept": "application/pdf,*/*;q=0.8",
            "Range": "bytes=0-7",
        }
    )
    try:
        with browser.open(url, headers=probe_headers) as resp:
            content_type = (resp.headers.get("Content-Type") or "").lower()
            if "application/pdf" in content_type:
                return True
            # Some servers mislabel PDFs; fall back to the file signature.
            return resp.read(8).startswith(b"%PDF-")
    except (HTTPError, URLError, OSError, ValueError, http.client.HTTPException):
        return False
216
-
217
-
218
def _extract_oa_link(html: str, page_url: str) -> str:
    """Return the open-access link advertised in *html*, or "" if none.

    The link is the first href matched by ``OA_HINT_PATTERN``. A
    protocol-relative href gets the ``https`` scheme, any other href is
    resolved against *page_url*, and the result is canonicalized.

    An href that cannot be parsed as a URL yields "" instead of raising,
    because this is only an optional hint for the caller.
    """
    match = OA_HINT_PATTERN.search(html)
    if not match:
        return ""
    candidate = match.group(1).strip()
    if not candidate:
        return ""
    try:
        if candidate.startswith("//"):
            candidate = f"https:{candidate}"
        else:
            candidate = urljoin(page_url, candidate)
        return _canonicalize(candidate)
    except ValueError:
        # urljoin/urlsplit reject malformed hosts such as an unbalanced "[".
        return ""
230
-
231
-
232
def _mirror_list() -> tuple[str, ...]:
    """Return the mirror base URLs to try, in order.

    Mirrors come from the comma-separated ``SCIHUB_MIRRORS`` environment
    variable. Each entry is trimmed and loses its trailing slashes; empty
    entries are dropped. An entry without a scheme (``sci-hub.st``) is given
    ``https://`` so the request layer does not reject it as an unknown URL
    type. ``DEFAULT_MIRRORS`` is returned when the variable is unset, blank,
    or contains no usable entry.
    """
    raw = os.environ.get("SCIHUB_MIRRORS", "")
    mirrors = []
    for item in raw.split(","):
        entry = item.strip().rstrip("/")
        if not entry:
            continue
        if "://" not in entry:
            entry = f"https://{entry}"
        mirrors.append(entry)
    return tuple(mirrors) if mirrors else DEFAULT_MIRRORS
237
-
238
-
239
def _first_pdf_candidate(browser: Browser, html: str, page_url: str) -> str:
    """Return the first candidate URL on the page that serves a PDF, or ""."""
    try:
        for candidate in _iter_pdf_candidates(html, page_url):
            if _is_pdf(browser, candidate):
                return candidate
    except (ValueError, http.client.HTTPException):
        # A malformed link or a garbled response counts as "nothing usable
        # here" for this mirror rather than a fatal error.
        return ""
    return ""


def resolve_pdf(doi: str) -> tuple[str, str]:
    """Resolve *doi* to a direct PDF URL by querying each mirror in turn.

    Args:
        doi: A DOI, optionally prefixed with ``doi:`` or a doi.org URL.

    Returns:
        A ``(status, url)`` pair:

        * ``(STATUS_FOUND, pdf_url)`` as soon as a mirror page yields a URL
          that serves a PDF.
        * ``(STATUS_NOT_FOUND, oa_link)`` if at least one mirror reported
          that it does not have the paper; ``oa_link`` is the first
          open-access link seen on such a page, or "".
        * ``(STATUS_MIRROR_ERROR, "")`` if no mirror reported "not found" and
          at least one failed, or if there were no mirrors to query.
        * ``(STATUS_INVALID_INPUT, "")`` if the DOI is empty after
          normalization.

    A failure on one mirror, including an unusable mirror URL or a truncated
    response, never aborts the run; the next mirror is tried instead.
    """
    normalized = _normalize_doi(doi)
    if not normalized:
        return STATUS_INVALID_INPUT, ""
    safe_doi = quote(normalized, safe="/:().-_")
    mirrors = _mirror_list()
    if not mirrors:
        # Nothing was queried, so "not found" would be a false claim.
        return STATUS_MIRROR_ERROR, ""
    # ValueError: urllib rejects the URL (e.g. a mirror entry without a
    # scheme). HTTPException: response-level failures such as IncompleteRead.
    fetch_errors = (HTTPError, URLError, OSError, ValueError, http.client.HTTPException)
    saw_not_found = False
    saw_mirror_error = False
    oa_link = ""
    for mirror in mirrors:
        browser = Browser()  # fresh cookie jar per mirror
        try:
            page_url, html = _fetch_page(browser, f"{mirror}/{safe_doi}")
        except fetch_errors:
            saw_mirror_error = True
            continue
        if not html:
            # _fetch_page gave up on the captcha.
            saw_mirror_error = True
            continue
        title = _extract_title(html).lower()
        if "not available through sci-hub" in title or "no articles found" in title:
            saw_not_found = True
            if not oa_link:
                oa_link = _extract_oa_link(html, page_url)
            continue
        pdf_url = _first_pdf_candidate(browser, html, page_url)
        if pdf_url:
            return STATUS_FOUND, pdf_url
        # The page loaded but none of its links turned out to be a PDF.
        saw_mirror_error = True
    # An explicit "not found" from any mirror outranks errors on the others.
    if saw_not_found:
        return STATUS_NOT_FOUND, oa_link
    if saw_mirror_error:
        return STATUS_MIRROR_ERROR, ""
    return STATUS_NOT_FOUND, ""
272
-
273
-
274
def _main(argv: list[str]) -> int:
    """Run the command-line interface and return the process exit code.

    Prints the PDF URL on success. Otherwise prints the status string,
    followed by an ``OA_LINK <url>`` line when the paper was not found but an
    open-access link is known.

    Exit codes: 0 = found, 1 = not found, 2 = mirror error, 3 = invalid input
    (including a wrong number of command-line arguments).
    """
    if len(argv) != 2:
        print("Usage: scihub_pdf_resolver.py <DOI>", file=sys.stderr)
        # Exit code 1 means "not found"; a usage error is invalid input.
        return 3
    status, url = resolve_pdf(argv[1])
    if status == STATUS_FOUND:
        print(url)
        return 0
    print(status)
    if status == STATUS_NOT_FOUND:
        if url:
            print(f"OA_LINK {url}")
        return 1
    if status == STATUS_MIRROR_ERROR:
        return 2
    return 3


if __name__ == "__main__":
    sys.exit(_main(sys.argv))