@fbraza/pi-cite 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -12
- package/package.json +3 -4
- package/skills/literature/SKILL.md +21 -40
- package/skills/literature/references/preclinical-extraction-guide.md +1 -1
- package/skills/literature/scripts/generate_table.py +1 -3
- package/skills/literature/scripts/synthesis.py +4 -3
- package/src/index.ts +0 -4
- package/src/literature-search.ts +7 -117
- package/src/rendering.ts +59 -92
- package/src/shared.ts +0 -21
- package/src/types.ts +0 -13
- package/skills/literature/references/full-text-access-guide.md +0 -34
- package/skills/literature/references/scihub_routine.md +0 -40
- package/skills/literature/references/semanticscholar_routine.md +0 -50
- package/skills/literature/scripts/scihub_pdf_resolver.py +0 -289
- package/src/fulltext.ts +0 -524
- package/src/semantic-scholar.ts +0 -199
|
@@ -1,289 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Resolve a DOI to a direct PDF URL through Sci-Hub.
|
|
3
|
-
|
|
4
|
-
Zero dependencies. Python standard library only.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from __future__ import annotations
|
|
8
|
-
|
|
9
|
-
import base64
|
|
10
|
-
import hashlib
|
|
11
|
-
import http.client
|
|
12
|
-
import http.cookiejar
|
|
13
|
-
import json
|
|
14
|
-
import os
|
|
15
|
-
import re
|
|
16
|
-
import sys
|
|
17
|
-
from typing import Iterable
|
|
18
|
-
from urllib.error import HTTPError, URLError
|
|
19
|
-
from urllib.parse import quote, urljoin, urlsplit, urlunsplit
|
|
20
|
-
from urllib.request import (
|
|
21
|
-
HTTPCookieProcessor,
|
|
22
|
-
HTTPRedirectHandler,
|
|
23
|
-
Request,
|
|
24
|
-
build_opener,
|
|
25
|
-
)
|
|
26
|
-
|
|
27
|
-
# Timeout, in seconds, handed to every opener.open() call.
TIMEOUT = 20

# Status labels returned by resolve_pdf() and printed by the command-line entry point.
STATUS_FOUND = "FOUND"
STATUS_NOT_FOUND = "NOT_FOUND"
STATUS_MIRROR_ERROR = "MIRROR_ERROR"
STATUS_INVALID_INPUT = "INVALID_INPUT"

# Mirrors tried in order when the SCIHUB_MIRRORS environment variable is unset or blank.
DEFAULT_MIRRORS = (
    "https://sci-hub.st",
    "https://sci-hub.ru",
    "https://sci-hub.se",
)

# User-Agent header value sent with every request.
UA = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/133.0.0.0 Safari/537.36"
)

# Patterns whose single capture group is a candidate PDF URL:
#   1. the src/data attribute of an <iframe>, <embed> or <object> tag;
#   2. any quoted absolute or protocol-relative URL containing ".pdf" or "/pdf".
PDF_PATTERNS = (
    re.compile(r'<(?:iframe|embed|object)[^>]+(?:src|data)=["\']([^"\']+)["\']', re.I),
    re.compile(r'["\']((?:https?:)?//[^"\']+?(?:\.pdf|/pdf)[^"\']*)["\']', re.I),
)

# Captures the href of the first <a> found inside a <block-rounded> element
# whose class attribute contains the word "openaccess".
OA_HINT_PATTERN = re.compile(
    r'<block-rounded[^>]+class\s*=\s*["\'][^"\']*\bopenaccess\b[^"\']*["\'][^>]*>(?:(?!</block-rounded>).)*?<a[^>]+href\s*=\s*["\']([^"\']+)["\']',
    re.I | re.S,
)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
class Browser:
|
|
53
|
-
def __init__(self) -> None:
|
|
54
|
-
jar = http.cookiejar.CookieJar()
|
|
55
|
-
self.opener = build_opener(HTTPCookieProcessor(jar), HTTPRedirectHandler())
|
|
56
|
-
|
|
57
|
-
def open(
|
|
58
|
-
self,
|
|
59
|
-
url: str,
|
|
60
|
-
*,
|
|
61
|
-
data: bytes | None = None,
|
|
62
|
-
headers: dict[str, str] | None = None,
|
|
63
|
-
) -> http.client.HTTPResponse:
|
|
64
|
-
req = Request(url, data=data, headers=headers or {})
|
|
65
|
-
return self.opener.open(req, timeout=TIMEOUT)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
def _headers(extra: dict[str, str] | None = None) -> dict[str, str]:
    """Return the default request headers, optionally overridden by ``extra``.

    Args:
        extra: Headers to add; a key that already exists in the defaults
            replaces the default value.

    Returns:
        A new dict built on every call.
    """
    defaults = {
        "User-Agent": UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    return {**defaults, **(extra or {})}
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def _canonicalize(url: str) -> str:
|
|
80
|
-
parts = urlsplit(url.strip().replace("\\/", "/"))
|
|
81
|
-
return urlunsplit((parts.scheme, parts.netloc, parts.path, parts.query, ""))
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def _normalize_doi(raw: str) -> str:
|
|
85
|
-
doi = raw.strip()
|
|
86
|
-
doi = re.sub(r"^(?:doi:\s*)", "", doi, flags=re.I)
|
|
87
|
-
doi = re.sub(r"^https?://(?:dx\.)?doi\.org/", "", doi, flags=re.I)
|
|
88
|
-
return doi.strip()
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
def _extract_title(html: str) -> str:
|
|
92
|
-
match = re.search(r"<title[^>]*>(.*?)</title>", html, re.I | re.S)
|
|
93
|
-
if not match:
|
|
94
|
-
return ""
|
|
95
|
-
return " ".join(match.group(1).split())
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
def _iter_pdf_candidates(html: str, page_url: str) -> Iterable[str]:
    """Yield de-duplicated absolute URLs in ``html`` that may point to a PDF.

    Every match of every pattern in ``PDF_PATTERNS`` is considered, in
    pattern order. Protocol-relative links are forced to ``https``; all other
    links are resolved against ``page_url``. Each link is canonicalized
    before the duplicate check, so a URL is yielded at most once.
    """
    emitted: set[str] = set()
    raw_hits = (hit for pattern in PDF_PATTERNS for hit in pattern.findall(html))
    for hit in raw_hits:
        link = hit.strip()
        if not link:
            continue
        absolute = f"https:{link}" if link.startswith("//") else urljoin(page_url, link)
        absolute = _canonicalize(absolute)
        if absolute not in emitted:
            emitted.add(absolute)
            yield absolute
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
def _has_altcha(html: str) -> bool:
|
|
117
|
-
return bool(re.search(r"/captcha/challenge/\d+", html))
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
def _hexdigest(data: str, algorithm: str) -> str:
|
|
121
|
-
digest = hashlib.new(algorithm.strip().lower().replace("-", ""))
|
|
122
|
-
digest.update(data.encode("utf-8"))
|
|
123
|
-
return digest.hexdigest()
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
def _solve_altcha(browser: Browser, page_url: str, html: str) -> bool:
    """Answer the proof-of-work challenge referenced by ``html``.

    The challenge id is read from the ``/captcha/challenge/<id>`` path found
    in ``html``. The challenge JSON is fetched from the same host as
    ``page_url``, the number whose hash of ``salt + number`` equals the
    target is searched for, and the answer is posted to
    ``/captcha/solution/<id>``.

    Args:
        browser: Session used for both requests, so cookies are shared.
        page_url: URL of the page that carried the challenge; supplies the
            scheme and host and is sent as the Referer.
        html: Body of that page.

    Returns:
        True only when the solution endpoint answers with a JSON object
        whose ``success`` value is truthy. Every failure (no challenge id,
        network or protocol error, malformed JSON, unknown hash algorithm,
        no matching number) returns False instead of raising.
    """
    challenge_id = re.search(r"/captcha/challenge/(\d+)", html)
    if not challenge_id:
        return False
    parts = urlsplit(page_url)
    base_url = f"{parts.scheme}://{parts.netloc}"
    challenge_url = urljoin(base_url, f"/captcha/challenge/{challenge_id.group(1)}")
    solution_url = urljoin(base_url, f"/captcha/solution/{challenge_id.group(1)}")
    try:
        with browser.open(challenge_url, headers=_headers({"Accept": "application/json"})) as resp:
            challenge = json.loads(resp.read().decode("utf-8", errors="replace"))
        algorithm = str(challenge["algorithm"])
        salt = str(challenge["salt"])
        target = str(challenge["challenge"])
        max_number = int(challenge["maxNumber"])
    except (
        HTTPError,
        URLError,
        OSError,
        # http.client errors such as IncompleteRead are not OSError subclasses.
        http.client.HTTPException,
        KeyError,
        TypeError,
        ValueError,
        json.JSONDecodeError,
    ):
        return False
    number = None
    try:
        # The search space is bounded by the server-supplied maxNumber.
        for value in range(max_number + 1):
            if _hexdigest(f"{salt}{value}", algorithm) == target:
                number = value
                break
    except ValueError:
        # hashlib rejected the algorithm name announced by the server.
        return False
    if number is None:
        return False
    payload = base64.b64encode(
        json.dumps(
            {
                "algorithm": algorithm,
                "challenge": target,
                "number": number,
                "salt": salt,
                "signature": challenge.get("signature", ""),
                "took": 0,
            },
            separators=(",", ":"),
        ).encode("utf-8")
    ).decode("ascii")
    body = json.dumps({"captcha": payload}).encode("utf-8")
    try:
        with browser.open(
            solution_url,
            data=body,
            headers=_headers(
                {
                    "Content-Type": "application/json",
                    "Accept": "application/json",
                    "Origin": base_url,
                    "Referer": page_url,
                }
            ),
        ) as resp:
            response = json.loads(resp.read().decode("utf-8", errors="replace"))
    except (
        HTTPError,
        URLError,
        OSError,
        http.client.HTTPException,
        TypeError,
        ValueError,
        json.JSONDecodeError,
    ):
        return False
    # Valid JSON that is not an object (list, string, number, null) has no
    # .get(); treat it as a failed attempt rather than raising AttributeError.
    if not isinstance(response, dict):
        return False
    return bool(response.get("success"))
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
def _fetch_page(browser: Browser, doi_url: str) -> tuple[str, str]:
    """Download ``doi_url``, answering a captcha challenge when one appears.

    The page is requested at most three times. Whenever the body contains a
    challenge, it is solved and the same URL is requested again.

    Returns:
        ``(final_url, html)`` for the first challenge-free response, where
        ``final_url`` is the address reported by the response. ``("", "")``
        when a challenge could not be solved or every attempt hit a
        challenge.

    Network errors raised while opening the URL are not caught here.
    """
    attempts_left = 3
    while attempts_left > 0:
        attempts_left -= 1
        with browser.open(doi_url, headers=_headers()) as resp:
            landed_url = resp.geturl()
            body = resp.read().decode("utf-8", errors="replace")
        if not _has_altcha(body):
            return landed_url, body
        if not _solve_altcha(browser, landed_url, body):
            return "", ""
    return "", ""
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
def _is_pdf(browser: Browser, url: str) -> bool:
    """Check whether ``url`` serves a PDF without downloading the whole file.

    Only the first eight bytes are requested via a ``Range`` header. The
    response counts as a PDF when its Content-Type mentions
    ``application/pdf`` or, failing that, when the body starts with the
    ``%PDF-`` signature.

    Args:
        browser: Session used for the request.
        url: Candidate URL, typically scraped from a page and therefore
            possibly malformed.

    Returns:
        True for a PDF response. False for anything else, including any
        failure to request the URL.
    """
    headers = _headers(
        {
            "Accept": "application/pdf,*/*;q=0.8",
            "Range": "bytes=0-7",
        }
    )
    try:
        with browser.open(url, headers=headers) as resp:
            content_type = (resp.headers.get("Content-Type") or "").lower()
            if "application/pdf" in content_type:
                return True
            prefix = resp.read(8)
            return prefix.startswith(b"%PDF-")
    except (HTTPError, URLError, OSError, http.client.HTTPException, ValueError):
        # http.client.InvalidURL (e.g. a space in the scraped URL) and other
        # HTTPException subclasses are not OSError, and urllib raises
        # ValueError for URLs it cannot parse. One bad candidate must not
        # abort the search, so all of these count as "not a PDF".
        return False
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
def _extract_oa_link(html: str, page_url: str) -> str:
    """Return the open-access link advertised in ``html``, or ``""``.

    The link is the first match of ``OA_HINT_PATTERN``. A protocol-relative
    href is forced to ``https``; any other href is resolved against
    ``page_url``. The result is canonicalized before being returned.
    """
    hit = OA_HINT_PATTERN.search(html)
    href = hit.group(1).strip() if hit else ""
    if not href:
        return ""
    absolute = f"https:{href}" if href.startswith("//") else urljoin(page_url, href)
    return _canonicalize(absolute)
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
def _mirror_list() -> tuple[str, ...]:
|
|
233
|
-
raw = os.environ.get("SCIHUB_MIRRORS", "")
|
|
234
|
-
if raw.strip():
|
|
235
|
-
return tuple(item.strip().rstrip("/") for item in raw.split(",") if item.strip())
|
|
236
|
-
return DEFAULT_MIRRORS
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
def resolve_pdf(doi: str) -> tuple[str, str]:
    """Resolve ``doi`` to a direct PDF URL by querying each mirror in turn.

    Args:
        doi: A DOI, optionally prefixed with ``doi:`` or given as a
            ``doi.org`` URL.

    Returns:
        A ``(status, url)`` pair:

        * ``(STATUS_FOUND, pdf_url)`` as soon as one candidate link on a
          mirror page is confirmed to serve a PDF.
        * ``(STATUS_NOT_FOUND, oa_link)`` when at least one mirror reported
          the article as unavailable; ``oa_link`` is the first open-access
          link seen on such a page, or ``""``.
        * ``(STATUS_MIRROR_ERROR, "")`` when no mirror reported "not found"
          but at least one failed or offered no working PDF link.
        * ``(STATUS_INVALID_INPUT, "")`` when ``doi`` is empty after
          normalization.
        * ``(STATUS_NOT_FOUND, "")`` when there was no mirror to try.
    """
    normalized = _normalize_doi(doi)
    if not normalized:
        return STATUS_INVALID_INPUT, ""
    safe_doi = quote(normalized, safe="/:().-_")
    saw_not_found = False
    saw_mirror_error = False
    oa_link = ""
    for mirror in _mirror_list():
        # A fresh session per mirror keeps cookies from leaking between hosts.
        browser = Browser()
        try:
            page_url, html = _fetch_page(browser, f"{mirror}/{safe_doi}")
        except (HTTPError, URLError, OSError, http.client.HTTPException, ValueError):
            # ValueError covers a configured mirror that urllib cannot parse
            # (for example one without a scheme); HTTPException covers
            # protocol-level failures that are not OSError. Either way, the
            # remaining mirrors should still be tried.
            saw_mirror_error = True
            continue
        if not html:
            saw_mirror_error = True
            continue
        title = _extract_title(html).lower()
        if "not available through sci-hub" in title or "no articles found" in title:
            saw_not_found = True
            if not oa_link:
                oa_link = _extract_oa_link(html, page_url)
            continue
        for candidate in _iter_pdf_candidates(html, page_url):
            if _is_pdf(browser, candidate):
                return STATUS_FOUND, candidate
        # The page loaded but none of its links turned out to be a PDF.
        saw_mirror_error = True
    if saw_not_found:
        return STATUS_NOT_FOUND, oa_link
    if saw_mirror_error:
        return STATUS_MIRROR_ERROR, ""
    return STATUS_NOT_FOUND, ""
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
if __name__ == "__main__":
    # Command-line entry point.
    #   stdout: the PDF URL on success; otherwise the status label, followed
    #           by an "OA_LINK <url>" line when an open-access link is known.
    #   exit code: 0 found, 1 not found (or wrong usage), 2 mirror error,
    #              3 any other status (e.g. invalid input).
    if len(sys.argv) != 2:
        # Derive the name from argv so the usage line always matches the
        # file actually being run.
        print(f"Usage: {os.path.basename(sys.argv[0])} <DOI>", file=sys.stderr)
        sys.exit(1)
    status, url = resolve_pdf(sys.argv[1])
    if status == STATUS_FOUND:
        print(url)
        sys.exit(0)
    print(status)
    if status == STATUS_NOT_FOUND and url:
        print(f"OA_LINK {url}")
    exit_codes = {STATUS_NOT_FOUND: 1, STATUS_MIRROR_ERROR: 2}
    sys.exit(exit_codes.get(status, 3))
|