@fbraza/pi-cite 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -12
- package/package.json +3 -4
- package/skills/literature/SKILL.md +21 -40
- package/skills/literature/references/preclinical-extraction-guide.md +1 -1
- package/skills/literature/scripts/generate_table.py +1 -3
- package/skills/literature/scripts/synthesis.py +4 -3
- package/src/index.ts +0 -4
- package/src/literature-search.ts +2 -110
- package/src/rendering.ts +13 -23
- package/src/shared.ts +0 -21
- package/src/types.ts +0 -13
- package/skills/literature/references/full-text-access-guide.md +0 -34
- package/skills/literature/references/scihub_routine.md +0 -40
- package/skills/literature/references/semanticscholar_routine.md +0 -50
- package/skills/literature/scripts/scihub_pdf_resolver.py +0 -289
- package/src/fulltext.ts +0 -524
- package/src/semantic-scholar.ts +0 -199
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
# Sci-Hub PDF Resolver — Routine Quick-Reference
|
|
2
|
-
|
|
3
|
-
Resolves DOIs to direct PDF URLs via Sci-Hub mirrors. **Always check institutional access and open-access sources first** (PubMed Central, publisher OA). Use Sci-Hub only as a last resort.
|
|
4
|
-
|
|
5
|
-
**Script:** `scripts/scihub_pdf_resolver.py` — zero-dependency Python script.
|
|
6
|
-
|
|
7
|
-
## CLI Usage
|
|
8
|
-
|
|
9
|
-
```bash
|
|
10
|
-
python scripts/scihub_pdf_resolver.py "10.1038/s41586-024-07000-0"
|
|
11
|
-
```
|
|
12
|
-
|
|
13
|
-
## Output Codes
|
|
14
|
-
|
|
15
|
-
| Output | Meaning |
|
|
16
|
-
|---|---|
|
|
17
|
-
| Prints a URL | Direct PDF link, ready to download |
|
|
18
|
-
| `NOT_FOUND` | Sci-Hub does not have this paper. If an open-access alternative was detected, a second line `OA_LINK <url>` follows. |
|
|
19
|
-
| `MIRROR_ERROR` | Sci-Hub mirrors could not be reached reliably |
|
|
20
|
-
| `INVALID_INPUT` | The DOI is malformed |
|
|
21
|
-
|
|
22
|
-
## Exit Codes
|
|
23
|
-
|
|
24
|
-
`0` = found, `1` = not found (also returned when the script is called with the wrong number of arguments), `2` = mirror error, `3` = invalid input.
|
|
25
|
-
|
|
26
|
-
## Python API
|
|
27
|
-
|
|
28
|
-
```python
|
|
29
|
-
from scripts.scihub_pdf_resolver import resolve_pdf
|
|
30
|
-
|
|
31
|
-
status, url = resolve_pdf("10.1038/s41586-024-07000-0")
|
|
32
|
-
if status == "FOUND":
|
|
33
|
-
print(f"PDF available at: {url}")
|
|
34
|
-
elif status == "NOT_FOUND" and url:
|
|
35
|
-
print(f"Open-access link: {url}")
|
|
36
|
-
```
|
|
37
|
-
|
|
38
|
-
## Mirror Configuration
|
|
39
|
-
|
|
40
|
-
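Set the `SCIHUB_MIRRORS` environment variable to a comma-separated list of mirror base URLs, including the scheme (e.g. `https://sci-hub.st`), to override the defaults. Defaults: `https://sci-hub.st`, `https://sci-hub.ru`, `https://sci-hub.se`.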
|
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
# Semantic Scholar — Routine Quick-Reference
|
|
2
|
-
|
|
3
|
-
Quick-start for paper search, paper lookup, and author search. For citation network analysis or bulk queries → consult the full API documentation.
|
|
4
|
-
|
|
5
|
-
## Paper Search
|
|
6
|
-
|
|
7
|
-
```
|
|
8
|
-
GET https://api.semanticscholar.org/graph/v1/paper/search
|
|
9
|
-
```
|
|
10
|
-
|
|
11
|
-
**Parameters:**
|
|
12
|
-
| Parameter | Value |
|
|
13
|
-
|---|---|
|
|
14
|
-
| `query` | Search terms |
|
|
15
|
-
| `limit` | Max results (default 10, max 100) |
|
|
16
|
-
| `offset` | Pagination offset |
|
|
17
|
-
| `fields` | Comma-separated fields to return |
|
|
18
|
-
| `year` | `<YYYY>` or `<YYYY-YYYY>` range |
|
|
19
|
-
| `fieldsOfStudy` | Field of study filter |
|
|
20
|
-
|
|
21
|
-
**Useful fields:** `paperId`, `title`, `abstract`, `year`, `referenceCount`, `citationCount`, `authors`, `journal`, `publicationTypes`, `tldr`, `openAccessPdf`, `externalIds`
|
|
22
|
-
|
|
23
|
-
**Example:**
|
|
24
|
-
```
|
|
25
|
-
https://api.semanticscholar.org/graph/v1/paper/search?query=CRISPR+off-target&limit=20&fields=title,abstract,year,citationCount,openAccessPdf
|
|
26
|
-
```
|
|
27
|
-
|
|
28
|
-
## Paper Details (by ID)
|
|
29
|
-
|
|
30
|
-
```
|
|
31
|
-
GET https://api.semanticscholar.org/graph/v1/paper/{paper_id}
|
|
32
|
-
```
|
|
33
|
-
|
|
34
|
-
`paper_id` accepts: S2 paper ID, DOI (`DOI:10.xxx`), PMID (`PMID:12345`), or arXiv ID (`ARXIV:2106.15928`).
|
|
35
|
-
|
|
36
|
-
**Example:**
|
|
37
|
-
```
|
|
38
|
-
https://api.semanticscholar.org/graph/v1/paper/DOI:10.1038/s41586-024-07000-0?fields=title,abstract,year,citationCount,references,citations
|
|
39
|
-
```
|
|
40
|
-
|
|
41
|
-
## Author Search
|
|
42
|
-
|
|
43
|
-
```
|
|
44
|
-
GET https://api.semanticscholar.org/graph/v1/author/search?query=<name>
|
|
45
|
-
GET https://api.semanticscholar.org/graph/v1/author/{author_id}/papers
|
|
46
|
-
```
|
|
47
|
-
|
|
48
|
-
## Rate Limits
|
|
49
|
-
|
|
50
|
-
100 requests / 5 min (unauthenticated). Higher limits available with an API key.
|
|
@@ -1,289 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Resolve a DOI to a direct PDF URL through Sci-Hub.
|
|
3
|
-
|
|
4
|
-
Zero dependencies. Python standard library only.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from __future__ import annotations
|
|
8
|
-
|
|
9
|
-
import base64
|
|
10
|
-
import hashlib
|
|
11
|
-
import http.client
|
|
12
|
-
import http.cookiejar
|
|
13
|
-
import json
|
|
14
|
-
import os
|
|
15
|
-
import re
|
|
16
|
-
import sys
|
|
17
|
-
from typing import Iterable
|
|
18
|
-
from urllib.error import HTTPError, URLError
|
|
19
|
-
from urllib.parse import quote, urljoin, urlsplit, urlunsplit
|
|
20
|
-
from urllib.request import (
|
|
21
|
-
HTTPCookieProcessor,
|
|
22
|
-
HTTPRedirectHandler,
|
|
23
|
-
Request,
|
|
24
|
-
build_opener,
|
|
25
|
-
)
|
|
26
|
-
|
|
27
|
-
# Timeout, in seconds, applied to every HTTP request made by Browser.open().
TIMEOUT = 20

# Status strings returned by resolve_pdf() and printed by the CLI.
STATUS_FOUND = "FOUND"
STATUS_NOT_FOUND = "NOT_FOUND"
STATUS_MIRROR_ERROR = "MIRROR_ERROR"
STATUS_INVALID_INPUT = "INVALID_INPUT"

# Mirrors tried in order when SCIHUB_MIRRORS is not set.
DEFAULT_MIRRORS = (
    "https://sci-hub.st",
    "https://sci-hub.ru",
    "https://sci-hub.se",
)

# Browser-like User-Agent sent with every request.
UA = " ".join(
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/133.0.0.0 Safari/537.36",
    )
)

# Patterns that capture candidate PDF URLs from a page, tried in order:
#   1. the src/data attribute of an <iframe>, <embed> or <object> tag;
#   2. any quoted absolute or protocol-relative URL containing ".pdf" or "/pdf".
PDF_PATTERNS = (
    re.compile(
        r'<(?:iframe|embed|object)[^>]+(?:src|data)=["\']([^"\']+)["\']',
        re.IGNORECASE,
    ),
    re.compile(
        r'["\']((?:https?:)?//[^"\']+?(?:\.pdf|/pdf)[^"\']*)["\']',
        re.IGNORECASE,
    ),
)

# Captures the href of the first <a> inside a <block-rounded> element whose
# class attribute contains the word "openaccess".
OA_HINT_PATTERN = re.compile(
    r'<block-rounded[^>]+class\s*=\s*["\'][^"\']*\bopenaccess\b[^"\']*["\'][^>]*>'
    r'(?:(?!</block-rounded>).)*?'
    r'<a[^>]+href\s*=\s*["\']([^"\']+)["\']',
    re.IGNORECASE | re.DOTALL,
)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
class Browser:
    """Small HTTP client that keeps cookies and follows redirects.

    One instance holds one cookie jar, so cookies set by an earlier
    response are sent again on later requests made through the same instance.
    """

    def __init__(self) -> None:
        cookies = http.cookiejar.CookieJar()
        handlers = (HTTPCookieProcessor(cookies), HTTPRedirectHandler())
        self.opener = build_opener(*handlers)

    def open(
        self,
        url: str,
        *,
        data: bytes | None = None,
        headers: dict[str, str] | None = None,
    ) -> http.client.HTTPResponse:
        """Open *url* and return the response.

        The request carries *data* as its body when given, and *headers*
        (or no extra headers) as its header set. The module-wide TIMEOUT
        applies to the request.
        """
        request = Request(url, data=data, headers={} if not headers else headers)
        return self.opener.open(request, timeout=TIMEOUT)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
def _headers(extra: dict[str, str] | None = None) -> dict[str, str]:
    """Build the request headers sent to a mirror.

    Starts from a browser-like default set (User-Agent, Accept,
    Accept-Language) and lets entries in *extra* override or extend it.
    A new dict is returned on every call.
    """
    defaults = {
        "User-Agent": UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    return {**defaults, **(extra or {})}
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def _canonicalize(url: str) -> str:
    """Return *url* trimmed, with escaped slashes restored and no fragment.

    Surrounding whitespace is removed, every ``\\/`` sequence becomes ``/``,
    and the ``#fragment`` part is dropped; scheme, host, path and query are
    kept as parsed.
    """
    cleaned = url.strip().replace("\\/", "/")
    scheme, netloc, path, query, _fragment = urlsplit(cleaned)
    return urlunsplit((scheme, netloc, path, query, ""))
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def _normalize_doi(raw: str) -> str:
    """Return the bare DOI contained in *raw*, or ``""`` if it is not a DOI.

    Accepted spellings are a bare DOI, a ``doi:`` prefixed DOI and a
    ``http(s)://doi.org/`` or ``http(s)://dx.doi.org/`` URL (prefixes are
    matched case-insensitively). A DOI taken from a URL is percent-decoded,
    so that quoting it again later does not double-encode it.

    The result must look like ``10.<registrant>/<suffix>`` with no
    whitespace; anything else yields ``""`` so the caller can report the
    input as invalid instead of querying mirrors with it.
    """
    # Local import: the module's import block does not provide unquote.
    from urllib.parse import unquote

    doi = raw.strip()
    doi = re.sub(r"^doi:\s*", "", doi, flags=re.I)
    doi, stripped_url = re.subn(r"^https?://(?:dx\.)?doi\.org/", "", doi, flags=re.I)
    if stripped_url:
        doi = unquote(doi)
    doi = doi.strip()
    if not re.fullmatch(r"10\.\d{4,9}(?:\.\d+)*/\S+", doi):
        return ""
    return doi
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
def _extract_title(html: str) -> str:
    """Return the text of the first ``<title>`` element in *html*.

    Runs of whitespace (including newlines) are collapsed to single spaces.
    An empty string is returned when the page has no title element.
    """
    found = re.search(r"<title[^>]*>(.*?)</title>", html, re.I | re.S)
    return " ".join(found.group(1).split()) if found else ""
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
def _iter_pdf_candidates(html: str, page_url: str) -> Iterable[str]:
    """Yield distinct candidate PDF URLs found in *html*, in discovery order.

    Every match of every pattern in PDF_PATTERNS is turned into an absolute
    URL: protocol-relative matches (``//host/...``) get an ``https:`` scheme,
    everything else is resolved against *page_url*. Each URL is canonicalized
    and yielded at most once.
    """
    seen: set[str] = set()
    for pattern in PDF_PATTERNS:
        for raw in pattern.findall(html):
            # Undo JavaScript/JSON slash escaping *before* resolving the URL.
            # Otherwise "\/\/host\/x.pdf" fails the "//" test below and
            # "https:\/\/host\/x.pdf" is treated by urljoin as a path relative
            # to the mirror page, producing a wrong URL.
            candidate = raw.strip().replace("\\/", "/")
            if not candidate:
                continue
            if candidate.startswith("//"):
                candidate = f"https:{candidate}"
            else:
                candidate = urljoin(page_url, candidate)
            candidate = _canonicalize(candidate)
            if candidate in seen:
                continue
            seen.add(candidate)
            yield candidate
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
def _has_altcha(html: str) -> bool:
    """Tell whether *html* references a ``/captcha/challenge/<digits>`` URL."""
    return re.search(r"/captcha/challenge/\d+", html) is not None
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
def _hexdigest(data: str, algorithm: str) -> str:
    """Return the hex digest of *data* (UTF-8 encoded) under *algorithm*.

    The algorithm name is trimmed, lower-cased and stripped of hyphens
    before being passed to hashlib, so ``"SHA-256"`` selects ``sha256``.
    hashlib raises ValueError for a name it does not support.
    """
    hashlib_name = algorithm.strip().lower().replace("-", "")
    return hashlib.new(hashlib_name, data.encode("utf-8")).hexdigest()
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
def _solve_altcha(browser: Browser, page_url: str, html: str) -> bool:
    """Answer the proof-of-work challenge referenced by *html*.

    The challenge id is read from a ``/captcha/challenge/<id>`` reference in
    *html*. The challenge document is fetched from the same origin as
    *page_url*, the number whose salted hash equals the challenge value is
    searched for in ``0..maxNumber``, and the base64-encoded JSON answer is
    posted to ``/captcha/solution/<id>``.

    Returns True only when the server's reply is a JSON object with a truthy
    ``success`` field. Every failure (no challenge in the page, network
    error, malformed or unexpected JSON, unsupported hash algorithm, no
    matching number) returns False instead of raising.
    """
    challenge_id = re.search(r"/captcha/challenge/(\d+)", html)
    if not challenge_id:
        return False

    parts = urlsplit(page_url)
    base_url = f"{parts.scheme}://{parts.netloc}"
    challenge_url = urljoin(base_url, f"/captcha/challenge/{challenge_id.group(1)}")
    solution_url = urljoin(base_url, f"/captcha/solution/{challenge_id.group(1)}")

    try:
        with browser.open(challenge_url, headers=_headers({"Accept": "application/json"})) as resp:
            challenge = json.loads(resp.read().decode("utf-8", errors="replace"))
        algorithm = str(challenge["algorithm"])
        salt = str(challenge["salt"])
        target = str(challenge["challenge"])
        max_number = int(challenge["maxNumber"])
    except (HTTPError, URLError, OSError, KeyError, TypeError, ValueError, json.JSONDecodeError):
        return False

    number = None
    try:
        for value in range(max_number + 1):
            if _hexdigest(f"{salt}{value}", algorithm) == target:
                number = value
                break
    except ValueError:
        # Raised by the digest helper for an algorithm hashlib does not know.
        return False
    if number is None:
        return False

    payload = base64.b64encode(
        json.dumps(
            {
                "algorithm": algorithm,
                "challenge": target,
                "number": number,
                "salt": salt,
                "signature": challenge.get("signature", ""),
                "took": 0,
            },
            separators=(",", ":"),
        ).encode("utf-8")
    ).decode("ascii")
    body = json.dumps({"captcha": payload}).encode("utf-8")

    try:
        with browser.open(
            solution_url,
            data=body,
            headers=_headers(
                {
                    "Content-Type": "application/json",
                    "Accept": "application/json",
                    "Origin": base_url,
                    "Referer": page_url,
                }
            ),
        ) as resp:
            response = json.loads(resp.read().decode("utf-8", errors="replace"))
    except (HTTPError, URLError, OSError, TypeError, ValueError, json.JSONDecodeError):
        return False

    # The reply may be valid JSON without being an object (null, a list, a
    # string); calling .get() on those would raise AttributeError and escape
    # every handler up the call chain.
    if not isinstance(response, dict):
        return False
    return bool(response.get("success"))
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
def _fetch_page(browser: Browser, doi_url: str) -> tuple[str, str]:
    """Download the page at *doi_url*, clearing a captcha challenge if shown.

    Up to three requests are made. A response without a captcha challenge is
    returned at once as ``(final_url, html)``, where ``final_url`` is the URL
    after redirects. When a challenge is present it is solved and the page is
    requested again. ``("", "")`` is returned if the challenge cannot be
    solved or is still shown after the third request.

    Network errors raised while opening *doi_url* propagate to the caller.
    """
    remaining = 3
    while remaining > 0:
        remaining -= 1
        with browser.open(doi_url, headers=_headers()) as resp:
            final_url = resp.geturl()
            html = resp.read().decode("utf-8", errors="replace")
        if not _has_altcha(html):
            return final_url, html
        if not _solve_altcha(browser, final_url, html):
            break
    return "", ""
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
def _is_pdf(browser: Browser, url: str) -> bool:
    """Check whether *url* serves a PDF.

    Only the first eight bytes are requested (``Range: bytes=0-7``). The
    answer is True when the Content-Type header mentions ``application/pdf``
    or, failing that, when the body starts with the ``%PDF-`` signature.
    Any HTTP or network error counts as "not a PDF".
    """
    probe_headers = _headers(
        {
            "Accept": "application/pdf,*/*;q=0.8",
            "Range": "bytes=0-7",
        }
    )
    try:
        with browser.open(url, headers=probe_headers) as resp:
            declared_type = resp.headers.get("Content-Type") or ""
            if "application/pdf" in declared_type.lower():
                return True
            return resp.read(8).startswith(b"%PDF-")
    except (HTTPError, URLError, OSError):
        return False
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
def _extract_oa_link(html: str, page_url: str) -> str:
    """Return the open-access link advertised in *html*, or ``""`` if none.

    The link is the href captured by OA_HINT_PATTERN. Protocol-relative
    links get an ``https:`` scheme, other links are resolved against
    *page_url*, and the result is canonicalized.
    """
    hit = OA_HINT_PATTERN.search(html)
    href = hit.group(1).strip() if hit else ""
    if not href:
        return ""
    absolute = f"https:{href}" if href.startswith("//") else urljoin(page_url, href)
    return _canonicalize(absolute)
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
def _mirror_list() -> tuple[str, ...]:
    """Return the mirror base URLs to try, in order.

    Mirrors come from the comma-separated ``SCIHUB_MIRRORS`` environment
    variable when it names at least one mirror, otherwise from
    DEFAULT_MIRRORS. Entries are trimmed and lose trailing slashes; empty
    entries are skipped.

    An entry without a scheme (``sci-hub.st`` or ``//sci-hub.st``) is given
    ``https``. Without one, the request URL built from it is rejected with
    ValueError before any request is made.
    """
    mirrors = []
    for item in os.environ.get("SCIHUB_MIRRORS", "").split(","):
        mirror = item.strip().rstrip("/")
        if not mirror:
            continue
        if mirror.startswith("//"):
            mirror = f"https:{mirror}"
        elif "://" not in mirror:
            mirror = f"https://{mirror}"
        mirrors.append(mirror)
    # A value holding only separators or whitespace names no mirror at all;
    # fall back to the defaults rather than returning an empty list.
    return tuple(mirrors) or DEFAULT_MIRRORS
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
def resolve_pdf(doi: str) -> tuple[str, str]:
    """Resolve *doi* to a direct PDF URL by trying each mirror in turn.

    Returns a ``(status, url)`` pair:

    * ``(STATUS_FOUND, pdf_url)`` - a candidate URL was confirmed to be a PDF.
    * ``(STATUS_NOT_FOUND, oa_link)`` - at least one mirror reported that it
      does not have the paper; ``oa_link`` is the first open-access link seen
      on such a page, or ``""``.
    * ``(STATUS_MIRROR_ERROR, "")`` - no mirror gave a usable answer.
    * ``(STATUS_INVALID_INPUT, "")`` - *doi* normalizes to an empty string.

    A fresh Browser (and therefore a fresh cookie jar) is used per mirror.
    """
    normalized = _normalize_doi(doi)
    if not normalized:
        return STATUS_INVALID_INPUT, ""
    safe_doi = quote(normalized, safe="/:().-_")

    saw_not_found = False
    saw_mirror_error = False
    oa_link = ""
    for mirror in _mirror_list():
        browser = Browser()
        try:
            page_url, html = _fetch_page(browser, f"{mirror}/{safe_doi}")
        except (HTTPError, URLError, OSError, http.client.HTTPException, ValueError):
            # http.client.HTTPException (e.g. a truncated or malformed
            # response) and ValueError (e.g. a mirror URL without a scheme)
            # are not OSError subclasses. One broken mirror must not abort
            # the fallback to the remaining ones.
            saw_mirror_error = True
            continue
        if not html:
            # Empty html means the captcha challenge could not be cleared.
            saw_mirror_error = True
            continue

        title = _extract_title(html).lower()
        if "not available through sci-hub" in title or "no articles found" in title:
            saw_not_found = True
            if not oa_link:
                oa_link = _extract_oa_link(html, page_url)
            continue

        for candidate in _iter_pdf_candidates(html, page_url):
            try:
                if _is_pdf(browser, candidate):
                    return STATUS_FOUND, candidate
            except (http.client.HTTPException, ValueError):
                # Same reasoning as above: skip a candidate that cannot be
                # probed instead of letting the error escape.
                continue
        # The page loaded but none of its candidates turned out to be a PDF.
        saw_mirror_error = True

    if saw_not_found:
        return STATUS_NOT_FOUND, oa_link
    if saw_mirror_error:
        return STATUS_MIRROR_ERROR, ""
    return STATUS_NOT_FOUND, ""
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
if __name__ == "__main__":
    # CLI entry point. stdout carries either the PDF URL (exit 0) or the
    # status word, followed for NOT_FOUND by an optional "OA_LINK <url>" line.
    # Exit codes: 0 found, 1 not found, 2 mirror error, 3 invalid input.
    if len(sys.argv) != 2:
        # The usage line now names the script as shipped; the previous text
        # referred to a file that does not exist. The exit code is unchanged.
        print("Usage: scihub_pdf_resolver.py <DOI>", file=sys.stderr)
        sys.exit(1)

    status, url = resolve_pdf(sys.argv[1])
    if status == STATUS_FOUND:
        print(url)
        sys.exit(0)

    print(status)
    if status == STATUS_NOT_FOUND:
        if url:
            print(f"OA_LINK {url}")
        sys.exit(1)
    sys.exit(2 if status == STATUS_MIRROR_ERROR else 3)
|