@fbraza/pi-cite 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -0
- package/package.json +5 -1
- package/skills/literature/SKILL.md +208 -0
- package/skills/literature/references/full-text-access-guide.md +34 -0
- package/skills/literature/references/preclinical-extraction-guide.md +215 -0
- package/skills/literature/references/pubmed_api_reference.md +298 -0
- package/skills/literature/references/pubmed_common_queries.md +453 -0
- package/skills/literature/references/pubmed_routine.md +93 -0
- package/skills/literature/references/pubmed_search_syntax.md +436 -0
- package/skills/literature/references/scihub_routine.md +40 -0
- package/skills/literature/references/semanticscholar_routine.md +50 -0
- package/skills/literature/scripts/export_all.py +53 -0
- package/skills/literature/scripts/extract_experiments.py +401 -0
- package/skills/literature/scripts/generate_table.py +96 -0
- package/skills/literature/scripts/scihub_pdf_resolver.py +289 -0
- package/skills/literature/scripts/synthesis.py +93 -0
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Resolve a DOI to a direct PDF URL through Sci-Hub.
|
|
3
|
+
|
|
4
|
+
Zero dependencies. Python standard library only.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import base64
|
|
10
|
+
import hashlib
|
|
11
|
+
import http.client
|
|
12
|
+
import http.cookiejar
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
import sys
|
|
17
|
+
from typing import Iterable
|
|
18
|
+
from urllib.error import HTTPError, URLError
|
|
19
|
+
from urllib.parse import quote, urljoin, urlsplit, urlunsplit
|
|
20
|
+
from urllib.request import (
|
|
21
|
+
HTTPCookieProcessor,
|
|
22
|
+
HTTPRedirectHandler,
|
|
23
|
+
Request,
|
|
24
|
+
build_opener,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Timeout, in seconds, handed to the opener for every request.
TIMEOUT = 20

# Status tokens produced by the resolver.
STATUS_FOUND = "FOUND"
STATUS_NOT_FOUND = "NOT_FOUND"
STATUS_MIRROR_ERROR = "MIRROR_ERROR"
STATUS_INVALID_INPUT = "INVALID_INPUT"

# Mirrors tried in order when SCIHUB_MIRRORS is not set.
DEFAULT_MIRRORS = tuple(f"https://sci-hub.{tld}" for tld in ("st", "ru", "se"))

# Desktop Chrome User-Agent string sent with every request.
UA = " ".join(
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/133.0.0.0 Safari/537.36",
    )
)

# Patterns that capture candidate PDF locations from a page:
#   1. the src/data attribute of an <iframe>, <embed> or <object> tag;
#   2. any quoted absolute or protocol-relative URL containing ".pdf" or "/pdf".
PDF_PATTERNS = (
    re.compile(
        r'<(?:iframe|embed|object)[^>]+(?:src|data)=["\']([^"\']+)["\']',
        flags=re.IGNORECASE,
    ),
    re.compile(
        r'["\']((?:https?:)?//[^"\']+?(?:\.pdf|/pdf)[^"\']*)["\']',
        flags=re.IGNORECASE,
    ),
)

# Captures the href of the first <a> inside a <block-rounded> element whose
# class attribute contains the word "openaccess".
OA_HINT_PATTERN = re.compile(
    r'<block-rounded[^>]+class\s*=\s*["\'][^"\']*\bopenaccess\b[^"\']*["\'][^>]*>'
    r'(?:(?!</block-rounded>).)*?'
    r'<a[^>]+href\s*=\s*["\']([^"\']+)["\']',
    flags=re.IGNORECASE | re.DOTALL,
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class Browser:
    """Small cookie-aware HTTP client built on ``urllib``.

    Every instance owns a private in-memory cookie jar, so cookies received
    on one request are replayed on later requests made through the same
    instance.
    """

    def __init__(self) -> None:
        cookie_handler = HTTPCookieProcessor(http.cookiejar.CookieJar())
        self.opener = build_opener(cookie_handler, HTTPRedirectHandler())

    def open(
        self,
        url: str,
        *,
        data: bytes | None = None,
        headers: dict[str, str] | None = None,
    ) -> http.client.HTTPResponse:
        """Open *url* through the shared opener and return the response.

        *data*, when given, is sent as the request body; *headers* are added
        to the request. The module-wide ``TIMEOUT`` applies to the call.
        """
        request = Request(url, data=data, headers=headers if headers else {})
        return self.opener.open(request, timeout=TIMEOUT)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _headers(extra: dict[str, str] | None = None) -> dict[str, str]:
    """Return browser-like request headers.

    Entries in *extra* override the defaults with the same name and are
    appended otherwise.
    """
    defaults = {
        "User-Agent": UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    return {**defaults, **extra} if extra else defaults
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _canonicalize(url: str) -> str:
    """Return *url* trimmed, with escaped slashes unescaped and no fragment.

    ``\\/`` sequences (as found in URLs embedded in JSON or JavaScript) are
    turned back into ``/`` before the URL is split and reassembled.
    """
    cleaned = url.strip().replace("\\/", "/")
    return urlunsplit(urlsplit(cleaned)._replace(fragment=""))
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _normalize_doi(raw: str) -> str:
    """Strip common decorations from *raw* and return the bare DOI.

    Removes, case-insensitively and in this order: surrounding whitespace,
    a leading ``doi:`` label, and a leading ``doi.org`` resolver prefix.
    The resolver prefix may omit the ``http``/``https`` scheme and may use
    the ``dx.`` or ``www.`` host variants. Returns an empty string when
    nothing remains.
    """
    doi = raw.strip()
    doi = re.sub(r"^doi:\s*", "", doi, flags=re.I)
    # The scheme is optional: "doi.org/10.x/..." is a common copy-paste form
    # and can never be a DOI itself, because DOIs start with "10.".
    doi = re.sub(r"^(?:https?://)?(?:(?:dx|www)\.)?doi\.org/", "", doi, flags=re.I)
    return doi.strip()
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _extract_title(html: str) -> str:
    """Return the text of the first ``<title>`` element, whitespace-collapsed.

    Gives an empty string when the document contains no ``<title>`` element.
    """
    found = re.search(r"<title[^>]*>(.*?)</title>", html, re.I | re.S)
    if found is None:
        return ""
    words = found.group(1).split()
    return " ".join(words)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _iter_pdf_candidates(html: str, page_url: str) -> Iterable[str]:
    """Yield each distinct candidate PDF URL found in *html*, in match order.

    Every ``PDF_PATTERNS`` entry is applied in turn. Protocol-relative links
    are forced to ``https``; all other links are resolved against *page_url*.
    Results are canonicalized before de-duplication.
    """
    emitted: set[str] = set()
    for regex in PDF_PATTERNS:
        for hit in regex.findall(html):
            link = hit.strip()
            if not link:
                continue
            absolute = f"https:{link}" if link.startswith("//") else urljoin(page_url, link)
            absolute = _canonicalize(absolute)
            if absolute not in emitted:
                emitted.add(absolute)
                yield absolute
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _has_altcha(html: str) -> bool:
    """Tell whether *html* references a ``/captcha/challenge/<id>`` endpoint."""
    return re.search(r"/captcha/challenge/\d+", html) is not None
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _hexdigest(data: str, algorithm: str) -> str:
    """Return the hex digest of UTF-8 encoded *data* under *algorithm*.

    The algorithm label is normalised to the ``hashlib`` spelling first
    (trimmed, lower-cased, hyphens removed), so ``"SHA-256"`` selects
    ``sha256``. ``hashlib.new`` raises ``ValueError`` for unknown names.
    """
    hash_name = algorithm.strip().lower().replace("-", "")
    hasher = hashlib.new(hash_name)
    hasher.update(data.encode("utf-8"))
    return hasher.hexdigest()
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _solve_altcha(browser: Browser, page_url: str, html: str) -> bool:
    """Try to solve the proof-of-work captcha referenced by *html*.

    The challenge id is read from a ``/captcha/challenge/<id>`` reference in
    the page. The challenge JSON is fetched from the same origin as
    *page_url*, the number whose salted hash equals the target is searched
    for, and the base64-encoded JSON solution is posted to
    ``/captcha/solution/<id>``.

    Returns ``True`` only when the server replies with a JSON object whose
    ``success`` entry is truthy. Returns ``False`` when there is no
    challenge, the challenge is malformed, the algorithm is unsupported, no
    number matches, or a network/JSON error occurs.
    """
    challenge_id = re.search(r"/captcha/challenge/(\d+)", html)
    if not challenge_id:
        return False
    parts = urlsplit(page_url)
    base_url = f"{parts.scheme}://{parts.netloc}"
    challenge_url = urljoin(base_url, f"/captcha/challenge/{challenge_id.group(1)}")
    solution_url = urljoin(base_url, f"/captcha/solution/{challenge_id.group(1)}")
    try:
        with browser.open(challenge_url, headers=_headers({"Accept": "application/json"})) as resp:
            challenge = json.loads(resp.read().decode("utf-8", errors="replace"))
        algorithm = str(challenge["algorithm"])
        salt = str(challenge["salt"])
        target = str(challenge["challenge"])
        max_number = int(challenge["maxNumber"])
    except (HTTPError, URLError, OSError, KeyError, TypeError, ValueError, json.JSONDecodeError):
        return False

    # NOTE(review): max_number comes from the server without an upper bound,
    # so the cost of this brute-force search is server-controlled.
    number = None
    try:
        for value in range(max_number + 1):
            if _hexdigest(f"{salt}{value}", algorithm) == target:
                number = value
                break
    except ValueError:
        # hashlib rejected the advertised algorithm name.
        return False
    if number is None:
        return False

    solution = {
        "algorithm": algorithm,
        "challenge": target,
        "number": number,
        "salt": salt,
        "signature": challenge.get("signature", ""),
        "took": 0,
    }
    payload = base64.b64encode(
        json.dumps(solution, separators=(",", ":")).encode("utf-8")
    ).decode("ascii")
    body = json.dumps({"captcha": payload}).encode("utf-8")
    try:
        with browser.open(
            solution_url,
            data=body,
            headers=_headers(
                {
                    "Content-Type": "application/json",
                    "Accept": "application/json",
                    "Origin": base_url,
                    "Referer": page_url,
                }
            ),
        ) as resp:
            response = json.loads(resp.read().decode("utf-8", errors="replace"))
    except (HTTPError, URLError, OSError, TypeError, ValueError, json.JSONDecodeError):
        return False
    # Valid JSON is not necessarily an object; a list/string/null reply must
    # count as failure rather than raise AttributeError on ``.get``.
    return isinstance(response, dict) and bool(response.get("success"))
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _fetch_page(browser: Browser, doi_url: str) -> tuple[str, str]:
    """Fetch *doi_url*, solving a captcha between attempts when one appears.

    Makes at most three requests. Returns ``(final_url, html)`` for the first
    response that carries no captcha challenge, where ``final_url`` is the
    URL reported by the response. Returns ``("", "")`` when a captcha cannot
    be solved or every attempt still shows one. Network errors raised by the
    request propagate to the caller.
    """
    attempts_left = 3
    while attempts_left > 0:
        attempts_left -= 1
        with browser.open(doi_url, headers=_headers()) as resp:
            final_url = resp.geturl()
            html = resp.read().decode("utf-8", errors="replace")
        if not _has_altcha(html):
            return final_url, html
        if not _solve_altcha(browser, final_url, html):
            break
    return "", ""
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _is_pdf(browser: Browser, url: str) -> bool:
    """Probe *url* and report whether it serves a PDF.

    Requests only the first eight bytes via a ``Range`` header. The resource
    counts as a PDF when its ``Content-Type`` mentions ``application/pdf``
    or, failing that, when the body starts with the ``%PDF-`` signature.
    Any HTTP, URL or OS-level error counts as "not a PDF".
    """
    probe_headers = _headers(
        {
            "Accept": "application/pdf,*/*;q=0.8",
            "Range": "bytes=0-7",
        }
    )
    try:
        with browser.open(url, headers=probe_headers) as resp:
            declared_type = (resp.headers.get("Content-Type") or "").lower()
            # The body is read only when the header check is inconclusive.
            return "application/pdf" in declared_type or resp.read(8).startswith(b"%PDF-")
    except (HTTPError, URLError, OSError):
        return False
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _extract_oa_link(html: str, page_url: str) -> str:
    """Return the canonical URL captured by ``OA_HINT_PATTERN``, or ``""``.

    Protocol-relative links are forced to ``https``; all other links are
    resolved against *page_url*.
    """
    found = OA_HINT_PATTERN.search(html)
    link = found.group(1).strip() if found else ""
    if not link:
        return ""
    absolute = f"https:{link}" if link.startswith("//") else urljoin(page_url, link)
    return _canonicalize(absolute)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _mirror_list() -> tuple[str, ...]:
    """Return the mirror base URLs to try, in order.

    The comma-separated ``SCIHUB_MIRRORS`` environment variable overrides
    ``DEFAULT_MIRRORS``. Each entry is trimmed and stripped of trailing
    slashes; entries that are empty after that clean-up are dropped. If the
    override yields no usable entry, the defaults are used, so the caller
    never receives an empty mirror list from a malformed variable.
    """
    raw = os.environ.get("SCIHUB_MIRRORS", "")
    cleaned = (item.strip().rstrip("/") for item in raw.split(","))
    # Filter AFTER normalisation so that "/" or " " cannot become an empty mirror.
    mirrors = tuple(entry for entry in cleaned if entry)
    return mirrors or DEFAULT_MIRRORS
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def resolve_pdf(doi: str) -> tuple[str, str]:
    """Resolve *doi* to a direct PDF URL by trying each mirror in turn.

    Returns a ``(status, url)`` pair:

    * ``(STATUS_FOUND, pdf_url)`` for the first candidate verified as a PDF;
    * ``(STATUS_NOT_FOUND, oa_link)`` when at least one mirror reported the
      article as unavailable (``oa_link`` may be empty);
    * ``(STATUS_MIRROR_ERROR, "")`` when mirrors only failed or produced no
      verifiable PDF;
    * ``(STATUS_INVALID_INPUT, "")`` when the DOI is empty after normalisation;
    * ``(STATUS_NOT_FOUND, "")`` when no mirror was tried at all.
    """
    normalized = _normalize_doi(doi)
    if not normalized:
        return STATUS_INVALID_INPUT, ""

    doi_path = quote(normalized, safe="/:().-_")
    not_found_seen = False
    mirror_failed = False
    fallback_link = ""

    for mirror in _mirror_list():
        # A fresh browser per mirror keeps cookies from leaking across hosts.
        session = Browser()
        try:
            landing_url, page = _fetch_page(session, f"{mirror}/{doi_path}")
        except (HTTPError, URLError, OSError):
            landing_url, page = "", ""
        if not page:
            mirror_failed = True
            continue

        heading = _extract_title(page).lower()
        if "not available through sci-hub" in heading or "no articles found" in heading:
            not_found_seen = True
            if not fallback_link:
                fallback_link = _extract_oa_link(page, landing_url)
            continue

        for pdf_url in _iter_pdf_candidates(page, landing_url):
            if _is_pdf(session, pdf_url):
                return STATUS_FOUND, pdf_url
        # The page loaded but none of its candidates turned out to be a PDF.
        mirror_failed = True

    if not_found_seen:
        return STATUS_NOT_FOUND, fallback_link
    return (STATUS_MIRROR_ERROR if mirror_failed else STATUS_NOT_FOUND), ""
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
if __name__ == "__main__":
    # Command-line entry point: resolve the DOI given as the only argument.
    #
    # Output: the PDF URL on success; otherwise the status token, followed by
    # an "OA_LINK <url>" line when a NOT_FOUND result carries a link.
    # Exit codes: 0 = found, 1 = usage error or NOT_FOUND, 2 = MIRROR_ERROR,
    # 3 = any other status (e.g. INVALID_INPUT).
    if len(sys.argv) != 2:
        # Report the name the script was actually invoked as, rather than a
        # hard-coded name that does not match this file.
        script_name = os.path.basename(sys.argv[0]) or "scihub_pdf_resolver.py"
        print(f"Usage: {script_name} <DOI>", file=sys.stderr)
        sys.exit(1)
    status, url = resolve_pdf(sys.argv[1])
    if status == STATUS_FOUND:
        print(url)
        sys.exit(0)
    print(status)
    if status == STATUS_NOT_FOUND:
        if url:
            print(f"OA_LINK {url}")
        sys.exit(1)
    sys.exit(2 if status == STATUS_MIRROR_ERROR else 3)
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unified literature synthesis helpers.
|
|
3
|
+
|
|
4
|
+
This module combines general literature review summarisation with the
|
|
5
|
+
preclinical extraction summary used by the legacy merged literature workflow.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections import Counter
|
|
11
|
+
from typing import Dict, List, Optional
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def classify_study_type(paper: Dict) -> str:
    """Classify *paper* into a coarse study-type label.

    Publication types are checked first (systematic review / meta-analysis,
    then randomized controlled trial, then clinical trial), followed by the
    ``is_preprint`` flag, and finally keyword matching on the lower-cased
    title and abstract. Missing or ``None`` fields are treated as empty, and
    a single publication type given as a bare string is accepted.

    Returns one of: ``"Systematic review / meta-analysis"``,
    ``"Randomized controlled trial"``, ``"Clinical study"``, ``"Preprint"``,
    ``"In vitro + in vivo"``, ``"In vivo"``, ``"In vitro"`` or
    ``"Observational / other"``.
    """
    # ``or []`` covers an explicit None value, which ``.get(key, [])`` does not.
    raw_types = paper.get("publication_types") or []
    if isinstance(raw_types, str):
        # A bare string would otherwise be iterated character by character.
        raw_types = [raw_types]
    text = " ".join(str(x).lower() for x in raw_types)

    if "meta-analysis" in text or "systematic review" in text:
        return "Systematic review / meta-analysis"
    if "randomized controlled trial" in text:
        return "Randomized controlled trial"
    if "clinical trial" in text:
        return "Clinical study"
    if paper.get("is_preprint"):
        return "Preprint"

    abstract = f"{paper.get('title') or ''} {paper.get('abstract') or ''}".lower()
    if any(x in abstract for x in ("xenograft", "mouse", "mice", "in vivo")):
        if any(y in abstract for y in ("cell line", "in vitro", "organoid")):
            return "In vitro + in vivo"
        return "In vivo"
    if any(x in abstract for x in ("cell line", "in vitro", "organoid", "crispr")):
        return "In vitro"
    return "Observational / other"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def classify_evidence_quality(paper: Dict) -> str:
    """Grade the evidence strength of *paper* from its study type.

    Reviews, meta-analyses and randomized trials are ``"High"``; clinical
    studies and combined in vitro + in vivo work are ``"Moderate"``.
    Remaining preprints are ``"Preliminary (preprint)"``. Purely in vivo or
    in vitro work is ``"Moderate"`` at 20 or more citations and
    ``"Low to moderate"`` below that. Everything else is ``"Preliminary"``.
    """
    study_type = classify_study_type(paper)
    citations = int(paper.get("citation_count") or 0)

    fixed_grades = {
        "Systematic review / meta-analysis": "High",
        "Randomized controlled trial": "High",
        "Clinical study": "Moderate",
        "In vitro + in vivo": "Moderate",
    }
    grade = fixed_grades.get(study_type)
    if grade is not None:
        return grade
    if paper.get("is_preprint"):
        return "Preliminary (preprint)"
    if study_type == "In vivo" or study_type == "In vitro":
        if citations >= 20:
            return "Moderate"
        return "Low to moderate"
    return "Preliminary"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def summarize_papers(papers: List[Dict]) -> Dict:
    """Aggregate headline statistics for *papers*.

    The result holds the paper count, per-label counts of study type and
    evidence quality, and ``[earliest, latest]`` publication year. The year
    range is ``None`` when no paper has an all-digit ``year`` value.
    """
    type_counts: Counter = Counter()
    for paper in papers:
        type_counts[classify_study_type(paper)] += 1

    quality_counts: Counter = Counter()
    for paper in papers:
        quality_counts[classify_evidence_quality(paper)] += 1

    years = []
    for paper in papers:
        # Only values whose string form is all digits count as a year.
        if str(paper.get("year", "")).isdigit():
            years.append(int(paper.get("year")))

    year_span = [min(years), max(years)] if years else None
    return {
        "total_papers": len(papers),
        "study_type_breakdown": dict(type_counts),
        "evidence_quality_breakdown": dict(quality_counts),
        "year_range": year_span,
    }
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def generate_narrative(papers: List[Dict], topic: str = "") -> str:
    """Render a short Markdown summary of *papers*.

    The text contains a lead line (mentioning *topic* when given), the paper
    count, the year range and the study-type / evidence-quality breakdowns
    when available, followed by a numbered list of the first five titles in
    the order the papers were supplied. A paper whose title is missing,
    ``None`` or empty is listed as ``"Untitled"``.
    """
    summary = summarize_papers(papers)
    lead = f"Literature synthesis for **{topic}**." if topic else "Literature synthesis."
    lines = [lead, "", f"- Papers reviewed: {summary['total_papers']}"]

    year_range = summary["year_range"]
    if year_range:
        lines.append(f"- Year range: {year_range[0]}-{year_range[1]}")
    if summary["study_type_breakdown"]:
        lines.append("- Study types: " + ", ".join(f"{k} ({v})" for k, v in summary["study_type_breakdown"].items()))
    if summary["evidence_quality_breakdown"]:
        lines.append("- Evidence quality: " + ", ".join(f"{k} ({v})" for k, v in summary["evidence_quality_breakdown"].items()))

    # ``or`` rather than a ``.get`` default: the default only applies when the
    # key is absent, so a present-but-None title would be rendered as "None".
    top_titles = [p.get("title") or "Untitled" for p in papers[:5]]
    if top_titles:
        lines.extend(["", "Top prioritised papers:"])
        lines.extend(f"{i}. {title}" for i, title in enumerate(top_titles, start=1))
    return "\n".join(lines)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def synthesize_literature(
    papers: List[Dict],
    experiments: Optional[List[Dict]] = None,
    topic: str = "",
    mode: str = "general",
) -> Dict:
    """Build the full synthesis record for *papers*.

    Starts from the paper summary and adds the *mode* and *topic* labels and
    the Markdown narrative. When *experiments* is non-empty, it also adds
    counts per ``experiment_type`` (``"unclassified"`` when absent) and
    counts per model system, taken from ``cell_lines`` or, failing that,
    ``animal_models``.
    """
    result = summarize_papers(papers)
    result["mode"] = mode
    result["topic"] = topic
    result["narrative_markdown"] = generate_narrative(papers, topic=topic)
    if not experiments:
        return result

    type_counts: Counter = Counter()
    for experiment in experiments:
        type_counts[experiment.get("experiment_type", "unclassified")] += 1

    # NOTE(review): model-system values are used as Counter keys, so they are
    # presumably hashable (e.g. strings) — confirm against the extractor.
    systems = [e.get("cell_lines") or e.get("animal_models") for e in experiments]
    system_counts: Counter = Counter()
    for system in systems:
        if system:
            system_counts[system] += 1

    result["experiment_type_breakdown"] = dict(type_counts)
    result["model_systems"] = dict(system_counts)
    return result
|