@fbraza/pi-cite 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,289 @@
1
+ #!/usr/bin/env python3
2
+ """Resolve a DOI to a direct PDF URL through Sci-Hub.
3
+
4
+ Zero dependencies. Python standard library only.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import base64
10
+ import hashlib
11
+ import http.client
12
+ import http.cookiejar
13
+ import json
14
+ import os
15
+ import re
16
+ import sys
17
+ from typing import Iterable
18
+ from urllib.error import HTTPError, URLError
19
+ from urllib.parse import quote, urljoin, urlsplit, urlunsplit
20
+ from urllib.request import (
21
+ HTTPCookieProcessor,
22
+ HTTPRedirectHandler,
23
+ Request,
24
+ build_opener,
25
+ )
26
+
27
# Seconds allowed for each HTTP request made through Browser.open().
TIMEOUT = 20

# Status strings returned by resolve_pdf() and printed by the CLI entry point.
STATUS_FOUND, STATUS_NOT_FOUND = "FOUND", "NOT_FOUND"
STATUS_MIRROR_ERROR, STATUS_INVALID_INPUT = "MIRROR_ERROR", "INVALID_INPUT"

# Mirrors tried in order when the SCIHUB_MIRRORS environment variable is unset.
DEFAULT_MIRRORS = ("https://sci-hub.st", "https://sci-hub.ru", "https://sci-hub.se")

# Desktop Chrome user-agent string sent with every request.
UA = " ".join(
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/133.0.0.0 Safari/537.36",
    )
)

# Patterns whose first group is a possible PDF URL, tried in this order:
#   1. the src/data attribute of an <iframe>, <embed> or <object> tag;
#   2. any quoted absolute or protocol-relative URL containing ".pdf" or "/pdf".
_EMBED_SRC = r'<(?:iframe|embed|object)[^>]+(?:src|data)=["\']([^"\']+)["\']'
_QUOTED_PDF_URL = r'["\']((?:https?:)?//[^"\']+?(?:\.pdf|/pdf)[^"\']*)["\']'
PDF_PATTERNS = (
    re.compile(_EMBED_SRC, re.IGNORECASE),
    re.compile(_QUOTED_PDF_URL, re.IGNORECASE),
)

# First <a href> inside a <block-rounded> element whose class list contains
# "openaccess"; group 1 is the href value.
OA_HINT_PATTERN = re.compile(
    r'<block-rounded[^>]+class\s*=\s*["\'][^"\']*\bopenaccess\b[^"\']*["\'][^>]*>(?:(?!</block-rounded>).)*?<a[^>]+href\s*=\s*["\']([^"\']+)["\']',
    re.IGNORECASE | re.DOTALL,
)
50
+
51
+
52
class Browser:
    """Small HTTP client that keeps cookies for the lifetime of the instance.

    All requests go through a single ``urllib`` opener built with an
    in-memory cookie jar and a redirect handler.
    """

    def __init__(self) -> None:
        cookie_handler = HTTPCookieProcessor(http.cookiejar.CookieJar())
        self.opener = build_opener(cookie_handler, HTTPRedirectHandler())

    def open(
        self,
        url: str,
        *,
        data: bytes | None = None,
        headers: dict[str, str] | None = None,
    ) -> http.client.HTTPResponse:
        """Send a request to *url* and return the open response.

        Args:
            url: Absolute URL to request.
            data: Optional request body, passed straight to ``Request``.
            headers: Optional request headers; an empty mapping is used
                when omitted.

        Returns:
            The response object from the opener, using the module ``TIMEOUT``.
        """
        request_headers = headers if headers else {}
        request = Request(url, data=data, headers=request_headers)
        return self.opener.open(request, timeout=TIMEOUT)
66
+
67
+
68
def _headers(extra: dict[str, str] | None = None) -> dict[str, str]:
    """Return the default browser-like request headers.

    Entries in *extra*, when given, override or extend the defaults.
    """
    defaults = {
        "User-Agent": UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    if not extra:
        return defaults
    return {**defaults, **extra}
77
+
78
+
79
def _canonicalize(url: str) -> str:
    """Return *url* trimmed, with escaped slashes restored and no fragment.

    Surrounding whitespace is removed and every ``\\/`` sequence is turned
    back into ``/`` before the URL is split; the fragment is then dropped.
    """
    cleaned = url.strip().replace("\\/", "/")
    scheme, netloc, path, query, _fragment = urlsplit(cleaned)
    return urlunsplit((scheme, netloc, path, query, ""))
82
+
83
+
84
def _normalize_doi(raw: str) -> str:
    """Reduce user input to a bare DOI string.

    Strips surrounding whitespace, a leading ``doi:`` label, and a leading
    DOI resolver URL. The resolver prefix is recognised with or without an
    ``http(s)://`` scheme and with an optional ``dx.`` or ``www.`` host
    prefix, so ``doi.org/10.1000/xyz`` is handled like
    ``https://doi.org/10.1000/xyz``.

    Args:
        raw: DOI as typed by the user (bare, ``doi:``-prefixed, or a URL).

    Returns:
        The DOI without prefix, or ``""`` when nothing remains.
    """
    doi = raw.strip()
    doi = re.sub(r"^doi:\s*", "", doi, flags=re.I)
    # The scheme is optional: pasted links often lose it, and the resolver
    # host must not be sent to the mirror as part of the DOI.
    doi = re.sub(r"^(?:https?://)?(?:(?:dx|www)\.)?doi\.org/", "", doi, flags=re.I)
    return doi.strip()
89
+
90
+
91
def _extract_title(html: str) -> str:
    """Return the text of the first ``<title>`` element, whitespace-collapsed.

    Returns ``""`` when the document has no ``<title>`` element.
    """
    found = re.search(r"<title[^>]*>(.*?)</title>", html, re.I | re.S)
    return " ".join(found.group(1).split()) if found else ""
96
+
97
+
98
def _iter_pdf_candidates(html: str, page_url: str) -> Iterable[str]:
    """Yield unique, absolute candidate PDF URLs found in *html*.

    Patterns from ``PDF_PATTERNS`` are applied in order. Each hit is made
    absolute (protocol-relative links get ``https:``, everything else is
    joined against *page_url*), canonicalised, and yielded the first time
    it is seen.
    """
    emitted: set[str] = set()
    hits = (hit for pattern in PDF_PATTERNS for hit in pattern.findall(html))
    for hit in hits:
        link = hit.strip()
        if not link:
            continue
        absolute = f"https:{link}" if link.startswith("//") else urljoin(page_url, link)
        absolute = _canonicalize(absolute)
        if absolute not in emitted:
            emitted.add(absolute)
            yield absolute
114
+
115
+
116
def _has_altcha(html: str) -> bool:
    """Tell whether *html* references a ``/captcha/challenge/<id>`` endpoint."""
    return re.search(r"/captcha/challenge/\d+", html) is not None
118
+
119
+
120
def _hexdigest(data: str, algorithm: str) -> str:
    """Return the hex digest of *data* (UTF-8 encoded) under *algorithm*.

    The algorithm name is trimmed, lower-cased and stripped of hyphens
    before being handed to ``hashlib.new`` (so ``"SHA-256"`` becomes
    ``"sha256"``). ``hashlib.new`` raises ``ValueError`` for unknown names.
    """
    hash_name = algorithm.strip().lower().replace("-", "")
    return hashlib.new(hash_name, data.encode("utf-8")).hexdigest()
124
+
125
+
126
def _solve_altcha(browser: Browser, page_url: str, html: str) -> bool:
    """Try to solve the proof-of-work captcha referenced by *html*.

    The challenge id is read from a ``/captcha/challenge/<id>`` reference in
    the page. The challenge JSON is fetched from the page's origin, numbers
    ``0..maxNumber`` are tried until the hash of ``salt + number`` equals the
    ``challenge`` value, and the base64-encoded JSON solution is posted to
    ``/captcha/solution/<id>``.

    Args:
        browser: Client used for both requests (so cookies are shared).
        page_url: URL of the page that showed the captcha; supplies the
            origin and the ``Referer`` header.
        html: Body of that page.

    Returns:
        ``True`` only when the solution endpoint answers with a JSON object
        whose ``success`` value is truthy. Any network, protocol, parsing or
        hashing failure yields ``False``.
    """
    challenge_id = re.search(r"/captcha/challenge/(\d+)", html)
    if not challenge_id:
        return False
    parts = urlsplit(page_url)
    base_url = f"{parts.scheme}://{parts.netloc}"
    challenge_url = urljoin(base_url, f"/captcha/challenge/{challenge_id.group(1)}")
    solution_url = urljoin(base_url, f"/captcha/solution/{challenge_id.group(1)}")
    # http.client.HTTPException (IncompleteRead, BadStatusLine, ...) is not an
    # OSError, so it has to be listed explicitly to keep this best-effort.
    transport_errors = (HTTPError, URLError, OSError, http.client.HTTPException)
    try:
        with browser.open(challenge_url, headers=_headers({"Accept": "application/json"})) as resp:
            challenge = json.loads(resp.read().decode("utf-8", errors="replace"))
        # Subscripting with a string key raises TypeError for non-object
        # JSON, so `challenge` is known to be a mapping past this point.
        algorithm = str(challenge["algorithm"])
        salt = str(challenge["salt"])
        target = str(challenge["challenge"])
        max_number = int(challenge["maxNumber"])
    except transport_errors + (KeyError, TypeError, ValueError, json.JSONDecodeError):
        return False
    number = None
    # NOTE(review): max_number comes from the server and is not capped here.
    try:
        for value in range(max_number + 1):
            if _hexdigest(f"{salt}{value}", algorithm) == target:
                number = value
                break
    except ValueError:
        # Raised by hashlib for an unsupported algorithm name.
        return False
    if number is None:
        return False
    payload = base64.b64encode(
        json.dumps(
            {
                "algorithm": algorithm,
                "challenge": target,
                "number": number,
                "salt": salt,
                "signature": challenge.get("signature", ""),
                "took": 0,
            },
            separators=(",", ":"),
        ).encode("utf-8")
    ).decode("ascii")
    body = json.dumps({"captcha": payload}).encode("utf-8")
    try:
        with browser.open(
            solution_url,
            data=body,
            headers=_headers(
                {
                    "Content-Type": "application/json",
                    "Accept": "application/json",
                    "Origin": base_url,
                    "Referer": page_url,
                }
            ),
        ) as resp:
            response = json.loads(resp.read().decode("utf-8", errors="replace"))
    except transport_errors + (TypeError, ValueError, json.JSONDecodeError):
        return False
    # The endpoint may answer with valid JSON that is not an object (a list,
    # string, number or null); treat that as failure rather than crashing.
    if not isinstance(response, dict):
        return False
    return bool(response.get("success"))
184
+
185
+
186
def _fetch_page(browser: Browser, doi_url: str) -> tuple[str, str]:
    """Fetch *doi_url*, solving a captcha interstitial when one is shown.

    The page is requested up to three times. A response without a captcha
    reference is returned immediately. Otherwise the captcha is attempted
    and, if solved, the same URL is requested again.

    Returns:
        ``(final_url, html)`` for a captcha-free page, where ``final_url``
        is the response's URL, or ``("", "")`` when the captcha could not
        be solved or was still present after three attempts.

    Exceptions raised by ``browser.open`` or by reading the response are
    not caught here.
    """
    attempts_left = 3
    while attempts_left > 0:
        attempts_left -= 1
        with browser.open(doi_url, headers=_headers()) as resp:
            landed_url = resp.geturl()
            body = resp.read().decode("utf-8", errors="replace")
        if not _has_altcha(body):
            return landed_url, body
        if not _solve_altcha(browser, landed_url, body):
            return "", ""
    return "", ""
198
+
199
+
200
def _is_pdf(browser: Browser, url: str) -> bool:
    """Probe *url* and report whether it serves a PDF.

    A ranged request for the first eight bytes is sent. The URL counts as a
    PDF when the response ``Content-Type`` contains ``application/pdf`` or
    when the body starts with the ``%PDF-`` magic bytes.

    Args:
        browser: Client used for the probe request.
        url: Candidate URL scraped from a mirror page.

    Returns:
        ``True`` for a PDF response; ``False`` otherwise, including when the
        request fails or the URL cannot be requested at all.
    """
    headers = _headers(
        {
            "Accept": "application/pdf,*/*;q=0.8",
            "Range": "bytes=0-7",
        }
    )
    try:
        with browser.open(url, headers=headers) as resp:
            content_type = (resp.headers.get("Content-Type") or "").lower()
            if "application/pdf" in content_type:
                return True
            prefix = resp.read(8)
            return prefix.startswith(b"%PDF-")
    except (HTTPError, URLError, OSError, http.client.HTTPException, ValueError):
        # Candidates come from scraped HTML. A URL containing a space or
        # control character raises http.client.InvalidURL and a malformed
        # one raises ValueError; neither is an OSError. A bad candidate
        # must be skipped, not abort the whole resolution.
        return False
216
+
217
+
218
def _extract_oa_link(html: str, page_url: str) -> str:
    """Return the open-access link matched by ``OA_HINT_PATTERN``, if any.

    The captured href is made absolute (protocol-relative links get
    ``https:``, everything else is joined against *page_url*) and then
    canonicalised. Returns ``""`` when there is no match or the href is
    blank.
    """
    hit = OA_HINT_PATTERN.search(html)
    href = hit.group(1).strip() if hit else ""
    if not href:
        return ""
    absolute = f"https:{href}" if href.startswith("//") else urljoin(page_url, href)
    return _canonicalize(absolute)
230
+
231
+
232
def _mirror_list() -> tuple[str, ...]:
    """Return the mirrors to try, in order.

    The ``SCIHUB_MIRRORS`` environment variable is read as a comma-separated
    list. Each entry is trimmed and stripped of trailing slashes; entries
    that are empty after that clean-up are dropped. When the variable is
    unset or yields no usable entry, ``DEFAULT_MIRRORS`` is returned.
    """
    raw = os.environ.get("SCIHUB_MIRRORS", "")
    cleaned = (item.strip().rstrip("/") for item in raw.split(","))
    # Filter after normalising so an entry such as "/" does not survive as
    # an empty mirror, and fall back when only separators were supplied.
    mirrors = tuple(mirror for mirror in cleaned if mirror)
    return mirrors or DEFAULT_MIRRORS
237
+
238
+
239
def resolve_pdf(doi: str) -> tuple[str, str]:
    """Resolve *doi* to a direct PDF URL by trying each mirror in turn.

    Args:
        doi: DOI in any form accepted by ``_normalize_doi``.

    Returns:
        A ``(status, url)`` pair:

        * ``(STATUS_FOUND, pdf_url)`` as soon as a candidate URL on any
          mirror is confirmed to be a PDF.
        * ``(STATUS_NOT_FOUND, oa_link)`` when at least one mirror's page
          title said the article is unavailable; ``oa_link`` is the first
          open-access link found on such a page, or ``""``.
        * ``(STATUS_MIRROR_ERROR, "")`` when every mirror failed (request
          error, unsolved captcha, or no candidate turned out to be a PDF).
        * ``(STATUS_INVALID_INPUT, "")`` when the DOI is empty after
          normalisation.
        * ``(STATUS_NOT_FOUND, "")`` when there was no mirror to try.
    """
    normalized = _normalize_doi(doi)
    if not normalized:
        return STATUS_INVALID_INPUT, ""
    safe_doi = quote(normalized, safe="/:().-_")
    saw_not_found = False
    saw_mirror_error = False
    oa_link = ""
    for mirror in _mirror_list():
        # A fresh browser per mirror keeps cookies from leaking between hosts.
        browser = Browser()
        try:
            page_url, html = _fetch_page(browser, f"{mirror}/{safe_doi}")
        except (HTTPError, URLError, OSError, http.client.HTTPException, ValueError):
            # ValueError covers a mirror entry urllib cannot request (for
            # example one configured without a scheme). HTTPException covers
            # protocol-level failures that are not OSError. Either way this
            # mirror is unusable, so move on to the next one.
            saw_mirror_error = True
            continue
        if not html:
            saw_mirror_error = True
            continue
        title = _extract_title(html).lower()
        if "not available through sci-hub" in title or "no articles found" in title:
            saw_not_found = True
            if not oa_link:
                oa_link = _extract_oa_link(html, page_url)
            continue
        for candidate in _iter_pdf_candidates(html, page_url):
            if _is_pdf(browser, candidate):
                return STATUS_FOUND, candidate
        # The page loaded but none of its candidates was a PDF.
        saw_mirror_error = True
    if saw_not_found:
        return STATUS_NOT_FOUND, oa_link
    if saw_mirror_error:
        return STATUS_MIRROR_ERROR, ""
    return STATUS_NOT_FOUND, ""
272
+
273
+
274
if __name__ == "__main__":
    # CLI contract:
    #   exit 0 -> PDF URL printed on stdout
    #   exit 1 -> usage error, or NOT_FOUND (optionally followed by OA_LINK)
    #   exit 2 -> MIRROR_ERROR
    #   exit 3 -> any other status (e.g. INVALID_INPUT)
    if len(sys.argv) != 2:
        print("Usage: scihub-paper-downloader.py <DOI>", file=sys.stderr)
        sys.exit(1)
    status, url = resolve_pdf(sys.argv[1])
    if status == STATUS_FOUND:
        print(url)
        sys.exit(0)
    print(status)
    if status == STATUS_NOT_FOUND:
        if url:
            print(f"OA_LINK {url}")
        sys.exit(1)
    sys.exit(2 if status == STATUS_MIRROR_ERROR else 3)
@@ -0,0 +1,93 @@
1
+ """
2
+ Unified literature synthesis helpers.
3
+
4
+ This module combines general literature review summarisation with the
5
+ preclinical extraction summary used by the legacy merged literature workflow.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections import Counter
11
+ from typing import Dict, List, Optional
12
+
13
+
14
def classify_study_type(paper: Dict) -> str:
    """Classify a paper record into a coarse study-type label.

    Publication types are checked first (meta-analysis / systematic review,
    then randomized controlled trial, then clinical trial), then the
    ``is_preprint`` flag, and finally keywords in the title and abstract.

    Args:
        paper: Mapping that may contain ``publication_types`` (a sequence of
            labels or a single string), ``is_preprint``, ``title`` and
            ``abstract``. Missing or ``None`` values are treated as empty.

    Returns:
        One of ``"Systematic review / meta-analysis"``,
        ``"Randomized controlled trial"``, ``"Clinical study"``,
        ``"Preprint"``, ``"In vitro + in vivo"``, ``"In vivo"``,
        ``"In vitro"`` or ``"Observational / other"``.
    """
    # `or []` covers an explicit None value, which .get()'s default does not.
    raw_types = paper.get("publication_types") or []
    if isinstance(raw_types, str):
        # A bare string would otherwise be iterated character by character.
        raw_types = [raw_types]
    text = " ".join(str(x).lower() for x in raw_types)
    if "meta-analysis" in text or "systematic review" in text:
        return "Systematic review / meta-analysis"
    if "randomized controlled trial" in text:
        return "Randomized controlled trial"
    if "clinical trial" in text:
        return "Clinical study"
    if paper.get("is_preprint"):
        return "Preprint"

    abstract = f"{paper.get('title') or ''} {paper.get('abstract') or ''}".lower()
    if any(x in abstract for x in ("xenograft", "mouse", "mice", "in vivo")):
        if any(y in abstract for y in ("cell line", "in vitro", "organoid")):
            return "In vitro + in vivo"
        return "In vivo"
    if any(x in abstract for x in ("cell line", "in vitro", "organoid", "crispr")):
        return "In vitro"
    return "Observational / other"
34
+
35
+
36
def classify_evidence_quality(paper: Dict) -> str:
    """Rate the evidence strength of a paper from its study type.

    Args:
        paper: Paper record; read via ``classify_study_type`` plus the
            ``is_preprint`` and ``citation_count`` keys.

    Returns:
        ``"High"`` for systematic reviews / meta-analyses and randomized
        controlled trials; ``"Moderate"`` for clinical studies and combined
        in vitro + in vivo work; ``"Preliminary (preprint)"`` for other
        preprints; for purely in vivo or in vitro work, ``"Moderate"`` when
        the citation count is at least 20 and ``"Low to moderate"``
        otherwise; ``"Preliminary"`` for everything else.
    """
    study_type = classify_study_type(paper)
    if study_type in {"Systematic review / meta-analysis", "Randomized controlled trial"}:
        return "High"
    if study_type in {"Clinical study", "In vitro + in vivo"}:
        return "Moderate"
    if paper.get("is_preprint"):
        return "Preliminary (preprint)"
    if study_type in {"In vivo", "In vitro"}:
        # Only this branch uses the citation count, so parse it here. A
        # missing or non-numeric value counts as zero instead of raising.
        try:
            citation_count = int(paper.get("citation_count") or 0)
        except (TypeError, ValueError, OverflowError):
            citation_count = 0
        return "Moderate" if citation_count >= 20 else "Low to moderate"
    return "Preliminary"
48
+
49
+
50
def summarize_papers(papers: List[Dict]) -> Dict:
    """Aggregate counts and the publication-year span for *papers*.

    Args:
        papers: Paper records; each is classified with
            ``classify_study_type`` and ``classify_evidence_quality`` and
            its ``year`` key is read.

    Returns:
        A dict with ``total_papers``, ``study_type_breakdown`` and
        ``evidence_quality_breakdown`` (label -> count), and ``year_range``
        as ``[min, max]``, or ``None`` when no paper has a usable year.
    """
    study_types = Counter(classify_study_type(p) for p in papers)
    evidence = Counter(classify_evidence_quality(p) for p in papers)
    years = []
    for paper in papers:
        year_text = str(paper.get("year", ""))
        # isdecimal() accepts only characters int() can parse; isdigit()
        # also accepts e.g. superscript digits, which make int() raise.
        if year_text.isdecimal():
            years.append(int(year_text))
    return {
        "total_papers": len(papers),
        "study_type_breakdown": dict(study_types),
        "evidence_quality_breakdown": dict(evidence),
        "year_range": [min(years), max(years)] if years else None,
    }
60
+
61
+
62
def generate_narrative(papers: List[Dict], topic: str = "") -> str:
    """Render a short Markdown overview of *papers*.

    The text has a lead line (mentioning *topic* when given), bullet lines
    for paper count, year range, study types and evidence quality, and a
    numbered list of the first five paper titles in input order.

    Args:
        papers: Paper records, already in the desired priority order.
        topic: Optional topic name placed in bold in the lead line.

    Returns:
        The Markdown text, lines joined with ``"\\n"``.
    """
    summary = summarize_papers(papers)
    lead = f"Literature synthesis for **{topic}**." if topic else "Literature synthesis."
    lines = [lead, "", f"- Papers reviewed: {summary['total_papers']}"]
    year_range = summary["year_range"]
    if year_range:
        lines.append(f"- Year range: {year_range[0]}-{year_range[1]}")
    if summary["study_type_breakdown"]:
        lines.append("- Study types: " + ", ".join(f"{k} ({v})" for k, v in summary["study_type_breakdown"].items()))
    if summary["evidence_quality_breakdown"]:
        lines.append("- Evidence quality: " + ", ".join(f"{k} ({v})" for k, v in summary["evidence_quality_breakdown"].items()))

    # `or` (not a .get() default) so a None or empty title also becomes
    # "Untitled" instead of rendering as "None" or a blank entry.
    top_titles = [p.get("title") or "Untitled" for p in papers[:5]]
    if top_titles:
        lines.extend(["", "Top prioritised papers:"])
        lines.extend(f"{i}. {title}" for i, title in enumerate(top_titles, start=1))
    return "\n".join(lines)
78
+
79
+
80
def synthesize_literature(
    papers: List[Dict],
    experiments: Optional[List[Dict]] = None,
    topic: str = "",
    mode: str = "general",
) -> Dict:
    """Build the combined synthesis payload for *papers*.

    Starts from ``summarize_papers`` and adds ``mode``, ``topic`` and
    ``narrative_markdown``. When *experiments* is non-empty, also adds
    ``experiment_type_breakdown`` (count per ``experiment_type``, defaulting
    to ``"unclassified"``) and ``model_systems`` (count per ``cell_lines``
    value, falling back to ``animal_models``; falsy values are skipped).
    """
    result = summarize_papers(papers)
    result.update(
        mode=mode,
        topic=topic,
        narrative_markdown=generate_narrative(papers, topic=topic),
    )
    if not experiments:
        return result

    type_counts = Counter(e.get("experiment_type", "unclassified") for e in experiments)
    systems = [e.get("cell_lines") or e.get("animal_models") for e in experiments]
    system_counts = Counter(system for system in systems if system)
    result["experiment_type_breakdown"] = dict(type_counts)
    result["model_systems"] = dict(system_counts)
    return result