scitex 2.4.2__py3-none-any.whl → 2.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__version__.py +1 -1
- scitex/browser/__init__.py +53 -0
- scitex/browser/debugging/__init__.py +56 -0
- scitex/browser/debugging/_failure_capture.py +372 -0
- scitex/browser/debugging/_sync_session.py +259 -0
- scitex/browser/debugging/_test_monitor.py +284 -0
- scitex/browser/debugging/_visual_cursor.py +432 -0
- scitex/scholar/citation_graph/database.py +9 -2
- scitex/scholar/config/ScholarConfig.py +23 -3
- scitex/scholar/config/default.yaml +55 -0
- scitex/scholar/core/Paper.py +102 -0
- scitex/scholar/core/__init__.py +44 -0
- scitex/scholar/core/journal_normalizer.py +524 -0
- scitex/scholar/core/oa_cache.py +285 -0
- scitex/scholar/core/open_access.py +457 -0
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +137 -0
- scitex/scholar/pdf_download/strategies/__init__.py +6 -0
- scitex/scholar/pdf_download/strategies/open_access_download.py +186 -0
- scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +18 -3
- scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +15 -2
- {scitex-2.4.2.dist-info → scitex-2.4.3.dist-info}/METADATA +1 -1
- {scitex-2.4.2.dist-info → scitex-2.4.3.dist-info}/RECORD +25 -17
- {scitex-2.4.2.dist-info → scitex-2.4.3.dist-info}/WHEEL +0 -0
- {scitex-2.4.2.dist-info → scitex-2.4.3.dist-info}/entry_points.txt +0 -0
- {scitex-2.4.2.dist-info → scitex-2.4.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# File: /home/ywatanabe/proj/scitex-code/src/scitex/scholar/core/open_access.py
|
|
4
|
+
"""
|
|
5
|
+
Open Access Detection Module.
|
|
6
|
+
|
|
7
|
+
Provides utilities for determining if a paper is open access based on:
|
|
8
|
+
- Known open access sources (arXiv, PMC, bioRxiv, etc.)
|
|
9
|
+
- Unpaywall API lookup
|
|
10
|
+
- Publisher patterns
|
|
11
|
+
- Journal DOAJ status
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from enum import Enum
|
|
19
|
+
from typing import Optional, List, Dict, Any
|
|
20
|
+
import asyncio
|
|
21
|
+
import aiohttp
|
|
22
|
+
|
|
23
|
+
from scitex import logging
|
|
24
|
+
from scitex.scholar.config import ScholarConfig
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
# Load OA config from default.yaml (single source of truth)
|
|
29
|
+
_config = None
|
|
30
|
+
|
|
31
|
+
def _get_config() -> ScholarConfig:
    """Return the module-wide ScholarConfig, building it on first use.

    The instance is stored in the module-level ``_config`` variable so that
    every helper in this module shares one configuration object.
    """
    global _config
    if _config is not None:
        return _config
    # First call: construct the config once and remember it.
    _config = ScholarConfig()
    return _config
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _get_oa_sources() -> frozenset:
    """Return the lowercased ``OPENACCESS_SOURCES`` config entries as a frozenset.

    A missing or empty config entry yields an empty frozenset.
    """
    configured = _get_config().get("OPENACCESS_SOURCES")
    if not configured:
        configured = []
    return frozenset({name.lower() for name in configured})
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _get_oa_journals() -> tuple:
    """Return the lowercased ``OPENACCESS_JOURNALS`` config entries as a tuple.

    Order follows the config; a missing or empty entry yields an empty tuple.
    """
    configured = _get_config().get("OPENACCESS_JOURNALS")
    if not configured:
        configured = []
    return tuple([pattern.lower() for pattern in configured])
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _get_unpaywall_email() -> str:
    """Return the email to send to Unpaywall.

    Reads ``unpaywall_email`` from the shared config and falls back to a
    built-in default address when the value is missing or empty.
    """
    configured = _get_config().get("unpaywall_email")
    return configured if configured else "research@scitex.io"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class OAStatus(Enum):
    """Open-access categories, using the same vocabulary as Unpaywall.

    Each member's value is the lowercase status string, so a status string
    can be turned back into a member with ``OAStatus("gold")``.
    """

    # Published in an OA journal (DOAJ listed).
    GOLD = "gold"
    # Available in a repository (arXiv, PMC, etc.).
    GREEN = "green"
    # OA article inside a subscription journal.
    HYBRID = "hybrid"
    # Free to read on the publisher site, but without a license.
    BRONZE = "bronze"
    # Paywalled.
    CLOSED = "closed"
    # Status has not been determined.
    UNKNOWN = "unknown"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class OAResult:
    """Outcome of an open-access check for a single paper.

    Only ``is_open_access`` and ``status`` are required; the remaining
    fields default to "not known" values.
    """

    # Whether the paper was judged to be open access.
    is_open_access: bool
    # Category of open access (see OAStatus).
    status: OAStatus
    # URL where the open copy can be fetched, when one is known.
    oa_url: Optional[str] = None
    # Label describing how the OA status was determined.
    source: Optional[str] = None
    # License string, when one is known.
    license: Optional[str] = None
    # Confidence in the verdict, from 0 to 1.
    confidence: float = 1.0
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Open Access Sources and Journals are loaded from config/default.yaml
|
|
81
|
+
# These properties provide lazy-loaded access to config values
|
|
82
|
+
# (single source of truth: config/default.yaml → OPENACCESS_SOURCES, OPENACCESS_JOURNALS)
|
|
83
|
+
|
|
84
|
+
# arXiv ID patterns
|
|
85
|
+
ARXIV_PATTERNS = [
    # New scheme: YYMM.NNNN or YYMM.NNNNN with optional version, e.g. 2301.12345v2
    re.compile(r"^\d{4}\.\d{4,5}(v\d+)?$"),
    # Old scheme: archive[.SC]/YYMMNNN with optional version,
    # e.g. hep-th/9901001 or math.GT/0309136
    re.compile(r"^[a-z-]+(\.[A-Z]{2})?/\d{7}(v\d+)?$"),
    # Either scheme carrying an "arXiv:" prefix (case-insensitive),
    # e.g. arXiv:2301.12345 or arXiv:hep-th/9901001
    re.compile(
        r"^arxiv:(\d{4}\.\d{4,5}|[a-z-]+(\.[a-z]{2})?/\d{7})(v\d+)?$",
        re.IGNORECASE,
    ),
]


def is_arxiv_id(identifier: str) -> bool:
    """Check whether *identifier* has the shape of an arXiv ID.

    Surrounding whitespace is ignored. Both the new (``2301.12345``) and old
    (``hep-th/9901001``, ``math.GT/0309136``) schemes are accepted, with an
    optional version suffix and an optional ``arXiv:`` prefix.

    Args:
        identifier: Candidate identifier; ``None`` or an empty string is
            treated as "not an arXiv ID".

    Returns:
        True if the whole stripped string matches one of ARXIV_PATTERNS.
    """
    if not identifier:
        return False
    candidate = identifier.strip()
    return any(pattern.match(candidate) for pattern in ARXIV_PATTERNS)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def is_open_access_source(source: str) -> bool:
    """Tell whether *source* names a known open-access repository.

    The comparison is case-insensitive and runs against the source names
    listed under OPENACCESS_SOURCES in config/default.yaml. An empty or
    missing *source* is never considered open access.
    """
    return bool(source) and source.lower() in _get_oa_sources()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def is_open_access_journal(journal_name: str, use_cache: bool = True) -> bool:
    """Tell whether *journal_name* belongs to a known open-access journal.

    The lookup runs in up to three stages and stops at the first hit:

    1. Substring match of the lowercased name against the patterns from
       config/default.yaml → OPENACCESS_JOURNALS.
    2. The cached OpenAlex OA source list (``oa_cache`` module).
    3. The journal normalizer (``journal_normalizer`` module).

    Stages 2 and 3 run only when *use_cache* is true, and each is skipped
    silently if it raises ImportError.

    Args:
        journal_name: Journal name to check.
        use_cache: Whether to consult stages 2 and 3 (default True).

    Returns:
        True if any stage recognises the journal as open access.
    """
    if not journal_name:
        return False

    # Stage 1: configured patterns, matched as substrings.
    needle = journal_name.lower()
    for pattern in _get_oa_journals():
        if pattern in needle:
            return True

    if not use_cache:
        return False

    # Stage 2: OpenAlex cache lookup.
    try:
        from .oa_cache import is_oa_journal_cached

        cache_hit = is_oa_journal_cached(journal_name)
    except ImportError:
        cache_hit = False  # Cache module not available
    if cache_hit:
        return True

    # Stage 3: normalizer lookup.
    try:
        from .journal_normalizer import get_journal_normalizer

        normalizer_hit = get_journal_normalizer().is_open_access(journal_name)
    except ImportError:
        normalizer_hit = False  # Normalizer module not available

    return bool(normalizer_hit)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def detect_oa_from_identifiers(
    doi: Optional[str] = None,
    arxiv_id: Optional[str] = None,
    pmcid: Optional[str] = None,
    source: Optional[str] = None,
    journal: Optional[str] = None,
    is_open_access_flag: Optional[bool] = None,
) -> OAResult:
    """
    Detect open access status from paper identifiers without API calls.

    This is fast but may miss some OA papers (e.g., hybrid articles).
    For comprehensive detection, use check_oa_status_async() with Unpaywall.

    Checks run in this order and the first match wins: explicit OA flag,
    arXiv ID, PMC ID, known OA source, known OA journal. If none match, the
    result is "not open access" with low confidence.

    Args:
        doi: Paper DOI
        arxiv_id: arXiv identifier (an optional "arXiv:" prefix and
            surrounding whitespace are removed before building the PDF URL)
        pmcid: PubMed Central ID (starts with PMC; surrounding whitespace
            is ignored)
        source: Source database (arxiv, pmc, biorxiv, etc.)
        journal: Journal name
        is_open_access_flag: Pre-existing OA flag from search API

    Returns:
        OAResult with detection results
    """
    # If we already have an OA flag from a reliable source, trust it
    if is_open_access_flag is True:
        return OAResult(
            is_open_access=True,
            status=OAStatus.UNKNOWN,  # We don't know the specific type
            source="api_flag",
            confidence=0.9,
        )

    # arXiv - always open access (GREEN)
    if arxiv_id and is_arxiv_id(arxiv_id):
        # is_arxiv_id() tolerates padding and an "arXiv:" prefix, but neither
        # belongs in the PDF URL, so reduce to the bare identifier first.
        bare_arxiv_id = arxiv_id.strip()
        if bare_arxiv_id.lower().startswith("arxiv:"):
            bare_arxiv_id = bare_arxiv_id[len("arxiv:"):]
        return OAResult(
            is_open_access=True,
            status=OAStatus.GREEN,
            oa_url=f"https://arxiv.org/pdf/{bare_arxiv_id}.pdf",
            source="arxiv",
            confidence=1.0,
        )

    # PMC - always open access (GREEN)
    clean_pmcid = pmcid.strip() if pmcid else ""
    if clean_pmcid.upper().startswith("PMC"):
        pmc_num = clean_pmcid[3:]
        return OAResult(
            is_open_access=True,
            status=OAStatus.GREEN,
            oa_url=f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_num}/pdf/",
            source="pmc",
            confidence=1.0,
        )

    # Known OA source: repositories count as GREEN, anything else as GOLD
    if source and is_open_access_source(source):
        is_repository = source.lower() in ("arxiv", "pmc", "biorxiv", "medrxiv")
        return OAResult(
            is_open_access=True,
            status=OAStatus.GREEN if is_repository else OAStatus.GOLD,
            source=f"source_{source}",
            confidence=0.95,
        )

    # Known OA journal
    if journal and is_open_access_journal(journal):
        return OAResult(
            is_open_access=True,
            status=OAStatus.GOLD,
            source="oa_journal",
            confidence=0.85,
        )

    # If we have a DOI but no other OA indicators, it's likely paywalled
    if doi and not arxiv_id and not pmcid:
        return OAResult(
            is_open_access=False,
            status=OAStatus.UNKNOWN,  # Could be hybrid OA, need Unpaywall to confirm
            source="no_oa_indicators",
            confidence=0.6,  # Low confidence - could be hybrid OA
        )

    # Unknown
    return OAResult(
        is_open_access=False,
        status=OAStatus.UNKNOWN,
        source="unknown",
        confidence=0.3,
    )
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
async def check_oa_status_unpaywall(
    doi: str,
    email: str = None,
    timeout: float = 10.0,
) -> OAResult:
    """
    Check open access status via Unpaywall API.

    Unpaywall is the authoritative source for OA status detection.
    Rate limit: 100,000 requests/day with email.

    The DOI is stripped of whitespace and of a leading ``doi:`` or
    ``http(s)://(dx.)doi.org/`` prefix, then percent-encoded into the
    request path. The email is sent as a properly encoded query parameter.

    Args:
        doi: Paper DOI (required)
        email: Email for Unpaywall API (required for polite access);
            taken from config when omitted
        timeout: Request timeout in seconds

    Returns:
        OAResult with comprehensive OA information. Failures never raise:
        a missing DOI, 404, other HTTP errors, timeouts and unexpected
        exceptions are each reported through ``source`` with
        ``is_open_access=False``.
    """
    from urllib.parse import quote

    if not doi:
        return OAResult(
            is_open_access=False,
            status=OAStatus.UNKNOWN,
            source="no_doi",
        )

    # Use config email if not provided
    if email is None:
        email = _get_unpaywall_email()

    # Clean DOI: drop whitespace and any resolver/scheme prefix
    doi = doi.strip()
    doi_lower = doi.lower()
    for prefix in (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "doi:",
    ):
        if doi_lower.startswith(prefix):
            doi = doi[len(prefix):].strip()
            break

    # Nothing left after cleaning (e.g. the input was just "doi:")
    if not doi:
        return OAResult(
            is_open_access=False,
            status=OAStatus.UNKNOWN,
            source="no_doi",
        )

    # DOIs may contain characters such as '#', '?', '<' or '>' that would
    # otherwise be read as URL syntax; keep '/' since it separates the DOI
    # prefix from its suffix.
    url = f"https://api.unpaywall.org/v2/{quote(doi, safe='/')}"

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(
                url,
                params={"email": email},  # encoded by the client ('+' etc.)
                timeout=aiohttp.ClientTimeout(total=timeout),
            ) as resp:
                if resp.status == 404:
                    return OAResult(
                        is_open_access=False,
                        status=OAStatus.UNKNOWN,
                        source="unpaywall_not_found",
                        confidence=0.5,
                    )

                if resp.status != 200:
                    logger.warning(f"Unpaywall API error: {resp.status}")
                    return OAResult(
                        is_open_access=False,
                        status=OAStatus.UNKNOWN,
                        source="unpaywall_error",
                        confidence=0.0,
                    )

                data = await resp.json()

                is_oa = bool(data.get("is_oa", False))
                oa_status_str = data.get("oa_status", "closed")

                # Map Unpaywall status to our enum
                status_map = {
                    "gold": OAStatus.GOLD,
                    "green": OAStatus.GREEN,
                    "hybrid": OAStatus.HYBRID,
                    "bronze": OAStatus.BRONZE,
                    "closed": OAStatus.CLOSED,
                }
                status = status_map.get(oa_status_str, OAStatus.UNKNOWN)

                # Get best OA location, preferring a direct PDF link
                oa_url = None
                license_str = None
                best_oa = data.get("best_oa_location")
                if best_oa:
                    oa_url = best_oa.get("url_for_pdf") or best_oa.get("url")
                    license_str = best_oa.get("license")

                return OAResult(
                    is_open_access=is_oa,
                    status=status,
                    oa_url=oa_url,
                    source="unpaywall",
                    license=license_str,
                    confidence=1.0,
                )

    except asyncio.TimeoutError:
        logger.warning(f"Unpaywall timeout for DOI: {doi}")
        return OAResult(
            is_open_access=False,
            status=OAStatus.UNKNOWN,
            source="unpaywall_timeout",
            confidence=0.0,
        )
    except Exception as e:
        # Best-effort lookup: report the failure instead of propagating it
        logger.error(f"Unpaywall API error: {e}")
        return OAResult(
            is_open_access=False,
            status=OAStatus.UNKNOWN,
            source="unpaywall_exception",
            confidence=0.0,
        )
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
async def check_oa_status_async(
    doi: Optional[str] = None,
    arxiv_id: Optional[str] = None,
    pmcid: Optional[str] = None,
    source: Optional[str] = None,
    journal: Optional[str] = None,
    is_open_access_flag: Optional[bool] = None,
    use_unpaywall: bool = True,
    unpaywall_email: str = None,
) -> OAResult:
    """
    Comprehensive open access detection.

    Runs the local, identifier-based detection first. Unpaywall is queried
    only when it is enabled, a DOI is available and the local confidence is
    below 0.7; its answer replaces the local one only if it is more
    confident.

    Args:
        doi: Paper DOI
        arxiv_id: arXiv identifier
        pmcid: PubMed Central ID
        source: Source database
        journal: Journal name
        is_open_access_flag: Pre-existing OA flag
        use_unpaywall: Whether to query Unpaywall for uncertain cases
        unpaywall_email: Email for Unpaywall API

    Returns:
        OAResult with best available OA information
    """
    local = detect_oa_from_identifiers(
        doi=doi,
        arxiv_id=arxiv_id,
        pmcid=pmcid,
        source=source,
        journal=journal,
        is_open_access_flag=is_open_access_flag,
    )

    # Confident local verdicts are final.
    if local.confidence >= 0.9:
        return local

    # Without Unpaywall, a DOI, or real uncertainty there is nothing to add.
    wants_remote = use_unpaywall and doi and local.confidence < 0.7
    if not wants_remote:
        return local

    remote = await check_oa_status_unpaywall(
        doi=doi,
        email=unpaywall_email,
    )

    # Unpaywall wins only when it is more confident than the local verdict.
    return remote if remote.confidence > local.confidence else local
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def check_oa_status(
    doi: Optional[str] = None,
    arxiv_id: Optional[str] = None,
    pmcid: Optional[str] = None,
    source: Optional[str] = None,
    journal: Optional[str] = None,
    is_open_access_flag: Optional[bool] = None,
    use_unpaywall: bool = False,  # Default to sync-safe behavior
) -> OAResult:
    """
    Synchronous wrapper for OA detection.

    By default only uses local detection (no API calls).
    Set use_unpaywall=True to also consult the Unpaywall API. The async
    check then runs on a private event loop that is closed afterwards. If
    the calling thread already has a running loop (e.g. a notebook), the
    check runs on a short-lived worker thread instead, because a running
    loop cannot be re-entered.

    Args:
        doi: Paper DOI
        arxiv_id: arXiv identifier
        pmcid: PubMed Central ID
        source: Source database
        journal: Journal name
        is_open_access_flag: Pre-existing OA flag
        use_unpaywall: Whether to query Unpaywall for uncertain cases

    Returns:
        OAResult with the best available OA information
    """
    identifiers = dict(
        doi=doi,
        arxiv_id=arxiv_id,
        pmcid=pmcid,
        source=source,
        journal=journal,
        is_open_access_flag=is_open_access_flag,
    )

    if not use_unpaywall:
        return detect_oa_from_identifiers(**identifiers)

    def _run_on_private_loop() -> OAResult:
        # A dedicated loop avoids the deprecated get_event_loop() lookup,
        # leaves the thread's current-loop setting untouched and is always
        # closed, so no loop is leaked.
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(
                check_oa_status_async(use_unpaywall=True, **identifiers)
            )
        finally:
            try:
                loop.run_until_complete(loop.shutdown_asyncgens())
            finally:
                loop.close()

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: safe to block here.
        return _run_on_private_loop()

    # A loop is already running in this thread; run_until_complete() would
    # raise, so do the blocking work on a worker thread.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(_run_on_private_loop).result()
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
# EOF
|
|
@@ -33,6 +33,7 @@ from scitex.scholar.pdf_download.strategies import (
|
|
|
33
33
|
try_download_direct_async,
|
|
34
34
|
try_download_manual_async,
|
|
35
35
|
try_download_response_body_async,
|
|
36
|
+
try_download_open_access_async,
|
|
36
37
|
)
|
|
37
38
|
|
|
38
39
|
logger = logging.getLogger(__name__)
|
|
@@ -65,6 +66,17 @@ class ScholarPDFDownloader:
|
|
|
65
66
|
self.context = context
|
|
66
67
|
self.output_dir = self.config.get_library_downloads_dir()
|
|
67
68
|
|
|
69
|
+
# Load access preferences from config
|
|
70
|
+
self.prefer_open_access = self.config.resolve(
|
|
71
|
+
"prefer_open_access", default=True, type=bool
|
|
72
|
+
)
|
|
73
|
+
self.enable_paywall_access = self.config.resolve(
|
|
74
|
+
"enable_paywall_access", default=False, type=bool
|
|
75
|
+
)
|
|
76
|
+
self.track_paywall_attempts = self.config.resolve(
|
|
77
|
+
"track_paywall_attempts", default=True, type=bool
|
|
78
|
+
)
|
|
79
|
+
|
|
68
80
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
69
81
|
pass
|
|
70
82
|
|
|
@@ -130,6 +142,131 @@ class ScholarPDFDownloader:
|
|
|
130
142
|
)
|
|
131
143
|
return saved_paths
|
|
132
144
|
|
|
145
|
+
async def download_open_access(
    self,
    oa_url: str,
    output_path: Union[str, Path],
    metadata: Optional[dict] = None,
) -> Optional[Path]:
    """Fetch a PDF from an Open Access URL and save it to *output_path*.

    This is the lightweight path for papers already known to be OA: the
    work is delegated to ``try_download_open_access_async`` rather than to
    browser automation.

    Args:
        oa_url: Open Access URL (from paper.metadata.access.oa_url)
        output_path: Destination for the PDF; ".pdf" is appended when the
            path does not already end with it, and the parent directory is
            created if needed
        metadata: Optional paper metadata, forwarded to the download helper

    Returns:
        Path to downloaded PDF if successful, None otherwise
    """
    if not oa_url:
        logger.debug(f"{self.name}: No OA URL provided")
        return None

    # Normalise the destination: Path object, ".pdf" suffix, existing parent.
    target = Path(output_path) if isinstance(output_path, str) else output_path
    if not str(target).endswith(".pdf"):
        target = Path(f"{target}.pdf")
    target.parent.mkdir(parents=True, exist_ok=True)

    logger.info(f"{self.name}: Attempting OA download from {oa_url[:60]}...")

    saved = await try_download_open_access_async(
        oa_url=oa_url,
        output_path=target,
        metadata=metadata,
        func_name=self.name,
    )

    if not saved:
        logger.debug(f"{self.name}: OA download failed, may need browser-based download")
    else:
        logger.info(f"{self.name}: Successfully downloaded OA PDF to {saved}")

    return saved
|
|
190
|
+
|
|
191
|
+
async def download_smart(
    self,
    paper,
    output_path: Union[str, Path],
) -> Optional[Path]:
    """Smart download method that chooses the best strategy based on paper metadata.

    Priority order:
    1. Try Open Access URL if available and prefer_open_access is True
    2. Try regular PDF URLs if available
    3. Try paywall access (via the DOI URL) if enable_paywall_access is True
       and the paper is not flagged as open access

    When track_paywall_attempts is True and the paper has access metadata,
    the ``paywall_bypass_attempted`` / ``paywall_bypass_success`` fields on
    that metadata are updated to reflect what was tried.

    Args:
        paper: Paper object with metadata (from scitex.scholar.core.Paper);
            an object that already is the metadata is also accepted
        output_path: Path to save the downloaded PDF (".pdf" is appended
            when missing)

    Returns:
        Path to downloaded PDF if successful, None otherwise
    """
    if isinstance(output_path, str):
        output_path = Path(output_path)
    if not str(output_path).endswith(".pdf"):
        output_path = Path(str(output_path) + ".pdf")

    # Extract metadata; every section is optional
    meta = paper.metadata if hasattr(paper, 'metadata') else paper
    access = getattr(meta, 'access', None)
    url_meta = getattr(meta, 'url', None)
    id_meta = getattr(meta, 'id', None)

    is_open_access = getattr(access, 'is_open_access', False) if access else False
    oa_url = getattr(access, 'oa_url', None) if access else None
    # "or []" covers a pdfs attribute that exists but is None
    pdf_urls = (getattr(url_meta, 'pdfs', None) if url_meta else None) or []
    doi = getattr(id_meta, 'doi', None) if id_meta else None

    # Tracking fields are only written when there is somewhere to write them
    track_access = bool(access) and self.track_paywall_attempts

    logger.info(f"{self.name}: Smart download for DOI={doi}, OA={is_open_access}")

    # Strategy 1: Try Open Access if available
    if self.prefer_open_access and oa_url:
        logger.info(f"{self.name}: Trying Open Access URL first")
        result = await self.download_open_access(oa_url, output_path)
        if result:
            # Update access metadata to record successful OA download
            if track_access:
                access.paywall_bypass_attempted = False
            return result

    # Strategy 2: Try available PDF URLs (entries are dicts with 'url' or bare URLs)
    for pdf_entry in pdf_urls:
        pdf_url = pdf_entry.get('url') if isinstance(pdf_entry, dict) else pdf_entry
        if not pdf_url:
            continue
        logger.info(f"{self.name}: Trying PDF URL: {pdf_url[:60]}...")
        result = await self.download_from_url(pdf_url, output_path, doi=doi)
        if result:
            return result

    # Strategy 3: Try paywall access if enabled
    if self.enable_paywall_access and not is_open_access:
        logger.info(f"{self.name}: Attempting paywall access (opt-in enabled)")
        if track_access:
            access.paywall_bypass_attempted = True

        # Use DOI-based URL if available
        if doi:
            doi_url = f"https://doi.org/{doi}"
            result = await self.download_from_url(doi_url, output_path, doi=doi)
            if track_access:
                access.paywall_bypass_success = bool(result)
            if result:
                return result

    logger.warning(f"{self.name}: All download strategies exhausted for DOI={doi}")
    return None
|
|
269
|
+
|
|
133
270
|
async def download_from_url(
|
|
134
271
|
self,
|
|
135
272
|
pdf_url: str,
|
|
@@ -11,6 +11,10 @@ from .chrome_pdf_viewer import try_download_chrome_pdf_viewer_async
|
|
|
11
11
|
from .direct_download import try_download_direct_async
|
|
12
12
|
from .response_body import try_download_response_body_async
|
|
13
13
|
from .manual_download_fallback import try_download_manual_async
|
|
14
|
+
from .open_access_download import (
|
|
15
|
+
try_download_open_access_async,
|
|
16
|
+
try_download_open_access_sync,
|
|
17
|
+
)
|
|
14
18
|
|
|
15
19
|
# Manual download utilities
|
|
16
20
|
from .manual_download_utils import (
|
|
@@ -27,6 +31,8 @@ __all__ = [
|
|
|
27
31
|
"try_download_direct_async",
|
|
28
32
|
"try_download_response_body_async",
|
|
29
33
|
"try_download_manual_async",
|
|
34
|
+
"try_download_open_access_async",
|
|
35
|
+
"try_download_open_access_sync",
|
|
30
36
|
# Manual download utilities
|
|
31
37
|
"DownloadMonitorAndSync",
|
|
32
38
|
"FlexibleFilenameGenerator",
|