scitex 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__version__.py +1 -1
- scitex/browser/__init__.py +53 -0
- scitex/browser/auth/__init__.py +35 -0
- scitex/browser/auth/google.py +381 -0
- scitex/browser/collaboration/__init__.py +5 -0
- scitex/browser/debugging/__init__.py +56 -0
- scitex/browser/debugging/_failure_capture.py +372 -0
- scitex/browser/debugging/_sync_session.py +259 -0
- scitex/browser/debugging/_test_monitor.py +284 -0
- scitex/browser/debugging/_visual_cursor.py +432 -0
- scitex/scholar/citation_graph/README.md +117 -0
- scitex/scholar/citation_graph/__init__.py +29 -0
- scitex/scholar/citation_graph/builder.py +214 -0
- scitex/scholar/citation_graph/database.py +246 -0
- scitex/scholar/citation_graph/example.py +96 -0
- scitex/scholar/citation_graph/models.py +80 -0
- scitex/scholar/config/ScholarConfig.py +23 -3
- scitex/scholar/config/default.yaml +56 -0
- scitex/scholar/core/Paper.py +102 -0
- scitex/scholar/core/__init__.py +44 -0
- scitex/scholar/core/journal_normalizer.py +524 -0
- scitex/scholar/core/oa_cache.py +285 -0
- scitex/scholar/core/open_access.py +457 -0
- scitex/scholar/metadata_engines/ScholarEngine.py +9 -1
- scitex/scholar/metadata_engines/individual/CrossRefLocalEngine.py +82 -21
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +137 -0
- scitex/scholar/pdf_download/strategies/__init__.py +6 -0
- scitex/scholar/pdf_download/strategies/open_access_download.py +186 -0
- scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +27 -9
- scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +24 -8
- scitex/scholar/search_engines/ScholarSearchEngine.py +6 -1
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/METADATA +1 -1
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/RECORD +36 -20
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/WHEEL +0 -0
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/entry_points.txt +0 -0
- {scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# File: /home/ywatanabe/proj/scitex-code/src/scitex/scholar/core/open_access.py
|
|
4
|
+
"""
|
|
5
|
+
Open Access Detection Module.
|
|
6
|
+
|
|
7
|
+
Provides utilities for determining if a paper is open access based on:
|
|
8
|
+
- Known open access sources (arXiv, PMC, bioRxiv, etc.)
|
|
9
|
+
- Unpaywall API lookup
|
|
10
|
+
- Publisher patterns
|
|
11
|
+
- Journal DOAJ status
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from enum import Enum
|
|
19
|
+
from typing import Optional, List, Dict, Any
|
|
20
|
+
import asyncio
|
|
21
|
+
import aiohttp
|
|
22
|
+
|
|
23
|
+
from scitex import logging
|
|
24
|
+
from scitex.scholar.config import ScholarConfig
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
# Load OA config from default.yaml (single source of truth)
|
|
29
|
+
_config = None


def _get_config() -> ScholarConfig:
    """Return the module-wide ScholarConfig, constructing it on first use."""
    global _config
    # Fast path: an instance was already built by an earlier call.
    if _config is not None:
        return _config
    _config = ScholarConfig()
    return _config
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _get_oa_sources() -> frozenset:
    """Return the configured OPENACCESS_SOURCES entries, lowercased, as a frozenset.

    A missing or empty config value yields an empty frozenset.
    """
    configured = _get_config().get("OPENACCESS_SOURCES")
    if not configured:
        configured = []
    return frozenset({entry.lower() for entry in configured})
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _get_oa_journals() -> tuple:
    """Return the configured OPENACCESS_JOURNALS patterns, lowercased, as a tuple.

    A missing or empty config value yields an empty tuple.
    """
    patterns = _get_config().get("OPENACCESS_JOURNALS")
    if not patterns:
        return ()
    return tuple([pattern.lower() for pattern in patterns])
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _get_unpaywall_email() -> str:
    """Return the Unpaywall contact email from config, or the built-in default."""
    configured = _get_config().get("unpaywall_email")
    # Any falsy config value (missing, None, empty string) selects the default.
    return configured if configured else "research@scitex.io"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class OAStatus(Enum):
    """Open-access categories; the string values follow Unpaywall's naming."""

    # Published in an OA journal (DOAJ listed)
    GOLD = "gold"
    # Available from a repository (arXiv, PMC, etc.)
    GREEN = "green"
    # OA article inside a subscription journal
    HYBRID = "hybrid"
    # Free to read on the publisher site, but without a license
    BRONZE = "bronze"
    # Paywalled
    CLOSED = "closed"
    # Status has not been determined
    UNKNOWN = "unknown"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class OAResult:
    """Outcome of an open-access check for a single paper."""

    # Whether the paper was judged to be open access.
    is_open_access: bool
    # Category of open access (see OAStatus).
    status: OAStatus
    # Link to an open copy, when one is known.
    oa_url: Optional[str] = None
    # Label describing how the OA status was determined.
    source: Optional[str] = None
    # License string, when one is known.
    license: Optional[str] = None
    # Confidence in the verdict, on a 0-1 scale.
    confidence: float = 1.0
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Open Access Sources and Journals are loaded from config/default.yaml
|
|
81
|
+
# The _get_oa_sources() and _get_oa_journals() helper functions give lazy access to these config values
|
|
82
|
+
# (single source of truth: config/default.yaml → OPENACCESS_SOURCES, OPENACCESS_JOURNALS)
|
|
83
|
+
|
|
84
|
+
# arXiv ID patterns
|
|
85
|
+
ARXIV_PATTERNS = [
    # New scheme: 2301.12345, optionally versioned (2301.12345v2)
    re.compile(r"^\d{4}\.\d{4,5}(v\d+)?$"),
    # Old scheme: hep-th/9901001, optionally with a subject class (math.GT/0309136)
    re.compile(r"^[a-z-]+(\.[A-Z]{2})?/\d{7}(v\d+)?$"),
    # "arXiv:"-prefixed new scheme
    re.compile(r"^arxiv:\d{4}\.\d{4,5}(v\d+)?$", re.IGNORECASE),
    # "arXiv:"-prefixed old scheme
    re.compile(r"^arxiv:[a-z-]+(\.[a-z]{2})?/\d{7}(v\d+)?$", re.IGNORECASE),
]


def is_arxiv_id(identifier: str) -> bool:
    """Check if identifier looks like an arXiv ID.

    Surrounding whitespace is ignored. Both the new scheme (``2301.12345``)
    and the old scheme (``hep-th/9901001``, ``math.GT/0309136``) are
    recognised, with or without an ``arXiv:`` prefix and a ``vN`` suffix.

    Args:
        identifier: Candidate identifier; ``None`` or an empty string is
            treated as "not an arXiv ID".

    Returns:
        True if the stripped identifier matches one of ARXIV_PATTERNS.
    """
    if not identifier:
        return False
    candidate = identifier.strip()
    return any(pattern.match(candidate) for pattern in ARXIV_PATTERNS)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def is_open_access_source(source: str) -> bool:
    """Tell whether ``source`` names a known open access repository.

    The lookup is case-insensitive against the entries loaded from
    config/default.yaml → OPENACCESS_SOURCES. A falsy ``source`` is never
    considered open access.
    """
    if source:
        return source.lower() in _get_oa_sources()
    return False
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def is_open_access_journal(journal_name: str, use_cache: bool = True) -> bool:
    """Decide whether a journal is a known open access journal.

    Lookup order:
        1. Substring match of each config/default.yaml → OPENACCESS_JOURNALS
           pattern against the lowercased journal name.
        2. ``is_oa_journal_cached`` from the sibling ``oa_cache`` module.
        3. ``get_journal_normalizer().is_open_access`` from the sibling
           ``journal_normalizer`` module.

    Steps 2 and 3 run only when ``use_cache`` is true; each is skipped
    silently if its module cannot be imported.

    Args:
        journal_name: Journal name to check.
        use_cache: Whether to consult the cache-backed lookups (default True).

    Returns:
        True if any step reports the journal as open access, else False.
    """
    if not journal_name:
        return False

    lowered_name = journal_name.lower()

    # Tier 1: pattern match against the YAML-configured list.
    for pattern in _get_oa_journals():
        if pattern in lowered_name:
            return True

    if not use_cache:
        return False

    # Tier 2: OpenAlex-backed cache.
    try:
        from .oa_cache import is_oa_journal_cached

        if is_oa_journal_cached(journal_name):
            return True
    except ImportError:
        pass  # Cache module not available

    # Tier 3: journal normalizer (abbreviations, variants).
    try:
        from .journal_normalizer import get_journal_normalizer

        if get_journal_normalizer().is_open_access(journal_name):
            return True
    except ImportError:
        pass  # Normalizer module not available

    return False
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def detect_oa_from_identifiers(
    doi: Optional[str] = None,
    arxiv_id: Optional[str] = None,
    pmcid: Optional[str] = None,
    source: Optional[str] = None,
    journal: Optional[str] = None,
    is_open_access_flag: Optional[bool] = None,
) -> OAResult:
    """
    Detect open access status from paper identifiers without API calls.

    This is fast but may miss some OA papers (e.g., hybrid articles).
    For comprehensive detection, use check_oa_status_async() with Unpaywall.

    The checks run in this order and the first hit wins: pre-existing OA
    flag, arXiv ID, PMCID, known OA source, known OA journal, DOI-only
    fallback, unknown.

    Args:
        doi: Paper DOI
        arxiv_id: arXiv identifier (an ``arXiv:`` prefix and surrounding
            whitespace are tolerated and removed when building the PDF URL)
        pmcid: PubMed Central ID (starts with PMC; surrounding whitespace
            is ignored)
        source: Source database (arxiv, pmc, biorxiv, etc.)
        journal: Journal name
        is_open_access_flag: Pre-existing OA flag from search API

    Returns:
        OAResult with detection results
    """
    # If we already have an OA flag from a reliable source, trust it
    if is_open_access_flag is True:
        return OAResult(
            is_open_access=True,
            status=OAStatus.UNKNOWN,  # We don't know the specific type
            source="api_flag",
            confidence=0.9,
        )

    # arXiv - always open access (GREEN)
    if arxiv_id and is_arxiv_id(arxiv_id):
        # Build the link from the bare identifier: the raw value may carry
        # whitespace or an "arXiv:" prefix, neither of which belongs in the URL.
        bare_arxiv_id = arxiv_id.strip()
        if bare_arxiv_id.lower().startswith("arxiv:"):
            bare_arxiv_id = bare_arxiv_id[len("arxiv:"):]
        return OAResult(
            is_open_access=True,
            status=OAStatus.GREEN,
            oa_url=f"https://arxiv.org/pdf/{bare_arxiv_id}.pdf",
            source="arxiv",
            confidence=1.0,
        )

    # PMC - always open access (GREEN)
    pmcid_clean = pmcid.strip() if pmcid else ""
    if pmcid_clean.upper().startswith("PMC"):
        # Drop the (case-insensitive) "PMC" prefix; it is re-added in
        # canonical upper case below.
        pmc_num = pmcid_clean[3:]
        return OAResult(
            is_open_access=True,
            status=OAStatus.GREEN,
            oa_url=f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_num}/pdf/",
            source="pmc",
            confidence=1.0,
        )

    # Known OA source
    if source and is_open_access_source(source):
        is_repository = source.lower() in ("arxiv", "pmc", "biorxiv", "medrxiv")
        return OAResult(
            is_open_access=True,
            status=OAStatus.GREEN if is_repository else OAStatus.GOLD,
            source=f"source_{source}",
            confidence=0.95,
        )

    # Known OA journal
    if journal and is_open_access_journal(journal):
        return OAResult(
            is_open_access=True,
            status=OAStatus.GOLD,
            source="oa_journal",
            confidence=0.85,
        )

    # If we have a DOI but no other OA indicators, it's likely paywalled
    if doi and not arxiv_id and not pmcid:
        return OAResult(
            is_open_access=False,
            status=OAStatus.UNKNOWN,  # Could be hybrid OA, need Unpaywall to confirm
            source="no_oa_indicators",
            confidence=0.6,  # Low confidence - could be hybrid OA
        )

    # Unknown
    return OAResult(
        is_open_access=False,
        status=OAStatus.UNKNOWN,
        source="unknown",
        confidence=0.3,
    )
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
async def check_oa_status_unpaywall(
    doi: str,
    email: Optional[str] = None,
    timeout: float = 10.0,
) -> OAResult:
    """
    Check open access status via Unpaywall API.

    Unpaywall is the authoritative source for OA status detection.
    Rate limit: 100,000 requests/day with email.

    The DOI is normalised before the request: surrounding whitespace and a
    leading ``doi:`` or ``http(s)://(dx.)doi.org/`` prefix are removed, and
    the remainder is percent-encoded so that characters such as ``#``,
    ``?``, ``<`` or ``>`` cannot change the structure of the request URL.

    Args:
        doi: Paper DOI (required)
        email: Email for Unpaywall API (required for polite access);
            falls back to the configured address when None
        timeout: Request timeout in seconds

    Returns:
        OAResult with comprehensive OA information. Failures never raise:
        they are reported through ``source`` ("no_doi",
        "unpaywall_not_found", "unpaywall_error", "unpaywall_timeout",
        "unpaywall_exception") with a reduced ``confidence``.
    """
    # Local import: keeps this block self-contained without touching the
    # module-level import list.
    from urllib.parse import quote

    if not doi:
        return OAResult(
            is_open_access=False,
            status=OAStatus.UNKNOWN,
            source="no_doi",
        )

    # Use config email if not provided
    if email is None:
        email = _get_unpaywall_email()

    # Clean DOI: strip whitespace and one known resolver/scheme prefix
    doi = doi.strip()
    doi_lower = doi.lower()
    for prefix in (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "doi:",
    ):
        if doi_lower.startswith(prefix):
            doi = doi[len(prefix):].strip()
            break

    if not doi:
        # Nothing but a prefix/whitespace was supplied.
        return OAResult(
            is_open_access=False,
            status=OAStatus.UNKNOWN,
            source="no_doi",
        )

    # Keep "/" literal (it separates DOI prefix and suffix); escape the rest.
    url = f"https://api.unpaywall.org/v2/{quote(doi, safe='/')}"
    # Let the HTTP client encode the query string.
    params = {"email": email}

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(
                url,
                params=params,
                timeout=aiohttp.ClientTimeout(total=timeout),
            ) as resp:
                if resp.status == 404:
                    return OAResult(
                        is_open_access=False,
                        status=OAStatus.UNKNOWN,
                        source="unpaywall_not_found",
                        confidence=0.5,
                    )

                if resp.status != 200:
                    logger.warning(f"Unpaywall API error: {resp.status}")
                    return OAResult(
                        is_open_access=False,
                        status=OAStatus.UNKNOWN,
                        source="unpaywall_error",
                        confidence=0.0,
                    )

                data = await resp.json()

                # A JSON null for "is_oa" is treated as "not open access".
                is_oa = bool(data.get("is_oa", False))
                oa_status_str = data.get("oa_status", "closed")

                # Map Unpaywall status to our enum
                status_map = {
                    "gold": OAStatus.GOLD,
                    "green": OAStatus.GREEN,
                    "hybrid": OAStatus.HYBRID,
                    "bronze": OAStatus.BRONZE,
                    "closed": OAStatus.CLOSED,
                }
                status = status_map.get(oa_status_str, OAStatus.UNKNOWN)

                # Get best OA location, preferring a direct PDF link
                oa_url = None
                license_str = None
                best_oa = data.get("best_oa_location")
                if best_oa:
                    oa_url = best_oa.get("url_for_pdf") or best_oa.get("url")
                    license_str = best_oa.get("license")

                return OAResult(
                    is_open_access=is_oa,
                    status=status,
                    oa_url=oa_url,
                    source="unpaywall",
                    license=license_str,
                    confidence=1.0,
                )

    except asyncio.TimeoutError:
        logger.warning(f"Unpaywall timeout for DOI: {doi}")
        return OAResult(
            is_open_access=False,
            status=OAStatus.UNKNOWN,
            source="unpaywall_timeout",
            confidence=0.0,
        )
    except Exception as e:
        # Deliberate best-effort boundary: any other failure (network,
        # malformed payload) is logged and reported as a zero-confidence result.
        logger.error(f"Unpaywall API error: {e}")
        return OAResult(
            is_open_access=False,
            status=OAStatus.UNKNOWN,
            source="unpaywall_exception",
            confidence=0.0,
        )
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
async def check_oa_status_async(
    doi: Optional[str] = None,
    arxiv_id: Optional[str] = None,
    pmcid: Optional[str] = None,
    source: Optional[str] = None,
    journal: Optional[str] = None,
    is_open_access_flag: Optional[bool] = None,
    use_unpaywall: bool = True,
    unpaywall_email: str = None,
) -> OAResult:
    """
    Comprehensive open access detection.

    Runs the local, network-free detection first. The Unpaywall API is
    queried only when all of the following hold: ``use_unpaywall`` is true,
    a DOI is available, and the local confidence is below 0.7. The
    Unpaywall answer replaces the local one only if its confidence is
    strictly higher.

    Args:
        doi: Paper DOI
        arxiv_id: arXiv identifier
        pmcid: PubMed Central ID
        source: Source database
        journal: Journal name
        is_open_access_flag: Pre-existing OA flag
        use_unpaywall: Whether to query Unpaywall for uncertain cases
        unpaywall_email: Email for Unpaywall API

    Returns:
        OAResult with best available OA information
    """
    baseline = detect_oa_from_identifiers(
        doi=doi,
        arxiv_id=arxiv_id,
        pmcid=pmcid,
        source=source,
        journal=journal,
        is_open_access_flag=is_open_access_flag,
    )

    # A confident local verdict needs no confirmation.
    if baseline.confidence >= 0.9:
        return baseline

    # Without permission, a DOI, or real uncertainty there is nothing to ask.
    wants_lookup = use_unpaywall and doi and baseline.confidence < 0.7
    if not wants_lookup:
        return baseline

    remote = await check_oa_status_unpaywall(
        doi=doi,
        email=unpaywall_email,
    )

    # Prefer Unpaywall only when it is more certain than the local guess.
    return remote if remote.confidence > baseline.confidence else baseline
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def check_oa_status(
    doi: Optional[str] = None,
    arxiv_id: Optional[str] = None,
    pmcid: Optional[str] = None,
    source: Optional[str] = None,
    journal: Optional[str] = None,
    is_open_access_flag: Optional[bool] = None,
    use_unpaywall: bool = False,  # Default to sync-safe behavior
) -> OAResult:
    """
    Synchronous wrapper for OA detection.

    By default only uses local detection (no API calls).
    Set use_unpaywall=True to also consult the Unpaywall API.

    With use_unpaywall=True the async check runs on a private event loop
    that is closed afterwards. If an event loop is already running in the
    calling thread (e.g. a notebook or an async application), the check is
    executed on a short-lived worker thread instead, and this call blocks
    until it finishes.

    Args:
        doi: Paper DOI
        arxiv_id: arXiv identifier
        pmcid: PubMed Central ID
        source: Source database
        journal: Journal name
        is_open_access_flag: Pre-existing OA flag
        use_unpaywall: Whether to query Unpaywall for uncertain cases

    Returns:
        OAResult with the best available OA information
    """
    identifiers = dict(
        doi=doi,
        arxiv_id=arxiv_id,
        pmcid=pmcid,
        source=source,
        journal=journal,
        is_open_access_flag=is_open_access_flag,
    )

    if not use_unpaywall:
        return detect_oa_from_identifiers(**identifiers)

    def _run_on_private_loop():
        # A dedicated loop avoids the deprecated implicit-loop lookup and
        # leaves the caller's current-loop state untouched.
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(
                check_oa_status_async(use_unpaywall=True, **identifiers)
            )
        finally:
            try:
                loop.run_until_complete(loop.shutdown_asyncgens())
            finally:
                loop.close()

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: safe to drive one here.
        return _run_on_private_loop()

    # A loop is already running in this thread; run_until_complete would
    # raise, so drive the private loop from a worker thread instead.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(_run_on_private_loop).result()
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
# EOF
|
|
@@ -240,6 +240,13 @@ class ScholarEngine:
|
|
|
240
240
|
if name in engine_classes:
|
|
241
241
|
if name == "url_doi_engine":
|
|
242
242
|
self._engine_instances[name] = engine_classes[name]()
|
|
243
|
+
elif name == "CrossRefLocal":
|
|
244
|
+
# Get API URL from config (supports SCITEX_SCHOLAR_CROSSREF_API_URL env var)
|
|
245
|
+
api_url = self.config.resolve("crossref_api_url", "http://127.0.0.1:3333")
|
|
246
|
+
self._engine_instances[name] = engine_classes[name](
|
|
247
|
+
"research@example.com",
|
|
248
|
+
api_url=api_url
|
|
249
|
+
)
|
|
243
250
|
else:
|
|
244
251
|
self._engine_instances[name] = engine_classes[name](
|
|
245
252
|
"research@example.com"
|
|
@@ -483,7 +490,8 @@ class ScholarEngine:
|
|
|
483
490
|
"""Merge two metadata structures with engine priority."""
|
|
484
491
|
merged = base.copy()
|
|
485
492
|
engine_priority = {
|
|
486
|
-
"URL":
|
|
493
|
+
"URL": 6,
|
|
494
|
+
"CrossRefLocal": 5,
|
|
487
495
|
"CrossRef": 4,
|
|
488
496
|
"OpenAlex": 3,
|
|
489
497
|
"Semantic_Scholar": 2,
|
|
@@ -23,7 +23,14 @@ logger = logging.getLogger(__name__)
|
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
class CrossRefLocalEngine(BaseDOIEngine):
|
|
26
|
-
"""CrossRef Local Engine using local Django API
|
|
26
|
+
"""CrossRef Local Engine using local Django API or external public API
|
|
27
|
+
|
|
28
|
+
Supports both:
|
|
29
|
+
- Internal API: http://crossref:3333 (Docker network)
|
|
30
|
+
- External API: https://scitex.ai/scholar/api/crossref (Public internet)
|
|
31
|
+
|
|
32
|
+
Automatically detects API format and adjusts endpoints accordingly.
|
|
33
|
+
"""
|
|
27
34
|
|
|
28
35
|
def __init__(
|
|
29
36
|
self,
|
|
@@ -33,6 +40,9 @@ class CrossRefLocalEngine(BaseDOIEngine):
|
|
|
33
40
|
super().__init__(email)
|
|
34
41
|
self.api_url = api_url.rstrip("/")
|
|
35
42
|
|
|
43
|
+
# Detect API type: external (public) vs internal (Docker/local)
|
|
44
|
+
self._is_external_api = "/api/crossref" in self.api_url or "scitex.ai" in self.api_url
|
|
45
|
+
|
|
36
46
|
@property
|
|
37
47
|
def name(self) -> str:
|
|
38
48
|
return "CrossRefLocal"
|
|
@@ -41,6 +51,26 @@ class CrossRefLocalEngine(BaseDOIEngine):
|
|
|
41
51
|
def rate_limit_delay(self) -> float:
|
|
42
52
|
return 0.01
|
|
43
53
|
|
|
54
|
+
def _build_endpoint_url(self, endpoint: str) -> str:
    """Build the correct endpoint URL based on API type

    Leading and trailing slashes on ``endpoint`` are ignored, so
    ``"search"``, ``"/search"`` and ``"search/"`` all produce the same URL.

    Args:
        endpoint: Endpoint name (e.g., 'search', 'health', 'stats')

    Returns:
        Full URL for the endpoint, always ending in a single slash

    Examples:
        Internal: http://crossref:3333/api/search/
        External: https://scitex.ai/scholar/api/crossref/search/
    """
    # Normalise so stray slashes from callers cannot produce "//" in the URL.
    endpoint = endpoint.strip("/")
    if self._is_external_api:
        # External API: base URL already includes /scholar/api/crossref
        return f"{self.api_url}/{endpoint}/"
    # Internal API: need to add /api/ prefix
    return f"{self.api_url}/api/{endpoint}/"
|
|
73
|
+
|
|
44
74
|
def search(
|
|
45
75
|
self,
|
|
46
76
|
title: Optional[str] = None,
|
|
@@ -80,8 +110,8 @@ class CrossRefLocalEngine(BaseDOIEngine):
|
|
|
80
110
|
def _make_search_request(
|
|
81
111
|
self, params: dict, return_as: str
|
|
82
112
|
) -> Optional[Dict]:
|
|
83
|
-
"""Make search request to local API"""
|
|
84
|
-
url =
|
|
113
|
+
"""Make search request to local or external API"""
|
|
114
|
+
url = self._build_endpoint_url("search")
|
|
85
115
|
|
|
86
116
|
try:
|
|
87
117
|
assert return_as in [
|
|
@@ -129,7 +159,7 @@ class CrossRefLocalEngine(BaseDOIEngine):
|
|
|
129
159
|
doi = doi.replace("https://doi.org/", "").replace(
|
|
130
160
|
"http://doi.org/", ""
|
|
131
161
|
)
|
|
132
|
-
url =
|
|
162
|
+
url = self._build_endpoint_url("search")
|
|
133
163
|
params = {"doi": doi}
|
|
134
164
|
|
|
135
165
|
try:
|
|
@@ -242,22 +272,53 @@ if __name__ == "__main__":
|
|
|
242
272
|
from scitex.scholar.metadata_engines.individual import CrossRefLocalEngine
|
|
243
273
|
|
|
244
274
|
TITLE = "deep learning"
|
|
245
|
-
DOI = "10.
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
275
|
+
DOI = "10.1038/nature12373"
|
|
276
|
+
|
|
277
|
+
# Example 1: Internal API (Docker network or localhost)
|
|
278
|
+
print("\n" + "=" * 60)
|
|
279
|
+
print("INTERNAL API EXAMPLE")
|
|
280
|
+
print("=" * 60)
|
|
281
|
+
engine_internal = CrossRefLocalEngine(
|
|
282
|
+
"test@example.com",
|
|
283
|
+
api_url="http://crossref:3333"
|
|
284
|
+
)
|
|
285
|
+
print(f"API URL: {engine_internal.api_url}")
|
|
286
|
+
print(f"Is External: {engine_internal._is_external_api}")
|
|
287
|
+
print(f"Search endpoint: {engine_internal._build_endpoint_url('search')}")
|
|
288
|
+
|
|
289
|
+
# Example 2: External API (public internet)
|
|
290
|
+
print("\n" + "=" * 60)
|
|
291
|
+
print("EXTERNAL API EXAMPLE")
|
|
292
|
+
print("=" * 60)
|
|
293
|
+
engine_external = CrossRefLocalEngine(
|
|
294
|
+
"test@example.com",
|
|
295
|
+
api_url="https://scitex.ai/scholar/api/crossref"
|
|
296
|
+
)
|
|
297
|
+
print(f"API URL: {engine_external.api_url}")
|
|
298
|
+
print(f"Is External: {engine_external._is_external_api}")
|
|
299
|
+
print(f"Search endpoint: {engine_external._build_endpoint_url('search')}")
|
|
300
|
+
|
|
301
|
+
# Test search (use external for demo)
|
|
302
|
+
print("\n" + "=" * 60)
|
|
303
|
+
print("SEARCH TEST")
|
|
304
|
+
print("=" * 60)
|
|
305
|
+
result = engine_external.search(doi=DOI)
|
|
306
|
+
if result:
|
|
307
|
+
print(f"Title: {result.get('basic', {}).get('title')}")
|
|
308
|
+
print(f"DOI: {result.get('id', {}).get('doi')}")
|
|
309
|
+
print(f"Year: {result.get('basic', {}).get('year')}")
|
|
310
|
+
else:
|
|
311
|
+
print("No results found")
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
# Usage examples:
|
|
315
|
+
#
|
|
316
|
+
# Internal API (from NAS Docker):
|
|
317
|
+
# export SCITEX_SCHOLAR_CROSSREF_API_URL=http://crossref:3333
|
|
318
|
+
# python -m scitex.scholar.metadata_engines.individual.CrossRefLocalEngine
|
|
319
|
+
#
|
|
320
|
+
# External API (from anywhere):
|
|
321
|
+
# export SCITEX_SCHOLAR_CROSSREF_API_URL=https://scitex.ai/scholar/api/crossref
|
|
322
|
+
# python -m scitex.scholar.metadata_engines.individual.CrossRefLocalEngine
|
|
262
323
|
|
|
263
324
|
# EOF
|