scitex-2.4.2-py3-none-any.whl → scitex-2.4.3-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
- scitex/__version__.py +1 -1
- scitex/browser/__init__.py +53 -0
- scitex/browser/debugging/__init__.py +56 -0
- scitex/browser/debugging/_failure_capture.py +372 -0
- scitex/browser/debugging/_sync_session.py +259 -0
- scitex/browser/debugging/_test_monitor.py +284 -0
- scitex/browser/debugging/_visual_cursor.py +432 -0
- scitex/scholar/citation_graph/database.py +9 -2
- scitex/scholar/config/ScholarConfig.py +23 -3
- scitex/scholar/config/default.yaml +55 -0
- scitex/scholar/core/Paper.py +102 -0
- scitex/scholar/core/__init__.py +44 -0
- scitex/scholar/core/journal_normalizer.py +524 -0
- scitex/scholar/core/oa_cache.py +285 -0
- scitex/scholar/core/open_access.py +457 -0
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +137 -0
- scitex/scholar/pdf_download/strategies/__init__.py +6 -0
- scitex/scholar/pdf_download/strategies/open_access_download.py +186 -0
- scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +18 -3
- scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +15 -2
- {scitex-2.4.2.dist-info → scitex-2.4.3.dist-info}/METADATA +1 -1
- {scitex-2.4.2.dist-info → scitex-2.4.3.dist-info}/RECORD +25 -17
- {scitex-2.4.2.dist-info → scitex-2.4.3.dist-info}/WHEEL +0 -0
- {scitex-2.4.2.dist-info → scitex-2.4.3.dist-info}/entry_points.txt +0 -0
- {scitex-2.4.2.dist-info → scitex-2.4.3.dist-info}/licenses/LICENSE +0 -0
scitex/scholar/core/oa_cache.py (new file)
@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# File: /home/ywatanabe/proj/scitex-code/src/scitex/scholar/core/oa_cache.py
+"""
+Open Access Sources Cache.
+
+Caches OA journal/source information from OpenAlex API with daily refresh.
+Provides fast local lookups without per-paper API calls.
+"""
+
+from __future__ import annotations
+
+import json
+import time
+from pathlib import Path
+from typing import Optional, Set, Dict, Any
+import asyncio
+import aiohttp
+
+import os
+
+from scitex import logging
+
+logger = logging.getLogger(__name__)
+
+# Cache settings
+CACHE_TTL_SECONDS = 86400  # 1 day
+OPENALEX_OA_SOURCES_URL = "https://api.openalex.org/sources"
+OPENALEX_POLITE_EMAIL = "research@scitex.io"
+
+
+def _get_default_cache_dir() -> Path:
+    """Get default cache directory respecting SCITEX_DIR env var."""
+    scitex_dir = os.environ.get("SCITEX_DIR", "~/.scitex")
+    return Path(scitex_dir).expanduser() / "scholar" / "cache"
+
+
+class OASourcesCache:
+    """
+    Manages cached Open Access sources from OpenAlex.
+
+    Features:
+    - Lazy loading on first access
+    - 1-day TTL with automatic refresh
+    - Thread-safe singleton pattern
+    - Fallback to config YAML if API fails
+    - Journal name normalization via ISSN-L
+    - Handles abbreviations, variants, and historical names
+    """
+
+    _instance: Optional['OASourcesCache'] = None
+    _lock = asyncio.Lock() if hasattr(asyncio, 'Lock') else None
+
+    def __init__(self, cache_dir: Optional[Path] = None):
+        self._cache_dir = cache_dir or _get_default_cache_dir()
+        self._cache_file = self._cache_dir / "oa_sources_cache.json"
+        self._oa_source_ids: Set[str] = set()  # OpenAlex source IDs
+        self._oa_source_names: Set[str] = set()  # Lowercase source names
+        self._oa_issns: Set[str] = set()  # ISSNs for journal lookup
+        self._issn_l_map: Dict[str, str] = {}  # ISSN → ISSN-L mapping
+        self._name_to_issn_l: Dict[str, str] = {}  # name variant → ISSN-L
+        self._issn_l_to_canonical: Dict[str, str] = {}  # ISSN-L → canonical name
+        self._last_updated: float = 0
+        self._loaded = False
+
+    @classmethod
+    def get_instance(cls, cache_dir: Optional[Path] = None) -> 'OASourcesCache':
+        """Get singleton instance."""
+        if cls._instance is None:
+            cls._instance = cls(cache_dir)
+        return cls._instance
+
+    def _is_cache_valid(self) -> bool:
+        """Check if cache exists and is within TTL."""
+        if not self._cache_file.exists():
+            return False
+        try:
+            with open(self._cache_file, 'r') as f:
+                data = json.load(f)
+            cached_time = data.get('timestamp', 0)
+            return (time.time() - cached_time) < CACHE_TTL_SECONDS
+        except (json.JSONDecodeError, IOError):
+            return False
+
+    def _load_from_cache(self) -> bool:
+        """Load cached data from file."""
+        if not self._cache_file.exists():
+            return False
+        try:
+            with open(self._cache_file, 'r') as f:
+                data = json.load(f)
+
+            self._oa_source_names = set(data.get('source_names', []))
+            self._oa_issns = set(data.get('issns', []))
+            self._last_updated = data.get('timestamp', 0)
+            self._loaded = True
+
+            logger.info(f"Loaded {len(self._oa_source_names)} OA sources from cache")
+            return True
+        except (json.JSONDecodeError, IOError) as e:
+            logger.warning(f"Failed to load OA cache: {e}")
+            return False
+
+    def _save_to_cache(self) -> None:
+        """Save current data to cache file."""
+        try:
+            self._cache_dir.mkdir(parents=True, exist_ok=True)
+            data = {
+                'timestamp': time.time(),
+                'source_names': list(self._oa_source_names),
+                'issns': list(self._oa_issns),
+                'count': len(self._oa_source_names),
+            }
+            with open(self._cache_file, 'w') as f:
+                json.dump(data, f)
+            logger.info(f"Saved {len(self._oa_source_names)} OA sources to cache")
+        except IOError as e:
+            logger.warning(f"Failed to save OA cache: {e}")
+
+    async def _fetch_oa_sources_async(self, max_pages: int = 100) -> None:
+        """
+        Fetch OA sources from OpenAlex API.
+
+        Args:
+            max_pages: Maximum pages to fetch (200 sources per page)
+        """
+        source_names: Set[str] = set()
+        issns: Set[str] = set()
+
+        per_page = 200
+        cursor = "*"
+        pages_fetched = 0
+
+        async with aiohttp.ClientSession() as session:
+            while pages_fetched < max_pages:
+                url = (
+                    f"{OPENALEX_OA_SOURCES_URL}"
+                    f"?filter=is_oa:true"
+                    f"&per_page={per_page}"
+                    f"&cursor={cursor}"
+                    f"&mailto={OPENALEX_POLITE_EMAIL}"
+                    f"&select=display_name,issn"
+                )
+
+                try:
+                    async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
+                        if resp.status != 200:
+                            logger.warning(f"OpenAlex API returned {resp.status}")
+                            break
+
+                        data = await resp.json()
+                        results = data.get('results', [])
+
+                        if not results:
+                            break
+
+                        for source in results:
+                            name = source.get('display_name', '')
+                            if name:
+                                source_names.add(name.lower())
+
+                            # Also store ISSNs for precise matching
+                            source_issns = source.get('issn', []) or []
+                            for issn in source_issns:
+                                if issn:
+                                    issns.add(issn)
+
+                        # Get next cursor
+                        meta = data.get('meta', {})
+                        next_cursor = meta.get('next_cursor')
+                        if not next_cursor or next_cursor == cursor:
+                            break
+                        cursor = next_cursor
+                        pages_fetched += 1
+
+                        # Progress log every 10 pages
+                        if pages_fetched % 10 == 0:
+                            logger.info(f"Fetched {pages_fetched} pages, {len(source_names)} sources so far...")
+
+                except asyncio.TimeoutError:
+                    logger.warning("OpenAlex API timeout")
+                    break
+                except Exception as e:
+                    logger.error(f"Error fetching OA sources: {e}")
+                    break
+
+        if source_names:
+            self._oa_source_names = source_names
+            self._oa_issns = issns
+            self._last_updated = time.time()
+            self._loaded = True
+            self._save_to_cache()
+            logger.info(f"Fetched {len(source_names)} OA sources from OpenAlex")
+
+    def _fetch_oa_sources_sync(self, max_pages: int = 100) -> None:
+        """Synchronous wrapper for fetching OA sources."""
+        try:
+            loop = asyncio.get_event_loop()
+        except RuntimeError:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+
+        loop.run_until_complete(self._fetch_oa_sources_async(max_pages))
+
+    def ensure_loaded(self, force_refresh: bool = False) -> None:
+        """
+        Ensure cache is loaded, fetching from API if needed.
+
+        Args:
+            force_refresh: Force refresh even if cache is valid
+        """
+        if self._loaded and not force_refresh and self._is_cache_valid():
+            return
+
+        # Try loading from cache first
+        if not force_refresh and self._load_from_cache() and self._is_cache_valid():
+            return
+
+        # Fetch from API
+        logger.info("Refreshing OA sources cache from OpenAlex...")
+        self._fetch_oa_sources_sync()
+
+    def is_oa_source(self, source_name: str) -> bool:
+        """
+        Check if a source/journal name is in the OA list.
+
+        Args:
+            source_name: Journal or source name to check
+
+        Returns:
+            True if source is known to be Open Access
+        """
+        self.ensure_loaded()
+        if not source_name:
+            return False
+        return source_name.lower() in self._oa_source_names
+
+    def is_oa_issn(self, issn: str) -> bool:
+        """
+        Check if an ISSN belongs to an OA journal.
+
+        Args:
+            issn: ISSN to check
+
+        Returns:
+            True if ISSN belongs to an OA journal
+        """
+        self.ensure_loaded()
+        if not issn:
+            return False
+        # Normalize ISSN format
+        issn = issn.replace('-', '').upper()
+        return issn in self._oa_issns or f"{issn[:4]}-{issn[4:]}" in self._oa_issns
+
+    @property
+    def source_count(self) -> int:
+        """Get number of cached OA sources."""
+        self.ensure_loaded()
+        return len(self._oa_source_names)
+
+    @property
+    def cache_age_hours(self) -> float:
+        """Get cache age in hours."""
+        if self._last_updated == 0:
+            return float('inf')
+        return (time.time() - self._last_updated) / 3600
+
+
+# Convenience functions
+def get_oa_cache(cache_dir: Optional[Path] = None) -> OASourcesCache:
+    """Get the OA sources cache singleton."""
+    return OASourcesCache.get_instance(cache_dir)
+
+
+def is_oa_journal_cached(journal_name: str) -> bool:
+    """Check if journal is OA using cached OpenAlex data."""
+    return get_oa_cache().is_oa_source(journal_name)
+
+
+def refresh_oa_cache() -> None:
+    """Force refresh the OA sources cache."""
+    get_oa_cache().ensure_loaded(force_refresh=True)
+
+
+# EOF
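
The module's public surface is the two lookup methods and two properties on OASourcesCache, plus the three convenience functions at the end of the hunk. A minimal usage sketch (import path taken from the # File comment above; the journal name and ISSN are illustrative values, not taken from this diff):

from scitex.scholar.core.oa_cache import (
    get_oa_cache,
    is_oa_journal_cached,
    refresh_oa_cache,
)

# First lookup triggers the lazy load: the JSON cache is reused while it
# is younger than CACHE_TTL_SECONDS, otherwise OpenAlex is paged anew.
print(is_oa_journal_cached("PLOS ONE"))  # names are matched case-insensitively

cache = get_oa_cache()
print(cache.is_oa_issn("1932-6203"))  # dashed and bare 8-char ISSNs both match
print(cache.source_count)             # number of cached OA source names
print(cache.cache_age_hours)          # hours since last refresh; inf if never loaded

refresh_oa_cache()  # bypass the 1-day TTL and re-fetch immediately

Note that get_instance() honors cache_dir only on the first call; later calls return the already-constructed singleton regardless of the argument.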
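One integration caveat: _fetch_oa_sources_sync drives the fetch with loop.run_until_complete, which raises RuntimeError if an event loop is already running in the calling thread (and asyncio.get_event_loop is deprecated when no loop is running on recent Python versions). Async callers, such as the search pipelines touched in this release, can sidestep this by pushing the blocking load onto a worker thread. A minimal sketch, not part of this release:

import asyncio

from scitex.scholar.core.oa_cache import get_oa_cache

async def warm_oa_cache() -> None:
    # In a worker thread get_event_loop() raises RuntimeError, so the sync
    # wrapper falls through to new_event_loop() and run_until_complete()
    # executes on a fresh loop without touching the caller's running loop.
    await asyncio.to_thread(get_oa_cache().ensure_loaded)

asyncio.run(warm_oa_cache())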