scitex 2.4.2__py3-none-any.whl → 2.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,285 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # File: /home/ywatanabe/proj/scitex-code/src/scitex/scholar/core/oa_cache.py
4
+ """
5
+ Open Access Sources Cache.
6
+
7
+ Caches OA journal/source information from OpenAlex API with daily refresh.
8
+ Provides fast local lookups without per-paper API calls.
9
+ """
10
+
11
from __future__ import annotations

import asyncio
import json
import os
import threading
import time
from pathlib import Path
from typing import Optional, Set, Dict, Any

import aiohttp

from scitex import logging
23
+
24
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Cache settings
# Maximum age (in seconds) of cached data before it is considered stale.
CACHE_TTL_SECONDS = 86400  # 1 day
# OpenAlex "sources" endpoint; queried with an `is_oa:true` filter.
OPENALEX_OA_SOURCES_URL = "https://api.openalex.org/sources"
# Contact address sent as the `mailto` query parameter on every request
# (presumably to opt into OpenAlex's "polite pool" -- confirm against the
# OpenAlex API docs).
OPENALEX_POLITE_EMAIL = "research@scitex.io"
30
+
31
+
32
def _get_default_cache_dir() -> Path:
    """Return the default cache directory, respecting the SCITEX_DIR env var.

    The base directory is ``$SCITEX_DIR`` when that variable is set to a
    non-empty value and ``~/.scitex`` otherwise. ``~`` is expanded and
    ``scholar/cache`` is appended.

    Returns:
        Path of the cache directory. The directory is not created here.
    """
    # `or` instead of a .get() default: an exported-but-empty SCITEX_DIR would
    # otherwise produce Path(""), silently placing the cache under the
    # current working directory.
    scitex_dir = os.environ.get("SCITEX_DIR") or "~/.scitex"
    return Path(scitex_dir).expanduser() / "scholar" / "cache"
36
+
37
+
38
class OASourcesCache:
    """
    Manages cached Open Access sources from OpenAlex.

    Features:
    - Lazy loading on first access
    - 1-day TTL (``CACHE_TTL_SECONDS``) with automatic refresh
    - Process-wide singleton via ``get_instance`` (creation guarded by a
      ``threading.Lock``)
    - Lookup by lowercase source name or by ISSN
    - Previously loaded data keeps being served when a refresh fails, and
      failed refreshes are retried at most once per
      ``_REFRESH_RETRY_SECONDS``

    Not implemented in this class (the corresponding attributes are only
    initialised, never populated): source-ID lookup and ISSN-L based
    journal-name normalization. There is also no config/YAML fallback; if
    neither the API nor the cache file is available, lookups return False.
    """

    _instance: Optional['OASourcesCache'] = None
    # A threading lock (not an asyncio one): it protects singleton creation
    # across threads and can be created at import time without touching any
    # event loop.
    _lock = threading.Lock()
    # Minimum delay between two automatic refresh attempts, so an unreachable
    # API does not turn every lookup into a network round trip.
    _REFRESH_RETRY_SECONDS = 60
    # Characters removed from an ISSN before comparison: ASCII/Unicode dashes
    # and whitespace.
    _ISSN_NOISE = str.maketrans(
        '', '', '-\u2010\u2011\u2012\u2013\u2014\u2212 \t\r\n'
    )

    def __init__(self, cache_dir: Optional[Path] = None):
        """
        Create an empty, not-yet-loaded cache.

        Args:
            cache_dir: Directory holding ``oa_sources_cache.json``. A falsy
                value selects ``_get_default_cache_dir()``. Strings are
                accepted and converted to ``Path``.
        """
        self._cache_dir = Path(cache_dir) if cache_dir else _get_default_cache_dir()
        self._cache_file = self._cache_dir / "oa_sources_cache.json"
        self._oa_source_ids: Set[str] = set()  # OpenAlex source IDs (unused placeholder)
        self._oa_source_names: Set[str] = set()  # Lowercase source names
        self._oa_issns: Set[str] = set()  # ISSNs for journal lookup
        self._issn_l_map: Dict[str, str] = {}  # ISSN -> ISSN-L (unused placeholder)
        self._name_to_issn_l: Dict[str, str] = {}  # name variant -> ISSN-L (unused placeholder)
        self._issn_l_to_canonical: Dict[str, str] = {}  # ISSN-L -> canonical name (unused placeholder)
        self._last_updated: float = 0  # Epoch seconds of the data held in memory
        self._last_refresh_attempt: float = 0  # Epoch seconds of the last refresh try
        self._loaded = False

    @classmethod
    def get_instance(cls, cache_dir: Optional[Path] = None) -> 'OASourcesCache':
        """
        Get the singleton instance, creating it on first call.

        Args:
            cache_dir: Only honoured by the call that creates the instance;
                later calls return the existing instance unchanged.

        Returns:
            The shared ``OASourcesCache``.
        """
        if cls._instance is None:
            with cls._lock:
                # Re-check under the lock: another thread may have created
                # the instance while we were waiting.
                if cls._instance is None:
                    cls._instance = cls(cache_dir)
        elif cache_dir and Path(cache_dir) != cls._instance._cache_dir:
            logger.warning(
                f"OA cache already initialised with {cls._instance._cache_dir}; "
                f"ignoring cache_dir={cache_dir}"
            )
        return cls._instance

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _coerce_timestamp(value: Any) -> float:
        """Convert a stored timestamp to float; malformed values become 0."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    @staticmethod
    def _as_str_set(value: Any) -> Set[str]:
        """Return the string items of a JSON list as a set (else empty set)."""
        if not isinstance(value, list):
            return set()
        return {item for item in value if isinstance(item, str)}

    @classmethod
    def _normalize_issn(cls, issn: str) -> str:
        """
        Normalize an ISSN to upper-case ``NNNN-NNNX`` form.

        Dashes (ASCII and common Unicode variants) and whitespace are removed
        first. Input that does not have 8 characters afterwards is returned
        in that compact, upper-cased form without a hyphen.
        """
        compact = issn.translate(cls._ISSN_NOISE).upper()
        if len(compact) == 8:
            return f"{compact[:4]}-{compact[4:]}"
        return compact

    @staticmethod
    def _collect_sources(results, source_names: Set[str], issns: Set[str]) -> None:
        """
        Add the names and ISSNs of one API result page to the given sets.

        Args:
            results: Iterable of source dicts with optional ``display_name``
                and ``issn`` (list) entries.
            source_names: Receives the lower-cased display names.
            issns: Receives the ISSNs exactly as returned by the API.
        """
        for source in results:
            name = source.get('display_name', '')
            if name:
                source_names.add(name.lower())

            # Also store ISSNs for precise matching ('issn' may be null)
            issns.update(issn for issn in (source.get('issn') or []) if issn)

    @staticmethod
    def _run_blocking(coro) -> None:
        """
        Run a coroutine to completion from synchronous code.

        A private event loop is used so the caller's (or the thread's
        current) loop is never touched. If an event loop is already running
        in this thread, the coroutine is executed in a helper thread instead,
        because a loop cannot be started while another one is running. The
        call blocks until the coroutine has finished either way.

        Raises:
            Exception: Whatever the coroutine raised.
        """
        def _run_in_private_loop() -> None:
            loop = asyncio.new_event_loop()
            try:
                loop.run_until_complete(coro)
            finally:
                try:
                    loop.run_until_complete(loop.shutdown_asyncgens())
                finally:
                    loop.close()

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop running in this thread: run right here.
            _run_in_private_loop()
            return

        errors = []

        def _thread_main() -> None:
            try:
                _run_in_private_loop()
            except Exception as exc:  # re-raised in the calling thread below
                errors.append(exc)

        worker = threading.Thread(
            target=_thread_main, name="oa-cache-fetch", daemon=True
        )
        worker.start()
        worker.join()
        if errors:
            raise errors[0]

    def _is_fresh(self, timestamp: float) -> bool:
        """Return True if ``timestamp`` (epoch seconds) is within the TTL."""
        return (time.time() - timestamp) < CACHE_TTL_SECONDS

    # ------------------------------------------------------------------
    # Cache file handling
    # ------------------------------------------------------------------
    def _read_cache_payload(self) -> Dict[str, Any]:
        """
        Read and parse the cache file.

        Raises:
            OSError: The file is missing or unreadable.
            ValueError: The content is not a JSON object.
        """
        with open(self._cache_file, 'r', encoding='utf-8') as f:
            payload = json.load(f)
        if not isinstance(payload, dict):
            raise ValueError("cache file does not contain a JSON object")
        return payload

    def _is_cache_valid(self) -> bool:
        """Check if the cache file exists, is readable and is within TTL."""
        try:
            payload = self._read_cache_payload()
        except (OSError, ValueError):
            return False
        return self._is_fresh(self._coerce_timestamp(payload.get('timestamp', 0)))

    def _load_from_cache(self) -> bool:
        """
        Load cached data from file into memory.

        Returns:
            True if the file was read (even if its data is stale), False if
            it is missing or unusable.
        """
        try:
            payload = self._read_cache_payload()
        except FileNotFoundError:
            return False
        except (OSError, ValueError) as e:
            logger.warning(f"Failed to load OA cache: {e}")
            return False

        self._oa_source_names = self._as_str_set(payload.get('source_names'))
        self._oa_issns = self._as_str_set(payload.get('issns'))
        self._last_updated = self._coerce_timestamp(payload.get('timestamp', 0))
        self._loaded = True

        logger.info(f"Loaded {len(self._oa_source_names)} OA sources from cache")
        return True

    def _save_to_cache(self) -> None:
        """
        Save current data to the cache file.

        The data is written to a temporary sibling file and then moved into
        place, so readers never see a half-written file. Failures are logged
        and otherwise ignored.
        """
        tmp_file = self._cache_file.with_name(
            f"{self._cache_file.name}.{os.getpid()}.tmp"
        )
        try:
            self._cache_dir.mkdir(parents=True, exist_ok=True)
            data = {
                'timestamp': time.time(),
                'source_names': sorted(self._oa_source_names),
                'issns': sorted(self._oa_issns),
                'count': len(self._oa_source_names),
            }
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(data, f)
            os.replace(tmp_file, self._cache_file)
            logger.info(f"Saved {len(self._oa_source_names)} OA sources to cache")
        except OSError as e:
            logger.warning(f"Failed to save OA cache: {e}")
            try:
                tmp_file.unlink()
            except OSError:
                pass  # nothing was written, or it cannot be removed either

    # ------------------------------------------------------------------
    # Fetching
    # ------------------------------------------------------------------
    async def _fetch_oa_sources_async(self, max_pages: int = 100) -> None:
        """
        Fetch OA sources from OpenAlex API.

        Pages are requested with cursor pagination until the API returns no
        more results or ``max_pages`` is reached. Request errors stop the
        fetch (best effort). The collected data replaces the in-memory data
        and is saved to disk, unless the fetch was cut short and produced
        fewer sources than are already loaded.

        Args:
            max_pages: Maximum pages to fetch (200 sources per page)
        """
        source_names: Set[str] = set()
        issns: Set[str] = set()

        per_page = 200
        cursor = "*"
        pages_fetched = 0
        complete = False  # True unless a request failed part-way through
        timeout = aiohttp.ClientTimeout(total=30)

        async with aiohttp.ClientSession() as session:
            while pages_fetched < max_pages:
                # Passed as params (not string-formatted) so the opaque
                # cursor value is URL-encoded correctly.
                params = {
                    'filter': 'is_oa:true',
                    'per_page': str(per_page),
                    'cursor': cursor,
                    'mailto': OPENALEX_POLITE_EMAIL,
                    'select': 'display_name,issn',
                }

                try:
                    async with session.get(
                        OPENALEX_OA_SOURCES_URL, params=params, timeout=timeout
                    ) as resp:
                        if resp.status != 200:
                            logger.warning(f"OpenAlex API returned {resp.status}")
                            break

                        data = await resp.json()

                    results = data.get('results', [])
                    if not results:
                        complete = True
                        break

                    self._collect_sources(results, source_names, issns)
                    pages_fetched += 1

                    # Get next cursor
                    next_cursor = (data.get('meta') or {}).get('next_cursor')
                    if not next_cursor or next_cursor == cursor:
                        complete = True
                        break
                    cursor = next_cursor

                    # Progress log every 10 pages
                    if pages_fetched % 10 == 0:
                        logger.info(f"Fetched {pages_fetched} pages, {len(source_names)} sources so far...")

                except asyncio.TimeoutError:
                    logger.warning("OpenAlex API timeout")
                    break
                except Exception as e:
                    # Deliberate best effort: any failure just ends the fetch.
                    logger.error(f"Error fetching OA sources: {e}")
                    break
            else:
                # Page limit reached without an error: intended truncation.
                complete = True

        if not source_names:
            return

        if not complete and len(source_names) < len(self._oa_source_names):
            # Do not let a truncated download replace (and mark as fresh) a
            # larger data set that is already loaded.
            logger.warning(
                f"OpenAlex fetch stopped early with {len(source_names)} sources; "
                f"keeping the {len(self._oa_source_names)} already loaded"
            )
            return

        self._oa_source_names = source_names
        self._oa_issns = issns
        self._last_updated = time.time()
        self._loaded = True
        self._save_to_cache()
        logger.info(f"Fetched {len(source_names)} OA sources from OpenAlex")

    def _fetch_oa_sources_sync(self, max_pages: int = 100) -> None:
        """
        Synchronous wrapper for fetching OA sources.

        Works both from plain synchronous code and from code running inside
        an event loop (in which case that loop is blocked until the fetch
        has finished).
        """
        self._run_blocking(self._fetch_oa_sources_async(max_pages))

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def ensure_loaded(self, force_refresh: bool = False) -> None:
        """
        Ensure cache is loaded, fetching from API if needed.

        Freshness is judged from the timestamp held in memory, so repeated
        lookups do not touch the disk. When the in-memory data is missing or
        stale, the cache file is consulted first and the API second; such a
        refresh is attempted at most once per ``_REFRESH_RETRY_SECONDS``.

        Args:
            force_refresh: Force refresh even if cache is valid
        """
        now = time.time()
        if not force_refresh:
            if self._loaded and self._is_fresh(self._last_updated):
                return
            if (now - self._last_refresh_attempt) < self._REFRESH_RETRY_SECONDS:
                # A refresh was tried very recently; serve what we have.
                return
        self._last_refresh_attempt = now

        # Try loading from cache first (another process may have refreshed it)
        if (
            not force_refresh
            and self._load_from_cache()
            and self._is_fresh(self._last_updated)
        ):
            return

        # Fetch from API
        logger.info("Refreshing OA sources cache from OpenAlex...")
        self._fetch_oa_sources_sync()

    def is_oa_source(self, source_name: str) -> bool:
        """
        Check if a source/journal name is in the OA list.

        The comparison is case-insensitive and ignores surrounding
        whitespace.

        Args:
            source_name: Journal or source name to check

        Returns:
            True if source is known to be Open Access
        """
        if not source_name:
            return False
        self.ensure_loaded()
        return source_name.strip().lower() in self._oa_source_names

    def is_oa_issn(self, issn: str) -> bool:
        """
        Check if an ISSN belongs to an OA journal.

        Args:
            issn: ISSN to check, with or without hyphen

        Returns:
            True if ISSN belongs to an OA journal
        """
        if not issn:
            return False
        hyphenated = self._normalize_issn(issn)
        compact = hyphenated.replace('-', '')
        if not compact:
            return False
        self.ensure_loaded()
        return hyphenated in self._oa_issns or compact in self._oa_issns

    @property
    def source_count(self) -> int:
        """Get number of cached OA sources (loads the cache if needed)."""
        self.ensure_loaded()
        return len(self._oa_source_names)

    @property
    def cache_age_hours(self) -> float:
        """Get age of the loaded data in hours (``inf`` if nothing loaded)."""
        if self._last_updated == 0:
            return float('inf')
        return (time.time() - self._last_updated) / 3600
267
+
268
+
269
+ # Convenience functions
270
def get_oa_cache(cache_dir: Optional[Path] = None) -> OASourcesCache:
    """Return the shared OA sources cache.

    Args:
        cache_dir: Forwarded unchanged to ``OASourcesCache.get_instance``.

    Returns:
        The instance handed back by ``OASourcesCache.get_instance``.
    """
    shared_cache = OASourcesCache.get_instance(cache_dir)
    return shared_cache
273
+
274
+
275
def is_oa_journal_cached(journal_name: str) -> bool:
    """Report whether ``journal_name`` is an OA source per the cached OpenAlex data.

    Args:
        journal_name: Journal name passed to ``OASourcesCache.is_oa_source``.

    Returns:
        The result of ``is_oa_source`` on the shared cache.
    """
    cache = get_oa_cache()
    is_open_access = cache.is_oa_source(journal_name)
    return is_open_access
278
+
279
+
280
def refresh_oa_cache() -> None:
    """Force the shared OA sources cache to refresh.

    Calls ``ensure_loaded(force_refresh=True)`` on the shared cache.
    """
    cache = get_oa_cache()
    cache.ensure_loaded(force_refresh=True)
283
+
284
+
285
+ # EOF