scitex 2.4.2__py3-none-any.whl → 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. scitex/__version__.py +1 -1
  2. scitex/browser/__init__.py +53 -0
  3. scitex/browser/debugging/__init__.py +56 -0
  4. scitex/browser/debugging/_failure_capture.py +372 -0
  5. scitex/browser/debugging/_sync_session.py +259 -0
  6. scitex/browser/debugging/_test_monitor.py +284 -0
  7. scitex/browser/debugging/_visual_cursor.py +432 -0
  8. scitex/io/_load.py +5 -0
  9. scitex/io/_load_modules/_canvas.py +171 -0
  10. scitex/io/_save.py +8 -0
  11. scitex/io/_save_modules/_canvas.py +356 -0
  12. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot.py +77 -22
  13. scitex/plt/docs/FIGURE_ARCHITECTURE.md +257 -0
  14. scitex/plt/utils/__init__.py +10 -0
  15. scitex/plt/utils/_collect_figure_metadata.py +14 -12
  16. scitex/plt/utils/_csv_column_naming.py +237 -0
  17. scitex/scholar/citation_graph/database.py +9 -2
  18. scitex/scholar/config/ScholarConfig.py +23 -3
  19. scitex/scholar/config/default.yaml +55 -0
  20. scitex/scholar/core/Paper.py +102 -0
  21. scitex/scholar/core/__init__.py +44 -0
  22. scitex/scholar/core/journal_normalizer.py +524 -0
  23. scitex/scholar/core/oa_cache.py +285 -0
  24. scitex/scholar/core/open_access.py +457 -0
  25. scitex/scholar/pdf_download/ScholarPDFDownloader.py +137 -0
  26. scitex/scholar/pdf_download/strategies/__init__.py +6 -0
  27. scitex/scholar/pdf_download/strategies/open_access_download.py +186 -0
  28. scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +18 -3
  29. scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +15 -2
  30. scitex/session/_decorator.py +13 -1
  31. scitex/vis/README.md +246 -615
  32. scitex/vis/__init__.py +138 -78
  33. scitex/vis/canvas.py +423 -0
  34. scitex/vis/docs/CANVAS_ARCHITECTURE.md +307 -0
  35. scitex/vis/editor/__init__.py +1 -1
  36. scitex/vis/editor/_dearpygui_editor.py +1830 -0
  37. scitex/vis/editor/_defaults.py +40 -1
  38. scitex/vis/editor/_edit.py +54 -18
  39. scitex/vis/editor/_flask_editor.py +37 -0
  40. scitex/vis/editor/_qt_editor.py +865 -0
  41. scitex/vis/editor/flask_editor/__init__.py +21 -0
  42. scitex/vis/editor/flask_editor/bbox.py +216 -0
  43. scitex/vis/editor/flask_editor/core.py +152 -0
  44. scitex/vis/editor/flask_editor/plotter.py +130 -0
  45. scitex/vis/editor/flask_editor/renderer.py +184 -0
  46. scitex/vis/editor/flask_editor/templates/__init__.py +33 -0
  47. scitex/vis/editor/flask_editor/templates/html.py +295 -0
  48. scitex/vis/editor/flask_editor/templates/scripts.py +614 -0
  49. scitex/vis/editor/flask_editor/templates/styles.py +549 -0
  50. scitex/vis/editor/flask_editor/utils.py +81 -0
  51. scitex/vis/io/__init__.py +84 -21
  52. scitex/vis/io/canvas.py +226 -0
  53. scitex/vis/io/data.py +204 -0
  54. scitex/vis/io/directory.py +202 -0
  55. scitex/vis/io/export.py +460 -0
  56. scitex/vis/io/panel.py +424 -0
  57. {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/METADATA +9 -2
  58. {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/RECORD +61 -32
  59. scitex/vis/DJANGO_INTEGRATION.md +0 -677
  60. scitex/vis/editor/_web_editor.py +0 -1440
  61. scitex/vis/tmp.txt +0 -239
  62. {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/WHEEL +0 -0
  63. {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/entry_points.txt +0 -0
  64. {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,524 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # File: /home/ywatanabe/proj/scitex-code/src/scitex/scholar/core/journal_normalizer.py
4
+ """
5
+ Journal Name Normalizer.
6
+
7
+ Handles journal name variations, abbreviations, and historical names
8
+ using ISSN-L as the unique identifier (single source of truth).
9
+
10
+ Data sources:
11
+ - OpenAlex API (display_name, alternate_titles, abbreviated_title, issn_l)
12
+ - Crossref API (container-title, short-container-title)
13
+ - Local cache with 1-day TTL
14
+
15
+ Usage:
16
+ from scitex.scholar.core import JournalNormalizer
17
+
18
+ normalizer = JournalNormalizer.get_instance()
19
+
20
+ # Normalize any journal name variant
21
+ canonical = normalizer.normalize("J. Neurosci.") # → "Journal of Neuroscience"
22
+
23
+ # Get ISSN-L for a journal
24
+ issn_l = normalizer.get_issn_l("PLOS ONE") # → "1932-6203"
25
+
26
+ # Check if two names refer to same journal
27
+ normalizer.is_same_journal("J Neurosci", "Journal of Neuroscience") # → True
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import json
33
+ import os
34
+ import re
35
+ import time
36
+ from pathlib import Path
37
+ from typing import Optional, Set, Dict, List, Tuple, Any
38
+ import asyncio
39
+ import aiohttp
40
+
41
+ from scitex import logging
42
+
43
logger = logging.getLogger(__name__)

# Cache settings
# Lifetime of the local journal cache before it is considered stale.
CACHE_TTL_SECONDS = 24 * 60 * 60  # 1 day
# OpenAlex endpoint listing sources (journals, repositories, ...).
OPENALEX_SOURCES_URL = "https://api.openalex.org/sources"
# Sent as the `mailto` query parameter on every OpenAlex request.
OPENALEX_POLITE_EMAIL = "research@scitex.io"
49
+
50
+
51
+ def _get_default_cache_dir() -> Path:
52
+ """Get default cache directory respecting SCITEX_DIR env var."""
53
+ scitex_dir = os.environ.get("SCITEX_DIR", "~/.scitex")
54
+ return Path(scitex_dir).expanduser() / "scholar" / "cache"
55
+
56
+
57
+ def _normalize_name(name: str) -> str:
58
+ """
59
+ Basic string normalization for matching.
60
+
61
+ - Lowercase
62
+ - Remove extra whitespace
63
+ - Normalize punctuation
64
+ """
65
+ if not name:
66
+ return ""
67
+ # Lowercase
68
+ name = name.lower()
69
+ # Normalize whitespace
70
+ name = " ".join(name.split())
71
+ # Remove common punctuation variations
72
+ name = name.replace(".", "").replace(",", "").replace(":", "")
73
+ # Normalize ampersand
74
+ name = name.replace(" & ", " and ")
75
+ return name.strip()
76
+
77
+
78
+ def _normalize_issn(issn: str) -> str:
79
+ """Normalize ISSN format to XXXX-XXXX."""
80
+ if not issn:
81
+ return ""
82
+ issn = issn.upper().replace("-", "").replace(" ", "")
83
+ if len(issn) == 8:
84
+ return f"{issn[:4]}-{issn[4:]}"
85
+ return issn
86
+
87
+
88
class JournalNormalizer:
    """
    Journal name normalizer using ISSN-L as unique identifier.

    Handles:
    - Full names ↔ abbreviations
    - Name variants (spelling, punctuation, capitalization)
    - Historical/former names
    - Publisher variations

    Data is fetched from OpenAlex and cached locally in a JSON file; the
    cache is considered stale after ``CACHE_TTL_SECONDS``.
    """

    _instance: Optional['JournalNormalizer'] = None

    def __init__(self, cache_dir: Optional[Path] = None):
        """
        Create a normalizer. No I/O happens until data is first needed.

        Args:
            cache_dir: Directory holding the cache file. Defaults to the
                directory returned by ``_get_default_cache_dir()``.
        """
        self._cache_dir = Path(cache_dir) if cache_dir else _get_default_cache_dir()
        self._cache_file = self._cache_dir / "journal_normalizer_cache.json"

        # Core mappings (ISSN-L is the key)
        self._issn_l_data: Dict[str, Dict[str, Any]] = {}  # ISSN-L → full metadata

        # Lookup indexes (for fast search)
        self._name_to_issn_l: Dict[str, str] = {}  # normalized name → ISSN-L
        self._issn_to_issn_l: Dict[str, str] = {}  # any ISSN → ISSN-L
        self._abbrev_to_issn_l: Dict[str, str] = {}  # abbreviated name → ISSN-L

        # Stats
        self._last_updated: float = 0  # epoch seconds of the data in memory
        self._loaded = False
        self._journal_count = 0

    @classmethod
    def get_instance(cls, cache_dir: Optional[Path] = None) -> 'JournalNormalizer':
        """
        Get the singleton instance, creating it on first call.

        Args:
            cache_dir: Only used when the instance is created; it is ignored
                on later calls because the existing instance is returned.
        """
        if cls._instance is None:
            cls._instance = cls(cache_dir)
        return cls._instance

    def _is_fresh(self) -> bool:
        """Return True if in-memory data is loaded and younger than the TTL."""
        return self._loaded and (time.time() - self._last_updated) < CACHE_TTL_SECONDS

    def _is_cache_valid(self) -> bool:
        """Check if the cache file exists, is readable and is within TTL.

        Note: this parses the whole cache file; ``ensure_loaded`` uses the
        in-memory timestamp instead so that lookups stay cheap.
        """
        try:
            with open(self._cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            cached_time = data.get('timestamp', 0)
            return (time.time() - cached_time) < CACHE_TTL_SECONDS
        except (OSError, ValueError, AttributeError, TypeError):
            # Missing/unreadable file, invalid JSON, or unexpected JSON shape.
            return False

    def _load_from_cache(self) -> bool:
        """
        Load cached data from file into memory.

        Returns:
            True if the cache file was read and applied, False otherwise.
        """
        try:
            with open(self._cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            return False
        except (OSError, ValueError) as e:
            logger.warning(f"Failed to load journal normalizer cache: {e}")
            return False

        if not isinstance(data, dict):
            logger.warning("Failed to load journal normalizer cache: unexpected format")
            return False

        self._issn_l_data = data.get('issn_l_data') or {}
        self._name_to_issn_l = data.get('name_to_issn_l') or {}
        self._issn_to_issn_l = data.get('issn_to_issn_l') or {}
        self._abbrev_to_issn_l = data.get('abbrev_to_issn_l') or {}
        timestamp = data.get('timestamp', 0)
        self._last_updated = timestamp if isinstance(timestamp, (int, float)) else 0
        self._journal_count = len(self._issn_l_data)
        self._loaded = True

        logger.info(f"Loaded {self._journal_count} journals from normalizer cache")
        return True

    def _save_to_cache(self) -> None:
        """Save current data to the cache file (failures are logged, not raised)."""
        data = {
            'timestamp': time.time(),
            'journal_count': len(self._issn_l_data),
            'issn_l_data': self._issn_l_data,
            'name_to_issn_l': self._name_to_issn_l,
            'issn_to_issn_l': self._issn_to_issn_l,
            'abbrev_to_issn_l': self._abbrev_to_issn_l,
        }
        # Write to a sibling temp file and rename it into place so that an
        # interrupted write never leaves a truncated cache file behind.
        tmp_file = self._cache_file.with_name(self._cache_file.name + ".tmp")
        try:
            self._cache_dir.mkdir(parents=True, exist_ok=True)
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(data, f)
            os.replace(tmp_file, self._cache_file)
            logger.info(f"Saved {len(self._issn_l_data)} journals to normalizer cache")
        except OSError as e:
            logger.warning(f"Failed to save journal normalizer cache: {e}")
            try:
                tmp_file.unlink()
            except OSError:
                pass  # Nothing to clean up, or cleanup not possible.

    def _add_journal(self, source_data: Dict[str, Any]) -> None:
        """
        Add a journal to the normalizer from OpenAlex source data.

        Entries without an ``issn_l`` are ignored.

        Args:
            source_data: OpenAlex source object with display_name, issn_l, etc.
        """
        issn_l = source_data.get('issn_l')
        if not issn_l:
            return

        issn_l = _normalize_issn(issn_l)
        # `or` guards against explicit nulls in the API payload.
        display_name = source_data.get('display_name') or ''
        abbreviated_title = source_data.get('abbreviated_title') or ''
        alternate_titles = source_data.get('alternate_titles') or []
        issns = source_data.get('issn') or []
        is_oa = source_data.get('is_oa', False)
        normalized_issns = [_normalize_issn(i) for i in issns if i]

        # Store full metadata
        self._issn_l_data[issn_l] = {
            'canonical_name': display_name,
            'abbreviated_title': abbreviated_title,
            'alternate_titles': alternate_titles,
            'issns': normalized_issns,
            'is_oa': is_oa,
            'publisher': source_data.get('host_organization_name', ''),
        }

        # Build lookup indexes
        # 1. Canonical name (always wins over a previously indexed variant)
        norm_name = _normalize_name(display_name)
        if norm_name:
            self._name_to_issn_l[norm_name] = issn_l

        # 2. Alternate titles (variants) never override an existing entry
        for alt in alternate_titles:
            norm_alt = _normalize_name(alt) if alt else ""
            if norm_alt and norm_alt not in self._name_to_issn_l:
                self._name_to_issn_l[norm_alt] = issn_l

        # 3. Abbreviated title (_normalize_name already strips periods)
        norm_abbrev = _normalize_name(abbreviated_title)
        if norm_abbrev:
            self._abbrev_to_issn_l[norm_abbrev] = issn_l

        # 4. All ISSNs → ISSN-L
        for norm_issn in normalized_issns:
            self._issn_to_issn_l[norm_issn] = issn_l
        self._issn_to_issn_l[issn_l] = issn_l  # Self-reference

    async def _fetch_journals_async(self, max_pages: int = 500, filter_oa_only: bool = False) -> None:
        """
        Fetch journal data from the OpenAlex API using cursor paging.

        Fetching stops at the first non-200 response, empty page, timeout or
        error; whatever was collected so far is kept. Data is written to the
        cache file only when at least one journal is held in memory.

        Args:
            max_pages: Maximum pages to fetch (200 per page)
            filter_oa_only: If True, only fetch OA journals
        """
        per_page = 200
        cursor = "*"
        pages_fetched = 0

        # Select fields to minimize response size
        select_fields = "display_name,issn_l,issn,abbreviated_title,alternate_titles,is_oa,host_organization_name"

        # Keep the journal-type restriction when additionally filtering on OA.
        filter_param = "type:journal,is_oa:true" if filter_oa_only else "type:journal"
        timeout = aiohttp.ClientTimeout(total=30)

        async with aiohttp.ClientSession() as session:
            while pages_fetched < max_pages:
                # Passed via `params` so the cursor token is URL-encoded.
                params = {
                    'filter': filter_param,
                    'per_page': str(per_page),
                    'cursor': cursor,
                    'mailto': OPENALEX_POLITE_EMAIL,
                    'select': select_fields,
                }

                try:
                    async with session.get(OPENALEX_SOURCES_URL, params=params, timeout=timeout) as resp:
                        if resp.status != 200:
                            logger.warning(f"OpenAlex API returned {resp.status}")
                            break

                        data = await resp.json()

                    results = data.get('results', [])
                    if not results:
                        break

                    for source in results:
                        self._add_journal(source)
                    pages_fetched += 1

                    # Progress log
                    if pages_fetched % 20 == 0:
                        logger.info(f"Fetched {pages_fetched} pages, {len(self._issn_l_data)} journals...")

                    # Get next cursor
                    next_cursor = (data.get('meta') or {}).get('next_cursor')
                    if not next_cursor or next_cursor == cursor:
                        break
                    cursor = next_cursor

                except asyncio.TimeoutError:
                    logger.warning("OpenAlex API timeout")
                    break
                except Exception as e:
                    # Best effort: keep what was fetched so far.
                    logger.error(f"Error fetching journals: {e}")
                    break

        self._journal_count = len(self._issn_l_data)
        self._loaded = True

        if self._journal_count > 0:
            # Only stamp the data as fresh when there is data to serve.
            self._last_updated = time.time()
            self._save_to_cache()
            logger.info(f"Fetched {self._journal_count} journals from OpenAlex")

    def _fetch_journals_sync(self, max_pages: int = 500, filter_oa_only: bool = False) -> None:
        """
        Synchronous wrapper for fetching journals.

        Runs the fetch on a private event loop. When the calling thread
        already has a running loop (where ``run_until_complete`` would
        raise), the fetch runs in a worker thread and this call blocks until
        it finishes.
        """
        def _run_in_private_loop() -> None:
            loop = asyncio.new_event_loop()
            try:
                loop.run_until_complete(self._fetch_journals_async(max_pages, filter_oa_only))
            finally:
                loop.close()

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop running in this thread: run directly.
            _run_in_private_loop()
            return

        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            pool.submit(_run_in_private_loop).result()

    def ensure_loaded(self, force_refresh: bool = False, max_pages: int = 500) -> None:
        """
        Ensure data is loaded, fetching from the API if needed.

        Order of preference: fresh in-memory data, then a fresh cache file,
        then a fetch from OpenAlex.

        Args:
            force_refresh: Force refresh even if cache is valid
            max_pages: Max pages to fetch if refreshing
        """
        if not force_refresh:
            # Freshness is judged from the in-memory timestamp so that the
            # cache file is not re-read on every lookup.
            if self._is_fresh():
                return

            # Try loading from cache first
            if self._load_from_cache() and self._is_fresh():
                return

        # Fetch from API
        logger.info("Refreshing journal normalizer cache from OpenAlex...")
        self._fetch_journals_sync(max_pages)

    # ==================== Public API ====================

    def get_issn_l(self, journal_name: str) -> Optional[str]:
        """
        Get ISSN-L for a journal name.

        Args:
            journal_name: Any journal name variant, abbreviation, or ISSN

        Returns:
            ISSN-L if found, None otherwise
        """
        # Checked first so that empty input never triggers a load/fetch.
        if not journal_name:
            return None

        self.ensure_loaded()

        # Check if it's an ISSN
        if re.match(r'^\d{4}-?\d{3}[\dXx]$', journal_name.replace(" ", "")):
            found = self._issn_to_issn_l.get(_normalize_issn(journal_name))
            if found is not None:
                return found

        # Try normalized name lookup: full names first, then abbreviations
        norm_name = _normalize_name(journal_name)
        found = self._name_to_issn_l.get(norm_name)
        if found is None:
            found = self._abbrev_to_issn_l.get(norm_name)
        return found

    def normalize(self, journal_name: str) -> Optional[str]:
        """
        Normalize journal name to canonical form.

        Args:
            journal_name: Any journal name variant

        Returns:
            Canonical journal name, or the original ``journal_name`` if the
            journal is unknown or has no canonical name recorded
        """
        issn_l = self.get_issn_l(journal_name)
        info = self._issn_l_data.get(issn_l) if issn_l else None
        if info is not None:
            return info.get('canonical_name') or journal_name
        return journal_name

    def get_abbreviation(self, journal_name: str) -> Optional[str]:
        """
        Get abbreviated title for a journal.

        Args:
            journal_name: Any journal name variant

        Returns:
            Abbreviated title if available, None otherwise
        """
        issn_l = self.get_issn_l(journal_name)
        info = self._issn_l_data.get(issn_l) if issn_l else None
        if info is not None:
            return info.get('abbreviated_title') or None
        return None

    def get_journal_info(self, journal_name: str) -> Optional[Dict[str, Any]]:
        """
        Get full journal metadata.

        Args:
            journal_name: Any journal name variant

        Returns:
            Dict with issn_l, canonical_name, abbreviated_title,
            alternate_titles, issns, is_oa, publisher; None if not found
        """
        issn_l = self.get_issn_l(journal_name)
        info = self._issn_l_data.get(issn_l) if issn_l else None
        if info is not None:
            return {
                'issn_l': issn_l,
                **info
            }
        return None

    def is_same_journal(self, name1: str, name2: str) -> bool:
        """
        Check if two names refer to the same journal.

        Args:
            name1: First journal name
            name2: Second journal name

        Returns:
            True if both names resolve to the same ISSN-L; when either name
            cannot be resolved, True if their normalized forms are equal
        """
        issn_l_1 = self.get_issn_l(name1)
        issn_l_2 = self.get_issn_l(name2)

        if issn_l_1 and issn_l_2:
            return issn_l_1 == issn_l_2

        # Fallback: simple normalization comparison
        return _normalize_name(name1) == _normalize_name(name2)

    def is_open_access(self, journal_name: str) -> bool:
        """
        Check if journal is Open Access.

        Args:
            journal_name: Any journal name variant

        Returns:
            True if the journal is known and flagged as OA
        """
        issn_l = self.get_issn_l(journal_name)
        info = self._issn_l_data.get(issn_l) if issn_l else None
        if info is not None:
            return bool(info.get('is_oa', False))
        return False

    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """
        Search for journals by name (substring match on normalized names).

        Each journal appears at most once even if several of its name
        variants match.

        Args:
            query: Search query
            limit: Maximum results

        Returns:
            List of matching journal info dicts (empty for an empty query or
            a non-positive limit)
        """
        if not query or limit <= 0:
            return []

        self.ensure_loaded()

        norm_query = _normalize_name(query)
        if not norm_query:
            # e.g. punctuation-only query; "" would match every name.
            return []

        results: List[Dict[str, Any]] = []
        seen = set()

        for norm_name, issn_l in self._name_to_issn_l.items():
            if issn_l in seen or norm_query not in norm_name:
                continue
            info = self._issn_l_data.get(issn_l)
            if info is None:
                continue
            seen.add(issn_l)
            results.append({
                'issn_l': issn_l,
                **info
            })
            if len(results) >= limit:
                break

        return results

    @property
    def journal_count(self) -> int:
        """Get number of cached journals (loads data if necessary)."""
        self.ensure_loaded()
        return self._journal_count

    @property
    def cache_age_hours(self) -> float:
        """Get age of the in-memory data in hours (``inf`` if never loaded)."""
        if self._last_updated == 0:
            return float('inf')
        return (time.time() - self._last_updated) / 3600
495
+
496
+
497
+ # ==================== Convenience Functions ====================
498
+
499
def get_journal_normalizer(cache_dir: Optional[Path] = None) -> JournalNormalizer:
    """Return the shared JournalNormalizer singleton.

    ``cache_dir`` is forwarded unchanged to ``JournalNormalizer.get_instance``.
    """
    normalizer = JournalNormalizer.get_instance(cache_dir)
    return normalizer
502
+
503
+
504
def normalize_journal_name(name: str) -> str:
    """Return the canonical form of ``name`` via the singleton normalizer."""
    normalizer = get_journal_normalizer()
    return normalizer.normalize(name)
507
+
508
+
509
def get_journal_issn_l(name: str) -> Optional[str]:
    """Look up the ISSN-L for ``name`` via the singleton normalizer."""
    normalizer = get_journal_normalizer()
    return normalizer.get_issn_l(name)
512
+
513
+
514
def is_same_journal(name1: str, name2: str) -> bool:
    """Tell whether ``name1`` and ``name2`` denote the same journal.

    Delegates to ``JournalNormalizer.is_same_journal`` on the singleton.
    """
    normalizer = get_journal_normalizer()
    return normalizer.is_same_journal(name1, name2)
517
+
518
+
519
def refresh_journal_cache() -> None:
    """Reload journal data on the singleton, bypassing any valid cache."""
    normalizer = get_journal_normalizer()
    normalizer.ensure_loaded(force_refresh=True)
522
+
523
+
524
+ # EOF