scitex 2.4.2__py3-none-any.whl → 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. scitex/__version__.py +1 -1
  2. scitex/browser/__init__.py +53 -0
  3. scitex/browser/debugging/__init__.py +56 -0
  4. scitex/browser/debugging/_failure_capture.py +372 -0
  5. scitex/browser/debugging/_sync_session.py +259 -0
  6. scitex/browser/debugging/_test_monitor.py +284 -0
  7. scitex/browser/debugging/_visual_cursor.py +432 -0
  8. scitex/io/_load.py +5 -0
  9. scitex/io/_load_modules/_canvas.py +171 -0
  10. scitex/io/_save.py +8 -0
  11. scitex/io/_save_modules/_canvas.py +356 -0
  12. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot.py +77 -22
  13. scitex/plt/docs/FIGURE_ARCHITECTURE.md +257 -0
  14. scitex/plt/utils/__init__.py +10 -0
  15. scitex/plt/utils/_collect_figure_metadata.py +14 -12
  16. scitex/plt/utils/_csv_column_naming.py +237 -0
  17. scitex/scholar/citation_graph/database.py +9 -2
  18. scitex/scholar/config/ScholarConfig.py +23 -3
  19. scitex/scholar/config/default.yaml +55 -0
  20. scitex/scholar/core/Paper.py +102 -0
  21. scitex/scholar/core/__init__.py +44 -0
  22. scitex/scholar/core/journal_normalizer.py +524 -0
  23. scitex/scholar/core/oa_cache.py +285 -0
  24. scitex/scholar/core/open_access.py +457 -0
  25. scitex/scholar/pdf_download/ScholarPDFDownloader.py +137 -0
  26. scitex/scholar/pdf_download/strategies/__init__.py +6 -0
  27. scitex/scholar/pdf_download/strategies/open_access_download.py +186 -0
  28. scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +18 -3
  29. scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +15 -2
  30. scitex/session/_decorator.py +13 -1
  31. scitex/vis/README.md +246 -615
  32. scitex/vis/__init__.py +138 -78
  33. scitex/vis/canvas.py +423 -0
  34. scitex/vis/docs/CANVAS_ARCHITECTURE.md +307 -0
  35. scitex/vis/editor/__init__.py +1 -1
  36. scitex/vis/editor/_dearpygui_editor.py +1830 -0
  37. scitex/vis/editor/_defaults.py +40 -1
  38. scitex/vis/editor/_edit.py +54 -18
  39. scitex/vis/editor/_flask_editor.py +37 -0
  40. scitex/vis/editor/_qt_editor.py +865 -0
  41. scitex/vis/editor/flask_editor/__init__.py +21 -0
  42. scitex/vis/editor/flask_editor/bbox.py +216 -0
  43. scitex/vis/editor/flask_editor/core.py +152 -0
  44. scitex/vis/editor/flask_editor/plotter.py +130 -0
  45. scitex/vis/editor/flask_editor/renderer.py +184 -0
  46. scitex/vis/editor/flask_editor/templates/__init__.py +33 -0
  47. scitex/vis/editor/flask_editor/templates/html.py +295 -0
  48. scitex/vis/editor/flask_editor/templates/scripts.py +614 -0
  49. scitex/vis/editor/flask_editor/templates/styles.py +549 -0
  50. scitex/vis/editor/flask_editor/utils.py +81 -0
  51. scitex/vis/io/__init__.py +84 -21
  52. scitex/vis/io/canvas.py +226 -0
  53. scitex/vis/io/data.py +204 -0
  54. scitex/vis/io/directory.py +202 -0
  55. scitex/vis/io/export.py +460 -0
  56. scitex/vis/io/panel.py +424 -0
  57. {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/METADATA +9 -2
  58. {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/RECORD +61 -32
  59. scitex/vis/DJANGO_INTEGRATION.md +0 -677
  60. scitex/vis/editor/_web_editor.py +0 -1440
  61. scitex/vis/tmp.txt +0 -239
  62. {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/WHEEL +0 -0
  63. {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/entry_points.txt +0 -0
  64. {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,186 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # File: ./src/scitex/scholar/pdf_download/strategies/open_access_download.py
4
+ """
5
+ Open Access PDF Download Strategy.
6
+
7
+ Downloads PDFs from known Open Access sources with appropriate handling
8
+ for each source type (arXiv, PubMed Central, OpenAlex OA URLs, etc.).
9
+ """
10
+
11
+ from pathlib import Path
12
+ from typing import Optional, Dict, Any
13
+ import aiohttp
14
+
15
+ from scitex import logging
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
# Known OA source patterns and their handlers
def _arxiv_abs_to_pdf(url):
    """Rewrite an arXiv '/abs/' URL to its '/pdf/...pdf' form; pass others through."""
    if '/abs/' in url:
        return url.replace('/abs/', '/pdf/') + '.pdf'
    return url


def _append_full_pdf(url):
    """Append '.full.pdf' unless the URL already ends with '.pdf'."""
    if url.endswith('.pdf'):
        return url
    return url + '.full.pdf'


def _keep_url(url):
    """Return the URL unchanged."""
    return url


# Each entry maps a source name to:
#   'patterns'      - substrings looked for in the (lower-cased) URL
#   'pdf_transform' - callable turning a landing URL into a PDF URL
# Insertion order matters: lookups return the first matching source.
OA_SOURCE_PATTERNS = {
    'arxiv': {
        'patterns': ['arxiv.org'],
        'pdf_transform': _arxiv_abs_to_pdf,
    },
    'pmc': {
        'patterns': ['ncbi.nlm.nih.gov/pmc', 'europepmc.org'],
        'pdf_transform': _keep_url,  # PMC links are usually direct
    },
    'biorxiv': {
        'patterns': ['biorxiv.org', 'medrxiv.org'],
        'pdf_transform': _append_full_pdf,
    },
    'doaj': {
        'patterns': ['doaj.org'],
        'pdf_transform': _keep_url,
    },
    'zenodo': {
        'patterns': ['zenodo.org'],
        'pdf_transform': _keep_url,
    },
}
43
+
44
+
45
def _identify_oa_source(url: str) -> Optional[str]:
    """Return the name of the first OA source whose pattern occurs in the URL.

    The URL is lower-cased and each source in OA_SOURCE_PATTERNS is tried in
    table order; a source matches when any of its 'patterns' is a substring
    of the lower-cased URL. Returns None when no source matches.
    """
    lowered = url.lower()
    return next(
        (
            name
            for name, spec in OA_SOURCE_PATTERNS.items()
            if any(fragment in lowered for fragment in spec['patterns'])
        ),
        None,
    )
53
+
54
+
55
def _transform_to_pdf_url(url: str, source: str) -> str:
    """Apply the source-specific 'pdf_transform' from OA_SOURCE_PATTERNS to the URL.

    Sources that are not present in the table leave the URL untouched.
    """
    spec = OA_SOURCE_PATTERNS.get(source)
    if spec is None:
        return url
    return spec['pdf_transform'](url)
61
+
62
+
63
async def try_download_open_access_async(
    oa_url: str,
    output_path: Path,
    metadata: Optional[Dict[str, Any]] = None,
    func_name: str = "try_download_open_access_async",
    timeout: int = 60,
) -> Optional[Path]:
    """
    Download PDF from an Open Access URL.

    This strategy is simpler than browser-based strategies because OA PDFs
    are typically directly accessible without authentication.

    The URL is first mapped to a direct PDF URL via the known-source table,
    then fetched with a single HTTP GET. The body is accepted only if it
    starts with the ``%PDF-`` magic bytes and is at least 1000 bytes long.
    All failures are logged and reported as ``None``; nothing is raised.

    Args:
        oa_url: Open Access URL (from OpenAlex oa_url, arXiv, PMC, etc.)
        output_path: Path to save the downloaded PDF (parent dirs are created)
        metadata: Optional paper metadata; accepted for interface
            compatibility, not read by this function
        func_name: Function name used as the prefix of log messages
        timeout: Total download timeout in seconds

    Returns:
        Path to downloaded PDF if successful, None otherwise
    """
    # Local import: only needed for the asyncio exception types below.
    import asyncio

    if not oa_url:
        logger.debug(f"{func_name}: No OA URL provided")
        return None

    # Identify source and transform URL if needed
    source = _identify_oa_source(oa_url)
    pdf_url = _transform_to_pdf_url(oa_url, source) if source else oa_url

    logger.info(f"{func_name}: Attempting OA download from {source or 'unknown'}: {pdf_url[:80]}...")

    try:
        # Create output directory if needed
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        headers = {
            'User-Agent': 'SciTeX/1.0 (Academic Research Tool; mailto:contact@scitex.io)',
            'Accept': 'application/pdf,*/*',
        }

        # Use aiohttp for async download
        async with aiohttp.ClientSession() as session:
            async with session.get(
                pdf_url,
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=timeout),
            ) as response:
                if response.status != 200:
                    logger.warning(f"{func_name}: HTTP {response.status} from {pdf_url}")
                    return None

                content_type = response.headers.get('Content-Type', '')

                # Verify we're getting a PDF
                if 'pdf' not in content_type.lower() and not pdf_url.endswith('.pdf'):
                    # Some servers don't set content-type correctly, check magic bytes.
                    # readexactly() is used because read(5) may legally return fewer
                    # than 5 bytes even when more data follows, which would reject a
                    # genuine PDF.
                    try:
                        first_bytes = await response.content.readexactly(5)
                    except asyncio.IncompleteReadError as exc:
                        # Body shorter than 5 bytes: cannot be a PDF.
                        first_bytes = exc.partial
                    if first_bytes != b'%PDF-':
                        logger.warning(f"{func_name}: Response is not a PDF (content-type: {content_type})")
                        return None
                    # Continue with the rest of the stream
                    content = first_bytes + await response.content.read()
                else:
                    content = await response.read()

        # Validate PDF content
        if len(content) < 1000:  # PDF should be at least 1KB
            logger.warning(f"{func_name}: Downloaded content too small ({len(content)} bytes)")
            return None

        if not content.startswith(b'%PDF-'):
            logger.warning(f"{func_name}: Downloaded content is not a valid PDF")
            return None

        # Save to file
        with open(output_path, 'wb') as f:
            f.write(content)

        size_mb = len(content) / 1024 / 1024
        logger.info(f"{func_name}: Successfully downloaded {size_mb:.2f} MB to {output_path}")
        return output_path

    except aiohttp.ClientError as e:
        logger.warning(f"{func_name}: HTTP client error: {e}")
        return None
    except (asyncio.TimeoutError, TimeoutError):
        # asyncio.TimeoutError is a distinct class before Python 3.11, so the
        # builtin TimeoutError alone would miss aiohttp's total-timeout expiry.
        logger.warning(f"{func_name}: Download timed out after {timeout}s")
        return None
    except Exception as e:
        # Best-effort strategy boundary: report and let the caller try another one.
        logger.error(f"{func_name}: Download failed: {e}")
        return None
153
+
154
+
155
def try_download_open_access_sync(
    oa_url: str,
    output_path: Path,
    metadata: Optional[Dict[str, Any]] = None,
    timeout: int = 60,
) -> Optional[Path]:
    """
    Synchronous wrapper for try_download_open_access_async.

    The coroutine is run to completion on a private event loop that is
    always closed afterwards, so the thread's current-loop setting is not
    touched. If an event loop is already running in the calling thread
    (e.g. inside a notebook), the download is executed on a short-lived
    worker thread instead, and this call blocks until it finishes.

    Args:
        oa_url: Open Access URL
        output_path: Path to save the downloaded PDF
        metadata: Optional paper metadata
        timeout: Download timeout in seconds

    Returns:
        Path to downloaded PDF if successful, None otherwise
    """
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    def _run_in_fresh_loop() -> Optional[Path]:
        # A dedicated loop avoids the deprecated implicit-loop behaviour of
        # asyncio.get_event_loop() and guarantees the loop gets closed.
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(
                try_download_open_access_async(oa_url, output_path, metadata, timeout=timeout)
            )
        finally:
            try:
                loop.run_until_complete(loop.shutdown_asyncgens())
            finally:
                loop.close()

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: run directly.
        return _run_in_fresh_loop()

    # A loop is already running here; run_until_complete() would raise, so
    # hand the work to a separate thread that owns its own loop.
    with ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(_run_in_fresh_loop).result()
184
+
185
+
186
+ # EOF
@@ -32,6 +32,7 @@ from datetime import datetime
32
32
 
33
33
  from scitex import logging
34
34
  from scitex.scholar.core import Paper
35
+ from scitex.scholar.core import normalize_journal_name
35
36
  from scitex.scholar.search_engines.individual.PubMedSearchEngine import PubMedSearchEngine
36
37
  from scitex.scholar.search_engines.individual.CrossRefSearchEngine import CrossRefSearchEngine
37
38
  from scitex.scholar.search_engines.individual.ArXivSearchEngine import ArXivSearchEngine
@@ -331,12 +332,18 @@ class ScholarPipelineSearchParallel:
331
332
  if 'metrics' in result:
332
333
  if result['metrics'].get('citation_count'):
333
334
  paper.metadata.citation_count.total = result['metrics']['citation_count']
334
- # Note: is_open_access not in Paper structure
335
+ if 'is_open_access' in result['metrics']:
336
+ paper.metadata.access.is_open_access = result['metrics']['is_open_access']
337
+ paper.metadata.access.is_open_access_engines = [engine_name]
335
338
 
336
339
  if 'urls' in result:
337
340
  if result['urls'].get('pdf'):
338
341
  # pdfs is a list of dicts with url/source keys
339
342
  paper.metadata.url.pdfs = [{'url': result['urls']['pdf'], 'source': 'search'}]
343
+ # If this is an open access paper, also store the PDF URL as oa_url
344
+ if paper.metadata.access.is_open_access:
345
+ paper.metadata.access.oa_url = result['urls']['pdf']
346
+ paper.metadata.access.oa_url_engines = [engine_name]
340
347
  if result['urls'].get('publisher'):
341
348
  paper.metadata.url.publisher = result['urls']['publisher']
342
349
  if result['urls'].get('doi_url'):
@@ -733,13 +740,21 @@ class ScholarPipelineSearchParallel:
733
740
 
734
741
  # Publication info
735
742
  if hasattr(meta, 'publication'):
736
- result['journal'] = meta.publication.journal or ''
743
+ journal_raw = meta.publication.journal or ''
744
+ result['journal'] = normalize_journal_name(journal_raw) if journal_raw else ''
737
745
  result['impact_factor'] = meta.publication.impact_factor
738
746
 
739
747
  # Metrics
740
748
  if hasattr(meta, 'citation_count'):
741
749
  result['citation_count'] = meta.citation_count.total or 0
742
- result['is_open_access'] = False # Not stored in current Paper structure
750
+
751
+ # Access metadata
752
+ if hasattr(meta, 'access'):
753
+ result['is_open_access'] = meta.access.is_open_access or False
754
+ result['oa_status'] = meta.access.oa_status
755
+ result['oa_url'] = meta.access.oa_url
756
+ else:
757
+ result['is_open_access'] = False
743
758
 
744
759
  # URLs
745
760
  if hasattr(meta, 'url'):
@@ -268,12 +268,18 @@ class ScholarPipelineSearchSingle:
268
268
  if 'metrics' in result:
269
269
  if result['metrics'].get('citation_count'):
270
270
  paper.metadata.citation_count.total = result['metrics']['citation_count']
271
- # Note: is_open_access not in Paper structure
271
+ if 'is_open_access' in result['metrics']:
272
+ paper.metadata.access.is_open_access = result['metrics']['is_open_access']
273
+ paper.metadata.access.is_open_access_engines = [engine_name]
272
274
 
273
275
  if 'urls' in result:
274
276
  if result['urls'].get('pdf'):
275
277
  # pdfs is a list of dicts with url/source keys
276
278
  paper.metadata.url.pdfs = [{'url': result['urls']['pdf'], 'source': 'search'}]
279
+ # If this is an open access paper, also store the PDF URL as oa_url
280
+ if paper.metadata.access.is_open_access:
281
+ paper.metadata.access.oa_url = result['urls']['pdf']
282
+ paper.metadata.access.oa_url_engines = [engine_name]
277
283
  if result['urls'].get('publisher'):
278
284
  paper.metadata.url.publisher = result['urls']['publisher']
279
285
  if result['urls'].get('doi_url'):
@@ -461,7 +467,14 @@ class ScholarPipelineSearchSingle:
461
467
  # Metrics
462
468
  if hasattr(meta, 'citation_count'):
463
469
  result['citation_count'] = meta.citation_count.total or 0
464
- result['is_open_access'] = False # Not stored in current Paper structure
470
+
471
+ # Access metadata
472
+ if hasattr(meta, 'access'):
473
+ result['is_open_access'] = meta.access.is_open_access or False
474
+ result['oa_status'] = meta.access.oa_status
475
+ result['oa_url'] = meta.access.oa_url
476
+ else:
477
+ result['is_open_access'] = False
465
478
 
466
479
  # URLs
467
480
  if hasattr(meta, 'url'):
@@ -495,6 +495,7 @@ def _add_argument(
495
495
  type_hints: Type hints dictionary
496
496
  short_form: Optional short form (e.g., 'a' for -a)
497
497
  """
498
+ from typing import get_origin, get_args, Literal
498
499
 
499
500
  # Get type
500
501
  param_type = type_hints.get(param_name, param.annotation)
@@ -513,6 +514,13 @@ def _add_argument(
513
514
  if short_form:
514
515
  arg_names.insert(0, f"-{short_form}")
515
516
 
517
+ # Check for Literal type (choices)
518
+ choices = None
519
+ origin = get_origin(param_type)
520
+ if origin is Literal:
521
+ choices = list(get_args(param_type))
522
+ param_type = type(choices[0]) if choices else str
523
+
516
524
  # Handle different types
517
525
  if param_type == bool:
518
526
  # Boolean flags
@@ -524,11 +532,15 @@ def _add_argument(
524
532
  )
525
533
  else:
526
534
  # Regular arguments
535
+ choices_str = f", choices: {choices}" if choices else ""
527
536
  kwargs = {
528
537
  'type': param_type,
529
- 'help': f"(default: {default})" if has_default else "(required)",
538
+ 'help': f"(default: {default}{choices_str})" if has_default else f"(required{choices_str})",
530
539
  }
531
540
 
541
+ if choices:
542
+ kwargs['choices'] = choices
543
+
532
544
  if has_default:
533
545
  kwargs['default'] = default
534
546
  else: