scitex 2.4.2__py3-none-any.whl → 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__version__.py +1 -1
- scitex/browser/__init__.py +53 -0
- scitex/browser/debugging/__init__.py +56 -0
- scitex/browser/debugging/_failure_capture.py +372 -0
- scitex/browser/debugging/_sync_session.py +259 -0
- scitex/browser/debugging/_test_monitor.py +284 -0
- scitex/browser/debugging/_visual_cursor.py +432 -0
- scitex/io/_load.py +5 -0
- scitex/io/_load_modules/_canvas.py +171 -0
- scitex/io/_save.py +8 -0
- scitex/io/_save_modules/_canvas.py +356 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot.py +77 -22
- scitex/plt/docs/FIGURE_ARCHITECTURE.md +257 -0
- scitex/plt/utils/__init__.py +10 -0
- scitex/plt/utils/_collect_figure_metadata.py +14 -12
- scitex/plt/utils/_csv_column_naming.py +237 -0
- scitex/scholar/citation_graph/database.py +9 -2
- scitex/scholar/config/ScholarConfig.py +23 -3
- scitex/scholar/config/default.yaml +55 -0
- scitex/scholar/core/Paper.py +102 -0
- scitex/scholar/core/__init__.py +44 -0
- scitex/scholar/core/journal_normalizer.py +524 -0
- scitex/scholar/core/oa_cache.py +285 -0
- scitex/scholar/core/open_access.py +457 -0
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +137 -0
- scitex/scholar/pdf_download/strategies/__init__.py +6 -0
- scitex/scholar/pdf_download/strategies/open_access_download.py +186 -0
- scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +18 -3
- scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +15 -2
- scitex/session/_decorator.py +13 -1
- scitex/vis/README.md +246 -615
- scitex/vis/__init__.py +138 -78
- scitex/vis/canvas.py +423 -0
- scitex/vis/docs/CANVAS_ARCHITECTURE.md +307 -0
- scitex/vis/editor/__init__.py +1 -1
- scitex/vis/editor/_dearpygui_editor.py +1830 -0
- scitex/vis/editor/_defaults.py +40 -1
- scitex/vis/editor/_edit.py +54 -18
- scitex/vis/editor/_flask_editor.py +37 -0
- scitex/vis/editor/_qt_editor.py +865 -0
- scitex/vis/editor/flask_editor/__init__.py +21 -0
- scitex/vis/editor/flask_editor/bbox.py +216 -0
- scitex/vis/editor/flask_editor/core.py +152 -0
- scitex/vis/editor/flask_editor/plotter.py +130 -0
- scitex/vis/editor/flask_editor/renderer.py +184 -0
- scitex/vis/editor/flask_editor/templates/__init__.py +33 -0
- scitex/vis/editor/flask_editor/templates/html.py +295 -0
- scitex/vis/editor/flask_editor/templates/scripts.py +614 -0
- scitex/vis/editor/flask_editor/templates/styles.py +549 -0
- scitex/vis/editor/flask_editor/utils.py +81 -0
- scitex/vis/io/__init__.py +84 -21
- scitex/vis/io/canvas.py +226 -0
- scitex/vis/io/data.py +204 -0
- scitex/vis/io/directory.py +202 -0
- scitex/vis/io/export.py +460 -0
- scitex/vis/io/panel.py +424 -0
- {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/METADATA +9 -2
- {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/RECORD +61 -32
- scitex/vis/DJANGO_INTEGRATION.md +0 -677
- scitex/vis/editor/_web_editor.py +0 -1440
- scitex/vis/tmp.txt +0 -239
- {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/WHEEL +0 -0
- {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/entry_points.txt +0 -0
- {scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# File: ./src/scitex/scholar/pdf_download/strategies/open_access_download.py
|
|
4
|
+
"""
|
|
5
|
+
Open Access PDF Download Strategy.
|
|
6
|
+
|
|
7
|
+
Downloads PDFs from known Open Access sources with appropriate handling
|
|
8
|
+
for each source type (arXiv, PubMed Central, OpenAlex OA URLs, etc.).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Optional, Dict, Any
|
|
13
|
+
import aiohttp
|
|
14
|
+
|
|
15
|
+
from scitex import logging
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Known OA source patterns and their handlers
|
|
21
|
+
def _keep_url(url):
    """Return *url* unchanged (source already links straight to the PDF)."""
    return url


def _arxiv_pdf_url(url):
    """Rewrite an arXiv abstract URL (``/abs/``) to its ``/pdf/...pdf`` form."""
    if '/abs/' not in url:
        return url
    return url.replace('/abs/', '/pdf/') + '.pdf'


def _biorxiv_pdf_url(url):
    """Append ``.full.pdf`` to a bioRxiv/medRxiv URL unless it already ends in ``.pdf``."""
    if url.endswith('.pdf'):
        return url
    return url + '.full.pdf'


# Maps a source name to:
#   'patterns'      - lowercase substrings looked for in the URL
#   'pdf_transform' - callable turning a matching URL into a PDF URL
# Insertion order matters: lookups return the first source that matches.
OA_SOURCE_PATTERNS = {
    'arxiv': {
        'patterns': ['arxiv.org'],
        'pdf_transform': _arxiv_pdf_url,
    },
    'pmc': {
        'patterns': ['ncbi.nlm.nih.gov/pmc', 'europepmc.org'],
        'pdf_transform': _keep_url,  # PMC links are usually direct
    },
    'biorxiv': {
        'patterns': ['biorxiv.org', 'medrxiv.org'],
        'pdf_transform': _biorxiv_pdf_url,
    },
    'doaj': {
        'patterns': ['doaj.org'],
        'pdf_transform': _keep_url,
    },
    'zenodo': {
        'patterns': ['zenodo.org'],
        'pdf_transform': _keep_url,
    },
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _identify_oa_source(url: str) -> Optional[str]:
    """Return the name of the first OA source whose pattern occurs in *url*.

    The URL is lower-cased before matching; sources are tried in the order
    they appear in ``OA_SOURCE_PATTERNS``. Returns ``None`` when nothing matches.
    """
    haystack = url.lower()
    matching_sources = (
        name
        for name, spec in OA_SOURCE_PATTERNS.items()
        if any(fragment in haystack for fragment in spec['patterns'])
    )
    return next(matching_sources, None)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _transform_to_pdf_url(url: str, source: str) -> str:
    """Apply the ``pdf_transform`` registered for *source* to *url*.

    If *source* is not a key of ``OA_SOURCE_PATTERNS`` the URL is returned
    unchanged.
    """
    spec = OA_SOURCE_PATTERNS.get(source)
    if spec is None:
        return url
    to_pdf = spec['pdf_transform']
    return to_pdf(url)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
async def try_download_open_access_async(
    oa_url: str,
    output_path: Path,
    metadata: Optional[Dict[str, Any]] = None,
    func_name: str = "try_download_open_access_async",
    timeout: int = 60,
) -> Optional[Path]:
    """
    Download PDF from an Open Access URL.

    This strategy is simpler than browser-based strategies because OA PDFs
    are typically directly accessible without authentication.

    The URL is first mapped to a known OA source (if any) and rewritten to a
    direct PDF link. The response is accepted only if it returns HTTP 200,
    is at least 1000 bytes long and starts with the ``%PDF-`` magic bytes.

    Args:
        oa_url: Open Access URL (from OpenAlex oa_url, arXiv, PMC, etc.)
        output_path: Path to save the downloaded PDF
        metadata: Optional paper metadata (currently not used by this function)
        func_name: Function name for logging
        timeout: Download timeout in seconds

    Returns:
        Path to downloaded PDF if successful, None otherwise. All errors are
        logged and swallowed; this function does not raise.
    """
    # Imported locally so the timeout handler below can name
    # asyncio.TimeoutError, which is a distinct class from the builtin
    # TimeoutError on Python < 3.11.
    import asyncio

    if not oa_url:
        logger.debug(f"{func_name}: No OA URL provided")
        return None

    # Identify source and transform URL if needed
    source = _identify_oa_source(oa_url)
    pdf_url = _transform_to_pdf_url(oa_url, source) if source else oa_url

    logger.info(f"{func_name}: Attempting OA download from {source or 'unknown'}: {pdf_url[:80]}...")

    try:
        # Create output directory if needed
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Use aiohttp for async download
        async with aiohttp.ClientSession() as session:
            headers = {
                'User-Agent': 'SciTeX/1.0 (Academic Research Tool; mailto:contact@scitex.io)',
                'Accept': 'application/pdf,*/*',
            }

            async with session.get(pdf_url, headers=headers, timeout=aiohttp.ClientTimeout(total=timeout)) as response:
                if response.status != 200:
                    logger.warning(f"{func_name}: HTTP {response.status} from {pdf_url}")
                    return None

                content_type = response.headers.get('Content-Type', '')

                # Verify we're getting a PDF
                if 'pdf' not in content_type.lower() and not pdf_url.endswith('.pdf'):
                    # Some servers don't set content-type correctly, check magic bytes.
                    # A stream read may return fewer bytes than requested, so keep
                    # reading until 5 bytes are available or the stream ends.
                    first_bytes = b''
                    while len(first_bytes) < 5:
                        chunk = await response.content.read(5 - len(first_bytes))
                        if not chunk:
                            break
                        first_bytes += chunk
                    if first_bytes != b'%PDF-':
                        logger.warning(f"{func_name}: Response is not a PDF (content-type: {content_type})")
                        return None
                    # Prepend the sniffed bytes to the remainder of the body
                    content = first_bytes + await response.content.read()
                else:
                    content = await response.read()

                # Validate PDF content
                if len(content) < 1000:  # PDF should be at least 1KB
                    logger.warning(f"{func_name}: Downloaded content too small ({len(content)} bytes)")
                    return None

                if not content.startswith(b'%PDF-'):
                    logger.warning(f"{func_name}: Downloaded content is not a valid PDF")
                    return None

                # Save to file
                with open(output_path, 'wb') as f:
                    f.write(content)

                size_mb = len(content) / 1024 / 1024
                logger.info(f"{func_name}: Successfully downloaded {size_mb:.2f} MB to {output_path}")
                return output_path

    except aiohttp.ClientError as e:
        logger.warning(f"{func_name}: HTTP client error: {e}")
        return None
    except (asyncio.TimeoutError, TimeoutError):
        # Both names are listed: they are the same class only on Python >= 3.11.
        logger.warning(f"{func_name}: Download timed out after {timeout}s")
        return None
    except Exception as e:
        logger.error(f"{func_name}: Download failed: {e}")
        return None
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def try_download_open_access_sync(
    oa_url: str,
    output_path: Path,
    metadata: Optional[Dict[str, Any]] = None,
    timeout: int = 60,
) -> Optional[Path]:
    """
    Synchronous wrapper for try_download_open_access_async.

    Blocks until the download coroutine finishes. When the calling thread
    already runs an event loop (where ``run_until_complete`` would raise
    ``RuntimeError``), the coroutine is executed on a fresh loop in a
    short-lived worker thread instead.

    Args:
        oa_url: Open Access URL
        output_path: Path to save the downloaded PDF
        metadata: Optional paper metadata
        timeout: Download timeout in seconds

    Returns:
        Path to downloaded PDF if successful, None otherwise
    """
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    def _make_coro():
        # Build the coroutine lazily so it is created in the thread that runs it.
        return try_download_open_access_async(oa_url, output_path, metadata, timeout=timeout)

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread; fall through to the direct path.
        pass
    else:
        # A loop is already running here, so it cannot be re-entered.
        # Run the download on its own loop in a worker thread and wait for it.
        with ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(lambda: asyncio.run(_make_coro())).result()

    loop = None
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        pass
    if loop is None or loop.is_closed():
        # No usable current loop (missing or already closed): install a new one.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    return loop.run_until_complete(_make_coro())
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# EOF
|
|
@@ -32,6 +32,7 @@ from datetime import datetime
|
|
|
32
32
|
|
|
33
33
|
from scitex import logging
|
|
34
34
|
from scitex.scholar.core import Paper
|
|
35
|
+
from scitex.scholar.core import normalize_journal_name
|
|
35
36
|
from scitex.scholar.search_engines.individual.PubMedSearchEngine import PubMedSearchEngine
|
|
36
37
|
from scitex.scholar.search_engines.individual.CrossRefSearchEngine import CrossRefSearchEngine
|
|
37
38
|
from scitex.scholar.search_engines.individual.ArXivSearchEngine import ArXivSearchEngine
|
|
@@ -331,12 +332,18 @@ class ScholarPipelineSearchParallel:
|
|
|
331
332
|
if 'metrics' in result:
|
|
332
333
|
if result['metrics'].get('citation_count'):
|
|
333
334
|
paper.metadata.citation_count.total = result['metrics']['citation_count']
|
|
334
|
-
|
|
335
|
+
if 'is_open_access' in result['metrics']:
|
|
336
|
+
paper.metadata.access.is_open_access = result['metrics']['is_open_access']
|
|
337
|
+
paper.metadata.access.is_open_access_engines = [engine_name]
|
|
335
338
|
|
|
336
339
|
if 'urls' in result:
|
|
337
340
|
if result['urls'].get('pdf'):
|
|
338
341
|
# pdfs is a list of dicts with url/source keys
|
|
339
342
|
paper.metadata.url.pdfs = [{'url': result['urls']['pdf'], 'source': 'search'}]
|
|
343
|
+
# If this is an open access paper, also store the PDF URL as oa_url
|
|
344
|
+
if paper.metadata.access.is_open_access:
|
|
345
|
+
paper.metadata.access.oa_url = result['urls']['pdf']
|
|
346
|
+
paper.metadata.access.oa_url_engines = [engine_name]
|
|
340
347
|
if result['urls'].get('publisher'):
|
|
341
348
|
paper.metadata.url.publisher = result['urls']['publisher']
|
|
342
349
|
if result['urls'].get('doi_url'):
|
|
@@ -733,13 +740,21 @@ class ScholarPipelineSearchParallel:
|
|
|
733
740
|
|
|
734
741
|
# Publication info
|
|
735
742
|
if hasattr(meta, 'publication'):
|
|
736
|
-
|
|
743
|
+
journal_raw = meta.publication.journal or ''
|
|
744
|
+
result['journal'] = normalize_journal_name(journal_raw) if journal_raw else ''
|
|
737
745
|
result['impact_factor'] = meta.publication.impact_factor
|
|
738
746
|
|
|
739
747
|
# Metrics
|
|
740
748
|
if hasattr(meta, 'citation_count'):
|
|
741
749
|
result['citation_count'] = meta.citation_count.total or 0
|
|
742
|
-
|
|
750
|
+
|
|
751
|
+
# Access metadata
|
|
752
|
+
if hasattr(meta, 'access'):
|
|
753
|
+
result['is_open_access'] = meta.access.is_open_access or False
|
|
754
|
+
result['oa_status'] = meta.access.oa_status
|
|
755
|
+
result['oa_url'] = meta.access.oa_url
|
|
756
|
+
else:
|
|
757
|
+
result['is_open_access'] = False
|
|
743
758
|
|
|
744
759
|
# URLs
|
|
745
760
|
if hasattr(meta, 'url'):
|
|
@@ -268,12 +268,18 @@ class ScholarPipelineSearchSingle:
|
|
|
268
268
|
if 'metrics' in result:
|
|
269
269
|
if result['metrics'].get('citation_count'):
|
|
270
270
|
paper.metadata.citation_count.total = result['metrics']['citation_count']
|
|
271
|
-
|
|
271
|
+
if 'is_open_access' in result['metrics']:
|
|
272
|
+
paper.metadata.access.is_open_access = result['metrics']['is_open_access']
|
|
273
|
+
paper.metadata.access.is_open_access_engines = [engine_name]
|
|
272
274
|
|
|
273
275
|
if 'urls' in result:
|
|
274
276
|
if result['urls'].get('pdf'):
|
|
275
277
|
# pdfs is a list of dicts with url/source keys
|
|
276
278
|
paper.metadata.url.pdfs = [{'url': result['urls']['pdf'], 'source': 'search'}]
|
|
279
|
+
# If this is an open access paper, also store the PDF URL as oa_url
|
|
280
|
+
if paper.metadata.access.is_open_access:
|
|
281
|
+
paper.metadata.access.oa_url = result['urls']['pdf']
|
|
282
|
+
paper.metadata.access.oa_url_engines = [engine_name]
|
|
277
283
|
if result['urls'].get('publisher'):
|
|
278
284
|
paper.metadata.url.publisher = result['urls']['publisher']
|
|
279
285
|
if result['urls'].get('doi_url'):
|
|
@@ -461,7 +467,14 @@ class ScholarPipelineSearchSingle:
|
|
|
461
467
|
# Metrics
|
|
462
468
|
if hasattr(meta, 'citation_count'):
|
|
463
469
|
result['citation_count'] = meta.citation_count.total or 0
|
|
464
|
-
|
|
470
|
+
|
|
471
|
+
# Access metadata
|
|
472
|
+
if hasattr(meta, 'access'):
|
|
473
|
+
result['is_open_access'] = meta.access.is_open_access or False
|
|
474
|
+
result['oa_status'] = meta.access.oa_status
|
|
475
|
+
result['oa_url'] = meta.access.oa_url
|
|
476
|
+
else:
|
|
477
|
+
result['is_open_access'] = False
|
|
465
478
|
|
|
466
479
|
# URLs
|
|
467
480
|
if hasattr(meta, 'url'):
|
scitex/session/_decorator.py
CHANGED
|
@@ -495,6 +495,7 @@ def _add_argument(
|
|
|
495
495
|
type_hints: Type hints dictionary
|
|
496
496
|
short_form: Optional short form (e.g., 'a' for -a)
|
|
497
497
|
"""
|
|
498
|
+
from typing import get_origin, get_args, Literal
|
|
498
499
|
|
|
499
500
|
# Get type
|
|
500
501
|
param_type = type_hints.get(param_name, param.annotation)
|
|
@@ -513,6 +514,13 @@ def _add_argument(
|
|
|
513
514
|
if short_form:
|
|
514
515
|
arg_names.insert(0, f"-{short_form}")
|
|
515
516
|
|
|
517
|
+
# Check for Literal type (choices)
|
|
518
|
+
choices = None
|
|
519
|
+
origin = get_origin(param_type)
|
|
520
|
+
if origin is Literal:
|
|
521
|
+
choices = list(get_args(param_type))
|
|
522
|
+
param_type = type(choices[0]) if choices else str
|
|
523
|
+
|
|
516
524
|
# Handle different types
|
|
517
525
|
if param_type == bool:
|
|
518
526
|
# Boolean flags
|
|
@@ -524,11 +532,15 @@ def _add_argument(
|
|
|
524
532
|
)
|
|
525
533
|
else:
|
|
526
534
|
# Regular arguments
|
|
535
|
+
choices_str = f", choices: {choices}" if choices else ""
|
|
527
536
|
kwargs = {
|
|
528
537
|
'type': param_type,
|
|
529
|
-
'help': f"(default: {default})" if has_default else "(required)",
|
|
538
|
+
'help': f"(default: {default}{choices_str})" if has_default else f"(required{choices_str})",
|
|
530
539
|
}
|
|
531
540
|
|
|
541
|
+
if choices:
|
|
542
|
+
kwargs['choices'] = choices
|
|
543
|
+
|
|
532
544
|
if has_default:
|
|
533
545
|
kwargs['default'] = default
|
|
534
546
|
else:
|