PyPI - scitex - Versions diffs - 2.4.2__py3-none-any.whl → 2.5.0__py3-none-any.whl - Mend

scitex 2.4.2__py3-none-any.whl → 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

scitex/__version__.py +1 -1
scitex/browser/__init__.py +53 -0
scitex/browser/debugging/__init__.py +56 -0
scitex/browser/debugging/_failure_capture.py +372 -0
scitex/browser/debugging/_sync_session.py +259 -0
scitex/browser/debugging/_test_monitor.py +284 -0
scitex/browser/debugging/_visual_cursor.py +432 -0
scitex/io/_load.py +5 -0
scitex/io/_load_modules/_canvas.py +171 -0
scitex/io/_save.py +8 -0
scitex/io/_save_modules/_canvas.py +356 -0
scitex/plt/_subplots/_export_as_csv_formatters/_format_plot.py +77 -22
scitex/plt/docs/FIGURE_ARCHITECTURE.md +257 -0
scitex/plt/utils/__init__.py +10 -0
scitex/plt/utils/_collect_figure_metadata.py +14 -12
scitex/plt/utils/_csv_column_naming.py +237 -0
scitex/scholar/citation_graph/database.py +9 -2
scitex/scholar/config/ScholarConfig.py +23 -3
scitex/scholar/config/default.yaml +55 -0
scitex/scholar/core/Paper.py +102 -0
scitex/scholar/core/__init__.py +44 -0
scitex/scholar/core/journal_normalizer.py +524 -0
scitex/scholar/core/oa_cache.py +285 -0
scitex/scholar/core/open_access.py +457 -0
scitex/scholar/pdf_download/ScholarPDFDownloader.py +137 -0
scitex/scholar/pdf_download/strategies/__init__.py +6 -0
scitex/scholar/pdf_download/strategies/open_access_download.py +186 -0
scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +18 -3
scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +15 -2
scitex/session/_decorator.py +13 -1
scitex/vis/README.md +246 -615
scitex/vis/__init__.py +138 -78
scitex/vis/canvas.py +423 -0
scitex/vis/docs/CANVAS_ARCHITECTURE.md +307 -0
scitex/vis/editor/__init__.py +1 -1
scitex/vis/editor/_dearpygui_editor.py +1830 -0
scitex/vis/editor/_defaults.py +40 -1
scitex/vis/editor/_edit.py +54 -18
scitex/vis/editor/_flask_editor.py +37 -0
scitex/vis/editor/_qt_editor.py +865 -0
scitex/vis/editor/flask_editor/__init__.py +21 -0
scitex/vis/editor/flask_editor/bbox.py +216 -0
scitex/vis/editor/flask_editor/core.py +152 -0
scitex/vis/editor/flask_editor/plotter.py +130 -0
scitex/vis/editor/flask_editor/renderer.py +184 -0
scitex/vis/editor/flask_editor/templates/__init__.py +33 -0
scitex/vis/editor/flask_editor/templates/html.py +295 -0
scitex/vis/editor/flask_editor/templates/scripts.py +614 -0
scitex/vis/editor/flask_editor/templates/styles.py +549 -0
scitex/vis/editor/flask_editor/utils.py +81 -0
scitex/vis/io/__init__.py +84 -21
scitex/vis/io/canvas.py +226 -0
scitex/vis/io/data.py +204 -0
scitex/vis/io/directory.py +202 -0
scitex/vis/io/export.py +460 -0
scitex/vis/io/panel.py +424 -0
{scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/METADATA +9 -2
{scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/RECORD +61 -32
scitex/vis/DJANGO_INTEGRATION.md +0 -677
scitex/vis/editor/_web_editor.py +0 -1440
scitex/vis/tmp.txt +0 -239
{scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/WHEEL +0 -0
{scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/entry_points.txt +0 -0
{scitex-2.4.2.dist-info → scitex-2.5.0.dist-info}/licenses/LICENSE +0 -0

scitex/scholar/pdf_download/strategies/open_access_download.py ADDED Viewed

@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# File: ./src/scitex/scholar/pdf_download/strategies/open_access_download.py
+"""
+Open Access PDF Download Strategy.
+Downloads PDFs from known Open Access sources with appropriate handling
+for each source type (arXiv, PubMed Central, OpenAlex OA URLs, etc.).
+"""
+from pathlib import Path
+from typing import Optional, Dict, Any
+import aiohttp
+from scitex import logging
+logger = logging.getLogger(__name__)
+# Known OA sources: each entry maps a source name to the URL substrings that identify it ('patterns') and a callable that rewrites a matching URL into a PDF URL ('pdf_transform')
+OA_SOURCE_PATTERNS = {
+    'arxiv': {
+        'patterns': ['arxiv.org'],
+        'pdf_transform': lambda url: url.replace('/abs/', '/pdf/') + '.pdf' if '/abs/' in url else url,  # /abs/<id> -> /pdf/<id>.pdf; URLs without '/abs/' pass through unchanged
+    },
+    'pmc': {
+        'patterns': ['ncbi.nlm.nih.gov/pmc', 'europepmc.org'],
+        'pdf_transform': lambda url: url,  # PMC links are usually direct
+    },
+    'biorxiv': {
+        'patterns': ['biorxiv.org', 'medrxiv.org'],
+        'pdf_transform': lambda url: url + '.full.pdf' if not url.endswith('.pdf') else url,  # append '.full.pdf' unless the URL already ends with '.pdf'
+    },
+    'doaj': {
+        'patterns': ['doaj.org'],
+        'pdf_transform': lambda url: url,  # returned unchanged
+    },
+    'zenodo': {
+        'patterns': ['zenodo.org'],
+        'pdf_transform': lambda url: url,  # returned unchanged
+    },
+}
+def _identify_oa_source(url: str) -> Optional[str]:
+    """Identify which OA source a URL belongs to."""
+    url_lower = url.lower()
+    for source_name, config in OA_SOURCE_PATTERNS.items():
+        for pattern in config['patterns']:
+            if pattern in url_lower:
+                return source_name
+    return None
+def _transform_to_pdf_url(url: str, source: str) -> str:
+    """Transform URL to direct PDF URL based on source."""
+    if source in OA_SOURCE_PATTERNS:
+        transform_func = OA_SOURCE_PATTERNS[source]['pdf_transform']
+        return transform_func(url)
+    return url
+async def try_download_open_access_async(
+    oa_url: str,
+    output_path: Path,
+    metadata: Optional[Dict[str, Any]] = None,
+    func_name: str = "try_download_open_access_async",
+    timeout: int = 60,
+) -> Optional[Path]:
+    """
+    Download PDF from an Open Access URL.
+    This strategy is simpler than browser-based strategies because OA PDFs
+    are typically directly accessible without authentication.
+    Args:
+        oa_url: Open Access URL (from OpenAlex oa_url, arXiv, PMC, etc.)
+        output_path: Path to save the downloaded PDF
+        metadata: Optional paper metadata for logging
+        func_name: Function name for logging
+        timeout: Download timeout in seconds
+    Returns:
+        Path to downloaded PDF if successful, None otherwise
+    """
+    if not oa_url:
+        logger.debug(f"{func_name}: No OA URL provided")
+        return None
+    # Identify source and transform URL if needed
+    source = _identify_oa_source(oa_url)
+    pdf_url = _transform_to_pdf_url(oa_url, source) if source else oa_url
+    logger.info(f"{func_name}: Attempting OA download from {source or 'unknown'}: {pdf_url[:80]}...")
+    try:
+        # Create output directory if needed
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        # Use aiohttp for async download
+        async with aiohttp.ClientSession() as session:
+            headers = {
+                'User-Agent': 'SciTeX/1.0 (Academic Research Tool; mailto:contact@scitex.io)',
+                'Accept': 'application/pdf,*/*',
+            }
+            async with session.get(pdf_url, headers=headers, timeout=aiohttp.ClientTimeout(total=timeout)) as response:
+                if response.status != 200:
+                    logger.warning(f"{func_name}: HTTP {response.status} from {pdf_url}")
+                    return None
+                content_type = response.headers.get('Content-Type', '')
+                # Verify we're getting a PDF
+                if 'pdf' not in content_type.lower() and not pdf_url.endswith('.pdf'):
+                    # Some servers don't set content-type correctly, check magic bytes
+                    first_bytes = await response.content.read(5)
+                    if first_bytes != b'%PDF-':
+                        logger.warning(f"{func_name}: Response is not a PDF (content-type: {content_type})")
+                        return None
+                    # Reset for full download
+                    content = first_bytes + await response.content.read()
+                else:
+                    content = await response.read()
+                # Validate PDF content
+                if len(content) < 1000:  # PDF should be at least 1KB
+                    logger.warning(f"{func_name}: Downloaded content too small ({len(content)} bytes)")
+                    return None
+                if not content.startswith(b'%PDF-'):
+                    logger.warning(f"{func_name}: Downloaded content is not a valid PDF")
+                    return None
+                # Save to file
+                with open(output_path, 'wb') as f:
+                    f.write(content)
+                size_mb = len(content) / 1024 / 1024
+                logger.info(f"{func_name}: Successfully downloaded {size_mb:.2f} MB to {output_path}")
+                return output_path
+    except aiohttp.ClientError as e:
+        logger.warning(f"{func_name}: HTTP client error: {e}")
+        return None
+    except TimeoutError:
+        logger.warning(f"{func_name}: Download timed out after {timeout}s")
+        return None
+    except Exception as e:
+        logger.error(f"{func_name}: Download failed: {e}")
+        return None
+def try_download_open_access_sync(
+    oa_url: str,
+    output_path: Path,
+    metadata: Optional[Dict[str, Any]] = None,
+    timeout: int = 60,
+) -> Optional[Path]:
+    """
+    Synchronous wrapper for try_download_open_access_async.
+    Args:
+        oa_url: Open Access URL
+        output_path: Path to save the downloaded PDF
+        metadata: Optional paper metadata
+        timeout: Download timeout in seconds
+    Returns:
+        Path to downloaded PDF if successful, None otherwise
+    """
+    import asyncio
+    try:
+        loop = asyncio.get_event_loop()
+    except RuntimeError:
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+    return loop.run_until_complete(
+        try_download_open_access_async(oa_url, output_path, metadata, timeout=timeout)
+    )
+# EOF

scitex/scholar/pipelines/ScholarPipelineSearchParallel.py CHANGED Viewed

@@ -32,6 +32,7 @@ from datetime import datetime
 from scitex import logging
 from scitex.scholar.core import Paper
+from scitex.scholar.core import normalize_journal_name
 from scitex.scholar.search_engines.individual.PubMedSearchEngine import PubMedSearchEngine
 from scitex.scholar.search_engines.individual.CrossRefSearchEngine import CrossRefSearchEngine
 from scitex.scholar.search_engines.individual.ArXivSearchEngine import ArXivSearchEngine
@@ -331,12 +332,18 @@ class ScholarPipelineSearchParallel:
                 if 'metrics' in result:
                     if result['metrics'].get('citation_count'):
                         paper.metadata.citation_count.total = result['metrics']['citation_count']
-                    # Note: is_open_access not in Paper structure
+                    if 'is_open_access' in result['metrics']:
+                        paper.metadata.access.is_open_access = result['metrics']['is_open_access']
+                        paper.metadata.access.is_open_access_engines = [engine_name]
                 if 'urls' in result:
                     if result['urls'].get('pdf'):
                         # pdfs is a list of dicts with url/source keys
                         paper.metadata.url.pdfs = [{'url': result['urls']['pdf'], 'source': 'search'}]
+                        # If this is an open access paper, also store the PDF URL as oa_url
+                        if paper.metadata.access.is_open_access:
+                            paper.metadata.access.oa_url = result['urls']['pdf']
+                            paper.metadata.access.oa_url_engines = [engine_name]
                     if result['urls'].get('publisher'):
                         paper.metadata.url.publisher = result['urls']['publisher']
                     if result['urls'].get('doi_url'):
@@ -733,13 +740,21 @@ class ScholarPipelineSearchParallel:
         # Publication info
         if hasattr(meta, 'publication'):
-            result['journal'] = meta.publication.journal or ''
+            journal_raw = meta.publication.journal or ''
+            result['journal'] = normalize_journal_name(journal_raw) if journal_raw else ''
             result['impact_factor'] = meta.publication.impact_factor
         # Metrics
         if hasattr(meta, 'citation_count'):
             result['citation_count'] = meta.citation_count.total or 0
-        result['is_open_access'] = False  # Not stored in current Paper structure
+        # Access metadata
+        if hasattr(meta, 'access'):
+            result['is_open_access'] = meta.access.is_open_access or False
+            result['oa_status'] = meta.access.oa_status
+            result['oa_url'] = meta.access.oa_url
+        else:
+            result['is_open_access'] = False
         # URLs
         if hasattr(meta, 'url'):

scitex/scholar/pipelines/ScholarPipelineSearchSingle.py CHANGED Viewed

@@ -268,12 +268,18 @@ class ScholarPipelineSearchSingle:
                 if 'metrics' in result:
                     if result['metrics'].get('citation_count'):
                         paper.metadata.citation_count.total = result['metrics']['citation_count']
-                    # Note: is_open_access not in Paper structure
+                    if 'is_open_access' in result['metrics']:
+                        paper.metadata.access.is_open_access = result['metrics']['is_open_access']
+                        paper.metadata.access.is_open_access_engines = [engine_name]
                 if 'urls' in result:
                     if result['urls'].get('pdf'):
                         # pdfs is a list of dicts with url/source keys
                         paper.metadata.url.pdfs = [{'url': result['urls']['pdf'], 'source': 'search'}]
+                        # If this is an open access paper, also store the PDF URL as oa_url
+                        if paper.metadata.access.is_open_access:
+                            paper.metadata.access.oa_url = result['urls']['pdf']
+                            paper.metadata.access.oa_url_engines = [engine_name]
                     if result['urls'].get('publisher'):
                         paper.metadata.url.publisher = result['urls']['publisher']
                     if result['urls'].get('doi_url'):
@@ -461,7 +467,14 @@ class ScholarPipelineSearchSingle:
         # Metrics
         if hasattr(meta, 'citation_count'):
             result['citation_count'] = meta.citation_count.total or 0
-        result['is_open_access'] = False  # Not stored in current Paper structure
+        # Access metadata
+        if hasattr(meta, 'access'):
+            result['is_open_access'] = meta.access.is_open_access or False
+            result['oa_status'] = meta.access.oa_status
+            result['oa_url'] = meta.access.oa_url
+        else:
+            result['is_open_access'] = False
         # URLs
         if hasattr(meta, 'url'):

scitex/session/_decorator.py CHANGED Viewed

@@ -495,6 +495,7 @@ def _add_argument(
         type_hints: Type hints dictionary
         short_form: Optional short form (e.g., 'a' for -a)
     """
+    from typing import get_origin, get_args, Literal
     # Get type
     param_type = type_hints.get(param_name, param.annotation)
@@ -513,6 +514,13 @@ def _add_argument(
     if short_form:
         arg_names.insert(0, f"-{short_form}")
+    # Check for Literal type (choices)
+    choices = None
+    origin = get_origin(param_type)
+    if origin is Literal:
+        choices = list(get_args(param_type))
+        param_type = type(choices[0]) if choices else str
     # Handle different types
     if param_type == bool:
         # Boolean flags
@@ -524,11 +532,15 @@ def _add_argument(
         )
     else:
         # Regular arguments
+        choices_str = f", choices: {choices}" if choices else ""
         kwargs = {
             'type': param_type,
-            'help': f"(default: {default})" if has_default else "(required)",
+            'help': f"(default: {default}{choices_str})" if has_default else f"(required{choices_str})",
         }
+        if choices:
+            kwargs['choices'] = choices
         if has_default:
             kwargs['default'] = default
         else:

scitex 2.4.2__py3-none-any.whl → 2.5.0__py3-none-any.whl

scitex 2.4.2__py3-none-any.whl → 2.5.0__py3-none-any.whl