PyPI - scitex - Versions diffs - 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl - Mend

scitex 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

scitex/__version__.py +1 -1
scitex/browser/__init__.py +53 -0
scitex/browser/auth/__init__.py +35 -0
scitex/browser/auth/google.py +381 -0
scitex/browser/collaboration/__init__.py +5 -0
scitex/browser/debugging/__init__.py +56 -0
scitex/browser/debugging/_failure_capture.py +372 -0
scitex/browser/debugging/_sync_session.py +259 -0
scitex/browser/debugging/_test_monitor.py +284 -0
scitex/browser/debugging/_visual_cursor.py +432 -0
scitex/scholar/citation_graph/README.md +117 -0
scitex/scholar/citation_graph/__init__.py +29 -0
scitex/scholar/citation_graph/builder.py +214 -0
scitex/scholar/citation_graph/database.py +246 -0
scitex/scholar/citation_graph/example.py +96 -0
scitex/scholar/citation_graph/models.py +80 -0
scitex/scholar/config/ScholarConfig.py +23 -3
scitex/scholar/config/default.yaml +56 -0
scitex/scholar/core/Paper.py +102 -0
scitex/scholar/core/__init__.py +44 -0
scitex/scholar/core/journal_normalizer.py +524 -0
scitex/scholar/core/oa_cache.py +285 -0
scitex/scholar/core/open_access.py +457 -0
scitex/scholar/metadata_engines/ScholarEngine.py +9 -1
scitex/scholar/metadata_engines/individual/CrossRefLocalEngine.py +82 -21
scitex/scholar/pdf_download/ScholarPDFDownloader.py +137 -0
scitex/scholar/pdf_download/strategies/__init__.py +6 -0
scitex/scholar/pdf_download/strategies/open_access_download.py +186 -0
scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +27 -9
scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +24 -8
scitex/scholar/search_engines/ScholarSearchEngine.py +6 -1
{scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/METADATA +1 -1
{scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/RECORD +36 -20
{scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/WHEEL +0 -0
{scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/entry_points.txt +0 -0
{scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/licenses/LICENSE +0 -0

scitex/scholar/pdf_download/ScholarPDFDownloader.py CHANGED Viewed

@@ -33,6 +33,7 @@ from scitex.scholar.pdf_download.strategies import (
     try_download_direct_async,
     try_download_manual_async,
     try_download_response_body_async,
+    try_download_open_access_async,
 )
 logger = logging.getLogger(__name__)
@@ -65,6 +66,17 @@ class ScholarPDFDownloader:
         self.context = context
         self.output_dir = self.config.get_library_downloads_dir()
+        # Load access preferences from config
+        self.prefer_open_access = self.config.resolve(
+            "prefer_open_access", default=True, type=bool
+        )
+        self.enable_paywall_access = self.config.resolve(
+            "enable_paywall_access", default=False, type=bool
+        )
+        self.track_paywall_attempts = self.config.resolve(
+            "track_paywall_attempts", default=True, type=bool
+        )
     async def __aexit__(self, exc_type, exc_val, exc_tb):
         pass
@@ -130,6 +142,131 @@ class ScholarPDFDownloader:
         )
         return saved_paths
+    async def download_open_access(
+        self,
+        oa_url: str,
+        output_path: Union[str, Path],
+        metadata: Optional[dict] = None,
+    ) -> Optional[Path]:
+        """Fetch a PDF straight from a known Open Access URL.
+        The actual transfer is delegated to ``try_download_open_access_async``;
+        this method only normalises the destination path, creates its parent
+        directory and logs the outcome.
+        Args:
+            oa_url: Open Access URL (from paper.metadata.access.oa_url)
+            output_path: Destination of the PDF; ".pdf" is appended when missing
+            metadata: Optional paper metadata, forwarded to the download strategy
+        Returns:
+            Path to the downloaded PDF if successful, None otherwise
+        """
+        if not oa_url:
+            logger.debug(f"{self.name}: No OA URL provided")
+            return None
+        # Normalise the destination to a Path whose string form ends in ".pdf"
+        target = Path(output_path) if isinstance(output_path, str) else output_path
+        if not str(target).endswith(".pdf"):
+            target = Path(f"{target}.pdf")
+        target.parent.mkdir(parents=True, exist_ok=True)
+        logger.info(f"{self.name}: Attempting OA download from {oa_url[:60]}...")
+        saved_path = await try_download_open_access_async(
+            oa_url=oa_url,
+            output_path=target,
+            metadata=metadata,
+            func_name=self.name,
+        )
+        if not saved_path:
+            logger.debug(f"{self.name}: OA download failed, may need browser-based download")
+            return saved_path
+        logger.info(f"{self.name}: Successfully downloaded OA PDF to {saved_path}")
+        return saved_path
+    async def download_smart(
+        self,
+        paper,
+        output_path: Union[str, Path],
+    ) -> Optional[Path]:
+        """Download a paper's PDF, choosing a strategy from its metadata.
+        Strategies are tried in order and the first success is returned:
+        1. The Open Access URL, when prefer_open_access is True and one exists
+        2. Each entry of the paper's PDF URL list
+        3. The DOI resolver URL, only when enable_paywall_access is True and
+           the paper is not flagged as Open Access
+        Args:
+            paper: Object exposing ``metadata`` (or the metadata object itself);
+                its ``access``, ``url`` and ``id`` sub-objects are read when present
+            output_path: Destination of the PDF; ".pdf" is appended when missing
+        Returns:
+            Path to downloaded PDF if any strategy succeeds, None otherwise
+        """
+        if isinstance(output_path, str):
+            output_path = Path(output_path)
+        if not str(output_path).endswith(".pdf"):
+            output_path = Path(str(output_path) + ".pdf")
+        # Extract metadata; a bare metadata object may be passed instead of a Paper
+        meta = paper.metadata if hasattr(paper, 'metadata') else paper
+        access = getattr(meta, 'access', None)
+        url_meta = getattr(meta, 'url', None)
+        id_meta = getattr(meta, 'id', None)
+        is_open_access = getattr(access, 'is_open_access', False) if access else False
+        oa_url = getattr(access, 'oa_url', None) if access else None
+        # `pdfs` may be present but None; normalise so the loop below never iterates None
+        pdf_urls = (getattr(url_meta, 'pdfs', None) or []) if url_meta else []
+        doi = getattr(id_meta, 'doi', None) if id_meta else None
+        logger.info(f"{self.name}: Smart download for DOI={doi}, OA={is_open_access}")
+        # Strategy 1: Try Open Access if available
+        if self.prefer_open_access and oa_url:
+            logger.info(f"{self.name}: Trying Open Access URL first")
+            result = await self.download_open_access(oa_url, output_path)
+            if result:
+                # Record that no paywall access was needed for this paper
+                if access and self.track_paywall_attempts:
+                    access.paywall_bypass_attempted = False
+                return result
+        # Strategy 2: Try available PDF URLs (entries are dicts with a 'url' key or plain URLs)
+        for pdf_entry in pdf_urls:
+            pdf_url = pdf_entry.get('url') if isinstance(pdf_entry, dict) else pdf_entry
+            if pdf_url:
+                logger.info(f"{self.name}: Trying PDF URL: {pdf_url[:60]}...")
+                result = await self.download_from_url(pdf_url, output_path, doi=doi)
+                if result:
+                    return result
+        # Strategy 3: Try paywall access if enabled (opt-in, non-OA papers only)
+        if self.enable_paywall_access and not is_open_access:
+            logger.info(f"{self.name}: Attempting paywall access (opt-in enabled)")
+            if access and self.track_paywall_attempts:
+                access.paywall_bypass_attempted = True
+            # Use DOI-based URL if available
+            if doi:
+                doi_url = f"https://doi.org/{doi}"
+                result = await self.download_from_url(doi_url, output_path, doi=doi)
+                if access and self.track_paywall_attempts:
+                    access.paywall_bypass_success = bool(result)
+                if result:
+                    return result
+        logger.warning(f"{self.name}: All download strategies exhausted for DOI={doi}")
+        return None
     async def download_from_url(
         self,
         pdf_url: str,

scitex/scholar/pdf_download/strategies/__init__.py CHANGED Viewed

@@ -11,6 +11,10 @@ from .chrome_pdf_viewer import try_download_chrome_pdf_viewer_async
 from .direct_download import try_download_direct_async
 from .response_body import try_download_response_body_async
 from .manual_download_fallback import try_download_manual_async
+from .open_access_download import (
+    try_download_open_access_async,
+    try_download_open_access_sync,
+)
 # Manual download utilities
 from .manual_download_utils import (
@@ -27,6 +31,8 @@ __all__ = [
     "try_download_direct_async",
     "try_download_response_body_async",
     "try_download_manual_async",
+    "try_download_open_access_async",
+    "try_download_open_access_sync",
     # Manual download utilities
     "DownloadMonitorAndSync",
     "FlexibleFilenameGenerator",

scitex/scholar/pdf_download/strategies/open_access_download.py ADDED Viewed

@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# File: ./src/scitex/scholar/pdf_download/strategies/open_access_download.py
+"""
+Open Access PDF Download Strategy.
+Downloads PDFs from known Open Access sources with appropriate handling
+for each source type (arXiv, PubMed Central, OpenAlex OA URLs, etc.).
+"""
+from pathlib import Path
+from typing import Optional, Dict, Any
+import aiohttp
+from scitex import logging
+logger = logging.getLogger(__name__)
+# Known OA source patterns and their handlers
+# Maps a source name to:
+#   'patterns'      - lowercase substrings looked up in the lowercased URL
+#   'pdf_transform' - callable turning a landing-page URL into a PDF URL
+OA_SOURCE_PATTERNS = {
+    'arxiv': {
+        'patterns': ['arxiv.org'],
+        # Abstract pages ('/abs/') are rewritten to '/pdf/' with '.pdf' appended;
+        # any other arXiv URL is returned unchanged
+        'pdf_transform': lambda url: url.replace('/abs/', '/pdf/') + '.pdf' if '/abs/' in url else url,
+    },
+    'pmc': {
+        'patterns': ['ncbi.nlm.nih.gov/pmc', 'europepmc.org'],
+        'pdf_transform': lambda url: url,  # PMC links are usually direct
+    },
+    'biorxiv': {
+        'patterns': ['biorxiv.org', 'medrxiv.org'],
+        # '.full.pdf' is appended unless the URL already ends in '.pdf'
+        'pdf_transform': lambda url: url + '.full.pdf' if not url.endswith('.pdf') else url,
+    },
+    'doaj': {
+        'patterns': ['doaj.org'],
+        # Identity transform: the URL is used as given
+        'pdf_transform': lambda url: url,
+    },
+    'zenodo': {
+        'patterns': ['zenodo.org'],
+        # Identity transform: the URL is used as given
+        'pdf_transform': lambda url: url,
+    },
+}
+def _identify_oa_source(url: str) -> Optional[str]:
+    """Return the name of the first known OA source matching *url*.
+    The URL is lowercased and each source's patterns are tested as plain
+    substrings, in ``OA_SOURCE_PATTERNS`` order. Returns None when no
+    pattern occurs in the URL.
+    """
+    lowered = url.lower()
+    return next(
+        (
+            name
+            for name, spec in OA_SOURCE_PATTERNS.items()
+            if any(fragment in lowered for fragment in spec['patterns'])
+        ),
+        None,
+    )
+def _transform_to_pdf_url(url: str, source: str) -> str:
+    """Apply the 'pdf_transform' registered for *source* to *url*.
+    Sources missing from ``OA_SOURCE_PATTERNS`` leave the URL untouched.
+    """
+    spec = OA_SOURCE_PATTERNS.get(source)
+    if spec is None:
+        return url
+    return spec['pdf_transform'](url)
+async def try_download_open_access_async(
+    oa_url: str,
+    output_path: Path,
+    metadata: Optional[Dict[str, Any]] = None,
+    func_name: str = "try_download_open_access_async",
+    timeout: int = 60,
+) -> Optional[Path]:
+    """
+    Download PDF from an Open Access URL.
+    The URL is first mapped to a direct PDF URL for known sources, then
+    fetched with a plain HTTP GET. The body is accepted only if it is at
+    least 1000 bytes long and starts with the ``%PDF-`` magic bytes.
+    Args:
+        oa_url: Open Access URL (from OpenAlex oa_url, arXiv, PMC, etc.)
+        output_path: Path to save the downloaded PDF
+        metadata: Optional paper metadata (currently unused by this function)
+        func_name: Function name for logging
+        timeout: Total download timeout in seconds
+    Returns:
+        Path to downloaded PDF if successful, None otherwise (all errors are
+        logged and swallowed)
+    """
+    # Local import: asyncio.TimeoutError is not the builtin TimeoutError before Python 3.11
+    import asyncio
+    if not oa_url:
+        logger.debug(f"{func_name}: No OA URL provided")
+        return None
+    # Identify source and transform URL if needed
+    source = _identify_oa_source(oa_url)
+    pdf_url = _transform_to_pdf_url(oa_url, source) if source else oa_url
+    logger.info(f"{func_name}: Attempting OA download from {source or 'unknown'}: {pdf_url[:80]}...")
+    try:
+        # Create output directory if needed
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        headers = {
+            'User-Agent': 'SciTeX/1.0 (Academic Research Tool; mailto:contact@scitex.io)',
+            'Accept': 'application/pdf,*/*',
+        }
+        client_timeout = aiohttp.ClientTimeout(total=timeout)
+        # Use aiohttp for async download
+        async with aiohttp.ClientSession() as session:
+            async with session.get(pdf_url, headers=headers, timeout=client_timeout) as response:
+                if response.status != 200:
+                    logger.warning(f"{func_name}: HTTP {response.status} from {pdf_url}")
+                    return None
+                content_type = response.headers.get('Content-Type', '')
+                # Verify we're getting a PDF
+                if 'pdf' not in content_type.lower() and not pdf_url.endswith('.pdf'):
+                    # Some servers don't set content-type correctly, check magic bytes.
+                    # readexactly() is used because read(5) may return fewer than 5
+                    # bytes when the first network chunk is short.
+                    try:
+                        first_bytes = await response.content.readexactly(5)
+                    except asyncio.IncompleteReadError as e:
+                        first_bytes = e.partial  # body shorter than 5 bytes
+                    if first_bytes != b'%PDF-':
+                        logger.warning(f"{func_name}: Response is not a PDF (content-type: {content_type})")
+                        return None
+                    # Prepend the already-consumed bytes to the rest of the stream
+                    content = first_bytes + await response.content.read()
+                else:
+                    content = await response.read()
+                # Validate PDF content
+                if len(content) < 1000:  # PDF should be at least 1KB
+                    logger.warning(f"{func_name}: Downloaded content too small ({len(content)} bytes)")
+                    return None
+                if not content.startswith(b'%PDF-'):
+                    logger.warning(f"{func_name}: Downloaded content is not a valid PDF")
+                    return None
+                # Save to file
+                output_path.write_bytes(content)
+                size_mb = len(content) / 1024 / 1024
+                logger.info(f"{func_name}: Successfully downloaded {size_mb:.2f} MB to {output_path}")
+                return output_path
+    except (asyncio.TimeoutError, TimeoutError):
+        # Checked before ClientError: aiohttp's timeout errors subclass both
+        logger.warning(f"{func_name}: Download timed out after {timeout}s")
+        return None
+    except aiohttp.ClientError as e:
+        logger.warning(f"{func_name}: HTTP client error: {e}")
+        return None
+    except Exception as e:
+        logger.error(f"{func_name}: Download failed: {e}")
+        return None
+def try_download_open_access_sync(
+    oa_url: str,
+    output_path: Path,
+    metadata: Optional[Dict[str, Any]] = None,
+    timeout: int = 60,
+) -> Optional[Path]:
+    """
+    Synchronous wrapper for try_download_open_access_async.
+    Runs the coroutine to completion on a private event loop that is closed
+    afterwards, so the thread's current event loop is neither replaced nor
+    leaked.
+    Args:
+        oa_url: Open Access URL
+        output_path: Path to save the downloaded PDF
+        metadata: Optional paper metadata
+        timeout: Download timeout in seconds
+    Returns:
+        Path to downloaded PDF if successful, None otherwise
+    Raises:
+        RuntimeError: If called from a thread whose event loop is already
+            running (use the async variant there instead)
+    """
+    import asyncio
+    # asyncio.get_event_loop() is deprecated when no loop is running, and the
+    # fallback loop it was paired with was never closed; use a dedicated loop.
+    loop = asyncio.new_event_loop()
+    try:
+        return loop.run_until_complete(
+            try_download_open_access_async(oa_url, output_path, metadata, timeout=timeout)
+        )
+    finally:
+        loop.close()
+# EOF

scitex/scholar/pipelines/ScholarPipelineSearchParallel.py CHANGED Viewed

@@ -32,6 +32,7 @@ from datetime import datetime
 from scitex import logging
 from scitex.scholar.core import Paper
+from scitex.scholar.core import normalize_journal_name
 from scitex.scholar.search_engines.individual.PubMedSearchEngine import PubMedSearchEngine
 from scitex.scholar.search_engines.individual.CrossRefSearchEngine import CrossRefSearchEngine
 from scitex.scholar.search_engines.individual.ArXivSearchEngine import ArXivSearchEngine
@@ -49,6 +50,7 @@ class ScholarPipelineSearchParallel:
         max_workers: int = 5,
         timeout_per_engine: float = 30.0,
         use_cache: bool = True,
+        email: str = None,
     ):
         """Initialize parallel search pipeline.
@@ -56,19 +58,21 @@ class ScholarPipelineSearchParallel:
             max_workers: Maximum number of parallel engine queries
             timeout_per_engine: Timeout for each engine in seconds
             use_cache: Whether to use caching for API results
+            email: User email for API rate limit benefits (PubMed, CrossRef, OpenAlex)
         """
         self.name = self.__class__.__name__
         self.max_workers = max_workers
         self.timeout_per_engine = timeout_per_engine
         self.use_cache = use_cache
+        self.email = email or "research@scitex.io"
-        # Initialize search engines
+        # Initialize search engines with email for rate limit benefits
         self.engines = {
-            'PubMed': PubMedSearchEngine(),
-            'CrossRef': CrossRefSearchEngine(),
-            'arXiv': ArXivSearchEngine(),
-            'Semantic_Scholar': SemanticScholarSearchEngine(),
-            'OpenAlex': OpenAlexSearchEngine(),
+            'PubMed': PubMedSearchEngine(email=self.email),
+            'CrossRef': CrossRefSearchEngine(email=self.email),
+            'arXiv': ArXivSearchEngine(email=self.email),
+            'Semantic_Scholar': SemanticScholarSearchEngine(email=self.email),
+            'OpenAlex': OpenAlexSearchEngine(email=self.email),
         }
         # Statistics
@@ -328,12 +332,18 @@ class ScholarPipelineSearchParallel:
                 if 'metrics' in result:
                     if result['metrics'].get('citation_count'):
                         paper.metadata.citation_count.total = result['metrics']['citation_count']
-                    # Note: is_open_access not in Paper structure
+                    if 'is_open_access' in result['metrics']:
+                        paper.metadata.access.is_open_access = result['metrics']['is_open_access']
+                        paper.metadata.access.is_open_access_engines = [engine_name]
                 if 'urls' in result:
                     if result['urls'].get('pdf'):
                         # pdfs is a list of dicts with url/source keys
                         paper.metadata.url.pdfs = [{'url': result['urls']['pdf'], 'source': 'search'}]
+                        # If this is an open access paper, also store the PDF URL as oa_url
+                        if paper.metadata.access.is_open_access:
+                            paper.metadata.access.oa_url = result['urls']['pdf']
+                            paper.metadata.access.oa_url_engines = [engine_name]
                     if result['urls'].get('publisher'):
                         paper.metadata.url.publisher = result['urls']['publisher']
                     if result['urls'].get('doi_url'):
@@ -730,13 +740,21 @@ class ScholarPipelineSearchParallel:
         # Publication info
         if hasattr(meta, 'publication'):
-            result['journal'] = meta.publication.journal or ''
+            journal_raw = meta.publication.journal or ''
+            result['journal'] = normalize_journal_name(journal_raw) if journal_raw else ''
             result['impact_factor'] = meta.publication.impact_factor
         # Metrics
         if hasattr(meta, 'citation_count'):
             result['citation_count'] = meta.citation_count.total or 0
-        result['is_open_access'] = False  # Not stored in current Paper structure
+        # Access metadata
+        if hasattr(meta, 'access'):
+            result['is_open_access'] = meta.access.is_open_access or False
+            result['oa_status'] = meta.access.oa_status
+            result['oa_url'] = meta.access.oa_url
+        else:
+            result['is_open_access'] = False
         # URLs
         if hasattr(meta, 'url'):

scitex/scholar/pipelines/ScholarPipelineSearchSingle.py CHANGED Viewed

@@ -47,22 +47,25 @@ class ScholarPipelineSearchSingle:
     def __init__(
         self,
         use_cache: bool = True,
+        email: str = None,
     ):
         """Initialize sequential search pipeline.
         Args:
             use_cache: Whether to use caching for API results
+            email: User email for API rate limit benefits (PubMed, CrossRef, OpenAlex)
         """
         self.name = self.__class__.__name__
         self.use_cache = use_cache
+        self.email = email or "research@scitex.io"
-        # Initialize search engines
+        # Initialize search engines with email for rate limit benefits
         self.engines = {
-            'PubMed': PubMedSearchEngine(),
-            'CrossRef': CrossRefSearchEngine(),
-            'arXiv': ArXivSearchEngine(),
-            'Semantic_Scholar': SemanticScholarSearchEngine(),
-            'OpenAlex': OpenAlexSearchEngine(),
+            'PubMed': PubMedSearchEngine(email=self.email),
+            'CrossRef': CrossRefSearchEngine(email=self.email),
+            'arXiv': ArXivSearchEngine(email=self.email),
+            'Semantic_Scholar': SemanticScholarSearchEngine(email=self.email),
+            'OpenAlex': OpenAlexSearchEngine(email=self.email),
         }
         # Statistics
@@ -265,12 +268,18 @@ class ScholarPipelineSearchSingle:
                 if 'metrics' in result:
                     if result['metrics'].get('citation_count'):
                         paper.metadata.citation_count.total = result['metrics']['citation_count']
-                    # Note: is_open_access not in Paper structure
+                    if 'is_open_access' in result['metrics']:
+                        paper.metadata.access.is_open_access = result['metrics']['is_open_access']
+                        paper.metadata.access.is_open_access_engines = [engine_name]
                 if 'urls' in result:
                     if result['urls'].get('pdf'):
                         # pdfs is a list of dicts with url/source keys
                         paper.metadata.url.pdfs = [{'url': result['urls']['pdf'], 'source': 'search'}]
+                        # If this is an open access paper, also store the PDF URL as oa_url
+                        if paper.metadata.access.is_open_access:
+                            paper.metadata.access.oa_url = result['urls']['pdf']
+                            paper.metadata.access.oa_url_engines = [engine_name]
                     if result['urls'].get('publisher'):
                         paper.metadata.url.publisher = result['urls']['publisher']
                     if result['urls'].get('doi_url'):
@@ -458,7 +467,14 @@ class ScholarPipelineSearchSingle:
         # Metrics
         if hasattr(meta, 'citation_count'):
             result['citation_count'] = meta.citation_count.total or 0
-        result['is_open_access'] = False  # Not stored in current Paper structure
+        # Access metadata
+        if hasattr(meta, 'access'):
+            result['is_open_access'] = meta.access.is_open_access or False
+            result['oa_status'] = meta.access.oa_status
+            result['oa_url'] = meta.access.oa_url
+        else:
+            result['is_open_access'] = False
         # URLs
         if hasattr(meta, 'url'):

scitex/scholar/search_engines/ScholarSearchEngine.py CHANGED Viewed

@@ -45,26 +45,31 @@ class ScholarSearchEngine:
         self,
         default_mode: Literal['parallel', 'single'] = 'parallel',
         use_cache: bool = True,
+        email: str = None,
     ):
         """Initialize unified search engine.
         Args:
             default_mode: Default search mode ('parallel' or 'single')
             use_cache: Whether to use caching for API results
+            email: User email for API rate limit benefits (PubMed, CrossRef, OpenAlex)
         """
         self.name = self.__class__.__name__
         self.default_mode = default_mode
         self.use_cache = use_cache
+        self.email = email
-        # Initialize both pipeline modes
+        # Initialize both pipeline modes with email for rate limit benefits
         self.parallel_pipeline = ScholarPipelineSearchParallel(
             max_workers=5,
             timeout_per_engine=30.0,
             use_cache=use_cache,
+            email=email,
         )
         self.single_pipeline = ScholarPipelineSearchSingle(
             use_cache=use_cache,
+            email=email,
         )
         # Statistics

{scitex-2.4.1.dist-info → scitex-2.4.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scitex
-Version: 2.4.1
+Version: 2.4.3
 Summary: A comprehensive Python library for scientific computing and data analysis
 Project-URL: Homepage, https://github.com/ywatanabe1989/scitex-code
 Project-URL: Documentation, https://scitex.readthedocs.io

scitex 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl

scitex 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl