academic-refchecker 2.0.11.tar.gz → 2.0.12.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-2.0.11/academic_refchecker.egg-info → academic_refchecker-2.0.12}/PKG-INFO +2 -1
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12/academic_refchecker.egg-info}/PKG-INFO +2 -1
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/academic_refchecker.egg-info/SOURCES.txt +2 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/academic_refchecker.egg-info/requires.txt +1 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/pyproject.toml +1 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/requirements.txt +1 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/__version__.py +1 -1
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/checkers/__init__.py +3 -1
- academic_refchecker-2.0.12/src/refchecker/checkers/arxiv_citation.py +460 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/checkers/enhanced_hybrid_checker.py +24 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/config/settings.py +8 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/llm/providers.py +20 -1
- academic_refchecker-2.0.12/src/refchecker/utils/arxiv_rate_limiter.py +133 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/text_utils.py +32 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/LICENSE +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/MANIFEST.in +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/README.md +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/__main__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/cli.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/concurrency.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/database.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/main.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/models.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/refchecker_wrapper.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/static/assets/index-2P6L_39v.css +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/static/assets/index-hk21nqxR.js +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/static/favicon.svg +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/static/index.html +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/static/vite.svg +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/thumbnail.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/backend/websocket_manager.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/scripts/download_db.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/scripts/run_tests.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/setup.cfg +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/__main__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/checkers/crossref.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/checkers/github_checker.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/checkers/openalex.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/checkers/openreview_checker.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/checkers/pdf_paper_checker.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/checkers/semantic_scholar.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/checkers/webpage_checker.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/config/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/config/logging.conf +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/core/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/core/db_connection_pool.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/core/parallel_processor.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/core/refchecker.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/database/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/llm/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/llm/base.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/scripts/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/services/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/services/pdf_processor.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/__init__.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/arxiv_utils.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/author_utils.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/biblatex_parser.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/bibliography_utils.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/bibtex_parser.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/config_validator.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/db_utils.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/doi_utils.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/error_utils.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/mock_objects.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/unicode_utils.py +0 -0
- {academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/url_utils.py +0 -0
{academic_refchecker-2.0.11/academic_refchecker.egg-info → academic_refchecker-2.0.12}/PKG-INFO
RENAMED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 2.0.11
+Version: 2.0.12
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT
@@ -32,6 +32,7 @@ Requires-Dist: python-Levenshtein>=0.12.0
 Requires-Dist: pandas<2.4.0,>=1.3.0
 Requires-Dist: numpy<2.0.0,>=1.22.4
 Requires-Dist: pdfplumber>=0.6.0
+Requires-Dist: bibtexparser>=1.4.0
 Provides-Extra: dev
 Requires-Dist: pytest>=6.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=2.0.0; extra == "dev"
```
{academic_refchecker-2.0.11 → academic_refchecker-2.0.12/academic_refchecker.egg-info}/PKG-INFO
RENAMED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 2.0.11
+Version: 2.0.12
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT
@@ -32,6 +32,7 @@ Requires-Dist: python-Levenshtein>=0.12.0
 Requires-Dist: pandas<2.4.0,>=1.3.0
 Requires-Dist: numpy<2.0.0,>=1.22.4
 Requires-Dist: pdfplumber>=0.6.0
+Requires-Dist: bibtexparser>=1.4.0
 Provides-Extra: dev
 Requires-Dist: pytest>=6.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=2.0.0; extra == "dev"
```
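The only dependency change in this release is the new `bibtexparser>=1.4.0` requirement, which the new ArXiv citation checker (below) uses to parse the BibTeX returned by arxiv.org. A minimal sketch of that parsing path, using the same bibtexparser calls that appear in the new module; the sample entry is hypothetical, not taken from this diff:

```python
# Sketch only: confirm the new bibtexparser dependency parses an arXiv-style entry
# the way the new checker does. The sample entry below is a hypothetical example.
import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import convert_to_unicode

sample = """@misc{vaswani2017attention,
  title={Attention Is All You Need},
  author={Vaswani, Ashish and Shazeer, Noam},
  year={2017},
  eprint={1706.03762},
  archivePrefix={arXiv},
}"""

parser = BibTexParser(common_strings=True)
parser.customization = convert_to_unicode      # same customization the checker sets
entry = bibtexparser.loads(sample, parser=parser).entries[0]
print(entry["title"], entry["eprint"])
```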
{academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/academic_refchecker.egg-info/SOURCES.txt
RENAMED

```diff
@@ -31,6 +31,7 @@ src/refchecker/__init__.py
 src/refchecker/__main__.py
 src/refchecker/__version__.py
 src/refchecker/checkers/__init__.py
+src/refchecker/checkers/arxiv_citation.py
 src/refchecker/checkers/crossref.py
 src/refchecker/checkers/enhanced_hybrid_checker.py
 src/refchecker/checkers/github_checker.py
@@ -57,6 +58,7 @@ src/refchecker/scripts/start_vllm_server.py
 src/refchecker/services/__init__.py
 src/refchecker/services/pdf_processor.py
 src/refchecker/utils/__init__.py
+src/refchecker/utils/arxiv_rate_limiter.py
 src/refchecker/utils/arxiv_utils.py
 src/refchecker/utils/author_utils.py
 src/refchecker/utils/biblatex_parser.py
```
{academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/checkers/__init__.py
RENAMED

```diff
@@ -7,11 +7,13 @@ from .local_semantic_scholar import LocalNonArxivReferenceChecker
 from .enhanced_hybrid_checker import EnhancedHybridReferenceChecker
 from .openalex import OpenAlexReferenceChecker
 from .crossref import CrossRefReferenceChecker
+from .arxiv_citation import ArXivCitationChecker
 
 __all__ = [
     "NonArxivReferenceChecker",
     "LocalNonArxivReferenceChecker",
     "EnhancedHybridReferenceChecker",
     "OpenAlexReferenceChecker",
-    "CrossRefReferenceChecker"
+    "CrossRefReferenceChecker",
+    "ArXivCitationChecker",
 ]
```
academic_refchecker-2.0.12/src/refchecker/checkers/arxiv_citation.py
ADDED

```diff
@@ -0,0 +1,460 @@
+#!/usr/bin/env python3
+"""
+ArXiv Citation Checker - Authoritative Source for ArXiv Papers
+
+This module provides functionality to verify ArXiv papers by fetching the official
+BibTeX citation directly from ArXiv. This is used as the authoritative metadata source
+for papers found on ArXiv, as it reflects the author-submitted metadata.
+
+Key features:
+- Fetches official BibTeX from https://arxiv.org/bibtex/{arxiv_id}
+- Always uses the latest version metadata (strips version suffixes)
+- Logs warnings when cited version differs from latest version
+- Parses BibTeX to extract normalized metadata matching refchecker schema
+
+Usage:
+    from refchecker.checkers.arxiv_citation import ArXivCitationChecker
+
+    checker = ArXivCitationChecker()
+
+    reference = {
+        'title': 'Attention Is All You Need',
+        'authors': ['Ashish Vaswani', 'Noam Shazeer'],
+        'year': 2017,
+        'url': 'https://arxiv.org/abs/1706.03762v5',
+    }
+
+    verified_data, errors, url = checker.verify_reference(reference)
+"""
+
+import re
+import logging
+import requests
+from typing import Dict, List, Tuple, Optional, Any
+
+import bibtexparser
+from bibtexparser.bparser import BibTexParser
+from bibtexparser.customization import convert_to_unicode
+
+from refchecker.utils.arxiv_rate_limiter import ArXivRateLimiter
+from refchecker.utils.text_utils import (
+    normalize_text,
+    compare_authors,
+    compare_titles_with_latex_cleaning,
+    strip_latex_commands,
+)
+from refchecker.utils.error_utils import format_title_mismatch, validate_year
+from refchecker.config.settings import get_config
+
+logger = logging.getLogger(__name__)
+
+# Get configuration
+config = get_config()
+SIMILARITY_THRESHOLD = config["text_processing"]["similarity_threshold"]
+
+
+class ArXivCitationChecker:
+    """
+    Reference checker that uses ArXiv's official BibTeX export as the authoritative source.
+
+    This checker fetches the official BibTeX citation from ArXiv for papers identified
+    by their ArXiv ID. It uses the latest version's metadata as the authoritative source
+    and logs warnings when the cited version differs from the latest.
+    """
+
+    def __init__(self, timeout: int = 30):
+        """
+        Initialize the ArXiv Citation Checker.
+
+        Args:
+            timeout: HTTP request timeout in seconds
+        """
+        self.base_url = "https://arxiv.org/bibtex"
+        self.abs_url = "https://arxiv.org/abs"
+        self.timeout = timeout
+        self.rate_limiter = ArXivRateLimiter.get_instance()
+
+        # Pattern to extract arXiv IDs from various URL formats
+        self.arxiv_id_patterns = [
+            # Standard arxiv.org URLs
+            r'arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            r'arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            # Old format with category
+            r'arxiv\.org/abs/([a-z-]+/[0-9]{7})(v\d+)?',
+            r'arxiv\.org/pdf/([a-z-]+/[0-9]{7})(v\d+)?',
+            # arXiv: prefix in text
+            r'arXiv:([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            r'arXiv:([a-z-]+/[0-9]{7})(v\d+)?',
+            # export.arxiv.org URLs
+            r'export\.arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            r'export\.arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+        ]
+
+    def extract_arxiv_id(self, reference: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
+        """
+        Extract ArXiv ID from a reference, returning both the base ID and version.
+
+        Args:
+            reference: Reference dictionary containing url, raw_text, etc.
+
+        Returns:
+            Tuple of (arxiv_id_without_version, version_string_or_None)
+            For example: ("2301.12345", "v2") or ("2301.12345", None)
+        """
+        # Sources to check for ArXiv ID
+        sources = [
+            reference.get('url', ''),
+            reference.get('cited_url', ''),
+            reference.get('raw_text', ''),
+            reference.get('eprint', ''),  # BibTeX field
+        ]
+
+        for source in sources:
+            if not source:
+                continue
+
+            for pattern in self.arxiv_id_patterns:
+                match = re.search(pattern, source, re.IGNORECASE)
+                if match:
+                    arxiv_id = match.group(1)
+                    version = match.group(2) if len(match.groups()) > 1 else None
+                    logger.debug(f"Extracted ArXiv ID: {arxiv_id}, version: {version}")
+                    return arxiv_id, version
+
+        return None, None
+
+    def fetch_bibtex(self, arxiv_id: str) -> Optional[str]:
+        """
+        Fetch the official BibTeX citation from ArXiv.
+
+        This always fetches the latest version's BibTeX (ArXiv default behavior).
+
+        Args:
+            arxiv_id: ArXiv ID without version suffix (e.g., "2301.12345")
+
+        Returns:
+            BibTeX string or None if fetch failed
+        """
+        url = f"{self.base_url}/{arxiv_id}"
+
+        # Wait for rate limit
+        self.rate_limiter.wait()
+
+        try:
+            logger.debug(f"Fetching ArXiv BibTeX from: {url}")
+            response = requests.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            bibtex_content = response.text.strip()
+
+            # Validate it looks like BibTeX
+            if bibtex_content and bibtex_content.startswith('@'):
+                logger.debug(f"Successfully fetched BibTeX for ArXiv paper {arxiv_id}")
+                return bibtex_content
+            else:
+                logger.debug(f"Invalid BibTeX response for ArXiv paper {arxiv_id}")
+                return None
+
+        except requests.exceptions.Timeout:
+            logger.warning(f"Timeout fetching ArXiv BibTeX for {arxiv_id}")
+            return None
+        except requests.exceptions.RequestException as e:
+            logger.warning(f"Failed to fetch ArXiv BibTeX for {arxiv_id}: {e}")
+            return None
+
+    def parse_bibtex(self, bibtex_str: str) -> Optional[Dict[str, Any]]:
+        """
+        Parse BibTeX string and extract metadata in refchecker schema format.
+
+        Args:
+            bibtex_str: BibTeX content string
+
+        Returns:
+            Dictionary with parsed metadata or None if parsing failed
+        """
+        try:
+            # Configure parser
+            parser = BibTexParser(common_strings=True)
+            parser.customization = convert_to_unicode
+
+            # Parse BibTeX
+            bib_database = bibtexparser.loads(bibtex_str, parser=parser)
+
+            if not bib_database.entries:
+                logger.debug("No entries found in BibTeX")
+                return None
+
+            entry = bib_database.entries[0]
+
+            # Extract and normalize fields
+            title = entry.get('title', '')
+            # Clean title - remove braces used for capitalization protection
+            title = re.sub(r'\{([^}]*)\}', r'\1', title)
+            title = title.strip()
+
+            # Extract authors
+            authors_str = entry.get('author', '')
+            authors = self._parse_authors(authors_str)
+
+            # Extract year - prefer year from eprint ID (original submission) over BibTeX year (latest revision)
+            arxiv_id = entry.get('eprint', '')
+            year = self._extract_year_from_eprint(arxiv_id)
+
+            # Fall back to BibTeX year field if eprint year extraction fails
+            if not year and entry.get('year'):
+                try:
+                    year = int(entry['year'])
+                except ValueError:
+                    pass
+
+            # Build result in refchecker schema format
+            result = {
+                'title': title,
+                'authors': [{'name': author} for author in authors],
+                'year': year,
+                'venue': 'arXiv',
+                'externalIds': {
+                    'ArXiv': arxiv_id,
+                },
+                'url': f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else None,
+                'isOpenAccess': True,
+                'openAccessPdf': {
+                    'url': f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else None
+                },
+                # Store original bibtex for reference
+                '_bibtex_entry': entry,
+                '_source': 'ArXiv BibTeX Reference',
+                '_source_url': f"https://arxiv.org/bibtex/{arxiv_id}" if arxiv_id else None,
+            }
+
+            # Add DOI if present (some ArXiv papers have DOIs)
+            if entry.get('doi'):
+                result['externalIds']['DOI'] = entry['doi']
+
+            logger.debug(f"Parsed ArXiv BibTeX: title='{title[:50]}...', authors={len(authors)}, year={year}")
+            return result
+
+        except Exception as e:
+            logger.warning(f"Failed to parse BibTeX: {e}")
+            return None
+
+    def _parse_authors(self, authors_str: str) -> List[str]:
+        """
+        Parse BibTeX author string into list of author names.
+
+        BibTeX format: "Last1, First1 and Last2, First2 and ..."
+
+        Args:
+            authors_str: BibTeX author field value
+
+        Returns:
+            List of author names in "First Last" format
+        """
+        if not authors_str:
+            return []
+
+        authors = []
+
+        # Split by " and " (BibTeX convention)
+        author_parts = re.split(r'\s+and\s+', authors_str)
+
+        for part in author_parts:
+            part = part.strip()
+            if not part:
+                continue
+
+            # Handle "Last, First" format
+            if ',' in part:
+                parts = part.split(',', 1)
+                if len(parts) == 2:
+                    last = parts[0].strip()
+                    first = parts[1].strip()
+                    # Convert to "First Last" format
+                    name = f"{first} {last}".strip()
+                else:
+                    name = part
+            else:
+                # Already in "First Last" format
+                name = part
+
+            # Clean up the name
+            name = re.sub(r'\s+', ' ', name)  # Normalize whitespace
+            name = re.sub(r'\{([^}]*)\}', r'\1', name)  # Remove braces
+
+            if name:
+                authors.append(name)
+
+        return authors
+
+    def _extract_year_from_eprint(self, eprint: str) -> Optional[int]:
+        """
+        Extract year from ArXiv eprint ID.
+
+        New format (YYMM.NNNNN): First two digits are year
+        Old format (cat-name/YYMMNNN): Digits after slash, first two are year
+
+        Args:
+            eprint: ArXiv eprint ID
+
+        Returns:
+            Year as integer or None
+        """
+        if not eprint:
+            return None
+
+        # New format: 2301.12345
+        match = re.match(r'^(\d{2})\d{2}\.\d{4,5}', eprint)
+        if match:
+            yy = int(match.group(1))
+            # ArXiv started in 1991, new format started in 2007
+            if yy >= 7:
+                return 2000 + yy
+            else:
+                # Very early 2000s papers (unlikely in new format)
+                return 2000 + yy
+
+        # Old format: hep-th/9901001
+        match = re.match(r'^[a-z-]+/(\d{2})\d+', eprint)
+        if match:
+            yy = int(match.group(1))
+            if yy >= 91:  # ArXiv started in 1991
+                return 1900 + yy
+            else:
+                return 2000 + yy
+
+        return None
+
+    def get_latest_version_info(self, arxiv_id: str) -> Optional[str]:
+        """
+        Get the latest version number for an ArXiv paper.
+
+        Note: This requires fetching the abstract page, so it's optional.
+        For now, we rely on the BibTeX always returning latest version metadata.
+
+        Args:
+            arxiv_id: ArXiv ID without version
+
+        Returns:
+            Latest version string (e.g., "v3") or None if couldn't determine
+        """
+        # The BibTeX endpoint always returns the latest version's metadata,
+        # so we don't need to explicitly fetch version info
+        return None
+
+    def is_arxiv_reference(self, reference: Dict[str, Any]) -> bool:
+        """
+        Check if a reference is an ArXiv paper.
+
+        Args:
+            reference: Reference dictionary
+
+        Returns:
+            True if reference appears to be an ArXiv paper
+        """
+        arxiv_id, _ = self.extract_arxiv_id(reference)
+        return arxiv_id is not None
+
+    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+        """
+        Verify a reference using ArXiv's official BibTeX as authoritative source.
+
+        This method:
+        1. Extracts the ArXiv ID from the reference
+        2. Fetches the official BibTeX from ArXiv (always latest version)
+        3. Parses the BibTeX to get authoritative metadata
+        4. Compares cited metadata against authoritative source
+        5. Logs warnings for version mismatches
+
+        Args:
+            reference: Reference dictionary with title, authors, year, url, etc.
+
+        Returns:
+            Tuple of (verified_data, errors, url)
+            - verified_data: Authoritative paper metadata from ArXiv or None
+            - errors: List of error/warning dictionaries
+            - url: ArXiv URL for the paper
+        """
+        errors = []
+
+        # Extract ArXiv ID
+        arxiv_id, cited_version = self.extract_arxiv_id(reference)
+
+        if not arxiv_id:
+            logger.debug("ArXivCitationChecker: No ArXiv ID found in reference")
+            return None, [], None
+
+        logger.debug(f"ArXivCitationChecker: Verifying ArXiv paper {arxiv_id}")
+
+        # Fetch authoritative BibTeX
+        bibtex_content = self.fetch_bibtex(arxiv_id)
+
+        if not bibtex_content:
+            logger.debug(f"ArXivCitationChecker: Could not fetch BibTeX for {arxiv_id}")
+            return None, [{"error_type": "api_failure", "error_details": f"Could not fetch ArXiv BibTeX for {arxiv_id}"}], None
+
+        # Parse BibTeX
+        verified_data = self.parse_bibtex(bibtex_content)
+
+        if not verified_data:
+            logger.debug(f"ArXivCitationChecker: Could not parse BibTeX for {arxiv_id}")
+            return None, [], None
+
+        # Log version mismatch warning if cited version differs from latest
+        if cited_version:
+            # ArXiv BibTeX always returns latest version metadata
+            # We don't know the actual latest version number without additional API call,
+            # but we can warn that a specific version was cited
+            errors.append({
+                'warning_type': 'version',
+                'warning_details': f"Reference cites ArXiv version {cited_version}, verified against latest version metadata",
+            })
+            logger.debug(f"ArXivCitationChecker: Cited version {cited_version} for {arxiv_id}")
+
+        # Compare title
+        cited_title = reference.get('title', '').strip()
+        authoritative_title = verified_data.get('title', '').strip()
+
+        if cited_title and authoritative_title:
+            title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
+
+            if title_similarity < SIMILARITY_THRESHOLD:
+                clean_cited_title = strip_latex_commands(cited_title)
+                errors.append({
+                    'error_type': 'title',
+                    'error_details': format_title_mismatch(clean_cited_title, authoritative_title),
+                    'ref_title_correct': authoritative_title
+                })
+
+        # Compare authors
+        cited_authors = reference.get('authors', [])
+        if cited_authors:
+            authoritative_authors = verified_data.get('authors', [])
+            authors_match, author_error = compare_authors(cited_authors, authoritative_authors)
+
+            if not authors_match:
+                correct_author_names = ', '.join([a.get('name', '') for a in authoritative_authors])
+                errors.append({
+                    'error_type': 'author',
+                    'error_details': author_error,
+                    'ref_authors_correct': correct_author_names
+                })
+
+        # Compare year
+        cited_year = reference.get('year')
+        authoritative_year = verified_data.get('year')
+
+        year_warning = validate_year(
+            cited_year=cited_year,
+            paper_year=authoritative_year,
+            use_flexible_validation=True,
+            context={'arxiv_match': True}
+        )
+        if year_warning:
+            errors.append(year_warning)
+
+        # Build URL
+        paper_url = f"https://arxiv.org/abs/{arxiv_id}"
+
+        logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")
+
+        return verified_data, errors, paper_url
```
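A usage sketch mirroring the module docstring above; the reference values are hypothetical, and `verify_reference` performs a live HTTP request to arxiv.org (gated by the shared rate limiter at roughly one request every 3 seconds):

```python
# Sketch based on the documented API of the new checker; values are placeholders.
from refchecker.checkers.arxiv_citation import ArXivCitationChecker

checker = ArXivCitationChecker(timeout=30)
reference = {
    'title': 'Attention Is All You Need',
    'authors': ['Ashish Vaswani', 'Noam Shazeer'],
    'year': 2017,
    'url': 'https://arxiv.org/abs/1706.03762v5',   # version suffix triggers a 'version' warning
}

if checker.is_arxiv_reference(reference):
    verified_data, errors, url = checker.verify_reference(reference)
    # errors mixes error dicts ('error_type') and warning dicts ('warning_type')
    print(url, [e.get('error_type') or e.get('warning_type') for e in errors])
```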
{academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/checkers/enhanced_hybrid_checker.py
RENAMED

```diff
@@ -43,6 +43,7 @@ class EnhancedHybridReferenceChecker:
                  contact_email: Optional[str] = None,
                  enable_openalex: bool = True,
                  enable_crossref: bool = True,
+                 enable_arxiv_citation: bool = True,
                  debug_mode: bool = False):
         """
         Initialize the enhanced hybrid reference checker
@@ -53,11 +54,22 @@ class EnhancedHybridReferenceChecker:
             contact_email: Email for polite pool access to APIs
             enable_openalex: Whether to use OpenAlex API
             enable_crossref: Whether to use CrossRef API
+            enable_arxiv_citation: Whether to use ArXiv Citation checker as authoritative source
             debug_mode: Whether to enable debug logging
         """
         self.contact_email = contact_email
         self.debug_mode = debug_mode
 
+        # Initialize ArXiv Citation checker (authoritative source for ArXiv papers)
+        self.arxiv_citation = None
+        if enable_arxiv_citation:
+            try:
+                from .arxiv_citation import ArXivCitationChecker
+                self.arxiv_citation = ArXivCitationChecker()
+                logger.debug("Enhanced Hybrid: ArXiv Citation checker initialized")
+            except Exception as e:
+                logger.warning(f"Enhanced Hybrid: Failed to initialize ArXiv Citation checker: {e}")
+
         # Initialize local database checker if available
         self.local_db = None
         if db_path:
@@ -112,6 +124,7 @@ class EnhancedHybridReferenceChecker:
 
         # Track API performance for adaptive selection
         self.api_stats = {
+            'arxiv_citation': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
             'local_db': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
             'semantic_scholar': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
             'openalex': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
@@ -276,6 +289,17 @@ class EnhancedHybridReferenceChecker:
 
         # PHASE 1: Try all APIs once in priority order
 
+        # Strategy 0: For ArXiv papers, try ArXiv Citation checker first (authoritative source)
+        # This fetches the official BibTeX from ArXiv which is the author-submitted metadata
+        if self.arxiv_citation and self.arxiv_citation.is_arxiv_reference(reference):
+            logger.debug("Enhanced Hybrid: Reference appears to be ArXiv paper, trying ArXiv Citation checker first")
+            verified_data, errors, url, success, failure_type = self._try_api('arxiv_citation', self.arxiv_citation, reference)
+            if success:
+                logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded as authoritative source")
+                return verified_data, errors, url
+            if failure_type in ['throttled', 'timeout', 'server_error']:
+                failed_apis.append(('arxiv_citation', self.arxiv_citation, failure_type))
+
         # Strategy 1: Always try local database first (fastest)
         if self.local_db:
             verified_data, errors, url, success, failure_type = self._try_api('local_db', self.local_db, reference)
```
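A sketch of opting in or out of the new checker through the hybrid checker's constructor. Only arguments visible in this hunk are shown; it is assumed the remaining constructor arguments have defaults, and the email value is a placeholder:

```python
# Sketch only: enable_arxiv_citation is new in 2.0.12 and defaults to True.
from refchecker.checkers import EnhancedHybridReferenceChecker

hybrid = EnhancedHybridReferenceChecker(
    contact_email="you@example.com",    # placeholder polite-pool email
    enable_arxiv_citation=True,         # set False to skip the ArXiv BibTeX step
)
print(hybrid.api_stats['arxiv_citation'])  # new stats bucket added in this release
```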
{academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/config/settings.py
RENAMED

```diff
@@ -22,6 +22,14 @@ DEFAULT_CONFIG = {
         "timeout": 30,
     },
 
+    "arxiv_citation": {
+        "base_url": "https://arxiv.org/bibtex",
+        "rate_limit_delay": 3.0,  # Share rate limiting with other ArXiv endpoints
+        "timeout": 30,
+        "use_as_authoritative": True,  # Use ArXiv BibTeX as authoritative source
+        "enabled": True,  # Enable ArXiv citation checker in hybrid checker
+    },
+
     # Processing Settings
     "processing": {
         "max_papers": 50,
```
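For reference, a sketch of reading the new block through the existing settings helper, assuming `get_config()` returns a dict layered over `DEFAULT_CONFIG` (the same assumption the new checker makes when it reads `config["text_processing"]["similarity_threshold"]`):

```python
# Sketch: reading the new arxiv_citation settings added in this release.
from refchecker.config.settings import get_config

config = get_config()
arxiv_cfg = config["arxiv_citation"]
print(arxiv_cfg["base_url"])           # "https://arxiv.org/bibtex"
print(arxiv_cfg["rate_limit_delay"])   # 3.0 seconds, shared with other ArXiv endpoints
print(arxiv_cfg["use_as_authoritative"], arxiv_cfg["enabled"])
```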
{academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/llm/providers.py
RENAMED

```diff
@@ -106,7 +106,8 @@ Instructions:
 - When extracting URLs, preserve the complete URL including protocol
 - For BibTeX howpublished fields, extract the full URL from the field value
 12. When parsing multi-line references, combine all authors from all lines before the title
-13. CRITICAL: If the text contains no valid bibliographic references (e.g., only figures, appendix material, or explanatory text),
+13. CRITICAL: If the text contains no valid bibliographic references (e.g., only figures, appendix material, or explanatory text), return ONLY an empty response with no text at all - do NOT explain why, do NOT describe what you see, do NOT say "I return nothing" or similar phrases
+14. OUTPUT FORMAT: Your response must contain ONLY extracted references in the format specified above (Author1*Author2#Title#Venue#Year#URL), one per line. No introductory text, no explanations, no commentary, no "Looking at this text..." statements. If there are no references to extract, output absolutely nothing.
 
 Bibliography text:
 {cleaned_bibliography}
@@ -160,6 +161,24 @@ Bibliography text:
             continue
         if ref.lower().startswith('i cannot'):
             continue
+        # Skip "Looking at this text..." explanatory responses
+        if ref.lower().startswith('looking at'):
+            continue
+        # Skip responses that say "I return nothing" or similar
+        if 'i return nothing' in ref.lower() or 'return nothing' in ref.lower():
+            continue
+        # Skip responses that mention "no valid bibliographic references"
+        if 'no valid bibliographic' in ref.lower():
+            continue
+        # Skip responses that say "Since there are no"
+        if ref.lower().startswith('since there are no'):
+            continue
+        # Skip responses that mention "numbered format specified"
+        if 'numbered format specified' in ref.lower():
+            continue
+        # Skip responses that describe what the text contains instead of extracting
+        if ('it contains' in ref.lower() or 'it does not contain' in ref.lower()) and 'bibliography' in ref.lower():
+            continue
 
         # Remove common prefixes (bullets, numbers, etc.)
         ref = ref.lstrip('- *•')
```
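The added checks all guard against an LLM narrating instead of extracting. A standalone, simplified restatement of the same skip rules, for illustration only; the real checks live inline in the provider's response-parsing loop, and the sample lines below are hypothetical:

```python
# Simplified restatement of the new skip rules; not the provider code itself.
def is_llm_commentary(ref: str) -> bool:
    """Return True if an LLM output line looks like commentary rather than a reference."""
    low = ref.lower()
    if low.startswith(('i cannot', 'looking at', 'since there are no')):
        return True
    if 'return nothing' in low or 'no valid bibliographic' in low:
        return True
    if 'numbered format specified' in low:
        return True
    if ('it contains' in low or 'it does not contain' in low) and 'bibliography' in low:
        return True
    return False

lines = [
    "Looking at this text, there are no references to extract.",
    "Ashish Vaswani*Noam Shazeer#Attention Is All You Need#NeurIPS#2017#https://arxiv.org/abs/1706.03762",
]
print([line for line in lines if not is_llm_commentary(line)])  # keeps only the reference line
```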
academic_refchecker-2.0.12/src/refchecker/utils/arxiv_rate_limiter.py
ADDED

```diff
@@ -0,0 +1,133 @@
+"""
+Shared ArXiv Rate Limiter utility.
+
+ArXiv requests a polite delay of 3 seconds between requests.
+This module provides a centralized rate limiter to coordinate all ArXiv API calls
+across different checkers and utilities.
+
+Usage:
+    from refchecker.utils.arxiv_rate_limiter import ArXivRateLimiter
+
+    # Get the shared limiter instance
+    limiter = ArXivRateLimiter.get_instance()
+
+    # Wait for rate limit before making a request
+    limiter.wait()
+
+    # Then make your request
+    response = requests.get(arxiv_url)
+"""
+
+import time
+import threading
+import logging
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+
+class ArXivRateLimiter:
+    """
+    Singleton rate limiter for ArXiv API requests.
+
+    ArXiv requests a minimum of 3 seconds between requests for polite access.
+    This class ensures all ArXiv API calls from any part of refchecker
+    are properly rate limited.
+    """
+
+    _instance: Optional['ArXivRateLimiter'] = None
+    _lock = threading.Lock()
+
+    # ArXiv recommends at least 3 seconds between requests
+    DEFAULT_DELAY = 3.0
+
+    def __init__(self):
+        """Initialize the rate limiter (use get_instance() instead of direct construction)."""
+        self._last_request_time: float = 0.0
+        self._request_lock = threading.Lock()
+        self._delay: float = self.DEFAULT_DELAY
+
+    @classmethod
+    def get_instance(cls) -> 'ArXivRateLimiter':
+        """
+        Get the singleton instance of the ArXiv rate limiter.
+
+        Returns:
+            The shared ArXivRateLimiter instance
+        """
+        if cls._instance is None:
+            with cls._lock:
+                # Double-check locking pattern
+                if cls._instance is None:
+                    cls._instance = cls()
+        return cls._instance
+
+    @classmethod
+    def reset_instance(cls) -> None:
+        """
+        Reset the singleton instance (primarily for testing).
+        """
+        with cls._lock:
+            cls._instance = None
+
+    @property
+    def delay(self) -> float:
+        """Get the current delay between requests in seconds."""
+        return self._delay
+
+    @delay.setter
+    def delay(self, value: float) -> None:
+        """
+        Set the delay between requests.
+
+        Args:
+            value: Delay in seconds (minimum 0.5 seconds enforced)
+        """
+        self._delay = max(0.5, value)
+
+    def wait(self) -> float:
+        """
+        Wait for the rate limit before making a request.
+
+        This method blocks until the required time has passed since the last request.
+        It is thread-safe and can be called from multiple threads simultaneously.
+
+        Returns:
+            The actual time waited in seconds (0 if no wait was needed)
+        """
+        with self._request_lock:
+            current_time = time.time()
+            time_since_last = current_time - self._last_request_time
+
+            if time_since_last < self._delay:
+                wait_time = self._delay - time_since_last
+                logger.debug(f"ArXiv rate limiter: waiting {wait_time:.2f}s")
+                time.sleep(wait_time)
+            else:
+                wait_time = 0.0
+
+            self._last_request_time = time.time()
+            return wait_time
+
+    def mark_request(self) -> None:
+        """
+        Mark that a request was just made (without waiting).
+
+        Use this if you're managing timing externally but still want to
+        update the rate limiter's state.
+        """
+        with self._request_lock:
+            self._last_request_time = time.time()
+
+    def time_until_next(self) -> float:
+        """
+        Get the time remaining until the next request is allowed.
+
+        Returns:
+            Time in seconds until next request (0 if allowed now)
+        """
+        with self._request_lock:
+            current_time = time.time()
+            time_since_last = current_time - self._last_request_time
+            remaining = self._delay - time_since_last
+            return max(0.0, remaining)
```
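A usage sketch matching the module docstring: one shared limiter gates every ArXiv call. The IDs are placeholders, and the second iteration is expected to block for roughly the remainder of the 3-second window:

```python
# Sketch of sharing the singleton limiter across ArXiv requests; IDs are placeholders.
import requests
from refchecker.utils.arxiv_rate_limiter import ArXivRateLimiter

limiter = ArXivRateLimiter.get_instance()
limiter.delay = 3.0  # default; the setter enforces a 0.5 s floor

for arxiv_id in ("1706.03762", "2301.12345"):
    waited = limiter.wait()  # blocks until 3 s have passed since the last request
    print(f"waited {waited:.2f}s, next allowed in {limiter.time_until_next():.2f}s")
    requests.get(f"https://arxiv.org/bibtex/{arxiv_id}", timeout=30)
```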
{academic_refchecker-2.0.11 → academic_refchecker-2.0.12}/src/refchecker/utils/text_utils.py
RENAMED

```diff
@@ -1319,6 +1319,38 @@ def is_name_match(name1: str, name2: str) -> bool:
     # This handles both surname particle normalization effects and standard 3-part names
     def match_initials_with_names(init_parts, name_parts):
         """Helper function to match initials against full names"""
+        # Handle 4-part initials vs 2-part compound surname
+        # e.g., ['M.', 'V.', 'D.', 'Briel'] vs ['Menkes', 'van den Briel']
+        # where "van den" particles are treated as initials "V. D."
+        if len(init_parts) == 4 and len(name_parts) == 2:
+            # Check if first 3 parts are initials and last is surname
+            if (len(init_parts[0].rstrip('.')) == 1 and
+                len(init_parts[1].rstrip('.')) == 1 and
+                len(init_parts[2].rstrip('.')) == 1 and
+                len(init_parts[3]) > 1 and
+                len(name_parts[0]) > 1 and len(name_parts[1]) > 1):
+
+                first_initial = init_parts[0].rstrip('.')
+                second_initial = init_parts[1].rstrip('.')
+                third_initial = init_parts[2].rstrip('.')
+                last_name = init_parts[3]
+                first_name = name_parts[0]
+                compound_last = name_parts[1]
+
+                # Extract parts from compound lastname (e.g., "van den Briel" -> ["van", "den", "Briel"])
+                compound_parts = compound_last.split()
+                if len(compound_parts) >= 3:
+                    # compound_parts = ["van", "den", "Briel"]
+                    particle1 = compound_parts[0]
+                    particle2 = compound_parts[1]
+                    actual_last = compound_parts[-1]
+
+                    if (last_name == actual_last and
+                        first_initial == first_name[0] and
+                        second_initial == particle1[0] and
+                        third_initial == particle2[0]):
+                        return True
+
         if len(init_parts) == 3 and len(name_parts) == 2:
             # After surname particle normalization: ['g.', 'v.', 'horn'] vs ['grant', 'van horn']
             if (len(init_parts[0].rstrip('.')) == 1 and len(init_parts[1].rstrip('.')) == 1 and len(init_parts[2]) > 1 and
```
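The new branch covers cited names in which a two-particle surname prefix was abbreviated to initials. A hedged example built from the comment in the hunk above, assuming `is_name_match` is imported directly from this module and that tokenization produces the parts shown in that comment:

```python
# Illustrative only: the case named in the new branch's comment.
from refchecker.utils.text_utils import is_name_match

# ['M.', 'V.', 'D.', 'Briel'] vs ['Menkes', 'van den Briel'] per the new 4-vs-2 branch
print(is_name_match("M. V. D. Briel", "Menkes van den Briel"))  # expected: True
```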
All remaining files are unchanged between 2.0.11 and 2.0.12 (the entries marked +0 -0 in the file list above).