academic-refchecker 2.0.11__py3-none-any.whl → 2.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/METADATA +2 -1
- {academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/RECORD +15 -13
- {academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/WHEEL +1 -1
- refchecker/__version__.py +1 -1
- refchecker/checkers/__init__.py +3 -1
- refchecker/checkers/arxiv_citation.py +460 -0
- refchecker/checkers/enhanced_hybrid_checker.py +24 -0
- refchecker/config/settings.py +8 -0
- refchecker/llm/base.py +1 -15
- refchecker/llm/providers.py +102 -94
- refchecker/utils/arxiv_rate_limiter.py +133 -0
- refchecker/utils/text_utils.py +32 -0
- {academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/top_level.txt +0 -0
{academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 2.0.11
+Version: 2.0.13
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT
@@ -32,6 +32,7 @@ Requires-Dist: python-Levenshtein>=0.12.0
 Requires-Dist: pandas<2.4.0,>=1.3.0
 Requires-Dist: numpy<2.0.0,>=1.22.4
 Requires-Dist: pdfplumber>=0.6.0
+Requires-Dist: bibtexparser>=1.4.0
 Provides-Extra: dev
 Requires-Dist: pytest>=6.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=2.0.0; extra == "dev"
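
The only new runtime dependency, bibtexparser, backs the BibTeX parsing in the new refchecker/checkers/arxiv_citation.py module shown further down. A minimal sketch of the bibtexparser 1.x calls that module relies on (the entry text here is illustrative, not taken from the package):

import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import convert_to_unicode

entry_text = """@misc{vaswani2017attention,
  title = {Attention Is All You Need},
  author = {Vaswani, Ashish and Shazeer, Noam},
  year = {2017},
  eprint = {1706.03762},
}"""

parser = BibTexParser(common_strings=True)
parser.customization = convert_to_unicode  # resolve LaTeX escapes to unicode
db = bibtexparser.loads(entry_text, parser=parser)
print(db.entries[0]['title'])   # Attention Is All You Need
print(db.entries[0]['eprint'])  # 1706.03762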
{academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-academic_refchecker-2.0.11.dist-info/licenses/LICENSE,sha256=…
+academic_refchecker-2.0.13.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
 backend/__init__.py,sha256=TFVkOx5tSp3abty15RzUbaSwQ9ZD0kfUn7PDh63xkYY,521
 backend/__main__.py,sha256=74V7yUMsRSZaaRyXYm-rZVc3TVUcUgwsoTQTUbV5EqM,211
 backend/cli.py,sha256=xV3l9M5OdNQQYOcrzj2d_7RmCgj7CXP_1oi0TPe6zNo,1672
@@ -16,10 +16,11 @@ backend/static/assets/index-2P6L_39v.css,sha256=KC3Wa6jfD1qwmEoVpqTovlzf8fsn5oHY
 backend/static/assets/index-hk21nqxR.js,sha256=z2agP8ZFYw4AfYi-GJ5E_8_k-lPF-frXOJtPk-I0hDs,369533
 refchecker/__init__.py,sha256=Pg5MrtLxDBRcNYcI02N-bv3tzURVd1S3nQ8IyF7Zw7E,322
 refchecker/__main__.py,sha256=agBbT9iKN0g2xXtRNCoh29Nr7z2n5vU-r0MCVJKi4tI,232
-refchecker/__version__.py,sha256=…
-refchecker/checkers/__init__.py,sha256=…
+refchecker/__version__.py,sha256=4nD_XJ2nhdUPe68-UmSGWSjF8JFBkti-Is16FFYXHAI,66
+refchecker/checkers/__init__.py,sha256=-dR7HX0bfPq9YMXrnODoYbfNWFLqu706xoVsUdWHYRI,611
+refchecker/checkers/arxiv_citation.py,sha256=_oQxWt5uUSy-pAGEQjdwBb7dxoFNqWkYgpkV_ZVS-Ho,17332
 refchecker/checkers/crossref.py,sha256=88moAyTudBqf9SKqTQkNAq1yyuRe95f8r4EpmJznupQ,20937
-refchecker/checkers/enhanced_hybrid_checker.py,sha256=…
+refchecker/checkers/enhanced_hybrid_checker.py,sha256=HSjxbUo4tr1L1DF8FFG8dfH-Y7mM67sKmqi-KAX_31I,30310
 refchecker/checkers/github_checker.py,sha256=YJ2sLj22qezw3uWjA0jhtDO0fOW4HUwcVbv2DQ4LjR0,14277
 refchecker/checkers/local_semantic_scholar.py,sha256=c-KUTh99s-Di71h-pzdrwlPgoSTwB-tgVAZnCrMFXmw,21011
 refchecker/checkers/openalex.py,sha256=WEjEppQMbutPs8kWOSorCIoXWqpJ9o1CXUicThHSWYU,20120
@@ -29,7 +30,7 @@ refchecker/checkers/semantic_scholar.py,sha256=yvatQM5fXdW0qagqrTUpgotd0RbT7N_pq
 refchecker/checkers/webpage_checker.py,sha256=A_d5kg3OOsyliC00OVq_l0J-RJ4Ln7hUoURk21aO2fs,43653
 refchecker/config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
 refchecker/config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
-refchecker/config/settings.py,sha256=…
+refchecker/config/settings.py,sha256=O8PETl_O7uyUl1r_spWhOMHbIaiBM-golfdIN82eigI,6512
 refchecker/core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
 refchecker/core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
 refchecker/core/parallel_processor.py,sha256=HpVFEMwPBiP2FRjvGqlaXpjV5S0qP-hxdB_Wdl_lACo,17704
@@ -37,13 +38,14 @@ refchecker/core/refchecker.py,sha256=nX8guDXFL1ZdT-K6KUJT_3iZjuoYsWj4e0rKrqd5VZA
 refchecker/database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
 refchecker/database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
 refchecker/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-refchecker/llm/base.py,sha256=…
-refchecker/llm/providers.py,sha256=…
+refchecker/llm/base.py,sha256=BhpnUn7nrN8LzAnA8rQuG3zBvNovFYxShk1V9oAHlHU,16248
+refchecker/llm/providers.py,sha256=2pOEre_OH_shgm0b9m3_nVIxyoY-MxhFM5KAP_qKo_Q,39131
 refchecker/scripts/__init__.py,sha256=xJwo6afG8s7S888BK2Bxw2d7FX8aLkbl0l_ZoJOFibE,37
 refchecker/scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,4213
 refchecker/services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
 refchecker/services/pdf_processor.py,sha256=7i5x043qfnyzE5EQmytfy_uPjbeCJp4Ka5OPyH-bwOE,10577
 refchecker/utils/__init__.py,sha256=SKTEQeKpLOFFMIzZiakzctsW9zGe_J7LDNJlygWV6RY,1221
+refchecker/utils/arxiv_rate_limiter.py,sha256=axOv84Ge6q_mJ69lcyAFsCmHx9qXvV1aX71oSaxhnjE,4119
 refchecker/utils/arxiv_utils.py,sha256=C7wqoCy9FZUQpoF92vLeJyrK1-6XoMmmL6u_hfDV3ro,18031
 refchecker/utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
 refchecker/utils/biblatex_parser.py,sha256=IKRUMtRsjdXIktyk9XGArt_ms0asmqP549uhFvvumuE,25581
@@ -54,11 +56,11 @@ refchecker/utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,
 refchecker/utils/doi_utils.py,sha256=_7YvQ0DTOQBMIujUE0SdJicjPiAR3VETLU668GIji24,6094
 refchecker/utils/error_utils.py,sha256=8TcfRUD6phZ7viPJrezQ4jKf_vE65lqEXZq5707eU6s,15425
 refchecker/utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
-refchecker/utils/text_utils.py,sha256=…
+refchecker/utils/text_utils.py,sha256=Tx1k0SqS1cmw4N9BDJY-Ipep2T-HMmKPqi4SMcq1ZJ8,235751
 refchecker/utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
 refchecker/utils/url_utils.py,sha256=7b0rWCQJSajzqOvD7ghsBZPejiq6mUIz6SGhvU_WGDs,9441
-academic_refchecker-2.0.11.dist-info/METADATA,sha256=…
-academic_refchecker-2.0.11.dist-info/WHEEL,sha256=…
-academic_refchecker-2.0.11.dist-info/entry_points.txt,sha256=…
-academic_refchecker-2.0.11.dist-info/top_level.txt,sha256=…
-academic_refchecker-2.0.11.dist-info/RECORD,,
+academic_refchecker-2.0.13.dist-info/METADATA,sha256=N6lsqdFWT6K34WNLqA_W0MO3WB2BEFjx_57jEdyHYes,26611
+academic_refchecker-2.0.13.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+academic_refchecker-2.0.13.dist-info/entry_points.txt,sha256=9cREsaKwlp05Ql0CBIjKrNHk5IG2cHY5LvJPsV2-SxA,108
+academic_refchecker-2.0.13.dist-info/top_level.txt,sha256=FfNvrvpj25gfpUBjW0epvz7Qrdejhups5Za_DBiSRu4,19
+academic_refchecker-2.0.13.dist-info/RECORD,,
refchecker/__version__.py
CHANGED
refchecker/checkers/__init__.py
CHANGED
@@ -7,11 +7,13 @@ from .local_semantic_scholar import LocalNonArxivReferenceChecker
 from .enhanced_hybrid_checker import EnhancedHybridReferenceChecker
 from .openalex import OpenAlexReferenceChecker
 from .crossref import CrossRefReferenceChecker
+from .arxiv_citation import ArXivCitationChecker
 
 __all__ = [
     "NonArxivReferenceChecker",
     "LocalNonArxivReferenceChecker",
     "EnhancedHybridReferenceChecker",
     "OpenAlexReferenceChecker",
-    "CrossRefReferenceChecker"
+    "CrossRefReferenceChecker",
+    "ArXivCitationChecker",
 ]
refchecker/checkers/arxiv_citation.py
ADDED
@@ -0,0 +1,460 @@
+#!/usr/bin/env python3
+"""
+ArXiv Citation Checker - Authoritative Source for ArXiv Papers
+
+This module provides functionality to verify ArXiv papers by fetching the official
+BibTeX citation directly from ArXiv. This is used as the authoritative metadata source
+for papers found on ArXiv, as it reflects the author-submitted metadata.
+
+Key features:
+- Fetches official BibTeX from https://arxiv.org/bibtex/{arxiv_id}
+- Always uses the latest version metadata (strips version suffixes)
+- Logs warnings when cited version differs from latest version
+- Parses BibTeX to extract normalized metadata matching refchecker schema
+
+Usage:
+    from refchecker.checkers.arxiv_citation import ArXivCitationChecker
+
+    checker = ArXivCitationChecker()
+
+    reference = {
+        'title': 'Attention Is All You Need',
+        'authors': ['Ashish Vaswani', 'Noam Shazeer'],
+        'year': 2017,
+        'url': 'https://arxiv.org/abs/1706.03762v5',
+    }
+
+    verified_data, errors, url = checker.verify_reference(reference)
+"""
+
+import re
+import logging
+import requests
+from typing import Dict, List, Tuple, Optional, Any
+
+import bibtexparser
+from bibtexparser.bparser import BibTexParser
+from bibtexparser.customization import convert_to_unicode
+
+from refchecker.utils.arxiv_rate_limiter import ArXivRateLimiter
+from refchecker.utils.text_utils import (
+    normalize_text,
+    compare_authors,
+    compare_titles_with_latex_cleaning,
+    strip_latex_commands,
+)
+from refchecker.utils.error_utils import format_title_mismatch, validate_year
+from refchecker.config.settings import get_config
+
+logger = logging.getLogger(__name__)
+
+# Get configuration
+config = get_config()
+SIMILARITY_THRESHOLD = config["text_processing"]["similarity_threshold"]
+
+
+class ArXivCitationChecker:
+    """
+    Reference checker that uses ArXiv's official BibTeX export as the authoritative source.
+
+    This checker fetches the official BibTeX citation from ArXiv for papers identified
+    by their ArXiv ID. It uses the latest version's metadata as the authoritative source
+    and logs warnings when the cited version differs from the latest.
+    """
+
+    def __init__(self, timeout: int = 30):
+        """
+        Initialize the ArXiv Citation Checker.
+
+        Args:
+            timeout: HTTP request timeout in seconds
+        """
+        self.base_url = "https://arxiv.org/bibtex"
+        self.abs_url = "https://arxiv.org/abs"
+        self.timeout = timeout
+        self.rate_limiter = ArXivRateLimiter.get_instance()
+
+        # Pattern to extract arXiv IDs from various URL formats
+        self.arxiv_id_patterns = [
+            # Standard arxiv.org URLs
+            r'arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            r'arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            # Old format with category
+            r'arxiv\.org/abs/([a-z-]+/[0-9]{7})(v\d+)?',
+            r'arxiv\.org/pdf/([a-z-]+/[0-9]{7})(v\d+)?',
+            # arXiv: prefix in text
+            r'arXiv:([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            r'arXiv:([a-z-]+/[0-9]{7})(v\d+)?',
+            # export.arxiv.org URLs
+            r'export\.arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            r'export\.arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+        ]
+
+    def extract_arxiv_id(self, reference: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
+        """
+        Extract ArXiv ID from a reference, returning both the base ID and version.
+
+        Args:
+            reference: Reference dictionary containing url, raw_text, etc.
+
+        Returns:
+            Tuple of (arxiv_id_without_version, version_string_or_None)
+            For example: ("2301.12345", "v2") or ("2301.12345", None)
+        """
+        # Sources to check for ArXiv ID
+        sources = [
+            reference.get('url', ''),
+            reference.get('cited_url', ''),
+            reference.get('raw_text', ''),
+            reference.get('eprint', ''),  # BibTeX field
+        ]
+
+        for source in sources:
+            if not source:
+                continue
+
+            for pattern in self.arxiv_id_patterns:
+                match = re.search(pattern, source, re.IGNORECASE)
+                if match:
+                    arxiv_id = match.group(1)
+                    version = match.group(2) if len(match.groups()) > 1 else None
+                    logger.debug(f"Extracted ArXiv ID: {arxiv_id}, version: {version}")
+                    return arxiv_id, version
+
+        return None, None
+
+    def fetch_bibtex(self, arxiv_id: str) -> Optional[str]:
+        """
+        Fetch the official BibTeX citation from ArXiv.
+
+        This always fetches the latest version's BibTeX (ArXiv default behavior).
+
+        Args:
+            arxiv_id: ArXiv ID without version suffix (e.g., "2301.12345")
+
+        Returns:
+            BibTeX string or None if fetch failed
+        """
+        url = f"{self.base_url}/{arxiv_id}"
+
+        # Wait for rate limit
+        self.rate_limiter.wait()
+
+        try:
+            logger.debug(f"Fetching ArXiv BibTeX from: {url}")
+            response = requests.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            bibtex_content = response.text.strip()
+
+            # Validate it looks like BibTeX
+            if bibtex_content and bibtex_content.startswith('@'):
+                logger.debug(f"Successfully fetched BibTeX for ArXiv paper {arxiv_id}")
+                return bibtex_content
+            else:
+                logger.debug(f"Invalid BibTeX response for ArXiv paper {arxiv_id}")
+                return None
+
+        except requests.exceptions.Timeout:
+            logger.warning(f"Timeout fetching ArXiv BibTeX for {arxiv_id}")
+            return None
+        except requests.exceptions.RequestException as e:
+            logger.warning(f"Failed to fetch ArXiv BibTeX for {arxiv_id}: {e}")
+            return None
+
+    def parse_bibtex(self, bibtex_str: str) -> Optional[Dict[str, Any]]:
+        """
+        Parse BibTeX string and extract metadata in refchecker schema format.
+
+        Args:
+            bibtex_str: BibTeX content string
+
+        Returns:
+            Dictionary with parsed metadata or None if parsing failed
+        """
+        try:
+            # Configure parser
+            parser = BibTexParser(common_strings=True)
+            parser.customization = convert_to_unicode
+
+            # Parse BibTeX
+            bib_database = bibtexparser.loads(bibtex_str, parser=parser)
+
+            if not bib_database.entries:
+                logger.debug("No entries found in BibTeX")
+                return None
+
+            entry = bib_database.entries[0]
+
+            # Extract and normalize fields
+            title = entry.get('title', '')
+            # Clean title - remove braces used for capitalization protection
+            title = re.sub(r'\{([^}]*)\}', r'\1', title)
+            title = title.strip()
+
+            # Extract authors
+            authors_str = entry.get('author', '')
+            authors = self._parse_authors(authors_str)
+
+            # Extract year - prefer year from eprint ID (original submission) over BibTeX year (latest revision)
+            arxiv_id = entry.get('eprint', '')
+            year = self._extract_year_from_eprint(arxiv_id)
+
+            # Fall back to BibTeX year field if eprint year extraction fails
+            if not year and entry.get('year'):
+                try:
+                    year = int(entry['year'])
+                except ValueError:
+                    pass
+
+            # Build result in refchecker schema format
+            result = {
+                'title': title,
+                'authors': [{'name': author} for author in authors],
+                'year': year,
+                'venue': 'arXiv',
+                'externalIds': {
+                    'ArXiv': arxiv_id,
+                },
+                'url': f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else None,
+                'isOpenAccess': True,
+                'openAccessPdf': {
+                    'url': f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else None
+                },
+                # Store original bibtex for reference
+                '_bibtex_entry': entry,
+                '_source': 'ArXiv BibTeX Reference',
+                '_source_url': f"https://arxiv.org/bibtex/{arxiv_id}" if arxiv_id else None,
+            }
+
+            # Add DOI if present (some ArXiv papers have DOIs)
+            if entry.get('doi'):
+                result['externalIds']['DOI'] = entry['doi']
+
+            logger.debug(f"Parsed ArXiv BibTeX: title='{title[:50]}...', authors={len(authors)}, year={year}")
+            return result
+
+        except Exception as e:
+            logger.warning(f"Failed to parse BibTeX: {e}")
+            return None
+
+    def _parse_authors(self, authors_str: str) -> List[str]:
+        """
+        Parse BibTeX author string into list of author names.
+
+        BibTeX format: "Last1, First1 and Last2, First2 and ..."
+
+        Args:
+            authors_str: BibTeX author field value
+
+        Returns:
+            List of author names in "First Last" format
+        """
+        if not authors_str:
+            return []
+
+        authors = []
+
+        # Split by " and " (BibTeX convention)
+        author_parts = re.split(r'\s+and\s+', authors_str)
+
+        for part in author_parts:
+            part = part.strip()
+            if not part:
+                continue
+
+            # Handle "Last, First" format
+            if ',' in part:
+                parts = part.split(',', 1)
+                if len(parts) == 2:
+                    last = parts[0].strip()
+                    first = parts[1].strip()
+                    # Convert to "First Last" format
+                    name = f"{first} {last}".strip()
+                else:
+                    name = part
+            else:
+                # Already in "First Last" format
+                name = part
+
+            # Clean up the name
+            name = re.sub(r'\s+', ' ', name)  # Normalize whitespace
+            name = re.sub(r'\{([^}]*)\}', r'\1', name)  # Remove braces
+
+            if name:
+                authors.append(name)
+
+        return authors
+
+    def _extract_year_from_eprint(self, eprint: str) -> Optional[int]:
+        """
+        Extract year from ArXiv eprint ID.
+
+        New format (YYMM.NNNNN): First two digits are year
+        Old format (cat-name/YYMMNNN): Digits after slash, first two are year
+
+        Args:
+            eprint: ArXiv eprint ID
+
+        Returns:
+            Year as integer or None
+        """
+        if not eprint:
+            return None
+
+        # New format: 2301.12345
+        match = re.match(r'^(\d{2})\d{2}\.\d{4,5}', eprint)
+        if match:
+            yy = int(match.group(1))
+            # ArXiv started in 1991, new format started in 2007
+            if yy >= 7:
+                return 2000 + yy
+            else:
+                # Very early 2000s papers (unlikely in new format)
+                return 2000 + yy
+
+        # Old format: hep-th/9901001
+        match = re.match(r'^[a-z-]+/(\d{2})\d+', eprint)
+        if match:
+            yy = int(match.group(1))
+            if yy >= 91:  # ArXiv started in 1991
+                return 1900 + yy
+            else:
+                return 2000 + yy
+
+        return None
+
+    def get_latest_version_info(self, arxiv_id: str) -> Optional[str]:
+        """
+        Get the latest version number for an ArXiv paper.
+
+        Note: This requires fetching the abstract page, so it's optional.
+        For now, we rely on the BibTeX always returning latest version metadata.
+
+        Args:
+            arxiv_id: ArXiv ID without version
+
+        Returns:
+            Latest version string (e.g., "v3") or None if couldn't determine
+        """
+        # The BibTeX endpoint always returns the latest version's metadata,
+        # so we don't need to explicitly fetch version info
+        return None
+
+    def is_arxiv_reference(self, reference: Dict[str, Any]) -> bool:
+        """
+        Check if a reference is an ArXiv paper.
+
+        Args:
+            reference: Reference dictionary
+
+        Returns:
+            True if reference appears to be an ArXiv paper
+        """
+        arxiv_id, _ = self.extract_arxiv_id(reference)
+        return arxiv_id is not None
+
+    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+        """
+        Verify a reference using ArXiv's official BibTeX as authoritative source.
+
+        This method:
+        1. Extracts the ArXiv ID from the reference
+        2. Fetches the official BibTeX from ArXiv (always latest version)
+        3. Parses the BibTeX to get authoritative metadata
+        4. Compares cited metadata against authoritative source
+        5. Logs warnings for version mismatches
+
+        Args:
+            reference: Reference dictionary with title, authors, year, url, etc.
+
+        Returns:
+            Tuple of (verified_data, errors, url)
+            - verified_data: Authoritative paper metadata from ArXiv or None
+            - errors: List of error/warning dictionaries
+            - url: ArXiv URL for the paper
+        """
+        errors = []
+
+        # Extract ArXiv ID
+        arxiv_id, cited_version = self.extract_arxiv_id(reference)
+
+        if not arxiv_id:
+            logger.debug("ArXivCitationChecker: No ArXiv ID found in reference")
+            return None, [], None
+
+        logger.debug(f"ArXivCitationChecker: Verifying ArXiv paper {arxiv_id}")
+
+        # Fetch authoritative BibTeX
+        bibtex_content = self.fetch_bibtex(arxiv_id)
+
+        if not bibtex_content:
+            logger.debug(f"ArXivCitationChecker: Could not fetch BibTeX for {arxiv_id}")
+            return None, [{"error_type": "api_failure", "error_details": f"Could not fetch ArXiv BibTeX for {arxiv_id}"}], None
+
+        # Parse BibTeX
+        verified_data = self.parse_bibtex(bibtex_content)
+
+        if not verified_data:
+            logger.debug(f"ArXivCitationChecker: Could not parse BibTeX for {arxiv_id}")
+            return None, [], None
+
+        # Log version mismatch warning if cited version differs from latest
+        if cited_version:
+            # ArXiv BibTeX always returns latest version metadata
+            # We don't know the actual latest version number without additional API call,
+            # but we can warn that a specific version was cited
+            errors.append({
+                'warning_type': 'version',
+                'warning_details': f"Reference cites ArXiv version {cited_version}, verified against latest version metadata",
+            })
+            logger.debug(f"ArXivCitationChecker: Cited version {cited_version} for {arxiv_id}")
+
+        # Compare title
+        cited_title = reference.get('title', '').strip()
+        authoritative_title = verified_data.get('title', '').strip()
+
+        if cited_title and authoritative_title:
+            title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
+
+            if title_similarity < SIMILARITY_THRESHOLD:
+                clean_cited_title = strip_latex_commands(cited_title)
+                errors.append({
+                    'error_type': 'title',
+                    'error_details': format_title_mismatch(clean_cited_title, authoritative_title),
+                    'ref_title_correct': authoritative_title
+                })
+
+        # Compare authors
+        cited_authors = reference.get('authors', [])
+        if cited_authors:
+            authoritative_authors = verified_data.get('authors', [])
+            authors_match, author_error = compare_authors(cited_authors, authoritative_authors)
+
+            if not authors_match:
+                correct_author_names = ', '.join([a.get('name', '') for a in authoritative_authors])
+                errors.append({
+                    'error_type': 'author',
+                    'error_details': author_error,
+                    'ref_authors_correct': correct_author_names
+                })
+
+        # Compare year
+        cited_year = reference.get('year')
+        authoritative_year = verified_data.get('year')
+
+        year_warning = validate_year(
+            cited_year=cited_year,
+            paper_year=authoritative_year,
+            use_flexible_validation=True,
+            context={'arxiv_match': True}
+        )
+        if year_warning:
+            errors.append(year_warning)
+
+        # Build URL
+        paper_url = f"https://arxiv.org/abs/{arxiv_id}"
+
+        logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")
+
+        return verified_data, errors, paper_url
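
As a quick illustration of the ID extraction and year heuristics above, behavior like the following is implied (a sketch, assuming the package's default config loads at import; _extract_year_from_eprint is the private helper from the diff):

from refchecker.checkers.arxiv_citation import ArXivCitationChecker

checker = ArXivCitationChecker()

# Versioned URL: the base ID and the cited version come back separately.
print(checker.extract_arxiv_id({'url': 'https://arxiv.org/abs/1706.03762v5'}))
# -> ('1706.03762', 'v5')

# The year comes from the eprint prefix, not the latest-revision BibTeX year.
print(checker._extract_year_from_eprint('2301.12345'))      # -> 2023
print(checker._extract_year_from_eprint('hep-th/9901001'))  # -> 1999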
refchecker/checkers/enhanced_hybrid_checker.py
CHANGED
@@ -43,6 +43,7 @@ class EnhancedHybridReferenceChecker:
                  contact_email: Optional[str] = None,
                  enable_openalex: bool = True,
                  enable_crossref: bool = True,
+                 enable_arxiv_citation: bool = True,
                  debug_mode: bool = False):
         """
         Initialize the enhanced hybrid reference checker
@@ -53,11 +54,22 @@ class EnhancedHybridReferenceChecker:
             contact_email: Email for polite pool access to APIs
             enable_openalex: Whether to use OpenAlex API
             enable_crossref: Whether to use CrossRef API
+            enable_arxiv_citation: Whether to use ArXiv Citation checker as authoritative source
             debug_mode: Whether to enable debug logging
         """
         self.contact_email = contact_email
         self.debug_mode = debug_mode
 
+        # Initialize ArXiv Citation checker (authoritative source for ArXiv papers)
+        self.arxiv_citation = None
+        if enable_arxiv_citation:
+            try:
+                from .arxiv_citation import ArXivCitationChecker
+                self.arxiv_citation = ArXivCitationChecker()
+                logger.debug("Enhanced Hybrid: ArXiv Citation checker initialized")
+            except Exception as e:
+                logger.warning(f"Enhanced Hybrid: Failed to initialize ArXiv Citation checker: {e}")
+
         # Initialize local database checker if available
         self.local_db = None
         if db_path:
@@ -112,6 +124,7 @@ class EnhancedHybridReferenceChecker:
 
         # Track API performance for adaptive selection
         self.api_stats = {
+            'arxiv_citation': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
             'local_db': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
             'semantic_scholar': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
             'openalex': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
@@ -276,6 +289,17 @@ class EnhancedHybridReferenceChecker:
 
         # PHASE 1: Try all APIs once in priority order
 
+        # Strategy 0: For ArXiv papers, try ArXiv Citation checker first (authoritative source)
+        # This fetches the official BibTeX from ArXiv which is the author-submitted metadata
+        if self.arxiv_citation and self.arxiv_citation.is_arxiv_reference(reference):
+            logger.debug("Enhanced Hybrid: Reference appears to be ArXiv paper, trying ArXiv Citation checker first")
+            verified_data, errors, url, success, failure_type = self._try_api('arxiv_citation', self.arxiv_citation, reference)
+            if success:
+                logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded as authoritative source")
+                return verified_data, errors, url
+            if failure_type in ['throttled', 'timeout', 'server_error']:
+                failed_apis.append(('arxiv_citation', self.arxiv_citation, failure_type))
+
         # Strategy 1: Always try local database first (fastest)
         if self.local_db:
             verified_data, errors, url, success, failure_type = self._try_api('local_db', self.local_db, reference)
refchecker/config/settings.py
CHANGED
@@ -22,6 +22,14 @@ DEFAULT_CONFIG = {
         "timeout": 30,
     },
 
+    "arxiv_citation": {
+        "base_url": "https://arxiv.org/bibtex",
+        "rate_limit_delay": 3.0,  # Share rate limiting with other ArXiv endpoints
+        "timeout": 30,
+        "use_as_authoritative": True,  # Use ArXiv BibTeX as authoritative source
+        "enabled": True,  # Enable ArXiv citation checker in hybrid checker
+    },
+
     # Processing Settings
     "processing": {
         "max_papers": 50,
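
The new block is reachable through the existing accessor; a sketch, assuming get_config() returns the merged DEFAULT_CONFIG dict as the import in arxiv_citation.py implies:

from refchecker.config.settings import get_config

arxiv_cfg = get_config()["arxiv_citation"]
print(arxiv_cfg["base_url"])          # https://arxiv.org/bibtex
print(arxiv_cfg["rate_limit_delay"])  # 3.0 (the shared ArXiv politeness delay)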
refchecker/llm/base.py
CHANGED
@@ -110,21 +110,7 @@ class LLMProvider(ABC):
 
         logger.debug(f"Created {len(chunks)} balanced overlapping chunks for parallel processing")
         return chunks
-
-    def _parse_llm_response(self, response_text: str) -> List[str]:
-        """Parse LLM response and extract individual references"""
-        if not response_text:
-            return []
-
-        # Split by newlines and filter out empty lines
-        references = []
-        for line in response_text.strip().split('\n'):
-            line = line.strip()
-            if line and not line.startswith('#') and len(line) > 10:  # Basic filtering
-                references.append(line)
-
-        return references
-
+
     def extract_references_with_chunking(self, bibliography_text: str) -> List[str]:
         """
         Template method that handles chunking for all providers.
refchecker/llm/providers.py
CHANGED
@@ -62,51 +62,25 @@ class LLMProviderMixin:
         """Create prompt for reference extraction"""
         # Clean BibTeX formatting before sending to LLM
         cleaned_bibliography = self._clean_bibtex_for_llm(bibliography_text)
-
-        return f"""
-…
-4. …
-…
-   - SPECIAL CASE for collaborations: Handle "Last, First and others" pattern correctly
-     * author = {"Khachatryan, Vardan and others"} → ONE explicit author plus et al: "Vardan Khachatryan*et al"
-     * author = {"Smith, John and others"} → ONE explicit author plus et al: "John Smith*et al"
-     * The "Last, First and others" pattern indicates a collaboration paper where only the first author is listed explicitly
-   - EXAMPLES:
-     * author = {"Dolan, Brian P."} → ONE author: "Dolan, Brian P."
-     * author = {"Smith, John and Doe, Jane"} → TWO authors: "Smith, John*Doe, Jane"
-     * author = {"Arnab, Anurag and Dehghani, Mostafa and Heigold, Georg"} → THREE authors: "Arnab, Anurag*Dehghani, Mostafa*Heigold, Georg"
-     * author = {"Khachatryan, Vardan and others"} → ONE explicit author plus et al: "Vardan Khachatryan*et al"
-   - Use asterisks (*) to separate individual authors in your output
-   - For "Last, First" format, convert to "First Last" for readability (e.g., "Smith, John" → "John Smith")
-   - If a BibTeX entry has NO author field, output an empty author field (nothing before the first #)
-   - Do NOT infer or guess authors based on title or context - only use what is explicitly stated
-7. CRITICAL: When extracting authors, preserve "et al" and similar indicators exactly as they appear
-   - If the original says "John Smith, Jane Doe, et al" then output "John Smith, Jane Doe, et al"
-   - If the original says "John Smith et al." then output "John Smith et al."
-   - Also preserve variations like "and others", "etc.", "..." when used to indicate additional authors
-   - Do NOT expand "et al" into individual author names, even if you know them
-8. Return ONLY the references, one per line
-9. Do not include reference numbers like [1], [2], etc. in your output
-10. Do not add any additional text or explanations
-11. Ensure that URLs and DOIs are from the specific reference only
-    - When extracting URLs, preserve the complete URL including protocol
-    - For BibTeX howpublished fields, extract the full URL from the field value
-12. When parsing multi-line references, combine all authors from all lines before the title
-13. CRITICAL: If the text contains no valid bibliographic references (e.g., only figures, appendix material, or explanatory text), simply return nothing - do NOT explain why you cannot extract references
+
+        return f"""OUTPUT FORMAT (MANDATORY):
+- Each line must be: Author1*Author2#Title#Venue#Year#URL
+- Use # between fields, * between authors
+- One reference per line
+- NO other text allowed - no explanations, descriptions, or commentary
+- If no valid references exist, return NOTHING (completely empty response)
+
+EXTRACTION RULES:
+1. Split by numbered markers [1], [2], etc. - references may span multiple lines
+2. Extract: authors, title, venue (journal/booktitle), year, URLs/DOIs
+3. For BibTeX: "title" field = paper title, "journal"/"booktitle" = venue
+4. Handle author formats:
+   - "Last, First and others" → "First Last*et al"
+   - "Last, First" → "First Last"
+   - Separate multiple authors with *
+   - Preserve "et al" exactly as written
+5. Skip entries that are only URLs without bibliographic data
+6. If no author field exists, start with # (empty author)
 
 Bibliography text:
 {cleaned_bibliography}
@@ -116,67 +90,120 @@ Bibliography text:
         """Parse LLM response into list of references"""
         if not content:
             return []
-
+
         # Ensure content is a string
         if not isinstance(content, str):
             content = str(content)
-
+
         # Clean the content - remove leading/trailing whitespace
         content = content.strip()
-
+
+        # Early check: if no # delimiters at all, likely all prose/explanatory text
+        if '#' not in content:
+            logger.warning("LLM response contains no structured references (no # delimiters found)")
+            return []
+
         # Split by double newlines first to handle paragraph-style formatting
         # then fall back to single newlines
         references = []
-
+
        # Try double newline splitting first (paragraph style)
         if '\n\n' in content:
             potential_refs = content.split('\n\n')
         else:
             # Fall back to single newline splitting
             potential_refs = content.split('\n')
-
+
+        import re
+
+        # Common prose patterns that indicate explanatory text
+        prose_starters = (
+            'this ', 'the ', 'i ', 'looking ', 'based on', 'it ',
+            'there ', 'these ', 'here ', 'note', 'please ', 'however',
+            'unfortunately', 'appears to', 'contains', 'following',
+            'above', 'below', 'after', 'before', 'when ', 'if ',
+            'as ', 'for ', 'from ', 'with ', 'without ', 'although'
+        )
+
         for ref in potential_refs:
             ref = ref.strip()
-
-            # Skip empty lines
+
+            # Skip empty lines
             if not ref:
                 continue
-
-
-            if ref.startswith('#'):
-                continue
-            if 'extracted from the bibliography' in ref.lower():
+
+            # Skip lines starting with # (markdown headers or empty author field without title)
+            if ref.startswith('#') and not re.match(r'^#[^#]', ref):
                 continue
-
+
+            # Check for prose/explanatory text patterns
+            ref_lower = ref.lower()
+
+            # Skip common explanatory headers
+            if ref_lower.startswith(('reference', 'here are', 'below are', 'extracted', 'bibliography')):
                 continue
+
             # Skip verbose LLM explanatory responses
-
+            skip_patterns = [
+                'extracted from the bibliography',
+                'formatted as a complete',
+                'cannot extract',
+                'appears to be from',
+                'no numbered reference markers',
+                'only figures',
+                'i cannot',
+                'i return nothing',
+                'return nothing',
+                'no valid bibliographic',
+                'numbered format specified',
+                'it contains',
+                'it does not contain',
+                'text appears to be',
+                'does not appear to contain',
+                'no references found',
+                'empty response',
+                'no bibliography',
+                'no actual bibliographic',
+                'no academic references',
+                'contains only numerical',
+                'data tables',
+                'evaluation rubric',
+                'publication metadata',
+                'citable sources',
+                'reference list',
+            ]
+            if any(pattern in ref_lower for pattern in skip_patterns):
                 continue
-
+
+            # Skip lines starting with common prose patterns
+            if ref_lower.startswith(prose_starters):
                 continue
-            if '…
+            if ref_lower.startswith('looking at'):
                 continue
-            if …
+            if ref_lower.startswith('since there are'):
                 continue
-
+
+            # Key structural check: valid references MUST have # delimiters
+            if '#' not in ref:
+                # No delimiter = not a valid reference, skip it
+                logger.debug(f"Skipping line without # delimiter: {ref[:80]}...")
                 continue
-
+
             # Remove common prefixes (bullets, numbers, etc.)
             ref = ref.lstrip('- *•')
             ref = ref.strip()
-
+
             # Remove reference numbers like "1.", "[1]", "(1)" from the beginning
-            import re
             ref = re.sub(r'^(\d+\.|\[\d+\]|\(\d+\))\s*', '', ref)
-
+
             # Filter out very short lines (likely not complete references)
-            if len(ref) > 30:  # …
+            if len(ref) > 30:  # Minimum length for academic references
                 references.append(ref)
-
+
         return references
 
 
-class OpenAIProvider(LLMProvider, LLMProviderMixin):
+class OpenAIProvider(LLMProviderMixin, LLMProvider):
     """OpenAI GPT provider for reference extraction"""
 
     def __init__(self, config: Dict[str, Any]):
@@ -197,10 +224,6 @@ class OpenAIProvider(LLMProvider, LLMProviderMixin):
     def extract_references(self, bibliography_text: str) -> List[str]:
         return self.extract_references_with_chunking(bibliography_text)
 
-    def _create_extraction_prompt(self, bibliography_text: str) -> str:
-        """Create prompt for reference extraction"""
-        return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
     def _call_llm(self, prompt: str) -> str:
         """Make the actual OpenAI API call and return the response text"""
         try:
@@ -220,7 +243,7 @@ class OpenAIProvider(LLMProvider, LLMProviderMixin):
             raise
 
 
-class AnthropicProvider(LLMProvider, LLMProviderMixin):
+class AnthropicProvider(LLMProviderMixin, LLMProvider):
     """Anthropic Claude provider for reference extraction"""
 
     def __init__(self, config: Dict[str, Any]):
@@ -241,10 +264,6 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
     def extract_references(self, bibliography_text: str) -> List[str]:
         return self.extract_references_with_chunking(bibliography_text)
 
-    def _create_extraction_prompt(self, bibliography_text: str) -> str:
-        """Create prompt for reference extraction"""
-        return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
     def _call_llm(self, prompt: str) -> str:
         """Make the actual Anthropic API call and return the response text"""
         try:
@@ -252,6 +271,7 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
                 model=self.model or "claude-sonnet-4-20250514",
                 max_tokens=self.max_tokens,
                 temperature=self.temperature,
+                system="You are a bibliographic reference extractor. You output ONLY structured reference data in the exact format specified. Never explain, describe, or comment on the input. Never output prose or sentences. If input contains no extractable references, return a completely empty response with no text.",
                 messages=[
                     {"role": "user", "content": prompt}
                 ]
@@ -281,7 +301,7 @@ class AnthropicProvider(LLMProvider, LLMProviderMixin):
             raise
 
 
-class GoogleProvider(LLMProvider, LLMProviderMixin):
+class GoogleProvider(LLMProviderMixin, LLMProvider):
     """Google Gemini provider for reference extraction"""
 
     def __init__(self, config: Dict[str, Any]):
@@ -303,10 +323,6 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
     def extract_references(self, bibliography_text: str) -> List[str]:
         return self.extract_references_with_chunking(bibliography_text)
 
-    def _create_extraction_prompt(self, bibliography_text: str) -> str:
-        """Create prompt for reference extraction"""
-        return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
     def _call_llm(self, prompt: str) -> str:
         """Make the actual Google API call and return the response text"""
         try:
@@ -341,7 +357,7 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
             raise
 
 
-class AzureProvider(LLMProvider, LLMProviderMixin):
+class AzureProvider(LLMProviderMixin, LLMProvider):
     """Azure OpenAI provider for reference extraction"""
 
     def __init__(self, config: Dict[str, Any]):
@@ -375,10 +391,6 @@ class AzureProvider(LLMProvider, LLMProviderMixin):
     def extract_references(self, bibliography_text: str) -> List[str]:
         return self.extract_references_with_chunking(bibliography_text)
 
-    def _create_extraction_prompt(self, bibliography_text: str) -> str:
-        """Create prompt for reference extraction"""
-        return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
     def _call_llm(self, prompt: str) -> str:
         """Make the actual Azure OpenAI API call and return the response text"""
         try:
@@ -397,7 +409,7 @@ class AzureProvider(LLMProvider, LLMProviderMixin):
             logger.error(f"Azure API call failed: {e}")
             raise
 
-class vLLMProvider(LLMProvider, LLMProviderMixin):
+class vLLMProvider(LLMProviderMixin, LLMProvider):
     """vLLM provider using OpenAI-compatible server mode for local Hugging Face models"""
 
     def __init__(self, config: Dict[str, Any]):
@@ -838,10 +850,6 @@ class vLLMProvider(LLMProvider, LLMProviderMixin):
     def extract_references(self, bibliography_text: str) -> List[str]:
        return self.extract_references_with_chunking(bibliography_text)
 
-    def _create_extraction_prompt(self, bibliography_text: str) -> str:
-        """Create prompt for reference extraction"""
-        return LLMProviderMixin._create_extraction_prompt(self, bibliography_text)
-
    def _call_llm(self, prompt: str) -> str:
         """Make the actual vLLM API call and return the response text"""
         try:
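
The rewritten prompt pins output to one Author1*Author2#Title#Venue#Year#URL line per reference, which is exactly what the stricter parser above keys on. A hypothetical splitter (not part of the package) shows how such a line decomposes:

def split_reference_line(line: str) -> dict:
    # Fields are #-separated; authors are *-separated within the first field.
    fields = (line.split('#') + [''] * 5)[:5]
    return {
        'authors': fields[0].split('*') if fields[0] else [],
        'title': fields[1],
        'venue': fields[2],
        'year': fields[3],
        'url': fields[4],
    }

line = "Ashish Vaswani*Noam Shazeer#Attention Is All You Need#arXiv#2017#https://arxiv.org/abs/1706.03762"
print(split_reference_line(line)['authors'])  # ['Ashish Vaswani', 'Noam Shazeer']

Lines without any # delimiter are rejected outright, which is why prose replies from the model never survive parsing.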
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared ArXiv Rate Limiter utility.
|
|
3
|
+
|
|
4
|
+
ArXiv requests a polite delay of 3 seconds between requests.
|
|
5
|
+
This module provides a centralized rate limiter to coordinate all ArXiv API calls
|
|
6
|
+
across different checkers and utilities.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
from refchecker.utils.arxiv_rate_limiter import ArXivRateLimiter
|
|
10
|
+
|
|
11
|
+
# Get the shared limiter instance
|
|
12
|
+
limiter = ArXivRateLimiter.get_instance()
|
|
13
|
+
|
|
14
|
+
# Wait for rate limit before making a request
|
|
15
|
+
limiter.wait()
|
|
16
|
+
|
|
17
|
+
# Then make your request
|
|
18
|
+
response = requests.get(arxiv_url)
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import time
|
|
22
|
+
import threading
|
|
23
|
+
import logging
|
|
24
|
+
from typing import Optional
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ArXivRateLimiter:
|
|
30
|
+
"""
|
|
31
|
+
Singleton rate limiter for ArXiv API requests.
|
|
32
|
+
|
|
33
|
+
ArXiv requests a minimum of 3 seconds between requests for polite access.
|
|
34
|
+
This class ensures all ArXiv API calls from any part of refchecker
|
|
35
|
+
are properly rate limited.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
_instance: Optional['ArXivRateLimiter'] = None
|
|
39
|
+
_lock = threading.Lock()
|
|
40
|
+
|
|
41
|
+
# ArXiv recommends at least 3 seconds between requests
|
|
42
|
+
DEFAULT_DELAY = 3.0
|
|
43
|
+
|
|
44
|
+
def __init__(self):
|
|
45
|
+
"""Initialize the rate limiter (use get_instance() instead of direct construction)."""
|
|
46
|
+
self._last_request_time: float = 0.0
|
|
47
|
+
self._request_lock = threading.Lock()
|
|
48
|
+
self._delay: float = self.DEFAULT_DELAY
|
|
49
|
+
|
|
50
|
+
@classmethod
|
|
51
|
+
def get_instance(cls) -> 'ArXivRateLimiter':
|
|
52
|
+
"""
|
|
53
|
+
Get the singleton instance of the ArXiv rate limiter.
|
|
54
|
+
|
|
55
|
+
Returns:
|
|
56
|
+
The shared ArXivRateLimiter instance
|
|
57
|
+
"""
|
|
58
|
+
if cls._instance is None:
|
|
59
|
+
with cls._lock:
|
|
60
|
+
# Double-check locking pattern
|
|
61
|
+
if cls._instance is None:
|
|
62
|
+
cls._instance = cls()
|
|
63
|
+
return cls._instance
|
|
64
|
+
|
|
65
|
+
@classmethod
|
|
66
|
+
def reset_instance(cls) -> None:
|
|
67
|
+
"""
|
|
68
|
+
Reset the singleton instance (primarily for testing).
|
|
69
|
+
"""
|
|
70
|
+
with cls._lock:
|
|
71
|
+
cls._instance = None
|
|
72
|
+
|
|
73
|
+
@property
|
|
74
|
+
def delay(self) -> float:
|
|
75
|
+
"""Get the current delay between requests in seconds."""
|
|
76
|
+
return self._delay
|
|
77
|
+
|
|
78
|
+
@delay.setter
|
|
79
|
+
def delay(self, value: float) -> None:
|
|
80
|
+
"""
|
|
81
|
+
Set the delay between requests.
|
|
82
|
+
|
|
83
|
+
Args:
|
|
84
|
+
value: Delay in seconds (minimum 0.5 seconds enforced)
|
|
85
|
+
"""
|
|
86
|
+
self._delay = max(0.5, value)
|
|
87
|
+
|
|
88
|
+
def wait(self) -> float:
|
|
89
|
+
"""
|
|
90
|
+
Wait for the rate limit before making a request.
|
|
91
|
+
|
|
92
|
+
This method blocks until the required time has passed since the last request.
|
|
93
|
+
It is thread-safe and can be called from multiple threads simultaneously.
|
|
94
|
+
|
|
95
|
+
Returns:
|
|
96
|
+
The actual time waited in seconds (0 if no wait was needed)
|
|
97
|
+
"""
|
|
98
|
+
with self._request_lock:
|
|
99
|
+
current_time = time.time()
|
|
100
|
+
time_since_last = current_time - self._last_request_time
|
|
101
|
+
|
|
102
|
+
if time_since_last < self._delay:
|
|
103
|
+
wait_time = self._delay - time_since_last
|
|
104
|
+
logger.debug(f"ArXiv rate limiter: waiting {wait_time:.2f}s")
|
|
105
|
+
time.sleep(wait_time)
|
|
106
|
+
else:
|
|
107
|
+
wait_time = 0.0
|
|
108
|
+
|
|
109
|
+
self._last_request_time = time.time()
|
|
110
|
+
return wait_time
|
|
111
|
+
|
|
112
|
+
def mark_request(self) -> None:
|
|
113
|
+
"""
|
|
114
|
+
Mark that a request was just made (without waiting).
|
|
115
|
+
|
|
116
|
+
Use this if you're managing timing externally but still want to
|
|
117
|
+
update the rate limiter's state.
|
|
118
|
+
"""
|
|
119
|
+
with self._request_lock:
|
|
120
|
+
self._last_request_time = time.time()
|
|
121
|
+
|
|
122
|
+
def time_until_next(self) -> float:
|
|
123
|
+
"""
|
|
124
|
+
Get the time remaining until the next request is allowed.
|
|
125
|
+
|
|
126
|
+
Returns:
|
|
127
|
+
Time in seconds until next request (0 if allowed now)
|
|
128
|
+
"""
|
|
129
|
+
with self._request_lock:
|
|
130
|
+
current_time = time.time()
|
|
131
|
+
time_since_last = current_time - self._last_request_time
|
|
132
|
+
remaining = self._delay - time_since_last
|
|
133
|
+
return max(0.0, remaining)
|
refchecker/utils/text_utils.py
CHANGED
@@ -1319,6 +1319,38 @@ def is_name_match(name1: str, name2: str) -> bool:
     # This handles both surname particle normalization effects and standard 3-part names
     def match_initials_with_names(init_parts, name_parts):
         """Helper function to match initials against full names"""
+        # Handle 4-part initials vs 2-part compound surname
+        # e.g., ['M.', 'V.', 'D.', 'Briel'] vs ['Menkes', 'van den Briel']
+        # where "van den" particles are treated as initials "V. D."
+        if len(init_parts) == 4 and len(name_parts) == 2:
+            # Check if first 3 parts are initials and last is surname
+            if (len(init_parts[0].rstrip('.')) == 1 and
+                    len(init_parts[1].rstrip('.')) == 1 and
+                    len(init_parts[2].rstrip('.')) == 1 and
+                    len(init_parts[3]) > 1 and
+                    len(name_parts[0]) > 1 and len(name_parts[1]) > 1):
+
+                first_initial = init_parts[0].rstrip('.')
+                second_initial = init_parts[1].rstrip('.')
+                third_initial = init_parts[2].rstrip('.')
+                last_name = init_parts[3]
+                first_name = name_parts[0]
+                compound_last = name_parts[1]
+
+                # Extract parts from compound lastname (e.g., "van den Briel" -> ["van", "den", "Briel"])
+                compound_parts = compound_last.split()
+                if len(compound_parts) >= 3:
+                    # compound_parts = ["van", "den", "Briel"]
+                    particle1 = compound_parts[0]
+                    particle2 = compound_parts[1]
+                    actual_last = compound_parts[-1]
+
+                    if (last_name == actual_last and
+                            first_initial == first_name[0] and
+                            second_initial == particle1[0] and
+                            third_initial == particle2[0]):
+                        return True
+
         if len(init_parts) == 3 and len(name_parts) == 2:
             # After surname particle normalization: ['g.', 'v.', 'horn'] vs ['grant', 'van horn']
             if (len(init_parts[0].rstrip('.')) == 1 and len(init_parts[1].rstrip('.')) == 1 and len(init_parts[2]) > 1 and
{academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/entry_points.txt
RENAMED
File without changes
{academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/licenses/LICENSE
RENAMED
File without changes
{academic_refchecker-2.0.11.dist-info → academic_refchecker-2.0.13.dist-info}/top_level.txt
RENAMED
File without changes