academic-refchecker 1.2.37__py3-none-any.whl → 1.2.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__version__.py CHANGED
@@ -1,3 +1,3 @@
  """Version information for RefChecker."""
 
- __version__ = "1.2.37"
+ __version__ = "1.2.39"
academic_refchecker-{1.2.37 → 1.2.39}.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: academic-refchecker
- Version: 1.2.37
+ Version: 1.2.39
  Summary: A comprehensive tool for validating reference accuracy in academic papers
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
  License-Expression: MIT
academic_refchecker-{1.2.37 → 1.2.39}.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
- __version__.py,sha256=rsFw2SftIDg9yKghlFCWIN2abJx55aqbjODKqOrszDE,65
- academic_refchecker-1.2.37.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
+ __version__.py,sha256=63hU3Q1fGBiJ1GUnUQ-V6-S8pbWZ7bug_ZVu4V6eo9g,65
+ academic_refchecker-1.2.39.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
  checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
  checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
  checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
@@ -15,7 +15,7 @@ config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
  core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
  core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
  core/parallel_processor.py,sha256=2S1cAPhtWH3glvtiJrt9JkZzk2iJkPKXsc-F3lg0X6U,16795
- core/refchecker.py,sha256=XI5yVa8KrVPEE8VTigG_G7K91SeGKxU0Uz8L8o6REu4,276733
+ core/refchecker.py,sha256=8EatAqYEDpW219Xrn-ql1oQ5ytmCU8RW8pMtlujRbC8,273167
  database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
  database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
  llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -26,20 +26,21 @@ scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,
  services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
  services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
  utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
- utils/arxiv_utils.py,sha256=HNmUg3mfvQDZOI8dO5T3n_NUaJ4UVluLcOx0A4Q6cbs,14757
+ utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
  utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
- utils/biblatex_parser.py,sha256=gfcwNa-DpLG5BCJ3yS7IXDybCxwQZjBFj0hAqUwsfLU,19536
+ utils/biblatex_parser.py,sha256=JiO_tznsemhmGFs-pDM2qGuDlvT1ArIyc6bmsdwDOPQ,20452
+ utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
  utils/bibtex_parser.py,sha256=jsQ87lkzmBmJO3VEN3itw22CJ1Hesei4IvM2sfsaFKI,12867
  utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
  utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
  utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
  utils/error_utils.py,sha256=2qdRM3Bv4GvE3mlXgXp9jiQBfvB08qeg8vTgNVivcgk,5706
  utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
- utils/text_utils.py,sha256=pscNw6EgxBZKSzcHjLErWUvWpnsowo8SBev8hbhMGBc,186581
+ utils/text_utils.py,sha256=8luQsOBfcEBv3O16d3LlQmCuoEB0dEF0aQWGey-s3us,190502
  utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
- utils/url_utils.py,sha256=qoimCrMFCBGvlmF_t1c6zSOmkWi_rUm-gZM0XZ4rEVE,6291
- academic_refchecker-1.2.37.dist-info/METADATA,sha256=sPihBUqydlGpu9kb9o--begd-_bvAwQmUXGAFSEZhRM,22298
- academic_refchecker-1.2.37.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- academic_refchecker-1.2.37.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
- academic_refchecker-1.2.37.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
- academic_refchecker-1.2.37.dist-info/RECORD,,
+ utils/url_utils.py,sha256=n0m5rXKV0-UrE8lI85VEx23KmfGwky57sI6gFPuu78I,7358
+ academic_refchecker-1.2.39.dist-info/METADATA,sha256=Uz4a9D0tfull6uDAZTafQJOem7p8IqPA6bjl_pYUf48,22298
+ academic_refchecker-1.2.39.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ academic_refchecker-1.2.39.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+ academic_refchecker-1.2.39.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+ academic_refchecker-1.2.39.dist-info/RECORD,,
core/refchecker.py CHANGED
@@ -451,47 +451,10 @@ class ArxivReferenceChecker:
 
  def extract_arxiv_id_from_url(self, url):
  """
- Extract ArXiv ID from a URL or text containing ArXiv reference
+ Extract ArXiv ID from a URL or text containing ArXiv reference.
+ Uses the common extraction function from utils.url_utils.
  """
- if not url:
- return None
-
- # First, check for arXiv: format (e.g., "arXiv:1610.10099" or "arXiv preprint arXiv:1610.10099")
- arxiv_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', url, re.IGNORECASE)
- if arxiv_match:
- arxiv_id = arxiv_match.group(1)
- # Remove version number if present
- arxiv_id = re.sub(r'v\d+$', '', arxiv_id)
- return arxiv_id
-
- # Remove version string from end if present (e.g., 'v1')
- url = re.sub(r'v\d+$', '', url)
-
- # Parse URL
- parsed_url = urlparse(url)
-
- # Check if it's an arxiv.org URL
- if 'arxiv.org' in parsed_url.netloc:
- # Extract ID from path
- path = parsed_url.path.strip('/')
-
- # Handle different URL formats
- if path.startswith('abs/'):
- arxiv_id = path.replace('abs/', '')
- elif path.startswith('pdf/'):
- arxiv_id = path.replace('pdf/', '').replace('.pdf', '')
- elif '/abs/' in path:
- arxiv_id = path.split('/abs/')[1]
- elif '/pdf/' in path:
- arxiv_id = path.split('/pdf/')[1].replace('.pdf', '')
- else:
- arxiv_id = path
-
- # Remove version number from the extracted ID
- arxiv_id = re.sub(r'v\d+$', '', arxiv_id)
- return arxiv_id
-
- return None
+ return extract_arxiv_id_from_url(url)
 
  def get_paper_metadata(self, arxiv_id):
  """
@@ -3581,11 +3544,9 @@ class ArxivReferenceChecker:
  # Clean author part and extract authors
  author_part_clean = strip_latex_commands(author_part).strip()
  if author_part_clean and not author_part_clean.startswith('\\'):
- # Parse author names - handle comma-separated list and "and"
- if ', and ' in author_part_clean:
- author_names = re.split(r', and |, ', author_part_clean)
- else:
- author_names = [name.strip() for name in author_part_clean.split(',')]
+ # Parse author names using the robust author parsing function
+ from utils.text_utils import parse_authors_with_initials
+ author_names = parse_authors_with_initials(author_part_clean)
 
  # Clean up author names
  authors = []
@@ -4264,8 +4225,17 @@ class ArxivReferenceChecker:
  return True
 
  # Also check if authors have significant overlap (at least 50% of the shorter author list)
- author1_parts = seg1['author'].split('*') if '*' in seg1['author'] else seg1['author'].split(',')
- author2_parts = seg2['author'].split('*') if '*' in seg2['author'] else seg2['author'].split(',')
+ from utils.text_utils import parse_authors_with_initials
+
+ if '*' in seg1['author']:
+ author1_parts = seg1['author'].split('*')
+ else:
+ author1_parts = parse_authors_with_initials(seg1['author'])
+
+ if '*' in seg2['author']:
+ author2_parts = seg2['author'].split('*')
+ else:
+ author2_parts = parse_authors_with_initials(seg2['author'])
 
  # Clean and normalize author names
  author1_clean = {a.strip().lower() for a in author1_parts if a.strip() and a.strip() not in ['et al', 'others']}
@@ -4780,55 +4750,6 @@ class ArxivReferenceChecker:
  }
 
 
- def _get_bibtex_content(self, paper):
- """
- Try to get BibTeX content for a paper from various sources.
-
- Args:
- paper: Paper object
-
- Returns:
- str: BibTeX content if found, None otherwise
- """
- # Try ArXiv source if it's an ArXiv paper
- from utils.arxiv_utils import extract_arxiv_id_from_paper, download_arxiv_source
-
- arxiv_id = extract_arxiv_id_from_paper(paper)
- if arxiv_id:
- logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
- tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
-
- # Prefer .bib files (most structured), then .bbl files
- if bib_content:
- logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
-
- # If we have LaTeX content, filter BibTeX by cited keys
- if tex_content:
- from utils.text_utils import extract_cited_keys_from_latex, filter_bibtex_by_cited_keys
- cited_keys = extract_cited_keys_from_latex(tex_content)
- if cited_keys:
- logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
- filtered_content = filter_bibtex_by_cited_keys(bib_content, cited_keys)
- return filtered_content
-
- return bib_content
-
- elif bbl_content:
- logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
- return bbl_content
-
- elif tex_content:
- # Check for embedded bibliography in LaTeX
- from utils.text_utils import detect_latex_bibliography_format
- latex_format = detect_latex_bibliography_format(tex_content)
- if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
- logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
- # Skip embedded bibliography and return None to trigger fallback methods
- return None
-
- # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
-
- return None
 
 
  def extract_bibliography(self, paper, debug_mode=False):
@@ -4843,7 +4764,8 @@ class ArxivReferenceChecker:
  logger.debug(f"Extracting bibliography for paper {paper_id}: {paper.title}")
 
  # Check if we can get BibTeX content for this paper (ArXiv or other sources)
- bibtex_content = self._get_bibtex_content(paper)
+ from utils.arxiv_utils import get_bibtex_content
+ bibtex_content = get_bibtex_content(paper)
  if bibtex_content:
  logger.debug(f"Found BibTeX content for {paper_id}, using structured bibliography")
 
@@ -4897,7 +4819,7 @@ class ArxivReferenceChecker:
  else:
  logger.warning("No LLM available for fallback, using original parsing results")
  else:
- logger.info(f"LaTeX parsing validation passed (quality: {validation['quality_score']:.2f})")
+ logger.debug(f"LaTeX parsing validation passed (quality: {validation['quality_score']:.2f})")
  else:
  # Parse BibTeX using the standard flow (LLM or regex based on config)
  references = self.parse_references(bibtex_content)
@@ -5458,7 +5380,7 @@ class ArxivReferenceChecker:
  error_details = unverified_errors[0].get('error_details', '')
  if error_details:
  subreason = self._categorize_unverified_reason(error_details)
- print(f" Subreason: {subreason}")
+ print(f" Subreason: {subreason}")
 
  year_str = self._format_year_string(reference.get('year'))
 
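For context on the author-overlap change above: a bare comma split breaks "Surname, Initial." author strings into roughly twice as many fragments as there are authors, which skews the 50%-overlap check. A small self-contained sketch of the failure mode; the grouping regex below is only an illustrative stand-in, not the package's parse_authors_with_initials:

    import re

    authors = "Hashimoto, K., Saoud, A., and Kishida, M."

    # Naive comma split: surname and initial end up as separate "authors".
    naive = [p.strip() for p in authors.split(',') if p.strip()]
    # ['Hashimoto', 'K.', 'Saoud', 'A.', 'and Kishida', 'M.']  -> 6 fragments for 3 authors

    # Initials-aware grouping keeps each "Surname, I." pair together.
    grouped = re.findall(r"[A-Z][\w'-]+,\s*(?:[A-Z]\.\s*)+", authors)
    # ['Hashimoto, K.', 'Saoud, A.', 'Kishida, M.']  -> one fragment per author

    print(naive)
    print([g.strip() for g in grouped])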
utils/arxiv_utils.py CHANGED
@@ -374,3 +374,79 @@ def reconstruct_bibtex_content(cited_entries, original_content):
  return '\n\n'.join(filtered_parts) + '\n'
 
 
+ def get_bibtex_content(paper):
+ """
+ Try to get BibTeX content for a paper from various sources.
+
+ Args:
+ paper: Paper object
+
+ Returns:
+ str: BibTeX content if found, None otherwise
+ """
+ import re
+
+ # Try ArXiv source if it's an ArXiv paper
+ arxiv_id = extract_arxiv_id_from_paper(paper)
+ if arxiv_id:
+ logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
+ tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
+
+ # Choose between .bib and .bbl files based on content richness
+ # Prioritize .bbl if it has more references than filtered .bib, otherwise prefer .bib
+ if bib_content and bbl_content:
+ # Count entries in both
+ bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
+ bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
+
+ # If we have LaTeX content, get filtered BibTeX count
+ filtered_bib_count = bib_entry_count
+ filtered_content = bib_content
+ if tex_content:
+ cited_keys = extract_cited_keys_from_tex({}, tex_content)
+ if cited_keys:
+ logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
+ filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
+ filtered_bib_count = len(re.findall(r'@\w+\s*\{', filtered_content))
+
+ logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, filtered .bib has {filtered_bib_count} entries")
+
+ # Prioritize .bbl if it has significantly more entries
+ if bbl_entry_count > filtered_bib_count * 1.5: # 50% more entries threshold
+ logger.info(f"Using .bbl files from ArXiv source")
+ return bbl_content
+ else:
+ logger.info(f"Using filtered .bib files")
+ return filtered_content
+
+ elif bib_content:
+ logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
+
+ # If we have LaTeX content, filter BibTeX by cited keys
+ if tex_content:
+ cited_keys = extract_cited_keys_from_tex({}, tex_content)
+ if cited_keys:
+ logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
+ filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
+ return filtered_content
+
+ return bib_content
+
+ elif bbl_content:
+ logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
+ return bbl_content
+
+ elif tex_content:
+ # Check for embedded bibliography in LaTeX
+ from utils.text_utils import detect_latex_bibliography_format
+ latex_format = detect_latex_bibliography_format(tex_content)
+ if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
+ logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
+ # Skip embedded bibliography and return None to trigger fallback methods
+ return None
+
+ # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
+
+ return None
+
+
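A minimal sketch of the .bib/.bbl selection arithmetic in get_bibtex_content above, using invented fragments in place of downloaded ArXiv source files; it only exercises the two entry-counting regexes and the 1.5x threshold from the diff:

    import re

    # Invented fragments standing in for downloaded ArXiv source files.
    bib_content = """
    @article{smith2020, title={A}, year={2020}}
    @inproceedings{lee2021, title={B}, year={2021}}
    """
    bbl_content = r"""
    \bibitem[Smith(2020)]{smith2020} A. Smith. A. 2020.
    \bibitem[Lee(2021)]{lee2021} B. Lee. B. 2021.
    \bibitem[Chen(2022)]{chen2022} C. Chen. C. 2022.
    \bibitem[Wu(2023)]{wu2023} D. Wu. D. 2023.
    """

    bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))    # 2
    bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))  # 4

    # Same 50%-more-entries threshold as above: prefer .bbl here.
    use_bbl = bbl_entry_count > bib_entry_count * 1.5
    print(bib_entry_count, bbl_entry_count, use_bbl)  # 2 4 True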
utils/biblatex_parser.py CHANGED
@@ -261,11 +261,13 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
  else:
  # If no quoted title, look for title after author names
  # Pattern: "FirstAuthor et al. Title Goes Here. Year." or "Author. Title. Year."
+ # Order matters: more specific patterns first
  title_patterns = [
- r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}', # "Author et al. Title. Year"
- r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})', # "Authors. Title. URL/arXiv/Year" (flexible spacing)
- r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}', # "Name, Name. Title. Year"
+ r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})', # "Authors. Title. URL/arXiv/Year" (flexible spacing) - MOST SPECIFIC
  r'\.([A-Z][A-Za-z\s]+(?:\?|!)?)\.?\s+\d{4}', # ".Title. Year" - for cases where authors end without space
+ r'[A-Z][a-z]+\.([A-Z][A-Za-z\s\-&]+?)\.\s+\d{4}', # "Name.Title. Year" - missing space after period
+ r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}', # "Author et al. Title. Year" - LESS SPECIFIC
+ r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}', # "Name, Name. Title. Year"
  r'\b([A-Z][A-Za-z\s\-0-9]+)\s+\.\s+https', # "Title . https" - handle space before period
  ]
 
@@ -391,10 +393,10 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
  # Fallback: split by common patterns if parse_authors_with_initials failed
  if not authors:
  if 'et al' in authors_text.lower():
- # Handle "FirstAuthor et al." case
+ # Handle "FirstAuthor et al." case - separate base author from "et al"
  base_author = authors_text.split(' et al')[0].strip()
  if base_author:
- authors = [base_author + ' et al']
+ authors = [base_author, 'et al']
  elif ' and ' in authors_text:
  # Handle "Author1 and Author2 and Author3" format
  author_parts = [p.strip() for p in authors_text.split(' and ')]
@@ -404,18 +406,29 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
  if part and len(part) > 2:
  authors.append(part)
  else:
- # Try comma separation for "Author1, Author2, Author3"
- author_parts = [p.strip() for p in authors_text.split(',')]
- authors = []
- for part in author_parts:
- part = part.strip(' .')
- # Remove "and" prefix if present
- if part.startswith('and '):
- part = part[4:].strip()
- # Skip parts that are too short or look like initials only
- if (part and len(part) > 2 and
- not re.search(r'\b(http|www|doi|arxiv|proceedings)\b', part.lower())):
- authors.append(part)
+ # Try sophisticated parsing one more time with relaxed constraints
+ try:
+ # Remove "and" connectors for cleaner parsing
+ clean_text = re.sub(r'\s+and\s+', ', ', authors_text)
+ fallback_authors = parse_authors_with_initials(clean_text)
+ if fallback_authors and len(fallback_authors) >= 1:
+ authors = fallback_authors
+ else:
+ raise ValueError("Fallback parsing failed")
+ except:
+ # Last resort: naive comma separation for "Author1, Author2, Author3"
+ # This should rarely be reached now
+ author_parts = [p.strip() for p in authors_text.split(',')]
+ authors = []
+ for part in author_parts:
+ part = part.strip(' .')
+ # Remove "and" prefix if present
+ if part.startswith('and '):
+ part = part[4:].strip()
+ # Skip parts that are too short or look like initials only
+ if (part and len(part) > 2 and
+ not re.search(r'\b(http|www|doi|arxiv|proceedings)\b', part.lower())):
+ authors.append(part)
 
  # 7. Extract journal/venue - look for patterns like "In: Conference" or remaining text
  journal_patterns = [
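A quick, self-contained check of the title pattern that is now tried first ("Authors. Title. URL/arXiv/Year"); the entry string is invented for illustration:

    import re

    entry = "J. Doe and M. Roe. Attention Is Not All You Need. 2021."

    # First (most specific) pattern from title_patterns above.
    pattern = r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})'
    m = re.search(pattern, entry)
    print(m.group(1) if m else None)  # Attention Is Not All You Need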
utils/bibliography_utils.py ADDED
@@ -0,0 +1,332 @@
+ #!/usr/bin/env python3
+ """
+ Bibliography extraction and parsing utilities.
+
+ This module provides utilities for extracting and parsing bibliographies from
+ academic papers in various formats (LaTeX, BibTeX, PDF text, etc.).
+ """
+
+ import re
+ import logging
+ import os
+
+ logger = logging.getLogger(__name__)
+
+
+ def extract_text_from_latex(latex_file_path):
+ """
+ Extract text from a LaTeX file
+
+ Args:
+ latex_file_path: Path to the LaTeX file
+
+ Returns:
+ String containing the LaTeX file content
+ """
+ try:
+ logger.info(f"Reading LaTeX file: {latex_file_path}")
+ with open(latex_file_path, 'r', encoding='utf-8') as f:
+ content = f.read()
+ logger.info(f"Successfully read LaTeX file with {len(content)} characters")
+ return content
+ except UnicodeDecodeError:
+ # Try with latin-1 encoding if utf-8 fails
+ try:
+ logger.warning(f"UTF-8 encoding failed for {latex_file_path}, trying latin-1")
+ with open(latex_file_path, 'r', encoding='latin-1') as f:
+ content = f.read()
+ logger.info(f"Successfully read LaTeX file with latin-1 encoding")
+ return content
+ except Exception as e:
+ logger.error(f"Failed to read LaTeX file {latex_file_path}: {e}")
+ return None
+ except Exception as e:
+ logger.error(f"Failed to read LaTeX file {latex_file_path}: {e}")
+ return None
+
+
+ def find_bibliography_section(text):
+ """
+ Find the bibliography section in the text
+ """
+ if not text:
+ logger.warning("No text provided to find_bibliography_section")
+ return None
+
+ # Log a sample of the text for debugging
+ text_sample = text[:500] + "..." if len(text) > 500 else text
+ logger.debug(f"Text sample: {text_sample}")
+
+ # Common section titles for bibliography
+ section_patterns = [
+ # Patterns for numbered sections with potential spacing issues from PDF extraction
+ r'(?i)\d+\s*ref\s*er\s*ences\s*\n', # "12 Refer ences" with spaces
+ r'(?i)\d+\s*references\s*\n', # "12References" or "12 References"
+ r'(?i)^\s*\d+\.\s*references\s*$', # Numbered section: "7. References"
+ r'(?i)\d+\s+references\s*\.', # "9 References." format used in Georgia Tech paper
+ # Standard reference patterns
+ r'(?i)references\s*\n',
+ r'(?i)bibliography\s*\n',
+ r'(?i)works cited\s*\n',
+ r'(?i)literature cited\s*\n',
+ r'(?i)references\s*$', # End of document
+ r'(?i)\[\s*references\s*\]', # [References]
+ r'(?i)^\s*references\s*$', # References as a standalone line
+ r'(?i)^\s*bibliography\s*$', # Bibliography as a standalone line
+ r'(?i)references\s*and\s*citations', # References and Citations
+ r'(?i)cited\s*references', # Cited References
+ r'(?i)reference\s*list', # Reference List
+ ]
+
+ bibliography_start = None
+ matched_pattern = None
+
+ for pattern in section_patterns:
+ matches = re.search(pattern, text, re.MULTILINE)
+ if matches:
+ bibliography_start = matches.end()
+ matched_pattern = pattern
+ logger.debug(f"Bibliography section found using pattern: {pattern}")
+ break
+
+ if bibliography_start is None:
+ logger.debug("No bibliography section header found, trying end-of-document approach")
+ # Try to find bibliography at the end of the document without explicit headers
+ lines = text.split('\n')
+ for i in range(len(lines) - 1, max(0, len(lines) - 100), -1): # Check last 100 lines
+ line = lines[i].strip()
+ if re.match(r'^\[\d+\]', line) or re.match(r'^\d+\.', line):
+ # Found what looks like reference entries
+ bibliography_start = text.rfind('\n'.join(lines[i:]))
+ logger.debug(f"Bibliography section found at end of document starting with: {line[:50]}")
+ break
+
+ if bibliography_start is not None:
+ bibliography_text = text[bibliography_start:].strip()
+ logger.debug(f"Bibliography text length: {len(bibliography_text)}")
+
+ # Optional: Try to find the end of the bibliography section
+ # This is challenging because it might go to the end of the document
+ # or be followed by appendices, acknowledgments, etc.
+
+ return bibliography_text
+
+ logger.debug("Bibliography section not found")
+ return None
+
+
+ def parse_references(bibliography_text):
+ """
+ Parse references from bibliography text using multiple parsing strategies.
+
+ Args:
+ bibliography_text: String containing bibliography content
+
+ Returns:
+ List of parsed reference dictionaries
+ """
+ if not bibliography_text:
+ logger.warning("No bibliography text provided to parse_references")
+ return []
+
+ # Try different parsing strategies in order of preference
+ parsing_strategies = [
+ ('BibTeX', _parse_bibtex_references),
+ ('biblatex', _parse_biblatex_references),
+ ('ACM/natbib', _parse_standard_acm_natbib_references),
+ ('regex-based', _parse_references_regex)
+ ]
+
+ for strategy_name, parse_func in parsing_strategies:
+ try:
+ logger.debug(f"Attempting {strategy_name} parsing")
+ references = parse_func(bibliography_text)
+ if references and len(references) > 0:
+ logger.info(f"Successfully parsed {len(references)} references using {strategy_name} format")
+ return references
+ else:
+ logger.debug(f"{strategy_name} parsing returned no references")
+ except Exception as e:
+ logger.debug(f"{strategy_name} parsing failed: {e}")
+ continue
+
+ logger.warning("All parsing strategies failed to extract references")
+ return []
+
+
+ def _parse_bibtex_references(bibliography_text):
+ """
+ Parse BibTeX formatted references like @inproceedings{...}, @article{...}, etc.
+
+ Args:
+ bibliography_text: String containing BibTeX entries
+
+ Returns:
+ List of reference dictionaries
+ """
+ from utils.bibtex_parser import parse_bibtex_entries
+ return parse_bibtex_entries(bibliography_text)
+
+
+ def _parse_biblatex_references(bibliography_text):
+ """
+ Parse biblatex formatted references like [1] Author. "Title". In: Venue. Year.
+
+ Args:
+ bibliography_text: String containing biblatex .bbl entries
+
+ Returns:
+ List of reference dictionaries
+ """
+ from utils.text_utils import extract_latex_references
+ return extract_latex_references(bibliography_text)
+
+
+ def _parse_standard_acm_natbib_references(bibliography_text):
+ """
+ Parse references using regex for standard ACM/natbib format (both ACM Reference Format and simple natbib)
+ """
+ from utils.text_utils import detect_standard_acm_natbib_format
+
+ references = []
+
+ # Check if this is standard ACM natbib format
+ format_info = detect_standard_acm_natbib_format(bibliography_text)
+ if format_info['is_acm_natbib']:
+ logger.debug("Detected standard ACM natbib format")
+
+ # Split by reference entries
+ ref_pattern = r'\[(\d+)\]\s*'
+ entries = re.split(ref_pattern, bibliography_text)[1:] # Skip first empty element
+
+ for i in range(0, len(entries), 2):
+ if i + 1 < len(entries):
+ ref_num = entries[i]
+ ref_content = entries[i + 1].strip()
+
+ try:
+ reference = _parse_simple_natbib_format(int(ref_num), ref_content, f"[{ref_num}]")
+ if reference:
+ references.append(reference)
+ logger.debug(f"Parsed reference {ref_num}: {reference.get('title', 'No title')[:50]}...")
+ except Exception as e:
+ logger.debug(f"Error parsing reference {ref_num}: {e}")
+ continue
+
+ logger.debug(f"ACM natbib parsing extracted {len(references)} references")
+
+ return references
+
+
+ def _parse_simple_natbib_format(ref_num, content, label):
+ """
+ Parse a simple natbib format reference entry.
+
+ Args:
+ ref_num: Reference number
+ content: Reference content text
+ label: Reference label (e.g., "[1]")
+
+ Returns:
+ Dictionary containing parsed reference information
+ """
+ from utils.text_utils import extract_url_from_reference, extract_year_from_reference
+
+ # Basic parsing - this could be enhanced with more sophisticated NLP
+ reference = {
+ 'raw_text': content,
+ 'label': label,
+ 'type': 'unknown'
+ }
+
+ # Try to extract basic information
+ # This is a simplified parser - real parsing would be much more complex
+
+ # Look for URL
+ url = extract_url_from_reference(content)
+ if url:
+ reference['url'] = url
+
+ # Look for year
+ year = extract_year_from_reference(content)
+ if year:
+ reference['year'] = year
+
+ # Try to identify the type based on content
+ content_lower = content.lower()
+ if 'proceedings' in content_lower or 'conference' in content_lower:
+ reference['type'] = 'inproceedings'
+ elif 'journal' in content_lower or 'trans.' in content_lower:
+ reference['type'] = 'article'
+ elif 'arxiv' in content_lower:
+ reference['type'] = 'misc'
+ reference['note'] = 'arXiv preprint'
+
+ return reference
+
+
+ def _parse_references_regex(bibliography_text):
+ """
+ Parse references using regex-based approach (original implementation)
+ """
+ references = []
+
+ # Split bibliography into individual references
+ # Look for patterns like [1], [2], etc.
+ ref_pattern = r'\[(\d+)\](.*?)(?=\[\d+\]|$)'
+ matches = re.findall(ref_pattern, bibliography_text, re.DOTALL)
+
+ for ref_num, ref_content in matches:
+ ref_content = ref_content.strip()
+ if not ref_content:
+ continue
+
+ reference = {
+ 'raw_text': ref_content,
+ 'label': f"[{ref_num}]",
+ 'type': 'unknown'
+ }
+
+ # Basic information extraction
+ from utils.text_utils import extract_url_from_reference, extract_year_from_reference
+
+ url = extract_url_from_reference(ref_content)
+ if url:
+ reference['url'] = url
+
+ year = extract_year_from_reference(ref_content)
+ if year:
+ reference['year'] = year
+
+ references.append(reference)
+
+ return references
+
+
+ def _is_bibtex_surname_given_format(surname_part, given_part):
+ """
+ Check if this appears to be a BibTeX "Surname, Given" format.
+
+ Args:
+ surname_part: The part before the comma
+ given_part: The part after the comma
+
+ Returns:
+ Boolean indicating if this looks like BibTeX name format
+ """
+ # Simple heuristics to detect BibTeX format
+ if not surname_part or not given_part:
+ return False
+
+ # Check if surname looks like a surname (capitalized, not too long)
+ if not re.match(r'^[A-Z][a-zA-Z\s\-\']+$', surname_part.strip()):
+ return False
+
+ # Check if given part looks like given names (often abbreviated)
+ given_clean = given_part.strip()
+ if re.match(r'^[A-Z](\.\s*[A-Z]\.?)*$', given_clean): # Like "J. R." or "M. K."
+ return True
+ if re.match(r'^[A-Z][a-z]+(\s+[A-Z][a-z]*)*$', given_clean): # Like "John Robert"
+ return True
+
+ return False
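As a small illustration of the regex-based fallback at the end of the new module, the entry-splitting pattern behaves like this on an invented two-entry bibliography; this sketch only reproduces the splitting step, not the URL/year extraction, which lives in utils.text_utils:

    import re

    bibliography_text = (
        "[1] A. Author. A Paper Title. In Proceedings of X, 2020.\n"
        "[2] B. Writer. Another Title. Journal of Y, 2021. https://example.org/paper"
    )

    # Same pattern as _parse_references_regex: number in brackets, then
    # everything up to the next bracketed number or end of text.
    matches = re.findall(r'\[(\d+)\](.*?)(?=\[\d+\]|$)', bibliography_text, re.DOTALL)
    for ref_num, ref_content in matches:
        print(f"[{ref_num}]", ref_content.strip())
    # [1] A. Author. A Paper Title. In Proceedings of X, 2020.
    # [2] B. Writer. Another Title. Journal of Y, 2021. https://example.org/paper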
utils/text_utils.py CHANGED
@@ -11,6 +11,31 @@ from typing import List
  logger = logging.getLogger(__name__)
 
 
+ def normalize_apostrophes(text):
+ """
+ Normalize all apostrophe variants to standard ASCII apostrophe
+ """
+ if not text:
+ return text
+
+ # All known apostrophe variants
+ apostrophe_variants = [
+ "'", # U+0027 ASCII apostrophe
+ "’", # U+2019 Right single quotation mark (most common)
+ "‘", # U+2018 Left single quotation mark
+ "ʼ", # U+02BC Modifier letter apostrophe
+ "ˈ", # U+02C8 Modifier letter vertical line (primary stress)
+ "`", # U+0060 Grave accent (sometimes used as apostrophe)
+ "´", # U+00B4 Acute accent (sometimes used as apostrophe)
+ ]
+
+ # Replace all variants with standard ASCII apostrophe
+ for variant in apostrophe_variants:
+ text = text.replace(variant, "'")
+
+ return text
+
+
  def normalize_text(text):
  """
  Normalize text by removing diacritical marks and special characters
@@ -18,6 +43,9 @@ def normalize_text(text):
  if not text:
  return ""
 
+ # First normalize apostrophes to standard form
+ text = normalize_apostrophes(text)
+
  # Replace common special characters with their ASCII equivalents
  replacements = {
  'ä': 'a', 'ö': 'o', 'ü': 'u', 'ß': 'ss',
@@ -29,7 +57,7 @@ def normalize_text(text):
  'Ł': 'L', 'ł': 'l',
  '¨': '', '´': '', '`': '', '^': '', '~': '',
  '–': '-', '—': '-', '−': '-',
- '„': '"', '“': '"', '”': '"', '‘': "'", '’': "'",
+ '„': '"', '“': '"', '”': '"',
  '«': '"', '»': '"',
  '¡': '!', '¿': '?',
  '°': 'degrees', '©': '(c)', '®': '(r)', '™': '(tm)',
@@ -39,10 +67,6 @@ def normalize_text(text):
  '\u00A0': ' ', # Non-breaking space
  '\u2013': '-', # En dash
  '\u2014': '-', # Em dash
- '\u2018': "'", # Left single quotation mark
- '\u2019': "'", # Right single quotation mark
- '\u201C': '"', # Left double quotation mark
- '\u201D': '"', # Right double quotation mark
  '\u2026': '...', # Horizontal ellipsis
  '\u00B7': '.', # Middle dot
  '\u2022': '.', # Bullet
@@ -54,8 +78,8 @@ def normalize_text(text):
  # Remove any remaining diacritical marks
  text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
 
- # Remove special characters
- text = re.sub(r'[^\w\s]', '', text)
+ # Remove special characters except apostrophes
+ text = re.sub(r"[^\w\s']", '', text)
 
  # Normalize whitespace
  text = re.sub(r'\s+', ' ', text).strip()
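A short, self-contained sketch of the apostrophe normalization introduced above; the variant list mirrors the one in normalize_apostrophes (minus the ASCII apostrophe itself), written here with Unicode escapes so the characters are unambiguous:

    def normalize_apostrophes(text):
        # Map common apostrophe look-alikes to the ASCII apostrophe (U+0027).
        variants = ["\u2019", "\u2018", "\u02BC", "\u02C8", "\u0060", "\u00B4"]
        for variant in variants:
            text = text.replace(variant, "'")
        return text

    print(normalize_apostrophes("D\u2019Amato"))   # D'Amato
    print(normalize_apostrophes("O\u0060Neill"))   # O'Neill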
@@ -94,6 +118,15 @@ def parse_authors_with_initials(authors_text):
  # by converting to "Haotian Liu and Chunyuan Li and Qingyang Wu"
  authors_text = re.sub(r'\s+', ' ', authors_text.strip())
 
+ # Special case: Handle single author followed by "et al" (e.g., "Mubashara Akhtar et al.")
+ # This should be split into ["Mubashara Akhtar", "et al"]
+ single_et_al_match = re.match(r'^(.+?)\s+et\s+al\.?$', authors_text, re.IGNORECASE)
+ if single_et_al_match:
+ base_author = single_et_al_match.group(1).strip()
+ if base_author and not ' and ' in base_author and not ',' in base_author:
+ # This is a simple "FirstName LastName et al" case
+ return [base_author, 'et al']
+
  # Check if this is a semicolon-separated format (e.g., "Hashimoto, K.; Saoud, A.; Kishida, M.")
  if ';' in authors_text:
  # Split by semicolons and handle the last part which might have "and"
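The new "et al" special case reduces to a single regex; a quick check of how it splits a bare "FirstName LastName et al." string, using the example from the comment above:

    import re

    authors_text = "Mubashara Akhtar et al."
    m = re.match(r'^(.+?)\s+et\s+al\.?$', authors_text, re.IGNORECASE)
    if m:
        print([m.group(1).strip(), 'et al'])  # ['Mubashara Akhtar', 'et al']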
@@ -359,6 +392,9 @@ def clean_author_name(author):
  # Normalize Unicode characters (e.g., combining diacritics)
  author = unicodedata.normalize('NFKC', author)
 
+ # Normalize apostrophes first before other processing
+ author = normalize_apostrophes(author)
+
  # Handle common Unicode escape sequences and LaTeX encodings
  # Note: Order matters - process longer patterns first
  unicode_replacements = [
@@ -518,31 +554,19 @@ def clean_title(title):
 
  def extract_arxiv_id_from_url(url):
  """
- Extract ArXiv ID from URL
+ Extract ArXiv ID from URL or text containing ArXiv reference.
+
+ This function is deprecated. Use utils.url_utils.extract_arxiv_id_from_url instead.
+ Kept for backwards compatibility.
 
  Args:
- url: URL string
+ url: URL string or text containing arXiv reference
 
  Returns:
  ArXiv ID or None if not found
  """
- if not isinstance(url, str):
- return None
-
- # Common ArXiv URL patterns
- patterns = [
- r'arxiv\.org/abs/(\d+\.\d+(?:v\d+)?)',
- r'arxiv\.org/pdf/(\d+\.\d+(?:v\d+)?)',
- r'arxiv:(\d+\.\d+(?:v\d+)?)',
- r'arXiv:(\d+\.\d+(?:v\d+)?)'
- ]
-
- for pattern in patterns:
- match = re.search(pattern, url, re.IGNORECASE)
- if match:
- return match.group(1)
-
- return None
+ from utils.url_utils import extract_arxiv_id_from_url as common_extract
+ return common_extract(url)
 
  def extract_year_from_text(text):
  """
@@ -706,8 +730,12 @@ def normalize_diacritics(text: str) -> str:
  'José' -> 'jose'
  'Łukasz' -> 'lukasz'
  'J. Gl¨ uck' -> 'J. Gluck'
+ 'D’Amato' -> 'D'Amato' (apostrophes normalized)
  """
- # First handle special characters that don't decompose properly
+ # First normalize apostrophes
+ text = normalize_apostrophes(text)
+
+ # Then handle special characters that don't decompose properly
  # Including common transliterations
  special_chars = {
  'ł': 'l', 'Ł': 'L',
@@ -847,6 +875,10 @@ def is_name_match(name1: str, name2: str) -> bool:
  name1_primary = normalize_diacritics(name1.strip().lower())
  name2_primary = normalize_diacritics(name2.strip().lower())
 
+ # Remove trailing periods that are not part of initials (e.g., "J. L. D'Amato." -> "J. L. D'Amato")
+ name1_primary = re.sub(r'\.+$', '', name1_primary)
+ name2_primary = re.sub(r'\.+$', '', name2_primary)
+
  # Handle spacing variations around periods: "F.Last" vs "F. Last"
  name1_normalized = re.sub(r'\.([A-Za-z])', r'. \1', name1_primary)
  name2_normalized = re.sub(r'\.([A-Za-z])', r'. \1', name2_primary)
@@ -859,6 +891,10 @@ def is_name_match(name1: str, name2: str) -> bool:
  name1_alt = normalize_diacritics_simple(name1.strip().lower())
  name2_alt = normalize_diacritics_simple(name2.strip().lower())
 
+ # Remove trailing periods for alternative normalization too
+ name1_alt = re.sub(r'\.+$', '', name1_alt)
+ name2_alt = re.sub(r'\.+$', '', name2_alt)
+
  name1_alt_norm = re.sub(r'\.([A-Za-z])', r'. \1', name1_alt)
  name2_alt_norm = re.sub(r'\.([A-Za-z])', r'. \1', name2_alt)
 
@@ -2219,7 +2255,8 @@ def format_author_for_display(author_name):
  if not author_name:
  return author_name
 
- author_name = author_name.strip()
+ # Normalize apostrophes for consistent display
+ author_name = normalize_apostrophes(author_name.strip())
 
  # Check if it's in "Lastname, Firstname" format
  if ',' in author_name:
@@ -2866,8 +2903,9 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
  author_part_clean = strip_latex_commands(author_part).strip()
 
  # Simple fix: just improve the organization detection without complex parsing
- # Remove year pattern first
+ # Remove year pattern first - handle both parenthetical and standalone years
  author_text_clean = re.sub(r'\s*\(\d{4}\)\.?$', '', author_part_clean).strip()
+ author_text_clean = re.sub(r'\s+\d{4}\.?$', '', author_text_clean).strip()
 
  # Better organization detection - check if it looks like multiple authors
  is_multi_author = (
@@ -2889,24 +2927,41 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
  author = re.sub(r'^and\s+', '', author.strip())
  # Remove trailing periods that shouldn't be there
  author = clean_author_name(author)
- # Skip all "et al" variants for LaTeX bibliographies
- if author.lower() not in ['et al', 'et al.', 'et~al', 'et~al.', 'al., et', 'others', 'and others']:
+ # Preserve "et al" variants to enable proper author count handling
+ if author.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'al., et', 'others', 'and others']:
+ cleaned_authors.append('et al') # Normalize to standard form
+ else:
  cleaned_authors.append(author)
  if cleaned_authors:
  ref['authors'] = cleaned_authors
  else:
- # Fallback: simple comma split
+ # Fallback: try once more with semicolon handling, then simple comma split
  simple_authors = []
- for a in author_text_clean.split(','):
- a = a.strip()
- # Remove "and" prefix and skip short/empty entries
- a = re.sub(r'^and\s+', '', a)
- # Clean author name (remove unnecessary periods)
- a = clean_author_name(a)
- if a and len(a) > 2:
- # Skip all "et al" variants for LaTeX bibliographies
- if a.lower() not in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
- simple_authors.append(a)
+ try:
+ # Try parsing again with normalized separators
+ normalized_text = re.sub(r';\s*and\s+', ', ', author_text_clean)
+ fallback_authors = parse_authors_with_initials(normalized_text)
+ if fallback_authors and len(fallback_authors) >= 2:
+ simple_authors = fallback_authors
+ else:
+ raise ValueError("Fallback parsing failed")
+ except:
+ # Last resort: naive comma split
+ for a in author_text_clean.split(','):
+ a = a.strip()
+ # Remove "and" prefix and skip short/empty entries
+ a = re.sub(r'^and\s+', '', a)
+ # Clean author name (remove unnecessary periods)
+ a = clean_author_name(a)
+ if a and len(a) > 2:
+ # Preserve "et al" variants to enable proper author count handling
+ if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+ simple_authors.append('et al') # Normalize to standard form
+ else:
+ simple_authors.append(a)
+ elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+ simple_authors.append('et al') # Handle short "et al" variants
+
  if simple_authors:
  ref['authors'] = simple_authors
  except Exception:
@@ -2919,9 +2974,13 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
  # Clean author name (remove unnecessary periods)
  a = clean_author_name(a)
  if a and len(a) > 2:
- # Skip all "et al" variants for LaTeX bibliographies
- if a.lower() not in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+ # Preserve "et al" variants to enable proper author count handling
+ if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+ simple_authors.append('et al') # Normalize to standard form
+ else:
  simple_authors.append(a)
+ elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+ simple_authors.append('et al') # Handle short "et al" variants
  if simple_authors:
  ref['authors'] = simple_authors
  else:
@@ -3716,6 +3775,7 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
  for abbrev in sorted(multi_word_abbrevs.keys(), key=len, reverse=True):
  if abbrev in expanded_text:
  expanded_text = expanded_text.replace(abbrev, multi_word_abbrevs[abbrev])
+ break # Only apply the first (longest) matching abbreviation to avoid conflicts
 
  # Second pass: handle single word abbreviations
  words = expanded_text.split()
@@ -4110,8 +4170,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
  return False
 
  # Order-aware fuzzy matching - words should match in sequence
- words1_list = list(words1)
- words2_list = list(words2)
+ # Sort to ensure deterministic order (set iteration is not guaranteed to be consistent)
+ words1_list = sorted(list(words1))
+ words2_list = sorted(list(words2))
 
  # If word counts are very different, they're likely different venues
  if len(words1) > 0 and len(words2) > 0:
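The sorted() change above is only about determinism; a two-line illustration with an invented word set:

    words = {"proc", "intl", "conf", "learning"}
    print(sorted(words))  # ['conf', 'intl', 'learning', 'proc'] -- stable across runs, unlike raw set iteration order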
utils/url_utils.py CHANGED
@@ -33,26 +33,43 @@ def construct_doi_url(doi: str) -> str:
 
  def extract_arxiv_id_from_url(url: str) -> Optional[str]:
  """
- Extract ArXiv ID from an ArXiv URL.
+ Extract ArXiv ID from an ArXiv URL or text containing ArXiv reference.
+
+ This is the common function that handles all ArXiv ID extraction patterns:
+ - URLs: https://arxiv.org/abs/1234.5678, https://arxiv.org/pdf/1234.5678.pdf, https://arxiv.org/html/1234.5678
+ - Text references: arXiv:1234.5678, arXiv preprint arXiv:1234.5678
+ - Version handling: removes version numbers (v1, v2, etc.)
 
  Args:
- url: ArXiv URL (abs or pdf)
+ url: ArXiv URL or text containing ArXiv reference
 
  Returns:
- ArXiv ID if found, None otherwise
+ ArXiv ID (without version) if found, None otherwise
  """
- if not url:
+ if not url or not isinstance(url, str):
  return None
 
- # Use the more comprehensive regex from text_utils.py
- arxiv_match = re.search(r'arxiv\.org/(?:abs|pdf)/([^\s/?#]+?)(?:\.pdf|v\d+)?(?:[?\#]|$)', url)
- if arxiv_match:
- return arxiv_match.group(1)
-
- # Fallback to simpler regex for edge cases
- fallback_match = re.search(r'arxiv\.org/(?:abs|pdf)/([^/?#]+)', url)
+ # Pattern 1: arXiv: format (e.g., "arXiv:1610.10099" or "arXiv preprint arXiv:1610.10099")
+ arxiv_text_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', url, re.IGNORECASE)
+ if arxiv_text_match:
+ arxiv_id = arxiv_text_match.group(1)
+ # Remove version number if present
+ return re.sub(r'v\d+$', '', arxiv_id)
+
+ # Pattern 2: arxiv.org URLs (abs, pdf, html)
+ # Handle URLs with version numbers and various formats
+ arxiv_url_match = re.search(r'arxiv\.org/(?:abs|pdf|html)/([^\s/?#]+?)(?:\.pdf|v\d+)?(?:[?\#]|$)', url, re.IGNORECASE)
+ if arxiv_url_match:
+ arxiv_id = arxiv_url_match.group(1)
+ # Remove version number if present
+ return re.sub(r'v\d+$', '', arxiv_id)
+
+ # Pattern 3: Fallback for simpler URL patterns
+ fallback_match = re.search(r'arxiv\.org/(?:abs|pdf|html)/([^/?#]+)', url, re.IGNORECASE)
  if fallback_match:
- return fallback_match.group(1).replace('.pdf', '')
+ arxiv_id = fallback_match.group(1).replace('.pdf', '')
+ # Remove version number if present
+ return re.sub(r'v\d+$', '', arxiv_id)
 
  return None
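A condensed, self-contained restatement of the three patterns above, useful for sanity-checking expected outputs; it mirrors the logic but is not the module itself, and the example inputs are made up:

    import re

    def extract_arxiv_id(text):
        # Pattern 1: textual "arXiv:ID" references.
        m = re.search(r'arXiv:(\d{4}\.\d{4,5})', text, re.IGNORECASE)
        if not m:
            # Patterns 2/3: arxiv.org abs/pdf/html URLs, with optional .pdf or version suffix.
            m = re.search(r'arxiv\.org/(?:abs|pdf|html)/([^\s/?#]+?)(?:\.pdf|v\d+)?(?:[?#]|$)', text, re.IGNORECASE)
        return re.sub(r'v\d+$', '', m.group(1)) if m else None

    print(extract_arxiv_id("arXiv preprint arXiv:1610.10099v2"))       # 1610.10099
    print(extract_arxiv_id("https://arxiv.org/pdf/2303.08774v3.pdf"))  # 2303.08774
    print(extract_arxiv_id("https://example.com/not-arxiv"))           # None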