academic-refchecker 1.2.65__py3-none-any.whl → 1.2.66__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only.
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/METADATA +72 -7
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/RECORD +28 -18
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/entry_points.txt +1 -0
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/top_level.txt +1 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +56 -0
- backend/concurrency.py +100 -0
- backend/database.py +686 -0
- backend/main.py +1266 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__version__.py +2 -2
- refchecker/checkers/crossref.py +15 -6
- refchecker/checkers/enhanced_hybrid_checker.py +18 -4
- refchecker/checkers/local_semantic_scholar.py +2 -2
- refchecker/checkers/openalex.py +15 -6
- refchecker/checkers/semantic_scholar.py +15 -6
- refchecker/core/refchecker.py +17 -6
- refchecker/utils/__init__.py +2 -1
- refchecker/utils/arxiv_utils.py +18 -60
- refchecker/utils/doi_utils.py +32 -1
- refchecker/utils/error_utils.py +20 -9
- refchecker/utils/text_utils.py +143 -27
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/licenses/LICENSE +0 -0
@@ -37,9 +37,9 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from refchecker.utils.doi_utils import extract_doi_from_url, compare_dois, construct_doi_url
 from refchecker.utils.error_utils import create_author_error, create_year_warning, create_doi_error
-from refchecker.utils.text_utils import normalize_author_name, normalize_paper_title, is_name_match, compare_authors, calculate_title_similarity
+from refchecker.utils.text_utils import normalize_author_name, normalize_paper_title, is_name_match, compare_authors, calculate_title_similarity
+from refchecker.utils.url_utils import extract_arxiv_id_from_url, get_best_available_url
 from refchecker.utils.db_utils import process_semantic_scholar_result, process_semantic_scholar_results
-from refchecker.utils.url_utils import get_best_available_url
 from refchecker.config.settings import get_config
 
 # Set up logging
refchecker/checkers/openalex.py
CHANGED

@@ -460,13 +460,22 @@ class OpenAlexReferenceChecker:
 
         if doi and work_doi:
             # Compare DOIs using the proper comparison function
-            from refchecker.utils.doi_utils import compare_dois
+            from refchecker.utils.doi_utils import compare_dois, validate_doi_resolves
             if not compare_dois(doi, work_doi):
-                errors.append({
-                    'error_type': 'doi',
-                    'error_details': format_doi_mismatch(doi, work_doi),
-                    'ref_doi_correct': work_doi
-                })
+                # If cited DOI resolves, it's likely a valid alternate DOI (e.g., arXiv vs conference)
+                # Treat as warning instead of error
+                if validate_doi_resolves(doi):
+                    errors.append({
+                        'warning_type': 'doi',
+                        'warning_details': format_doi_mismatch(doi, work_doi),
+                        'ref_doi_correct': work_doi
+                    })
+                else:
+                    errors.append({
+                        'error_type': 'doi',
+                        'error_details': format_doi_mismatch(doi, work_doi),
+                        'ref_doi_correct': work_doi
+                    })
 
         # Extract URL from work data
         work_url = self.extract_url_from_work(work_data)
@@ -612,14 +612,23 @@ class NonArxivReferenceChecker:
                     paper_doi = external_ids['DOI']
 
                     # Compare DOIs using the proper comparison function
-                    from refchecker.utils.doi_utils import compare_dois
+                    from refchecker.utils.doi_utils import compare_dois, validate_doi_resolves
                     if doi and paper_doi and not compare_dois(doi, paper_doi):
                         from refchecker.utils.error_utils import format_doi_mismatch
-                        errors.append({
-                            'error_type': 'doi',
-                            'error_details': format_doi_mismatch(doi, paper_doi),
-                            'ref_doi_correct': paper_doi
-                        })
+                        # If cited DOI resolves, it's likely a valid alternate DOI (e.g., arXiv vs conference)
+                        # Treat as warning instead of error
+                        if validate_doi_resolves(doi):
+                            errors.append({
+                                'warning_type': 'doi',
+                                'warning_details': format_doi_mismatch(doi, paper_doi),
+                                'ref_doi_correct': paper_doi
+                            })
+                        else:
+                            errors.append({
+                                'error_type': 'doi',
+                                'error_details': format_doi_mismatch(doi, paper_doi),
+                                'ref_doi_correct': paper_doi
+                            })
 
                     # Extract URL from paper data - prioritize arXiv URLs when available
                     paper_url = None
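Each checker applies the same resolve-then-classify pattern. A minimal self-contained sketch of that logic — `validate_doi_resolves` is stubbed here, `format_doi_mismatch` is replaced by an inline message, and the DOIs are made up; only the dict shapes follow the diff:

```python
# Sketch of the warning-vs-error classification introduced in this release.
# The real validate_doi_resolves (see refchecker/utils/doi_utils.py below)
# performs an HTTP HEAD against doi.org.
def validate_doi_resolves(doi: str) -> bool:
    return doi.startswith("10.")  # stub for illustration only

def classify_doi_mismatch(cited_doi: str, correct_doi: str) -> dict:
    details = f"DOI mismatch: cited {cited_doi}, correct {correct_doi}"
    if validate_doi_resolves(cited_doi):
        # A resolving DOI is likely a valid alternate (e.g., arXiv vs conference)
        return {'warning_type': 'doi', 'warning_details': details,
                'ref_doi_correct': correct_doi}
    return {'error_type': 'doi', 'error_details': details,
            'ref_doi_correct': correct_doi}

# Made-up DOIs, purely illustrative
print(classify_doi_mismatch("10.48550/arXiv.2301.00001", "10.1145/1234567"))
```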
refchecker/core/refchecker.py
CHANGED

@@ -46,12 +46,13 @@ import json
 import random
 from refchecker.checkers.local_semantic_scholar import LocalNonArxivReferenceChecker
 from refchecker.utils.text_utils import (clean_author_name, clean_title, clean_title_basic,
-                                         extract_arxiv_id_from_url, normalize_text as common_normalize_text,
+                                         normalize_text as common_normalize_text,
                                          detect_latex_bibliography_format, extract_latex_references,
                                          detect_standard_acm_natbib_format, strip_latex_commands,
                                          format_corrected_reference, is_name_match, enhanced_name_match,
                                          calculate_title_similarity, normalize_arxiv_url, deduplicate_urls,
                                          compare_authors)
+from refchecker.utils.url_utils import extract_arxiv_id_from_url
 from refchecker.utils.config_validator import ConfigValidator
 from refchecker.services.pdf_processor import PDFProcessor
 from refchecker.checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker

@@ -1963,11 +1964,21 @@ class ArxivReferenceChecker:
                     if not actual_doi_normalized.startswith(cited_doi_normalized.rstrip('.')):
                         logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
                         from refchecker.utils.error_utils import format_doi_mismatch
-                        errors.append({
-                            'error_type': 'doi',
-                            'error_details': format_doi_mismatch(doi, external_ids['DOI']),
-                            'ref_doi_correct': external_ids['DOI']
-                        })
+                        from refchecker.utils.doi_utils import validate_doi_resolves
+                        # If cited DOI resolves, it's likely a valid alternate DOI (e.g., arXiv vs conference)
+                        # Treat as warning instead of error
+                        if validate_doi_resolves(doi):
+                            errors.append({
+                                'warning_type': 'doi',
+                                'warning_details': format_doi_mismatch(doi, external_ids['DOI']),
+                                'ref_doi_correct': external_ids['DOI']
+                            })
+                        else:
+                            errors.append({
+                                'error_type': 'doi',
+                                'error_details': format_doi_mismatch(doi, external_ids['DOI']),
+                                'ref_doi_correct': external_ids['DOI']
+                            })
                     else:
                         logger.debug(f"DB Verification: DOI partial match - cited: {doi}, actual: {external_ids['DOI']} (acceptable)")
 
refchecker/utils/__init__.py
CHANGED

@@ -4,9 +4,10 @@ Utility functions for text processing, author comparison, mocking, and configura
 
 from .text_utils import (
     clean_author_name, clean_title, normalize_text,
-    extract_arxiv_id_from_url, clean_conference_markers_from_title,
+    clean_conference_markers_from_title,
    remove_year_from_title
 )
+from .url_utils import extract_arxiv_id_from_url
 from .author_utils import compare_authors, levenshtein_distance, extract_authors_list
 from .mock_objects import (
     MockPaper, MockReference, MockLLMProvider, MockSemanticScholarAPI, MockArxivAPI,
refchecker/utils/arxiv_utils.py
CHANGED

@@ -422,6 +422,11 @@ def get_bibtex_content(paper):
     """
     Try to get BibTeX content for a paper from various sources.
 
+    For ArXiv papers, only use .bbl files (compiled bibliography).
+    The .bbl file contains only the actually-cited references, while .bib files
+    are unreliable - they may contain entire bibliography databases (e.g., full
+    ACL Anthology with 80k+ entries) or unfiltered reference collections.
+
     Args:
         paper: Paper object
 

@@ -433,71 +438,24 @@ def get_bibtex_content(paper):
     # Try ArXiv source if it's an ArXiv paper
     arxiv_id = extract_arxiv_id_from_paper(paper)
     if arxiv_id:
-        logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for
+        logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for .bbl bibliography")
         tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
 
-        #
-
-        uses_bibtex = False
-        if tex_content:
-            # Look for \bibliography{...} commands in the main TeX file
-            bib_pattern = r'\\bibliography\{([^}]+)\}'
-            bib_matches = re.findall(bib_pattern, tex_content)
-            if bib_matches:
-                uses_bibtex = True
-                referenced_bibs = []
-                for match in bib_matches:
-                    bib_names = [name.strip() for name in match.split(',')]
-                    referenced_bibs.extend(bib_names)
-                logger.debug(f"Main TeX file references BibTeX files: {referenced_bibs}")
-
-        if bib_content and bbl_content:
-            # Count entries in both for logging
-            bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
+        # Only use .bbl files for ArXiv papers (.bib files are unreliable)
+        if bbl_content:
             bbl_entry_count = len(re.findall(r'\\bibitem[\[\{]', bbl_content))
-
-
-
-            # IMPORTANT: Prefer .bbl when .bib is excessively large (e.g., includes full ACL Anthology)
-            # The .bbl file contains only the actually-cited references, while .bib may contain
-            # entire bibliography databases. Parsing 80k+ entries would cause the tool to hang.
-            # Use .bbl if: (1) .bbl has entries AND (2) .bib has >10x more entries than .bbl OR >1000 entries
-            excessive_bib = bib_entry_count > 1000 or (bbl_entry_count > 0 and bib_entry_count > bbl_entry_count * 10)
-
-            if bbl_entry_count > 0 and excessive_bib:
-                logger.info(f"Using .bbl files from ArXiv source (.bib has {bib_entry_count} entries which is excessive, .bbl has {bbl_entry_count})")
-                return bbl_content
-            elif uses_bibtex and bib_entry_count > 0 and not excessive_bib:
-                logger.info(f"Using .bib files from ArXiv source (main TeX uses \\bibliography{{...}})")
-                return bib_content
-            elif bbl_entry_count > 0:
-                logger.info(f"Using .bbl files from ArXiv source (main TeX doesn't use \\bibliography or .bib is empty)")
+            if bbl_entry_count > 0:
+                logger.info(f"Using .bbl files from ArXiv source ({bbl_entry_count} entries)")
                 return bbl_content
-            elif bib_entry_count > 0:
-                logger.info(f"Using .bib files from ArXiv source (.bbl file is empty)")
-                return bib_content
             else:
-                logger.
-
-
-
-
-
-
-
-                logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
-                return bbl_content
-
-        elif tex_content:
-            # Check for embedded bibliography in LaTeX
-            from refchecker.utils.text_utils import detect_latex_bibliography_format
-            latex_format = detect_latex_bibliography_format(tex_content)
-            if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
-                logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
-                # Skip embedded bibliography and return None to trigger fallback methods
-                return None
-
-        # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
+                logger.debug(f"Found .bbl file but it appears empty")
+
+        # No .bbl available - return None to trigger PDF fallback
+        if bib_content:
+            bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
+            logger.debug(f"Skipping .bib file ({bib_entry_count} entries) - unreliable, falling back to PDF extraction")
+
+        logger.debug(f"No usable .bbl file found for ArXiv paper {arxiv_id}")
 
     return None
 
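The .bbl gate boils down to a single regex count. A runnable sketch with an illustrative .bbl fragment (the regex is taken from the diff; the sample text is not):

```python
import re

# Illustrative .bbl fragment with two compiled entries
bbl_content = r"""
\bibitem[Smith et~al.(2020)]{smith2020}
J. Smith et~al. \newblock A Paper Title. \newblock In Proc. of X, 2020.

\bibitem{jones2019}
A. Jones. \newblock Another Title. \newblock Journal Y, 2019.
"""

# The same count that gates the .bbl path in get_bibtex_content
bbl_entry_count = len(re.findall(r'\\bibitem[\[\{]', bbl_content))
print(bbl_entry_count)  # 2 -> non-empty, so the .bbl is used;
                        # with 0 entries the function falls back to PDF extraction
```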
refchecker/utils/doi_utils.py
CHANGED

@@ -156,4 +156,35 @@ def construct_doi_url(doi: str) -> str:
     normalized_doi = normalize_doi(doi)
 
     # Construct URL
-    return f"https://doi.org/{normalized_doi}"
+    return f"https://doi.org/{normalized_doi}"
+
+
+def validate_doi_resolves(doi: str, timeout: float = 5.0) -> bool:
+    """
+    Validate that a DOI resolves by checking if doi.org returns a redirect.
+
+    This is useful for determining if a DOI is valid, even if it's different
+    from what a verification source has stored (e.g., arXiv DOI vs conference DOI).
+
+    Args:
+        doi: DOI string to validate
+        timeout: Request timeout in seconds
+
+    Returns:
+        True if DOI resolves (returns 302/301/200), False otherwise
+    """
+    if not doi or not is_valid_doi_format(normalize_doi(doi)):
+        return False
+
+    try:
+        import requests
+        url = construct_doi_url(doi)
+        # Use HEAD request first (faster), fall back to GET if needed
+        response = requests.head(url, allow_redirects=False, timeout=timeout)
+        # DOI.org returns 302 for valid DOIs that redirect to the paper
+        # Some may return 301 (permanent redirect) or 200 (direct response)
+        return response.status_code in (200, 301, 302, 303, 307, 308)
+    except Exception:
+        # On any error (timeout, connection error, etc.), assume DOI might be valid
+        # to avoid false negatives due to network issues
+        return True
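Assuming the import path above, usage looks like this; note the deliberate fail-open behavior on network errors, which the diff's comments call out (any exception returns True):

```python
from refchecker.utils.doi_utils import validate_doi_resolves

# A well-known registered DOI should yield a doi.org redirect (301/302)
print(validate_doi_resolves("10.1103/PhysRev.47.777"))  # expected: True

# Strings that fail the DOI format check are rejected without any request
print(validate_doi_resolves("not-a-doi"))  # False

# A timeout or connection error also returns True (fail open), so transient
# network problems never escalate a warning into an error
```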
refchecker/utils/error_utils.py
CHANGED

@@ -126,28 +126,39 @@ def create_year_warning(cited_year: int, correct_year: int) -> Dict[str, Any]:
 
 def create_doi_error(cited_doi: str, correct_doi: str) -> Optional[Dict[str, str]]:
     """
-    Create a standardized DOI error dictionary.
+    Create a standardized DOI error or warning dictionary.
+
+    If the cited DOI resolves (is valid), this returns a warning instead of an error,
+    since papers can have multiple valid DOIs (e.g., arXiv DOI vs conference DOI).
 
     Args:
         cited_doi: DOI as cited in the reference
         correct_doi: Correct DOI from database
 
     Returns:
-        Standardized error dictionary if DOIs differ, None if they match after cleaning
+        Standardized error/warning dictionary if DOIs differ, None if they match after cleaning
     """
-
-
-
+    from refchecker.utils.doi_utils import validate_doi_resolves, compare_dois
+
+    # Use compare_dois which handles normalization (case, prefixes, trailing punctuation)
+    if compare_dois(cited_doi, correct_doi):
+        return None
 
-    #
-
+    # DOIs are different - determine if this should be error or warning
+    # If cited DOI resolves, it's likely a valid alternate DOI
+    # Treat as warning instead of error
+    if validate_doi_resolves(cited_doi):
+        return {
+            'warning_type': 'doi',
+            'warning_details': format_doi_mismatch(cited_doi, correct_doi),
+            'ref_doi_correct': correct_doi
+        }
+    else:
        return {
            'error_type': 'doi',
            'error_details': format_doi_mismatch(cited_doi, correct_doi),
            'ref_doi_correct': correct_doi
        }
-
-    return None
 
 
 def create_title_error(error_details: str, correct_title: str) -> Dict[str, str]:
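With this change a caller of `create_doi_error` can receive three shapes: `None` for a match, a warning dict when the cited DOI resolves, or an error dict. A usage sketch (example DOIs are illustrative, and the prefix normalization is per the docstring above):

```python
from refchecker.utils.error_utils import create_doi_error

# Same DOI after normalization -> None (compare_dois strips prefixes)
print(create_doi_error("https://doi.org/10.1145/1234567", "10.1145/1234567"))

# Differing DOIs -> {'warning_type': 'doi', ...} if the cited one resolves,
# otherwise {'error_type': 'doi', ...}
result = create_doi_error("10.48550/arXiv.2106.00001", "10.1145/1234567")
print('warning_type' in result or 'error_type' in result)  # True
```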
refchecker/utils/text_utils.py
CHANGED

@@ -676,22 +676,6 @@ def clean_title(title):
     return title
 
 
-def extract_arxiv_id_from_url(url):
-    """
-    Extract ArXiv ID from URL or text containing ArXiv reference.
-
-    This function is deprecated. Use utils.url_utils.extract_arxiv_id_from_url instead.
-    Kept for backwards compatibility.
-
-    Args:
-        url: URL string or text containing arXiv reference
-
-    Returns:
-        ArXiv ID or None if not found
-    """
-    from refchecker.utils.url_utils import extract_arxiv_id_from_url as common_extract
-    return common_extract(url)
-
 def extract_year_from_text(text):
     """
     Extract a 4-digit year from text
@@ -808,6 +792,9 @@ def normalize_paper_title(title: str) -> str:
     # Strip LaTeX commands first to handle math formatting consistently
     normalized = strip_latex_commands(title)
 
+    # Normalize diacritics (ü -> u, é -> e, etc.) for consistent comparison
+    normalized = normalize_diacritics(normalized)
+
     # Convert to lowercase
     normalized = normalized.lower()
 
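`normalize_diacritics` itself is not shown in this diff; a typical implementation (an assumption, not the package's verbatim code) decomposes to NFKD and drops the combining marks:

```python
import unicodedata

def normalize_diacritics(text):
    # Decompose accented characters, then drop the combining marks:
    # 'Müller' -> 'Muller', 'Éléonore' -> 'Eleonore'
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

print(normalize_diacritics("Müller, Éléonore"))  # Muller, Eleonore
```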
@@ -2343,6 +2330,17 @@ def detect_latex_bibliography_format(text):
             'details': details
         }
 
+    # Check for standalone \bibitem entries (common in .bbl files without full environment wrapper)
+    # This handles cases where the \begin{thebibliography} wrapper is missing
+    bibitem_matches = re.findall(r'\\bibitem(?:\[[^\]]*\])?\{[^}]+\}', text)
+    if bibitem_matches:
+        details['bibitem_count'] = len(bibitem_matches)
+        return {
+            'is_latex': True,
+            'format_type': 'thebibliography',
+            'details': details
+        }
+
     # Check for \bibliography{} command
     bibcommand_pattern = r'\\bibliography\{([^}]+)\}'
     bibcommand_match = re.search(bibcommand_pattern, text, re.IGNORECASE)
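The detection branch can be exercised in isolation; the pattern below is copied from the diff, the sample text is illustrative:

```python
import re

# A bare .bbl body with no \begin{thebibliography} wrapper (illustrative)
text = r"\bibitem[Doe(2021)]{doe2021} J. Doe. \newblock Some Title."

# Same pattern as the new detection branch
bibitem_matches = re.findall(r'\\bibitem(?:\[[^\]]*\])?\{[^}]+\}', text)
print(len(bibitem_matches))  # 1 -> classified as 'thebibliography' format
```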
@@ -3125,7 +3123,8 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-argument
     # Parse \bibitem entries (improved for .bbl files with ACM-Reference-Format)
     # Handle both simple \bibitem{key} and complex \bibitem[label]{key} formats
     # Also handle line continuation with % and various spacing patterns
-    bibitem_pattern = r'\\bibitem(?:\[([^\]]*)\])?\s*%?\s*\n?\s*\{([^}]+)\}\s*(.*?)(?=\\bibitem|\\end\{thebibliography\})'
+    # Updated to also match end-of-string ($) for standalone bibitem entries
+    bibitem_pattern = r'\\bibitem(?:\[([^\]]*)\])?\s*%?\s*\n?\s*\{([^}]+)\}\s*(.*?)(?=\\bibitem|\\end\{thebibliography\}|$)'
 
     matches = re.finditer(bibitem_pattern, text, re.DOTALL | re.IGNORECASE)
 
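The only change to the pattern is the trailing `|$` alternative in the lookahead, which lets the final entry terminate at end-of-string when no `\end{thebibliography}` follows. A quick check (sample entries are illustrative):

```python
import re

bibitem_pattern = (r'\\bibitem(?:\[([^\]]*)\])?\s*%?\s*\n?\s*\{([^}]+)\}'
                   r'\s*(.*?)(?=\\bibitem|\\end\{thebibliography\}|$)')

# Standalone entries with no \end{thebibliography}
text = r"\bibitem{key1} First entry. \bibitem{key2} Second entry."
for m in re.finditer(bibitem_pattern, text, re.DOTALL | re.IGNORECASE):
    print(m.group(2), '->', m.group(3).strip())
# key1 -> First entry.
# key2 -> Second entry.
```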
@@ -3196,10 +3195,21 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-argument
             if label_year_match:
                 ref['year'] = int(label_year_match.group(1))
             else:
-                # Try to extract from content
-
-
-
+                # Try to extract from content - be careful to avoid ArXiv IDs like 1907.10641
+                # Look for year at end of content or after a comma (typical citation format)
+                # Pattern: standalone year after comma/space, not followed by a dot and more digits (ArXiv ID)
+                year_patterns = [
+                    r',\s*((?:19|20)\d{2})\s*\.$',   # Year at end after comma: ", 2019."
+                    r',\s*((?:19|20)\d{2})\s*$',     # Year at end after comma: ", 2019"
+                    r'\s+((?:19|20)\d{2})\s*\.$',    # Year at end after space: " 2019."
+                    r'\s+((?:19|20)\d{2})\s*$',      # Year at end after space: " 2019"
+                    r'\b((?:19|20)\d{2})(?!\.\d)',   # Year not followed by decimal (avoid ArXiv IDs)
+                ]
+                for pattern in year_patterns:
+                    content_year_match = re.search(pattern, content)
+                    if content_year_match:
+                        ref['year'] = int(content_year_match.group(1))
+                        break
 
             # Parse natbib format: usually has author line, then \newblock title, then \newblock venue
             parts = re.split(r'\\newblock', content, flags=re.IGNORECASE)
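The ordered patterns, tried first-to-last, prefer a trailing year and never match the leading digits of an ArXiv ID. A runnable check (the sample reference string is illustrative):

```python
import re

# Patterns copied from the diff, tried in order
year_patterns = [
    r',\s*((?:19|20)\d{2})\s*\.$',
    r',\s*((?:19|20)\d{2})\s*$',
    r'\s+((?:19|20)\d{2})\s*\.$',
    r'\s+((?:19|20)\d{2})\s*$',
    r'\b((?:19|20)\d{2})(?!\.\d)',
]

def extract_year(content):
    for pattern in year_patterns:
        m = re.search(pattern, content)
        if m:
            return int(m.group(1))
    return None

# The (?!\.\d) lookahead keeps 1907 of the ArXiv ID from matching,
# and the trailing-year patterns pick up 2019 first anyway
print(extract_year("WinoGrande. arXiv preprint arXiv:1907.10641, 2019."))  # 2019
```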
@@ -3300,7 +3310,80 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-argument
         # Second part is usually title
         if len(parts) >= 2:
             title_part = parts[1].strip()
-            title_clean = strip_latex_commands(title_part).strip()
+
+            # Handle \href{URL}{text} or \href {URL} {text} format
+            # Extract URL before stripping LaTeX commands
+            # We need to use balanced brace matching because titles can contain
+            # nested braces like {LLM} for capitalization protection
+            href_url = None
+            title_text = None
+
+            href_start = title_part.find('\\href')
+            if href_start != -1:
+                # Find first opening brace (URL)
+                pos = href_start + 5  # Skip \href
+                while pos < len(title_part) and title_part[pos] in ' \t\n':
+                    pos += 1
+
+                if pos < len(title_part) and title_part[pos] == '{':
+                    # Extract URL using balanced braces
+                    brace_count = 0
+                    url_start = pos + 1
+                    url_end = pos
+                    for i in range(pos, len(title_part)):
+                        if title_part[i] == '{':
+                            brace_count += 1
+                        elif title_part[i] == '}':
+                            brace_count -= 1
+                            if brace_count == 0:
+                                url_end = i
+                                break
+
+                    if url_end > url_start:
+                        href_url = title_part[url_start:url_end].strip()
+
+                        # Now find the second brace group (title text)
+                        pos = url_end + 1
+                        while pos < len(title_part) and title_part[pos] in ' \t\n':
+                            pos += 1
+
+                        if pos < len(title_part) and title_part[pos] == '{':
+                            # Extract title text using balanced braces
+                            brace_count = 0
+                            text_start = pos + 1
+                            text_end = pos
+                            for i in range(pos, len(title_part)):
+                                if title_part[i] == '{':
+                                    brace_count += 1
+                                elif title_part[i] == '}':
+                                    brace_count -= 1
+                                    if brace_count == 0:
+                                        text_end = i
+                                        break
+
+                            if text_end > text_start:
+                                title_text = title_part[text_start:text_end].strip()
+
+            if href_url and title_text:
+
+                # Extract DOI if it's a doi.org URL
+                if 'doi.org/' in href_url and not ref.get('doi'):
+                    doi_match = re.search(r'doi\.org/(.+)$', href_url)
+                    if doi_match:
+                        ref['doi'] = doi_match.group(1)
+                    ref['url'] = href_url
+                # Extract arXiv ID if it's an arxiv URL
+                elif 'arxiv.org/' in href_url.lower() and not ref.get('url'):
+                    ref['url'] = href_url
+                # Generic URL
+                elif not ref.get('url'):
+                    ref['url'] = href_url
+
+                # Use the title text (second part of href), not the URL
+                title_clean = strip_latex_commands(title_text).strip()
+            else:
+                title_clean = strip_latex_commands(title_part).strip()
+
             # Remove trailing dots and clean up
             title_clean = title_clean.rstrip('.')
             if title_clean and len(title_clean) > 5:  # Reasonable title length
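The reason for manual brace counting: a regex with `[^}]+` truncates titles that contain protective braces. A condensed sketch of the same balanced-brace approach (the helper name `balanced_group` and the sample line are hypothetical, not from the diff):

```python
import re

title_part = r'\href {https://doi.org/10.1145/1234567} {Scaling {LLM} Inference}'

# Naive regex: [^}]+ stops at the first closing brace inside the title
naive = re.search(r'\\href\s*\{([^}]+)\}\s*\{([^}]+)\}', title_part)
print(naive.group(2))  # 'Scaling {LLM' -- truncated at the inner brace

def balanced_group(s, start):
    """Return (contents, index of closing brace) for the group at s[start] == '{'."""
    depth = 0
    for i in range(start, len(s)):
        if s[i] == '{':
            depth += 1
        elif s[i] == '}':
            depth -= 1
            if depth == 0:
                return s[start + 1:i], i
    raise ValueError("unbalanced braces")

# Assumes an \href is present, as in the diff's guarded code path
pos = title_part.find('\\href') + len('\\href')
while title_part[pos].isspace():
    pos += 1
url, end = balanced_group(title_part, pos)
pos = end + 1
while title_part[pos].isspace():
    pos += 1
title, _ = balanced_group(title_part, pos)
print(url)    # https://doi.org/10.1145/1234567
print(title)  # Scaling {LLM} Inference
```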
@@ -3310,9 +3393,13 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-argument
         if len(parts) >= 3:
             venue_part = parts[2].strip()
             venue_clean = strip_latex_commands(venue_part).strip()
-            # Remove
+            # Remove "In " prefix if present (common in bbl format)
+            venue_clean = re.sub(r'^In\s+', '', venue_clean)
+            # Remove trailing year only (at end of string), not year in the middle of venue name
+            # e.g., "2020 Conference on..." should keep the conference name
             if ref['year']:
-
+                # Only remove year if it appears at the very end (possibly with punctuation)
+                venue_clean = re.sub(rf',?\s*{ref["year"]}\s*\.?\s*$', '', venue_clean)
             venue_clean = venue_clean.rstrip(',. ')
             # Filter out common non-venue patterns that shouldn't be treated as venues
             non_venue_patterns = ['URL', 'url', 'http:', 'https:', 'DOI', 'doi:', 'ArXiv', 'arxiv:']
@@ -3387,11 +3474,24 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-argument
                 from refchecker.utils.url_utils import clean_url_punctuation
                 ref['url'] = clean_url_punctuation(url_match.group(1))
 
-            # Extract DOI from \href{https://doi.org/...}
+            # Extract DOI from \href{https://doi.org/...} or \href {URL} {text} with spaces
             if not ref.get('doi'):
-
+                # Handle both \href{URL}{text} and \href {URL} {text} formats
+                doi_match = re.search(r'\\href\s*\{(https?://doi\.org/[^}]+)\}', content)
                 if doi_match:
-
+                    doi_url = doi_match.group(1)
+                    # Extract DOI from the URL
+                    doi_id_match = re.search(r'doi\.org/(.+)$', doi_url)
+                    if doi_id_match:
+                        ref['doi'] = doi_id_match.group(1)
+                        if not ref.get('url'):
+                            ref['url'] = doi_url
+
+            # Extract URL from \href{URL}{text} if not already set (for non-DOI URLs like arXiv)
+            if not ref.get('url'):
+                href_url_match = re.search(r'\\href\s*\{([^}]+)\}\s*\{[^}]*\}', content)
+                if href_url_match:
+                    ref['url'] = href_url_match.group(1).strip()
 
             # Extract arXiv ID from \showeprint[arxiv]{...} (ACM format) or from content (natbib format)
             arxiv_match = re.search(r'\\showeprint\[arxiv\]\{([^}]+)\}', content)
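The relaxed `\\href\s*\{` pattern now tolerates the space some .bbl styles emit between `\href` and its arguments. A quick demonstration (sample line and DOI are illustrative):

```python
import re

# Illustrative .bbl line; note the spaces after \href
content = r'\newblock \href {https://doi.org/10.1234/example.5678} {An Example Paper}.'

doi_match = re.search(r'\\href\s*\{(https?://doi\.org/[^}]+)\}', content)
if doi_match:
    doi_url = doi_match.group(1)
    doi_id = re.search(r'doi\.org/(.+)$', doi_url).group(1)
    print(doi_id)  # 10.1234/example.5678
```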
@@ -4020,6 +4120,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
     Returns:
         True if venues are substantially different, False if they match/overlap
     """
+    # Import here to avoid circular dependency
+    from refchecker.utils.url_utils import extract_arxiv_id_from_url
+
     if not venue1 or not venue2:
         return bool(venue1 != venue2)
 
@@ -4088,6 +4191,19 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
 
     venue_lower = expand_abbreviations(venue_lower)
 
+    # Strip page numbers (e.g., "pages 38--55", "pp. 123-456", "page 42")
+    venue_lower = re.sub(r',?\s*pages?\s*\d+\s*[-–—]+\s*\d+', '', venue_lower)
+    venue_lower = re.sub(r',?\s*pp\.?\s*\d+\s*[-–—]+\s*\d+', '', venue_lower)
+    venue_lower = re.sub(r',?\s*pages?\s*\d+', '', venue_lower)
+    venue_lower = re.sub(r',?\s*pp\.?\s*\d+', '', venue_lower)
+
+    # Strip publisher names that are commonly appended
+    publishers = ['springer', 'elsevier', 'wiley', 'acm', 'ieee', 'mit press',
+                  'cambridge university press', 'oxford university press',
+                  'morgan kaufmann', 'addison-wesley', 'prentice hall']
+    for publisher in publishers:
+        venue_lower = re.sub(rf',?\s*{re.escape(publisher)}\s*$', '', venue_lower, flags=re.IGNORECASE)
+
     # Remove punctuation and normalize spacing for comparison
     venue_lower = re.sub(r'[.,;:]', '', venue_lower)  # Remove punctuation
     venue_lower = re.sub(r'\s+on\s+', ' ', venue_lower)  # Remove "on" preposition
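The effect of the new stripping passes, shown on an illustrative venue string (only the two relevant substitutions are applied here):

```python
import re

venue_lower = "proceedings of machine learning and systems, pages 38--55, 2020. springer"

# Strip a "pages N--M" range, then a trailing publisher, as in the new code
venue_lower = re.sub(r',?\s*pages?\s*\d+\s*[-–—]+\s*\d+', '', venue_lower)
venue_lower = re.sub(r',?\s*springer\s*$', '', venue_lower)
print(venue_lower)  # proceedings of machine learning and systems, 2020.
```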
{academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/WHEEL
RENAMED
File without changes

{academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/licenses/LICENSE
RENAMED
File without changes