PyPI - academic-refchecker - Versions diffs - 1.2.38__tar.gz → 1.2.40__tar.gz - Mend

academic-refchecker 1.2.38 (tar.gz) → 1.2.40 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

{academic_refchecker-1.2.38/src/academic_refchecker.egg-info → academic_refchecker-1.2.40}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.38
+Version: 1.2.40
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT

{academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/__version__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """Version information for RefChecker."""
-__version__ = "1.2.38"
+__version__ = "1.2.40"

{academic_refchecker-1.2.38 → academic_refchecker-1.2.40/src/academic_refchecker.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.38
+Version: 1.2.40
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT

{academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/core/refchecker.py RENAMED Viewed

@@ -3386,7 +3386,25 @@ class ArxivReferenceChecker:
             logger.info("Detected biblatex format, using biblatex parser")
             self.used_regex_extraction = True
             # Note: biblatex parsing is also robust, so we don't set used_unreliable_extraction
-            return self._parse_biblatex_references(bibliography_text)
+            biblatex_refs = self._parse_biblatex_references(bibliography_text)
+            # If biblatex parsing returned empty results (due to quality validation),
+            # fallback to LLM if available
+            if not biblatex_refs and self.llm_extractor:
+                logger.debug("Biblatex parser returned no results due to quality validation, trying LLM fallback")
+                try:
+                    references = self.llm_extractor.extract_references(bibliography_text)
+                    if references:
+                        logger.debug(f"LLM fallback extracted {len(references)} references")
+                        return self._process_llm_extracted_references(references)
+                    else:
+                        logger.warning("LLM fallback also returned no results")
+                        return []
+                except Exception as e:
+                    logger.error(f"LLM fallback failed: {e}")
+                    return []
+            return biblatex_refs
         # For non-standard formats, try LLM-based extraction if available
         if self.llm_extractor:
@@ -3610,7 +3628,14 @@ class ArxivReferenceChecker:
         if detect_biblatex_format(bibliography_text):
             logger.debug("Detected biblatex format, using biblatex-specific parsing")
             # biblatex parsing is also robust, so we don't set used_unreliable_extraction
-            return self._parse_biblatex_references(bibliography_text)
+            biblatex_refs = self._parse_biblatex_references(bibliography_text)
+            # If biblatex parsing returned empty results (due to quality validation),
+            # we'll continue with the unreliable fallback regex parsing
+            if not biblatex_refs:
+                logger.debug("Biblatex parser returned no results due to quality validation, falling back to regex parsing")
+            else:
+                return biblatex_refs
         # If we reach here, we're using the unreliable fallback regex parsing
         self.used_unreliable_extraction = True

{academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/biblatex_parser.py RENAMED Viewed

@@ -138,6 +138,57 @@ def detect_biblatex_format(text: str) -> bool:
     return has_biblatex_marker or has_numbered_refs
+def _validate_parsing_quality(references: List[Dict[str, Any]]) -> bool:
+    """
+    Validate that biblatex parsing results are of acceptable quality.
+    If quality is poor, we should fallback to LLM parsing instead.
+    Args:
+        references: List of parsed reference dictionaries
+    Returns:
+        True if parsing quality is acceptable, False if should fallback to LLM
+    """
+    if not references:
+        return False
+    # Count problematic entries
+    unknown_authors = 0
+    unknown_titles = 0
+    total_entries = len(references)
+    for ref in references:
+        authors = ref.get('authors', [])
+        title = ref.get('title', '')
+        # Check for "Unknown Author" entries
+        if not authors or authors == ['Unknown Author']:
+            unknown_authors += 1
+        # Check for "Unknown Title" entries
+        if not title or title == 'Unknown Title':
+            unknown_titles += 1
+    # Calculate failure rates
+    author_failure_rate = unknown_authors / total_entries
+    title_failure_rate = unknown_titles / total_entries
+    # Quality thresholds - if more than 20% of entries have parsing failures,
+    # fallback to LLM which is more robust
+    MAX_ACCEPTABLE_FAILURE_RATE = 0.2
+    if author_failure_rate > MAX_ACCEPTABLE_FAILURE_RATE:
+        logger.debug(f"Biblatex parsing quality poor: {author_failure_rate:.1%} unknown authors (>{MAX_ACCEPTABLE_FAILURE_RATE:.0%}). Falling back to LLM.")
+        return False
+    if title_failure_rate > MAX_ACCEPTABLE_FAILURE_RATE:
+        logger.debug(f"Biblatex parsing quality poor: {title_failure_rate:.1%} unknown titles (>{MAX_ACCEPTABLE_FAILURE_RATE:.0%}). Falling back to LLM.")
+        return False
+    logger.debug(f"Biblatex parsing quality acceptable: {author_failure_rate:.1%} unknown authors, {title_failure_rate:.1%} unknown titles")
+    return True
 def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
     """
     Parse biblatex formatted references into structured format
@@ -146,7 +197,8 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
         text: String containing biblatex .bbl entries
     Returns:
-        List of structured reference dictionaries
+        List of structured reference dictionaries, or empty list if
+        parsing quality is poor (to trigger LLM fallback)
     """
     from utils.text_utils import parse_authors_with_initials, clean_title
     from utils.doi_utils import construct_doi_url, is_valid_doi_format
@@ -171,7 +223,7 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
         # Find the content between this entry and the next (or end of text)
         if i + 1 < len(entry_starts):
             next_start = entry_starts[i + 1][1]
-            content = text[end:next_start].strip()
+            raw_content = text[end:next_start].strip()
         else:
             # Last entry - take everything to end, but be smart about stopping
             remaining = text[end:].strip()
@@ -190,9 +242,20 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
                 if match and match.start() < min_stop:
                     min_stop = match.start()
-            content = remaining[:min_stop].strip()
+            raw_content = remaining[:min_stop].strip()
-        if content:
+        # Clean up content - handle cases where entry might be incomplete or malformed
+        if raw_content:
+            # Remove stray closing brackets or incomplete markers
+            content = raw_content
+            # Remove trailing "]" if it's the only thing on the last line
+            lines = content.split('\n')
+            if len(lines) > 1 and lines[-1].strip() == ']':
+                content = '\n'.join(lines[:-1]).strip()
+            elif content.strip() == ']':
+                # If content is only "], skip this entry as it's incomplete
+                continue
             matches.append((entry_num, content))
     for entry_num, content in matches:
@@ -218,6 +281,11 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
             references.append(parsed_ref)
     logger.debug(f"Extracted {len(references)} biblatex references")
+    # Validate parsing quality - if poor, return empty list to trigger LLM fallback
+    if not _validate_parsing_quality(references):
+        return []
     return references
@@ -261,11 +329,15 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
     else:
         # If no quoted title, look for title after author names
         # Pattern: "FirstAuthor et al. Title Goes Here. Year." or "Author. Title. Year."
+        # Order matters: more specific patterns first
         title_patterns = [
-            r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}',  # "Author et al. Title. Year"
-            r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})',  # "Authors. Title. URL/arXiv/Year" (flexible spacing)
-            r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}',  # "Name, Name. Title. Year"
+            # Pattern for unquoted books: "Author1 and Author2, Title: Subtitle. Location: Publisher, Year."
+            r'(?:and\s+[A-Z][^,]*),\s+([A-Z][^.]*?:\s*[^.]*?)\.\s+[A-Z][^:]*:\s*[^,]*,\s*\d{4}',
+            r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})',  # "Authors. Title. URL/arXiv/Year" (flexible spacing) - MOST SPECIFIC
             r'\.([A-Z][A-Za-z\s]+(?:\?|!)?)\.?\s+\d{4}',  # ".Title. Year" - for cases where authors end without space
+            r'[A-Z][a-z]+\.([A-Z][A-Za-z\s\-&]+?)\.\s+\d{4}',  # "Name.Title. Year" - missing space after period
+            r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}',  # "Author et al. Title. Year" - LESS SPECIFIC
+            r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}',  # "Name, Name. Title. Year"
             r'\b([A-Z][A-Za-z\s\-0-9]+)\s+\.\s+https',  # "Title . https" - handle space before period
         ]
@@ -274,7 +346,14 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
             if title_match:
                 potential_title = title_match.group(1)
                 # Make sure it looks like a title and not author names
-                if len(potential_title) > 10 and not re.match(r'^[A-Z][a-z]+,\s*[A-Z]', potential_title):
+                # Be more specific about author name patterns - should be "Surname, Initial" not "Word, Word"
+                author_like_pattern = r'^[A-Z][a-z]+,\s*[A-Z]\.?$'  # "Smith, J." or "Smith, J"
+                multi_word_author = r'^[A-Z][a-z]+,\s*[A-Z][a-z]+$'  # "Smith, John" - but still reject this
+                is_author_like = (re.match(author_like_pattern, potential_title) or
+                                re.match(multi_word_author, potential_title))
+                if len(potential_title) > 2 and not is_author_like:
                     title = clean_title(potential_title)
                     break
@@ -328,16 +407,25 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
     # Examples we need to handle:
     # "Egor Zverev, Sahar Abdelnabi, Mario Fritz, and Christoph H Lampert. \"Title\". In: venue (year)."
     # "Andrej Karpathy. Intro to Large Language Models. https://... year."
+    # "A. Author and B. Coauthor, \"Title\","  <- handle this format
     # Try multiple patterns to extract authors
+    # Order matters - more specific patterns first!
     author_patterns = [
         # Pattern 1: Authors followed by quoted title (handle both regular and smart quotes)
+        r'^([^"\u201c\u201d]+?),\s*["\u201c\u201d]',  # "Authors, \"Title\"" - more restrictive, requires comma before quote
         r'^([^"\u201c\u201d]+)\.\s*["\u201c\u201d]',  # "Authors. \"Title\"" or smart quotes
-        # Pattern 2: Authors followed by title, then period, then year or venue
+        # Pattern 2: Authors followed by unquoted title for books: "Author1 and Author2, Title:"
+        r'^([^,]+(?:\s+and\s+[^,]+)?),\s+([A-Z][^.]*?):\s*([^.]*?)\.',  # "Author1 and Author2, Title: Subtitle." - book format
+        # Pattern 3: Authors ending with period, no space, then title (missing space case) - MORE SPECIFIC
+        r'^([^.]+?)\.([A-Z][^.]*)\.',  # "Authors.Title." - missing space after period
+        # Pattern 4: Authors followed by title, then period, then year or venue (with extracted title)
         r'^(.+?)\.\s*([A-Z][^.]+)\.\s+(?:In:|https?://|\d{4})',  # "Authors. Title. In:/URL/Year" (allow no space after period)
-        # Pattern 3: Authors ending with period followed by capital letter (simpler fallback)
+        # Pattern 5: Authors ending with period followed by capital letter (simpler fallback) - LEAST SPECIFIC
         r'^([^.]+?)\.\s*[A-Z]',  # Allow no space after period
     ]
@@ -347,9 +435,17 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
             potential_authors = author_match.group(1).strip()
             # For patterns that also capture title, extract it
-            if i == 1 and not title and len(author_match.groups()) > 1:
+            if i == 2 and not title and len(author_match.groups()) > 2:
+                # Pattern 2 (book format) captures authors, title, and subtitle
+                title_part = author_match.group(2).strip()
+                subtitle_part = author_match.group(3).strip()
+                combined_title = f"{title_part}: {subtitle_part}" if subtitle_part else title_part
+                if len(combined_title) > 2:
+                    title = clean_title(combined_title)
+            elif (i == 3 or i == 4) and not title and len(author_match.groups()) > 1:
+                # Pattern 3 (missing space, index 3) and Pattern 4 (with space, index 4) capture both authors and title
                 potential_title = author_match.group(2).strip()
-                if len(potential_title) > 5 and not re.match(r'^[A-Z][a-z]+,', potential_title):
+                if len(potential_title) > 2 and not re.match(r'^[A-Z][a-z]+,', potential_title):
                     title = clean_title(potential_title)
             # Validate that this looks like authors
@@ -429,8 +525,11 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
                             authors.append(part)
     # 7. Extract journal/venue - look for patterns like "In: Conference" or remaining text
+    # Also handle cases like "Tasks,"Adv. Neural" where there's missing space after quote-comma
     journal_patterns = [
         r'In:\s*([^.]+?)(?:\.|$)',  # "In: Conference Name"
+        r'"[^"]*,"([A-Z][^,]*?\. [A-Z][^,]*)',  # Quote-comma-venue like "Tasks,"Adv. Neural Inf. Process. Syst."
+        r'["\u201c\u201d]([A-Z][^.]*(?:Adv\.|Proc\.|IEEE|Journal)[^.]*)',  # Missing space after quote like "Tasks"Adv. Neural"
         r'([A-Z][^.]*(?:Conference|Workshop|Journal|Proceedings)[^.]*)',  # Conference/journal names
     ]

{academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/text_utils.py RENAMED Viewed

@@ -11,6 +11,31 @@ from typing import List
 logger = logging.getLogger(__name__)
+def normalize_apostrophes(text):
+    """
+    Normalize all apostrophe variants to standard ASCII apostrophe
+    """
+    if not text:
+        return text
+    # All known apostrophe variants
+    apostrophe_variants = [
+        "'",      # U+0027 ASCII apostrophe
+        "’",      # U+2019 Right single quotation mark (most common)
+        "‘",      # U+2018 Left single quotation mark
+        "ʼ",      # U+02BC Modifier letter apostrophe
+        "ˈ",      # U+02C8 Modifier letter vertical line (primary stress)
+        "`",      # U+0060 Grave accent (sometimes used as apostrophe)
+        "´",      # U+00B4 Acute accent (sometimes used as apostrophe)
+    ]
+    # Replace all variants with standard ASCII apostrophe
+    for variant in apostrophe_variants:
+        text = text.replace(variant, "'")
+    return text
 def normalize_text(text):
     """
     Normalize text by removing diacritical marks and special characters
@@ -18,6 +43,9 @@ def normalize_text(text):
     if not text:
         return ""
+    # First normalize apostrophes to standard form
+    text = normalize_apostrophes(text)
     # Replace common special characters with their ASCII equivalents
     replacements = {
         'ä': 'a', 'ö': 'o', 'ü': 'u', 'ß': 'ss',
@@ -29,7 +57,7 @@ def normalize_text(text):
         'Ł': 'L', 'ł': 'l',
         '¨': '', '´': '', '`': '', '^': '', '~': '',
         '–': '-', '—': '-', '−': '-',
-        '„': '"', '“': '"', '”': '"', '‘': "'", '’': "'",
+        '„': '"', '“': '"', '”': '"',
         '«': '"', '»': '"',
         '¡': '!', '¿': '?',
         '°': 'degrees', '©': '(c)', '®': '(r)', '™': '(tm)',
@@ -39,10 +67,6 @@ def normalize_text(text):
         '\u00A0': ' ',  # Non-breaking space
         '\u2013': '-',  # En dash
         '\u2014': '-',  # Em dash
-        '\u2018': "'",  # Left single quotation mark
-        '\u2019': "'",  # Right single quotation mark
-        '\u201C': '"',  # Left double quotation mark
-        '\u201D': '"',  # Right double quotation mark
         '\u2026': '...',  # Horizontal ellipsis
         '\u00B7': '.',  # Middle dot
         '\u2022': '.',  # Bullet
@@ -54,8 +78,8 @@ def normalize_text(text):
     # Remove any remaining diacritical marks
     text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
-    # Remove special characters
-    text = re.sub(r'[^\w\s]', '', text)
+    # Remove special characters except apostrophes
+    text = re.sub(r"[^\w\s']", '', text)
     # Normalize whitespace
     text = re.sub(r'\s+', ' ', text).strip()
@@ -368,6 +392,9 @@ def clean_author_name(author):
     # Normalize Unicode characters (e.g., combining diacritics)
     author = unicodedata.normalize('NFKC', author)
+    # Normalize apostrophes first before other processing
+    author = normalize_apostrophes(author)
     # Handle common Unicode escape sequences and LaTeX encodings
     # Note: Order matters - process longer patterns first
     unicode_replacements = [
@@ -703,8 +730,12 @@ def normalize_diacritics(text: str) -> str:
         'José' -> 'jose'
         'Łukasz' -> 'lukasz'
         'J. Gl¨ uck' -> 'J. Gluck'
+        'D’Amato' -> 'D'Amato' (apostrophes normalized)
     """
-    # First handle special characters that don't decompose properly
+    # First normalize apostrophes
+    text = normalize_apostrophes(text)
+    # Then handle special characters that don't decompose properly
     # Including common transliterations
     special_chars = {
         'ł': 'l', 'Ł': 'L',
@@ -2224,7 +2255,8 @@ def format_author_for_display(author_name):
     if not author_name:
         return author_name
-    author_name = author_name.strip()
+    # Normalize apostrophes for consistent display
+    author_name = normalize_apostrophes(author_name.strip())
     # Check if it's in "Lastname, Firstname" format
     if ',' in author_name:
@@ -3743,6 +3775,7 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
         for abbrev in sorted(multi_word_abbrevs.keys(), key=len, reverse=True):
             if abbrev in expanded_text:
                 expanded_text = expanded_text.replace(abbrev, multi_word_abbrevs[abbrev])
+                break  # Only apply the first (longest) matching abbreviation to avoid conflicts
         # Second pass: handle single word abbreviations
         words = expanded_text.split()
@@ -4137,8 +4170,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
         return False
     # Order-aware fuzzy matching - words should match in sequence
-    words1_list = list(words1)
-    words2_list = list(words2)
+    # Sort to ensure deterministic order (set iteration is not guaranteed to be consistent)
+    words1_list = sorted(list(words1))
+    words2_list = sorted(list(words2))
     # If word counts are very different, they're likely different venues
     if len(words1) > 0 and len(words2) > 0: