PyPI - academic-refchecker - Versions diffs - 1.2.49__tar.gz → 1.2.51__tar.gz - Mend

academic-refchecker 1.2.49 (tar.gz) → 1.2.51 (tar.gz)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

{academic_refchecker-1.2.49/src/academic_refchecker.egg-info → academic_refchecker-1.2.51}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.49
+Version: 1.2.51
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT

{academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/__version__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """Version information for RefChecker."""
-__version__ = "1.2.49"
+__version__ = "1.2.51"

{academic_refchecker-1.2.49 → academic_refchecker-1.2.51/src/academic_refchecker.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.49
+Version: 1.2.51
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT

{academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/github_checker.py RENAMED Viewed

@@ -5,6 +5,7 @@ import re
 import logging
 from urllib.parse import urlparse
 from typing import Dict, Optional, Tuple, List, Any
+from utils.text_utils import strip_latex_commands
 logger = logging.getLogger(__name__)
@@ -170,7 +171,9 @@ class GitHubChecker:
                 title_match = self._check_title_match(cited_title, actual_name, actual_description)
                 if not title_match:
                     from utils.error_utils import format_title_mismatch
-                    details = format_title_mismatch(cited_title, actual_name)
+                    # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
+                    clean_cited_title = strip_latex_commands(cited_title)
+                    details = format_title_mismatch(clean_cited_title, actual_name)
                     if actual_description:
                         snippet = actual_description[:100] + ('...' if len(actual_description) > 100 else '')
                         details += f" ({snippet})"

{academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/openreview_checker.py RENAMED Viewed

@@ -36,7 +36,8 @@ from utils.text_utils import (
     normalize_text, clean_title_basic, is_name_match,
     calculate_title_similarity, compare_authors,
     clean_title_for_search, are_venues_substantially_different,
-    is_year_substantially_different
+    is_year_substantially_different, strip_latex_commands,
+    compare_titles_with_latex_cleaning
 )
 # Set up logging
@@ -423,10 +424,12 @@ class OpenReviewReferenceChecker:
         paper_title = paper_data.get('title', '').strip()
         if cited_title and paper_title:
-            similarity = calculate_title_similarity(cited_title, paper_title)
+            similarity = compare_titles_with_latex_cleaning(cited_title, paper_title)
             if similarity < 0.7:  # Using a reasonable threshold
                 from utils.error_utils import format_title_mismatch
-                details = format_title_mismatch(cited_title, paper_title) + f" (similarity: {similarity:.2f})"
+                # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
+                clean_cited_title = strip_latex_commands(cited_title)
+                details = format_title_mismatch(clean_cited_title, paper_title) + f" (similarity: {similarity:.2f})"
                 errors.append({
                     "warning_type": "title",
                     "warning_details": details
@@ -547,10 +550,12 @@ class OpenReviewReferenceChecker:
         paper_title = best_match.get('title', '').strip()
         if cited_title and paper_title:
-            similarity = calculate_title_similarity(cited_title, paper_title)
+            similarity = compare_titles_with_latex_cleaning(cited_title, paper_title)
             if similarity < 0.8:  # Slightly higher threshold for search results
                 from utils.error_utils import format_title_mismatch
-                details = format_title_mismatch(cited_title, paper_title) + f" (similarity: {similarity:.2f})"
+                # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
+                clean_cited_title = strip_latex_commands(cited_title)
+                details = format_title_mismatch(clean_cited_title, paper_title) + f" (similarity: {similarity:.2f})"
                 errors.append({
                     "warning_type": "title",
                     "warning_details": details

{academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/semantic_scholar.py RENAMED Viewed

@@ -28,7 +28,7 @@ import time
 import logging
 import re
 from typing import Dict, List, Tuple, Optional, Any, Union
-from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search
+from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search, strip_latex_commands, compare_titles_with_latex_cleaning
 from utils.error_utils import format_title_mismatch
 from config.settings import get_config
@@ -353,7 +353,7 @@ class NonArxivReferenceChecker:
                             cited_title = title.strip()
                             if cited_title and result_title:
-                                title_similarity = calculate_title_similarity(cited_title.lower(), result_title.lower())
+                                title_similarity = compare_titles_with_latex_cleaning(cited_title, result_title)
                                 logger.debug(f"Semantic Scholar ArXiv search title similarity: {title_similarity:.3f}")
                                 logger.debug(f"Cited title: '{cited_title}'")
                                 logger.debug(f"Found title: '{result_title}'")
@@ -385,7 +385,7 @@ class NonArxivReferenceChecker:
                         logger.debug(f"DEBUG: cited_title='{cited_title}', arxiv_title='{arxiv_title}'")
                         if cited_title and arxiv_title:
-                            title_similarity = calculate_title_similarity(cited_title.lower(), arxiv_title.lower())
+                            title_similarity = compare_titles_with_latex_cleaning(cited_title, arxiv_title)
                             logger.debug(f"ArXiv API title similarity: {title_similarity:.3f}")
                             logger.debug(f"Cited title: '{cited_title}'")
                             logger.debug(f"ArXiv title: '{arxiv_title}'")
@@ -419,7 +419,7 @@ class NonArxivReferenceChecker:
                         arxiv_title_check = arxiv_paper_check.get('title', '').strip()
                         cited_title_check = title.strip()
                         if cited_title_check and arxiv_title_check:
-                            title_similarity_check = calculate_title_similarity(cited_title_check.lower(), arxiv_title_check.lower())
+                            title_similarity_check = compare_titles_with_latex_cleaning(cited_title_check, arxiv_title_check)
                             if title_similarity_check < SIMILARITY_THRESHOLD:
                                 logger.debug(f"Detected ArXiv ID mismatch before raw text search - skipping unnecessary searches")
                                 arxiv_id_mismatch_detected = True
@@ -468,11 +468,13 @@ class NonArxivReferenceChecker:
                 return None, [], None
         # Check title using similarity function to handle formatting differences
-        title_similarity = calculate_title_similarity(title, found_title) if found_title else 0.0
+        title_similarity = compare_titles_with_latex_cleaning(title, found_title) if found_title else 0.0
         if found_title and title_similarity < SIMILARITY_THRESHOLD:
+            # Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
+            clean_cited_title = strip_latex_commands(title)
             errors.append({
                 'error_type': 'title',
-                'error_details': format_title_mismatch(title, found_title),
+                'error_details': format_title_mismatch(clean_cited_title, found_title),
                 'ref_title_correct': paper_data.get('title', '')
             })

{academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/webpage_checker.py RENAMED Viewed

@@ -7,6 +7,7 @@ from urllib.parse import urlparse, urljoin
 from typing import Dict, Optional, Tuple, List, Any
 from bs4 import BeautifulSoup
 import time
+from utils.text_utils import strip_latex_commands
 logger = logging.getLogger(__name__)
@@ -185,9 +186,11 @@ class WebPageChecker:
             if cited_title and page_title:
                 if not self._check_title_match(cited_title, page_title, page_description):
                     from utils.error_utils import format_title_mismatch
+                    # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
+                    clean_cited_title = strip_latex_commands(cited_title)
                     errors.append({
                         "warning_type": "title",
-                        "warning_details": format_title_mismatch(cited_title, page_title)
+                        "warning_details": format_title_mismatch(clean_cited_title, page_title)
                     })
             # Check if this is a documentation page for the cited topic

{academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/core/parallel_processor.py RENAMED Viewed

@@ -275,7 +275,10 @@ class ParallelReferenceProcessor:
         reference = result.reference
         # Print reference info in the same format as sequential mode
-        title = reference.get('title', 'Untitled')
+        raw_title = reference.get('title', 'Untitled')
+        # Clean LaTeX commands from title for display
+        from utils.text_utils import strip_latex_commands
+        title = strip_latex_commands(raw_title)
         from utils.text_utils import format_authors_for_display
         authors = format_authors_for_display(reference.get('authors', []))
         year = reference.get('year', '')

{academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/core/refchecker.py RENAMED Viewed

@@ -50,7 +50,8 @@ from utils.text_utils import (clean_author_name, clean_title, clean_title_basic,
                        detect_latex_bibliography_format, extract_latex_references,
                        detect_standard_acm_natbib_format, strip_latex_commands,
                        format_corrected_reference, is_name_match, enhanced_name_match,
-                       calculate_title_similarity, normalize_arxiv_url, deduplicate_urls)
+                       calculate_title_similarity, normalize_arxiv_url, deduplicate_urls,
+                       compare_authors)
 from utils.config_validator import ConfigValidator
 from services.pdf_processor import PDFProcessor
 from checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker
@@ -1789,7 +1790,7 @@ class ArxivReferenceChecker:
                     if authors:
                         db_authors = [author.get('name', '') for author in check_paper_data['authors']]
-                        authors_match, author_error = self.compare_authors(authors, db_authors)
+                        authors_match, author_error = compare_authors(authors, db_authors)
                         if authors_match:
                             paper_data = check_paper_data
                             search_strategy = "Normalized title with author match"
@@ -1901,10 +1902,12 @@ class ArxivReferenceChecker:
             if normalized_title != db_title:
                 from utils.error_utils import format_title_mismatch
+                # Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
+                clean_cited_title = strip_latex_commands(title)
                 logger.debug(f"DB Verification: Title mismatch - cited: '{title}', actual: '{paper_data.get('title')}'")
                 errors.append({
                     'error_type': 'title',
-                    'error_details': format_title_mismatch(title, paper_data.get('title')),
+                    'error_details': format_title_mismatch(clean_cited_title, paper_data.get('title')),
                     'ref_title_correct': paper_data.get('title')
                 })
@@ -1912,7 +1915,7 @@ class ArxivReferenceChecker:
         if authors and paper_data.get('authors'):
             # Extract author names from database data
             correct_names = [author.get('name', '') for author in paper_data['authors']]
-            authors_match, author_error = self.compare_authors(authors, correct_names)
+            authors_match, author_error = compare_authors(authors, correct_names)
             if not authors_match:
                 logger.debug(f"DB Verification: Author mismatch - {author_error}")
@@ -3054,6 +3057,13 @@ class ArxivReferenceChecker:
                 try:
                     # Extract bibliography
                     bibliography = self.extract_bibliography(paper, debug_mode)
+                    # Apply deduplication to all bibliography sources (not just LLM-extracted)
+                    if len(bibliography) > 1:  # Only deduplicate if we have multiple references
+                        original_count = len(bibliography)
+                        bibliography = self._deduplicate_bibliography_entries(bibliography)
+                        if len(bibliography) < original_count:
+                            logger.debug(f"Deduplicated {original_count} references to {len(bibliography)} unique references")
                     # Update statistics
                     self.total_papers_processed += 1
@@ -3493,8 +3503,9 @@ class ArxivReferenceChecker:
                 except Exception as e:
                     logger.error(f"LLM fallback failed: {e}")
                     return []
-            logger.debug("Using biblatex file")
-            return biblatex_refs
+            if len(biblatex_refs) > 0:
+                logger.debug("Using biblatex file")
+                return biblatex_refs
         # For non-standard formats, try LLM-based extraction if available
         if self.llm_extractor:
@@ -4284,9 +4295,9 @@ class ArxivReferenceChecker:
             # If either has no title, can't reliably determine if duplicate
             return False
-        # If titles match exactly, consider them duplicates
-        # This handles the case where the same paper appears multiple times
-        if seg1['title'] == seg2['title']:
+        # If titles match exactly (case-insensitive), consider them duplicates
+        # This handles the case where the same paper appears multiple times with different capitalization
+        if seg1['title'].lower() == seg2['title'].lower():
             return True
         # Special case: Check if one title is an arXiv identifier and the other is a real title
@@ -4299,16 +4310,54 @@ class ArxivReferenceChecker:
         author1 = seg1['author']
         author2 = seg2['author']
-        if author1 and author2 and author1 == author2:
+        if author1 and author2 and author1.lower() == author2.lower():
             # Same authors - check if one title is substring of other or significant similarity
-            title1 = seg1['title']
-            title2 = seg2['title']
+            title1 = seg1['title'].lower()
+            title2 = seg2['title'].lower()
             if (title1 in title2 or title2 in title1):
                 return True
         return False
+    def _deduplicate_bibliography_entries(self, bibliography):
+        """
+        Deduplicate bibliography entries using title and author comparison.
+        This works with structured reference dictionaries from BibTeX/LaTeX parsing,
+        as opposed to _deduplicate_references_with_segment_matching which works with raw text.
+        Args:
+            bibliography: List of reference dictionaries with 'title', 'authors', etc.
+        Returns:
+            List of unique reference dictionaries
+        """
+        if len(bibliography) <= 1:
+            return bibliography
+        unique_refs = []
+        seen_titles = set()
+        for ref in bibliography:
+            title = ref.get('title', '').strip()
+            if not title:
+                # Keep references without titles (they can't be deduplicated)
+                unique_refs.append(ref)
+                continue
+            # Normalize title for comparison (case-insensitive, basic cleanup)
+            normalized_title = title.lower().strip()
+            # Check if we've seen this title before (case-insensitive)
+            if normalized_title in seen_titles:
+                logger.debug(f"Skipping duplicate reference: '{title}'")
+            else:
+                unique_refs.append(ref)
+                seen_titles.add(normalized_title)
+        return unique_refs
     def _is_arxiv_identifier_title_mismatch(self, seg1, seg2):
         """
         Check if one reference has an arXiv identifier as title while the other has a real title,
@@ -5087,60 +5136,6 @@ class ArxivReferenceChecker:
         return references
-    def compare_authors(self, cited_authors, correct_authors):
-        """
-        Compare author lists to check if they match using improved name matching.
-        Uses the utility function is_name_match for robust author name comparison.
-        """
-        # Clean up author names
-        cleaned_cited = []
-        for author in cited_authors:
-            # Remove reference numbers (e.g., "[1]")
-            author = re.sub(r'^\[\d+\]', '', author)
-            # Remove line breaks
-            author = author.replace('\n', ' ')
-            # Handle "et al" cases properly
-            author_clean = author.strip()
-            if author_clean.lower() == 'et al':
-                # Skip pure "et al" entries
-                continue
-            elif 'et al' in author_clean.lower():
-                # Remove "et al" from the author name (e.g., "S. M. Lundberg et al" -> "S. M. Lundberg")
-                author_clean = re.sub(r'\s+et\s+al\.?', '', author_clean, flags=re.IGNORECASE).strip()
-                if author_clean:  # Only add if something remains
-                    cleaned_cited.append(author_clean)
-            else:
-                cleaned_cited.append(author_clean)
-        if not cleaned_cited:
-            return True, "No authors to compare"
-        # Handle "et al" cases and length mismatches
-        has_et_al = any('et al' in a.lower() for a in cited_authors)
-        if len(cleaned_cited) < len(correct_authors) and (has_et_al or len(cleaned_cited) <= 3):
-            # Only compare the authors that are listed
-            correct_authors = correct_authors[:len(cleaned_cited)]
-        elif len(cleaned_cited) > len(correct_authors) and len(correct_authors) >= 3:
-            # Use available correct authors
-            cleaned_cited = cleaned_cited[:len(correct_authors)]
-        # If there's a big count mismatch and no "et al", it's likely an error
-        if abs(len(cleaned_cited) - len(correct_authors)) > 3 and not has_et_al:
-            return False, "Author count mismatch"
-        # Compare first author (most important) using the improved utility function
-        if cleaned_cited and correct_authors:
-            # Use raw names for comparison (is_name_match handles normalization internally)
-            cited_first = cleaned_cited[0]
-            correct_first = correct_authors[0]
-            if not enhanced_name_match(cited_first, correct_first):
-                from utils.error_utils import format_first_author_mismatch
-                return False, format_first_author_mismatch(cited_first, correct_first)
-        return True, "Authors match"
     def normalize_text(self, text):
         """
@@ -5251,6 +5246,19 @@ class ArxivReferenceChecker:
             return False
         return True
+    def compare_authors(self, authors1, authors2):
+        """
+        Compare authors using the text_utils compare_authors function.
+        Args:
+            authors1: First list of authors
+            authors2: Second list of authors
+        Returns:
+            Tuple of (match_result, error_message)
+        """
+        return compare_authors(authors1, authors2)
     def _verify_references_sequential(self, paper, bibliography, paper_errors, error_types, unverified_count, debug_mode):
         """
         Sequential reference verification (original implementation)
@@ -5267,7 +5275,10 @@ class ArxivReferenceChecker:
             ref_id = self.extract_arxiv_id_from_url(reference['url'])
             # Print reference info in non-debug mode (improved formatting)
-            title = reference.get('title', 'Untitled')
+            raw_title = reference.get('title', 'Untitled')
+            # Clean LaTeX commands from title for display
+            from utils.text_utils import strip_latex_commands
+            title = strip_latex_commands(raw_title)
             from utils.text_utils import format_authors_for_display
             authors = format_authors_for_display(reference.get('authors', []))
             year = reference.get('year', '')

{academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/arxiv_utils.py RENAMED Viewed

@@ -111,56 +111,8 @@ def download_arxiv_source(arxiv_id):
                     main_tex_content = largest_file[1]
                     logger.debug(f"Using largest tex file: {largest_file[0]}")
-            # Find which .bib files are actually referenced in the main tex file
-            bib_content = None
-            if bib_files and main_tex_content:
-                # Extract bibliography references from main tex file
-                referenced_bibs = []
-                bib_pattern = r'\\bibliography\{([^}]+)\}'
-                matches = re.findall(bib_pattern, main_tex_content)
-                for match in matches:
-                    # Handle multiple bib files separated by commas
-                    bib_names = [name.strip() for name in match.split(',')]
-                    for bib_name in bib_names:
-                        # Add .bib extension if not present
-                        if not bib_name.endswith('.bib'):
-                            bib_name += '.bib'
-                        referenced_bibs.append(bib_name)
-                # Use only referenced .bib files, or all if no references found
-                if referenced_bibs:
-                    used_bibs = []
-                    for bib_name in referenced_bibs:
-                        if bib_name in bib_files:
-                            used_bibs.append(bib_files[bib_name])
-                            logger.debug(f"Using referenced .bib file: {bib_name}")
-                        else:
-                            logger.debug(f"Referenced .bib file not found: {bib_name}")
-                    if used_bibs:
-                        raw_bib_content = '\n\n'.join(used_bibs)
-                        # Filter BibTeX to only include cited references
-                        bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
-                        logger.debug(f"Found {len(used_bibs)} referenced .bib files out of {len(bib_files)} total")
-                    else:
-                        # Fallback to all bib files if none of the referenced ones found
-                        raw_bib_content = '\n\n'.join(bib_files.values())
-                        bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
-                        logger.debug(f"No referenced .bib files found, using all {len(bib_files)} .bib files")
-                else:
-                    # No \bibliography command found, use all bib files
-                    raw_bib_content = '\n\n'.join(bib_files.values())
-                    bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
-                    logger.debug(f"No \\bibliography command found, using all {len(bib_files)} .bib files")
-            elif bib_files:
-                # No main tex file but have bib files
-                raw_bib_content = '\n\n'.join(bib_files.values())
-                # Can't filter without tex files, so use original content
-                bib_content = raw_bib_content
-                logger.debug(f"Found {len(bib_files)} .bib files (no main tex to filter)")
+            # Process .bib files using shared logic
+            bib_content = select_and_filter_bib_files(bib_files, main_tex_content, tex_files)
             # Combine all bbl file contents
             bbl_content = None
@@ -219,6 +171,78 @@ def download_arxiv_bibtex(arxiv_id):
         return None
+def select_and_filter_bib_files(bib_files, main_tex_content, tex_files):
+    """
+    Select appropriate .bib files based on main TeX file references and filter by citations.
+    Args:
+        bib_files: Dict of .bib files {filename: content}
+        main_tex_content: Content of main tex file
+        tex_files: Dict of all tex files {filename: content} (for filtering)
+    Returns:
+        Filtered BibTeX content or None if no files available
+    """
+    import re
+    if not bib_files:
+        return None
+    if main_tex_content:
+        # Extract bibliography references from main tex file
+        referenced_bibs = []
+        bib_pattern = r'\\bibliography\{([^}]+)\}'
+        matches = re.findall(bib_pattern, main_tex_content)
+        for match in matches:
+            # Handle multiple bib files separated by commas
+            bib_names = [name.strip() for name in match.split(',')]
+            for bib_name in bib_names:
+                # Add .bib extension if not present
+                if not bib_name.endswith('.bib'):
+                    bib_name += '.bib'
+                referenced_bibs.append(bib_name)
+        # Use only referenced .bib files, or all if no references found
+        if referenced_bibs:
+            used_bibs = []
+            seen_bib_names = set()  # Track which bib files we've already added
+            for bib_name in referenced_bibs:
+                if bib_name in bib_files and bib_name not in seen_bib_names:
+                    used_bibs.append(bib_files[bib_name])
+                    seen_bib_names.add(bib_name)
+                    logger.debug(f"Using referenced .bib file: {bib_name}")
+                elif bib_name in seen_bib_names:
+                    logger.debug(f"Skipping duplicate .bib file: {bib_name}")
+                else:
+                    logger.debug(f"Referenced .bib file not found: {bib_name}")
+            if used_bibs:
+                raw_bib_content = '\n\n'.join(used_bibs)
+                # Filter BibTeX to only include cited references
+                filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
+                logger.debug(f"Found {len(used_bibs)} referenced .bib files out of {len(bib_files)} total")
+                return filtered_content
+            else:
+                # Fallback to all bib files if none of the referenced ones found
+                raw_bib_content = '\n\n'.join(bib_files.values())
+                filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
+                logger.debug(f"No referenced .bib files found, using all {len(bib_files)} .bib files")
+                return filtered_content
+        else:
+            # No \bibliography command found, use all bib files
+            raw_bib_content = '\n\n'.join(bib_files.values())
+            filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
+            logger.debug(f"No \\bibliography command found, using all {len(bib_files)} .bib files")
+            return filtered_content
+    else:
+        # No main tex file but have bib files
+        raw_bib_content = '\n\n'.join(bib_files.values())
+        # Can't filter without tex files, so use original content
+        logger.debug(f"Found {len(bib_files)} .bib files (no main tex to filter)")
+        return raw_bib_content
 def extract_cited_keys_from_tex(tex_files, main_tex_content):
     """
     Extract all citation keys from TeX files.
@@ -261,7 +285,11 @@ def is_reference_used(reference_key, cited_keys):
     Returns:
         True if the reference is cited, False otherwise
     """
-    return reference_key in cited_keys
+    result = reference_key in cited_keys
+    # Add debugging for the first few mismatches to understand the issue
+    if not result and len([k for k in cited_keys if k.startswith('a')]) < 3:  # Limit debug output
+        logger.debug(f"Key '{reference_key}' not found in cited_keys")
+    return result
 def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
@@ -291,14 +319,30 @@ def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
         from utils.bibtex_parser import parse_bibtex_entries
         entries = parse_bibtex_entries(bib_content)
-        # Filter entries to only cited ones
+        # Filter entries to only cited ones and remove duplicates
         cited_entries = []
+        seen_keys = set()
+        not_cited_count = 0
+        duplicate_count = 0
         for entry in entries:
             entry_key = entry.get('key', '')
             if is_reference_used(entry_key, cited_keys):
-                cited_entries.append(entry)
+                if entry_key not in seen_keys:
+                    cited_entries.append(entry)
+                    seen_keys.add(entry_key)
+                else:
+                    duplicate_count += 1
+                    logger.debug(f"Skipping duplicate entry: '{entry_key}'")
+            else:
+                not_cited_count += 1
+                # Log first few entries that are NOT cited for debugging
+                if not_cited_count <= 5:
+                    logger.debug(f"Entry NOT cited: '{entry_key}'")
-        logger.debug(f"Filtered BibTeX: {len(entries)} total -> {len(cited_entries)} cited")
+        logger.debug(f"Filtered BibTeX: {len(entries)} total -> {len(cited_entries)} cited (removed {duplicate_count} duplicates)")
+        logger.debug(f"Citation keys found: {len(cited_keys)} keys")
+        logger.debug(f"Sample cited keys: {list(cited_keys)[:10]}")
         # Reconstruct BibTeX content from cited entries
         if not cited_entries:

{academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/text_utils.py RENAMED Viewed

@@ -580,6 +580,9 @@ def clean_title_for_search(title):
     if not isinstance(title, str):
         return str(title) if title is not None else ''
+    # Strip LaTeX commands to handle math formatting and other LaTeX markup
+    title = strip_latex_commands(title)
     # Clean up newlines and normalize whitespace (but preserve other structure)
     title = title.replace('\n', ' ').strip()
     title = re.sub(r'\s+', ' ', title)  # Normalize whitespace only
@@ -753,8 +756,11 @@ def normalize_paper_title(title: str) -> str:
     if not title:
         return ""
+    # Strip LaTeX commands first to handle math formatting consistently
+    normalized = strip_latex_commands(title)
     # Convert to lowercase
-    normalized = title.lower()
+    normalized = normalized.lower()
     # Remove common prefixes that don't affect the actual title content
     prefixes_to_remove = [
@@ -2107,21 +2113,37 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
         return True, f"Authors match (verified {len(cleaned_cited)} of {len(correct_names)} with et al)"
+    # Detect if cited authors look like parsing fragments
+    # (many short single-word entries that might be first/last name fragments)
+    def looks_like_fragments(authors_list):
+        if len(authors_list) < 4:  # Need at least 4 to detect fragment pattern
+            return False
+        single_word_count = sum(1 for author in authors_list if len(author.strip().split()) == 1)
+        return single_word_count >= len(authors_list) * 0.7  # 70% or more are single words
     # Normal case without "et al" - compare all authors
     if len(cleaned_cited) != len(correct_names):
-        # For non-et-al cases, be more strict about count mismatches
-        # Allow minor flexibility (1 author difference) but not more
-        if abs(len(cleaned_cited) - len(correct_names)) > 1:
+        # Check if cited authors look like parsing fragments
+        if looks_like_fragments(cleaned_cited):
             from utils.error_utils import format_author_count_mismatch
-            # Convert cited names to display format (First Last) before showing in error
             display_cited = [format_author_for_display(author) for author in cleaned_cited]
             error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
             return False, error_msg
-        # Use the shorter list for comparison
-        min_len = min(len(cleaned_cited), len(correct_names))
-        comparison_cited = cleaned_cited[:min_len]
-        comparison_correct = correct_names[:min_len]
+        # For all count mismatches, show the count mismatch error
+        if len(cleaned_cited) < len(correct_names):
+            from utils.error_utils import format_author_count_mismatch
+            display_cited = [format_author_for_display(author) for author in cleaned_cited]
+            error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
+            return False, error_msg
+        # For cases where cited > correct, also show count mismatch
+        elif len(cleaned_cited) > len(correct_names):
+            from utils.error_utils import format_author_count_mismatch
+            display_cited = [format_author_for_display(author) for author in cleaned_cited]
+            error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
+            return False, error_msg
     else:
         comparison_cited = cleaned_cited
         comparison_correct = correct_names
@@ -2484,8 +2506,64 @@ def strip_latex_commands(text):
     # Remove font size commands
     text = re.sub(r'\\(tiny|scriptsize|footnotesize|small|normalsize|large|Large|LARGE|huge|Huge)\b', '', text)
-    # Remove math mode delimiters
-    text = re.sub(r'\$([^$]*)\$', r'\1', text)
+    # Handle complex math mode patterns first
+    # Pattern like $\{$$\mu$second-scale$\}$ should become μsecond-scale
+    def process_complex_math(match):
+        content = match.group(1)
+        # Handle common Greek letters
+        content = re.sub(r'\\mu\b', 'μ', content)  # \mu -> μ
+        content = re.sub(r'\\alpha\b', 'α', content)  # \alpha -> α
+        content = re.sub(r'\\beta\b', 'β', content)   # \beta -> β
+        content = re.sub(r'\\gamma\b', 'γ', content)  # \gamma -> γ
+        content = re.sub(r'\\delta\b', 'δ', content)  # \delta -> δ
+        content = re.sub(r'\\epsilon\b', 'ε', content)  # \epsilon -> ε
+        content = re.sub(r'\\lambda\b', 'λ', content)  # \lambda -> λ
+        content = re.sub(r'\\pi\b', 'π', content)    # \pi -> π
+        content = re.sub(r'\\sigma\b', 'σ', content)  # \sigma -> σ
+        content = re.sub(r'\\theta\b', 'θ', content)  # \theta -> θ
+        # Remove any remaining LaTeX commands and braces from inside math
+        content = re.sub(r'\\[a-zA-Z]+\b', '', content)
+        content = re.sub(r'[{}]', '', content)
+        # Clean up any remaining $ signs
+        content = re.sub(r'\$+', '', content)
+        return content
+    # Handle complex nested math patterns first
+    # Pattern like $\{$$\mu$second-scale$\}$ should become μsecond-scale
+    def process_nested_math_specifically(match):
+        content = match.group(0)
+        # Handle the specific pattern: $\{$$\mu$second-scale$\}$
+        # Extract the meaningful parts
+        if r'\mu' in content:
+            # Replace \mu with μ and extract the surrounding text
+            content = re.sub(r'\\mu\b', 'μ', content)
+        # Remove all LaTeX math markup
+        content = re.sub(r'[\$\{\}\\]+', '', content)
+        return content
+    # Handle the specific problematic pattern
+    text = re.sub(r'\$\\\{[^}]*\\\}\$', process_nested_math_specifically, text)
+    # Handle Greek letters in math mode before removing delimiters
+    def process_standard_math(match):
+        content = match.group(1)
+        # Handle common Greek letters - content has single backslashes
+        content = re.sub(r'\\mu\b', 'μ', content)
+        content = re.sub(r'\\alpha\b', 'α', content)
+        content = re.sub(r'\\beta\b', 'β', content)
+        content = re.sub(r'\\gamma\b', 'γ', content)
+        content = re.sub(r'\\delta\b', 'δ', content)
+        content = re.sub(r'\\epsilon\b', 'ε', content)
+        content = re.sub(r'\\lambda\b', 'λ', content)
+        content = re.sub(r'\\pi\b', 'π', content)
+        content = re.sub(r'\\sigma\b', 'σ', content)
+        content = re.sub(r'\\theta\b', 'θ', content)
+        # Remove any remaining LaTeX commands
+        content = re.sub(r'\\[a-zA-Z]+\b', '', content)
+        return content
+    # Remove standard math mode delimiters with Greek letter processing
+    text = re.sub(r'\$([^$]*)\$', process_standard_math, text)
     text = re.sub(r'\\begin\{equation\}.*?\\end\{equation\}', '', text, flags=re.DOTALL)
     text = re.sub(r'\\begin\{align\}.*?\\end\{align\}', '', text, flags=re.DOTALL)
@@ -3369,7 +3447,18 @@ def _extract_corrected_reference_data(error_entry: dict, corrected_data: dict) -
     """
     # Get the corrected information
     correct_title = error_entry.get('ref_title_correct') or corrected_data.get('title', '')
-    correct_authors = error_entry.get('ref_authors_correct') or corrected_data.get('authors', '')
+    # Handle authors - can be string or list of dicts from API
+    authors_raw = error_entry.get('ref_authors_correct') or corrected_data.get('authors', '')
+    if isinstance(authors_raw, list):
+        # Convert list of author dicts to comma-separated string
+        if authors_raw and isinstance(authors_raw[0], dict):
+            correct_authors = ', '.join([author.get('name', '') for author in authors_raw])
+        else:
+            correct_authors = ', '.join(authors_raw)
+    else:
+        correct_authors = str(authors_raw) if authors_raw else ''
     correct_year = error_entry.get('ref_year_correct') or corrected_data.get('year', '')
     # Prioritize the verified URL that was actually used for verification
@@ -3573,7 +3662,39 @@ def format_corrected_plaintext(original_reference, corrected_data, error_entry):
     if correct_url:
         citation_parts.append(f"{correct_url}")
-    return '. '.join(citation_parts) + '.'
+    citation_text = '. '.join(citation_parts) + '.'
+    # Add citation key information if available (for easy copying)
+    citation_key = original_reference.get('bibtex_key') or original_reference.get('bibitem_key')
+    if citation_key and citation_key != 'unknown':
+        bibtex_type = original_reference.get('bibtex_type', 'misc')
+        citation_text += f"\n\n% Citation key for BibTeX: @{bibtex_type}{{{citation_key}, ...}}"
+    return citation_text
+def compare_titles_with_latex_cleaning(cited_title: str, database_title: str) -> float:
+    """
+    Compare two titles with proper LaTeX cleaning for accurate similarity scoring.
+    This function ensures both titles are cleaned of LaTeX commands before comparison
+    to avoid false mismatches due to formatting differences like {LLM}s vs LLMs.
+    Args:
+        cited_title: Title from cited reference (may contain LaTeX)
+        database_title: Title from database (usually already clean)
+    Returns:
+        Similarity score between 0 and 1
+    """
+    if not cited_title or not database_title:
+        return 0.0
+    # Clean LaTeX commands from cited title to match database format
+    clean_cited = strip_latex_commands(cited_title)
+    # Calculate similarity using cleaned titles
+    return calculate_title_similarity(clean_cited, database_title)
 def calculate_title_similarity(title1: str, title2: str) -> float:
@@ -3902,6 +4023,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
                 # Handle specific multi-word patterns and well-known acronyms
                 'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
                 'pnas': 'proceedings of the national academy of sciences',
+                # Special cases that don't follow standard acronym patterns
+                'neurips': 'neural information processing systems',  # Special case
+                'nips': 'neural information processing systems',     # old name for neurips
             }
             # Sort by length (longest first) to ensure longer matches take precedence
             for abbrev, expansion in sorted(common_abbrevs.items(), key=lambda x: len(x[0]), reverse=True):