PyPI - academic-refchecker - Versions diffs - 1.2.45__tar.gz → 1.2.47__tar.gz - Mend

academic-refchecker 1.2.45.tar.gz → 1.2.47.tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

{academic_refchecker-1.2.45/src/academic_refchecker.egg-info → academic_refchecker-1.2.47}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.45
+Version: 1.2.47
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT

{academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/__version__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """Version information for RefChecker."""
-__version__ = "1.2.45"
+__version__ = "1.2.47"

{academic_refchecker-1.2.45 → academic_refchecker-1.2.47/src/academic_refchecker.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.45
+Version: 1.2.47
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT

{academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/local_semantic_scholar.py RENAMED Viewed

@@ -430,11 +430,22 @@ class LocalNonArxivReferenceChecker:
                 logger.debug(f"Local DB: Author mismatch - {author_error}")
                 errors.append(create_author_error(author_error, paper_data.get('authors', [])))
-        # Verify year
+        # Verify year (with tolerance)
         paper_year = paper_data.get('year')
-        if year and paper_year and year != paper_year:
-            logger.debug(f"Local DB: Year mismatch - cited: {year}, actual: {paper_year}")
-            errors.append(create_year_warning(year, paper_year))
+        if year and paper_year:
+            # Get year tolerance from config (default to 1 if not available)
+            year_tolerance = 1  # Default tolerance
+            try:
+                from config.settings import get_config
+                config = get_config()
+                year_tolerance = config.get('text_processing', {}).get('year_tolerance', 1)
+            except (ImportError, Exception):
+                pass  # Use default if config not available
+            # Only flag as mismatch if the difference is greater than tolerance
+            if abs(year - paper_year) > year_tolerance:
+                logger.debug(f"Local DB: Year mismatch - cited: {year}, actual: {paper_year}")
+                errors.append(create_year_warning(year, paper_year))
         # Verify DOI
         paper_doi = None

{academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/semantic_scholar.py RENAMED Viewed

@@ -543,49 +543,50 @@ class NonArxivReferenceChecker:
         elif paper_venue and not isinstance(paper_venue, str):
             paper_venue = str(paper_venue)
+        # Check venue mismatches
         if cited_venue and paper_venue:
             # Use the utility function to check if venues are substantially different
             if are_venues_substantially_different(cited_venue, paper_venue):
                 from utils.error_utils import create_venue_warning
                 errors.append(create_venue_warning(cited_venue, paper_venue))
         elif not cited_venue and paper_venue:
-            # Check if this is an arXiv paper first
-            external_ids = paper_data.get('externalIds', {})
-            arxiv_id = external_ids.get('ArXiv') if external_ids else None
-            if arxiv_id:
-                # For arXiv papers, suggest including the arXiv URL instead of venue
-                arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
-                # Check if the reference already includes this ArXiv URL or equivalent DOI
-                reference_url = reference.get('url', '')
-                # Check for direct arXiv URL match
-                has_arxiv_url = arxiv_url in reference_url
-                # Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
-                arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
-                has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
-                if not (has_arxiv_url or has_arxiv_doi):
+            # Original reference has the venue in raw text but not parsed correctly
+            raw_text = reference.get('raw_text', '')
+            if raw_text and '#' in raw_text:
+                # Check if venue might be in the raw text format (author#title#venue#year#url)
+                parts = raw_text.split('#')
+                if len(parts) >= 3 and parts[2].strip():
+                    # Venue is present in raw text but missing from parsed reference
                     errors.append({
                         'warning_type': 'venue',
-                        'warning_details': f"Reference should include arXiv URL: {arxiv_url}",
-                        'ref_url_correct': arxiv_url
+                        'warning_details': f"Venue missing: should include '{paper_venue}'",
+                        'ref_venue_correct': paper_venue
                     })
-            else:
-                # Original reference has the venue in raw text but not parsed correctly
-                raw_text = reference.get('raw_text', '')
-                if raw_text and '#' in raw_text:
-                    # Check if venue might be in the raw text format (author#title#venue#year#url)
-                    parts = raw_text.split('#')
-                    if len(parts) >= 3 and parts[2].strip():
-                        # Venue is present in raw text but missing from parsed reference
-                        errors.append({
-                            'warning_type': 'venue',
-                            'warning_details': f"Venue missing: should include '{paper_venue}'",
-                            'ref_venue_correct': paper_venue
-                        })
+        # Always check for missing arXiv URLs when paper has arXiv ID
+        external_ids = paper_data.get('externalIds', {})
+        arxiv_id = external_ids.get('ArXiv') if external_ids else None
+        if arxiv_id:
+            # For arXiv papers, check if reference includes the arXiv URL
+            arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
+            # Check if the reference already includes this ArXiv URL or equivalent DOI
+            reference_url = reference.get('url', '')
+            # Check for direct arXiv URL match
+            has_arxiv_url = arxiv_url in reference_url
+            # Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
+            arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
+            has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
+            if not (has_arxiv_url or has_arxiv_doi):
+                errors.append({
+                    'warning_type': 'url',
+                    'warning_details': f"Reference could include arXiv URL: {arxiv_url}",
+                    'ref_url_correct': arxiv_url
+                })
         # Verify DOI
         paper_doi = None

{academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/webpage_checker.py RENAMED Viewed

@@ -71,7 +71,8 @@ class WebPageChecker:
         doc_indicators = [
             'docs', 'documentation', 'readthedocs.io', 'help', 'guide', 'tutorial',
             'reference', 'manual', 'wiki', 'blog', 'api', 'developer', 'platform',
-            'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop'
+            'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop',
+            'posts'  # For blog posts and forum posts like LessWrong
         ]
         return any(indicator in url.lower() for indicator in doc_indicators) or self._is_likely_webpage(url)
@@ -84,7 +85,8 @@ class WebPageChecker:
         doc_domains = [
             'pytorch.org', 'tensorflow.org', 'readthedocs.io', 'onnxruntime.ai',
             'deepspeed.ai', 'huggingface.co', 'openai.com', 'microsoft.com',
-            'google.com', 'nvidia.com', 'intel.com', 'langchain.com'
+            'google.com', 'nvidia.com', 'intel.com', 'langchain.com',
+            'lesswrong.com'  # LessWrong rationality and AI safety blog platform
         ]
         return any(domain in parsed.netloc for domain in doc_domains)
@@ -395,6 +397,14 @@ class WebPageChecker:
         organization = site_info.get('organization', '').lower()
         domain = site_info.get('domain', '').lower()
+        # Accept generic web resource terms - these are valid for any web URL
+        generic_web_terms = [
+            'web resource', 'web site', 'website', 'online resource',
+            'online', 'web', 'internet resource', 'web page', 'webpage'
+        ]
+        if cited_lower in generic_web_terms:
+            return True
         # Direct matches
         if cited_lower in organization or organization in cited_lower:
             return True

{academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/core/refchecker.py RENAMED Viewed

@@ -1922,16 +1922,27 @@ class ArxivReferenceChecker:
                     'ref_authors_correct': ', '.join(correct_names)
                 })
-        # Verify year
+        # Verify year (with tolerance)
         paper_year = paper_data.get('year')
-        if year and paper_year and year != paper_year:
-            logger.debug(f"DB Verification: Year mismatch - cited: {year}, actual: {paper_year}")
-            from utils.error_utils import format_year_mismatch
-            errors.append({
-                'warning_type': 'year',
-                'warning_details': format_year_mismatch(year, paper_year),
-                'ref_year_correct': paper_year
-            })
+        if year and paper_year:
+            # Get year tolerance from config (default to 1 if not available)
+            year_tolerance = 1  # Default tolerance
+            try:
+                from config.settings import get_config
+                config = get_config()
+                year_tolerance = config.get('text_processing', {}).get('year_tolerance', 1)
+            except (ImportError, Exception):
+                pass  # Use default if config not available
+            # Only flag as mismatch if the difference is greater than tolerance
+            if abs(year - paper_year) > year_tolerance:
+                logger.debug(f"DB Verification: Year mismatch - cited: {year}, actual: {paper_year}")
+                from utils.error_utils import format_year_mismatch
+                errors.append({
+                    'warning_type': 'year',
+                    'warning_details': format_year_mismatch(year, paper_year),
+                    'ref_year_correct': paper_year
+                })
         # Verify DOI
         if doi and external_ids.get('DOI'):

{academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/arxiv_utils.py RENAMED Viewed

@@ -392,32 +392,29 @@ def get_bibtex_content(paper):
         logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
         tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
-        # Choose between .bib and .bbl files based on content richness
-        # Prioritize .bbl if it has more references than filtered .bib, otherwise prefer .bib
+        # Choose between .bib and .bbl files - .bbl files take priority when they contain entries
+        # .bbl files are processed biblatex output that reflects exactly what was cited
         if bib_content and bbl_content:
-            # Count entries in both
+            # Count entries in both for logging
             bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
             bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
-            # If we have LaTeX content, get filtered BibTeX count
-            filtered_bib_count = bib_entry_count
-            filtered_content = bib_content
-            if tex_content:
-                cited_keys = extract_cited_keys_from_tex({}, tex_content)
-                if cited_keys:
-                    logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
-                    filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
-                    filtered_bib_count = len(re.findall(r'@\w+\s*\{', filtered_content))
-            logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, filtered .bib has {filtered_bib_count} entries")
+            logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, .bib has {bib_entry_count} entries")
-            # Prioritize .bbl if it has significantly more entries
-            if bbl_entry_count > filtered_bib_count * 1.5:  # 50% more entries threshold
-                logger.info(f"Using .bbl files from ArXiv source")
+            # Only use .bbl if it actually contains bibliography entries
+            if bbl_entry_count > 0:
+                logger.info(f"Using .bbl files from ArXiv source (biblatex takes priority over bibtex)")
                 return bbl_content
             else:
-                logger.info(f"Using filtered .bib files")
-                return filtered_content
+                logger.info(f"Using .bib files from ArXiv source (.bbl file is empty)")
+                # If we have LaTeX content, filter BibTeX by cited keys
+                if tex_content:
+                    cited_keys = extract_cited_keys_from_tex({}, tex_content)
+                    if cited_keys:
+                        logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
+                        filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
+                        return filtered_content
+                return bib_content
         elif bib_content:
             logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")

{academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/doi_utils.py RENAMED Viewed

@@ -99,9 +99,8 @@ def compare_dois(doi1: str, doi2: str) -> bool:
     """
     Compare two DOIs for equality, handling different formats and prefixes.
-    This function performs exact matching after normalization, which means
-    DOIs are only considered equal if they are identical after removing
-    prefixes, case differences, and punctuation.
+    This function performs exact matching after normalization, with support
+    for partial DOI citations where a shorter DOI is a valid prefix of a longer one.
     Args:
         doi1: First DOI to compare
@@ -117,8 +116,27 @@ def compare_dois(doi1: str, doi2: str) -> bool:
     norm_doi1 = normalize_doi(doi1)
     norm_doi2 = normalize_doi(doi2)
-    # DOIs must be exactly identical after normalization
-    return norm_doi1 == norm_doi2
+    # First try exact match
+    if norm_doi1 == norm_doi2:
+        return True
+    # Handle partial DOI citations - if one DOI is a prefix of the other, consider it a match
+    # This handles cases like "10.1007" being cited instead of the full "10.1007/s10458-025-09691-y"
+    if len(norm_doi1) != len(norm_doi2):
+        shorter_doi = norm_doi1 if len(norm_doi1) < len(norm_doi2) else norm_doi2
+        longer_doi = norm_doi2 if len(norm_doi1) < len(norm_doi2) else norm_doi1
+        # Only consider it a valid partial match if:
+        # 1. The shorter DOI is at least 7 characters (e.g., "10.1007")
+        # 2. The longer DOI starts with the shorter DOI
+        # 3. The next character in the longer DOI is '/' or '.' (valid DOI separators)
+        if (len(shorter_doi) >= 7 and
+            longer_doi.startswith(shorter_doi) and
+            len(longer_doi) > len(shorter_doi) and
+            longer_doi[len(shorter_doi)] in ['/', '.']):
+            return True
+    return False
 def construct_doi_url(doi: str) -> str:

{academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/error_utils.py RENAMED Viewed

@@ -183,6 +183,14 @@ def clean_venue_for_comparison(venue: str) -> str:
     return normalize_venue_for_display(venue)
+def format_missing_venue(correct_venue: str) -> str:
+    """
+    Format a missing venue message with only the actual value.
+    """
+    # Only show the actual venue; omit the empty cited line
+    return f"Missing venue: '{correct_venue}'"
 def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]:
     """
     Create a standardized venue warning dictionary.
@@ -197,7 +205,15 @@ def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]
     # Clean both venues for display in the warning
     clean_cited = clean_venue_for_comparison(cited_venue)
     clean_correct = clean_venue_for_comparison(correct_venue)
+    # If cited venue cleans to empty, treat as missing venue instead of mismatch
+    if not clean_cited and clean_correct:
+        return {
+            'warning_type': 'venue',
+            'warning_details': format_missing_venue(clean_correct),
+            'ref_venue_correct': correct_venue
+        }
     return {
         'warning_type': 'venue',
         'warning_details': format_three_line_mismatch("Venue mismatch", clean_cited, clean_correct),

{academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/text_utils.py RENAMED Viewed

@@ -506,8 +506,10 @@ def clean_author_name(author):
     # Fix spacing around periods in initials (e.g., "Y . Li" -> "Y. Li")
     author = re.sub(r'(\w)\s+\.', r'\1.', author)
-    # Remove common prefixes/suffixes
-    author = re.sub(r'\b(Dr\.?|Prof\.?|Professor|Mr\.?|Ms\.?|Mrs\.?)\s*', '', author, flags=re.IGNORECASE)
+    # Remove common honorific prefixes only when they are standalone at the start (require trailing whitespace)
+    # Previous pattern falsely removed the leading "Mr" from names like "Mrinmaya" due to optional whitespace.
+    # Anchor to start and require at least one space after the title to avoid stripping inside longer names.
+    author = re.sub(r'^(?:Dr|Prof|Professor|Mr|Ms|Mrs)\.?\s+', '', author, flags=re.IGNORECASE)
     # Remove email addresses
     author = re.sub(r'\S+@\S+\.\S+', '', author)
@@ -2111,7 +2113,9 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
         # Allow minor flexibility (1 author difference) but not more
         if abs(len(cleaned_cited) - len(correct_names)) > 1:
             from utils.error_utils import format_author_count_mismatch
-            error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), cleaned_cited, correct_names)
+            # Convert cited names to display format (First Last) before showing in error
+            display_cited = [format_author_for_display(author) for author in cleaned_cited]
+            error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
             return False, error_msg
         # Use the shorter list for comparison
@@ -3586,6 +3590,12 @@ def calculate_title_similarity(title1: str, title2: str) -> float:
     # Normalize titles for comparison
     t1 = title1.lower().strip()
     t2 = title2.lower().strip()
+    # Remove trailing year suffixes like ", 2024" or " 2024" for robust matching
+    def strip_trailing_year(s: str) -> str:
+        return re.sub(r"[,\s]*\b(19|20)\d{2}\b\s*$", "", s).strip()
+    t1 = strip_trailing_year(t1)
+    t2 = strip_trailing_year(t2)
     # Exact match
     if t1 == t2:
@@ -4674,6 +4684,13 @@ def normalize_venue_for_display(venue: str) -> str:
     venue_text = venue.strip()
+    # Strip leading editor name lists like "..., editors, Venue ..." or "..., eds., Venue ..."
+    # This prevents author/editor lists from being treated as venue
+    # Match 'editors,' 'editor,' or 'eds.,' possibly after a comma; capture the remainder as venue
+    editors_match = re.search(r"(?:^|,)\s*(?:editors?|eds?\.?|editor)\s*,\s*(.+)$", venue_text, re.IGNORECASE)
+    if editors_match:
+        venue_text = editors_match.group(1).strip()
     # Extract venue from complex editor strings (e.g. "In Smith, J.; and Doe, K., eds., Conference Name, volume 1")
     # This handles patterns like "In [authors], eds., [venue], [optional metadata]" (case-insensitive)
     editor_match = re.search(r'in\s+[^,]+(?:,\s*[^,]*)*,\s*eds?\.,\s*(.+?)(?:,\s*volume\s*\d+|,\s*pp?\.|$)', venue_text, re.IGNORECASE)
@@ -4700,7 +4717,9 @@ def normalize_venue_for_display(venue: str) -> str:
     prefixes_to_remove = [
         r'^\d{4}\s+\d+(st|nd|rd|th)\s+',  # "2012 IEEE/RSJ"
         r'^\d{4}\s+',                     # "2024 "
-        r'^proceedings\s+(of\s+)?(the\s+)?((acm|ieee|usenix|aaai|sigcomm|sigkdd|sigmod|sigops|vldb|osdi|sosp|eurosys)\s+)*(\d+(st|nd|rd|th)\s+)?',  # "Proceedings of the [ORG] [ORG] 29th"
+    # Remove 'Proceedings of [the] [ORG]* [ordinal]*' only when followed by at least one word
+    # This avoids cutting a venue down to just 'Proceedings of the'
+    r'^proceedings\s+of\s+(?!the\s*$)(?:the\s+)?(?:(?:acm|ieee|usenix|aaai|sigcomm|sigkdd|sigmod|sigops|vldb|osdi|sosp|eurosys)\s+)*(?:\d+(?:st|nd|rd|th)\s+)?',
         r'^proc\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?',        # "Proc. of the IEEE" (require "of")
         r'^procs\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?',       # "Procs. of the IEEE" (require "of")
         r'^in\s+',
@@ -4739,4 +4758,8 @@ def normalize_venue_for_display(venue: str) -> str:
     venue_text = re.sub(r'\s+', ' ', venue_text)     # Normalize whitespace
     venue_text = venue_text.strip()
+    # If what's left is too generic (e.g., just 'Proceedings of the'), treat as no venue
+    if venue_text.lower() in {"proceedings of the", "proceedings of"}:
+        return ""
     return venue_text