PyPI - academic-refchecker - Versions diffs - 1.2.45__py3-none-any.whl → 1.2.46__py3-none-any.whl - Mend

academic-refchecker 1.2.45__py3-none-any.whl → 1.2.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

__version__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 """Version information for RefChecker."""
-__version__ = "1.2.45"
+__version__ = "1.2.46"

{academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.45
+Version: 1.2.46
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT

{academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
-__version__.py,sha256=8vdrigO4-YfHufQMfh_RQ9NlN5btmqndss2dAOLxa1Q,65
-academic_refchecker-1.2.45.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
+__version__.py,sha256=PuEdWOtQEDRwfL2m1mk5h6WKAoScQu-kbHu9VkBS764,65
+academic_refchecker-1.2.46.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
 checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
 checkers/crossref.py,sha256=cLYmSzE8ehJ5sNko_R3fEiGBGiPH5_HxLhFM-pCfDRM,20378
 checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
@@ -7,8 +7,8 @@ checkers/github_checker.py,sha256=GoepG4aRRUqAomkM4HgOSNf20BPxQgocZEpsk0ZTZZU,14
 checkers/local_semantic_scholar.py,sha256=DgGMjmR_w_saz2UTMatEhfXbhUED9tUmDG3vlZAIzc4,20428
 checkers/openalex.py,sha256=Fbc7iscZzmXjAZxH32PDX2r2Nwo9b5Ku-Sh1Ut9KpLA,19550
 checkers/openreview_checker.py,sha256=EYRcVu7sZzrFjqdOEKp72vd0S3sXSSjTgI9jsAVpdD0,20524
-checkers/semantic_scholar.py,sha256=0LcVahf3twyHqaD7bQ2eJiTyg-AQ9NGvVohb9nqaHdA,34884
-checkers/webpage_checker.py,sha256=woY8mNgZ4Lr9Ug53CN-Xo_2P62BTpR2u_FZyUPgTEuA,21833
+checkers/semantic_scholar.py,sha256=99KOHLAiYs31nSdx-gcMR_TWIlV8G4juNL0bmV4AoUs,34768
+checkers/webpage_checker.py,sha256=Ivzhu0xcpeWZcCTlAt9C06Lfsz5gKEiSko9gJ7EyQEw,22324
 config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
 config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
 config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
@@ -26,7 +26,7 @@ scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,
 services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
 services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
 utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
-utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
+utils/arxiv_utils.py,sha256=MxyD3Q0EzrmE0xORMJw8wdVtZ4Fp-ux_cn6jLMQimV8,18168
 utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
 utils/biblatex_parser.py,sha256=OkHXQcjiBrEDuhBfEk0RtmAYxufu5lAxAjb8__DzMjI,25537
 utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
@@ -36,11 +36,11 @@ utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
 utils/doi_utils.py,sha256=qAf2slTLkgBwwnkYsWMkwMSDf6q8wNqd2YwNxhQudEc,3995
 utils/error_utils.py,sha256=wrLQhVJIIWUzcTSOLVvrvCAtMRNXt_1YYL5OXYnYf0A,11085
 utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
-utils/text_utils.py,sha256=T67Y-HSNokj-mOcdCtOcULNviBxyaG9xTjRd_l9titI,210088
+utils/text_utils.py,sha256=gFI-qu6g-9Lo1s3w1OjgBZ9SvdPufL1mMg-05l0BwD0,210269
 utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
 utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
-academic_refchecker-1.2.45.dist-info/METADATA,sha256=mY4M9FRaDKcyS5yOFvR3X0Y0bj47_YmZeMayvrrpS38,22576
-academic_refchecker-1.2.45.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-academic_refchecker-1.2.45.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
-academic_refchecker-1.2.45.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
-academic_refchecker-1.2.45.dist-info/RECORD,,
+academic_refchecker-1.2.46.dist-info/METADATA,sha256=lP_pMHS9uI4hhXEsox_yGlSnT6pSl1_6W58CtzZEbDM,22576
+academic_refchecker-1.2.46.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+academic_refchecker-1.2.46.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+academic_refchecker-1.2.46.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+academic_refchecker-1.2.46.dist-info/RECORD,,

checkers/semantic_scholar.py CHANGED Viewed

@@ -543,49 +543,50 @@ class NonArxivReferenceChecker:
         elif paper_venue and not isinstance(paper_venue, str):
             paper_venue = str(paper_venue)
+        # Check venue mismatches
         if cited_venue and paper_venue:
             # Use the utility function to check if venues are substantially different
             if are_venues_substantially_different(cited_venue, paper_venue):
                 from utils.error_utils import create_venue_warning
                 errors.append(create_venue_warning(cited_venue, paper_venue))
         elif not cited_venue and paper_venue:
-            # Check if this is an arXiv paper first
-            external_ids = paper_data.get('externalIds', {})
-            arxiv_id = external_ids.get('ArXiv') if external_ids else None
-            if arxiv_id:
-                # For arXiv papers, suggest including the arXiv URL instead of venue
-                arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
-                # Check if the reference already includes this ArXiv URL or equivalent DOI
-                reference_url = reference.get('url', '')
-                # Check for direct arXiv URL match
-                has_arxiv_url = arxiv_url in reference_url
-                # Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
-                arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
-                has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
-                if not (has_arxiv_url or has_arxiv_doi):
+            # Original reference has the venue in raw text but not parsed correctly
+            raw_text = reference.get('raw_text', '')
+            if raw_text and '#' in raw_text:
+                # Check if venue might be in the raw text format (author#title#venue#year#url)
+                parts = raw_text.split('#')
+                if len(parts) >= 3 and parts[2].strip():
+                    # Venue is present in raw text but missing from parsed reference
                     errors.append({
                         'warning_type': 'venue',
-                        'warning_details': f"Reference should include arXiv URL: {arxiv_url}",
-                        'ref_url_correct': arxiv_url
+                        'warning_details': f"Venue missing: should include '{paper_venue}'",
+                        'ref_venue_correct': paper_venue
                     })
-            else:
-                # Original reference has the venue in raw text but not parsed correctly
-                raw_text = reference.get('raw_text', '')
-                if raw_text and '#' in raw_text:
-                    # Check if venue might be in the raw text format (author#title#venue#year#url)
-                    parts = raw_text.split('#')
-                    if len(parts) >= 3 and parts[2].strip():
-                        # Venue is present in raw text but missing from parsed reference
-                        errors.append({
-                            'warning_type': 'venue',
-                            'warning_details': f"Venue missing: should include '{paper_venue}'",
-                            'ref_venue_correct': paper_venue
-                        })
+        # Always check for missing arXiv URLs when paper has arXiv ID
+        external_ids = paper_data.get('externalIds', {})
+        arxiv_id = external_ids.get('ArXiv') if external_ids else None
+        if arxiv_id:
+            # For arXiv papers, check if reference includes the arXiv URL
+            arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
+            # Check if the reference already includes this ArXiv URL or equivalent DOI
+            reference_url = reference.get('url', '')
+            # Check for direct arXiv URL match
+            has_arxiv_url = arxiv_url in reference_url
+            # Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
+            arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
+            has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
+            if not (has_arxiv_url or has_arxiv_doi):
+                errors.append({
+                    'warning_type': 'url',
+                    'warning_details': f"Reference could include arXiv URL: {arxiv_url}",
+                    'ref_url_correct': arxiv_url
+                })
         # Verify DOI
         paper_doi = None

checkers/webpage_checker.py CHANGED Viewed

@@ -71,7 +71,8 @@ class WebPageChecker:
         doc_indicators = [
             'docs', 'documentation', 'readthedocs.io', 'help', 'guide', 'tutorial',
             'reference', 'manual', 'wiki', 'blog', 'api', 'developer', 'platform',
-            'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop'
+            'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop',
+            'posts'  # For blog posts and forum posts like LessWrong
         ]
         return any(indicator in url.lower() for indicator in doc_indicators) or self._is_likely_webpage(url)
@@ -84,7 +85,8 @@ class WebPageChecker:
         doc_domains = [
             'pytorch.org', 'tensorflow.org', 'readthedocs.io', 'onnxruntime.ai',
             'deepspeed.ai', 'huggingface.co', 'openai.com', 'microsoft.com',
-            'google.com', 'nvidia.com', 'intel.com', 'langchain.com'
+            'google.com', 'nvidia.com', 'intel.com', 'langchain.com',
+            'lesswrong.com'  # LessWrong rationality and AI safety blog platform
         ]
         return any(domain in parsed.netloc for domain in doc_domains)
@@ -395,6 +397,14 @@ class WebPageChecker:
         organization = site_info.get('organization', '').lower()
         domain = site_info.get('domain', '').lower()
+        # Accept generic web resource terms - these are valid for any web URL
+        generic_web_terms = [
+            'web resource', 'web site', 'website', 'online resource',
+            'online', 'web', 'internet resource', 'web page', 'webpage'
+        ]
+        if cited_lower in generic_web_terms:
+            return True
         # Direct matches
         if cited_lower in organization or organization in cited_lower:
             return True

utils/arxiv_utils.py CHANGED Viewed

@@ -392,32 +392,29 @@ def get_bibtex_content(paper):
         logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
         tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
-        # Choose between .bib and .bbl files based on content richness
-        # Prioritize .bbl if it has more references than filtered .bib, otherwise prefer .bib
+        # Choose between .bib and .bbl files - .bbl files take priority when they contain entries
+        # .bbl files are processed biblatex output that reflects exactly what was cited
         if bib_content and bbl_content:
-            # Count entries in both
+            # Count entries in both for logging
             bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
             bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
-            # If we have LaTeX content, get filtered BibTeX count
-            filtered_bib_count = bib_entry_count
-            filtered_content = bib_content
-            if tex_content:
-                cited_keys = extract_cited_keys_from_tex({}, tex_content)
-                if cited_keys:
-                    logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
-                    filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
-                    filtered_bib_count = len(re.findall(r'@\w+\s*\{', filtered_content))
-            logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, filtered .bib has {filtered_bib_count} entries")
+            logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, .bib has {bib_entry_count} entries")
-            # Prioritize .bbl if it has significantly more entries
-            if bbl_entry_count > filtered_bib_count * 1.5:  # 50% more entries threshold
-                logger.info(f"Using .bbl files from ArXiv source")
+            # Only use .bbl if it actually contains bibliography entries
+            if bbl_entry_count > 0:
+                logger.info(f"Using .bbl files from ArXiv source (biblatex takes priority over bibtex)")
                 return bbl_content
             else:
-                logger.info(f"Using filtered .bib files")
-                return filtered_content
+                logger.info(f"Using .bib files from ArXiv source (.bbl file is empty)")
+                # If we have LaTeX content, filter BibTeX by cited keys
+                if tex_content:
+                    cited_keys = extract_cited_keys_from_tex({}, tex_content)
+                    if cited_keys:
+                        logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
+                        filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
+                        return filtered_content
+                return bib_content
         elif bib_content:
             logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")

utils/text_utils.py CHANGED Viewed

@@ -2111,7 +2111,9 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
         # Allow minor flexibility (1 author difference) but not more
         if abs(len(cleaned_cited) - len(correct_names)) > 1:
             from utils.error_utils import format_author_count_mismatch
-            error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), cleaned_cited, correct_names)
+            # Convert cited names to display format (First Last) before showing in error
+            display_cited = [format_author_for_display(author) for author in cleaned_cited]
+            error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
             return False, error_msg
         # Use the shorter list for comparison

{academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/WHEEL RENAMED Viewed

File without changes

{academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/top_level.txt RENAMED Viewed

File without changes

academic-refchecker 1.2.45__py3-none-any.whl → 1.2.46__py3-none-any.whl

academic-refchecker 1.2.45__py3-none-any.whl → 1.2.46__py3-none-any.whl