academic-refchecker 1.2.44__py3-none-any.whl → 1.2.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.44"
3
+ __version__ = "1.2.46"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.44
3
+ Version: 1.2.46
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -78,7 +78,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
78
78
  Verified URL: https://www.semanticscholar.org/paper/5f4ac1ac7ca4b17d3db1b52d9aafd9e8b26c0d7
79
79
  ArXiv URL: https://arxiv.org/abs/1610.10099
80
80
  DOI URL: https://doi.org/10.48550/arxiv.1610.10099
81
- ⚠️ Warning: Year mismatch: cited as 2017 but actually 2016
81
+ ⚠️ Warning: Year mismatch:
82
+ cited: '2017'
83
+ actual: '2016'
82
84
 
83
85
  [2/45] Effective approaches to attention-based neural machine translation
84
86
  Minh-Thang Luong, Hieu Pham, Christopher D. Manning
@@ -87,7 +89,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
87
89
  Verified URL: https://www.semanticscholar.org/paper/93499a7c7f699b6630a86fad964536f9423bb6d0
88
90
  ArXiv URL: https://arxiv.org/abs/1508.04025
89
91
  DOI URL: https://doi.org/10.18653/v1/d15-1166
90
- ❌ Error: First author mismatch: 'Minh-Thang Luong' vs 'Thang Luong'
92
+ ❌ Error: First author mismatch:
93
+ cited: 'Minh-Thang Luong'
94
+ actual: 'Thang Luong'
91
95
 
92
96
  [3/45] Deep Residual Learning for Image Recognition
93
97
  Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
@@ -98,7 +102,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
98
102
  Verified URL: https://www.semanticscholar.org/paper/2c03df8b48bf3fa39054345bafabfeff15bfd11d
99
103
  ArXiv URL: https://arxiv.org/abs/1512.03385
100
104
  DOI URL: https://doi.org/10.1109/CVPR.2016.90
101
- ❌ Error: DOI mismatch: cited as '10.1109/CVPR.2016.91' but actually '10.1109/CVPR.2016.90'
105
+ ❌ Error: DOI mismatch:
106
+ cited: '10.1109/CVPR.2016.91'
107
+ actual: '10.1109/CVPR.2016.90'
102
108
 
103
109
  ============================================================
104
110
  📋 SUMMARY
@@ -382,7 +388,9 @@ This enhanced URL display helps users access multiple authoritative sources for
382
388
  Verified URL: https://www.semanticscholar.org/paper/a1b2c3d4e5f6789012345678901234567890abcd
383
389
  ArXiv URL: https://arxiv.org/abs/2312.02119
384
390
  DOI URL: https://doi.org/10.48550/arxiv.2312.02119
385
- ❌ Error: First author mismatch: 'T. Xie' vs 'Zhao Xu'
391
+ ❌ Error: First author mismatch:
392
+ cited: 'T. Xie'
393
+ actual: 'Zhao Xu'
386
394
  ```
387
395
  - `title`: Title discrepancies
388
396
  ```
@@ -392,7 +400,9 @@ This enhanced URL display helps users access multiple authoritative sources for
392
400
  Verified URL: https://www.semanticscholar.org/paper/df2b0e26d0599ce3e70df8a9da02e51594e0e992
393
401
  ArXiv URL: https://arxiv.org/abs/1810.04805
394
402
  DOI URL: https://doi.org/10.18653/v1/n19-1423
395
- ❌ Error: Title mismatch: cited as 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding' but actually 'BERT: Pre-training of Deep Bidirectional Transformers for Language Comprehension'
403
+ ❌ Error: Title mismatch:
404
+ cited: 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding'
405
+ actual: 'BERT: Pre-training of Deep Bidirectional Transformers for Language Comprehension'
396
406
  ```
397
407
  - `arxiv_id`: Incorrect URLs or arXiv IDs
398
408
  ```
@@ -415,7 +425,9 @@ This enhanced URL display helps users access multiple authoritative sources for
415
425
  Verified URL: https://www.semanticscholar.org/paper/204e3073870fae3d05bcbc2f6a8e263d9b72e776
416
426
  ArXiv URL: https://arxiv.org/abs/1706.03762
417
427
  DOI URL: https://doi.org/10.48550/arXiv.1706.03762
418
- ❌ Error: DOI mismatch: cited as '10.5555/3295222.3295349' but actually '10.48550/arXiv.1706.03762'
428
+ ❌ Error: DOI mismatch:
429
+ cited: '10.5555/3295222.3295349'
430
+ actual: '10.48550/arXiv.1706.03762'
419
431
  ```
420
432
 
421
433
  - **⚠️ Warnings**: Minor issues that may need attention
@@ -428,7 +440,9 @@ This enhanced URL display helps users access multiple authoritative sources for
428
440
  Verified URL: https://www.semanticscholar.org/paper/f1a2b3c4d5e6f7890123456789012345678901ab
429
441
  ArXiv URL: https://arxiv.org/abs/2310.03684
430
442
  DOI URL: https://doi.org/10.48550/arxiv.2310.03684
431
- ⚠️ Warning: Year mismatch: cited as 2024 but actually 2023
443
+ ⚠️ Warning: Year mismatch:
444
+ cited: '2024'
445
+ actual: '2023'
432
446
  ```
433
447
  - `venue`: Venue format variations
434
448
  ```
@@ -439,7 +453,9 @@ This enhanced URL display helps users access multiple authoritative sources for
439
453
  Verified URL: https://www.semanticscholar.org/paper/c1d2e3f4a5b6c7d8e9f0123456789012345678ab
440
454
  ArXiv URL: https://arxiv.org/abs/2403.02151
441
455
  DOI URL: https://doi.org/10.48550/arxiv.2403.02151
442
- ⚠️ Warning: Venue mismatch: cited as 'arXiv, 2024' but actually 'Neural Information Processing Systems'
456
+ ⚠️ Warning: Venue mismatch:
457
+ cited: 'arXiv, 2024'
458
+ actual: 'Neural Information Processing Systems'
443
459
  ```
444
460
 
445
461
  - **❓ Unverified**: References that couldn't be verified with any of the checker APIs
@@ -1,21 +1,21 @@
1
- __version__.py,sha256=k3lYUlcZL-yL2e_2u3UPBtgwqMqZJ11x7KVMZOotlE8,65
2
- academic_refchecker-1.2.44.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
1
+ __version__.py,sha256=PuEdWOtQEDRwfL2m1mk5h6WKAoScQu-kbHu9VkBS764,65
2
+ academic_refchecker-1.2.46.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
3
3
  checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
4
- checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
4
+ checkers/crossref.py,sha256=cLYmSzE8ehJ5sNko_R3fEiGBGiPH5_HxLhFM-pCfDRM,20378
5
5
  checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
6
- checkers/github_checker.py,sha256=54K6_YJW5w2GtzodnSOLfK5d1ErFJxbTOIIV5P_kFX0,13543
6
+ checkers/github_checker.py,sha256=GoepG4aRRUqAomkM4HgOSNf20BPxQgocZEpsk0ZTZZU,14003
7
7
  checkers/local_semantic_scholar.py,sha256=DgGMjmR_w_saz2UTMatEhfXbhUED9tUmDG3vlZAIzc4,20428
8
- checkers/openalex.py,sha256=GxYUH9GZ0AyF-WFKgXiFHqkalrSnySgFSkiM1PsK0VI,19757
9
- checkers/openreview_checker.py,sha256=FLh21F0Zr7Gj3BI0u-gE6IwGNOZiRcViirDBeNvUp94,20432
10
- checkers/semantic_scholar.py,sha256=BelhyIJ-W8navRdqEGpk12CIXYWmVL2Cq8HHZR7ynJs,34905
11
- checkers/webpage_checker.py,sha256=BvNwOqukTX9IeQUpUfIrI_5Gr2w9VLBt5x_PB-hKUIo,21616
8
+ checkers/openalex.py,sha256=Fbc7iscZzmXjAZxH32PDX2r2Nwo9b5Ku-Sh1Ut9KpLA,19550
9
+ checkers/openreview_checker.py,sha256=EYRcVu7sZzrFjqdOEKp72vd0S3sXSSjTgI9jsAVpdD0,20524
10
+ checkers/semantic_scholar.py,sha256=99KOHLAiYs31nSdx-gcMR_TWIlV8G4juNL0bmV4AoUs,34768
11
+ checkers/webpage_checker.py,sha256=Ivzhu0xcpeWZcCTlAt9C06Lfsz5gKEiSko9gJ7EyQEw,22324
12
12
  config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
13
13
  config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
14
14
  config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
15
15
  core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
16
16
  core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
17
- core/parallel_processor.py,sha256=5V2iJDBFwwryMCnCNU_oRt2u5he1wpy-_9qapC_6f00,17043
18
- core/refchecker.py,sha256=w3KNWyyaZZVL3ghFhEfro8SPs4xXEUjmCJERfZ7Du6A,273648
17
+ core/parallel_processor.py,sha256=AOnjqhBHXlSb1c-PSunat9Eug5y04gOygwbHdPUqxgk,17202
18
+ core/refchecker.py,sha256=lU6r9cKpB8Fc4Wd7vOqdqhxP9cwYEoB6D4PlYznglGY,274337
19
19
  database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
20
20
  database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
21
21
  llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -26,21 +26,21 @@ scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,
26
26
  services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
27
27
  services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
28
28
  utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
29
- utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
29
+ utils/arxiv_utils.py,sha256=MxyD3Q0EzrmE0xORMJw8wdVtZ4Fp-ux_cn6jLMQimV8,18168
30
30
  utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
31
31
  utils/biblatex_parser.py,sha256=OkHXQcjiBrEDuhBfEk0RtmAYxufu5lAxAjb8__DzMjI,25537
32
32
  utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
33
33
  utils/bibtex_parser.py,sha256=a89NLy_q2kwED4QFJgxWFgPQOJBV73bIUL3RS_Urmro,15231
34
34
  utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
35
35
  utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
36
- utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
37
- utils/error_utils.py,sha256=JqnRg4z-O9GcJ1eJGeTMzmOQwPWbWo2Lf6Duwj-ymHQ,6258
36
+ utils/doi_utils.py,sha256=qAf2slTLkgBwwnkYsWMkwMSDf6q8wNqd2YwNxhQudEc,3995
37
+ utils/error_utils.py,sha256=wrLQhVJIIWUzcTSOLVvrvCAtMRNXt_1YYL5OXYnYf0A,11085
38
38
  utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
39
- utils/text_utils.py,sha256=uEwKasw3aTVgIDHbDJDSOcTUbPwfiivIdhKwmxQJy0U,206378
39
+ utils/text_utils.py,sha256=gFI-qu6g-9Lo1s3w1OjgBZ9SvdPufL1mMg-05l0BwD0,210269
40
40
  utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
41
41
  utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
42
- academic_refchecker-1.2.44.dist-info/METADATA,sha256=ueA0mwKqmiqhR9WBLyPy2W40wfJc4JRiWSTbrQHKU14,22298
43
- academic_refchecker-1.2.44.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
- academic_refchecker-1.2.44.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
45
- academic_refchecker-1.2.44.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
46
- academic_refchecker-1.2.44.dist-info/RECORD,,
42
+ academic_refchecker-1.2.46.dist-info/METADATA,sha256=lP_pMHS9uI4hhXEsox_yGlSnT6pSl1_6W58CtzZEbDM,22576
43
+ academic_refchecker-1.2.46.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
+ academic_refchecker-1.2.46.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
45
+ academic_refchecker-1.2.46.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
46
+ academic_refchecker-1.2.46.dist-info/RECORD,,
checkers/crossref.py CHANGED
@@ -31,6 +31,7 @@ import re
31
31
  from typing import Dict, List, Tuple, Optional, Any, Union
32
32
  from urllib.parse import quote_plus
33
33
  from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
34
+ from utils.error_utils import format_year_mismatch, format_doi_mismatch
34
35
  from config.settings import get_config
35
36
 
36
37
  # Set up logging
@@ -478,21 +479,19 @@ class CrossRefReferenceChecker:
478
479
  if year and work_year and year != work_year:
479
480
  errors.append({
480
481
  'warning_type': 'year',
481
- 'warning_details': f"Year mismatch: cited as {year} but actually {work_year}",
482
+ 'warning_details': format_year_mismatch(year, work_year),
482
483
  'ref_year_correct': work_year
483
484
  })
484
485
 
485
486
  # Verify DOI
486
487
  work_doi = work_data.get('DOI')
487
488
  if doi and work_doi:
488
- # Normalize DOIs for comparison (remove URL prefix and trailing periods)
489
- cited_doi_clean = doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
490
- work_doi_clean = work_doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
491
-
492
- if cited_doi_clean.lower() != work_doi_clean.lower():
489
+ # Compare DOIs using the proper comparison function
490
+ from utils.doi_utils import compare_dois
491
+ if not compare_dois(doi, work_doi):
493
492
  errors.append({
494
493
  'error_type': 'doi',
495
- 'error_details': f"DOI mismatch: cited as {doi} but actually {work_doi}",
494
+ 'error_details': format_doi_mismatch(doi, work_doi),
496
495
  'ref_doi_correct': work_doi
497
496
  })
498
497
 
@@ -169,9 +169,14 @@ class GitHubChecker:
169
169
  if cited_title:
170
170
  title_match = self._check_title_match(cited_title, actual_name, actual_description)
171
171
  if not title_match:
172
+ from utils.error_utils import format_title_mismatch
173
+ details = format_title_mismatch(cited_title, actual_name)
174
+ if actual_description:
175
+ snippet = actual_description[:100] + ('...' if len(actual_description) > 100 else '')
176
+ details += f" ({snippet})"
172
177
  errors.append({
173
178
  "warning_type": "title",
174
- "warning_details": f"Title mismatch: cited as '{cited_title}' but repository is '{actual_name}' ({actual_description[:100]}{'...' if len(actual_description) > 100 else ''})"
179
+ "warning_details": details
175
180
  })
176
181
 
177
182
  # Verify authors
@@ -180,9 +185,13 @@ class GitHubChecker:
180
185
  author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
181
186
  author_match = self._check_author_match(author_str, actual_owner, actual_owner_name)
182
187
  if not author_match:
188
+ from utils.error_utils import format_three_line_mismatch
189
+ left = author_str
190
+ right = f"{actual_owner} ({actual_owner_name})" if actual_owner_name else actual_owner
191
+ details = format_three_line_mismatch("Author mismatch", left, right)
183
192
  errors.append({
184
193
  "warning_type": "author",
185
- "warning_details": f"Author mismatch: cited as '{author_str}' but repository owner is '{actual_owner}' ({actual_owner_name})"
194
+ "warning_details": details
186
195
  })
187
196
 
188
197
  # Verify year
@@ -191,9 +200,10 @@ class GitHubChecker:
191
200
  try:
192
201
  cited_year_int = int(cited_year)
193
202
  if cited_year_int < creation_year:
203
+ from utils.error_utils import format_year_mismatch
194
204
  errors.append({
195
205
  "warning_type": "year",
196
- "warning_details": f"Year mismatch: cited as {cited_year} but repository created in {creation_year}",
206
+ "warning_details": format_year_mismatch(cited_year, creation_year),
197
207
  "ref_year_correct": str(creation_year)
198
208
  })
199
209
  except (ValueError, TypeError):
checkers/openalex.py CHANGED
@@ -33,6 +33,7 @@ import re
33
33
  from typing import Dict, List, Tuple, Optional, Any, Union
34
34
  from urllib.parse import quote_plus
35
35
  from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
36
+ from utils.error_utils import format_year_mismatch, format_doi_mismatch
36
37
  from config.settings import get_config
37
38
 
38
39
  # Set up logging
@@ -448,7 +449,7 @@ class OpenAlexReferenceChecker:
448
449
  if year and work_year and year != work_year:
449
450
  errors.append({
450
451
  'warning_type': 'year',
451
- 'warning_details': f"Year mismatch: cited as {year} but actually {work_year}",
452
+ 'warning_details': format_year_mismatch(year, work_year),
452
453
  'ref_year_correct': work_year
453
454
  })
454
455
 
@@ -458,14 +459,12 @@ class OpenAlexReferenceChecker:
458
459
  work_doi = work_data['ids']['doi']
459
460
 
460
461
  if doi and work_doi:
461
- # Normalize DOIs for comparison (remove URL prefix and trailing periods)
462
- cited_doi_clean = doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
463
- work_doi_clean = work_doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
464
-
465
- if cited_doi_clean.lower() != work_doi_clean.lower():
462
+ # Compare DOIs using the proper comparison function
463
+ from utils.doi_utils import compare_dois
464
+ if not compare_dois(doi, work_doi):
466
465
  errors.append({
467
466
  'error_type': 'doi',
468
- 'error_details': f"DOI mismatch: cited as {doi} but actually {work_doi}",
467
+ 'error_details': format_doi_mismatch(doi, work_doi),
469
468
  'ref_doi_correct': work_doi
470
469
  })
471
470
 
@@ -425,9 +425,11 @@ class OpenReviewReferenceChecker:
425
425
  if cited_title and paper_title:
426
426
  similarity = calculate_title_similarity(cited_title, paper_title)
427
427
  if similarity < 0.7: # Using a reasonable threshold
428
+ from utils.error_utils import format_title_mismatch
429
+ details = format_title_mismatch(cited_title, paper_title) + f" (similarity: {similarity:.2f})"
428
430
  errors.append({
429
431
  "warning_type": "title",
430
- "warning_details": f"Title mismatch: cited as '{cited_title}' but OpenReview shows '{paper_title}' (similarity: {similarity:.2f})"
432
+ "warning_details": details
431
433
  })
432
434
 
433
435
  # Check authors
@@ -460,9 +462,10 @@ class OpenReviewReferenceChecker:
460
462
 
461
463
  is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
462
464
  if is_different and year_message:
465
+ from utils.error_utils import format_year_mismatch
463
466
  errors.append({
464
467
  "warning_type": "year",
465
- "warning_details": year_message
468
+ "warning_details": format_year_mismatch(cited_year_int, paper_year_int)
466
469
  })
467
470
  except (ValueError, TypeError):
468
471
  pass # Skip year validation if conversion fails
@@ -473,10 +476,10 @@ class OpenReviewReferenceChecker:
473
476
 
474
477
  if cited_venue and paper_venue:
475
478
  if are_venues_substantially_different(cited_venue, paper_venue):
476
- from utils.error_utils import clean_venue_for_comparison
479
+ from utils.error_utils import format_venue_mismatch
477
480
  errors.append({
478
481
  "warning_type": "venue",
479
- "warning_details": f"Venue mismatch: cited as '{clean_venue_for_comparison(cited_venue)}' but OpenReview shows '{clean_venue_for_comparison(paper_venue)}'"
482
+ "warning_details": format_venue_mismatch(cited_venue, paper_venue)
480
483
  })
481
484
 
482
485
  # Create verified data structure
@@ -29,6 +29,7 @@ import logging
29
29
  import re
30
30
  from typing import Dict, List, Tuple, Optional, Any, Union
31
31
  from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search
32
+ from utils.error_utils import format_title_mismatch
32
33
  from config.settings import get_config
33
34
 
34
35
  # Set up logging
@@ -471,7 +472,7 @@ class NonArxivReferenceChecker:
471
472
  if found_title and title_similarity < SIMILARITY_THRESHOLD:
472
473
  errors.append({
473
474
  'error_type': 'title',
474
- 'error_details': f"Title mismatch: cited as '{title}' but actually '{found_title}'",
475
+ 'error_details': format_title_mismatch(title, found_title),
475
476
  'ref_title_correct': paper_data.get('title', '')
476
477
  })
477
478
 
@@ -525,9 +526,10 @@ class NonArxivReferenceChecker:
525
526
  is_different, warning_message = is_year_substantially_different(year, paper_year, context)
526
527
 
527
528
  if is_different and warning_message:
529
+ from utils.error_utils import format_year_mismatch
528
530
  errors.append({
529
531
  'warning_type': 'year',
530
- 'warning_details': warning_message,
532
+ 'warning_details': format_year_mismatch(year, paper_year),
531
533
  'ref_year_correct': paper_year
532
534
  })
533
535
 
@@ -541,49 +543,50 @@ class NonArxivReferenceChecker:
541
543
  elif paper_venue and not isinstance(paper_venue, str):
542
544
  paper_venue = str(paper_venue)
543
545
 
546
+ # Check venue mismatches
544
547
  if cited_venue and paper_venue:
545
548
  # Use the utility function to check if venues are substantially different
546
549
  if are_venues_substantially_different(cited_venue, paper_venue):
547
550
  from utils.error_utils import create_venue_warning
548
551
  errors.append(create_venue_warning(cited_venue, paper_venue))
549
552
  elif not cited_venue and paper_venue:
550
- # Check if this is an arXiv paper first
551
- external_ids = paper_data.get('externalIds', {})
552
- arxiv_id = external_ids.get('ArXiv') if external_ids else None
553
-
554
- if arxiv_id:
555
- # For arXiv papers, suggest including the arXiv URL instead of venue
556
- arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
557
-
558
- # Check if the reference already includes this ArXiv URL or equivalent DOI
559
- reference_url = reference.get('url', '')
560
-
561
- # Check for direct arXiv URL match
562
- has_arxiv_url = arxiv_url in reference_url
563
-
564
- # Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
565
- arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
566
- has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
567
-
568
- if not (has_arxiv_url or has_arxiv_doi):
553
+ # Original reference has the venue in raw text but not parsed correctly
554
+ raw_text = reference.get('raw_text', '')
555
+ if raw_text and '#' in raw_text:
556
+ # Check if venue might be in the raw text format (author#title#venue#year#url)
557
+ parts = raw_text.split('#')
558
+ if len(parts) >= 3 and parts[2].strip():
559
+ # Venue is present in raw text but missing from parsed reference
569
560
  errors.append({
570
561
  'warning_type': 'venue',
571
- 'warning_details': f"Reference should include arXiv URL: {arxiv_url}",
572
- 'ref_url_correct': arxiv_url
562
+ 'warning_details': f"Venue missing: should include '{paper_venue}'",
563
+ 'ref_venue_correct': paper_venue
573
564
  })
574
- else:
575
- # Original reference has the venue in raw text but not parsed correctly
576
- raw_text = reference.get('raw_text', '')
577
- if raw_text and '#' in raw_text:
578
- # Check if venue might be in the raw text format (author#title#venue#year#url)
579
- parts = raw_text.split('#')
580
- if len(parts) >= 3 and parts[2].strip():
581
- # Venue is present in raw text but missing from parsed reference
582
- errors.append({
583
- 'warning_type': 'venue',
584
- 'warning_details': f"Venue missing: should include '{paper_venue}'",
585
- 'ref_venue_correct': paper_venue
586
- })
565
+
566
+ # Always check for missing arXiv URLs when paper has arXiv ID
567
+ external_ids = paper_data.get('externalIds', {})
568
+ arxiv_id = external_ids.get('ArXiv') if external_ids else None
569
+
570
+ if arxiv_id:
571
+ # For arXiv papers, check if reference includes the arXiv URL
572
+ arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
573
+
574
+ # Check if the reference already includes this ArXiv URL or equivalent DOI
575
+ reference_url = reference.get('url', '')
576
+
577
+ # Check for direct arXiv URL match
578
+ has_arxiv_url = arxiv_url in reference_url
579
+
580
+ # Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
581
+ arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
582
+ has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
583
+
584
+ if not (has_arxiv_url or has_arxiv_doi):
585
+ errors.append({
586
+ 'warning_type': 'url',
587
+ 'warning_details': f"Reference could include arXiv URL: {arxiv_url}",
588
+ 'ref_url_correct': arxiv_url
589
+ })
587
590
 
588
591
  # Verify DOI
589
592
  paper_doi = None
@@ -591,14 +594,13 @@ class NonArxivReferenceChecker:
591
594
  if external_ids and 'DOI' in external_ids:
592
595
  paper_doi = external_ids['DOI']
593
596
 
594
- # Compare DOIs, but strip hash fragments and trailing periods for comparison
595
- cited_doi_clean = doi.split('#')[0].rstrip('.') if doi else ''
596
- paper_doi_clean = paper_doi.split('#')[0].rstrip('.') if paper_doi else ''
597
-
598
- if cited_doi_clean and paper_doi_clean and cited_doi_clean.lower() != paper_doi_clean.lower():
597
+ # Compare DOIs using the proper comparison function
598
+ from utils.doi_utils import compare_dois
599
+ if doi and paper_doi and not compare_dois(doi, paper_doi):
600
+ from utils.error_utils import format_doi_mismatch
599
601
  errors.append({
600
602
  'error_type': 'doi',
601
- 'error_details': f"DOI mismatch: cited as {doi} but actually {paper_doi}",
603
+ 'error_details': format_doi_mismatch(doi, paper_doi),
602
604
  'ref_doi_correct': paper_doi
603
605
  })
604
606
 
@@ -71,7 +71,8 @@ class WebPageChecker:
71
71
  doc_indicators = [
72
72
  'docs', 'documentation', 'readthedocs.io', 'help', 'guide', 'tutorial',
73
73
  'reference', 'manual', 'wiki', 'blog', 'api', 'developer', 'platform',
74
- 'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop'
74
+ 'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop',
75
+ 'posts' # For blog posts and forum posts like LessWrong
75
76
  ]
76
77
 
77
78
  return any(indicator in url.lower() for indicator in doc_indicators) or self._is_likely_webpage(url)
@@ -84,7 +85,8 @@ class WebPageChecker:
84
85
  doc_domains = [
85
86
  'pytorch.org', 'tensorflow.org', 'readthedocs.io', 'onnxruntime.ai',
86
87
  'deepspeed.ai', 'huggingface.co', 'openai.com', 'microsoft.com',
87
- 'google.com', 'nvidia.com', 'intel.com', 'langchain.com'
88
+ 'google.com', 'nvidia.com', 'intel.com', 'langchain.com',
89
+ 'lesswrong.com' # LessWrong rationality and AI safety blog platform
88
90
  ]
89
91
 
90
92
  return any(domain in parsed.netloc for domain in doc_domains)
@@ -182,9 +184,10 @@ class WebPageChecker:
182
184
  # Check title match
183
185
  if cited_title and page_title:
184
186
  if not self._check_title_match(cited_title, page_title, page_description):
187
+ from utils.error_utils import format_title_mismatch
185
188
  errors.append({
186
189
  "warning_type": "title",
187
- "warning_details": f"Title mismatch: cited as '{cited_title}' but page title is '{page_title}'"
190
+ "warning_details": format_title_mismatch(cited_title, page_title)
188
191
  })
189
192
 
190
193
  # Check if this is a documentation page for the cited topic
@@ -201,9 +204,13 @@ class WebPageChecker:
201
204
  if cited_authors:
202
205
  author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
203
206
  if not self._check_author_match(author_str, site_info, web_url):
207
+ from utils.error_utils import format_three_line_mismatch
208
+ left = author_str
209
+ right = site_info.get('organization', 'unknown')
210
+ details = format_three_line_mismatch("Author/organization mismatch", left, right)
204
211
  errors.append({
205
212
  "warning_type": "author",
206
- "warning_details": f"Author/organization mismatch: cited as '{author_str}' but page is from '{site_info.get('organization', 'unknown')}'"
213
+ "warning_details": details
207
214
  })
208
215
 
209
216
  logger.debug(f"Web page verification completed for: {web_url}")
@@ -390,6 +397,14 @@ class WebPageChecker:
390
397
  organization = site_info.get('organization', '').lower()
391
398
  domain = site_info.get('domain', '').lower()
392
399
 
400
+ # Accept generic web resource terms - these are valid for any web URL
401
+ generic_web_terms = [
402
+ 'web resource', 'web site', 'website', 'online resource',
403
+ 'online', 'web', 'internet resource', 'web page', 'webpage'
404
+ ]
405
+ if cited_lower in generic_web_terms:
406
+ return True
407
+
393
408
  # Direct matches
394
409
  if cited_lower in organization or organization in cited_lower:
395
410
  return True
@@ -352,12 +352,15 @@ class ParallelReferenceProcessor:
352
352
  error_type = error.get('error_type') or error.get('warning_type')
353
353
  error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
354
354
 
355
+ from utils.error_utils import print_labeled_multiline
356
+
355
357
  if error_type == 'arxiv_id':
358
+ # Keep existing style for arXiv ID errors
356
359
  print(f" ❌ {error_details}")
357
360
  elif 'error_type' in error:
358
- print(f" ❌ Error: {error_details}")
361
+ print_labeled_multiline("❌ Error", error_details)
359
362
  else:
360
- print(f" ⚠️ Warning: {error_details}")
363
+ print_labeled_multiline("⚠️ Warning", error_details)
361
364
 
362
365
  # Show timing info for slow references
363
366
  if result.processing_time > 5.0:
core/refchecker.py CHANGED
@@ -1900,10 +1900,11 @@ class ArxivReferenceChecker:
1900
1900
  db_title = self.non_arxiv_checker.normalize_paper_title(paper_data.get('title'))
1901
1901
 
1902
1902
  if normalized_title != db_title:
1903
+ from utils.error_utils import format_title_mismatch
1903
1904
  logger.debug(f"DB Verification: Title mismatch - cited: '{title}', actual: '{paper_data.get('title')}'")
1904
1905
  errors.append({
1905
1906
  'error_type': 'title',
1906
- 'error_details': f"Title mismatch: cited as '{title}' but actually '{paper_data.get('title')}'",
1907
+ 'error_details': format_title_mismatch(title, paper_data.get('title')),
1907
1908
  'ref_title_correct': paper_data.get('title')
1908
1909
  })
1909
1910
 
@@ -1925,30 +1926,36 @@ class ArxivReferenceChecker:
1925
1926
  paper_year = paper_data.get('year')
1926
1927
  if year and paper_year and year != paper_year:
1927
1928
  logger.debug(f"DB Verification: Year mismatch - cited: {year}, actual: {paper_year}")
1929
+ from utils.error_utils import format_year_mismatch
1928
1930
  errors.append({
1929
1931
  'warning_type': 'year',
1930
- 'warning_details': f"Year mismatch: cited as {year} but actually {paper_year}",
1932
+ 'warning_details': format_year_mismatch(year, paper_year),
1931
1933
  'ref_year_correct': paper_year
1932
1934
  })
1933
1935
 
1934
1936
  # Verify DOI
1935
- if doi and external_ids.get('DOI') and doi.lower() != external_ids['DOI'].lower():
1936
- # Check if the cited DOI is a partial match of the actual DOI
1937
- # This handles cases like "10.1111/j.2044-8260." vs "10.1111/J.2044-8260.1997.TB01237.X"
1938
- cited_doi_clean = doi.lower().rstrip('.')
1939
- actual_doi_clean = external_ids['DOI'].lower().rstrip('.')
1940
-
1941
- # If the cited DOI is a prefix of the actual DOI, it's likely a partial citation
1942
- # Only flag as error if it's not a reasonable partial match
1943
- if not actual_doi_clean.startswith(cited_doi_clean):
1944
- logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
1945
- errors.append({
1946
- 'error_type': 'doi',
1947
- 'error_details': f"DOI mismatch: cited as {doi} but actually {external_ids['DOI']}",
1948
- 'ref_doi_correct': external_ids['DOI']
1949
- })
1950
- else:
1951
- logger.debug(f"DB Verification: DOI partial match - cited: {doi}, actual: {external_ids['DOI']} (acceptable)")
1937
+ if doi and external_ids.get('DOI'):
1938
+ from utils.doi_utils import compare_dois, normalize_doi
1939
+
1940
+ # Use proper DOI comparison first
1941
+ if not compare_dois(doi, external_ids['DOI']):
1942
+ # Check if the cited DOI is a partial match of the actual DOI
1943
+ # This handles cases like "10.1111/j.2044-8260." vs "10.1111/J.2044-8260.1997.TB01237.X"
1944
+ cited_doi_normalized = normalize_doi(doi)
1945
+ actual_doi_normalized = normalize_doi(external_ids['DOI'])
1946
+
1947
+ # If the cited DOI is a prefix of the actual DOI, it's likely a partial citation
1948
+ # Only flag as error if it's not a reasonable partial match
1949
+ if not actual_doi_normalized.startswith(cited_doi_normalized.rstrip('.')):
1950
+ logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
1951
+ from utils.error_utils import format_doi_mismatch
1952
+ errors.append({
1953
+ 'error_type': 'doi',
1954
+ 'error_details': format_doi_mismatch(doi, external_ids['DOI']),
1955
+ 'ref_doi_correct': external_ids['DOI']
1956
+ })
1957
+ else:
1958
+ logger.debug(f"DB Verification: DOI partial match - cited: {doi}, actual: {external_ids['DOI']} (acceptable)")
1952
1959
 
1953
1960
  # Verify ArXiv ID
1954
1961
  if reference.get('type') == 'arxiv':
@@ -3489,8 +3496,9 @@ class ArxivReferenceChecker:
3489
3496
  author_field_match = re.search(r'\\bibfield\{author\}\{(.*?)\}(?:\s*\\bibinfo\{year\}|\s*\\newblock|$)', content, re.DOTALL)
3490
3497
  if author_field_match:
3491
3498
  author_content = author_field_match.group(1)
3492
- # Find all \bibinfo{person}{Name} entries
3493
- person_matches = re.findall(r'\\bibinfo\{person\}\{([^}]+)\}', author_content)
3499
+ # Find all \bibinfo{person}{Name} entries using balanced brace extraction
3500
+ from utils.text_utils import extract_bibinfo_person_content
3501
+ person_matches = extract_bibinfo_person_content(author_content)
3494
3502
  if person_matches:
3495
3503
  authors = []
3496
3504
  for person in person_matches:
@@ -3502,33 +3510,31 @@ class ArxivReferenceChecker:
3502
3510
  authors.append(clean_name)
3503
3511
  ref['authors'] = authors
3504
3512
 
3505
- # Extract title from \bibinfo{title}{Title}
3506
- title_match = re.search(r'\\bibinfo\{title\}\{([^}]+)\}', content)
3507
- if title_match:
3508
- title = strip_latex_commands(title_match.group(1)).strip()
3513
+ # Import balanced brace extraction function
3514
+ from utils.text_utils import extract_bibinfo_field_content
3515
+
3516
+ # Extract title from \bibinfo{title}{Title} using balanced brace extraction
3517
+ title_content = extract_bibinfo_field_content(content, 'title')
3518
+ if title_content:
3519
+ title = strip_latex_commands(title_content).strip()
3509
3520
  ref['title'] = title
3510
3521
 
3511
- # Extract venue/journal from various fields
3512
- venue_patterns = [
3513
- r'\\bibinfo\{booktitle\}\{([^}]+)\}',
3514
- r'\\bibinfo\{journal\}\{([^}]+)\}',
3515
- r'\\bibinfo\{series\}\{([^}]+)\}',
3516
- r'\\bibinfo\{note\}\{([^}]+)\}'
3517
- ]
3522
+ # Extract venue/journal from various fields using balanced brace extraction
3523
+ venue_field_types = ['booktitle', 'journal', 'series', 'note']
3518
3524
 
3519
- for pattern in venue_patterns:
3520
- venue_match = re.search(pattern, content)
3521
- if venue_match:
3522
- venue = strip_latex_commands(venue_match.group(1)).strip()
3525
+ for field_type in venue_field_types:
3526
+ venue_content = extract_bibinfo_field_content(content, field_type)
3527
+ if venue_content:
3528
+ venue = strip_latex_commands(venue_content).strip()
3523
3529
  if venue:
3524
3530
  ref['venue'] = venue
3525
3531
  ref['journal'] = venue # For compatibility
3526
3532
  break
3527
3533
 
3528
- # Extract DOI
3529
- doi_match = re.search(r'\\bibinfo\{doi\}\{([^}]+)\}', content)
3530
- if doi_match:
3531
- ref['doi'] = doi_match.group(1).strip()
3534
+ # Extract DOI using balanced brace extraction
3535
+ doi_content = extract_bibinfo_field_content(content, 'doi')
3536
+ if doi_content:
3537
+ ref['doi'] = doi_content.strip()
3532
3538
 
3533
3539
  # Extract ArXiv ID from \showeprint[arxiv]{ID}
3534
3540
  arxiv_match = re.search(r'\\showeprint\[arxiv\]\{([^}]+)\}', content)
@@ -5048,7 +5054,8 @@ class ArxivReferenceChecker:
5048
5054
  correct_first = correct_authors[0]
5049
5055
 
5050
5056
  if not enhanced_name_match(cited_first, correct_first):
5051
- return False, f"First author mismatch: '{cited_first}' vs '{correct_first}'"
5057
+ from utils.error_utils import format_first_author_mismatch
5058
+ return False, format_first_author_mismatch(cited_first, correct_first)
5052
5059
 
5053
5060
  return True, "Authors match"
5054
5061
 
@@ -5454,12 +5461,14 @@ class ArxivReferenceChecker:
5454
5461
  error_type = error.get('error_type') or error.get('warning_type')
5455
5462
  error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
5456
5463
 
5464
+ from utils.error_utils import print_labeled_multiline
5465
+
5457
5466
  if error_type == 'arxiv_id':
5458
5467
  print(f" ❌ {error_details}")
5459
5468
  elif 'error_type' in error:
5460
- print(f" ❌ Error: {error_details}")
5469
+ print_labeled_multiline("❌ Error", error_details)
5461
5470
  else:
5462
- print(f" ⚠️ Warning: {error_details}")
5471
+ print_labeled_multiline("⚠️ Warning", error_details)
5463
5472
 
5464
5473
  def _output_reference_errors(self, reference, errors, url):
5465
5474
  """
utils/arxiv_utils.py CHANGED
@@ -392,32 +392,29 @@ def get_bibtex_content(paper):
392
392
  logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
393
393
  tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
394
394
 
395
- # Choose between .bib and .bbl files based on content richness
396
- # Prioritize .bbl if it has more references than filtered .bib, otherwise prefer .bib
395
+ # Choose between .bib and .bbl files - .bbl files take priority when they contain entries
396
+ # .bbl files are processed biblatex output that reflects exactly what was cited
397
397
  if bib_content and bbl_content:
398
- # Count entries in both
398
+ # Count entries in both for logging
399
399
  bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
400
400
  bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
401
401
 
402
- # If we have LaTeX content, get filtered BibTeX count
403
- filtered_bib_count = bib_entry_count
404
- filtered_content = bib_content
405
- if tex_content:
406
- cited_keys = extract_cited_keys_from_tex({}, tex_content)
407
- if cited_keys:
408
- logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
409
- filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
410
- filtered_bib_count = len(re.findall(r'@\w+\s*\{', filtered_content))
411
-
412
- logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, filtered .bib has {filtered_bib_count} entries")
402
+ logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, .bib has {bib_entry_count} entries")
413
403
 
414
- # Prioritize .bbl if it has significantly more entries
415
- if bbl_entry_count > filtered_bib_count * 1.5: # 50% more entries threshold
416
- logger.info(f"Using .bbl files from ArXiv source")
404
+ # Only use .bbl if it actually contains bibliography entries
405
+ if bbl_entry_count > 0:
406
+ logger.info(f"Using .bbl files from ArXiv source (biblatex takes priority over bibtex)")
417
407
  return bbl_content
418
408
  else:
419
- logger.info(f"Using filtered .bib files")
420
- return filtered_content
409
+ logger.info(f"Using .bib files from ArXiv source (.bbl file is empty)")
410
+ # If we have LaTeX content, filter BibTeX by cited keys
411
+ if tex_content:
412
+ cited_keys = extract_cited_keys_from_tex({}, tex_content)
413
+ if cited_keys:
414
+ logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
415
+ filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
416
+ return filtered_content
417
+ return bib_content
421
418
 
422
419
  elif bib_content:
423
420
  logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
utils/doi_utils.py CHANGED
@@ -99,6 +99,10 @@ def compare_dois(doi1: str, doi2: str) -> bool:
99
99
  """
100
100
  Compare two DOIs for equality, handling different formats and prefixes.
101
101
 
102
+ This function performs exact matching after normalization, which means
103
+ DOIs are only considered equal if they are identical after removing
104
+ prefixes, case differences, and punctuation.
105
+
102
106
  Args:
103
107
  doi1: First DOI to compare
104
108
  doi2: Second DOI to compare
@@ -109,21 +113,11 @@ def compare_dois(doi1: str, doi2: str) -> bool:
109
113
  if not doi1 or not doi2:
110
114
  return False
111
115
 
112
- # Normalize both DOIs (already converted to lowercase)
116
+ # Normalize both DOIs (handles prefixes, case, punctuation)
113
117
  norm_doi1 = normalize_doi(doi1)
114
118
  norm_doi2 = normalize_doi(doi2)
115
119
 
116
- # If DOIs are identical, they match
117
- if norm_doi1 == norm_doi2:
118
- return True
119
-
120
- # Check if first two components match (publisher.registrant)
121
- doi1_parts = norm_doi1.split('.')
122
- doi2_parts = norm_doi2.split('.')
123
-
124
- if len(doi1_parts) >= 2 and len(doi2_parts) >= 2:
125
- return doi1_parts[0] == doi2_parts[0] and doi1_parts[1].split('/')[0] == doi2_parts[1].split('/')[0]
126
-
120
+ # DOIs must be exactly identical after normalization
127
121
  return norm_doi1 == norm_doi2
128
122
 
129
123
 
utils/error_utils.py CHANGED
@@ -9,6 +9,86 @@ for reference checkers.
9
9
  from typing import Dict, List, Any, Optional
10
10
 
11
11
 
12
+ def print_labeled_multiline(label: str, text: str) -> None:
13
+ """
14
+ Print a multi-line message with consistent label formatting.
15
+
16
+ This function ensures consistent indentation for all error and warning messages,
17
+ regardless of emoji width differences in the labels.
18
+
19
+ Args:
20
+ label: The label (e.g., "❌ Error", "⚠️ Warning")
21
+ text: The multi-line text to print
22
+ """
23
+ prefix = f" {label}: "
24
+ lines = (text or "").splitlines() or [""]
25
+
26
+ # Print the first line with the label prefix
27
+ print(prefix + lines[0])
28
+
29
+ # Print subsequent lines with fixed indentation to ensure consistency
30
+ # Use fixed 19-character indentation to align regardless of emoji width
31
+ fixed_indent = " " * 15
32
+ for line in lines[1:]:
33
+ print(fixed_indent + line)
34
+
35
+
36
+ def format_three_line_mismatch(mismatch_type: str, left: str, right: str) -> str:
37
+ """
38
+ Format a three-line mismatch message with fixed indentation.
39
+
40
+ This creates a clean, consistently formatted mismatch message that separates
41
+ the mismatch type from the values being compared:
42
+
43
+ Example:
44
+ Title mismatch:
45
+ cited: 'Cited Title'
46
+ actual: 'Correct Title'
47
+
48
+ Args:
49
+ mismatch_type: The type of mismatch (e.g., "Author 2 mismatch", "Title mismatch")
50
+ left: The cited/incorrect value
51
+ right: The correct value
52
+
53
+ Returns:
54
+ Three-line formatted mismatch message
55
+ """
56
+ # Ensure mismatch_type ends with a colon
57
+ if not mismatch_type.endswith(":"):
58
+ mismatch_type = mismatch_type.rstrip() + ":"
59
+
60
+ # Use fixed indentation for clean, consistent alignment
61
+ indent = "" # no extra indent before the 'cited:' line
62
+ vs_indent = "" # no extra indent before the 'actual:' line
63
+
64
+ return f"{mismatch_type}\n{indent}cited: '{left}'\n{vs_indent}actual: '{right}'"
65
+
66
+
67
+ def format_title_mismatch(cited_title: str, verified_title: str) -> str:
68
+ """
69
+ Format a three-line title mismatch message.
70
+
71
+ Output format:
72
+ Title mismatch:
73
+ cited: 'Cited Title'
74
+ actual: 'Correct Title'
75
+ """
76
+ return format_three_line_mismatch("Title mismatch", cited_title, verified_title)
77
+
78
+
79
+ def format_year_mismatch(cited_year: int | str, correct_year: int | str) -> str:
80
+ """
81
+ Three-line year mismatch message.
82
+ """
83
+ return format_three_line_mismatch("Year mismatch", str(cited_year), str(correct_year))
84
+
85
+
86
+ def format_doi_mismatch(cited_doi: str, correct_doi: str) -> str:
87
+ """
88
+ Three-line DOI mismatch message.
89
+ """
90
+ return format_three_line_mismatch("DOI mismatch", str(cited_doi), str(correct_doi))
91
+
12
92
  def create_author_error(error_details: str, correct_authors: List[Dict[str, str]]) -> Dict[str, str]:
13
93
  """
14
94
  Create a standardized author error dictionary.
@@ -40,7 +120,7 @@ def create_year_warning(cited_year: int, correct_year: int) -> Dict[str, Any]:
40
120
  """
41
121
  return {
42
122
  'warning_type': 'year',
43
- 'warning_details': f"Year mismatch: cited as {cited_year} but actually {correct_year}",
123
+ 'warning_details': format_year_mismatch(cited_year, correct_year),
44
124
  'ref_year_correct': correct_year
45
125
  }
46
126
 
@@ -64,7 +144,7 @@ def create_doi_error(cited_doi: str, correct_doi: str) -> Optional[Dict[str, str
64
144
  if cited_doi_clean != correct_doi_clean:
65
145
  return {
66
146
  'error_type': 'doi',
67
- 'error_details': f"DOI mismatch: cited as {cited_doi} but actually {correct_doi}",
147
+ 'error_details': format_doi_mismatch(cited_doi, correct_doi),
68
148
  'ref_doi_correct': correct_doi
69
149
  }
70
150
 
@@ -120,11 +200,20 @@ def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]
120
200
 
121
201
  return {
122
202
  'warning_type': 'venue',
123
- 'warning_details': f"Venue mismatch: cited as '{clean_cited}' but actually '{clean_correct}'",
203
+ 'warning_details': format_three_line_mismatch("Venue mismatch", clean_cited, clean_correct),
124
204
  'ref_venue_correct': correct_venue
125
205
  }
126
206
 
127
207
 
208
+ def format_venue_mismatch(cited_venue: str, verified_venue: str) -> str:
209
+ """
210
+ Format a three-line venue mismatch message with cleaned venue names.
211
+ """
212
+ clean_cited = clean_venue_for_comparison(cited_venue)
213
+ clean_verified = clean_venue_for_comparison(verified_venue)
214
+ return format_three_line_mismatch("Venue mismatch", clean_cited, clean_verified)
215
+
216
+
128
217
  def create_url_error(error_details: str, correct_url: Optional[str] = None) -> Dict[str, str]:
129
218
  """
130
219
  Create a standardized URL error dictionary.
@@ -189,6 +278,59 @@ def create_generic_warning(warning_type: str, warning_details: str, **kwargs) ->
189
278
  return warning_dict
190
279
 
191
280
 
281
+ def format_author_mismatch(author_number: int, cited_author: str, correct_author: str) -> str:
282
+ """
283
+ Format a three-line author mismatch message.
284
+
285
+ Args:
286
+ author_number: The author position (1-based)
287
+ cited_author: The cited author name
288
+ correct_author: The correct author name
289
+
290
+ Returns:
291
+ Formatted three-line author mismatch message
292
+ """
293
+ return format_three_line_mismatch(f"Author {author_number} mismatch", cited_author, correct_author)
294
+
295
+
296
+ def format_first_author_mismatch(cited_author: str, correct_author: str) -> str:
297
+ """
298
+ Format a three-line first author mismatch message.
299
+
300
+ Args:
301
+ cited_author: The cited first author name
302
+ correct_author: The correct first author name
303
+
304
+ Returns:
305
+ Formatted three-line first author mismatch message
306
+ """
307
+ return format_three_line_mismatch("First author mismatch", cited_author, correct_author)
308
+
309
+
310
+ def format_author_count_mismatch(cited_count: int, correct_count: int, cited_authors: list, correct_authors: list) -> str:
311
+ """
312
+ Format an author count mismatch message showing all cited and correct authors.
313
+
314
+ Args:
315
+ cited_count: Number of cited authors
316
+ correct_count: Number of correct authors
317
+ cited_authors: List of cited author names
318
+ correct_authors: List of correct author names
319
+
320
+ Returns:
321
+ Formatted multi-line author count mismatch message
322
+ """
323
+ # Create the header with count information
324
+ header = f"Author count mismatch: {cited_count} cited vs {correct_count} correct"
325
+
326
+ # Format author lists
327
+ cited_list = ", ".join(cited_authors) if cited_authors else "None"
328
+ correct_list = ", ".join(correct_authors) if correct_authors else "None"
329
+
330
+ # Use the same format as other mismatches
331
+ return format_three_line_mismatch(header, cited_list, correct_list)
332
+
333
+
192
334
  def format_authors_list(authors: List[Dict[str, str]]) -> str:
193
335
  """
194
336
  Format a list of author dictionaries into a readable string.
utils/text_utils.py CHANGED
@@ -554,6 +554,10 @@ def clean_title_basic(title):
554
554
  # Remove trailing punctuation
555
555
  title = re.sub(r'[.,;:]+$', '', title)
556
556
 
557
+ # Remove BibTeX publication type indicators at the end (common in Chinese and some international BibTeX styles)
558
+ # [J] = Journal, [C] = Conference, [M] = Monograph/Book, [D] = Dissertation, [P] = Patent, [R] = Report
559
+ title = re.sub(r'\s*\[[JCMDPRS]\]\s*$', '', title)
560
+
557
561
  return title
558
562
 
559
563
 
@@ -578,6 +582,9 @@ def clean_title_for_search(title):
578
582
  title = title.replace('\n', ' ').strip()
579
583
  title = re.sub(r'\s+', ' ', title) # Normalize whitespace only
580
584
 
585
+ # Remove BibTeX publication type indicators that are not part of the actual title
586
+ title = re.sub(r'\s*\[[JCMDPRS]\]\s*$', '', title)
587
+
581
588
  # Note: We intentionally preserve:
582
589
  # - Capitalization (helps with exact matching)
583
590
  # - Colons and other meaningful punctuation (structural markers)
@@ -2076,6 +2083,8 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2076
2083
  # The key insight: if the citation has "et al", we should only verify the listed authors
2077
2084
  # and not penalize for the authoritative source having more authors
2078
2085
  if has_et_al:
2086
+ # Import here to avoid circular imports
2087
+ from utils.error_utils import format_author_mismatch
2079
2088
  # For et al cases, check if each cited author matches ANY author in the correct list
2080
2089
  # rather than comparing positionally, since author order can vary
2081
2090
  for i, cited_author in enumerate(cleaned_cited):
@@ -2088,10 +2097,11 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2088
2097
  break
2089
2098
 
2090
2099
  if not author_found:
2091
- # Create a more informative error message that doesn't assume positional matching
2092
- # Show the full list of correct authors instead of truncating
2100
+ # Use standardized three-line formatting for author mismatch
2101
+ cited_display = format_author_for_display(cited_author)
2093
2102
  full_author_list = ', '.join(correct_names)
2094
- return False, f"Author {i+1} mismatch: '{cited_author}' not found in author list (et al case). Correct authors include: {full_author_list}"
2103
+ error_msg = format_author_mismatch(i+1, f"{cited_display} (not found in author list - et al case)", f"Correct authors: {full_author_list}")
2104
+ return False, error_msg
2095
2105
 
2096
2106
  return True, f"Authors match (verified {len(cleaned_cited)} of {len(correct_names)} with et al)"
2097
2107
 
@@ -2100,7 +2110,11 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2100
2110
  # For non-et-al cases, be more strict about count mismatches
2101
2111
  # Allow minor flexibility (1 author difference) but not more
2102
2112
  if abs(len(cleaned_cited) - len(correct_names)) > 1:
2103
- return False, f"Author count mismatch: {len(cleaned_cited)} cited vs {len(correct_names)} correct"
2113
+ from utils.error_utils import format_author_count_mismatch
2114
+ # Convert cited names to display format (First Last) before showing in error
2115
+ display_cited = [format_author_for_display(author) for author in cleaned_cited]
2116
+ error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
2117
+ return False, error_msg
2104
2118
 
2105
2119
  # Use the shorter list for comparison
2106
2120
  min_len = min(len(cleaned_cited), len(correct_names))
@@ -2110,6 +2124,9 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2110
2124
  comparison_cited = cleaned_cited
2111
2125
  comparison_correct = correct_names
2112
2126
 
2127
+ # Use shared three-line formatter (imported lazily to avoid circular imports)
2128
+ from utils.error_utils import format_first_author_mismatch, format_author_mismatch
2129
+
2113
2130
  # Compare first author (most important) using the enhanced name matching
2114
2131
  if comparison_cited and comparison_correct:
2115
2132
  cited_first = comparison_cited[0]
@@ -2119,7 +2136,7 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2119
2136
  # Use consistent display format for both names
2120
2137
  cited_display = format_author_for_display(cited_first)
2121
2138
  correct_display = format_author_for_display(correct_first)
2122
- return False, f"First author mismatch: '{cited_display}' vs '{correct_display}'"
2139
+ return False, format_first_author_mismatch(cited_display, correct_display)
2123
2140
 
2124
2141
  # For complete verification, check all authors if reasonable number
2125
2142
  if len(comparison_cited) <= 5: # Only do full check for reasonable author counts
@@ -2128,7 +2145,7 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2128
2145
  # Use consistent display format for both names
2129
2146
  cited_display = format_author_for_display(cited_author)
2130
2147
  correct_display = format_author_for_display(correct_author)
2131
- return False, f"Author {i+1} mismatch: '{cited_display}' vs '{correct_display}'"
2148
+ return False, format_author_mismatch(i+1, cited_display, correct_display)
2132
2149
 
2133
2150
  return True, "Authors match"
2134
2151
 
@@ -2512,6 +2529,97 @@ def strip_latex_commands(text):
2512
2529
  return text
2513
2530
 
2514
2531
 
2532
+ def extract_balanced_braces(text, start_pos):
2533
+ """
2534
+ Extract content from balanced braces starting at start_pos.
2535
+
2536
+ This function properly handles nested braces, which is important for LaTeX content
2537
+ where patterns like {Jos{\'e} Meseguer} need to be extracted as complete units.
2538
+
2539
+ Args:
2540
+ text: The text to search in
2541
+ start_pos: Position of the opening brace
2542
+
2543
+ Returns:
2544
+ tuple: (content, end_pos) or (None, start_pos) if no balanced content found
2545
+ """
2546
+ if start_pos >= len(text) or text[start_pos] != '{':
2547
+ return None, start_pos
2548
+
2549
+ brace_count = 1
2550
+ pos = start_pos + 1
2551
+
2552
+ while pos < len(text) and brace_count > 0:
2553
+ if text[pos] == '{':
2554
+ brace_count += 1
2555
+ elif text[pos] == '}':
2556
+ brace_count -= 1
2557
+ pos += 1
2558
+
2559
+ if brace_count == 0:
2560
+ return text[start_pos + 1:pos - 1], pos
2561
+ else:
2562
+ return None, start_pos
2563
+
2564
+
2565
+ def extract_bibinfo_person_content(text):
2566
+ """
2567
+ Extract all person names from \\bibinfo{person}{...} with proper brace handling.
2568
+
2569
+ This function correctly handles nested braces in author names, such as:
2570
+ \\bibinfo{person}{Jos{\\'e} Meseguer}
2571
+
2572
+ Args:
2573
+ text: Text containing \\bibinfo{person}{...} patterns
2574
+
2575
+ Returns:
2576
+ list: List of extracted person names with balanced braces preserved
2577
+ """
2578
+ return extract_bibinfo_field_content(text, 'person', return_all=True)
2579
+
2580
+
2581
+ def extract_bibinfo_field_content(text, field_type, return_all=False):
2582
+ """
2583
+ Extract content from \\bibinfo{field_type}{...} with proper brace handling.
2584
+
2585
+ This function correctly handles nested braces in field content, such as:
2586
+ \\bibinfo{journal}{\\emph{Commun. ACM}}
2587
+
2588
+ Args:
2589
+ text: Text containing \\bibinfo{field_type}{...} patterns
2590
+ field_type: The field type to extract (e.g., 'person', 'journal', 'title')
2591
+ return_all: If True, return list of all matches; if False, return first match or None
2592
+
2593
+ Returns:
2594
+ list or str or None: Extracted content based on return_all parameter
2595
+ """
2596
+ pattern = f'\\\\bibinfo\\{{{re.escape(field_type)}\\}}\\{{'
2597
+ matches = []
2598
+ pos = 0
2599
+
2600
+ while True:
2601
+ match = re.search(pattern, text[pos:])
2602
+ if not match:
2603
+ break
2604
+
2605
+ # Find the start of the content braces
2606
+ brace_start = pos + match.end() - 1 # -1 because we want the opening brace
2607
+ content, end_pos = extract_balanced_braces(text, brace_start)
2608
+
2609
+ if content is not None:
2610
+ matches.append(content)
2611
+ pos = end_pos
2612
+ if not return_all:
2613
+ break # Return first match only
2614
+ else:
2615
+ pos += match.end()
2616
+
2617
+ if return_all:
2618
+ return matches
2619
+ else:
2620
+ return matches[0] if matches else None
2621
+
2622
+
2515
2623
  def extract_cited_keys_from_latex(tex_content):
2516
2624
  r"""
2517
2625
  Extract citation keys from LaTeX content by finding \cite{} commands.
@@ -2936,8 +3044,8 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
2936
3044
 
2937
3045
  if brace_count == 0:
2938
3046
  author_content = content[start_pos:pos-1]
2939
- # Extract individual authors from \bibinfo{person}{Name} tags
2940
- person_matches = re.findall(r'\\bibinfo\{person\}\{([^}]+)\}', author_content)
3047
+ # Extract individual authors from \bibinfo{person}{Name} tags using balanced brace extraction
3048
+ person_matches = extract_bibinfo_person_content(author_content)
2941
3049
  if person_matches:
2942
3050
  # Clean and format author names
2943
3051
  authors = []