PyPI - academic-refchecker - Versions diffs - 1.2.43__py3-none-any.whl → 1.2.45__py3-none-any.whl - Mend

academic-refchecker 1.2.43-py3-none-any.whl → 1.2.45-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

__version__.py +1 -1
{academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/METADATA +25 -9
{academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/RECORD +19 -19
checkers/crossref.py +6 -7
checkers/github_checker.py +13 -3
checkers/openalex.py +6 -7
checkers/openreview_checker.py +7 -4
checkers/semantic_scholar.py +9 -8
checkers/webpage_checker.py +7 -2
core/parallel_processor.py +5 -2
core/refchecker.py +53 -44
utils/doi_utils.py +6 -12
utils/error_utils.py +145 -3
utils/text_utils.py +115 -9
utils/url_utils.py +17 -0
{academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/WHEEL +0 -0
{academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/entry_points.txt +0 -0
{academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/licenses/LICENSE +0 -0
{academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/top_level.txt +0 -0

__version__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 """Version information for RefChecker."""
-__version__ = "1.2.43"
+__version__ = "1.2.45"

{academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.43
+Version: 1.2.45
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT
@@ -78,7 +78,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
        Verified URL: https://www.semanticscholar.org/paper/5f4ac1ac7ca4b17d3db1b52d9aafd9e8b26c0d7
        ArXiv URL: https://arxiv.org/abs/1610.10099
        DOI URL: https://doi.org/10.48550/arxiv.1610.10099
-      ⚠️  Warning: Year mismatch: cited as 2017 but actually 2016
+      ⚠️  Warning: Year mismatch:
+               cited:  '2017'
+               actual: '2016'
 [2/45] Effective approaches to attention-based neural machine translation
        Minh-Thang Luong, Hieu Pham, Christopher D. Manning
@@ -87,7 +89,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
        Verified URL: https://www.semanticscholar.org/paper/93499a7c7f699b6630a86fad964536f9423bb6d0
        ArXiv URL: https://arxiv.org/abs/1508.04025
        DOI URL: https://doi.org/10.18653/v1/d15-1166
-      ❌ Error: First author mismatch: 'Minh-Thang Luong' vs 'Thang Luong'
+      ❌ Error: First author mismatch:
+               cited:  'Minh-Thang Luong'
+               actual: 'Thang Luong'
 [3/45] Deep Residual Learning for Image Recognition
        Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
@@ -98,7 +102,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
        Verified URL: https://www.semanticscholar.org/paper/2c03df8b48bf3fa39054345bafabfeff15bfd11d
        ArXiv URL: https://arxiv.org/abs/1512.03385
        DOI URL: https://doi.org/10.1109/CVPR.2016.90
-      ❌ Error: DOI mismatch: cited as '10.1109/CVPR.2016.91' but actually '10.1109/CVPR.2016.90'
+      ❌ Error: DOI mismatch:
+               cited:  '10.1109/CVPR.2016.91'
+               actual: '10.1109/CVPR.2016.90'
 ============================================================
 📋 SUMMARY
@@ -382,7 +388,9 @@ This enhanced URL display helps users access multiple authoritative sources for
            Verified URL: https://www.semanticscholar.org/paper/a1b2c3d4e5f6789012345678901234567890abcd
            ArXiv URL: https://arxiv.org/abs/2312.02119
            DOI URL: https://doi.org/10.48550/arxiv.2312.02119
-          ❌ Error: First author mismatch: 'T. Xie' vs 'Zhao Xu'
+          ❌ Error: First author mismatch:
+                   cited:  'T. Xie'
+                   actual: 'Zhao Xu'
     ```
   - `title`: Title discrepancies
     ```
@@ -392,7 +400,9 @@ This enhanced URL display helps users access multiple authoritative sources for
            Verified URL: https://www.semanticscholar.org/paper/df2b0e26d0599ce3e70df8a9da02e51594e0e992
            ArXiv URL: https://arxiv.org/abs/1810.04805
            DOI URL: https://doi.org/10.18653/v1/n19-1423
-          ❌ Error: Title mismatch: cited as 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding' but actually 'BERT: Pre-training of Deep Bidirectional Transformers for Language Comprehension'
+          ❌ Error: Title mismatch:
+                   cited:  'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding'
+                   actual: 'BERT: Pre-training of Deep Bidirectional Transformers for Language Comprehension'
     ```
   - `arxiv_id`: Incorrect URLs or arXiv IDs
     ```
@@ -415,7 +425,9 @@ This enhanced URL display helps users access multiple authoritative sources for
            Verified URL: https://www.semanticscholar.org/paper/204e3073870fae3d05bcbc2f6a8e263d9b72e776
            ArXiv URL: https://arxiv.org/abs/1706.03762
            DOI URL: https://doi.org/10.48550/arXiv.1706.03762
-          ❌ Error: DOI mismatch: cited as '10.5555/3295222.3295349' but actually '10.48550/arXiv.1706.03762'
+          ❌ Error: DOI mismatch:
+                   cited:  '10.5555/3295222.3295349'
+                   actual: '10.48550/arXiv.1706.03762'
     ```
 - **⚠️ Warnings**: Minor issues that may need attention
@@ -428,7 +440,9 @@ This enhanced URL display helps users access multiple authoritative sources for
            Verified URL: https://www.semanticscholar.org/paper/f1a2b3c4d5e6f7890123456789012345678901ab
            ArXiv URL: https://arxiv.org/abs/2310.03684
            DOI URL: https://doi.org/10.48550/arxiv.2310.03684
-          ⚠️  Warning: Year mismatch: cited as 2024 but actually 2023
+          ⚠️  Warning: Year mismatch:
+                   cited:  '2024'
+                   actual: '2023'
     ```
   - `venue`: Venue format variations
     ```
@@ -439,7 +453,9 @@ This enhanced URL display helps users access multiple authoritative sources for
            Verified URL: https://www.semanticscholar.org/paper/c1d2e3f4a5b6c7d8e9f0123456789012345678ab
            ArXiv URL: https://arxiv.org/abs/2403.02151
            DOI URL: https://doi.org/10.48550/arxiv.2403.02151
-          ⚠️  Warning: Venue mismatch: cited as 'arXiv, 2024' but actually 'Neural Information Processing Systems'
+          ⚠️  Warning: Venue mismatch:
+                   cited:  'arXiv, 2024'
+                   actual: 'Neural Information Processing Systems'
     ```
 - **❓ Unverified**: References that couldn't be verified with any of the checker APIs

{academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/RECORD RENAMED Viewed

@@ -1,21 +1,21 @@
-__version__.py,sha256=JbybFux4Juuafz1jN0cgsedPmzBO8U9DJ874tJu2saA,65
-academic_refchecker-1.2.43.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
+__version__.py,sha256=8vdrigO4-YfHufQMfh_RQ9NlN5btmqndss2dAOLxa1Q,65
+academic_refchecker-1.2.45.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
 checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
-checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
+checkers/crossref.py,sha256=cLYmSzE8ehJ5sNko_R3fEiGBGiPH5_HxLhFM-pCfDRM,20378
 checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
-checkers/github_checker.py,sha256=54K6_YJW5w2GtzodnSOLfK5d1ErFJxbTOIIV5P_kFX0,13543
+checkers/github_checker.py,sha256=GoepG4aRRUqAomkM4HgOSNf20BPxQgocZEpsk0ZTZZU,14003
 checkers/local_semantic_scholar.py,sha256=DgGMjmR_w_saz2UTMatEhfXbhUED9tUmDG3vlZAIzc4,20428
-checkers/openalex.py,sha256=GxYUH9GZ0AyF-WFKgXiFHqkalrSnySgFSkiM1PsK0VI,19757
-checkers/openreview_checker.py,sha256=FLh21F0Zr7Gj3BI0u-gE6IwGNOZiRcViirDBeNvUp94,20432
-checkers/semantic_scholar.py,sha256=BelhyIJ-W8navRdqEGpk12CIXYWmVL2Cq8HHZR7ynJs,34905
-checkers/webpage_checker.py,sha256=BvNwOqukTX9IeQUpUfIrI_5Gr2w9VLBt5x_PB-hKUIo,21616
+checkers/openalex.py,sha256=Fbc7iscZzmXjAZxH32PDX2r2Nwo9b5Ku-Sh1Ut9KpLA,19550
+checkers/openreview_checker.py,sha256=EYRcVu7sZzrFjqdOEKp72vd0S3sXSSjTgI9jsAVpdD0,20524
+checkers/semantic_scholar.py,sha256=0LcVahf3twyHqaD7bQ2eJiTyg-AQ9NGvVohb9nqaHdA,34884
+checkers/webpage_checker.py,sha256=woY8mNgZ4Lr9Ug53CN-Xo_2P62BTpR2u_FZyUPgTEuA,21833
 config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
 config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
 config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
 core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
 core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
-core/parallel_processor.py,sha256=5V2iJDBFwwryMCnCNU_oRt2u5he1wpy-_9qapC_6f00,17043
-core/refchecker.py,sha256=sVRg3PUzrs2vLFlEBoi4bxUy-TpO5iQHCkokGas-ygQ,273616
+core/parallel_processor.py,sha256=AOnjqhBHXlSb1c-PSunat9Eug5y04gOygwbHdPUqxgk,17202
+core/refchecker.py,sha256=lU6r9cKpB8Fc4Wd7vOqdqhxP9cwYEoB6D4PlYznglGY,274337
 database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
 database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
 llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -33,14 +33,14 @@ utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,1
 utils/bibtex_parser.py,sha256=a89NLy_q2kwED4QFJgxWFgPQOJBV73bIUL3RS_Urmro,15231
 utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
 utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
-utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
-utils/error_utils.py,sha256=JqnRg4z-O9GcJ1eJGeTMzmOQwPWbWo2Lf6Duwj-ymHQ,6258
+utils/doi_utils.py,sha256=qAf2slTLkgBwwnkYsWMkwMSDf6q8wNqd2YwNxhQudEc,3995
+utils/error_utils.py,sha256=wrLQhVJIIWUzcTSOLVvrvCAtMRNXt_1YYL5OXYnYf0A,11085
 utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
-utils/text_utils.py,sha256=jPgCOBTVboLRJyypoOtL-dg1wBDQrKBux2ImvC6wL58,206296
+utils/text_utils.py,sha256=T67Y-HSNokj-mOcdCtOcULNviBxyaG9xTjRd_l9titI,210088
 utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
-utils/url_utils.py,sha256=aq1hSYEA888bOKuBOGWRclgTFIjw32rpFdsBO_Ja8ZM,8402
-academic_refchecker-1.2.43.dist-info/METADATA,sha256=ZsJhIw1n7Yjoug6mpV4zpAPf-eSW5xSMdd3Dl_WTOlI,22298
-academic_refchecker-1.2.43.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-academic_refchecker-1.2.43.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
-academic_refchecker-1.2.43.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
-academic_refchecker-1.2.43.dist-info/RECORD,,
+utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
+academic_refchecker-1.2.45.dist-info/METADATA,sha256=mY4M9FRaDKcyS5yOFvR3X0Y0bj47_YmZeMayvrrpS38,22576
+academic_refchecker-1.2.45.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+academic_refchecker-1.2.45.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+academic_refchecker-1.2.45.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+academic_refchecker-1.2.45.dist-info/RECORD,,

checkers/crossref.py CHANGED Viewed

@@ -31,6 +31,7 @@ import re
 from typing import Dict, List, Tuple, Optional, Any, Union
 from urllib.parse import quote_plus
 from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
+from utils.error_utils import format_year_mismatch, format_doi_mismatch
 from config.settings import get_config
 # Set up logging
@@ -478,21 +479,19 @@ class CrossRefReferenceChecker:
         if year and work_year and year != work_year:
             errors.append({
                 'warning_type': 'year',
-                'warning_details': f"Year mismatch: cited as {year} but actually {work_year}",
+                'warning_details': format_year_mismatch(year, work_year),
                 'ref_year_correct': work_year
             })
         # Verify DOI
         work_doi = work_data.get('DOI')
         if doi and work_doi:
-            # Normalize DOIs for comparison (remove URL prefix and trailing periods)
-            cited_doi_clean = doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
-            work_doi_clean = work_doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
-            if cited_doi_clean.lower() != work_doi_clean.lower():
+            # Compare DOIs using the proper comparison function
+            from utils.doi_utils import compare_dois
+            if not compare_dois(doi, work_doi):
                 errors.append({
                     'error_type': 'doi',
-                    'error_details': f"DOI mismatch: cited as {doi} but actually {work_doi}",
+                    'error_details': format_doi_mismatch(doi, work_doi),
                     'ref_doi_correct': work_doi
                 })

checkers/github_checker.py CHANGED Viewed

@@ -169,9 +169,14 @@ class GitHubChecker:
             if cited_title:
                 title_match = self._check_title_match(cited_title, actual_name, actual_description)
                 if not title_match:
+                    from utils.error_utils import format_title_mismatch
+                    details = format_title_mismatch(cited_title, actual_name)
+                    if actual_description:
+                        snippet = actual_description[:100] + ('...' if len(actual_description) > 100 else '')
+                        details += f" ({snippet})"
                     errors.append({
                         "warning_type": "title",
-                        "warning_details": f"Title mismatch: cited as '{cited_title}' but repository is '{actual_name}' ({actual_description[:100]}{'...' if len(actual_description) > 100 else ''})"
+                        "warning_details": details
                     })
             # Verify authors
@@ -180,9 +185,13 @@ class GitHubChecker:
                 author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
                 author_match = self._check_author_match(author_str, actual_owner, actual_owner_name)
                 if not author_match:
+                    from utils.error_utils import format_three_line_mismatch
+                    left = author_str
+                    right = f"{actual_owner} ({actual_owner_name})" if actual_owner_name else actual_owner
+                    details = format_three_line_mismatch("Author mismatch", left, right)
                     errors.append({
                         "warning_type": "author",
-                        "warning_details": f"Author mismatch: cited as '{author_str}' but repository owner is '{actual_owner}' ({actual_owner_name})"
+                        "warning_details": details
                     })
             # Verify year
@@ -191,9 +200,10 @@ class GitHubChecker:
                 try:
                     cited_year_int = int(cited_year)
                     if cited_year_int < creation_year:
+                        from utils.error_utils import format_year_mismatch
                         errors.append({
                             "warning_type": "year",
-                            "warning_details": f"Year mismatch: cited as {cited_year} but repository created in {creation_year}",
+                            "warning_details": format_year_mismatch(cited_year, creation_year),
                             "ref_year_correct": str(creation_year)
                         })
                 except (ValueError, TypeError):

checkers/openalex.py CHANGED Viewed

@@ -33,6 +33,7 @@ import re
 from typing import Dict, List, Tuple, Optional, Any, Union
 from urllib.parse import quote_plus
 from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
+from utils.error_utils import format_year_mismatch, format_doi_mismatch
 from config.settings import get_config
 # Set up logging
@@ -448,7 +449,7 @@ class OpenAlexReferenceChecker:
         if year and work_year and year != work_year:
             errors.append({
                 'warning_type': 'year',
-                'warning_details': f"Year mismatch: cited as {year} but actually {work_year}",
+                'warning_details': format_year_mismatch(year, work_year),
                 'ref_year_correct': work_year
             })
@@ -458,14 +459,12 @@ class OpenAlexReferenceChecker:
             work_doi = work_data['ids']['doi']
         if doi and work_doi:
-            # Normalize DOIs for comparison (remove URL prefix and trailing periods)
-            cited_doi_clean = doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
-            work_doi_clean = work_doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
-            if cited_doi_clean.lower() != work_doi_clean.lower():
+            # Compare DOIs using the proper comparison function
+            from utils.doi_utils import compare_dois
+            if not compare_dois(doi, work_doi):
                 errors.append({
                     'error_type': 'doi',
-                    'error_details': f"DOI mismatch: cited as {doi} but actually {work_doi}",
+                    'error_details': format_doi_mismatch(doi, work_doi),
                     'ref_doi_correct': work_doi
                 })

checkers/openreview_checker.py CHANGED Viewed

@@ -425,9 +425,11 @@ class OpenReviewReferenceChecker:
         if cited_title and paper_title:
             similarity = calculate_title_similarity(cited_title, paper_title)
             if similarity < 0.7:  # Using a reasonable threshold
+                from utils.error_utils import format_title_mismatch
+                details = format_title_mismatch(cited_title, paper_title) + f" (similarity: {similarity:.2f})"
                 errors.append({
                     "warning_type": "title",
-                    "warning_details": f"Title mismatch: cited as '{cited_title}' but OpenReview shows '{paper_title}' (similarity: {similarity:.2f})"
+                    "warning_details": details
                 })
         # Check authors
@@ -460,9 +462,10 @@ class OpenReviewReferenceChecker:
                 is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
                 if is_different and year_message:
+                    from utils.error_utils import format_year_mismatch
                     errors.append({
                         "warning_type": "year",
-                        "warning_details": year_message
+                        "warning_details": format_year_mismatch(cited_year_int, paper_year_int)
                     })
             except (ValueError, TypeError):
                 pass  # Skip year validation if conversion fails
@@ -473,10 +476,10 @@ class OpenReviewReferenceChecker:
         if cited_venue and paper_venue:
             if are_venues_substantially_different(cited_venue, paper_venue):
-                from utils.error_utils import clean_venue_for_comparison
+                from utils.error_utils import format_venue_mismatch
                 errors.append({
                     "warning_type": "venue",
-                    "warning_details": f"Venue mismatch: cited as '{clean_venue_for_comparison(cited_venue)}' but OpenReview shows '{clean_venue_for_comparison(paper_venue)}'"
+                    "warning_details": format_venue_mismatch(cited_venue, paper_venue)
                 })
         # Create verified data structure

checkers/semantic_scholar.py CHANGED Viewed

@@ -29,6 +29,7 @@ import logging
 import re
 from typing import Dict, List, Tuple, Optional, Any, Union
 from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search
+from utils.error_utils import format_title_mismatch
 from config.settings import get_config
 # Set up logging
@@ -471,7 +472,7 @@ class NonArxivReferenceChecker:
         if found_title and title_similarity < SIMILARITY_THRESHOLD:
             errors.append({
                 'error_type': 'title',
-                'error_details': f"Title mismatch: cited as '{title}' but actually '{found_title}'",
+                'error_details': format_title_mismatch(title, found_title),
                 'ref_title_correct': paper_data.get('title', '')
             })
@@ -525,9 +526,10 @@ class NonArxivReferenceChecker:
             is_different, warning_message = is_year_substantially_different(year, paper_year, context)
             if is_different and warning_message:
+                from utils.error_utils import format_year_mismatch
                 errors.append({
                     'warning_type': 'year',
-                    'warning_details': warning_message,
+                    'warning_details': format_year_mismatch(year, paper_year),
                     'ref_year_correct': paper_year
                 })
@@ -591,14 +593,13 @@ class NonArxivReferenceChecker:
         if external_ids and 'DOI' in external_ids:
             paper_doi = external_ids['DOI']
-            # Compare DOIs, but strip hash fragments and trailing periods for comparison
-            cited_doi_clean = doi.split('#')[0].rstrip('.') if doi else ''
-            paper_doi_clean = paper_doi.split('#')[0].rstrip('.') if paper_doi else ''
-            if cited_doi_clean and paper_doi_clean and cited_doi_clean.lower() != paper_doi_clean.lower():
+            # Compare DOIs using the proper comparison function
+            from utils.doi_utils import compare_dois
+            if doi and paper_doi and not compare_dois(doi, paper_doi):
+                from utils.error_utils import format_doi_mismatch
                 errors.append({
                     'error_type': 'doi',
-                    'error_details': f"DOI mismatch: cited as {doi} but actually {paper_doi}",
+                    'error_details': format_doi_mismatch(doi, paper_doi),
                     'ref_doi_correct': paper_doi
                 })

checkers/webpage_checker.py CHANGED Viewed

@@ -182,9 +182,10 @@ class WebPageChecker:
             # Check title match
             if cited_title and page_title:
                 if not self._check_title_match(cited_title, page_title, page_description):
+                    from utils.error_utils import format_title_mismatch
                     errors.append({
                         "warning_type": "title",
-                        "warning_details": f"Title mismatch: cited as '{cited_title}' but page title is '{page_title}'"
+                        "warning_details": format_title_mismatch(cited_title, page_title)
                     })
             # Check if this is a documentation page for the cited topic
@@ -201,9 +202,13 @@ class WebPageChecker:
             if cited_authors:
                 author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
                 if not self._check_author_match(author_str, site_info, web_url):
+                    from utils.error_utils import format_three_line_mismatch
+                    left = author_str
+                    right = site_info.get('organization', 'unknown')
+                    details = format_three_line_mismatch("Author/organization mismatch", left, right)
                     errors.append({
                         "warning_type": "author",
-                        "warning_details": f"Author/organization mismatch: cited as '{author_str}' but page is from '{site_info.get('organization', 'unknown')}'"
+                        "warning_details": details
                     })
             logger.debug(f"Web page verification completed for: {web_url}")

core/parallel_processor.py CHANGED Viewed

@@ -352,12 +352,15 @@ class ParallelReferenceProcessor:
                     error_type = error.get('error_type') or error.get('warning_type')
                     error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
+                    from utils.error_utils import print_labeled_multiline
                     if error_type == 'arxiv_id':
+                        # Keep existing style for arXiv ID errors
                         print(f"      ❌ {error_details}")
                     elif 'error_type' in error:
-                        print(f"      ❌ Error: {error_details}")
+                        print_labeled_multiline("❌ Error", error_details)
                     else:
-                        print(f"      ⚠️  Warning: {error_details}")
+                        print_labeled_multiline("⚠️  Warning", error_details)
         # Show timing info for slow references
         if result.processing_time > 5.0:

core/refchecker.py CHANGED Viewed

@@ -1900,10 +1900,11 @@ class ArxivReferenceChecker:
             db_title = self.non_arxiv_checker.normalize_paper_title(paper_data.get('title'))
             if normalized_title != db_title:
+                from utils.error_utils import format_title_mismatch
                 logger.debug(f"DB Verification: Title mismatch - cited: '{title}', actual: '{paper_data.get('title')}'")
                 errors.append({
                     'error_type': 'title',
-                    'error_details': f"Title mismatch: cited as '{title}' but actually '{paper_data.get('title')}'",
+                    'error_details': format_title_mismatch(title, paper_data.get('title')),
                     'ref_title_correct': paper_data.get('title')
                 })
@@ -1925,30 +1926,36 @@ class ArxivReferenceChecker:
         paper_year = paper_data.get('year')
         if year and paper_year and year != paper_year:
             logger.debug(f"DB Verification: Year mismatch - cited: {year}, actual: {paper_year}")
+            from utils.error_utils import format_year_mismatch
             errors.append({
                 'warning_type': 'year',
-                'warning_details': f"Year mismatch: cited as {year} but actually {paper_year}",
+                'warning_details': format_year_mismatch(year, paper_year),
                 'ref_year_correct': paper_year
             })
         # Verify DOI
-        if doi and external_ids.get('DOI') and doi.lower() != external_ids['DOI'].lower():
-            # Check if the cited DOI is a partial match of the actual DOI
-            # This handles cases like "10.1111/j.2044-8260." vs "10.1111/J.2044-8260.1997.TB01237.X"
-            cited_doi_clean = doi.lower().rstrip('.')
-            actual_doi_clean = external_ids['DOI'].lower().rstrip('.')
-            # If the cited DOI is a prefix of the actual DOI, it's likely a partial citation
-            # Only flag as error if it's not a reasonable partial match
-            if not actual_doi_clean.startswith(cited_doi_clean):
-                logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
-                errors.append({
-                    'error_type': 'doi',
-                    'error_details': f"DOI mismatch: cited as {doi} but actually {external_ids['DOI']}",
-                    'ref_doi_correct': external_ids['DOI']
-                })
-            else:
-                logger.debug(f"DB Verification: DOI partial match - cited: {doi}, actual: {external_ids['DOI']} (acceptable)")
+        if doi and external_ids.get('DOI'):
+            from utils.doi_utils import compare_dois, normalize_doi
+            # Use proper DOI comparison first
+            if not compare_dois(doi, external_ids['DOI']):
+                # Check if the cited DOI is a partial match of the actual DOI
+                # This handles cases like "10.1111/j.2044-8260." vs "10.1111/J.2044-8260.1997.TB01237.X"
+                cited_doi_normalized = normalize_doi(doi)
+                actual_doi_normalized = normalize_doi(external_ids['DOI'])
+                # If the cited DOI is a prefix of the actual DOI, it's likely a partial citation
+                # Only flag as error if it's not a reasonable partial match
+                if not actual_doi_normalized.startswith(cited_doi_normalized.rstrip('.')):
+                    logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
+                    from utils.error_utils import format_doi_mismatch
+                    errors.append({
+                        'error_type': 'doi',
+                        'error_details': format_doi_mismatch(doi, external_ids['DOI']),
+                        'ref_doi_correct': external_ids['DOI']
+                    })
+                else:
+                    logger.debug(f"DB Verification: DOI partial match - cited: {doi}, actual: {external_ids['DOI']} (acceptable)")
         # Verify ArXiv ID
         if reference.get('type') == 'arxiv':
@@ -3489,8 +3496,9 @@ class ArxivReferenceChecker:
         author_field_match = re.search(r'\\bibfield\{author\}\{(.*?)\}(?:\s*\\bibinfo\{year\}|\s*\\newblock|$)', content, re.DOTALL)
         if author_field_match:
             author_content = author_field_match.group(1)
-            # Find all \bibinfo{person}{Name} entries
-            person_matches = re.findall(r'\\bibinfo\{person\}\{([^}]+)\}', author_content)
+            # Find all \bibinfo{person}{Name} entries using balanced brace extraction
+            from utils.text_utils import extract_bibinfo_person_content
+            person_matches = extract_bibinfo_person_content(author_content)
             if person_matches:
                 authors = []
                 for person in person_matches:
@@ -3502,33 +3510,31 @@ class ArxivReferenceChecker:
                         authors.append(clean_name)
                 ref['authors'] = authors
-        # Extract title from \bibinfo{title}{Title}
-        title_match = re.search(r'\\bibinfo\{title\}\{([^}]+)\}', content)
-        if title_match:
-            title = strip_latex_commands(title_match.group(1)).strip()
+        # Import balanced brace extraction function
+        from utils.text_utils import extract_bibinfo_field_content
+        # Extract title from \bibinfo{title}{Title} using balanced brace extraction
+        title_content = extract_bibinfo_field_content(content, 'title')
+        if title_content:
+            title = strip_latex_commands(title_content).strip()
             ref['title'] = title
-        # Extract venue/journal from various fields
-        venue_patterns = [
-            r'\\bibinfo\{booktitle\}\{([^}]+)\}',
-            r'\\bibinfo\{journal\}\{([^}]+)\}',
-            r'\\bibinfo\{series\}\{([^}]+)\}',
-            r'\\bibinfo\{note\}\{([^}]+)\}'
-        ]
+        # Extract venue/journal from various fields using balanced brace extraction
+        venue_field_types = ['booktitle', 'journal', 'series', 'note']
-        for pattern in venue_patterns:
-            venue_match = re.search(pattern, content)
-            if venue_match:
-                venue = strip_latex_commands(venue_match.group(1)).strip()
+        for field_type in venue_field_types:
+            venue_content = extract_bibinfo_field_content(content, field_type)
+            if venue_content:
+                venue = strip_latex_commands(venue_content).strip()
                 if venue:
                     ref['venue'] = venue
                     ref['journal'] = venue  # For compatibility
                     break
-        # Extract DOI
-        doi_match = re.search(r'\\bibinfo\{doi\}\{([^}]+)\}', content)
-        if doi_match:
-            ref['doi'] = doi_match.group(1).strip()
+        # Extract DOI using balanced brace extraction
+        doi_content = extract_bibinfo_field_content(content, 'doi')
+        if doi_content:
+            ref['doi'] = doi_content.strip()
         # Extract ArXiv ID from \showeprint[arxiv]{ID}
         arxiv_match = re.search(r'\\showeprint\[arxiv\]\{([^}]+)\}', content)
@@ -5048,7 +5054,8 @@ class ArxivReferenceChecker:
             correct_first = correct_authors[0]
             if not enhanced_name_match(cited_first, correct_first):
-                return False, f"First author mismatch: '{cited_first}' vs '{correct_first}'"
+                from utils.error_utils import format_first_author_mismatch
+                return False, format_first_author_mismatch(cited_first, correct_first)
         return True, "Authors match"
@@ -5181,7 +5188,7 @@ class ArxivReferenceChecker:
             from utils.text_utils import format_authors_for_display
             authors = format_authors_for_display(reference.get('authors', []))
             year = reference.get('year', '')
-            venue = reference.get('venue', '')
+            venue = reference.get('venue', '') or reference.get('journal', '')
             url = reference.get('url', '')
             doi = reference.get('doi', '')
             # Extract actual reference number from raw text for accurate display
@@ -5454,12 +5461,14 @@ class ArxivReferenceChecker:
                     error_type = error.get('error_type') or error.get('warning_type')
                     error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
+                    from utils.error_utils import print_labeled_multiline
                     if error_type == 'arxiv_id':
                         print(f"      ❌ {error_details}")
                     elif 'error_type' in error:
-                        print(f"      ❌ Error: {error_details}")
+                        print_labeled_multiline("❌ Error", error_details)
                     else:
-                        print(f"      ⚠️  Warning: {error_details}")
+                        print_labeled_multiline("⚠️  Warning", error_details)
     def _output_reference_errors(self, reference, errors, url):
         """

utils/doi_utils.py CHANGED Viewed

@@ -99,6 +99,10 @@ def compare_dois(doi1: str, doi2: str) -> bool:
     """
     Compare two DOIs for equality, handling different formats and prefixes.
+    This function performs exact matching after normalization, which means
+    DOIs are only considered equal if they are identical after removing
+    prefixes, case differences, and punctuation.
     Args:
         doi1: First DOI to compare
         doi2: Second DOI to compare
@@ -109,21 +113,11 @@ def compare_dois(doi1: str, doi2: str) -> bool:
     if not doi1 or not doi2:
         return False
-    # Normalize both DOIs (already converted to lowercase)
+    # Normalize both DOIs (handles prefixes, case, punctuation)
     norm_doi1 = normalize_doi(doi1)
     norm_doi2 = normalize_doi(doi2)
-    # If DOIs are identical, they match
-    if norm_doi1 == norm_doi2:
-        return True
-    # Check if first two components match (publisher.registrant)
-    doi1_parts = norm_doi1.split('.')
-    doi2_parts = norm_doi2.split('.')
-    if len(doi1_parts) >= 2 and len(doi2_parts) >= 2:
-        return doi1_parts[0] == doi2_parts[0] and doi1_parts[1].split('/')[0] == doi2_parts[1].split('/')[0]
+    # DOIs must be exactly identical after normalization
     return norm_doi1 == norm_doi2

utils/error_utils.py CHANGED Viewed

@@ -9,6 +9,86 @@ for reference checkers.
 from typing import Dict, List, Any, Optional
+def print_labeled_multiline(label: str, text: str) -> None:
+    """
+    Print a multi-line message with consistent label formatting.
+    This function ensures consistent indentation for all error and warning messages,
+    regardless of emoji width differences in the labels.
+    Args:
+        label: The label (e.g., "❌ Error", "⚠️  Warning")
+        text: The multi-line text to print
+    """
+    prefix = f"      {label}: "
+    lines = (text or "").splitlines() or [""]
+    # Print the first line with the label prefix
+    print(prefix + lines[0])
+    # Print subsequent lines with fixed indentation to ensure consistency
+    # Use a fixed 15-space indentation so continuation lines align regardless of emoji width
+    fixed_indent = " " * 15
+    for line in lines[1:]:
+        print(fixed_indent + line)
+def format_three_line_mismatch(mismatch_type: str, left: str, right: str) -> str:
+    """
+    Format a three-line mismatch message with fixed indentation.
+    This creates a clean, consistently formatted mismatch message that separates
+    the mismatch type from the values being compared:
+    Example:
+    Title mismatch:
+    cited:  'Cited Title'
+    actual: 'Correct Title'
+    Args:
+        mismatch_type: The type of mismatch (e.g., "Author 2 mismatch", "Title mismatch")
+        left: The cited/incorrect value
+        right: The correct value
+    Returns:
+        Three-line formatted mismatch message
+    """
+    # Ensure mismatch_type ends with a colon
+    if not mismatch_type.endswith(":"):
+        mismatch_type = mismatch_type.rstrip() + ":"
+    # Use fixed indentation for clean, consistent alignment
+    indent = ""  # no extra indentation: the "cited:" line starts at column 0
+    vs_indent = ""   # the "actual:" line also starts at column 0, aligned with "cited:"
+    return f"{mismatch_type}\n{indent}cited:  '{left}'\n{vs_indent}actual: '{right}'"
+def format_title_mismatch(cited_title: str, verified_title: str) -> str:
+    """
+    Format a three-line title mismatch message.
+    Output format:
+    Title mismatch:
+    cited:  'Cited Title'
+    actual: 'Correct Title'
+    """
+    return format_three_line_mismatch("Title mismatch", cited_title, verified_title)
+def format_year_mismatch(cited_year: int | str, correct_year: int | str) -> str:
+    """
+    Three-line year mismatch message.
+    """
+    return format_three_line_mismatch("Year mismatch", str(cited_year), str(correct_year))
+def format_doi_mismatch(cited_doi: str, correct_doi: str) -> str:
+    """
+    Three-line DOI mismatch message.
+    """
+    return format_three_line_mismatch("DOI mismatch", str(cited_doi), str(correct_doi))
 def create_author_error(error_details: str, correct_authors: List[Dict[str, str]]) -> Dict[str, str]:
     """
     Create a standardized author error dictionary.
@@ -40,7 +120,7 @@ def create_year_warning(cited_year: int, correct_year: int) -> Dict[str, Any]:
     """
     return {
         'warning_type': 'year',
-        'warning_details': f"Year mismatch: cited as {cited_year} but actually {correct_year}",
+        'warning_details': format_year_mismatch(cited_year, correct_year),
         'ref_year_correct': correct_year
     }
@@ -64,7 +144,7 @@ def create_doi_error(cited_doi: str, correct_doi: str) -> Optional[Dict[str, str
     if cited_doi_clean != correct_doi_clean:
         return {
             'error_type': 'doi',
-            'error_details': f"DOI mismatch: cited as {cited_doi} but actually {correct_doi}",
+            'error_details': format_doi_mismatch(cited_doi, correct_doi),
             'ref_doi_correct': correct_doi
         }
@@ -120,11 +200,20 @@ def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]
     return {
         'warning_type': 'venue',
-        'warning_details': f"Venue mismatch: cited as '{clean_cited}' but actually '{clean_correct}'",
+        'warning_details': format_three_line_mismatch("Venue mismatch", clean_cited, clean_correct),
         'ref_venue_correct': correct_venue
     }
+def format_venue_mismatch(cited_venue: str, verified_venue: str) -> str:
+    """
+    Format a three-line venue mismatch message with cleaned venue names.
+    """
+    clean_cited = clean_venue_for_comparison(cited_venue)
+    clean_verified = clean_venue_for_comparison(verified_venue)
+    return format_three_line_mismatch("Venue mismatch", clean_cited, clean_verified)
 def create_url_error(error_details: str, correct_url: Optional[str] = None) -> Dict[str, str]:
     """
     Create a standardized URL error dictionary.
@@ -189,6 +278,59 @@ def create_generic_warning(warning_type: str, warning_details: str, **kwargs) ->
     return warning_dict
+def format_author_mismatch(author_number: int, cited_author: str, correct_author: str) -> str:
+    """
+    Format a three-line author mismatch message.
+    Args:
+        author_number: The author position (1-based)
+        cited_author: The cited author name
+        correct_author: The correct author name
+    Returns:
+        Formatted three-line author mismatch message
+    """
+    return format_three_line_mismatch(f"Author {author_number} mismatch", cited_author, correct_author)
+def format_first_author_mismatch(cited_author: str, correct_author: str) -> str:
+    """
+    Format a three-line first author mismatch message.
+    Args:
+        cited_author: The cited first author name
+        correct_author: The correct first author name
+    Returns:
+        Formatted three-line first author mismatch message
+    """
+    return format_three_line_mismatch("First author mismatch", cited_author, correct_author)
+def format_author_count_mismatch(cited_count: int, correct_count: int, cited_authors: list, correct_authors: list) -> str:
+    """
+    Format an author count mismatch message showing all cited and correct authors.
+    Args:
+        cited_count: Number of cited authors
+        correct_count: Number of correct authors
+        cited_authors: List of cited author names
+        correct_authors: List of correct author names
+    Returns:
+        Formatted multi-line author count mismatch message
+    """
+    # Create the header with count information
+    header = f"Author count mismatch: {cited_count} cited vs {correct_count} correct"
+    # Format author lists
+    cited_list = ", ".join(cited_authors) if cited_authors else "None"
+    correct_list = ", ".join(correct_authors) if correct_authors else "None"
+    # Use the same format as other mismatches
+    return format_three_line_mismatch(header, cited_list, correct_list)
 def format_authors_list(authors: List[Dict[str, str]]) -> str:
     """
     Format a list of author dictionaries into a readable string.

utils/text_utils.py CHANGED Viewed

@@ -554,6 +554,10 @@ def clean_title_basic(title):
     # Remove trailing punctuation
     title = re.sub(r'[.,;:]+$', '', title)
+    # Remove BibTeX publication type indicators at the end (common in Chinese and some international BibTeX styles)
+    # [J] = Journal, [C] = Conference, [M] = Monograph/Book, [D] = Dissertation, [P] = Patent, [R] = Report, [S] = Standard
+    title = re.sub(r'\s*\[[JCMDPRS]\]\s*$', '', title)
     return title
@@ -578,6 +582,9 @@ def clean_title_for_search(title):
     title = title.replace('\n', ' ').strip()
     title = re.sub(r'\s+', ' ', title)  # Normalize whitespace only
+    # Remove BibTeX publication type indicators that are not part of the actual title
+    title = re.sub(r'\s*\[[JCMDPRS]\]\s*$', '', title)
     # Note: We intentionally preserve:
     # - Capitalization (helps with exact matching)
     # - Colons and other meaningful punctuation (structural markers)
@@ -2076,6 +2083,8 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
     # The key insight: if the citation has "et al", we should only verify the listed authors
     # and not penalize for the authoritative source having more authors
     if has_et_al:
+        # Import here to avoid circular imports
+        from utils.error_utils import format_author_mismatch
         # For et al cases, check if each cited author matches ANY author in the correct list
         # rather than comparing positionally, since author order can vary
         for i, cited_author in enumerate(cleaned_cited):
@@ -2088,10 +2097,11 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
                     break
             if not author_found:
-                # Create a more informative error message that doesn't assume positional matching
-                # Show the full list of correct authors instead of truncating
+                # Use standardized three-line formatting for author mismatch
+                cited_display = format_author_for_display(cited_author)
                 full_author_list = ', '.join(correct_names)
-                return False, f"Author {i+1} mismatch: '{cited_author}' not found in author list (et al case). Correct authors include: {full_author_list}"
+                error_msg = format_author_mismatch(i+1, f"{cited_display} (not found in author list - et al case)", f"Correct authors: {full_author_list}")
+                return False, error_msg
         return True, f"Authors match (verified {len(cleaned_cited)} of {len(correct_names)} with et al)"
@@ -2100,7 +2110,9 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
         # For non-et-al cases, be more strict about count mismatches
         # Allow minor flexibility (1 author difference) but not more
         if abs(len(cleaned_cited) - len(correct_names)) > 1:
-            return False, f"Author count mismatch: {len(cleaned_cited)} cited vs {len(correct_names)} correct"
+            from utils.error_utils import format_author_count_mismatch
+            error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), cleaned_cited, correct_names)
+            return False, error_msg
         # Use the shorter list for comparison
         min_len = min(len(cleaned_cited), len(correct_names))
@@ -2110,6 +2122,9 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
         comparison_cited = cleaned_cited
         comparison_correct = correct_names
+    # Use shared three-line formatter (imported lazily to avoid circular imports)
+    from utils.error_utils import format_first_author_mismatch, format_author_mismatch
     # Compare first author (most important) using the enhanced name matching
     if comparison_cited and comparison_correct:
         cited_first = comparison_cited[0]
@@ -2119,7 +2134,7 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
             # Use consistent display format for both names
             cited_display = format_author_for_display(cited_first)
             correct_display = format_author_for_display(correct_first)
-            return False, f"First author mismatch: '{cited_display}' vs '{correct_display}'"
+            return False, format_first_author_mismatch(cited_display, correct_display)
     # For complete verification, check all authors if reasonable number
     if len(comparison_cited) <= 5:  # Only do full check for reasonable author counts
@@ -2128,7 +2143,7 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
                 # Use consistent display format for both names
                 cited_display = format_author_for_display(cited_author)
                 correct_display = format_author_for_display(correct_author)
-                return False, f"Author {i+1} mismatch: '{cited_display}' vs '{correct_display}'"
+                return False, format_author_mismatch(i+1, cited_display, correct_display)
     return True, "Authors match"
@@ -2512,6 +2527,97 @@ def strip_latex_commands(text):
     return text
+def extract_balanced_braces(text, start_pos):
+    """
+    Extract content from balanced braces starting at start_pos.
+    This function properly handles nested braces, which is important for LaTeX content
+    where patterns like {Jos{\'e} Meseguer} need to be extracted as complete units.
+    Args:
+        text: The text to search in
+        start_pos: Position of the opening brace
+    Returns:
+        tuple: (content, end_pos) or (None, start_pos) if no balanced content found
+    """
+    if start_pos >= len(text) or text[start_pos] != '{':
+        return None, start_pos
+    brace_count = 1
+    pos = start_pos + 1
+    while pos < len(text) and brace_count > 0:
+        if text[pos] == '{':
+            brace_count += 1
+        elif text[pos] == '}':
+            brace_count -= 1
+        pos += 1
+    if brace_count == 0:
+        return text[start_pos + 1:pos - 1], pos
+    else:
+        return None, start_pos
+def extract_bibinfo_person_content(text):
+    """
+    Extract all person names from \\bibinfo{person}{...} with proper brace handling.
+    This function correctly handles nested braces in author names, such as:
+    \\bibinfo{person}{Jos{\\'e} Meseguer}
+    Args:
+        text: Text containing \\bibinfo{person}{...} patterns
+    Returns:
+        list: List of extracted person names with balanced braces preserved
+    """
+    return extract_bibinfo_field_content(text, 'person', return_all=True)
+def extract_bibinfo_field_content(text, field_type, return_all=False):
+    """
+    Extract content from \\bibinfo{field_type}{...} with proper brace handling.
+    This function correctly handles nested braces in field content, such as:
+    \\bibinfo{journal}{\\emph{Commun. ACM}}
+    Args:
+        text: Text containing \\bibinfo{field_type}{...} patterns
+        field_type: The field type to extract (e.g., 'person', 'journal', 'title')
+        return_all: If True, return list of all matches; if False, return first match or None
+    Returns:
+        list or str or None: Extracted content based on return_all parameter
+    """
+    pattern = f'\\\\bibinfo\\{{{re.escape(field_type)}\\}}\\{{'
+    matches = []
+    pos = 0
+    while True:
+        match = re.search(pattern, text[pos:])
+        if not match:
+            break
+        # Find the start of the content braces
+        brace_start = pos + match.end() - 1  # -1 because we want the opening brace
+        content, end_pos = extract_balanced_braces(text, brace_start)
+        if content is not None:
+            matches.append(content)
+            pos = end_pos
+            if not return_all:
+                break  # Return first match only
+        else:
+            pos += match.end()
+    if return_all:
+        return matches
+    else:
+        return matches[0] if matches else None
 def extract_cited_keys_from_latex(tex_content):
     r"""
     Extract citation keys from LaTeX content by finding \cite{} commands.
@@ -2936,8 +3042,8 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-ar
                     if brace_count == 0:
                         author_content = content[start_pos:pos-1]
-                        # Extract individual authors from \bibinfo{person}{Name} tags
-                        person_matches = re.findall(r'\\bibinfo\{person\}\{([^}]+)\}', author_content)
+                        # Extract individual authors from \bibinfo{person}{Name} tags using balanced brace extraction
+                        person_matches = extract_bibinfo_person_content(author_content)
                         if person_matches:
                             # Clean and format author names
                             authors = []
@@ -4594,7 +4700,7 @@ def normalize_venue_for_display(venue: str) -> str:
     prefixes_to_remove = [
         r'^\d{4}\s+\d+(st|nd|rd|th)\s+',  # "2012 IEEE/RSJ"
         r'^\d{4}\s+',                     # "2024 "
-        r'^proceedings\s+(of\s+)?(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?',  # "Proceedings of the IEEE"
+        r'^proceedings\s+(of\s+)?(the\s+)?((acm|ieee|usenix|aaai|sigcomm|sigkdd|sigmod|sigops|vldb|osdi|sosp|eurosys)\s+)*(\d+(st|nd|rd|th)\s+)?',  # "Proceedings of the [ORG] [ORG] 29th"
         r'^proc\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?',        # "Proc. of the IEEE" (require "of")
         r'^procs\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?',       # "Procs. of the IEEE" (require "of")
         r'^in\s+',

utils/url_utils.py CHANGED Viewed

@@ -214,6 +214,7 @@ def clean_url(url: str) -> str:
     This function handles:
     - Whitespace trimming
     - Malformed LaTeX URL wrappers like \\url{https://...}
+    - Markdown-style links like [text](url)
     - Trailing punctuation from academic references
     - DOI URL query parameter cleanup
@@ -237,6 +238,14 @@ def clean_url(url: str) -> str:
     if url_match:
         url = url_match.group(1)
+    # Handle markdown-style links like [text](url) or [url](url)
+    # e.g., "[https://example.com](https://example.com)" -> "https://example.com"
+    markdown_pattern = r'\[([^\]]*)\]\((https?://[^)]+)\)'
+    markdown_match = re.search(markdown_pattern, url)
+    if markdown_match:
+        # Use the URL from parentheses
+        url = markdown_match.group(2)
     # Remove trailing punctuation that's commonly part of sentence structure
     # but preserve legitimate URL characters
     url = url.rstrip('.,;!?)')
@@ -280,6 +289,14 @@ def clean_url_punctuation(url: str) -> str:
     if url_match:
         url = url_match.group(1)
+    # Handle markdown-style links like [text](url) or [url](url)
+    # e.g., "[https://example.com](https://example.com)" -> "https://example.com"
+    markdown_pattern = r'\[([^\]]*)\]\((https?://[^)]+)\)'
+    markdown_match = re.search(markdown_pattern, url)
+    if markdown_match:
+        # Use the URL from parentheses
+        url = markdown_match.group(2)
     # Remove trailing punctuation that's commonly part of sentence structure
     # but preserve legitimate URL characters
     url = url.rstrip('.,;!?)')

{academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/WHEEL RENAMED Viewed

File without changes

{academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/top_level.txt RENAMED Viewed

File without changes

academic-refchecker 1.2.43__py3-none-any.whl → 1.2.45__py3-none-any.whl

academic-refchecker 1.2.43py3-none-any.whl → 1.2.45py3-none-any.whl