academic-refchecker 1.2.44__py3-none-any.whl → 1.2.45__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.44"
3
+ __version__ = "1.2.45"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.44
3
+ Version: 1.2.45
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -78,7 +78,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
78
78
  Verified URL: https://www.semanticscholar.org/paper/5f4ac1ac7ca4b17d3db1b52d9aafd9e8b26c0d7
79
79
  ArXiv URL: https://arxiv.org/abs/1610.10099
80
80
  DOI URL: https://doi.org/10.48550/arxiv.1610.10099
81
- ⚠️ Warning: Year mismatch: cited as 2017 but actually 2016
81
+ ⚠️ Warning: Year mismatch:
82
+ cited: '2017'
83
+ actual: '2016'
82
84
 
83
85
  [2/45] Effective approaches to attention-based neural machine translation
84
86
  Minh-Thang Luong, Hieu Pham, Christopher D. Manning
@@ -87,7 +89,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
87
89
  Verified URL: https://www.semanticscholar.org/paper/93499a7c7f699b6630a86fad964536f9423bb6d0
88
90
  ArXiv URL: https://arxiv.org/abs/1508.04025
89
91
  DOI URL: https://doi.org/10.18653/v1/d15-1166
90
- ❌ Error: First author mismatch: 'Minh-Thang Luong' vs 'Thang Luong'
92
+ ❌ Error: First author mismatch:
93
+ cited: 'Minh-Thang Luong'
94
+ actual: 'Thang Luong'
91
95
 
92
96
  [3/45] Deep Residual Learning for Image Recognition
93
97
  Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
@@ -98,7 +102,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
98
102
  Verified URL: https://www.semanticscholar.org/paper/2c03df8b48bf3fa39054345bafabfeff15bfd11d
99
103
  ArXiv URL: https://arxiv.org/abs/1512.03385
100
104
  DOI URL: https://doi.org/10.1109/CVPR.2016.90
101
- ❌ Error: DOI mismatch: cited as '10.1109/CVPR.2016.91' but actually '10.1109/CVPR.2016.90'
105
+ ❌ Error: DOI mismatch:
106
+ cited: '10.1109/CVPR.2016.91'
107
+ actual: '10.1109/CVPR.2016.90'
102
108
 
103
109
  ============================================================
104
110
  📋 SUMMARY
@@ -382,7 +388,9 @@ This enhanced URL display helps users access multiple authoritative sources for
382
388
  Verified URL: https://www.semanticscholar.org/paper/a1b2c3d4e5f6789012345678901234567890abcd
383
389
  ArXiv URL: https://arxiv.org/abs/2312.02119
384
390
  DOI URL: https://doi.org/10.48550/arxiv.2312.02119
385
- ❌ Error: First author mismatch: 'T. Xie' vs 'Zhao Xu'
391
+ ❌ Error: First author mismatch:
392
+ cited: 'T. Xie'
393
+ actual: 'Zhao Xu'
386
394
  ```
387
395
  - `title`: Title discrepancies
388
396
  ```
@@ -392,7 +400,9 @@ This enhanced URL display helps users access multiple authoritative sources for
392
400
  Verified URL: https://www.semanticscholar.org/paper/df2b0e26d0599ce3e70df8a9da02e51594e0e992
393
401
  ArXiv URL: https://arxiv.org/abs/1810.04805
394
402
  DOI URL: https://doi.org/10.18653/v1/n19-1423
395
- ❌ Error: Title mismatch: cited as 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding' but actually 'BERT: Pre-training of Deep Bidirectional Transformers for Language Comprehension'
403
+ ❌ Error: Title mismatch:
404
+ cited: 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding'
405
+ actual: 'BERT: Pre-training of Deep Bidirectional Transformers for Language Comprehension'
396
406
  ```
397
407
  - `arxiv_id`: Incorrect URLs or arXiv IDs
398
408
  ```
@@ -415,7 +425,9 @@ This enhanced URL display helps users access multiple authoritative sources for
415
425
  Verified URL: https://www.semanticscholar.org/paper/204e3073870fae3d05bcbc2f6a8e263d9b72e776
416
426
  ArXiv URL: https://arxiv.org/abs/1706.03762
417
427
  DOI URL: https://doi.org/10.48550/arXiv.1706.03762
418
- ❌ Error: DOI mismatch: cited as '10.5555/3295222.3295349' but actually '10.48550/arXiv.1706.03762'
428
+ ❌ Error: DOI mismatch:
429
+ cited: '10.5555/3295222.3295349'
430
+ actual: '10.48550/arXiv.1706.03762'
419
431
  ```
420
432
 
421
433
  - **⚠️ Warnings**: Minor issues that may need attention
@@ -428,7 +440,9 @@ This enhanced URL display helps users access multiple authoritative sources for
428
440
  Verified URL: https://www.semanticscholar.org/paper/f1a2b3c4d5e6f7890123456789012345678901ab
429
441
  ArXiv URL: https://arxiv.org/abs/2310.03684
430
442
  DOI URL: https://doi.org/10.48550/arxiv.2310.03684
431
- ⚠️ Warning: Year mismatch: cited as 2024 but actually 2023
443
+ ⚠️ Warning: Year mismatch:
444
+ cited: '2024'
445
+ actual: '2023'
432
446
  ```
433
447
  - `venue`: Venue format variations
434
448
  ```
@@ -439,7 +453,9 @@ This enhanced URL display helps users access multiple authoritative sources for
439
453
  Verified URL: https://www.semanticscholar.org/paper/c1d2e3f4a5b6c7d8e9f0123456789012345678ab
440
454
  ArXiv URL: https://arxiv.org/abs/2403.02151
441
455
  DOI URL: https://doi.org/10.48550/arxiv.2403.02151
442
- ⚠️ Warning: Venue mismatch: cited as 'arXiv, 2024' but actually 'Neural Information Processing Systems'
456
+ ⚠️ Warning: Venue mismatch:
457
+ cited: 'arXiv, 2024'
458
+ actual: 'Neural Information Processing Systems'
443
459
  ```
444
460
 
445
461
  - **❓ Unverified**: References that couldn't be verified with any of the checker APIs
@@ -1,21 +1,21 @@
1
- __version__.py,sha256=k3lYUlcZL-yL2e_2u3UPBtgwqMqZJ11x7KVMZOotlE8,65
2
- academic_refchecker-1.2.44.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
1
+ __version__.py,sha256=8vdrigO4-YfHufQMfh_RQ9NlN5btmqndss2dAOLxa1Q,65
2
+ academic_refchecker-1.2.45.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
3
3
  checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
4
- checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
4
+ checkers/crossref.py,sha256=cLYmSzE8ehJ5sNko_R3fEiGBGiPH5_HxLhFM-pCfDRM,20378
5
5
  checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
6
- checkers/github_checker.py,sha256=54K6_YJW5w2GtzodnSOLfK5d1ErFJxbTOIIV5P_kFX0,13543
6
+ checkers/github_checker.py,sha256=GoepG4aRRUqAomkM4HgOSNf20BPxQgocZEpsk0ZTZZU,14003
7
7
  checkers/local_semantic_scholar.py,sha256=DgGMjmR_w_saz2UTMatEhfXbhUED9tUmDG3vlZAIzc4,20428
8
- checkers/openalex.py,sha256=GxYUH9GZ0AyF-WFKgXiFHqkalrSnySgFSkiM1PsK0VI,19757
9
- checkers/openreview_checker.py,sha256=FLh21F0Zr7Gj3BI0u-gE6IwGNOZiRcViirDBeNvUp94,20432
10
- checkers/semantic_scholar.py,sha256=BelhyIJ-W8navRdqEGpk12CIXYWmVL2Cq8HHZR7ynJs,34905
11
- checkers/webpage_checker.py,sha256=BvNwOqukTX9IeQUpUfIrI_5Gr2w9VLBt5x_PB-hKUIo,21616
8
+ checkers/openalex.py,sha256=Fbc7iscZzmXjAZxH32PDX2r2Nwo9b5Ku-Sh1Ut9KpLA,19550
9
+ checkers/openreview_checker.py,sha256=EYRcVu7sZzrFjqdOEKp72vd0S3sXSSjTgI9jsAVpdD0,20524
10
+ checkers/semantic_scholar.py,sha256=0LcVahf3twyHqaD7bQ2eJiTyg-AQ9NGvVohb9nqaHdA,34884
11
+ checkers/webpage_checker.py,sha256=woY8mNgZ4Lr9Ug53CN-Xo_2P62BTpR2u_FZyUPgTEuA,21833
12
12
  config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
13
13
  config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
14
14
  config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
15
15
  core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
16
16
  core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
17
- core/parallel_processor.py,sha256=5V2iJDBFwwryMCnCNU_oRt2u5he1wpy-_9qapC_6f00,17043
18
- core/refchecker.py,sha256=w3KNWyyaZZVL3ghFhEfro8SPs4xXEUjmCJERfZ7Du6A,273648
17
+ core/parallel_processor.py,sha256=AOnjqhBHXlSb1c-PSunat9Eug5y04gOygwbHdPUqxgk,17202
18
+ core/refchecker.py,sha256=lU6r9cKpB8Fc4Wd7vOqdqhxP9cwYEoB6D4PlYznglGY,274337
19
19
  database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
20
20
  database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
21
21
  llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -33,14 +33,14 @@ utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,1
33
33
  utils/bibtex_parser.py,sha256=a89NLy_q2kwED4QFJgxWFgPQOJBV73bIUL3RS_Urmro,15231
34
34
  utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
35
35
  utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
36
- utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
37
- utils/error_utils.py,sha256=JqnRg4z-O9GcJ1eJGeTMzmOQwPWbWo2Lf6Duwj-ymHQ,6258
36
+ utils/doi_utils.py,sha256=qAf2slTLkgBwwnkYsWMkwMSDf6q8wNqd2YwNxhQudEc,3995
37
+ utils/error_utils.py,sha256=wrLQhVJIIWUzcTSOLVvrvCAtMRNXt_1YYL5OXYnYf0A,11085
38
38
  utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
39
- utils/text_utils.py,sha256=uEwKasw3aTVgIDHbDJDSOcTUbPwfiivIdhKwmxQJy0U,206378
39
+ utils/text_utils.py,sha256=T67Y-HSNokj-mOcdCtOcULNviBxyaG9xTjRd_l9titI,210088
40
40
  utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
41
41
  utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
42
- academic_refchecker-1.2.44.dist-info/METADATA,sha256=ueA0mwKqmiqhR9WBLyPy2W40wfJc4JRiWSTbrQHKU14,22298
43
- academic_refchecker-1.2.44.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
- academic_refchecker-1.2.44.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
45
- academic_refchecker-1.2.44.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
46
- academic_refchecker-1.2.44.dist-info/RECORD,,
42
+ academic_refchecker-1.2.45.dist-info/METADATA,sha256=mY4M9FRaDKcyS5yOFvR3X0Y0bj47_YmZeMayvrrpS38,22576
43
+ academic_refchecker-1.2.45.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
+ academic_refchecker-1.2.45.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
45
+ academic_refchecker-1.2.45.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
46
+ academic_refchecker-1.2.45.dist-info/RECORD,,
checkers/crossref.py CHANGED
@@ -31,6 +31,7 @@ import re
31
31
  from typing import Dict, List, Tuple, Optional, Any, Union
32
32
  from urllib.parse import quote_plus
33
33
  from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
34
+ from utils.error_utils import format_year_mismatch, format_doi_mismatch
34
35
  from config.settings import get_config
35
36
 
36
37
  # Set up logging
@@ -478,21 +479,19 @@ class CrossRefReferenceChecker:
478
479
  if year and work_year and year != work_year:
479
480
  errors.append({
480
481
  'warning_type': 'year',
481
- 'warning_details': f"Year mismatch: cited as {year} but actually {work_year}",
482
+ 'warning_details': format_year_mismatch(year, work_year),
482
483
  'ref_year_correct': work_year
483
484
  })
484
485
 
485
486
  # Verify DOI
486
487
  work_doi = work_data.get('DOI')
487
488
  if doi and work_doi:
488
- # Normalize DOIs for comparison (remove URL prefix and trailing periods)
489
- cited_doi_clean = doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
490
- work_doi_clean = work_doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
491
-
492
- if cited_doi_clean.lower() != work_doi_clean.lower():
489
+ # Compare DOIs using the proper comparison function
490
+ from utils.doi_utils import compare_dois
491
+ if not compare_dois(doi, work_doi):
493
492
  errors.append({
494
493
  'error_type': 'doi',
495
- 'error_details': f"DOI mismatch: cited as {doi} but actually {work_doi}",
494
+ 'error_details': format_doi_mismatch(doi, work_doi),
496
495
  'ref_doi_correct': work_doi
497
496
  })
498
497
 
@@ -169,9 +169,14 @@ class GitHubChecker:
169
169
  if cited_title:
170
170
  title_match = self._check_title_match(cited_title, actual_name, actual_description)
171
171
  if not title_match:
172
+ from utils.error_utils import format_title_mismatch
173
+ details = format_title_mismatch(cited_title, actual_name)
174
+ if actual_description:
175
+ snippet = actual_description[:100] + ('...' if len(actual_description) > 100 else '')
176
+ details += f" ({snippet})"
172
177
  errors.append({
173
178
  "warning_type": "title",
174
- "warning_details": f"Title mismatch: cited as '{cited_title}' but repository is '{actual_name}' ({actual_description[:100]}{'...' if len(actual_description) > 100 else ''})"
179
+ "warning_details": details
175
180
  })
176
181
 
177
182
  # Verify authors
@@ -180,9 +185,13 @@ class GitHubChecker:
180
185
  author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
181
186
  author_match = self._check_author_match(author_str, actual_owner, actual_owner_name)
182
187
  if not author_match:
188
+ from utils.error_utils import format_three_line_mismatch
189
+ left = author_str
190
+ right = f"{actual_owner} ({actual_owner_name})" if actual_owner_name else actual_owner
191
+ details = format_three_line_mismatch("Author mismatch", left, right)
183
192
  errors.append({
184
193
  "warning_type": "author",
185
- "warning_details": f"Author mismatch: cited as '{author_str}' but repository owner is '{actual_owner}' ({actual_owner_name})"
194
+ "warning_details": details
186
195
  })
187
196
 
188
197
  # Verify year
@@ -191,9 +200,10 @@ class GitHubChecker:
191
200
  try:
192
201
  cited_year_int = int(cited_year)
193
202
  if cited_year_int < creation_year:
203
+ from utils.error_utils import format_year_mismatch
194
204
  errors.append({
195
205
  "warning_type": "year",
196
- "warning_details": f"Year mismatch: cited as {cited_year} but repository created in {creation_year}",
206
+ "warning_details": format_year_mismatch(cited_year, creation_year),
197
207
  "ref_year_correct": str(creation_year)
198
208
  })
199
209
  except (ValueError, TypeError):
checkers/openalex.py CHANGED
@@ -33,6 +33,7 @@ import re
33
33
  from typing import Dict, List, Tuple, Optional, Any, Union
34
34
  from urllib.parse import quote_plus
35
35
  from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
36
+ from utils.error_utils import format_year_mismatch, format_doi_mismatch
36
37
  from config.settings import get_config
37
38
 
38
39
  # Set up logging
@@ -448,7 +449,7 @@ class OpenAlexReferenceChecker:
448
449
  if year and work_year and year != work_year:
449
450
  errors.append({
450
451
  'warning_type': 'year',
451
- 'warning_details': f"Year mismatch: cited as {year} but actually {work_year}",
452
+ 'warning_details': format_year_mismatch(year, work_year),
452
453
  'ref_year_correct': work_year
453
454
  })
454
455
 
@@ -458,14 +459,12 @@ class OpenAlexReferenceChecker:
458
459
  work_doi = work_data['ids']['doi']
459
460
 
460
461
  if doi and work_doi:
461
- # Normalize DOIs for comparison (remove URL prefix and trailing periods)
462
- cited_doi_clean = doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
463
- work_doi_clean = work_doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
464
-
465
- if cited_doi_clean.lower() != work_doi_clean.lower():
462
+ # Compare DOIs using the proper comparison function
463
+ from utils.doi_utils import compare_dois
464
+ if not compare_dois(doi, work_doi):
466
465
  errors.append({
467
466
  'error_type': 'doi',
468
- 'error_details': f"DOI mismatch: cited as {doi} but actually {work_doi}",
467
+ 'error_details': format_doi_mismatch(doi, work_doi),
469
468
  'ref_doi_correct': work_doi
470
469
  })
471
470
 
@@ -425,9 +425,11 @@ class OpenReviewReferenceChecker:
425
425
  if cited_title and paper_title:
426
426
  similarity = calculate_title_similarity(cited_title, paper_title)
427
427
  if similarity < 0.7: # Using a reasonable threshold
428
+ from utils.error_utils import format_title_mismatch
429
+ details = format_title_mismatch(cited_title, paper_title) + f" (similarity: {similarity:.2f})"
428
430
  errors.append({
429
431
  "warning_type": "title",
430
- "warning_details": f"Title mismatch: cited as '{cited_title}' but OpenReview shows '{paper_title}' (similarity: {similarity:.2f})"
432
+ "warning_details": details
431
433
  })
432
434
 
433
435
  # Check authors
@@ -460,9 +462,10 @@ class OpenReviewReferenceChecker:
460
462
 
461
463
  is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
462
464
  if is_different and year_message:
465
+ from utils.error_utils import format_year_mismatch
463
466
  errors.append({
464
467
  "warning_type": "year",
465
- "warning_details": year_message
468
+ "warning_details": format_year_mismatch(cited_year_int, paper_year_int)
466
469
  })
467
470
  except (ValueError, TypeError):
468
471
  pass # Skip year validation if conversion fails
@@ -473,10 +476,10 @@ class OpenReviewReferenceChecker:
473
476
 
474
477
  if cited_venue and paper_venue:
475
478
  if are_venues_substantially_different(cited_venue, paper_venue):
476
- from utils.error_utils import clean_venue_for_comparison
479
+ from utils.error_utils import format_venue_mismatch
477
480
  errors.append({
478
481
  "warning_type": "venue",
479
- "warning_details": f"Venue mismatch: cited as '{clean_venue_for_comparison(cited_venue)}' but OpenReview shows '{clean_venue_for_comparison(paper_venue)}'"
482
+ "warning_details": format_venue_mismatch(cited_venue, paper_venue)
480
483
  })
481
484
 
482
485
  # Create verified data structure
@@ -29,6 +29,7 @@ import logging
29
29
  import re
30
30
  from typing import Dict, List, Tuple, Optional, Any, Union
31
31
  from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search
32
+ from utils.error_utils import format_title_mismatch
32
33
  from config.settings import get_config
33
34
 
34
35
  # Set up logging
@@ -471,7 +472,7 @@ class NonArxivReferenceChecker:
471
472
  if found_title and title_similarity < SIMILARITY_THRESHOLD:
472
473
  errors.append({
473
474
  'error_type': 'title',
474
- 'error_details': f"Title mismatch: cited as '{title}' but actually '{found_title}'",
475
+ 'error_details': format_title_mismatch(title, found_title),
475
476
  'ref_title_correct': paper_data.get('title', '')
476
477
  })
477
478
 
@@ -525,9 +526,10 @@ class NonArxivReferenceChecker:
525
526
  is_different, warning_message = is_year_substantially_different(year, paper_year, context)
526
527
 
527
528
  if is_different and warning_message:
529
+ from utils.error_utils import format_year_mismatch
528
530
  errors.append({
529
531
  'warning_type': 'year',
530
- 'warning_details': warning_message,
532
+ 'warning_details': format_year_mismatch(year, paper_year),
531
533
  'ref_year_correct': paper_year
532
534
  })
533
535
 
@@ -591,14 +593,13 @@ class NonArxivReferenceChecker:
591
593
  if external_ids and 'DOI' in external_ids:
592
594
  paper_doi = external_ids['DOI']
593
595
 
594
- # Compare DOIs, but strip hash fragments and trailing periods for comparison
595
- cited_doi_clean = doi.split('#')[0].rstrip('.') if doi else ''
596
- paper_doi_clean = paper_doi.split('#')[0].rstrip('.') if paper_doi else ''
597
-
598
- if cited_doi_clean and paper_doi_clean and cited_doi_clean.lower() != paper_doi_clean.lower():
596
+ # Compare DOIs using the proper comparison function
597
+ from utils.doi_utils import compare_dois
598
+ if doi and paper_doi and not compare_dois(doi, paper_doi):
599
+ from utils.error_utils import format_doi_mismatch
599
600
  errors.append({
600
601
  'error_type': 'doi',
601
- 'error_details': f"DOI mismatch: cited as {doi} but actually {paper_doi}",
602
+ 'error_details': format_doi_mismatch(doi, paper_doi),
602
603
  'ref_doi_correct': paper_doi
603
604
  })
604
605
 
@@ -182,9 +182,10 @@ class WebPageChecker:
182
182
  # Check title match
183
183
  if cited_title and page_title:
184
184
  if not self._check_title_match(cited_title, page_title, page_description):
185
+ from utils.error_utils import format_title_mismatch
185
186
  errors.append({
186
187
  "warning_type": "title",
187
- "warning_details": f"Title mismatch: cited as '{cited_title}' but page title is '{page_title}'"
188
+ "warning_details": format_title_mismatch(cited_title, page_title)
188
189
  })
189
190
 
190
191
  # Check if this is a documentation page for the cited topic
@@ -201,9 +202,13 @@ class WebPageChecker:
201
202
  if cited_authors:
202
203
  author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
203
204
  if not self._check_author_match(author_str, site_info, web_url):
205
+ from utils.error_utils import format_three_line_mismatch
206
+ left = author_str
207
+ right = site_info.get('organization', 'unknown')
208
+ details = format_three_line_mismatch("Author/organization mismatch", left, right)
204
209
  errors.append({
205
210
  "warning_type": "author",
206
- "warning_details": f"Author/organization mismatch: cited as '{author_str}' but page is from '{site_info.get('organization', 'unknown')}'"
211
+ "warning_details": details
207
212
  })
208
213
 
209
214
  logger.debug(f"Web page verification completed for: {web_url}")
@@ -352,12 +352,15 @@ class ParallelReferenceProcessor:
352
352
  error_type = error.get('error_type') or error.get('warning_type')
353
353
  error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
354
354
 
355
+ from utils.error_utils import print_labeled_multiline
356
+
355
357
  if error_type == 'arxiv_id':
358
+ # Keep existing style for arXiv ID errors
356
359
  print(f" ❌ {error_details}")
357
360
  elif 'error_type' in error:
358
- print(f" ❌ Error: {error_details}")
361
+ print_labeled_multiline("❌ Error", error_details)
359
362
  else:
360
- print(f" ⚠️ Warning: {error_details}")
363
+ print_labeled_multiline("⚠️ Warning", error_details)
361
364
 
362
365
  # Show timing info for slow references
363
366
  if result.processing_time > 5.0:
core/refchecker.py CHANGED
@@ -1900,10 +1900,11 @@ class ArxivReferenceChecker:
1900
1900
  db_title = self.non_arxiv_checker.normalize_paper_title(paper_data.get('title'))
1901
1901
 
1902
1902
  if normalized_title != db_title:
1903
+ from utils.error_utils import format_title_mismatch
1903
1904
  logger.debug(f"DB Verification: Title mismatch - cited: '{title}', actual: '{paper_data.get('title')}'")
1904
1905
  errors.append({
1905
1906
  'error_type': 'title',
1906
- 'error_details': f"Title mismatch: cited as '{title}' but actually '{paper_data.get('title')}'",
1907
+ 'error_details': format_title_mismatch(title, paper_data.get('title')),
1907
1908
  'ref_title_correct': paper_data.get('title')
1908
1909
  })
1909
1910
 
@@ -1925,30 +1926,36 @@ class ArxivReferenceChecker:
1925
1926
  paper_year = paper_data.get('year')
1926
1927
  if year and paper_year and year != paper_year:
1927
1928
  logger.debug(f"DB Verification: Year mismatch - cited: {year}, actual: {paper_year}")
1929
+ from utils.error_utils import format_year_mismatch
1928
1930
  errors.append({
1929
1931
  'warning_type': 'year',
1930
- 'warning_details': f"Year mismatch: cited as {year} but actually {paper_year}",
1932
+ 'warning_details': format_year_mismatch(year, paper_year),
1931
1933
  'ref_year_correct': paper_year
1932
1934
  })
1933
1935
 
1934
1936
  # Verify DOI
1935
- if doi and external_ids.get('DOI') and doi.lower() != external_ids['DOI'].lower():
1936
- # Check if the cited DOI is a partial match of the actual DOI
1937
- # This handles cases like "10.1111/j.2044-8260." vs "10.1111/J.2044-8260.1997.TB01237.X"
1938
- cited_doi_clean = doi.lower().rstrip('.')
1939
- actual_doi_clean = external_ids['DOI'].lower().rstrip('.')
1940
-
1941
- # If the cited DOI is a prefix of the actual DOI, it's likely a partial citation
1942
- # Only flag as error if it's not a reasonable partial match
1943
- if not actual_doi_clean.startswith(cited_doi_clean):
1944
- logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
1945
- errors.append({
1946
- 'error_type': 'doi',
1947
- 'error_details': f"DOI mismatch: cited as {doi} but actually {external_ids['DOI']}",
1948
- 'ref_doi_correct': external_ids['DOI']
1949
- })
1950
- else:
1951
- logger.debug(f"DB Verification: DOI partial match - cited: {doi}, actual: {external_ids['DOI']} (acceptable)")
1937
+ if doi and external_ids.get('DOI'):
1938
+ from utils.doi_utils import compare_dois, normalize_doi
1939
+
1940
+ # Use proper DOI comparison first
1941
+ if not compare_dois(doi, external_ids['DOI']):
1942
+ # Check if the cited DOI is a partial match of the actual DOI
1943
+ # This handles cases like "10.1111/j.2044-8260." vs "10.1111/J.2044-8260.1997.TB01237.X"
1944
+ cited_doi_normalized = normalize_doi(doi)
1945
+ actual_doi_normalized = normalize_doi(external_ids['DOI'])
1946
+
1947
+ # If the cited DOI is a prefix of the actual DOI, it's likely a partial citation
1948
+ # Only flag as error if it's not a reasonable partial match
1949
+ if not actual_doi_normalized.startswith(cited_doi_normalized.rstrip('.')):
1950
+ logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
1951
+ from utils.error_utils import format_doi_mismatch
1952
+ errors.append({
1953
+ 'error_type': 'doi',
1954
+ 'error_details': format_doi_mismatch(doi, external_ids['DOI']),
1955
+ 'ref_doi_correct': external_ids['DOI']
1956
+ })
1957
+ else:
1958
+ logger.debug(f"DB Verification: DOI partial match - cited: {doi}, actual: {external_ids['DOI']} (acceptable)")
1952
1959
 
1953
1960
  # Verify ArXiv ID
1954
1961
  if reference.get('type') == 'arxiv':
@@ -3489,8 +3496,9 @@ class ArxivReferenceChecker:
3489
3496
  author_field_match = re.search(r'\\bibfield\{author\}\{(.*?)\}(?:\s*\\bibinfo\{year\}|\s*\\newblock|$)', content, re.DOTALL)
3490
3497
  if author_field_match:
3491
3498
  author_content = author_field_match.group(1)
3492
- # Find all \bibinfo{person}{Name} entries
3493
- person_matches = re.findall(r'\\bibinfo\{person\}\{([^}]+)\}', author_content)
3499
+ # Find all \bibinfo{person}{Name} entries using balanced brace extraction
3500
+ from utils.text_utils import extract_bibinfo_person_content
3501
+ person_matches = extract_bibinfo_person_content(author_content)
3494
3502
  if person_matches:
3495
3503
  authors = []
3496
3504
  for person in person_matches:
@@ -3502,33 +3510,31 @@ class ArxivReferenceChecker:
3502
3510
  authors.append(clean_name)
3503
3511
  ref['authors'] = authors
3504
3512
 
3505
- # Extract title from \bibinfo{title}{Title}
3506
- title_match = re.search(r'\\bibinfo\{title\}\{([^}]+)\}', content)
3507
- if title_match:
3508
- title = strip_latex_commands(title_match.group(1)).strip()
3513
+ # Import balanced brace extraction function
3514
+ from utils.text_utils import extract_bibinfo_field_content
3515
+
3516
+ # Extract title from \bibinfo{title}{Title} using balanced brace extraction
3517
+ title_content = extract_bibinfo_field_content(content, 'title')
3518
+ if title_content:
3519
+ title = strip_latex_commands(title_content).strip()
3509
3520
  ref['title'] = title
3510
3521
 
3511
- # Extract venue/journal from various fields
3512
- venue_patterns = [
3513
- r'\\bibinfo\{booktitle\}\{([^}]+)\}',
3514
- r'\\bibinfo\{journal\}\{([^}]+)\}',
3515
- r'\\bibinfo\{series\}\{([^}]+)\}',
3516
- r'\\bibinfo\{note\}\{([^}]+)\}'
3517
- ]
3522
+ # Extract venue/journal from various fields using balanced brace extraction
3523
+ venue_field_types = ['booktitle', 'journal', 'series', 'note']
3518
3524
 
3519
- for pattern in venue_patterns:
3520
- venue_match = re.search(pattern, content)
3521
- if venue_match:
3522
- venue = strip_latex_commands(venue_match.group(1)).strip()
3525
+ for field_type in venue_field_types:
3526
+ venue_content = extract_bibinfo_field_content(content, field_type)
3527
+ if venue_content:
3528
+ venue = strip_latex_commands(venue_content).strip()
3523
3529
  if venue:
3524
3530
  ref['venue'] = venue
3525
3531
  ref['journal'] = venue # For compatibility
3526
3532
  break
3527
3533
 
3528
- # Extract DOI
3529
- doi_match = re.search(r'\\bibinfo\{doi\}\{([^}]+)\}', content)
3530
- if doi_match:
3531
- ref['doi'] = doi_match.group(1).strip()
3534
+ # Extract DOI using balanced brace extraction
3535
+ doi_content = extract_bibinfo_field_content(content, 'doi')
3536
+ if doi_content:
3537
+ ref['doi'] = doi_content.strip()
3532
3538
 
3533
3539
  # Extract ArXiv ID from \showeprint[arxiv]{ID}
3534
3540
  arxiv_match = re.search(r'\\showeprint\[arxiv\]\{([^}]+)\}', content)
@@ -5048,7 +5054,8 @@ class ArxivReferenceChecker:
5048
5054
  correct_first = correct_authors[0]
5049
5055
 
5050
5056
  if not enhanced_name_match(cited_first, correct_first):
5051
- return False, f"First author mismatch: '{cited_first}' vs '{correct_first}'"
5057
+ from utils.error_utils import format_first_author_mismatch
5058
+ return False, format_first_author_mismatch(cited_first, correct_first)
5052
5059
 
5053
5060
  return True, "Authors match"
5054
5061
 
@@ -5454,12 +5461,14 @@ class ArxivReferenceChecker:
5454
5461
  error_type = error.get('error_type') or error.get('warning_type')
5455
5462
  error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
5456
5463
 
5464
+ from utils.error_utils import print_labeled_multiline
5465
+
5457
5466
  if error_type == 'arxiv_id':
5458
5467
  print(f" ❌ {error_details}")
5459
5468
  elif 'error_type' in error:
5460
- print(f" ❌ Error: {error_details}")
5469
+ print_labeled_multiline("❌ Error", error_details)
5461
5470
  else:
5462
- print(f" ⚠️ Warning: {error_details}")
5471
+ print_labeled_multiline("⚠️ Warning", error_details)
5463
5472
 
5464
5473
  def _output_reference_errors(self, reference, errors, url):
5465
5474
  """
utils/doi_utils.py CHANGED
@@ -99,6 +99,10 @@ def compare_dois(doi1: str, doi2: str) -> bool:
99
99
  """
100
100
  Compare two DOIs for equality, handling different formats and prefixes.
101
101
 
102
+ This function performs exact matching after normalization, which means
103
+ DOIs are only considered equal if they are identical after removing
104
+ prefixes, case differences, and punctuation.
105
+
102
106
  Args:
103
107
  doi1: First DOI to compare
104
108
  doi2: Second DOI to compare
@@ -109,21 +113,11 @@ def compare_dois(doi1: str, doi2: str) -> bool:
109
113
  if not doi1 or not doi2:
110
114
  return False
111
115
 
112
- # Normalize both DOIs (already converted to lowercase)
116
+ # Normalize both DOIs (handles prefixes, case, punctuation)
113
117
  norm_doi1 = normalize_doi(doi1)
114
118
  norm_doi2 = normalize_doi(doi2)
115
119
 
116
- # If DOIs are identical, they match
117
- if norm_doi1 == norm_doi2:
118
- return True
119
-
120
- # Check if first two components match (publisher.registrant)
121
- doi1_parts = norm_doi1.split('.')
122
- doi2_parts = norm_doi2.split('.')
123
-
124
- if len(doi1_parts) >= 2 and len(doi2_parts) >= 2:
125
- return doi1_parts[0] == doi2_parts[0] and doi1_parts[1].split('/')[0] == doi2_parts[1].split('/')[0]
126
-
120
+ # DOIs must be exactly identical after normalization
127
121
  return norm_doi1 == norm_doi2
128
122
 
129
123
 
utils/error_utils.py CHANGED
@@ -9,6 +9,86 @@ for reference checkers.
9
9
  from typing import Dict, List, Any, Optional
10
10
 
11
11
 
12
def print_labeled_multiline(label: str, text: Optional[str]) -> None:
    """
    Print a (possibly multi-line) message behind a label.

    The first line of ``text`` is printed directly after the label prefix.
    Every following line is printed behind one fixed run of spaces rather
    than an indent computed from the label, so continuation lines start at
    the same column whatever label (and emoji) is used.

    Args:
        label: The label (e.g., "❌ Error", "⚠️ Warning")
        text: The message to print; may contain newlines. ``None`` or an
            empty string prints the label prefix on its own.
    """
    prefix = f" {label}: "
    # splitlines() returns [] for empty input; fall back to a single empty
    # line so the label is still printed.
    lines = (text or "").splitlines() or [""]

    # First line carries the label prefix
    print(prefix + lines[0])

    # Continuation lines share one fixed 15-space indent, independent of the
    # label's length or emoji display width.
    fixed_indent = " " * 15
    for line in lines[1:]:
        print(fixed_indent + line)
34
+
35
+
36
def format_three_line_mismatch(mismatch_type: str, left: str, right: str) -> str:
    """
    Format a mismatch as a three-line message.

    The first line is the mismatch type terminated by exactly one colon,
    the second line the cited value and the third line the actual value.

    Example:
        Title mismatch:
        cited: 'Cited Title'
        actual: 'Correct Title'

    No indentation is added to the second and third lines here.

    Args:
        mismatch_type: The type of mismatch (e.g., "Author 2 mismatch", "Title mismatch").
            A trailing colon and trailing whitespace are both tolerated.
        left: The cited/incorrect value
        right: The correct value

    Returns:
        Three-line formatted mismatch message
    """
    # Strip trailing whitespace *before* looking for the colon; checking first
    # would turn "Title mismatch: " into "Title mismatch::".
    header = mismatch_type.rstrip()
    if not header.endswith(":"):
        header += ":"

    return f"{header}\ncited: '{left}'\nactual: '{right}'"
65
+
66
+
67
def format_title_mismatch(cited_title: str, verified_title: str) -> str:
    """
    Format a three-line title mismatch message.

    Delegates to ``format_three_line_mismatch`` with the "Title mismatch" label.

    Output format:
        Title mismatch:
        cited: 'Cited Title'
        actual: 'Correct Title'

    Args:
        cited_title: The title as it appears in the citation
        verified_title: The title reported by the verification source

    Returns:
        Three-line formatted title mismatch message
    """
    return format_three_line_mismatch("Title mismatch", cited_title, verified_title)
77
+
78
+
79
def format_year_mismatch(cited_year: int | str, correct_year: int | str) -> str:
    """
    Build the three-line "Year mismatch" message.

    Both years are converted with ``str()`` and then passed to
    ``format_three_line_mismatch``.

    Args:
        cited_year: Year given in the citation
        correct_year: Year reported by the verification source

    Returns:
        Three-line formatted year mismatch message
    """
    cited_text = str(cited_year)
    correct_text = str(correct_year)
    return format_three_line_mismatch("Year mismatch", cited_text, correct_text)
84
+
85
+
86
def format_doi_mismatch(cited_doi: str, correct_doi: str) -> str:
    """
    Build the three-line "DOI mismatch" message.

    Both DOIs are converted with ``str()`` and then passed to
    ``format_three_line_mismatch``.

    Args:
        cited_doi: DOI given in the citation
        correct_doi: DOI reported by the verification source

    Returns:
        Three-line formatted DOI mismatch message
    """
    cited_text = str(cited_doi)
    correct_text = str(correct_doi)
    return format_three_line_mismatch("DOI mismatch", cited_text, correct_text)
91
+
12
92
  def create_author_error(error_details: str, correct_authors: List[Dict[str, str]]) -> Dict[str, str]:
13
93
  """
14
94
  Create a standardized author error dictionary.
@@ -40,7 +120,7 @@ def create_year_warning(cited_year: int, correct_year: int) -> Dict[str, Any]:
40
120
  """
41
121
  return {
42
122
  'warning_type': 'year',
43
- 'warning_details': f"Year mismatch: cited as {cited_year} but actually {correct_year}",
123
+ 'warning_details': format_year_mismatch(cited_year, correct_year),
44
124
  'ref_year_correct': correct_year
45
125
  }
46
126
 
@@ -64,7 +144,7 @@ def create_doi_error(cited_doi: str, correct_doi: str) -> Optional[Dict[str, str
64
144
  if cited_doi_clean != correct_doi_clean:
65
145
  return {
66
146
  'error_type': 'doi',
67
- 'error_details': f"DOI mismatch: cited as {cited_doi} but actually {correct_doi}",
147
+ 'error_details': format_doi_mismatch(cited_doi, correct_doi),
68
148
  'ref_doi_correct': correct_doi
69
149
  }
70
150
 
@@ -120,11 +200,20 @@ def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]
120
200
 
121
201
  return {
122
202
  'warning_type': 'venue',
123
- 'warning_details': f"Venue mismatch: cited as '{clean_cited}' but actually '{clean_correct}'",
203
+ 'warning_details': format_three_line_mismatch("Venue mismatch", clean_cited, clean_correct),
124
204
  'ref_venue_correct': correct_venue
125
205
  }
126
206
 
127
207
 
208
def format_venue_mismatch(cited_venue: str, verified_venue: str) -> str:
    """
    Build the three-line "Venue mismatch" message.

    Each venue name is first passed through ``clean_venue_for_comparison``;
    the cleaned names are what appear in the message.

    Args:
        cited_venue: Venue given in the citation
        verified_venue: Venue reported by the verification source

    Returns:
        Three-line formatted venue mismatch message
    """
    return format_three_line_mismatch(
        "Venue mismatch",
        clean_venue_for_comparison(cited_venue),
        clean_venue_for_comparison(verified_venue),
    )
215
+
216
+
128
217
  def create_url_error(error_details: str, correct_url: Optional[str] = None) -> Dict[str, str]:
129
218
  """
130
219
  Create a standardized URL error dictionary.
@@ -189,6 +278,59 @@ def create_generic_warning(warning_type: str, warning_details: str, **kwargs) ->
189
278
  return warning_dict
190
279
 
191
280
 
281
def format_author_mismatch(author_number: int, cited_author: str, correct_author: str) -> str:
    """
    Build the three-line mismatch message for one author position.

    Args:
        author_number: The author position (1-based), embedded in the label
            as "Author <n> mismatch"
        cited_author: The cited author name
        correct_author: The correct author name

    Returns:
        Three-line formatted author mismatch message
    """
    label = f"Author {author_number} mismatch"
    return format_three_line_mismatch(label, cited_author, correct_author)
294
+
295
+
296
def format_first_author_mismatch(cited_author: str, correct_author: str) -> str:
    """
    Build the three-line "First author mismatch" message.

    Args:
        cited_author: The cited first author name
        correct_author: The correct first author name

    Returns:
        Three-line formatted first author mismatch message
    """
    label = "First author mismatch"
    return format_three_line_mismatch(label, cited_author, correct_author)
308
+
309
+
310
def format_author_count_mismatch(cited_count: int, correct_count: int, cited_authors: list, correct_authors: list) -> str:
    """
    Build an author count mismatch message listing both author lists.

    The header line reports the two counts; the cited and correct lines hold
    the author names joined with ", ", or the literal "None" when a list is
    empty.

    Args:
        cited_count: Number of cited authors
        correct_count: Number of correct authors
        cited_authors: List of cited author names
        correct_authors: List of correct author names

    Returns:
        Formatted multi-line author count mismatch message
    """
    summary = f"Author count mismatch: {cited_count} cited vs {correct_count} correct"

    # Render each list the same way; an empty (or missing) list becomes "None".
    cited_text, correct_text = [
        ", ".join(names) if names else "None"
        for names in (cited_authors, correct_authors)
    ]

    return format_three_line_mismatch(summary, cited_text, correct_text)
332
+
333
+
192
334
  def format_authors_list(authors: List[Dict[str, str]]) -> str:
193
335
  """
194
336
  Format a list of author dictionaries into a readable string.
utils/text_utils.py CHANGED
@@ -554,6 +554,10 @@ def clean_title_basic(title):
554
554
  # Remove trailing punctuation
555
555
  title = re.sub(r'[.,;:]+$', '', title)
556
556
 
557
+ # Remove BibTeX publication type indicators at the end (common in Chinese and some international BibTeX styles)
558
+ # [J] = Journal, [C] = Conference, [M] = Monograph/Book, [D] = Dissertation, [P] = Patent, [R] = Report, [S] = Standard
559
+ title = re.sub(r'\s*\[[JCMDPRS]\]\s*$', '', title)
560
+
557
561
  return title
558
562
 
559
563
 
@@ -578,6 +582,9 @@ def clean_title_for_search(title):
578
582
  title = title.replace('\n', ' ').strip()
579
583
  title = re.sub(r'\s+', ' ', title) # Normalize whitespace only
580
584
 
585
+ # Remove BibTeX publication type indicators that are not part of the actual title
586
+ title = re.sub(r'\s*\[[JCMDPRS]\]\s*$', '', title)
587
+
581
588
  # Note: We intentionally preserve:
582
589
  # - Capitalization (helps with exact matching)
583
590
  # - Colons and other meaningful punctuation (structural markers)
@@ -2076,6 +2083,8 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2076
2083
  # The key insight: if the citation has "et al", we should only verify the listed authors
2077
2084
  # and not penalize for the authoritative source having more authors
2078
2085
  if has_et_al:
2086
+ # Import here to avoid circular imports
2087
+ from utils.error_utils import format_author_mismatch
2079
2088
  # For et al cases, check if each cited author matches ANY author in the correct list
2080
2089
  # rather than comparing positionally, since author order can vary
2081
2090
  for i, cited_author in enumerate(cleaned_cited):
@@ -2088,10 +2097,11 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2088
2097
  break
2089
2098
 
2090
2099
  if not author_found:
2091
- # Create a more informative error message that doesn't assume positional matching
2092
- # Show the full list of correct authors instead of truncating
2100
+ # Use standardized three-line formatting for author mismatch
2101
+ cited_display = format_author_for_display(cited_author)
2093
2102
  full_author_list = ', '.join(correct_names)
2094
- return False, f"Author {i+1} mismatch: '{cited_author}' not found in author list (et al case). Correct authors include: {full_author_list}"
2103
+ error_msg = format_author_mismatch(i+1, f"{cited_display} (not found in author list - et al case)", f"Correct authors: {full_author_list}")
2104
+ return False, error_msg
2095
2105
 
2096
2106
  return True, f"Authors match (verified {len(cleaned_cited)} of {len(correct_names)} with et al)"
2097
2107
 
@@ -2100,7 +2110,9 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2100
2110
  # For non-et-al cases, be more strict about count mismatches
2101
2111
  # Allow minor flexibility (1 author difference) but not more
2102
2112
  if abs(len(cleaned_cited) - len(correct_names)) > 1:
2103
- return False, f"Author count mismatch: {len(cleaned_cited)} cited vs {len(correct_names)} correct"
2113
+ from utils.error_utils import format_author_count_mismatch
2114
+ error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), cleaned_cited, correct_names)
2115
+ return False, error_msg
2104
2116
 
2105
2117
  # Use the shorter list for comparison
2106
2118
  min_len = min(len(cleaned_cited), len(correct_names))
@@ -2110,6 +2122,9 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2110
2122
  comparison_cited = cleaned_cited
2111
2123
  comparison_correct = correct_names
2112
2124
 
2125
+ # Use shared three-line formatter (imported lazily to avoid circular imports)
2126
+ from utils.error_utils import format_first_author_mismatch, format_author_mismatch
2127
+
2113
2128
  # Compare first author (most important) using the enhanced name matching
2114
2129
  if comparison_cited and comparison_correct:
2115
2130
  cited_first = comparison_cited[0]
@@ -2119,7 +2134,7 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2119
2134
  # Use consistent display format for both names
2120
2135
  cited_display = format_author_for_display(cited_first)
2121
2136
  correct_display = format_author_for_display(correct_first)
2122
- return False, f"First author mismatch: '{cited_display}' vs '{correct_display}'"
2137
+ return False, format_first_author_mismatch(cited_display, correct_display)
2123
2138
 
2124
2139
  # For complete verification, check all authors if reasonable number
2125
2140
  if len(comparison_cited) <= 5: # Only do full check for reasonable author counts
@@ -2128,7 +2143,7 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2128
2143
  # Use consistent display format for both names
2129
2144
  cited_display = format_author_for_display(cited_author)
2130
2145
  correct_display = format_author_for_display(correct_author)
2131
- return False, f"Author {i+1} mismatch: '{cited_display}' vs '{correct_display}'"
2146
+ return False, format_author_mismatch(i+1, cited_display, correct_display)
2132
2147
 
2133
2148
  return True, "Authors match"
2134
2149
 
@@ -2512,6 +2527,97 @@ def strip_latex_commands(text):
2512
2527
  return text
2513
2528
 
2514
2529
 
2530
def extract_balanced_braces(text, start_pos):
    """
    Extract the content of the balanced brace group that opens at start_pos.

    Nested groups are tracked, so patterns like {Jos{\\'e} Meseguer} are
    returned as one unit. A backslash escapes the character after it, so the
    literal braces ``\\{`` and ``\\}`` are not counted as group delimiters.

    Args:
        text: The text to search in
        start_pos: Position of the opening brace

    Returns:
        tuple: (content, end_pos), where content excludes the outer braces and
        end_pos is the index just past the closing brace; or
        (None, start_pos) if start_pos is out of range, does not point at
        '{', or the group is never closed.
    """
    # A negative index would silently wrap around to the end of the string,
    # so treat it as out of range as well.
    if start_pos < 0 or start_pos >= len(text) or text[start_pos] != '{':
        return None, start_pos

    depth = 1
    pos = start_pos + 1
    end = len(text)

    while pos < end:
        char = text[pos]
        if char == '\\':
            # Skip the escaped character (\{, \}, \\, \', ...) so an escaped
            # brace never changes the nesting depth.
            pos += 2
            continue
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return text[start_pos + 1:pos], pos + 1
        pos += 1

    # Ran off the end without closing the group
    return None, start_pos
2561
+
2562
+
2563
def extract_bibinfo_person_content(text):
    """
    Collect every person name found in \\bibinfo{person}{...} entries.

    Thin wrapper around ``extract_bibinfo_field_content`` for the 'person'
    field with ``return_all=True``, so names containing nested braces such as
        \\bibinfo{person}{Jos{\\'e} Meseguer}
    are returned whole.

    Args:
        text: Text containing \\bibinfo{person}{...} patterns

    Returns:
        list: List of extracted person names with balanced braces preserved
    """
    people = extract_bibinfo_field_content(text, 'person', return_all=True)
    return people
2577
+
2578
+
2579
def extract_bibinfo_field_content(text, field_type, return_all=False):
    """
    Extract content from \\bibinfo{field_type}{...} with proper brace handling.

    The content group is read with ``extract_balanced_braces``, so nested
    braces in the field content are kept intact, such as:
        \\bibinfo{journal}{\\emph{Commun. ACM}}

    Args:
        text: Text containing \\bibinfo{field_type}{...} patterns
        field_type: The field type to extract (e.g., 'person', 'journal', 'title')
        return_all: If True, return list of all matches; if False, return first match or None

    Returns:
        list or str or None: Extracted content based on return_all parameter
    """
    # Compile once and search with an explicit offset, so the remaining text
    # is not copied (text[pos:]) on every iteration.
    opener = re.compile(r'\\bibinfo\{' + re.escape(field_type) + r'\}\{')
    matches = []
    pos = 0

    while True:
        match = opener.search(text, pos)
        if match is None:
            break

        # match.end() is just past the content's opening brace; step back one
        # character so the extractor starts on the brace itself.
        content, end_pos = extract_balanced_braces(text, match.end() - 1)

        if content is None:
            # Unbalanced content: skip this occurrence and keep scanning
            pos = match.end()
            continue

        matches.append(content)
        if not return_all:
            break  # Only the first match is wanted
        pos = end_pos

    if return_all:
        return matches
    return matches[0] if matches else None
2619
+
2620
+
2515
2621
  def extract_cited_keys_from_latex(tex_content):
2516
2622
  r"""
2517
2623
  Extract citation keys from LaTeX content by finding \cite{} commands.
@@ -2936,8 +3042,8 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
2936
3042
 
2937
3043
  if brace_count == 0:
2938
3044
  author_content = content[start_pos:pos-1]
2939
- # Extract individual authors from \bibinfo{person}{Name} tags
2940
- person_matches = re.findall(r'\\bibinfo\{person\}\{([^}]+)\}', author_content)
3045
+ # Extract individual authors from \bibinfo{person}{Name} tags using balanced brace extraction
3046
+ person_matches = extract_bibinfo_person_content(author_content)
2941
3047
  if person_matches:
2942
3048
  # Clean and format author names
2943
3049
  authors = []