academic-refchecker 1.2.44__py3-none-any.whl → 1.2.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.44"
3
+ __version__ = "1.2.46"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.44
3
+ Version: 1.2.46
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -78,7 +78,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
78
78
  Verified URL: https://www.semanticscholar.org/paper/5f4ac1ac7ca4b17d3db1b52d9aafd9e8b26c0d7
79
79
  ArXiv URL: https://arxiv.org/abs/1610.10099
80
80
  DOI URL: https://doi.org/10.48550/arxiv.1610.10099
81
- ⚠️ Warning: Year mismatch: cited as 2017 but actually 2016
81
+ ⚠️ Warning: Year mismatch:
82
+ cited: '2017'
83
+ actual: '2016'
82
84
 
83
85
  [2/45] Effective approaches to attention-based neural machine translation
84
86
  Minh-Thang Luong, Hieu Pham, Christopher D. Manning
@@ -87,7 +89,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
87
89
  Verified URL: https://www.semanticscholar.org/paper/93499a7c7f699b6630a86fad964536f9423bb6d0
88
90
  ArXiv URL: https://arxiv.org/abs/1508.04025
89
91
  DOI URL: https://doi.org/10.18653/v1/d15-1166
90
- ❌ Error: First author mismatch: 'Minh-Thang Luong' vs 'Thang Luong'
92
+ ❌ Error: First author mismatch:
93
+ cited: 'Minh-Thang Luong'
94
+ actual: 'Thang Luong'
91
95
 
92
96
  [3/45] Deep Residual Learning for Image Recognition
93
97
  Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
@@ -98,7 +102,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
98
102
  Verified URL: https://www.semanticscholar.org/paper/2c03df8b48bf3fa39054345bafabfeff15bfd11d
99
103
  ArXiv URL: https://arxiv.org/abs/1512.03385
100
104
  DOI URL: https://doi.org/10.1109/CVPR.2016.90
101
- ❌ Error: DOI mismatch: cited as '10.1109/CVPR.2016.91' but actually '10.1109/CVPR.2016.90'
105
+ ❌ Error: DOI mismatch:
106
+ cited: '10.1109/CVPR.2016.91'
107
+ actual: '10.1109/CVPR.2016.90'
102
108
 
103
109
  ============================================================
104
110
  📋 SUMMARY
@@ -382,7 +388,9 @@ This enhanced URL display helps users access multiple authoritative sources for
382
388
  Verified URL: https://www.semanticscholar.org/paper/a1b2c3d4e5f6789012345678901234567890abcd
383
389
  ArXiv URL: https://arxiv.org/abs/2312.02119
384
390
  DOI URL: https://doi.org/10.48550/arxiv.2312.02119
385
- ❌ Error: First author mismatch: 'T. Xie' vs 'Zhao Xu'
391
+ ❌ Error: First author mismatch:
392
+ cited: 'T. Xie'
393
+ actual: 'Zhao Xu'
386
394
  ```
387
395
  - `title`: Title discrepancies
388
396
  ```
@@ -392,7 +400,9 @@ This enhanced URL display helps users access multiple authoritative sources for
392
400
  Verified URL: https://www.semanticscholar.org/paper/df2b0e26d0599ce3e70df8a9da02e51594e0e992
393
401
  ArXiv URL: https://arxiv.org/abs/1810.04805
394
402
  DOI URL: https://doi.org/10.18653/v1/n19-1423
395
- ❌ Error: Title mismatch: cited as 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding' but actually 'BERT: Pre-training of Deep Bidirectional Transformers for Language Comprehension'
403
+ ❌ Error: Title mismatch:
404
+ cited: 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding'
405
+ actual: 'BERT: Pre-training of Deep Bidirectional Transformers for Language Comprehension'
396
406
  ```
397
407
  - `arxiv_id`: Incorrect URLs or arXiv IDs
398
408
  ```
@@ -415,7 +425,9 @@ This enhanced URL display helps users access multiple authoritative sources for
415
425
  Verified URL: https://www.semanticscholar.org/paper/204e3073870fae3d05bcbc2f6a8e263d9b72e776
416
426
  ArXiv URL: https://arxiv.org/abs/1706.03762
417
427
  DOI URL: https://doi.org/10.48550/arXiv.1706.03762
418
- ❌ Error: DOI mismatch: cited as '10.5555/3295222.3295349' but actually '10.48550/arXiv.1706.03762'
428
+ ❌ Error: DOI mismatch:
429
+ cited: '10.5555/3295222.3295349'
430
+ actual: '10.48550/arXiv.1706.03762'
419
431
  ```
420
432
 
421
433
  - **⚠️ Warnings**: Minor issues that may need attention
@@ -428,7 +440,9 @@ This enhanced URL display helps users access multiple authoritative sources for
428
440
  Verified URL: https://www.semanticscholar.org/paper/f1a2b3c4d5e6f7890123456789012345678901ab
429
441
  ArXiv URL: https://arxiv.org/abs/2310.03684
430
442
  DOI URL: https://doi.org/10.48550/arxiv.2310.03684
431
- ⚠️ Warning: Year mismatch: cited as 2024 but actually 2023
443
+ ⚠️ Warning: Year mismatch:
444
+ cited: '2024'
445
+ actual: '2023'
432
446
  ```
433
447
  - `venue`: Venue format variations
434
448
  ```
@@ -439,7 +453,9 @@ This enhanced URL display helps users access multiple authoritative sources for
439
453
  Verified URL: https://www.semanticscholar.org/paper/c1d2e3f4a5b6c7d8e9f0123456789012345678ab
440
454
  ArXiv URL: https://arxiv.org/abs/2403.02151
441
455
  DOI URL: https://doi.org/10.48550/arxiv.2403.02151
442
- ⚠️ Warning: Venue mismatch: cited as 'arXiv, 2024' but actually 'Neural Information Processing Systems'
456
+ ⚠️ Warning: Venue mismatch:
457
+ cited: 'arXiv, 2024'
458
+ actual: 'Neural Information Processing Systems'
443
459
  ```
444
460
 
445
461
  - **❓ Unverified**: References that couldn't be verified with any of the checker APIs
@@ -1,21 +1,21 @@
1
- __version__.py,sha256=k3lYUlcZL-yL2e_2u3UPBtgwqMqZJ11x7KVMZOotlE8,65
2
- academic_refchecker-1.2.44.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
1
+ __version__.py,sha256=PuEdWOtQEDRwfL2m1mk5h6WKAoScQu-kbHu9VkBS764,65
2
+ academic_refchecker-1.2.46.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
3
3
  checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
4
- checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
4
+ checkers/crossref.py,sha256=cLYmSzE8ehJ5sNko_R3fEiGBGiPH5_HxLhFM-pCfDRM,20378
5
5
  checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
6
- checkers/github_checker.py,sha256=54K6_YJW5w2GtzodnSOLfK5d1ErFJxbTOIIV5P_kFX0,13543
6
+ checkers/github_checker.py,sha256=GoepG4aRRUqAomkM4HgOSNf20BPxQgocZEpsk0ZTZZU,14003
7
7
  checkers/local_semantic_scholar.py,sha256=DgGMjmR_w_saz2UTMatEhfXbhUED9tUmDG3vlZAIzc4,20428
8
- checkers/openalex.py,sha256=GxYUH9GZ0AyF-WFKgXiFHqkalrSnySgFSkiM1PsK0VI,19757
9
- checkers/openreview_checker.py,sha256=FLh21F0Zr7Gj3BI0u-gE6IwGNOZiRcViirDBeNvUp94,20432
10
- checkers/semantic_scholar.py,sha256=BelhyIJ-W8navRdqEGpk12CIXYWmVL2Cq8HHZR7ynJs,34905
11
- checkers/webpage_checker.py,sha256=BvNwOqukTX9IeQUpUfIrI_5Gr2w9VLBt5x_PB-hKUIo,21616
8
+ checkers/openalex.py,sha256=Fbc7iscZzmXjAZxH32PDX2r2Nwo9b5Ku-Sh1Ut9KpLA,19550
9
+ checkers/openreview_checker.py,sha256=EYRcVu7sZzrFjqdOEKp72vd0S3sXSSjTgI9jsAVpdD0,20524
10
+ checkers/semantic_scholar.py,sha256=99KOHLAiYs31nSdx-gcMR_TWIlV8G4juNL0bmV4AoUs,34768
11
+ checkers/webpage_checker.py,sha256=Ivzhu0xcpeWZcCTlAt9C06Lfsz5gKEiSko9gJ7EyQEw,22324
12
12
  config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
13
13
  config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
14
14
  config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
15
15
  core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
16
16
  core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
17
- core/parallel_processor.py,sha256=5V2iJDBFwwryMCnCNU_oRt2u5he1wpy-_9qapC_6f00,17043
18
- core/refchecker.py,sha256=w3KNWyyaZZVL3ghFhEfro8SPs4xXEUjmCJERfZ7Du6A,273648
17
+ core/parallel_processor.py,sha256=AOnjqhBHXlSb1c-PSunat9Eug5y04gOygwbHdPUqxgk,17202
18
+ core/refchecker.py,sha256=lU6r9cKpB8Fc4Wd7vOqdqhxP9cwYEoB6D4PlYznglGY,274337
19
19
  database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
20
20
  database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
21
21
  llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -26,21 +26,21 @@ scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,
26
26
  services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
27
27
  services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
28
28
  utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
29
- utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
29
+ utils/arxiv_utils.py,sha256=MxyD3Q0EzrmE0xORMJw8wdVtZ4Fp-ux_cn6jLMQimV8,18168
30
30
  utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
31
31
  utils/biblatex_parser.py,sha256=OkHXQcjiBrEDuhBfEk0RtmAYxufu5lAxAjb8__DzMjI,25537
32
32
  utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
33
33
  utils/bibtex_parser.py,sha256=a89NLy_q2kwED4QFJgxWFgPQOJBV73bIUL3RS_Urmro,15231
34
34
  utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
35
35
  utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
36
- utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
37
- utils/error_utils.py,sha256=JqnRg4z-O9GcJ1eJGeTMzmOQwPWbWo2Lf6Duwj-ymHQ,6258
36
+ utils/doi_utils.py,sha256=qAf2slTLkgBwwnkYsWMkwMSDf6q8wNqd2YwNxhQudEc,3995
37
+ utils/error_utils.py,sha256=wrLQhVJIIWUzcTSOLVvrvCAtMRNXt_1YYL5OXYnYf0A,11085
38
38
  utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
39
- utils/text_utils.py,sha256=uEwKasw3aTVgIDHbDJDSOcTUbPwfiivIdhKwmxQJy0U,206378
39
+ utils/text_utils.py,sha256=gFI-qu6g-9Lo1s3w1OjgBZ9SvdPufL1mMg-05l0BwD0,210269
40
40
  utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
41
41
  utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
42
- academic_refchecker-1.2.44.dist-info/METADATA,sha256=ueA0mwKqmiqhR9WBLyPy2W40wfJc4JRiWSTbrQHKU14,22298
43
- academic_refchecker-1.2.44.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
- academic_refchecker-1.2.44.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
45
- academic_refchecker-1.2.44.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
46
- academic_refchecker-1.2.44.dist-info/RECORD,,
42
+ academic_refchecker-1.2.46.dist-info/METADATA,sha256=lP_pMHS9uI4hhXEsox_yGlSnT6pSl1_6W58CtzZEbDM,22576
43
+ academic_refchecker-1.2.46.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
+ academic_refchecker-1.2.46.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
45
+ academic_refchecker-1.2.46.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
46
+ academic_refchecker-1.2.46.dist-info/RECORD,,
checkers/crossref.py CHANGED
@@ -31,6 +31,7 @@ import re
31
31
  from typing import Dict, List, Tuple, Optional, Any, Union
32
32
  from urllib.parse import quote_plus
33
33
  from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
34
+ from utils.error_utils import format_year_mismatch, format_doi_mismatch
34
35
  from config.settings import get_config
35
36
 
36
37
  # Set up logging
@@ -478,21 +479,19 @@ class CrossRefReferenceChecker:
478
479
  if year and work_year and year != work_year:
479
480
  errors.append({
480
481
  'warning_type': 'year',
481
- 'warning_details': f"Year mismatch: cited as {year} but actually {work_year}",
482
+ 'warning_details': format_year_mismatch(year, work_year),
482
483
  'ref_year_correct': work_year
483
484
  })
484
485
 
485
486
  # Verify DOI
486
487
  work_doi = work_data.get('DOI')
487
488
  if doi and work_doi:
488
- # Normalize DOIs for comparison (remove URL prefix and trailing periods)
489
- cited_doi_clean = doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
490
- work_doi_clean = work_doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
491
-
492
- if cited_doi_clean.lower() != work_doi_clean.lower():
489
+ # Compare DOIs using the proper comparison function
490
+ from utils.doi_utils import compare_dois
491
+ if not compare_dois(doi, work_doi):
493
492
  errors.append({
494
493
  'error_type': 'doi',
495
- 'error_details': f"DOI mismatch: cited as {doi} but actually {work_doi}",
494
+ 'error_details': format_doi_mismatch(doi, work_doi),
496
495
  'ref_doi_correct': work_doi
497
496
  })
498
497
 
@@ -169,9 +169,14 @@ class GitHubChecker:
169
169
  if cited_title:
170
170
  title_match = self._check_title_match(cited_title, actual_name, actual_description)
171
171
  if not title_match:
172
+ from utils.error_utils import format_title_mismatch
173
+ details = format_title_mismatch(cited_title, actual_name)
174
+ if actual_description:
175
+ snippet = actual_description[:100] + ('...' if len(actual_description) > 100 else '')
176
+ details += f" ({snippet})"
172
177
  errors.append({
173
178
  "warning_type": "title",
174
- "warning_details": f"Title mismatch: cited as '{cited_title}' but repository is '{actual_name}' ({actual_description[:100]}{'...' if len(actual_description) > 100 else ''})"
179
+ "warning_details": details
175
180
  })
176
181
 
177
182
  # Verify authors
@@ -180,9 +185,13 @@ class GitHubChecker:
180
185
  author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
181
186
  author_match = self._check_author_match(author_str, actual_owner, actual_owner_name)
182
187
  if not author_match:
188
+ from utils.error_utils import format_three_line_mismatch
189
+ left = author_str
190
+ right = f"{actual_owner} ({actual_owner_name})" if actual_owner_name else actual_owner
191
+ details = format_three_line_mismatch("Author mismatch", left, right)
183
192
  errors.append({
184
193
  "warning_type": "author",
185
- "warning_details": f"Author mismatch: cited as '{author_str}' but repository owner is '{actual_owner}' ({actual_owner_name})"
194
+ "warning_details": details
186
195
  })
187
196
 
188
197
  # Verify year
@@ -191,9 +200,10 @@ class GitHubChecker:
191
200
  try:
192
201
  cited_year_int = int(cited_year)
193
202
  if cited_year_int < creation_year:
203
+ from utils.error_utils import format_year_mismatch
194
204
  errors.append({
195
205
  "warning_type": "year",
196
- "warning_details": f"Year mismatch: cited as {cited_year} but repository created in {creation_year}",
206
+ "warning_details": format_year_mismatch(cited_year, creation_year),
197
207
  "ref_year_correct": str(creation_year)
198
208
  })
199
209
  except (ValueError, TypeError):
checkers/openalex.py CHANGED
@@ -33,6 +33,7 @@ import re
33
33
  from typing import Dict, List, Tuple, Optional, Any, Union
34
34
  from urllib.parse import quote_plus
35
35
  from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
36
+ from utils.error_utils import format_year_mismatch, format_doi_mismatch
36
37
  from config.settings import get_config
37
38
 
38
39
  # Set up logging
@@ -448,7 +449,7 @@ class OpenAlexReferenceChecker:
448
449
  if year and work_year and year != work_year:
449
450
  errors.append({
450
451
  'warning_type': 'year',
451
- 'warning_details': f"Year mismatch: cited as {year} but actually {work_year}",
452
+ 'warning_details': format_year_mismatch(year, work_year),
452
453
  'ref_year_correct': work_year
453
454
  })
454
455
 
@@ -458,14 +459,12 @@ class OpenAlexReferenceChecker:
458
459
  work_doi = work_data['ids']['doi']
459
460
 
460
461
  if doi and work_doi:
461
- # Normalize DOIs for comparison (remove URL prefix and trailing periods)
462
- cited_doi_clean = doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
463
- work_doi_clean = work_doi.replace('https://doi.org/', '').replace('http://doi.org/', '').strip().rstrip('.')
464
-
465
- if cited_doi_clean.lower() != work_doi_clean.lower():
462
+ # Compare DOIs using the proper comparison function
463
+ from utils.doi_utils import compare_dois
464
+ if not compare_dois(doi, work_doi):
466
465
  errors.append({
467
466
  'error_type': 'doi',
468
- 'error_details': f"DOI mismatch: cited as {doi} but actually {work_doi}",
467
+ 'error_details': format_doi_mismatch(doi, work_doi),
469
468
  'ref_doi_correct': work_doi
470
469
  })
471
470
 
@@ -425,9 +425,11 @@ class OpenReviewReferenceChecker:
425
425
  if cited_title and paper_title:
426
426
  similarity = calculate_title_similarity(cited_title, paper_title)
427
427
  if similarity < 0.7: # Using a reasonable threshold
428
+ from utils.error_utils import format_title_mismatch
429
+ details = format_title_mismatch(cited_title, paper_title) + f" (similarity: {similarity:.2f})"
428
430
  errors.append({
429
431
  "warning_type": "title",
430
- "warning_details": f"Title mismatch: cited as '{cited_title}' but OpenReview shows '{paper_title}' (similarity: {similarity:.2f})"
432
+ "warning_details": details
431
433
  })
432
434
 
433
435
  # Check authors
@@ -460,9 +462,10 @@ class OpenReviewReferenceChecker:
460
462
 
461
463
  is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
462
464
  if is_different and year_message:
465
+ from utils.error_utils import format_year_mismatch
463
466
  errors.append({
464
467
  "warning_type": "year",
465
- "warning_details": year_message
468
+ "warning_details": format_year_mismatch(cited_year_int, paper_year_int)
466
469
  })
467
470
  except (ValueError, TypeError):
468
471
  pass # Skip year validation if conversion fails
@@ -473,10 +476,10 @@ class OpenReviewReferenceChecker:
473
476
 
474
477
  if cited_venue and paper_venue:
475
478
  if are_venues_substantially_different(cited_venue, paper_venue):
476
- from utils.error_utils import clean_venue_for_comparison
479
+ from utils.error_utils import format_venue_mismatch
477
480
  errors.append({
478
481
  "warning_type": "venue",
479
- "warning_details": f"Venue mismatch: cited as '{clean_venue_for_comparison(cited_venue)}' but OpenReview shows '{clean_venue_for_comparison(paper_venue)}'"
482
+ "warning_details": format_venue_mismatch(cited_venue, paper_venue)
480
483
  })
481
484
 
482
485
  # Create verified data structure
@@ -29,6 +29,7 @@ import logging
29
29
  import re
30
30
  from typing import Dict, List, Tuple, Optional, Any, Union
31
31
  from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search
32
+ from utils.error_utils import format_title_mismatch
32
33
  from config.settings import get_config
33
34
 
34
35
  # Set up logging
@@ -471,7 +472,7 @@ class NonArxivReferenceChecker:
471
472
  if found_title and title_similarity < SIMILARITY_THRESHOLD:
472
473
  errors.append({
473
474
  'error_type': 'title',
474
- 'error_details': f"Title mismatch: cited as '{title}' but actually '{found_title}'",
475
+ 'error_details': format_title_mismatch(title, found_title),
475
476
  'ref_title_correct': paper_data.get('title', '')
476
477
  })
477
478
 
@@ -525,9 +526,10 @@ class NonArxivReferenceChecker:
525
526
  is_different, warning_message = is_year_substantially_different(year, paper_year, context)
526
527
 
527
528
  if is_different and warning_message:
529
+ from utils.error_utils import format_year_mismatch
528
530
  errors.append({
529
531
  'warning_type': 'year',
530
- 'warning_details': warning_message,
532
+ 'warning_details': format_year_mismatch(year, paper_year),
531
533
  'ref_year_correct': paper_year
532
534
  })
533
535
 
@@ -541,49 +543,50 @@ class NonArxivReferenceChecker:
541
543
  elif paper_venue and not isinstance(paper_venue, str):
542
544
  paper_venue = str(paper_venue)
543
545
 
546
+ # Check venue mismatches
544
547
  if cited_venue and paper_venue:
545
548
  # Use the utility function to check if venues are substantially different
546
549
  if are_venues_substantially_different(cited_venue, paper_venue):
547
550
  from utils.error_utils import create_venue_warning
548
551
  errors.append(create_venue_warning(cited_venue, paper_venue))
549
552
  elif not cited_venue and paper_venue:
550
- # Check if this is an arXiv paper first
551
- external_ids = paper_data.get('externalIds', {})
552
- arxiv_id = external_ids.get('ArXiv') if external_ids else None
553
-
554
- if arxiv_id:
555
- # For arXiv papers, suggest including the arXiv URL instead of venue
556
- arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
557
-
558
- # Check if the reference already includes this ArXiv URL or equivalent DOI
559
- reference_url = reference.get('url', '')
560
-
561
- # Check for direct arXiv URL match
562
- has_arxiv_url = arxiv_url in reference_url
563
-
564
- # Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
565
- arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
566
- has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
567
-
568
- if not (has_arxiv_url or has_arxiv_doi):
553
+ # Original reference has the venue in raw text but not parsed correctly
554
+ raw_text = reference.get('raw_text', '')
555
+ if raw_text and '#' in raw_text:
556
+ # Check if venue might be in the raw text format (author#title#venue#year#url)
557
+ parts = raw_text.split('#')
558
+ if len(parts) >= 3 and parts[2].strip():
559
+ # Venue is present in raw text but missing from parsed reference
569
560
  errors.append({
570
561
  'warning_type': 'venue',
571
- 'warning_details': f"Reference should include arXiv URL: {arxiv_url}",
572
- 'ref_url_correct': arxiv_url
562
+ 'warning_details': f"Venue missing: should include '{paper_venue}'",
563
+ 'ref_venue_correct': paper_venue
573
564
  })
574
- else:
575
- # Original reference has the venue in raw text but not parsed correctly
576
- raw_text = reference.get('raw_text', '')
577
- if raw_text and '#' in raw_text:
578
- # Check if venue might be in the raw text format (author#title#venue#year#url)
579
- parts = raw_text.split('#')
580
- if len(parts) >= 3 and parts[2].strip():
581
- # Venue is present in raw text but missing from parsed reference
582
- errors.append({
583
- 'warning_type': 'venue',
584
- 'warning_details': f"Venue missing: should include '{paper_venue}'",
585
- 'ref_venue_correct': paper_venue
586
- })
565
+
566
+ # Always check for missing arXiv URLs when paper has arXiv ID
567
+ external_ids = paper_data.get('externalIds', {})
568
+ arxiv_id = external_ids.get('ArXiv') if external_ids else None
569
+
570
+ if arxiv_id:
571
+ # For arXiv papers, check if reference includes the arXiv URL
572
+ arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
573
+
574
+ # Check if the reference already includes this ArXiv URL or equivalent DOI
575
+ reference_url = reference.get('url', '')
576
+
577
+ # Check for direct arXiv URL match
578
+ has_arxiv_url = arxiv_url in reference_url
579
+
580
+ # Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
581
+ arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
582
+ has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
583
+
584
+ if not (has_arxiv_url or has_arxiv_doi):
585
+ errors.append({
586
+ 'warning_type': 'url',
587
+ 'warning_details': f"Reference could include arXiv URL: {arxiv_url}",
588
+ 'ref_url_correct': arxiv_url
589
+ })
587
590
 
588
591
  # Verify DOI
589
592
  paper_doi = None
@@ -591,14 +594,13 @@ class NonArxivReferenceChecker:
591
594
  if external_ids and 'DOI' in external_ids:
592
595
  paper_doi = external_ids['DOI']
593
596
 
594
- # Compare DOIs, but strip hash fragments and trailing periods for comparison
595
- cited_doi_clean = doi.split('#')[0].rstrip('.') if doi else ''
596
- paper_doi_clean = paper_doi.split('#')[0].rstrip('.') if paper_doi else ''
597
-
598
- if cited_doi_clean and paper_doi_clean and cited_doi_clean.lower() != paper_doi_clean.lower():
597
+ # Compare DOIs using the proper comparison function
598
+ from utils.doi_utils import compare_dois
599
+ if doi and paper_doi and not compare_dois(doi, paper_doi):
600
+ from utils.error_utils import format_doi_mismatch
599
601
  errors.append({
600
602
  'error_type': 'doi',
601
- 'error_details': f"DOI mismatch: cited as {doi} but actually {paper_doi}",
603
+ 'error_details': format_doi_mismatch(doi, paper_doi),
602
604
  'ref_doi_correct': paper_doi
603
605
  })
604
606
 
@@ -71,7 +71,8 @@ class WebPageChecker:
71
71
  doc_indicators = [
72
72
  'docs', 'documentation', 'readthedocs.io', 'help', 'guide', 'tutorial',
73
73
  'reference', 'manual', 'wiki', 'blog', 'api', 'developer', 'platform',
74
- 'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop'
74
+ 'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop',
75
+ 'posts' # For blog posts and forum posts like LessWrong
75
76
  ]
76
77
 
77
78
  return any(indicator in url.lower() for indicator in doc_indicators) or self._is_likely_webpage(url)
@@ -84,7 +85,8 @@ class WebPageChecker:
84
85
  doc_domains = [
85
86
  'pytorch.org', 'tensorflow.org', 'readthedocs.io', 'onnxruntime.ai',
86
87
  'deepspeed.ai', 'huggingface.co', 'openai.com', 'microsoft.com',
87
- 'google.com', 'nvidia.com', 'intel.com', 'langchain.com'
88
+ 'google.com', 'nvidia.com', 'intel.com', 'langchain.com',
89
+ 'lesswrong.com' # LessWrong rationality and AI safety blog platform
88
90
  ]
89
91
 
90
92
  return any(domain in parsed.netloc for domain in doc_domains)
@@ -182,9 +184,10 @@ class WebPageChecker:
182
184
  # Check title match
183
185
  if cited_title and page_title:
184
186
  if not self._check_title_match(cited_title, page_title, page_description):
187
+ from utils.error_utils import format_title_mismatch
185
188
  errors.append({
186
189
  "warning_type": "title",
187
- "warning_details": f"Title mismatch: cited as '{cited_title}' but page title is '{page_title}'"
190
+ "warning_details": format_title_mismatch(cited_title, page_title)
188
191
  })
189
192
 
190
193
  # Check if this is a documentation page for the cited topic
@@ -201,9 +204,13 @@ class WebPageChecker:
201
204
  if cited_authors:
202
205
  author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
203
206
  if not self._check_author_match(author_str, site_info, web_url):
207
+ from utils.error_utils import format_three_line_mismatch
208
+ left = author_str
209
+ right = site_info.get('organization', 'unknown')
210
+ details = format_three_line_mismatch("Author/organization mismatch", left, right)
204
211
  errors.append({
205
212
  "warning_type": "author",
206
- "warning_details": f"Author/organization mismatch: cited as '{author_str}' but page is from '{site_info.get('organization', 'unknown')}'"
213
+ "warning_details": details
207
214
  })
208
215
 
209
216
  logger.debug(f"Web page verification completed for: {web_url}")
@@ -390,6 +397,14 @@ class WebPageChecker:
390
397
  organization = site_info.get('organization', '').lower()
391
398
  domain = site_info.get('domain', '').lower()
392
399
 
400
+ # Accept generic web resource terms - these are valid for any web URL
401
+ generic_web_terms = [
402
+ 'web resource', 'web site', 'website', 'online resource',
403
+ 'online', 'web', 'internet resource', 'web page', 'webpage'
404
+ ]
405
+ if cited_lower in generic_web_terms:
406
+ return True
407
+
393
408
  # Direct matches
394
409
  if cited_lower in organization or organization in cited_lower:
395
410
  return True
@@ -352,12 +352,15 @@ class ParallelReferenceProcessor:
352
352
  error_type = error.get('error_type') or error.get('warning_type')
353
353
  error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
354
354
 
355
+ from utils.error_utils import print_labeled_multiline
356
+
355
357
  if error_type == 'arxiv_id':
358
+ # Keep existing style for arXiv ID errors
356
359
  print(f" ❌ {error_details}")
357
360
  elif 'error_type' in error:
358
- print(f" ❌ Error: {error_details}")
361
+ print_labeled_multiline("❌ Error", error_details)
359
362
  else:
360
- print(f" ⚠️ Warning: {error_details}")
363
+ print_labeled_multiline("⚠️ Warning", error_details)
361
364
 
362
365
  # Show timing info for slow references
363
366
  if result.processing_time > 5.0:
core/refchecker.py CHANGED
@@ -1900,10 +1900,11 @@ class ArxivReferenceChecker:
1900
1900
  db_title = self.non_arxiv_checker.normalize_paper_title(paper_data.get('title'))
1901
1901
 
1902
1902
  if normalized_title != db_title:
1903
+ from utils.error_utils import format_title_mismatch
1903
1904
  logger.debug(f"DB Verification: Title mismatch - cited: '{title}', actual: '{paper_data.get('title')}'")
1904
1905
  errors.append({
1905
1906
  'error_type': 'title',
1906
- 'error_details': f"Title mismatch: cited as '{title}' but actually '{paper_data.get('title')}'",
1907
+ 'error_details': format_title_mismatch(title, paper_data.get('title')),
1907
1908
  'ref_title_correct': paper_data.get('title')
1908
1909
  })
1909
1910
 
@@ -1925,30 +1926,36 @@ class ArxivReferenceChecker:
1925
1926
  paper_year = paper_data.get('year')
1926
1927
  if year and paper_year and year != paper_year:
1927
1928
  logger.debug(f"DB Verification: Year mismatch - cited: {year}, actual: {paper_year}")
1929
+ from utils.error_utils import format_year_mismatch
1928
1930
  errors.append({
1929
1931
  'warning_type': 'year',
1930
- 'warning_details': f"Year mismatch: cited as {year} but actually {paper_year}",
1932
+ 'warning_details': format_year_mismatch(year, paper_year),
1931
1933
  'ref_year_correct': paper_year
1932
1934
  })
1933
1935
 
1934
1936
  # Verify DOI
1935
- if doi and external_ids.get('DOI') and doi.lower() != external_ids['DOI'].lower():
1936
- # Check if the cited DOI is a partial match of the actual DOI
1937
- # This handles cases like "10.1111/j.2044-8260." vs "10.1111/J.2044-8260.1997.TB01237.X"
1938
- cited_doi_clean = doi.lower().rstrip('.')
1939
- actual_doi_clean = external_ids['DOI'].lower().rstrip('.')
1940
-
1941
- # If the cited DOI is a prefix of the actual DOI, it's likely a partial citation
1942
- # Only flag as error if it's not a reasonable partial match
1943
- if not actual_doi_clean.startswith(cited_doi_clean):
1944
- logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
1945
- errors.append({
1946
- 'error_type': 'doi',
1947
- 'error_details': f"DOI mismatch: cited as {doi} but actually {external_ids['DOI']}",
1948
- 'ref_doi_correct': external_ids['DOI']
1949
- })
1950
- else:
1951
- logger.debug(f"DB Verification: DOI partial match - cited: {doi}, actual: {external_ids['DOI']} (acceptable)")
1937
+ if doi and external_ids.get('DOI'):
1938
+ from utils.doi_utils import compare_dois, normalize_doi
1939
+
1940
+ # Use proper DOI comparison first
1941
+ if not compare_dois(doi, external_ids['DOI']):
1942
+ # Check if the cited DOI is a partial match of the actual DOI
1943
+ # This handles cases like "10.1111/j.2044-8260." vs "10.1111/J.2044-8260.1997.TB01237.X"
1944
+ cited_doi_normalized = normalize_doi(doi)
1945
+ actual_doi_normalized = normalize_doi(external_ids['DOI'])
1946
+
1947
+ # If the cited DOI is a prefix of the actual DOI, it's likely a partial citation
1948
+ # Only flag as error if it's not a reasonable partial match
1949
+ if not actual_doi_normalized.startswith(cited_doi_normalized.rstrip('.')):
1950
+ logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
1951
+ from utils.error_utils import format_doi_mismatch
1952
+ errors.append({
1953
+ 'error_type': 'doi',
1954
+ 'error_details': format_doi_mismatch(doi, external_ids['DOI']),
1955
+ 'ref_doi_correct': external_ids['DOI']
1956
+ })
1957
+ else:
1958
+ logger.debug(f"DB Verification: DOI partial match - cited: {doi}, actual: {external_ids['DOI']} (acceptable)")
1952
1959
 
1953
1960
  # Verify ArXiv ID
1954
1961
  if reference.get('type') == 'arxiv':
@@ -3489,8 +3496,9 @@ class ArxivReferenceChecker:
3489
3496
  author_field_match = re.search(r'\\bibfield\{author\}\{(.*?)\}(?:\s*\\bibinfo\{year\}|\s*\\newblock|$)', content, re.DOTALL)
3490
3497
  if author_field_match:
3491
3498
  author_content = author_field_match.group(1)
3492
- # Find all \bibinfo{person}{Name} entries
3493
- person_matches = re.findall(r'\\bibinfo\{person\}\{([^}]+)\}', author_content)
3499
+ # Find all \bibinfo{person}{Name} entries using balanced brace extraction
3500
+ from utils.text_utils import extract_bibinfo_person_content
3501
+ person_matches = extract_bibinfo_person_content(author_content)
3494
3502
  if person_matches:
3495
3503
  authors = []
3496
3504
  for person in person_matches:
@@ -3502,33 +3510,31 @@ class ArxivReferenceChecker:
3502
3510
  authors.append(clean_name)
3503
3511
  ref['authors'] = authors
3504
3512
 
3505
- # Extract title from \bibinfo{title}{Title}
3506
- title_match = re.search(r'\\bibinfo\{title\}\{([^}]+)\}', content)
3507
- if title_match:
3508
- title = strip_latex_commands(title_match.group(1)).strip()
3513
+ # Import balanced brace extraction function
3514
+ from utils.text_utils import extract_bibinfo_field_content
3515
+
3516
+ # Extract title from \bibinfo{title}{Title} using balanced brace extraction
3517
+ title_content = extract_bibinfo_field_content(content, 'title')
3518
+ if title_content:
3519
+ title = strip_latex_commands(title_content).strip()
3509
3520
  ref['title'] = title
3510
3521
 
3511
- # Extract venue/journal from various fields
3512
- venue_patterns = [
3513
- r'\\bibinfo\{booktitle\}\{([^}]+)\}',
3514
- r'\\bibinfo\{journal\}\{([^}]+)\}',
3515
- r'\\bibinfo\{series\}\{([^}]+)\}',
3516
- r'\\bibinfo\{note\}\{([^}]+)\}'
3517
- ]
3522
+ # Extract venue/journal from various fields using balanced brace extraction
3523
+ venue_field_types = ['booktitle', 'journal', 'series', 'note']
3518
3524
 
3519
- for pattern in venue_patterns:
3520
- venue_match = re.search(pattern, content)
3521
- if venue_match:
3522
- venue = strip_latex_commands(venue_match.group(1)).strip()
3525
+ for field_type in venue_field_types:
3526
+ venue_content = extract_bibinfo_field_content(content, field_type)
3527
+ if venue_content:
3528
+ venue = strip_latex_commands(venue_content).strip()
3523
3529
  if venue:
3524
3530
  ref['venue'] = venue
3525
3531
  ref['journal'] = venue # For compatibility
3526
3532
  break
3527
3533
 
3528
- # Extract DOI
3529
- doi_match = re.search(r'\\bibinfo\{doi\}\{([^}]+)\}', content)
3530
- if doi_match:
3531
- ref['doi'] = doi_match.group(1).strip()
3534
+ # Extract DOI using balanced brace extraction
3535
+ doi_content = extract_bibinfo_field_content(content, 'doi')
3536
+ if doi_content:
3537
+ ref['doi'] = doi_content.strip()
3532
3538
 
3533
3539
  # Extract ArXiv ID from \showeprint[arxiv]{ID}
3534
3540
  arxiv_match = re.search(r'\\showeprint\[arxiv\]\{([^}]+)\}', content)
@@ -5048,7 +5054,8 @@ class ArxivReferenceChecker:
5048
5054
  correct_first = correct_authors[0]
5049
5055
 
5050
5056
  if not enhanced_name_match(cited_first, correct_first):
5051
- return False, f"First author mismatch: '{cited_first}' vs '{correct_first}'"
5057
+ from utils.error_utils import format_first_author_mismatch
5058
+ return False, format_first_author_mismatch(cited_first, correct_first)
5052
5059
 
5053
5060
  return True, "Authors match"
5054
5061
 
@@ -5454,12 +5461,14 @@ class ArxivReferenceChecker:
5454
5461
  error_type = error.get('error_type') or error.get('warning_type')
5455
5462
  error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
5456
5463
 
5464
+ from utils.error_utils import print_labeled_multiline
5465
+
5457
5466
  if error_type == 'arxiv_id':
5458
5467
  print(f" ❌ {error_details}")
5459
5468
  elif 'error_type' in error:
5460
- print(f" ❌ Error: {error_details}")
5469
+ print_labeled_multiline("❌ Error", error_details)
5461
5470
  else:
5462
- print(f" ⚠️ Warning: {error_details}")
5471
+ print_labeled_multiline("⚠️ Warning", error_details)
5463
5472
 
5464
5473
  def _output_reference_errors(self, reference, errors, url):
5465
5474
  """
utils/arxiv_utils.py CHANGED
@@ -392,32 +392,29 @@ def get_bibtex_content(paper):
392
392
  logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
393
393
  tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
394
394
 
395
- # Choose between .bib and .bbl files based on content richness
396
- # Prioritize .bbl if it has more references than filtered .bib, otherwise prefer .bib
395
+ # Choose between .bib and .bbl files - .bbl files take priority when they contain entries
396
+ # .bbl files are processed biblatex output that reflects exactly what was cited
397
397
  if bib_content and bbl_content:
398
- # Count entries in both
398
+ # Count entries in both for logging
399
399
  bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
400
400
  bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
401
401
 
402
- # If we have LaTeX content, get filtered BibTeX count
403
- filtered_bib_count = bib_entry_count
404
- filtered_content = bib_content
405
- if tex_content:
406
- cited_keys = extract_cited_keys_from_tex({}, tex_content)
407
- if cited_keys:
408
- logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
409
- filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
410
- filtered_bib_count = len(re.findall(r'@\w+\s*\{', filtered_content))
411
-
412
- logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, filtered .bib has {filtered_bib_count} entries")
402
+ logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, .bib has {bib_entry_count} entries")
413
403
 
414
- # Prioritize .bbl if it has significantly more entries
415
- if bbl_entry_count > filtered_bib_count * 1.5: # 50% more entries threshold
416
- logger.info(f"Using .bbl files from ArXiv source")
404
+ # Only use .bbl if it actually contains bibliography entries
405
+ if bbl_entry_count > 0:
406
+ logger.info(f"Using .bbl files from ArXiv source (biblatex takes priority over bibtex)")
417
407
  return bbl_content
418
408
  else:
419
- logger.info(f"Using filtered .bib files")
420
- return filtered_content
409
+ logger.info(f"Using .bib files from ArXiv source (.bbl file is empty)")
410
+ # If we have LaTeX content, filter BibTeX by cited keys
411
+ if tex_content:
412
+ cited_keys = extract_cited_keys_from_tex({}, tex_content)
413
+ if cited_keys:
414
+ logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
415
+ filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
416
+ return filtered_content
417
+ return bib_content
421
418
 
422
419
  elif bib_content:
423
420
  logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
utils/doi_utils.py CHANGED
@@ -99,6 +99,10 @@ def compare_dois(doi1: str, doi2: str) -> bool:
99
99
  """
100
100
  Compare two DOIs for equality, handling different formats and prefixes.
101
101
 
102
+ This function performs exact matching after normalization, which means
103
+ DOIs are only considered equal if they are identical after removing
104
+ prefixes, case differences, and punctuation.
105
+
102
106
  Args:
103
107
  doi1: First DOI to compare
104
108
  doi2: Second DOI to compare
@@ -109,21 +113,11 @@ def compare_dois(doi1: str, doi2: str) -> bool:
109
113
  if not doi1 or not doi2:
110
114
  return False
111
115
 
112
- # Normalize both DOIs (already converted to lowercase)
116
+ # Normalize both DOIs (handles prefixes, case, punctuation)
113
117
  norm_doi1 = normalize_doi(doi1)
114
118
  norm_doi2 = normalize_doi(doi2)
115
119
 
116
- # If DOIs are identical, they match
117
- if norm_doi1 == norm_doi2:
118
- return True
119
-
120
- # Check if first two components match (publisher.registrant)
121
- doi1_parts = norm_doi1.split('.')
122
- doi2_parts = norm_doi2.split('.')
123
-
124
- if len(doi1_parts) >= 2 and len(doi2_parts) >= 2:
125
- return doi1_parts[0] == doi2_parts[0] and doi1_parts[1].split('/')[0] == doi2_parts[1].split('/')[0]
126
-
120
+ # DOIs must be exactly identical after normalization
127
121
  return norm_doi1 == norm_doi2
128
122
 
129
123
 
utils/error_utils.py CHANGED
@@ -9,6 +9,86 @@ for reference checkers.
9
9
  from typing import Dict, List, Any, Optional
10
10
 
11
11
 
12
+ def print_labeled_multiline(label: str, text: str) -> None:
13
+ """
14
+ Print a multi-line message with consistent label formatting.
15
+
16
+ This function ensures consistent indentation for all error and warning messages,
17
+ regardless of emoji width differences in the labels.
18
+
19
+ Args:
20
+ label: The label (e.g., "❌ Error", "⚠️ Warning")
21
+ text: The multi-line text to print
22
+ """
23
+ prefix = f" {label}: "
24
+ lines = (text or "").splitlines() or [""]
25
+
26
+ # Print the first line with the label prefix
27
+ print(prefix + lines[0])
28
+
29
+ # Print subsequent lines with fixed indentation to ensure consistency
30
+ # Use fixed 19-character indentation to align regardless of emoji width
31
+ fixed_indent = " " * 15
32
+ for line in lines[1:]:
33
+ print(fixed_indent + line)
34
+
35
+
36
+ def format_three_line_mismatch(mismatch_type: str, left: str, right: str) -> str:
37
+ """
38
+ Format a three-line mismatch message with fixed indentation.
39
+
40
+ This creates a clean, consistently formatted mismatch message that separates
41
+ the mismatch type from the values being compared:
42
+
43
+ Example:
44
+ Title mismatch:
45
+ cited: 'Cited Title'
46
+ actual: 'Correct Title'
47
+
48
+ Args:
49
+ mismatch_type: The type of mismatch (e.g., "Author 2 mismatch", "Title mismatch")
50
+ left: The cited/incorrect value
51
+ right: The correct value
52
+
53
+ Returns:
54
+ Three-line formatted mismatch message
55
+ """
56
+ # Ensure mismatch_type ends with a colon
57
+ if not mismatch_type.endswith(":"):
58
+ mismatch_type = mismatch_type.rstrip() + ":"
59
+
60
+ # Use fixed indentation for clean, consistent alignment
61
+ indent = "" # no extra indent before the 'cited:' line
62
+ vs_indent = "" # no extra indent before the 'actual:' line
63
+
64
+ return f"{mismatch_type}\n{indent}cited: '{left}'\n{vs_indent}actual: '{right}'"
65
+
66
+
67
+ def format_title_mismatch(cited_title: str, verified_title: str) -> str:
68
+ """
69
+ Format a three-line title mismatch message.
70
+
71
+ Output format:
72
+ Title mismatch:
73
+ cited: 'Cited Title'
74
+ actual: 'Correct Title'
75
+ """
76
+ return format_three_line_mismatch("Title mismatch", cited_title, verified_title)
77
+
78
+
79
+ def format_year_mismatch(cited_year: int | str, correct_year: int | str) -> str:
80
+ """
81
+ Three-line year mismatch message.
82
+ """
83
+ return format_three_line_mismatch("Year mismatch", str(cited_year), str(correct_year))
84
+
85
+
86
+ def format_doi_mismatch(cited_doi: str, correct_doi: str) -> str:
87
+ """
88
+ Three-line DOI mismatch message.
89
+ """
90
+ return format_three_line_mismatch("DOI mismatch", str(cited_doi), str(correct_doi))
91
+
12
92
  def create_author_error(error_details: str, correct_authors: List[Dict[str, str]]) -> Dict[str, str]:
13
93
  """
14
94
  Create a standardized author error dictionary.
@@ -40,7 +120,7 @@ def create_year_warning(cited_year: int, correct_year: int) -> Dict[str, Any]:
40
120
  """
41
121
  return {
42
122
  'warning_type': 'year',
43
- 'warning_details': f"Year mismatch: cited as {cited_year} but actually {correct_year}",
123
+ 'warning_details': format_year_mismatch(cited_year, correct_year),
44
124
  'ref_year_correct': correct_year
45
125
  }
46
126
 
@@ -64,7 +144,7 @@ def create_doi_error(cited_doi: str, correct_doi: str) -> Optional[Dict[str, str
64
144
  if cited_doi_clean != correct_doi_clean:
65
145
  return {
66
146
  'error_type': 'doi',
67
- 'error_details': f"DOI mismatch: cited as {cited_doi} but actually {correct_doi}",
147
+ 'error_details': format_doi_mismatch(cited_doi, correct_doi),
68
148
  'ref_doi_correct': correct_doi
69
149
  }
70
150
 
@@ -120,11 +200,20 @@ def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]
120
200
 
121
201
  return {
122
202
  'warning_type': 'venue',
123
- 'warning_details': f"Venue mismatch: cited as '{clean_cited}' but actually '{clean_correct}'",
203
+ 'warning_details': format_three_line_mismatch("Venue mismatch", clean_cited, clean_correct),
124
204
  'ref_venue_correct': correct_venue
125
205
  }
126
206
 
127
207
 
208
+ def format_venue_mismatch(cited_venue: str, verified_venue: str) -> str:
209
+ """
210
+ Format a three-line venue mismatch message with cleaned venue names.
211
+ """
212
+ clean_cited = clean_venue_for_comparison(cited_venue)
213
+ clean_verified = clean_venue_for_comparison(verified_venue)
214
+ return format_three_line_mismatch("Venue mismatch", clean_cited, clean_verified)
215
+
216
+
128
217
  def create_url_error(error_details: str, correct_url: Optional[str] = None) -> Dict[str, str]:
129
218
  """
130
219
  Create a standardized URL error dictionary.
@@ -189,6 +278,59 @@ def create_generic_warning(warning_type: str, warning_details: str, **kwargs) ->
189
278
  return warning_dict
190
279
 
191
280
 
281
+ def format_author_mismatch(author_number: int, cited_author: str, correct_author: str) -> str:
282
+ """
283
+ Format a three-line author mismatch message.
284
+
285
+ Args:
286
+ author_number: The author position (1-based)
287
+ cited_author: The cited author name
288
+ correct_author: The correct author name
289
+
290
+ Returns:
291
+ Formatted three-line author mismatch message
292
+ """
293
+ return format_three_line_mismatch(f"Author {author_number} mismatch", cited_author, correct_author)
294
+
295
+
296
+ def format_first_author_mismatch(cited_author: str, correct_author: str) -> str:
297
+ """
298
+ Format a three-line first author mismatch message.
299
+
300
+ Args:
301
+ cited_author: The cited first author name
302
+ correct_author: The correct first author name
303
+
304
+ Returns:
305
+ Formatted three-line first author mismatch message
306
+ """
307
+ return format_three_line_mismatch("First author mismatch", cited_author, correct_author)
308
+
309
+
310
+ def format_author_count_mismatch(cited_count: int, correct_count: int, cited_authors: list, correct_authors: list) -> str:
311
+ """
312
+ Format an author count mismatch message showing all cited and correct authors.
313
+
314
+ Args:
315
+ cited_count: Number of cited authors
316
+ correct_count: Number of correct authors
317
+ cited_authors: List of cited author names
318
+ correct_authors: List of correct author names
319
+
320
+ Returns:
321
+ Formatted multi-line author count mismatch message
322
+ """
323
+ # Create the header with count information
324
+ header = f"Author count mismatch: {cited_count} cited vs {correct_count} correct"
325
+
326
+ # Format author lists
327
+ cited_list = ", ".join(cited_authors) if cited_authors else "None"
328
+ correct_list = ", ".join(correct_authors) if correct_authors else "None"
329
+
330
+ # Use the same format as other mismatches
331
+ return format_three_line_mismatch(header, cited_list, correct_list)
332
+
333
+
192
334
  def format_authors_list(authors: List[Dict[str, str]]) -> str:
193
335
  """
194
336
  Format a list of author dictionaries into a readable string.
utils/text_utils.py CHANGED
@@ -554,6 +554,10 @@ def clean_title_basic(title):
554
554
  # Remove trailing punctuation
555
555
  title = re.sub(r'[.,;:]+$', '', title)
556
556
 
557
+ # Remove BibTeX publication type indicators at the end (common in Chinese and some international BibTeX styles)
558
+ # [J] = Journal, [C] = Conference, [M] = Monograph/Book, [D] = Dissertation, [P] = Patent, [R] = Report
559
+ title = re.sub(r'\s*\[[JCMDPRS]\]\s*$', '', title)
560
+
557
561
  return title
558
562
 
559
563
 
@@ -578,6 +582,9 @@ def clean_title_for_search(title):
578
582
  title = title.replace('\n', ' ').strip()
579
583
  title = re.sub(r'\s+', ' ', title) # Normalize whitespace only
580
584
 
585
+ # Remove BibTeX publication type indicators that are not part of the actual title
586
+ title = re.sub(r'\s*\[[JCMDPRS]\]\s*$', '', title)
587
+
581
588
  # Note: We intentionally preserve:
582
589
  # - Capitalization (helps with exact matching)
583
590
  # - Colons and other meaningful punctuation (structural markers)
@@ -2076,6 +2083,8 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2076
2083
  # The key insight: if the citation has "et al", we should only verify the listed authors
2077
2084
  # and not penalize for the authoritative source having more authors
2078
2085
  if has_et_al:
2086
+ # Import here to avoid circular imports
2087
+ from utils.error_utils import format_author_mismatch
2079
2088
  # For et al cases, check if each cited author matches ANY author in the correct list
2080
2089
  # rather than comparing positionally, since author order can vary
2081
2090
  for i, cited_author in enumerate(cleaned_cited):
@@ -2088,10 +2097,11 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2088
2097
  break
2089
2098
 
2090
2099
  if not author_found:
2091
- # Create a more informative error message that doesn't assume positional matching
2092
- # Show the full list of correct authors instead of truncating
2100
+ # Use standardized three-line formatting for author mismatch
2101
+ cited_display = format_author_for_display(cited_author)
2093
2102
  full_author_list = ', '.join(correct_names)
2094
- return False, f"Author {i+1} mismatch: '{cited_author}' not found in author list (et al case). Correct authors include: {full_author_list}"
2103
+ error_msg = format_author_mismatch(i+1, f"{cited_display} (not found in author list - et al case)", f"Correct authors: {full_author_list}")
2104
+ return False, error_msg
2095
2105
 
2096
2106
  return True, f"Authors match (verified {len(cleaned_cited)} of {len(correct_names)} with et al)"
2097
2107
 
@@ -2100,7 +2110,11 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2100
2110
  # For non-et-al cases, be more strict about count mismatches
2101
2111
  # Allow minor flexibility (1 author difference) but not more
2102
2112
  if abs(len(cleaned_cited) - len(correct_names)) > 1:
2103
- return False, f"Author count mismatch: {len(cleaned_cited)} cited vs {len(correct_names)} correct"
2113
+ from utils.error_utils import format_author_count_mismatch
2114
+ # Convert cited names to display format (First Last) before showing in error
2115
+ display_cited = [format_author_for_display(author) for author in cleaned_cited]
2116
+ error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
2117
+ return False, error_msg
2104
2118
 
2105
2119
  # Use the shorter list for comparison
2106
2120
  min_len = min(len(cleaned_cited), len(correct_names))
@@ -2110,6 +2124,9 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2110
2124
  comparison_cited = cleaned_cited
2111
2125
  comparison_correct = correct_names
2112
2126
 
2127
+ # Use shared three-line formatter (imported lazily to avoid circular imports)
2128
+ from utils.error_utils import format_first_author_mismatch, format_author_mismatch
2129
+
2113
2130
  # Compare first author (most important) using the enhanced name matching
2114
2131
  if comparison_cited and comparison_correct:
2115
2132
  cited_first = comparison_cited[0]
@@ -2119,7 +2136,7 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2119
2136
  # Use consistent display format for both names
2120
2137
  cited_display = format_author_for_display(cited_first)
2121
2138
  correct_display = format_author_for_display(correct_first)
2122
- return False, f"First author mismatch: '{cited_display}' vs '{correct_display}'"
2139
+ return False, format_first_author_mismatch(cited_display, correct_display)
2123
2140
 
2124
2141
  # For complete verification, check all authors if reasonable number
2125
2142
  if len(comparison_cited) <= 5: # Only do full check for reasonable author counts
@@ -2128,7 +2145,7 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2128
2145
  # Use consistent display format for both names
2129
2146
  cited_display = format_author_for_display(cited_author)
2130
2147
  correct_display = format_author_for_display(correct_author)
2131
- return False, f"Author {i+1} mismatch: '{cited_display}' vs '{correct_display}'"
2148
+ return False, format_author_mismatch(i+1, cited_display, correct_display)
2132
2149
 
2133
2150
  return True, "Authors match"
2134
2151
 
@@ -2512,6 +2529,97 @@ def strip_latex_commands(text):
2512
2529
  return text
2513
2530
 
2514
2531
 
2532
+ def extract_balanced_braces(text, start_pos):
2533
+ """
2534
+ Extract content from balanced braces starting at start_pos.
2535
+
2536
+ This function properly handles nested braces, which is important for LaTeX content
2537
+ where patterns like {Jos{\'e} Meseguer} need to be extracted as complete units.
2538
+
2539
+ Args:
2540
+ text: The text to search in
2541
+ start_pos: Position of the opening brace
2542
+
2543
+ Returns:
2544
+ tuple: (content, end_pos) or (None, start_pos) if no balanced content found
2545
+ """
2546
+ if start_pos >= len(text) or text[start_pos] != '{':
2547
+ return None, start_pos
2548
+
2549
+ brace_count = 1
2550
+ pos = start_pos + 1
2551
+
2552
+ while pos < len(text) and brace_count > 0:
2553
+ if text[pos] == '{':
2554
+ brace_count += 1
2555
+ elif text[pos] == '}':
2556
+ brace_count -= 1
2557
+ pos += 1
2558
+
2559
+ if brace_count == 0:
2560
+ return text[start_pos + 1:pos - 1], pos
2561
+ else:
2562
+ return None, start_pos
2563
+
2564
+
2565
+ def extract_bibinfo_person_content(text):
2566
+ """
2567
+ Extract all person names from \\bibinfo{person}{...} with proper brace handling.
2568
+
2569
+ This function correctly handles nested braces in author names, such as:
2570
+ \\bibinfo{person}{Jos{\\'e} Meseguer}
2571
+
2572
+ Args:
2573
+ text: Text containing \\bibinfo{person}{...} patterns
2574
+
2575
+ Returns:
2576
+ list: List of extracted person names with balanced braces preserved
2577
+ """
2578
+ return extract_bibinfo_field_content(text, 'person', return_all=True)
2579
+
2580
+
2581
+ def extract_bibinfo_field_content(text, field_type, return_all=False):
2582
+ """
2583
+ Extract content from \\bibinfo{field_type}{...} with proper brace handling.
2584
+
2585
+ This function correctly handles nested braces in field content, such as:
2586
+ \\bibinfo{journal}{\\emph{Commun. ACM}}
2587
+
2588
+ Args:
2589
+ text: Text containing \\bibinfo{field_type}{...} patterns
2590
+ field_type: The field type to extract (e.g., 'person', 'journal', 'title')
2591
+ return_all: If True, return list of all matches; if False, return first match or None
2592
+
2593
+ Returns:
2594
+ list or str or None: Extracted content based on return_all parameter
2595
+ """
2596
+ pattern = f'\\\\bibinfo\\{{{re.escape(field_type)}\\}}\\{{'
2597
+ matches = []
2598
+ pos = 0
2599
+
2600
+ while True:
2601
+ match = re.search(pattern, text[pos:])
2602
+ if not match:
2603
+ break
2604
+
2605
+ # Find the start of the content braces
2606
+ brace_start = pos + match.end() - 1 # -1 because we want the opening brace
2607
+ content, end_pos = extract_balanced_braces(text, brace_start)
2608
+
2609
+ if content is not None:
2610
+ matches.append(content)
2611
+ pos = end_pos
2612
+ if not return_all:
2613
+ break # Return first match only
2614
+ else:
2615
+ pos += match.end()
2616
+
2617
+ if return_all:
2618
+ return matches
2619
+ else:
2620
+ return matches[0] if matches else None
2621
+
2622
+
2515
2623
  def extract_cited_keys_from_latex(tex_content):
2516
2624
  r"""
2517
2625
  Extract citation keys from LaTeX content by finding \cite{} commands.
@@ -2936,8 +3044,8 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
2936
3044
 
2937
3045
  if brace_count == 0:
2938
3046
  author_content = content[start_pos:pos-1]
2939
- # Extract individual authors from \bibinfo{person}{Name} tags
2940
- person_matches = re.findall(r'\\bibinfo\{person\}\{([^}]+)\}', author_content)
3047
+ # Extract individual authors from \bibinfo{person}{Name} tags using balanced brace extraction
3048
+ person_matches = extract_bibinfo_person_content(author_content)
2941
3049
  if person_matches:
2942
3050
  # Clean and format author names
2943
3051
  authors = []