academic-refchecker 1.2.43__py3-none-any.whl → 1.2.45__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __version__.py +1 -1
- {academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/METADATA +25 -9
- {academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/RECORD +19 -19
- checkers/crossref.py +6 -7
- checkers/github_checker.py +13 -3
- checkers/openalex.py +6 -7
- checkers/openreview_checker.py +7 -4
- checkers/semantic_scholar.py +9 -8
- checkers/webpage_checker.py +7 -2
- core/parallel_processor.py +5 -2
- core/refchecker.py +53 -44
- utils/doi_utils.py +6 -12
- utils/error_utils.py +145 -3
- utils/text_utils.py +115 -9
- utils/url_utils.py +17 -0
- {academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/top_level.txt +0 -0
__version__.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: academic-refchecker
|
|
3
|
-
Version: 1.2.43
|
|
3
|
+
Version: 1.2.45
|
|
4
4
|
Summary: A comprehensive tool for validating reference accuracy in academic papers
|
|
5
5
|
Author-email: Mark Russinovich <markrussinovich@hotmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -78,7 +78,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
|
|
|
78
78
|
Verified URL: https://www.semanticscholar.org/paper/5f4ac1ac7ca4b17d3db1b52d9aafd9e8b26c0d7
|
|
79
79
|
ArXiv URL: https://arxiv.org/abs/1610.10099
|
|
80
80
|
DOI URL: https://doi.org/10.48550/arxiv.1610.10099
|
|
81
|
-
⚠️ Warning: Year mismatch:
|
|
81
|
+
⚠️ Warning: Year mismatch:
|
|
82
|
+
cited: '2017'
|
|
83
|
+
actual: '2016'
|
|
82
84
|
|
|
83
85
|
[2/45] Effective approaches to attention-based neural machine translation
|
|
84
86
|
Minh-Thang Luong, Hieu Pham, Christopher D. Manning
|
|
@@ -87,7 +89,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
|
|
|
87
89
|
Verified URL: https://www.semanticscholar.org/paper/93499a7c7f699b6630a86fad964536f9423bb6d0
|
|
88
90
|
ArXiv URL: https://arxiv.org/abs/1508.04025
|
|
89
91
|
DOI URL: https://doi.org/10.18653/v1/d15-1166
|
|
90
|
-
❌ Error: First author mismatch:
|
|
92
|
+
❌ Error: First author mismatch:
|
|
93
|
+
cited: 'Minh-Thang Luong'
|
|
94
|
+
actual: 'Thang Luong'
|
|
91
95
|
|
|
92
96
|
[3/45] Deep Residual Learning for Image Recognition
|
|
93
97
|
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
|
|
@@ -98,7 +102,9 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu
|
|
|
98
102
|
Verified URL: https://www.semanticscholar.org/paper/2c03df8b48bf3fa39054345bafabfeff15bfd11d
|
|
99
103
|
ArXiv URL: https://arxiv.org/abs/1512.03385
|
|
100
104
|
DOI URL: https://doi.org/10.1109/CVPR.2016.90
|
|
101
|
-
❌ Error: DOI mismatch:
|
|
105
|
+
❌ Error: DOI mismatch:
|
|
106
|
+
cited: '10.1109/CVPR.2016.91'
|
|
107
|
+
actual: '10.1109/CVPR.2016.90'
|
|
102
108
|
|
|
103
109
|
============================================================
|
|
104
110
|
📋 SUMMARY
|
|
@@ -382,7 +388,9 @@ This enhanced URL display helps users access multiple authoritative sources for
|
|
|
382
388
|
Verified URL: https://www.semanticscholar.org/paper/a1b2c3d4e5f6789012345678901234567890abcd
|
|
383
389
|
ArXiv URL: https://arxiv.org/abs/2312.02119
|
|
384
390
|
DOI URL: https://doi.org/10.48550/arxiv.2312.02119
|
|
385
|
-
❌ Error: First author mismatch:
|
|
391
|
+
❌ Error: First author mismatch:
|
|
392
|
+
cited: 'T. Xie'
|
|
393
|
+
actual: 'Zhao Xu'
|
|
386
394
|
```
|
|
387
395
|
- `title`: Title discrepancies
|
|
388
396
|
```
|
|
@@ -392,7 +400,9 @@ This enhanced URL display helps users access multiple authoritative sources for
|
|
|
392
400
|
Verified URL: https://www.semanticscholar.org/paper/df2b0e26d0599ce3e70df8a9da02e51594e0e992
|
|
393
401
|
ArXiv URL: https://arxiv.org/abs/1810.04805
|
|
394
402
|
DOI URL: https://doi.org/10.18653/v1/n19-1423
|
|
395
|
-
❌ Error: Title mismatch:
|
|
403
|
+
❌ Error: Title mismatch:
|
|
404
|
+
cited: 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding'
|
|
405
|
+
actual: 'BERT: Pre-training of Deep Bidirectional Transformers for Language Comprehension'
|
|
396
406
|
```
|
|
397
407
|
- `arxiv_id`: Incorrect URLs or arXiv IDs
|
|
398
408
|
```
|
|
@@ -415,7 +425,9 @@ This enhanced URL display helps users access multiple authoritative sources for
|
|
|
415
425
|
Verified URL: https://www.semanticscholar.org/paper/204e3073870fae3d05bcbc2f6a8e263d9b72e776
|
|
416
426
|
ArXiv URL: https://arxiv.org/abs/1706.03762
|
|
417
427
|
DOI URL: https://doi.org/10.48550/arXiv.1706.03762
|
|
418
|
-
❌ Error: DOI mismatch:
|
|
428
|
+
❌ Error: DOI mismatch:
|
|
429
|
+
cited: '10.5555/3295222.3295349'
|
|
430
|
+
actual: '10.48550/arXiv.1706.03762'
|
|
419
431
|
```
|
|
420
432
|
|
|
421
433
|
- **⚠️ Warnings**: Minor issues that may need attention
|
|
@@ -428,7 +440,9 @@ This enhanced URL display helps users access multiple authoritative sources for
|
|
|
428
440
|
Verified URL: https://www.semanticscholar.org/paper/f1a2b3c4d5e6f7890123456789012345678901ab
|
|
429
441
|
ArXiv URL: https://arxiv.org/abs/2310.03684
|
|
430
442
|
DOI URL: https://doi.org/10.48550/arxiv.2310.03684
|
|
431
|
-
⚠️ Warning: Year mismatch:
|
|
443
|
+
⚠️ Warning: Year mismatch:
|
|
444
|
+
cited: '2024'
|
|
445
|
+
actual: '2023'
|
|
432
446
|
```
|
|
433
447
|
- `venue`: Venue format variations
|
|
434
448
|
```
|
|
@@ -439,7 +453,9 @@ This enhanced URL display helps users access multiple authoritative sources for
|
|
|
439
453
|
Verified URL: https://www.semanticscholar.org/paper/c1d2e3f4a5b6c7d8e9f0123456789012345678ab
|
|
440
454
|
ArXiv URL: https://arxiv.org/abs/2403.02151
|
|
441
455
|
DOI URL: https://doi.org/10.48550/arxiv.2403.02151
|
|
442
|
-
⚠️ Warning: Venue mismatch:
|
|
456
|
+
⚠️ Warning: Venue mismatch:
|
|
457
|
+
cited: 'arXiv, 2024'
|
|
458
|
+
actual: 'Neural Information Processing Systems'
|
|
443
459
|
```
|
|
444
460
|
|
|
445
461
|
- **❓ Unverified**: References that couldn't be verified with any of the checker APIs
|
|
@@ -1,21 +1,21 @@
|
|
|
1
|
-
__version__.py,sha256=
|
|
2
|
-
academic_refchecker-1.2.
|
|
1
|
+
__version__.py,sha256=8vdrigO4-YfHufQMfh_RQ9NlN5btmqndss2dAOLxa1Q,65
|
|
2
|
+
academic_refchecker-1.2.45.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
|
|
3
3
|
checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
|
|
4
|
-
checkers/crossref.py,sha256=
|
|
4
|
+
checkers/crossref.py,sha256=cLYmSzE8ehJ5sNko_R3fEiGBGiPH5_HxLhFM-pCfDRM,20378
|
|
5
5
|
checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
|
|
6
|
-
checkers/github_checker.py,sha256=
|
|
6
|
+
checkers/github_checker.py,sha256=GoepG4aRRUqAomkM4HgOSNf20BPxQgocZEpsk0ZTZZU,14003
|
|
7
7
|
checkers/local_semantic_scholar.py,sha256=DgGMjmR_w_saz2UTMatEhfXbhUED9tUmDG3vlZAIzc4,20428
|
|
8
|
-
checkers/openalex.py,sha256=
|
|
9
|
-
checkers/openreview_checker.py,sha256=
|
|
10
|
-
checkers/semantic_scholar.py,sha256=
|
|
11
|
-
checkers/webpage_checker.py,sha256=
|
|
8
|
+
checkers/openalex.py,sha256=Fbc7iscZzmXjAZxH32PDX2r2Nwo9b5Ku-Sh1Ut9KpLA,19550
|
|
9
|
+
checkers/openreview_checker.py,sha256=EYRcVu7sZzrFjqdOEKp72vd0S3sXSSjTgI9jsAVpdD0,20524
|
|
10
|
+
checkers/semantic_scholar.py,sha256=0LcVahf3twyHqaD7bQ2eJiTyg-AQ9NGvVohb9nqaHdA,34884
|
|
11
|
+
checkers/webpage_checker.py,sha256=woY8mNgZ4Lr9Ug53CN-Xo_2P62BTpR2u_FZyUPgTEuA,21833
|
|
12
12
|
config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
|
|
13
13
|
config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
|
|
14
14
|
config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
|
|
15
15
|
core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
|
|
16
16
|
core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
|
|
17
|
-
core/parallel_processor.py,sha256=
|
|
18
|
-
core/refchecker.py,sha256=
|
|
17
|
+
core/parallel_processor.py,sha256=AOnjqhBHXlSb1c-PSunat9Eug5y04gOygwbHdPUqxgk,17202
|
|
18
|
+
core/refchecker.py,sha256=lU6r9cKpB8Fc4Wd7vOqdqhxP9cwYEoB6D4PlYznglGY,274337
|
|
19
19
|
database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
|
|
20
20
|
database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
|
|
21
21
|
llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -33,14 +33,14 @@ utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,1
|
|
|
33
33
|
utils/bibtex_parser.py,sha256=a89NLy_q2kwED4QFJgxWFgPQOJBV73bIUL3RS_Urmro,15231
|
|
34
34
|
utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
|
|
35
35
|
utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
|
|
36
|
-
utils/doi_utils.py,sha256=
|
|
37
|
-
utils/error_utils.py,sha256=
|
|
36
|
+
utils/doi_utils.py,sha256=qAf2slTLkgBwwnkYsWMkwMSDf6q8wNqd2YwNxhQudEc,3995
|
|
37
|
+
utils/error_utils.py,sha256=wrLQhVJIIWUzcTSOLVvrvCAtMRNXt_1YYL5OXYnYf0A,11085
|
|
38
38
|
utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
|
|
39
|
-
utils/text_utils.py,sha256=
|
|
39
|
+
utils/text_utils.py,sha256=T67Y-HSNokj-mOcdCtOcULNviBxyaG9xTjRd_l9titI,210088
|
|
40
40
|
utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
|
|
41
|
-
utils/url_utils.py,sha256=
|
|
42
|
-
academic_refchecker-1.2.
|
|
43
|
-
academic_refchecker-1.2.
|
|
44
|
-
academic_refchecker-1.2.
|
|
45
|
-
academic_refchecker-1.2.
|
|
46
|
-
academic_refchecker-1.2.
|
|
41
|
+
utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
|
|
42
|
+
academic_refchecker-1.2.45.dist-info/METADATA,sha256=mY4M9FRaDKcyS5yOFvR3X0Y0bj47_YmZeMayvrrpS38,22576
|
|
43
|
+
academic_refchecker-1.2.45.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
44
|
+
academic_refchecker-1.2.45.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
|
|
45
|
+
academic_refchecker-1.2.45.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
|
|
46
|
+
academic_refchecker-1.2.45.dist-info/RECORD,,
|
checkers/crossref.py
CHANGED
|
@@ -31,6 +31,7 @@ import re
|
|
|
31
31
|
from typing import Dict, List, Tuple, Optional, Any, Union
|
|
32
32
|
from urllib.parse import quote_plus
|
|
33
33
|
from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
|
|
34
|
+
from utils.error_utils import format_year_mismatch, format_doi_mismatch
|
|
34
35
|
from config.settings import get_config
|
|
35
36
|
|
|
36
37
|
# Set up logging
|
|
@@ -478,21 +479,19 @@ class CrossRefReferenceChecker:
|
|
|
478
479
|
if year and work_year and year != work_year:
|
|
479
480
|
errors.append({
|
|
480
481
|
'warning_type': 'year',
|
|
481
|
-
'warning_details':
|
|
482
|
+
'warning_details': format_year_mismatch(year, work_year),
|
|
482
483
|
'ref_year_correct': work_year
|
|
483
484
|
})
|
|
484
485
|
|
|
485
486
|
# Verify DOI
|
|
486
487
|
work_doi = work_data.get('DOI')
|
|
487
488
|
if doi and work_doi:
|
|
488
|
-
#
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
if cited_doi_clean.lower() != work_doi_clean.lower():
|
|
489
|
+
# Compare DOIs using the proper comparison function
|
|
490
|
+
from utils.doi_utils import compare_dois
|
|
491
|
+
if not compare_dois(doi, work_doi):
|
|
493
492
|
errors.append({
|
|
494
493
|
'error_type': 'doi',
|
|
495
|
-
'error_details':
|
|
494
|
+
'error_details': format_doi_mismatch(doi, work_doi),
|
|
496
495
|
'ref_doi_correct': work_doi
|
|
497
496
|
})
|
|
498
497
|
|
checkers/github_checker.py
CHANGED
|
@@ -169,9 +169,14 @@ class GitHubChecker:
|
|
|
169
169
|
if cited_title:
|
|
170
170
|
title_match = self._check_title_match(cited_title, actual_name, actual_description)
|
|
171
171
|
if not title_match:
|
|
172
|
+
from utils.error_utils import format_title_mismatch
|
|
173
|
+
details = format_title_mismatch(cited_title, actual_name)
|
|
174
|
+
if actual_description:
|
|
175
|
+
snippet = actual_description[:100] + ('...' if len(actual_description) > 100 else '')
|
|
176
|
+
details += f" ({snippet})"
|
|
172
177
|
errors.append({
|
|
173
178
|
"warning_type": "title",
|
|
174
|
-
"warning_details":
|
|
179
|
+
"warning_details": details
|
|
175
180
|
})
|
|
176
181
|
|
|
177
182
|
# Verify authors
|
|
@@ -180,9 +185,13 @@ class GitHubChecker:
|
|
|
180
185
|
author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
|
|
181
186
|
author_match = self._check_author_match(author_str, actual_owner, actual_owner_name)
|
|
182
187
|
if not author_match:
|
|
188
|
+
from utils.error_utils import format_three_line_mismatch
|
|
189
|
+
left = author_str
|
|
190
|
+
right = f"{actual_owner} ({actual_owner_name})" if actual_owner_name else actual_owner
|
|
191
|
+
details = format_three_line_mismatch("Author mismatch", left, right)
|
|
183
192
|
errors.append({
|
|
184
193
|
"warning_type": "author",
|
|
185
|
-
"warning_details":
|
|
194
|
+
"warning_details": details
|
|
186
195
|
})
|
|
187
196
|
|
|
188
197
|
# Verify year
|
|
@@ -191,9 +200,10 @@ class GitHubChecker:
|
|
|
191
200
|
try:
|
|
192
201
|
cited_year_int = int(cited_year)
|
|
193
202
|
if cited_year_int < creation_year:
|
|
203
|
+
from utils.error_utils import format_year_mismatch
|
|
194
204
|
errors.append({
|
|
195
205
|
"warning_type": "year",
|
|
196
|
-
"warning_details":
|
|
206
|
+
"warning_details": format_year_mismatch(cited_year, creation_year),
|
|
197
207
|
"ref_year_correct": str(creation_year)
|
|
198
208
|
})
|
|
199
209
|
except (ValueError, TypeError):
|
checkers/openalex.py
CHANGED
|
@@ -33,6 +33,7 @@ import re
|
|
|
33
33
|
from typing import Dict, List, Tuple, Optional, Any, Union
|
|
34
34
|
from urllib.parse import quote_plus
|
|
35
35
|
from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
|
|
36
|
+
from utils.error_utils import format_year_mismatch, format_doi_mismatch
|
|
36
37
|
from config.settings import get_config
|
|
37
38
|
|
|
38
39
|
# Set up logging
|
|
@@ -448,7 +449,7 @@ class OpenAlexReferenceChecker:
|
|
|
448
449
|
if year and work_year and year != work_year:
|
|
449
450
|
errors.append({
|
|
450
451
|
'warning_type': 'year',
|
|
451
|
-
'warning_details':
|
|
452
|
+
'warning_details': format_year_mismatch(year, work_year),
|
|
452
453
|
'ref_year_correct': work_year
|
|
453
454
|
})
|
|
454
455
|
|
|
@@ -458,14 +459,12 @@ class OpenAlexReferenceChecker:
|
|
|
458
459
|
work_doi = work_data['ids']['doi']
|
|
459
460
|
|
|
460
461
|
if doi and work_doi:
|
|
461
|
-
#
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
if cited_doi_clean.lower() != work_doi_clean.lower():
|
|
462
|
+
# Compare DOIs using the proper comparison function
|
|
463
|
+
from utils.doi_utils import compare_dois
|
|
464
|
+
if not compare_dois(doi, work_doi):
|
|
466
465
|
errors.append({
|
|
467
466
|
'error_type': 'doi',
|
|
468
|
-
'error_details':
|
|
467
|
+
'error_details': format_doi_mismatch(doi, work_doi),
|
|
469
468
|
'ref_doi_correct': work_doi
|
|
470
469
|
})
|
|
471
470
|
|
checkers/openreview_checker.py
CHANGED
|
@@ -425,9 +425,11 @@ class OpenReviewReferenceChecker:
|
|
|
425
425
|
if cited_title and paper_title:
|
|
426
426
|
similarity = calculate_title_similarity(cited_title, paper_title)
|
|
427
427
|
if similarity < 0.7: # Using a reasonable threshold
|
|
428
|
+
from utils.error_utils import format_title_mismatch
|
|
429
|
+
details = format_title_mismatch(cited_title, paper_title) + f" (similarity: {similarity:.2f})"
|
|
428
430
|
errors.append({
|
|
429
431
|
"warning_type": "title",
|
|
430
|
-
"warning_details":
|
|
432
|
+
"warning_details": details
|
|
431
433
|
})
|
|
432
434
|
|
|
433
435
|
# Check authors
|
|
@@ -460,9 +462,10 @@ class OpenReviewReferenceChecker:
|
|
|
460
462
|
|
|
461
463
|
is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
|
|
462
464
|
if is_different and year_message:
|
|
465
|
+
from utils.error_utils import format_year_mismatch
|
|
463
466
|
errors.append({
|
|
464
467
|
"warning_type": "year",
|
|
465
|
-
"warning_details":
|
|
468
|
+
"warning_details": format_year_mismatch(cited_year_int, paper_year_int)
|
|
466
469
|
})
|
|
467
470
|
except (ValueError, TypeError):
|
|
468
471
|
pass # Skip year validation if conversion fails
|
|
@@ -473,10 +476,10 @@ class OpenReviewReferenceChecker:
|
|
|
473
476
|
|
|
474
477
|
if cited_venue and paper_venue:
|
|
475
478
|
if are_venues_substantially_different(cited_venue, paper_venue):
|
|
476
|
-
from utils.error_utils import
|
|
479
|
+
from utils.error_utils import format_venue_mismatch
|
|
477
480
|
errors.append({
|
|
478
481
|
"warning_type": "venue",
|
|
479
|
-
"warning_details":
|
|
482
|
+
"warning_details": format_venue_mismatch(cited_venue, paper_venue)
|
|
480
483
|
})
|
|
481
484
|
|
|
482
485
|
# Create verified data structure
|
checkers/semantic_scholar.py
CHANGED
|
@@ -29,6 +29,7 @@ import logging
|
|
|
29
29
|
import re
|
|
30
30
|
from typing import Dict, List, Tuple, Optional, Any, Union
|
|
31
31
|
from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search
|
|
32
|
+
from utils.error_utils import format_title_mismatch
|
|
32
33
|
from config.settings import get_config
|
|
33
34
|
|
|
34
35
|
# Set up logging
|
|
@@ -471,7 +472,7 @@ class NonArxivReferenceChecker:
|
|
|
471
472
|
if found_title and title_similarity < SIMILARITY_THRESHOLD:
|
|
472
473
|
errors.append({
|
|
473
474
|
'error_type': 'title',
|
|
474
|
-
'error_details':
|
|
475
|
+
'error_details': format_title_mismatch(title, found_title),
|
|
475
476
|
'ref_title_correct': paper_data.get('title', '')
|
|
476
477
|
})
|
|
477
478
|
|
|
@@ -525,9 +526,10 @@ class NonArxivReferenceChecker:
|
|
|
525
526
|
is_different, warning_message = is_year_substantially_different(year, paper_year, context)
|
|
526
527
|
|
|
527
528
|
if is_different and warning_message:
|
|
529
|
+
from utils.error_utils import format_year_mismatch
|
|
528
530
|
errors.append({
|
|
529
531
|
'warning_type': 'year',
|
|
530
|
-
'warning_details':
|
|
532
|
+
'warning_details': format_year_mismatch(year, paper_year),
|
|
531
533
|
'ref_year_correct': paper_year
|
|
532
534
|
})
|
|
533
535
|
|
|
@@ -591,14 +593,13 @@ class NonArxivReferenceChecker:
|
|
|
591
593
|
if external_ids and 'DOI' in external_ids:
|
|
592
594
|
paper_doi = external_ids['DOI']
|
|
593
595
|
|
|
594
|
-
# Compare DOIs
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
if cited_doi_clean and paper_doi_clean and cited_doi_clean.lower() != paper_doi_clean.lower():
|
|
596
|
+
# Compare DOIs using the proper comparison function
|
|
597
|
+
from utils.doi_utils import compare_dois
|
|
598
|
+
if doi and paper_doi and not compare_dois(doi, paper_doi):
|
|
599
|
+
from utils.error_utils import format_doi_mismatch
|
|
599
600
|
errors.append({
|
|
600
601
|
'error_type': 'doi',
|
|
601
|
-
'error_details':
|
|
602
|
+
'error_details': format_doi_mismatch(doi, paper_doi),
|
|
602
603
|
'ref_doi_correct': paper_doi
|
|
603
604
|
})
|
|
604
605
|
|
checkers/webpage_checker.py
CHANGED
|
@@ -182,9 +182,10 @@ class WebPageChecker:
|
|
|
182
182
|
# Check title match
|
|
183
183
|
if cited_title and page_title:
|
|
184
184
|
if not self._check_title_match(cited_title, page_title, page_description):
|
|
185
|
+
from utils.error_utils import format_title_mismatch
|
|
185
186
|
errors.append({
|
|
186
187
|
"warning_type": "title",
|
|
187
|
-
"warning_details":
|
|
188
|
+
"warning_details": format_title_mismatch(cited_title, page_title)
|
|
188
189
|
})
|
|
189
190
|
|
|
190
191
|
# Check if this is a documentation page for the cited topic
|
|
@@ -201,9 +202,13 @@ class WebPageChecker:
|
|
|
201
202
|
if cited_authors:
|
|
202
203
|
author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
|
|
203
204
|
if not self._check_author_match(author_str, site_info, web_url):
|
|
205
|
+
from utils.error_utils import format_three_line_mismatch
|
|
206
|
+
left = author_str
|
|
207
|
+
right = site_info.get('organization', 'unknown')
|
|
208
|
+
details = format_three_line_mismatch("Author/organization mismatch", left, right)
|
|
204
209
|
errors.append({
|
|
205
210
|
"warning_type": "author",
|
|
206
|
-
"warning_details":
|
|
211
|
+
"warning_details": details
|
|
207
212
|
})
|
|
208
213
|
|
|
209
214
|
logger.debug(f"Web page verification completed for: {web_url}")
|
core/parallel_processor.py
CHANGED
|
@@ -352,12 +352,15 @@ class ParallelReferenceProcessor:
|
|
|
352
352
|
error_type = error.get('error_type') or error.get('warning_type')
|
|
353
353
|
error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
|
|
354
354
|
|
|
355
|
+
from utils.error_utils import print_labeled_multiline
|
|
356
|
+
|
|
355
357
|
if error_type == 'arxiv_id':
|
|
358
|
+
# Keep existing style for arXiv ID errors
|
|
356
359
|
print(f" ❌ {error_details}")
|
|
357
360
|
elif 'error_type' in error:
|
|
358
|
-
|
|
361
|
+
print_labeled_multiline("❌ Error", error_details)
|
|
359
362
|
else:
|
|
360
|
-
|
|
363
|
+
print_labeled_multiline("⚠️ Warning", error_details)
|
|
361
364
|
|
|
362
365
|
# Show timing info for slow references
|
|
363
366
|
if result.processing_time > 5.0:
|
core/refchecker.py
CHANGED
|
@@ -1900,10 +1900,11 @@ class ArxivReferenceChecker:
|
|
|
1900
1900
|
db_title = self.non_arxiv_checker.normalize_paper_title(paper_data.get('title'))
|
|
1901
1901
|
|
|
1902
1902
|
if normalized_title != db_title:
|
|
1903
|
+
from utils.error_utils import format_title_mismatch
|
|
1903
1904
|
logger.debug(f"DB Verification: Title mismatch - cited: '{title}', actual: '{paper_data.get('title')}'")
|
|
1904
1905
|
errors.append({
|
|
1905
1906
|
'error_type': 'title',
|
|
1906
|
-
'error_details':
|
|
1907
|
+
'error_details': format_title_mismatch(title, paper_data.get('title')),
|
|
1907
1908
|
'ref_title_correct': paper_data.get('title')
|
|
1908
1909
|
})
|
|
1909
1910
|
|
|
@@ -1925,30 +1926,36 @@ class ArxivReferenceChecker:
|
|
|
1925
1926
|
paper_year = paper_data.get('year')
|
|
1926
1927
|
if year and paper_year and year != paper_year:
|
|
1927
1928
|
logger.debug(f"DB Verification: Year mismatch - cited: {year}, actual: {paper_year}")
|
|
1929
|
+
from utils.error_utils import format_year_mismatch
|
|
1928
1930
|
errors.append({
|
|
1929
1931
|
'warning_type': 'year',
|
|
1930
|
-
'warning_details':
|
|
1932
|
+
'warning_details': format_year_mismatch(year, paper_year),
|
|
1931
1933
|
'ref_year_correct': paper_year
|
|
1932
1934
|
})
|
|
1933
1935
|
|
|
1934
1936
|
# Verify DOI
|
|
1935
|
-
if doi and external_ids.get('DOI')
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
|
|
1949
|
-
|
|
1950
|
-
|
|
1951
|
-
|
|
1937
|
+
if doi and external_ids.get('DOI'):
|
|
1938
|
+
from utils.doi_utils import compare_dois, normalize_doi
|
|
1939
|
+
|
|
1940
|
+
# Use proper DOI comparison first
|
|
1941
|
+
if not compare_dois(doi, external_ids['DOI']):
|
|
1942
|
+
# Check if the cited DOI is a partial match of the actual DOI
|
|
1943
|
+
# This handles cases like "10.1111/j.2044-8260." vs "10.1111/J.2044-8260.1997.TB01237.X"
|
|
1944
|
+
cited_doi_normalized = normalize_doi(doi)
|
|
1945
|
+
actual_doi_normalized = normalize_doi(external_ids['DOI'])
|
|
1946
|
+
|
|
1947
|
+
# If the cited DOI is a prefix of the actual DOI, it's likely a partial citation
|
|
1948
|
+
# Only flag as error if it's not a reasonable partial match
|
|
1949
|
+
if not actual_doi_normalized.startswith(cited_doi_normalized.rstrip('.')):
|
|
1950
|
+
logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
|
|
1951
|
+
from utils.error_utils import format_doi_mismatch
|
|
1952
|
+
errors.append({
|
|
1953
|
+
'error_type': 'doi',
|
|
1954
|
+
'error_details': format_doi_mismatch(doi, external_ids['DOI']),
|
|
1955
|
+
'ref_doi_correct': external_ids['DOI']
|
|
1956
|
+
})
|
|
1957
|
+
else:
|
|
1958
|
+
logger.debug(f"DB Verification: DOI partial match - cited: {doi}, actual: {external_ids['DOI']} (acceptable)")
|
|
1952
1959
|
|
|
1953
1960
|
# Verify ArXiv ID
|
|
1954
1961
|
if reference.get('type') == 'arxiv':
|
|
@@ -3489,8 +3496,9 @@ class ArxivReferenceChecker:
|
|
|
3489
3496
|
author_field_match = re.search(r'\\bibfield\{author\}\{(.*?)\}(?:\s*\\bibinfo\{year\}|\s*\\newblock|$)', content, re.DOTALL)
|
|
3490
3497
|
if author_field_match:
|
|
3491
3498
|
author_content = author_field_match.group(1)
|
|
3492
|
-
# Find all \bibinfo{person}{Name} entries
|
|
3493
|
-
|
|
3499
|
+
# Find all \bibinfo{person}{Name} entries using balanced brace extraction
|
|
3500
|
+
from utils.text_utils import extract_bibinfo_person_content
|
|
3501
|
+
person_matches = extract_bibinfo_person_content(author_content)
|
|
3494
3502
|
if person_matches:
|
|
3495
3503
|
authors = []
|
|
3496
3504
|
for person in person_matches:
|
|
@@ -3502,33 +3510,31 @@ class ArxivReferenceChecker:
|
|
|
3502
3510
|
authors.append(clean_name)
|
|
3503
3511
|
ref['authors'] = authors
|
|
3504
3512
|
|
|
3505
|
-
#
|
|
3506
|
-
|
|
3507
|
-
|
|
3508
|
-
|
|
3513
|
+
# Import balanced brace extraction function
|
|
3514
|
+
from utils.text_utils import extract_bibinfo_field_content
|
|
3515
|
+
|
|
3516
|
+
# Extract title from \bibinfo{title}{Title} using balanced brace extraction
|
|
3517
|
+
title_content = extract_bibinfo_field_content(content, 'title')
|
|
3518
|
+
if title_content:
|
|
3519
|
+
title = strip_latex_commands(title_content).strip()
|
|
3509
3520
|
ref['title'] = title
|
|
3510
3521
|
|
|
3511
|
-
# Extract venue/journal from various fields
|
|
3512
|
-
|
|
3513
|
-
r'\\bibinfo\{booktitle\}\{([^}]+)\}',
|
|
3514
|
-
r'\\bibinfo\{journal\}\{([^}]+)\}',
|
|
3515
|
-
r'\\bibinfo\{series\}\{([^}]+)\}',
|
|
3516
|
-
r'\\bibinfo\{note\}\{([^}]+)\}'
|
|
3517
|
-
]
|
|
3522
|
+
# Extract venue/journal from various fields using balanced brace extraction
|
|
3523
|
+
venue_field_types = ['booktitle', 'journal', 'series', 'note']
|
|
3518
3524
|
|
|
3519
|
-
for
|
|
3520
|
-
|
|
3521
|
-
if
|
|
3522
|
-
venue = strip_latex_commands(
|
|
3525
|
+
for field_type in venue_field_types:
|
|
3526
|
+
venue_content = extract_bibinfo_field_content(content, field_type)
|
|
3527
|
+
if venue_content:
|
|
3528
|
+
venue = strip_latex_commands(venue_content).strip()
|
|
3523
3529
|
if venue:
|
|
3524
3530
|
ref['venue'] = venue
|
|
3525
3531
|
ref['journal'] = venue # For compatibility
|
|
3526
3532
|
break
|
|
3527
3533
|
|
|
3528
|
-
# Extract DOI
|
|
3529
|
-
|
|
3530
|
-
if
|
|
3531
|
-
ref['doi'] =
|
|
3534
|
+
# Extract DOI using balanced brace extraction
|
|
3535
|
+
doi_content = extract_bibinfo_field_content(content, 'doi')
|
|
3536
|
+
if doi_content:
|
|
3537
|
+
ref['doi'] = doi_content.strip()
|
|
3532
3538
|
|
|
3533
3539
|
# Extract ArXiv ID from \showeprint[arxiv]{ID}
|
|
3534
3540
|
arxiv_match = re.search(r'\\showeprint\[arxiv\]\{([^}]+)\}', content)
|
|
@@ -5048,7 +5054,8 @@ class ArxivReferenceChecker:
|
|
|
5048
5054
|
correct_first = correct_authors[0]
|
|
5049
5055
|
|
|
5050
5056
|
if not enhanced_name_match(cited_first, correct_first):
|
|
5051
|
-
|
|
5057
|
+
from utils.error_utils import format_first_author_mismatch
|
|
5058
|
+
return False, format_first_author_mismatch(cited_first, correct_first)
|
|
5052
5059
|
|
|
5053
5060
|
return True, "Authors match"
|
|
5054
5061
|
|
|
@@ -5181,7 +5188,7 @@ class ArxivReferenceChecker:
|
|
|
5181
5188
|
from utils.text_utils import format_authors_for_display
|
|
5182
5189
|
authors = format_authors_for_display(reference.get('authors', []))
|
|
5183
5190
|
year = reference.get('year', '')
|
|
5184
|
-
venue = reference.get('venue', '')
|
|
5191
|
+
venue = reference.get('venue', '') or reference.get('journal', '')
|
|
5185
5192
|
url = reference.get('url', '')
|
|
5186
5193
|
doi = reference.get('doi', '')
|
|
5187
5194
|
# Extract actual reference number from raw text for accurate display
|
|
@@ -5454,12 +5461,14 @@ class ArxivReferenceChecker:
|
|
|
5454
5461
|
error_type = error.get('error_type') or error.get('warning_type')
|
|
5455
5462
|
error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
|
|
5456
5463
|
|
|
5464
|
+
from utils.error_utils import print_labeled_multiline
|
|
5465
|
+
|
|
5457
5466
|
if error_type == 'arxiv_id':
|
|
5458
5467
|
print(f" ❌ {error_details}")
|
|
5459
5468
|
elif 'error_type' in error:
|
|
5460
|
-
|
|
5469
|
+
print_labeled_multiline("❌ Error", error_details)
|
|
5461
5470
|
else:
|
|
5462
|
-
|
|
5471
|
+
print_labeled_multiline("⚠️ Warning", error_details)
|
|
5463
5472
|
|
|
5464
5473
|
def _output_reference_errors(self, reference, errors, url):
|
|
5465
5474
|
"""
|
utils/doi_utils.py
CHANGED
|
@@ -99,6 +99,10 @@ def compare_dois(doi1: str, doi2: str) -> bool:
|
|
|
99
99
|
"""
|
|
100
100
|
Compare two DOIs for equality, handling different formats and prefixes.
|
|
101
101
|
|
|
102
|
+
This function performs exact matching after normalization, which means
|
|
103
|
+
DOIs are only considered equal if they are identical after removing
|
|
104
|
+
prefixes, case differences, and punctuation.
|
|
105
|
+
|
|
102
106
|
Args:
|
|
103
107
|
doi1: First DOI to compare
|
|
104
108
|
doi2: Second DOI to compare
|
|
@@ -109,21 +113,11 @@ def compare_dois(doi1: str, doi2: str) -> bool:
|
|
|
109
113
|
if not doi1 or not doi2:
|
|
110
114
|
return False
|
|
111
115
|
|
|
112
|
-
# Normalize both DOIs (
|
|
116
|
+
# Normalize both DOIs (handles prefixes, case, punctuation)
|
|
113
117
|
norm_doi1 = normalize_doi(doi1)
|
|
114
118
|
norm_doi2 = normalize_doi(doi2)
|
|
115
119
|
|
|
116
|
-
#
|
|
117
|
-
if norm_doi1 == norm_doi2:
|
|
118
|
-
return True
|
|
119
|
-
|
|
120
|
-
# Check if first two components match (publisher.registrant)
|
|
121
|
-
doi1_parts = norm_doi1.split('.')
|
|
122
|
-
doi2_parts = norm_doi2.split('.')
|
|
123
|
-
|
|
124
|
-
if len(doi1_parts) >= 2 and len(doi2_parts) >= 2:
|
|
125
|
-
return doi1_parts[0] == doi2_parts[0] and doi1_parts[1].split('/')[0] == doi2_parts[1].split('/')[0]
|
|
126
|
-
|
|
120
|
+
# DOIs must be exactly identical after normalization
|
|
127
121
|
return norm_doi1 == norm_doi2
|
|
128
122
|
|
|
129
123
|
|
utils/error_utils.py
CHANGED
|
@@ -9,6 +9,86 @@ for reference checkers.
|
|
|
9
9
|
from typing import Dict, List, Any, Optional
|
|
10
10
|
|
|
11
11
|
|
|
12
|
+
def print_labeled_multiline(label: str, text: str) -> None:
|
|
13
|
+
"""
|
|
14
|
+
Print a multi-line message with consistent label formatting.
|
|
15
|
+
|
|
16
|
+
This function ensures consistent indentation for all error and warning messages,
|
|
17
|
+
regardless of emoji width differences in the labels.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
label: The label (e.g., "❌ Error", "⚠️ Warning")
|
|
21
|
+
text: The multi-line text to print
|
|
22
|
+
"""
|
|
23
|
+
prefix = f" {label}: "
|
|
24
|
+
lines = (text or "").splitlines() or [""]
|
|
25
|
+
|
|
26
|
+
# Print the first line with the label prefix
|
|
27
|
+
print(prefix + lines[0])
|
|
28
|
+
|
|
29
|
+
# Print subsequent lines with fixed indentation to ensure consistency
|
|
30
|
+
# Use fixed 15-character indentation to align regardless of emoji width
|
|
31
|
+
fixed_indent = " " * 15
|
|
32
|
+
for line in lines[1:]:
|
|
33
|
+
print(fixed_indent + line)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def format_three_line_mismatch(mismatch_type: str, left: str, right: str) -> str:
|
|
37
|
+
"""
|
|
38
|
+
Format a three-line mismatch message with fixed indentation.
|
|
39
|
+
|
|
40
|
+
This creates a clean, consistently formatted mismatch message that separates
|
|
41
|
+
the mismatch type from the values being compared:
|
|
42
|
+
|
|
43
|
+
Example:
|
|
44
|
+
Title mismatch:
|
|
45
|
+
cited: 'Cited Title'
|
|
46
|
+
actual: 'Correct Title'
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
mismatch_type: The type of mismatch (e.g., "Author 2 mismatch", "Title mismatch")
|
|
50
|
+
left: The cited/incorrect value
|
|
51
|
+
right: The correct value
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
Three-line formatted mismatch message
|
|
55
|
+
"""
|
|
56
|
+
# Ensure mismatch_type ends with a colon
|
|
57
|
+
if not mismatch_type.endswith(":"):
|
|
58
|
+
mismatch_type = mismatch_type.rstrip() + ":"
|
|
59
|
+
|
|
60
|
+
# Use fixed indentation for clean, consistent alignment
|
|
61
|
+
indent = "" # spaces for content indentation
|
|
62
|
+
vs_indent = "" # vs: starts at column 0 for clear visual separation
|
|
63
|
+
|
|
64
|
+
return f"{mismatch_type}\n{indent}cited: '{left}'\n{vs_indent}actual: '{right}'"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def format_title_mismatch(cited_title: str, verified_title: str) -> str:
|
|
68
|
+
"""
|
|
69
|
+
Format a three-line title mismatch message.
|
|
70
|
+
|
|
71
|
+
Output format:
|
|
72
|
+
Title mismatch:
|
|
73
|
+
cited: 'Cited Title'
|
|
74
|
+
actual: 'Correct Title'
|
|
75
|
+
"""
|
|
76
|
+
return format_three_line_mismatch("Title mismatch", cited_title, verified_title)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def format_year_mismatch(cited_year: int | str, correct_year: int | str) -> str:
|
|
80
|
+
"""
|
|
81
|
+
Three-line year mismatch message.
|
|
82
|
+
"""
|
|
83
|
+
return format_three_line_mismatch("Year mismatch", str(cited_year), str(correct_year))
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def format_doi_mismatch(cited_doi: str, correct_doi: str) -> str:
|
|
87
|
+
"""
|
|
88
|
+
Three-line DOI mismatch message.
|
|
89
|
+
"""
|
|
90
|
+
return format_three_line_mismatch("DOI mismatch", str(cited_doi), str(correct_doi))
|
|
91
|
+
|
|
12
92
|
def create_author_error(error_details: str, correct_authors: List[Dict[str, str]]) -> Dict[str, str]:
|
|
13
93
|
"""
|
|
14
94
|
Create a standardized author error dictionary.
|
|
@@ -40,7 +120,7 @@ def create_year_warning(cited_year: int, correct_year: int) -> Dict[str, Any]:
|
|
|
40
120
|
"""
|
|
41
121
|
return {
|
|
42
122
|
'warning_type': 'year',
|
|
43
|
-
'warning_details':
|
|
123
|
+
'warning_details': format_year_mismatch(cited_year, correct_year),
|
|
44
124
|
'ref_year_correct': correct_year
|
|
45
125
|
}
|
|
46
126
|
|
|
@@ -64,7 +144,7 @@ def create_doi_error(cited_doi: str, correct_doi: str) -> Optional[Dict[str, str
|
|
|
64
144
|
if cited_doi_clean != correct_doi_clean:
|
|
65
145
|
return {
|
|
66
146
|
'error_type': 'doi',
|
|
67
|
-
'error_details':
|
|
147
|
+
'error_details': format_doi_mismatch(cited_doi, correct_doi),
|
|
68
148
|
'ref_doi_correct': correct_doi
|
|
69
149
|
}
|
|
70
150
|
|
|
@@ -120,11 +200,20 @@ def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]
|
|
|
120
200
|
|
|
121
201
|
return {
|
|
122
202
|
'warning_type': 'venue',
|
|
123
|
-
'warning_details':
|
|
203
|
+
'warning_details': format_three_line_mismatch("Venue mismatch", clean_cited, clean_correct),
|
|
124
204
|
'ref_venue_correct': correct_venue
|
|
125
205
|
}
|
|
126
206
|
|
|
127
207
|
|
|
208
|
+
def format_venue_mismatch(cited_venue: str, verified_venue: str) -> str:
|
|
209
|
+
"""
|
|
210
|
+
Format a three-line venue mismatch message with cleaned venue names.
|
|
211
|
+
"""
|
|
212
|
+
clean_cited = clean_venue_for_comparison(cited_venue)
|
|
213
|
+
clean_verified = clean_venue_for_comparison(verified_venue)
|
|
214
|
+
return format_three_line_mismatch("Venue mismatch", clean_cited, clean_verified)
|
|
215
|
+
|
|
216
|
+
|
|
128
217
|
def create_url_error(error_details: str, correct_url: Optional[str] = None) -> Dict[str, str]:
|
|
129
218
|
"""
|
|
130
219
|
Create a standardized URL error dictionary.
|
|
@@ -189,6 +278,59 @@ def create_generic_warning(warning_type: str, warning_details: str, **kwargs) ->
|
|
|
189
278
|
return warning_dict
|
|
190
279
|
|
|
191
280
|
|
|
281
|
+
def format_author_mismatch(author_number: int, cited_author: str, correct_author: str) -> str:
|
|
282
|
+
"""
|
|
283
|
+
Format a three-line author mismatch message.
|
|
284
|
+
|
|
285
|
+
Args:
|
|
286
|
+
author_number: The author position (1-based)
|
|
287
|
+
cited_author: The cited author name
|
|
288
|
+
correct_author: The correct author name
|
|
289
|
+
|
|
290
|
+
Returns:
|
|
291
|
+
Formatted three-line author mismatch message
|
|
292
|
+
"""
|
|
293
|
+
return format_three_line_mismatch(f"Author {author_number} mismatch", cited_author, correct_author)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def format_first_author_mismatch(cited_author: str, correct_author: str) -> str:
|
|
297
|
+
"""
|
|
298
|
+
Format a three-line first author mismatch message.
|
|
299
|
+
|
|
300
|
+
Args:
|
|
301
|
+
cited_author: The cited first author name
|
|
302
|
+
correct_author: The correct first author name
|
|
303
|
+
|
|
304
|
+
Returns:
|
|
305
|
+
Formatted three-line first author mismatch message
|
|
306
|
+
"""
|
|
307
|
+
return format_three_line_mismatch("First author mismatch", cited_author, correct_author)
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def format_author_count_mismatch(cited_count: int, correct_count: int, cited_authors: list, correct_authors: list) -> str:
|
|
311
|
+
"""
|
|
312
|
+
Format an author count mismatch message showing all cited and correct authors.
|
|
313
|
+
|
|
314
|
+
Args:
|
|
315
|
+
cited_count: Number of cited authors
|
|
316
|
+
correct_count: Number of correct authors
|
|
317
|
+
cited_authors: List of cited author names
|
|
318
|
+
correct_authors: List of correct author names
|
|
319
|
+
|
|
320
|
+
Returns:
|
|
321
|
+
Formatted multi-line author count mismatch message
|
|
322
|
+
"""
|
|
323
|
+
# Create the header with count information
|
|
324
|
+
header = f"Author count mismatch: {cited_count} cited vs {correct_count} correct"
|
|
325
|
+
|
|
326
|
+
# Format author lists
|
|
327
|
+
cited_list = ", ".join(cited_authors) if cited_authors else "None"
|
|
328
|
+
correct_list = ", ".join(correct_authors) if correct_authors else "None"
|
|
329
|
+
|
|
330
|
+
# Use the same format as other mismatches
|
|
331
|
+
return format_three_line_mismatch(header, cited_list, correct_list)
|
|
332
|
+
|
|
333
|
+
|
|
192
334
|
def format_authors_list(authors: List[Dict[str, str]]) -> str:
|
|
193
335
|
"""
|
|
194
336
|
Format a list of author dictionaries into a readable string.
|
utils/text_utils.py
CHANGED
|
@@ -554,6 +554,10 @@ def clean_title_basic(title):
|
|
|
554
554
|
# Remove trailing punctuation
|
|
555
555
|
title = re.sub(r'[.,;:]+$', '', title)
|
|
556
556
|
|
|
557
|
+
# Remove BibTeX publication type indicators at the end (common in Chinese and some international BibTeX styles)
|
|
558
|
+
# [J] = Journal, [C] = Conference, [M] = Monograph/Book, [D] = Dissertation, [P] = Patent, [R] = Report, [S] = Standard
|
|
559
|
+
title = re.sub(r'\s*\[[JCMDPRS]\]\s*$', '', title)
|
|
560
|
+
|
|
557
561
|
return title
|
|
558
562
|
|
|
559
563
|
|
|
@@ -578,6 +582,9 @@ def clean_title_for_search(title):
|
|
|
578
582
|
title = title.replace('\n', ' ').strip()
|
|
579
583
|
title = re.sub(r'\s+', ' ', title) # Normalize whitespace only
|
|
580
584
|
|
|
585
|
+
# Remove BibTeX publication type indicators that are not part of the actual title
|
|
586
|
+
title = re.sub(r'\s*\[[JCMDPRS]\]\s*$', '', title)
|
|
587
|
+
|
|
581
588
|
# Note: We intentionally preserve:
|
|
582
589
|
# - Capitalization (helps with exact matching)
|
|
583
590
|
# - Colons and other meaningful punctuation (structural markers)
|
|
@@ -2076,6 +2083,8 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
|
|
|
2076
2083
|
# The key insight: if the citation has "et al", we should only verify the listed authors
|
|
2077
2084
|
# and not penalize for the authoritative source having more authors
|
|
2078
2085
|
if has_et_al:
|
|
2086
|
+
# Import here to avoid circular imports
|
|
2087
|
+
from utils.error_utils import format_author_mismatch
|
|
2079
2088
|
# For et al cases, check if each cited author matches ANY author in the correct list
|
|
2080
2089
|
# rather than comparing positionally, since author order can vary
|
|
2081
2090
|
for i, cited_author in enumerate(cleaned_cited):
|
|
@@ -2088,10 +2097,11 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
|
|
|
2088
2097
|
break
|
|
2089
2098
|
|
|
2090
2099
|
if not author_found:
|
|
2091
|
-
#
|
|
2092
|
-
|
|
2100
|
+
# Use standardized three-line formatting for author mismatch
|
|
2101
|
+
cited_display = format_author_for_display(cited_author)
|
|
2093
2102
|
full_author_list = ', '.join(correct_names)
|
|
2094
|
-
|
|
2103
|
+
error_msg = format_author_mismatch(i+1, f"{cited_display} (not found in author list - et al case)", f"Correct authors: {full_author_list}")
|
|
2104
|
+
return False, error_msg
|
|
2095
2105
|
|
|
2096
2106
|
return True, f"Authors match (verified {len(cleaned_cited)} of {len(correct_names)} with et al)"
|
|
2097
2107
|
|
|
@@ -2100,7 +2110,9 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
|
|
|
2100
2110
|
# For non-et-al cases, be more strict about count mismatches
|
|
2101
2111
|
# Allow minor flexibility (1 author difference) but not more
|
|
2102
2112
|
if abs(len(cleaned_cited) - len(correct_names)) > 1:
|
|
2103
|
-
|
|
2113
|
+
from utils.error_utils import format_author_count_mismatch
|
|
2114
|
+
error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), cleaned_cited, correct_names)
|
|
2115
|
+
return False, error_msg
|
|
2104
2116
|
|
|
2105
2117
|
# Use the shorter list for comparison
|
|
2106
2118
|
min_len = min(len(cleaned_cited), len(correct_names))
|
|
@@ -2110,6 +2122,9 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
|
|
|
2110
2122
|
comparison_cited = cleaned_cited
|
|
2111
2123
|
comparison_correct = correct_names
|
|
2112
2124
|
|
|
2125
|
+
# Use shared three-line formatter (imported lazily to avoid circular imports)
|
|
2126
|
+
from utils.error_utils import format_first_author_mismatch, format_author_mismatch
|
|
2127
|
+
|
|
2113
2128
|
# Compare first author (most important) using the enhanced name matching
|
|
2114
2129
|
if comparison_cited and comparison_correct:
|
|
2115
2130
|
cited_first = comparison_cited[0]
|
|
@@ -2119,7 +2134,7 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
|
|
|
2119
2134
|
# Use consistent display format for both names
|
|
2120
2135
|
cited_display = format_author_for_display(cited_first)
|
|
2121
2136
|
correct_display = format_author_for_display(correct_first)
|
|
2122
|
-
return False,
|
|
2137
|
+
return False, format_first_author_mismatch(cited_display, correct_display)
|
|
2123
2138
|
|
|
2124
2139
|
# For complete verification, check all authors if reasonable number
|
|
2125
2140
|
if len(comparison_cited) <= 5: # Only do full check for reasonable author counts
|
|
@@ -2128,7 +2143,7 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
|
|
|
2128
2143
|
# Use consistent display format for both names
|
|
2129
2144
|
cited_display = format_author_for_display(cited_author)
|
|
2130
2145
|
correct_display = format_author_for_display(correct_author)
|
|
2131
|
-
return False,
|
|
2146
|
+
return False, format_author_mismatch(i+1, cited_display, correct_display)
|
|
2132
2147
|
|
|
2133
2148
|
return True, "Authors match"
|
|
2134
2149
|
|
|
@@ -2512,6 +2527,97 @@ def strip_latex_commands(text):
|
|
|
2512
2527
|
return text
|
|
2513
2528
|
|
|
2514
2529
|
|
|
2530
|
+
def extract_balanced_braces(text, start_pos):
|
|
2531
|
+
"""
|
|
2532
|
+
Extract content from balanced braces starting at start_pos.
|
|
2533
|
+
|
|
2534
|
+
This function properly handles nested braces, which is important for LaTeX content
|
|
2535
|
+
where patterns like {Jos{\'e} Meseguer} need to be extracted as complete units.
|
|
2536
|
+
|
|
2537
|
+
Args:
|
|
2538
|
+
text: The text to search in
|
|
2539
|
+
start_pos: Position of the opening brace
|
|
2540
|
+
|
|
2541
|
+
Returns:
|
|
2542
|
+
tuple: (content, end_pos) or (None, start_pos) if no balanced content found
|
|
2543
|
+
"""
|
|
2544
|
+
if start_pos >= len(text) or text[start_pos] != '{':
|
|
2545
|
+
return None, start_pos
|
|
2546
|
+
|
|
2547
|
+
brace_count = 1
|
|
2548
|
+
pos = start_pos + 1
|
|
2549
|
+
|
|
2550
|
+
while pos < len(text) and brace_count > 0:
|
|
2551
|
+
if text[pos] == '{':
|
|
2552
|
+
brace_count += 1
|
|
2553
|
+
elif text[pos] == '}':
|
|
2554
|
+
brace_count -= 1
|
|
2555
|
+
pos += 1
|
|
2556
|
+
|
|
2557
|
+
if brace_count == 0:
|
|
2558
|
+
return text[start_pos + 1:pos - 1], pos
|
|
2559
|
+
else:
|
|
2560
|
+
return None, start_pos
|
|
2561
|
+
|
|
2562
|
+
|
|
2563
|
+
def extract_bibinfo_person_content(text):
|
|
2564
|
+
"""
|
|
2565
|
+
Extract all person names from \\bibinfo{person}{...} with proper brace handling.
|
|
2566
|
+
|
|
2567
|
+
This function correctly handles nested braces in author names, such as:
|
|
2568
|
+
\\bibinfo{person}{Jos{\\'e} Meseguer}
|
|
2569
|
+
|
|
2570
|
+
Args:
|
|
2571
|
+
text: Text containing \\bibinfo{person}{...} patterns
|
|
2572
|
+
|
|
2573
|
+
Returns:
|
|
2574
|
+
list: List of extracted person names with balanced braces preserved
|
|
2575
|
+
"""
|
|
2576
|
+
return extract_bibinfo_field_content(text, 'person', return_all=True)
|
|
2577
|
+
|
|
2578
|
+
|
|
2579
|
+
def extract_bibinfo_field_content(text, field_type, return_all=False):
|
|
2580
|
+
"""
|
|
2581
|
+
Extract content from \\bibinfo{field_type}{...} with proper brace handling.
|
|
2582
|
+
|
|
2583
|
+
This function correctly handles nested braces in field content, such as:
|
|
2584
|
+
\\bibinfo{journal}{\\emph{Commun. ACM}}
|
|
2585
|
+
|
|
2586
|
+
Args:
|
|
2587
|
+
text: Text containing \\bibinfo{field_type}{...} patterns
|
|
2588
|
+
field_type: The field type to extract (e.g., 'person', 'journal', 'title')
|
|
2589
|
+
return_all: If True, return list of all matches; if False, return first match or None
|
|
2590
|
+
|
|
2591
|
+
Returns:
|
|
2592
|
+
list or str or None: Extracted content based on return_all parameter
|
|
2593
|
+
"""
|
|
2594
|
+
pattern = f'\\\\bibinfo\\{{{re.escape(field_type)}\\}}\\{{'
|
|
2595
|
+
matches = []
|
|
2596
|
+
pos = 0
|
|
2597
|
+
|
|
2598
|
+
while True:
|
|
2599
|
+
match = re.search(pattern, text[pos:])
|
|
2600
|
+
if not match:
|
|
2601
|
+
break
|
|
2602
|
+
|
|
2603
|
+
# Find the start of the content braces
|
|
2604
|
+
brace_start = pos + match.end() - 1 # -1 because we want the opening brace
|
|
2605
|
+
content, end_pos = extract_balanced_braces(text, brace_start)
|
|
2606
|
+
|
|
2607
|
+
if content is not None:
|
|
2608
|
+
matches.append(content)
|
|
2609
|
+
pos = end_pos
|
|
2610
|
+
if not return_all:
|
|
2611
|
+
break # Return first match only
|
|
2612
|
+
else:
|
|
2613
|
+
pos += match.end()
|
|
2614
|
+
|
|
2615
|
+
if return_all:
|
|
2616
|
+
return matches
|
|
2617
|
+
else:
|
|
2618
|
+
return matches[0] if matches else None
|
|
2619
|
+
|
|
2620
|
+
|
|
2515
2621
|
def extract_cited_keys_from_latex(tex_content):
|
|
2516
2622
|
r"""
|
|
2517
2623
|
Extract citation keys from LaTeX content by finding \cite{} commands.
|
|
@@ -2936,8 +3042,8 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
|
|
|
2936
3042
|
|
|
2937
3043
|
if brace_count == 0:
|
|
2938
3044
|
author_content = content[start_pos:pos-1]
|
|
2939
|
-
# Extract individual authors from \bibinfo{person}{Name} tags
|
|
2940
|
-
person_matches =
|
|
3045
|
+
# Extract individual authors from \bibinfo{person}{Name} tags using balanced brace extraction
|
|
3046
|
+
person_matches = extract_bibinfo_person_content(author_content)
|
|
2941
3047
|
if person_matches:
|
|
2942
3048
|
# Clean and format author names
|
|
2943
3049
|
authors = []
|
|
@@ -4594,7 +4700,7 @@ def normalize_venue_for_display(venue: str) -> str:
|
|
|
4594
4700
|
prefixes_to_remove = [
|
|
4595
4701
|
r'^\d{4}\s+\d+(st|nd|rd|th)\s+', # "2012 IEEE/RSJ"
|
|
4596
4702
|
r'^\d{4}\s+', # "2024 "
|
|
4597
|
-
r'^proceedings\s+(of\s+)?(the\s+)?(\d+(st|nd|rd|th)\s+)?
|
|
4703
|
+
r'^proceedings\s+(of\s+)?(the\s+)?((acm|ieee|usenix|aaai|sigcomm|sigkdd|sigmod|sigops|vldb|osdi|sosp|eurosys)\s+)*(\d+(st|nd|rd|th)\s+)?', # "Proceedings of the [ORG] [ORG] 29th"
|
|
4598
4704
|
r'^proc\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?', # "Proc. of the IEEE" (require "of")
|
|
4599
4705
|
r'^procs\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?', # "Procs. of the IEEE" (require "of")
|
|
4600
4706
|
r'^in\s+',
|
utils/url_utils.py
CHANGED
|
@@ -214,6 +214,7 @@ def clean_url(url: str) -> str:
|
|
|
214
214
|
This function handles:
|
|
215
215
|
- Whitespace trimming
|
|
216
216
|
- Malformed LaTeX URL wrappers like \\url{https://...}
|
|
217
|
+
- Markdown-style links like [text](url)
|
|
217
218
|
- Trailing punctuation from academic references
|
|
218
219
|
- DOI URL query parameter cleanup
|
|
219
220
|
|
|
@@ -237,6 +238,14 @@ def clean_url(url: str) -> str:
|
|
|
237
238
|
if url_match:
|
|
238
239
|
url = url_match.group(1)
|
|
239
240
|
|
|
241
|
+
# Handle markdown-style links like [text](url) or [url](url)
|
|
242
|
+
# e.g., "[https://example.com](https://example.com)" -> "https://example.com"
|
|
243
|
+
markdown_pattern = r'\[([^\]]*)\]\((https?://[^)]+)\)'
|
|
244
|
+
markdown_match = re.search(markdown_pattern, url)
|
|
245
|
+
if markdown_match:
|
|
246
|
+
# Use the URL from parentheses
|
|
247
|
+
url = markdown_match.group(2)
|
|
248
|
+
|
|
240
249
|
# Remove trailing punctuation that's commonly part of sentence structure
|
|
241
250
|
# but preserve legitimate URL characters
|
|
242
251
|
url = url.rstrip('.,;!?)')
|
|
@@ -280,6 +289,14 @@ def clean_url_punctuation(url: str) -> str:
|
|
|
280
289
|
if url_match:
|
|
281
290
|
url = url_match.group(1)
|
|
282
291
|
|
|
292
|
+
# Handle markdown-style links like [text](url) or [url](url)
|
|
293
|
+
# e.g., "[https://example.com](https://example.com)" -> "https://example.com"
|
|
294
|
+
markdown_pattern = r'\[([^\]]*)\]\((https?://[^)]+)\)'
|
|
295
|
+
markdown_match = re.search(markdown_pattern, url)
|
|
296
|
+
if markdown_match:
|
|
297
|
+
# Use the URL from parentheses
|
|
298
|
+
url = markdown_match.group(2)
|
|
299
|
+
|
|
283
300
|
# Remove trailing punctuation that's commonly part of sentence structure
|
|
284
301
|
# but preserve legitimate URL characters
|
|
285
302
|
url = url.rstrip('.,;!?)')
|
|
File without changes
|
{academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{academic_refchecker-1.2.43.dist-info → academic_refchecker-1.2.45.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|