academic-refchecker 1.2.40__py3-none-any.whl → 1.2.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __version__.py +1 -1
- {academic_refchecker-1.2.40.dist-info → academic_refchecker-1.2.42.dist-info}/METADATA +1 -1
- {academic_refchecker-1.2.40.dist-info → academic_refchecker-1.2.42.dist-info}/RECORD +13 -13
- checkers/openreview_checker.py +2 -1
- checkers/semantic_scholar.py +2 -5
- core/parallel_processor.py +5 -1
- core/refchecker.py +4 -3
- utils/error_utils.py +19 -1
- utils/text_utils.py +226 -10
- {academic_refchecker-1.2.40.dist-info → academic_refchecker-1.2.42.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.40.dist-info → academic_refchecker-1.2.42.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-1.2.40.dist-info → academic_refchecker-1.2.42.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-1.2.40.dist-info → academic_refchecker-1.2.42.dist-info}/top_level.txt +0 -0
__version__.py
CHANGED
|
@@ -1,21 +1,21 @@
|
|
|
1
|
-
__version__.py,sha256=
|
|
2
|
-
academic_refchecker-1.2.
|
|
1
|
+
__version__.py,sha256=jrP5O1rb9OpfyEnz9IJjKo7ZhdOr-9_yzLGwvjDTLWA,65
|
|
2
|
+
academic_refchecker-1.2.42.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
|
|
3
3
|
checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
|
|
4
4
|
checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
|
|
5
5
|
checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
|
|
6
6
|
checkers/github_checker.py,sha256=54K6_YJW5w2GtzodnSOLfK5d1ErFJxbTOIIV5P_kFX0,13543
|
|
7
7
|
checkers/local_semantic_scholar.py,sha256=DgGMjmR_w_saz2UTMatEhfXbhUED9tUmDG3vlZAIzc4,20428
|
|
8
8
|
checkers/openalex.py,sha256=GxYUH9GZ0AyF-WFKgXiFHqkalrSnySgFSkiM1PsK0VI,19757
|
|
9
|
-
checkers/openreview_checker.py,sha256=
|
|
10
|
-
checkers/semantic_scholar.py,sha256=
|
|
9
|
+
checkers/openreview_checker.py,sha256=FLh21F0Zr7Gj3BI0u-gE6IwGNOZiRcViirDBeNvUp94,20432
|
|
10
|
+
checkers/semantic_scholar.py,sha256=BelhyIJ-W8navRdqEGpk12CIXYWmVL2Cq8HHZR7ynJs,34905
|
|
11
11
|
checkers/webpage_checker.py,sha256=BvNwOqukTX9IeQUpUfIrI_5Gr2w9VLBt5x_PB-hKUIo,21616
|
|
12
12
|
config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
|
|
13
13
|
config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
|
|
14
14
|
config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
|
|
15
15
|
core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
|
|
16
16
|
core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
|
|
17
|
-
core/parallel_processor.py,sha256=
|
|
18
|
-
core/refchecker.py,sha256=
|
|
17
|
+
core/parallel_processor.py,sha256=5V2iJDBFwwryMCnCNU_oRt2u5he1wpy-_9qapC_6f00,17043
|
|
18
|
+
core/refchecker.py,sha256=ElXgD1iPI-rDDFZmCPMZpkIP4UeX3nPAJVCfsVPNgcw,274640
|
|
19
19
|
database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
|
|
20
20
|
database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
|
|
21
21
|
llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -34,13 +34,13 @@ utils/bibtex_parser.py,sha256=jsQ87lkzmBmJO3VEN3itw22CJ1Hesei4IvM2sfsaFKI,12867
|
|
|
34
34
|
utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
|
|
35
35
|
utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
|
|
36
36
|
utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
|
|
37
|
-
utils/error_utils.py,sha256=
|
|
37
|
+
utils/error_utils.py,sha256=JqnRg4z-O9GcJ1eJGeTMzmOQwPWbWo2Lf6Duwj-ymHQ,6258
|
|
38
38
|
utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
|
|
39
|
-
utils/text_utils.py,sha256=
|
|
39
|
+
utils/text_utils.py,sha256=F5o-37KUkkr-ie4sg6ld5om3-uDpAxPUSjDFxY0fsL4,203063
|
|
40
40
|
utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
|
|
41
41
|
utils/url_utils.py,sha256=n0m5rXKV0-UrE8lI85VEx23KmfGwky57sI6gFPuu78I,7358
|
|
42
|
-
academic_refchecker-1.2.
|
|
43
|
-
academic_refchecker-1.2.
|
|
44
|
-
academic_refchecker-1.2.
|
|
45
|
-
academic_refchecker-1.2.
|
|
46
|
-
academic_refchecker-1.2.
|
|
42
|
+
academic_refchecker-1.2.42.dist-info/METADATA,sha256=k7fzk4fhb-kz-CdJE-gaeU2I5xM16D1rNNeEuer_9Hk,22298
|
|
43
|
+
academic_refchecker-1.2.42.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
44
|
+
academic_refchecker-1.2.42.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
|
|
45
|
+
academic_refchecker-1.2.42.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
|
|
46
|
+
academic_refchecker-1.2.42.dist-info/RECORD,,
|
checkers/openreview_checker.py
CHANGED
|
@@ -473,9 +473,10 @@ class OpenReviewReferenceChecker:
|
|
|
473
473
|
|
|
474
474
|
if cited_venue and paper_venue:
|
|
475
475
|
if are_venues_substantially_different(cited_venue, paper_venue):
|
|
476
|
+
from utils.error_utils import clean_venue_for_comparison
|
|
476
477
|
errors.append({
|
|
477
478
|
"warning_type": "venue",
|
|
478
|
-
"warning_details": f"Venue mismatch: cited as '{cited_venue}' but OpenReview shows '{paper_venue}'"
|
|
479
|
+
"warning_details": f"Venue mismatch: cited as '{clean_venue_for_comparison(cited_venue)}' but OpenReview shows '{clean_venue_for_comparison(paper_venue)}'"
|
|
479
480
|
})
|
|
480
481
|
|
|
481
482
|
# Create verified data structure
|
checkers/semantic_scholar.py
CHANGED
|
@@ -544,11 +544,8 @@ class NonArxivReferenceChecker:
|
|
|
544
544
|
if cited_venue and paper_venue:
|
|
545
545
|
# Use the utility function to check if venues are substantially different
|
|
546
546
|
if are_venues_substantially_different(cited_venue, paper_venue):
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
'warning_details': f"Venue mismatch: cited as '{cited_venue}' but actually '{paper_venue}'",
|
|
550
|
-
'ref_venue_correct': paper_venue
|
|
551
|
-
})
|
|
547
|
+
from utils.error_utils import create_venue_warning
|
|
548
|
+
errors.append(create_venue_warning(cited_venue, paper_venue))
|
|
552
549
|
elif not cited_venue and paper_venue:
|
|
553
550
|
# Check if this is an arXiv paper first
|
|
554
551
|
external_ids = paper_data.get('externalIds', {})
|
core/parallel_processor.py
CHANGED
|
@@ -279,7 +279,11 @@ class ParallelReferenceProcessor:
|
|
|
279
279
|
from utils.text_utils import format_authors_for_display
|
|
280
280
|
authors = format_authors_for_display(reference.get('authors', []))
|
|
281
281
|
year = reference.get('year', '')
|
|
282
|
-
venue
|
|
282
|
+
# Get venue from either 'venue' or 'journal' field and clean it up
|
|
283
|
+
venue = reference.get('venue', '') or reference.get('journal', '')
|
|
284
|
+
if venue:
|
|
285
|
+
from utils.error_utils import clean_venue_for_comparison
|
|
286
|
+
venue = clean_venue_for_comparison(venue)
|
|
283
287
|
url = reference.get('url', '')
|
|
284
288
|
doi = reference.get('doi', '')
|
|
285
289
|
|
core/refchecker.py
CHANGED
|
@@ -3383,7 +3383,7 @@ class ArxivReferenceChecker:
|
|
|
3383
3383
|
# Check if this is biblatex format
|
|
3384
3384
|
from utils.biblatex_parser import detect_biblatex_format
|
|
3385
3385
|
if detect_biblatex_format(bibliography_text):
|
|
3386
|
-
logger.
|
|
3386
|
+
logger.debug("Detected biblatex format")
|
|
3387
3387
|
self.used_regex_extraction = True
|
|
3388
3388
|
# Note: biblatex parsing is also robust, so we don't set used_unreliable_extraction
|
|
3389
3389
|
biblatex_refs = self._parse_biblatex_references(bibliography_text)
|
|
@@ -3391,7 +3391,7 @@ class ArxivReferenceChecker:
|
|
|
3391
3391
|
# If biblatex parsing returned empty results (due to quality validation),
|
|
3392
3392
|
# fallback to LLM if available
|
|
3393
3393
|
if not biblatex_refs and self.llm_extractor:
|
|
3394
|
-
logger.debug("Biblatex
|
|
3394
|
+
logger.debug("Biblatex is incompatible with parser")
|
|
3395
3395
|
try:
|
|
3396
3396
|
references = self.llm_extractor.extract_references(bibliography_text)
|
|
3397
3397
|
if references:
|
|
@@ -3403,7 +3403,7 @@ class ArxivReferenceChecker:
|
|
|
3403
3403
|
except Exception as e:
|
|
3404
3404
|
logger.error(f"LLM fallback failed: {e}")
|
|
3405
3405
|
return []
|
|
3406
|
-
|
|
3406
|
+
logger.debug("Using biblatex file")
|
|
3407
3407
|
return biblatex_refs
|
|
3408
3408
|
|
|
3409
3409
|
# For non-standard formats, try LLM-based extraction if available
|
|
@@ -3634,6 +3634,7 @@ class ArxivReferenceChecker:
|
|
|
3634
3634
|
# we'll continue with the unreliable fallback regex parsing
|
|
3635
3635
|
if not biblatex_refs:
|
|
3636
3636
|
logger.debug("Biblatex parser returned no results due to quality validation, falling back to regex parsing")
|
|
3637
|
+
print(f"⚠️ Biblatex parser found no valid references (failed quality validation) - falling back to regex parsing")
|
|
3637
3638
|
else:
|
|
3638
3639
|
return biblatex_refs
|
|
3639
3640
|
|
utils/error_utils.py
CHANGED
|
@@ -89,6 +89,20 @@ def create_title_error(error_details: str, correct_title: str) -> Dict[str, str]
|
|
|
89
89
|
}
|
|
90
90
|
|
|
91
91
|
|
|
92
|
+
def clean_venue_for_comparison(venue: str) -> str:
|
|
93
|
+
"""
|
|
94
|
+
Clean venue name for display in warnings using the shared normalization logic.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
venue: Raw venue string
|
|
98
|
+
|
|
99
|
+
Returns:
|
|
100
|
+
Cleaned venue name suitable for display
|
|
101
|
+
"""
|
|
102
|
+
from utils.text_utils import normalize_venue_for_display
|
|
103
|
+
return normalize_venue_for_display(venue)
|
|
104
|
+
|
|
105
|
+
|
|
92
106
|
def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]:
|
|
93
107
|
"""
|
|
94
108
|
Create a standardized venue warning dictionary.
|
|
@@ -100,9 +114,13 @@ def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]
|
|
|
100
114
|
Returns:
|
|
101
115
|
Standardized warning dictionary
|
|
102
116
|
"""
|
|
117
|
+
# Clean both venues for display in the warning
|
|
118
|
+
clean_cited = clean_venue_for_comparison(cited_venue)
|
|
119
|
+
clean_correct = clean_venue_for_comparison(correct_venue)
|
|
120
|
+
|
|
103
121
|
return {
|
|
104
122
|
'warning_type': 'venue',
|
|
105
|
-
'warning_details': f"Venue mismatch: cited as '{
|
|
123
|
+
'warning_details': f"Venue mismatch: cited as '{clean_cited}' but actually '{clean_correct}'",
|
|
106
124
|
'ref_venue_correct': correct_venue
|
|
107
125
|
}
|
|
108
126
|
|
utils/text_utils.py
CHANGED
|
@@ -2255,8 +2255,13 @@ def format_author_for_display(author_name):
|
|
|
2255
2255
|
if not author_name:
|
|
2256
2256
|
return author_name
|
|
2257
2257
|
|
|
2258
|
+
# Clean up any stray punctuation that might have been attached during parsing
|
|
2259
|
+
author_name = author_name.strip()
|
|
2260
|
+
# Remove trailing semicolons that sometimes get attached during bibliographic parsing
|
|
2261
|
+
author_name = re.sub(r'[;,]\s*$', '', author_name)
|
|
2262
|
+
|
|
2258
2263
|
# Normalize apostrophes for consistent display
|
|
2259
|
-
author_name = normalize_apostrophes(author_name
|
|
2264
|
+
author_name = normalize_apostrophes(author_name)
|
|
2260
2265
|
|
|
2261
2266
|
# Check if it's in "Lastname, Firstname" format
|
|
2262
2267
|
if ',' in author_name:
|
|
@@ -3006,7 +3011,9 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
|
|
|
3006
3011
|
if ref['year']:
|
|
3007
3012
|
venue_clean = re.sub(rf'\b{ref["year"]}\b.*', '', venue_clean)
|
|
3008
3013
|
venue_clean = venue_clean.rstrip(',. ')
|
|
3009
|
-
|
|
3014
|
+
# Filter out common non-venue patterns that shouldn't be treated as venues
|
|
3015
|
+
non_venue_patterns = ['URL', 'url', 'http:', 'https:', 'DOI', 'doi:', 'ArXiv', 'arxiv:']
|
|
3016
|
+
if venue_clean and not any(pattern in venue_clean for pattern in non_venue_patterns):
|
|
3010
3017
|
ref['journal'] = venue_clean
|
|
3011
3018
|
|
|
3012
3019
|
# Extract URL if present
|
|
@@ -3665,8 +3672,77 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
|
|
|
3665
3672
|
return bool(venue1 != venue2)
|
|
3666
3673
|
|
|
3667
3674
|
# Clean LaTeX commands from both venues first
|
|
3668
|
-
|
|
3669
|
-
|
|
3675
|
+
venue1_latex_cleaned = strip_latex_commands(venue1)
|
|
3676
|
+
venue2_latex_cleaned = strip_latex_commands(venue2)
|
|
3677
|
+
|
|
3678
|
+
# For comparison, we need lowercase normalized versions
|
|
3679
|
+
def normalize_for_comparison(venue_text):
|
|
3680
|
+
# Get the cleaned display version first
|
|
3681
|
+
cleaned = normalize_venue_for_display(venue_text)
|
|
3682
|
+
# Then normalize for comparison: lowercase, expand abbreviations, remove punctuation
|
|
3683
|
+
venue_lower = cleaned.lower()
|
|
3684
|
+
|
|
3685
|
+
# Handle LaTeX penalty commands before abbreviation expansion
|
|
3686
|
+
venue_lower = re.sub(r'\\penalty\d+\s*', ' ', venue_lower) # Remove \\penalty0 etc
|
|
3687
|
+
venue_lower = re.sub(r'\s+', ' ', venue_lower).strip() # Clean up extra spaces
|
|
3688
|
+
|
|
3689
|
+
# Expand abbreviations for comparison
|
|
3690
|
+
def expand_abbreviations(text):
|
|
3691
|
+
common_abbrevs = {
|
|
3692
|
+
# IEEE specific abbreviations (only expand with periods, not full words)
|
|
3693
|
+
'robot.': 'robotics', 'autom.': 'automation', 'lett.': 'letters',
|
|
3694
|
+
'trans.': 'transactions', 'syst.': 'systems', 'netw.': 'networks',
|
|
3695
|
+
'learn.': 'learning', 'ind.': 'industrial', 'electron.': 'electronics',
|
|
3696
|
+
'mechatron.': 'mechatronics', 'intell.': 'intelligence',
|
|
3697
|
+
'transp.': 'transportation', 'contr.': 'control', 'mag.': 'magazine',
|
|
3698
|
+
# General academic abbreviations (only expand with periods)
|
|
3699
|
+
'int.': 'international', 'intl.': 'international', 'conf.': 'conference',
|
|
3700
|
+
'j.': 'journal', 'proc.': 'proceedings', 'assoc.': 'association',
|
|
3701
|
+
'comput.': 'computing', 'sci.': 'science', 'eng.': 'engineering',
|
|
3702
|
+
'tech.': 'technology', 'artif.': 'artificial', 'mach.': 'machine',
|
|
3703
|
+
'stat.': 'statistics', 'math.': 'mathematics', 'phys.': 'physics',
|
|
3704
|
+
'chem.': 'chemistry', 'bio.': 'biology', 'med.': 'medicine',
|
|
3705
|
+
'adv.': 'advances', 'ann.': 'annual', 'symp.': 'symposium',
|
|
3706
|
+
'workshop': 'workshop', 'worksh.': 'workshop',
|
|
3707
|
+
'natl.': 'national', 'acad.': 'academy', 'rev.': 'review',
|
|
3708
|
+
# Physics journal abbreviations
|
|
3709
|
+
'phys.': 'physics', 'phys. rev.': 'physical review',
|
|
3710
|
+
'phys. rev. lett.': 'physical review letters',
|
|
3711
|
+
'phys. rev. a': 'physical review a', 'phys. rev. b': 'physical review b',
|
|
3712
|
+
'phys. rev. c': 'physical review c', 'phys. rev. d': 'physical review d',
|
|
3713
|
+
'phys. rev. e': 'physical review e', 'phys. lett.': 'physics letters',
|
|
3714
|
+
'phys. lett. b': 'physics letters b', 'nucl. phys.': 'nuclear physics',
|
|
3715
|
+
'nucl. phys. a': 'nuclear physics a', 'nucl. phys. b': 'nuclear physics b',
|
|
3716
|
+
'j. phys.': 'journal of physics', 'ann. phys.': 'annals of physics',
|
|
3717
|
+
'mod. phys. lett.': 'modern physics letters', 'eur. phys. j.': 'european physical journal',
|
|
3718
|
+
# Nature journals
|
|
3719
|
+
'nature phys.': 'nature physics', 'sci. adv.': 'science advances',
|
|
3720
|
+
# Handle specific multi-word patterns and well-known acronyms
|
|
3721
|
+
'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
|
|
3722
|
+
'pnas': 'proceedings of the national academy of sciences',
|
|
3723
|
+
}
|
|
3724
|
+
# Sort by length (longest first) to ensure longer matches take precedence
|
|
3725
|
+
for abbrev, expansion in sorted(common_abbrevs.items(), key=lambda x: len(x[0]), reverse=True):
|
|
3726
|
+
# For abbreviations ending in period, use word boundary at start only
|
|
3727
|
+
if abbrev.endswith('.'):
|
|
3728
|
+
pattern = r'\b' + re.escape(abbrev)
|
|
3729
|
+
else:
|
|
3730
|
+
pattern = r'\b' + re.escape(abbrev) + r'\b'
|
|
3731
|
+
text = re.sub(pattern, expansion, text)
|
|
3732
|
+
return text
|
|
3733
|
+
|
|
3734
|
+
venue_lower = expand_abbreviations(venue_lower)
|
|
3735
|
+
|
|
3736
|
+
# Remove punctuation and normalize spacing for comparison
|
|
3737
|
+
venue_lower = re.sub(r'[.,;:]', '', venue_lower) # Remove punctuation
|
|
3738
|
+
venue_lower = re.sub(r'\\s+on\\s+', ' ', venue_lower) # Remove \"on\" preposition
|
|
3739
|
+
venue_lower = re.sub(r'\\s+for\\s+', ' ', venue_lower) # Remove \"for\" preposition
|
|
3740
|
+
venue_lower = re.sub(r'\\s+', ' ', venue_lower).strip() # Normalize whitespace
|
|
3741
|
+
|
|
3742
|
+
return venue_lower
|
|
3743
|
+
|
|
3744
|
+
normalized_venue1 = normalize_for_comparison(venue1_latex_cleaned)
|
|
3745
|
+
normalized_venue2 = normalize_for_comparison(venue2_latex_cleaned)
|
|
3670
3746
|
|
|
3671
3747
|
def expand_abbreviations(text):
|
|
3672
3748
|
"""Generic abbreviation expansion using common academic patterns"""
|
|
@@ -3983,8 +4059,8 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
|
|
|
3983
4059
|
if not acronym or not full_text:
|
|
3984
4060
|
return False
|
|
3985
4061
|
|
|
3986
|
-
#
|
|
3987
|
-
normalized_full =
|
|
4062
|
+
# Use the internal comparison normalization function
|
|
4063
|
+
normalized_full = normalize_for_comparison(full_text)
|
|
3988
4064
|
|
|
3989
4065
|
# Generate all possible acronyms from the full text
|
|
3990
4066
|
possible_acronyms = []
|
|
@@ -4098,9 +4174,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
|
|
|
4098
4174
|
if (arxiv1 == 'arxiv' and arxiv2.startswith('https://arxiv.org')) or (arxiv2 == 'arxiv' and arxiv1.startswith('https://arxiv.org')):
|
|
4099
4175
|
return False
|
|
4100
4176
|
|
|
4101
|
-
#
|
|
4102
|
-
norm1 =
|
|
4103
|
-
norm2 =
|
|
4177
|
+
# Use normalized venues from shared function
|
|
4178
|
+
norm1 = normalized_venue1
|
|
4179
|
+
norm2 = normalized_venue2
|
|
4104
4180
|
|
|
4105
4181
|
# Direct match after normalization (highest priority)
|
|
4106
4182
|
if norm1 == norm2:
|
|
@@ -4354,4 +4430,144 @@ def is_year_substantially_different(cited_year: int, correct_year: int, context:
|
|
|
4354
4430
|
|
|
4355
4431
|
# Any year difference should be flagged as a warning for manual review
|
|
4356
4432
|
warning_msg = f"Year mismatch: cited as {cited_year} but actually {correct_year}"
|
|
4357
|
-
return True, warning_msg
|
|
4433
|
+
return True, warning_msg
|
|
4434
|
+
|
|
4435
|
+
|
|
4436
|
+
def normalize_venue_for_display(venue: str) -> str:
|
|
4437
|
+
"""
|
|
4438
|
+
Normalize venue names for consistent display and comparison.
|
|
4439
|
+
|
|
4440
|
+
This function is used both for display in warnings and for venue comparison
|
|
4441
|
+
to ensure consistent normalization across the system.
|
|
4442
|
+
|
|
4443
|
+
Args:
|
|
4444
|
+
venue: Raw venue string
|
|
4445
|
+
|
|
4446
|
+
Returns:
|
|
4447
|
+
Normalized venue string with prefixes removed and abbreviations expanded
|
|
4448
|
+
"""
|
|
4449
|
+
if not venue:
|
|
4450
|
+
return ""
|
|
4451
|
+
|
|
4452
|
+
def expand_abbreviations(text):
|
|
4453
|
+
"""Generic abbreviation expansion using common academic patterns"""
|
|
4454
|
+
# Common academic abbreviations mapping
|
|
4455
|
+
common_abbrevs = {
|
|
4456
|
+
# IEEE specific abbreviations (only expand with periods, not full words)
|
|
4457
|
+
'robot.': 'robotics',
|
|
4458
|
+
'autom.': 'automation',
|
|
4459
|
+
'lett.': 'letters',
|
|
4460
|
+
'trans.': 'transactions',
|
|
4461
|
+
'syst.': 'systems',
|
|
4462
|
+
'netw.': 'networks',
|
|
4463
|
+
'learn.': 'learning',
|
|
4464
|
+
'ind.': 'industrial',
|
|
4465
|
+
'electron.': 'electronics',
|
|
4466
|
+
'mechatron.': 'mechatronics',
|
|
4467
|
+
'intell.': 'intelligence',
|
|
4468
|
+
'transp.': 'transportation',
|
|
4469
|
+
'contr.': 'control',
|
|
4470
|
+
'mag.': 'magazine',
|
|
4471
|
+
|
|
4472
|
+
# General academic abbreviations (only expand with periods)
|
|
4473
|
+
'int.': 'international',
|
|
4474
|
+
'intl.': 'international',
|
|
4475
|
+
'conf.': 'conference',
|
|
4476
|
+
'j.': 'journal',
|
|
4477
|
+
'proc.': 'proceedings',
|
|
4478
|
+
'assoc.': 'association',
|
|
4479
|
+
'comput.': 'computing',
|
|
4480
|
+
'sci.': 'science',
|
|
4481
|
+
'eng.': 'engineering',
|
|
4482
|
+
'tech.': 'technology',
|
|
4483
|
+
'artif.': 'artificial',
|
|
4484
|
+
'mach.': 'machine',
|
|
4485
|
+
'stat.': 'statistics',
|
|
4486
|
+
'math.': 'mathematics',
|
|
4487
|
+
'phys.': 'physics',
|
|
4488
|
+
'chem.': 'chemistry',
|
|
4489
|
+
'bio.': 'biology',
|
|
4490
|
+
'med.': 'medicine',
|
|
4491
|
+
'adv.': 'advances',
|
|
4492
|
+
'ann.': 'annual',
|
|
4493
|
+
'symp.': 'symposium',
|
|
4494
|
+
'workshop': 'workshop',
|
|
4495
|
+
'worksh.': 'workshop',
|
|
4496
|
+
}
|
|
4497
|
+
|
|
4498
|
+
text_lower = text.lower()
|
|
4499
|
+
for abbrev, expansion in common_abbrevs.items():
|
|
4500
|
+
# Only replace if it's a word boundary to avoid partial replacements
|
|
4501
|
+
pattern = r'\b' + re.escape(abbrev) + r'\b'
|
|
4502
|
+
text_lower = re.sub(pattern, expansion, text_lower)
|
|
4503
|
+
|
|
4504
|
+
return text_lower
|
|
4505
|
+
|
|
4506
|
+
venue_text = venue.strip()
|
|
4507
|
+
|
|
4508
|
+
# Extract venue from complex editor strings (e.g. "In Smith, J.; and Doe, K., eds., Conference Name, volume 1")
|
|
4509
|
+
# This handles patterns like "In [authors], eds., [venue], [optional metadata]" (case-insensitive)
|
|
4510
|
+
editor_match = re.search(r'in\s+[^,]+(?:,\s*[^,]*)*,\s*eds?\.,\s*(.+?)(?:,\s*volume\s*\d+|,\s*pp?\.|$)', venue_text, re.IGNORECASE)
|
|
4511
|
+
if editor_match:
|
|
4512
|
+
# Extract the venue part from editor string (preserve original case)
|
|
4513
|
+
venue_text = editor_match.group(1).strip()
|
|
4514
|
+
# Clean up any remaining metadata like "volume X of Proceedings..." (case-insensitive)
|
|
4515
|
+
venue_text = re.sub(r',\s*volume\s+\d+.*$', '', venue_text, flags=re.IGNORECASE)
|
|
4516
|
+
venue_text = re.sub(r'\s+of\s+proceedings.*$', '', venue_text, flags=re.IGNORECASE)
|
|
4517
|
+
|
|
4518
|
+
# Remove years, volumes, pages, and other citation metadata
|
|
4519
|
+
# But preserve arXiv IDs (don't remove digits after arXiv:)
|
|
4520
|
+
if not re.match(r'arxiv:', venue_text, re.IGNORECASE):
|
|
4521
|
+
venue_text = re.sub(r',?\s*\d{4}[a-z]?\s*$', '', venue_text) # Years like "2024" or "2024b"
|
|
4522
|
+
venue_text = re.sub(r',?\s*\(\d{4}\)$', '', venue_text) # Years in parentheses
|
|
4523
|
+
venue_text = re.sub(r"'\d{2}$", '', venue_text) # Year suffixes like 'CVPR'16'
|
|
4524
|
+
venue_text = re.sub(r',?\s*(vol\.?\s*|volume\s*)\d+.*$', '', venue_text, flags=re.IGNORECASE) # Volume info
|
|
4525
|
+
venue_text = re.sub(r',?\s*\d+\s*\([^)]*\).*$', '', venue_text) # Issue info with optional spaces
|
|
4526
|
+
venue_text = re.sub(r',?\s*pp?\.\s*\d+.*$', '', venue_text, flags=re.IGNORECASE) # Page info
|
|
4527
|
+
venue_text = re.sub(r'\s*\(print\).*$', '', venue_text, flags=re.IGNORECASE) # Print designation
|
|
4528
|
+
venue_text = re.sub(r'\s*\(\d{4}\.\s*print\).*$', '', venue_text, flags=re.IGNORECASE) # Year.Print
|
|
4529
|
+
|
|
4530
|
+
# Remove procedural prefixes (case-insensitive)
|
|
4531
|
+
prefixes_to_remove = [
|
|
4532
|
+
r'^\d{4}\s+\d+(st|nd|rd|th)\s+', # "2012 IEEE/RSJ"
|
|
4533
|
+
r'^\d{4}\s+', # "2024 "
|
|
4534
|
+
r'^proceedings\s+(of\s+)?(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?', # "Proceedings of the IEEE"
|
|
4535
|
+
r'^proc\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?', # "Proc. of the IEEE" (require "of")
|
|
4536
|
+
r'^procs\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?', # "Procs. of the IEEE" (require "of")
|
|
4537
|
+
r'^in\s+',
|
|
4538
|
+
r'^advances\s+in\s+', # "Advances in Neural Information Processing Systems"
|
|
4539
|
+
r'^adv\.\s+', # "Adv. Neural Information Processing Systems"
|
|
4540
|
+
# Handle ordinal prefixes: "The Twelfth", "The Ninth", etc.
|
|
4541
|
+
r'^the\s+(first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|twenty-first|twenty-second|twenty-third|twenty-fourth|twenty-fifth|twenty-sixth|twenty-seventh|twenty-eighth|twenty-ninth|thirtieth|thirty-first|thirty-second|thirty-third|thirty-fourth|thirty-fifth|thirty-sixth|thirty-seventh|thirty-eighth|thirty-ninth|fortieth|forty-first|forty-second|forty-third|forty-fourth|forty-fifth|forty-sixth|forty-seventh|forty-eighth|forty-ninth|fiftieth)\s+',
|
|
4542
|
+
# Handle numeric ordinals: "The 41st", "The 12th", etc.
|
|
4543
|
+
r'^the\s+\d+(st|nd|rd|th)\s+',
|
|
4544
|
+
# Handle standalone "The" prefix
|
|
4545
|
+
r'^the\s+',
|
|
4546
|
+
]
|
|
4547
|
+
|
|
4548
|
+
for prefix_pattern in prefixes_to_remove:
|
|
4549
|
+
venue_text = re.sub(prefix_pattern, '', venue_text, flags=re.IGNORECASE)
|
|
4550
|
+
|
|
4551
|
+
# Note: For display purposes, we preserve case and don't expand abbreviations
|
|
4552
|
+
# Only do minimal cleaning needed for proper display
|
|
4553
|
+
|
|
4554
|
+
# Remove organization prefixes/suffixes that don't affect identity (case-insensitive)
|
|
4555
|
+
# But preserve IEEE when it's part of a journal name like \"IEEE Transactions\"
|
|
4556
|
+
if not re.match(r'ieee\s+transactions', venue_text, re.IGNORECASE):
|
|
4557
|
+
venue_text = re.sub(r'^(ieee|acm|aaai|usenix|sigcomm|sigkdd|sigmod|vldb|osdi|sosp|eurosys)\s+', '', venue_text, flags=re.IGNORECASE) # Remove org prefixes
|
|
4558
|
+
venue_text = re.sub(r'^ieee/\w+\s+', '', venue_text, flags=re.IGNORECASE) # Remove "IEEE/RSJ " etc
|
|
4559
|
+
venue_text = re.sub(r'\s+(ieee|acm|aaai|usenix)\s*$', '', venue_text, flags=re.IGNORECASE) # Remove org suffixes
|
|
4560
|
+
venue_text = re.sub(r'/\w+\s+', ' ', venue_text) # Remove "/ACM " style org separators
|
|
4561
|
+
|
|
4562
|
+
# IMPORTANT: Don't remove "Conference on" or "International" - they're needed for display
|
|
4563
|
+
# Only remove specific org-prefixed conference patterns where the org is clear
|
|
4564
|
+
venue_text = re.sub(r'^(ieee|acm|aaai|nips)(/\w+)?\s+conference\s+on\s+', '', venue_text, flags=re.IGNORECASE)
|
|
4565
|
+
|
|
4566
|
+
# Note: Don't remove "Conference on" as it's often part of the actual venue name
|
|
4567
|
+
# Only remove it if it's clearly a procedural prefix (handled in prefixes_to_remove above)
|
|
4568
|
+
|
|
4569
|
+
# Clean up spacing (preserve punctuation and case for display)
|
|
4570
|
+
venue_text = re.sub(r'\s+', ' ', venue_text) # Normalize whitespace
|
|
4571
|
+
venue_text = venue_text.strip()
|
|
4572
|
+
|
|
4573
|
+
return venue_text
|
|
File without changes
|
{academic_refchecker-1.2.40.dist-info → academic_refchecker-1.2.42.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{academic_refchecker-1.2.40.dist-info → academic_refchecker-1.2.42.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|