academic-refchecker 1.2.38__py3-none-any.whl → 1.2.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __version__.py +1 -1
- {academic_refchecker-1.2.38.dist-info → academic_refchecker-1.2.39.dist-info}/METADATA +1 -1
- {academic_refchecker-1.2.38.dist-info → academic_refchecker-1.2.39.dist-info}/RECORD +9 -9
- utils/biblatex_parser.py +5 -3
- utils/text_utils.py +45 -11
- {academic_refchecker-1.2.38.dist-info → academic_refchecker-1.2.39.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.38.dist-info → academic_refchecker-1.2.39.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-1.2.38.dist-info → academic_refchecker-1.2.39.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-1.2.38.dist-info → academic_refchecker-1.2.39.dist-info}/top_level.txt +0 -0
__version__.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
__version__.py,sha256=
|
|
2
|
-
academic_refchecker-1.2.
|
|
1
|
+
__version__.py,sha256=63hU3Q1fGBiJ1GUnUQ-V6-S8pbWZ7bug_ZVu4V6eo9g,65
|
|
2
|
+
academic_refchecker-1.2.39.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
|
|
3
3
|
checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
|
|
4
4
|
checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
|
|
5
5
|
checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
|
|
@@ -28,7 +28,7 @@ services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,942
|
|
|
28
28
|
utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
|
|
29
29
|
utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
|
|
30
30
|
utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
|
|
31
|
-
utils/biblatex_parser.py,sha256=
|
|
31
|
+
utils/biblatex_parser.py,sha256=JiO_tznsemhmGFs-pDM2qGuDlvT1ArIyc6bmsdwDOPQ,20452
|
|
32
32
|
utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
|
|
33
33
|
utils/bibtex_parser.py,sha256=jsQ87lkzmBmJO3VEN3itw22CJ1Hesei4IvM2sfsaFKI,12867
|
|
34
34
|
utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
|
|
@@ -36,11 +36,11 @@ utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
|
|
|
36
36
|
utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
|
|
37
37
|
utils/error_utils.py,sha256=2qdRM3Bv4GvE3mlXgXp9jiQBfvB08qeg8vTgNVivcgk,5706
|
|
38
38
|
utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
|
|
39
|
-
utils/text_utils.py,sha256=
|
|
39
|
+
utils/text_utils.py,sha256=8luQsOBfcEBv3O16d3LlQmCuoEB0dEF0aQWGey-s3us,190502
|
|
40
40
|
utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
|
|
41
41
|
utils/url_utils.py,sha256=n0m5rXKV0-UrE8lI85VEx23KmfGwky57sI6gFPuu78I,7358
|
|
42
|
-
academic_refchecker-1.2.
|
|
43
|
-
academic_refchecker-1.2.
|
|
44
|
-
academic_refchecker-1.2.
|
|
45
|
-
academic_refchecker-1.2.
|
|
46
|
-
academic_refchecker-1.2.
|
|
42
|
+
academic_refchecker-1.2.39.dist-info/METADATA,sha256=Uz4a9D0tfull6uDAZTafQJOem7p8IqPA6bjl_pYUf48,22298
|
|
43
|
+
academic_refchecker-1.2.39.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
44
|
+
academic_refchecker-1.2.39.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
|
|
45
|
+
academic_refchecker-1.2.39.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
|
|
46
|
+
academic_refchecker-1.2.39.dist-info/RECORD,,
|
utils/biblatex_parser.py
CHANGED
|
@@ -261,11 +261,13 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
|
|
|
261
261
|
else:
|
|
262
262
|
# If no quoted title, look for title after author names
|
|
263
263
|
# Pattern: "FirstAuthor et al. Title Goes Here. Year." or "Author. Title. Year."
|
|
264
|
+
# Order matters: more specific patterns first
|
|
264
265
|
title_patterns = [
|
|
265
|
-
r'[A-Z][
|
|
266
|
-
r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})', # "Authors. Title. URL/arXiv/Year" (flexible spacing)
|
|
267
|
-
r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}', # "Name, Name. Title. Year"
|
|
266
|
+
r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})', # "Authors. Title. URL/arXiv/Year" (flexible spacing) - MOST SPECIFIC
|
|
268
267
|
r'\.([A-Z][A-Za-z\s]+(?:\?|!)?)\.?\s+\d{4}', # ".Title. Year" - for cases where authors end without space
|
|
268
|
+
r'[A-Z][a-z]+\.([A-Z][A-Za-z\s\-&]+?)\.\s+\d{4}', # "Name.Title. Year" - missing space after period
|
|
269
|
+
r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}', # "Author et al. Title. Year" - LESS SPECIFIC
|
|
270
|
+
r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}', # "Name, Name. Title. Year"
|
|
269
271
|
r'\b([A-Z][A-Za-z\s\-0-9]+)\s+\.\s+https', # "Title . https" - handle space before period
|
|
270
272
|
]
|
|
271
273
|
|
utils/text_utils.py
CHANGED
|
@@ -11,6 +11,31 @@ from typing import List
|
|
|
11
11
|
logger = logging.getLogger(__name__)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
+
def normalize_apostrophes(text):
|
|
15
|
+
"""
|
|
16
|
+
Normalize all apostrophe variants to standard ASCII apostrophe
|
|
17
|
+
"""
|
|
18
|
+
if not text:
|
|
19
|
+
return text
|
|
20
|
+
|
|
21
|
+
# All known apostrophe variants
|
|
22
|
+
apostrophe_variants = [
|
|
23
|
+
"'", # U+0027 ASCII apostrophe
|
|
24
|
+
"’", # U+2019 Right single quotation mark (most common)
|
|
25
|
+
"‘", # U+2018 Left single quotation mark
|
|
26
|
+
"ʼ", # U+02BC Modifier letter apostrophe
|
|
27
|
+
"ˈ", # U+02C8 Modifier letter vertical line (primary stress)
|
|
28
|
+
"`", # U+0060 Grave accent (sometimes used as apostrophe)
|
|
29
|
+
"´", # U+00B4 Acute accent (sometimes used as apostrophe)
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
# Replace all variants with standard ASCII apostrophe
|
|
33
|
+
for variant in apostrophe_variants:
|
|
34
|
+
text = text.replace(variant, "'")
|
|
35
|
+
|
|
36
|
+
return text
|
|
37
|
+
|
|
38
|
+
|
|
14
39
|
def normalize_text(text):
|
|
15
40
|
"""
|
|
16
41
|
Normalize text by removing diacritical marks and special characters
|
|
@@ -18,6 +43,9 @@ def normalize_text(text):
|
|
|
18
43
|
if not text:
|
|
19
44
|
return ""
|
|
20
45
|
|
|
46
|
+
# First normalize apostrophes to standard form
|
|
47
|
+
text = normalize_apostrophes(text)
|
|
48
|
+
|
|
21
49
|
# Replace common special characters with their ASCII equivalents
|
|
22
50
|
replacements = {
|
|
23
51
|
'ä': 'a', 'ö': 'o', 'ü': 'u', 'ß': 'ss',
|
|
@@ -29,7 +57,7 @@ def normalize_text(text):
|
|
|
29
57
|
'Ł': 'L', 'ł': 'l',
|
|
30
58
|
'¨': '', '´': '', '`': '', '^': '', '~': '',
|
|
31
59
|
'–': '-', '—': '-', '−': '-',
|
|
32
|
-
'„': '"', '"': '"', '"': '"',
|
|
60
|
+
'„': '"', '“': '"', '”': '"',
|
|
33
61
|
'«': '"', '»': '"',
|
|
34
62
|
'¡': '!', '¿': '?',
|
|
35
63
|
'°': 'degrees', '©': '(c)', '®': '(r)', '™': '(tm)',
|
|
@@ -39,10 +67,6 @@ def normalize_text(text):
|
|
|
39
67
|
'\u00A0': ' ', # Non-breaking space
|
|
40
68
|
'\u2013': '-', # En dash
|
|
41
69
|
'\u2014': '-', # Em dash
|
|
42
|
-
'\u2018': "'", # Left single quotation mark
|
|
43
|
-
'\u2019': "'", # Right single quotation mark
|
|
44
|
-
'\u201C': '"', # Left double quotation mark
|
|
45
|
-
'\u201D': '"', # Right double quotation mark
|
|
46
70
|
'\u2026': '...', # Horizontal ellipsis
|
|
47
71
|
'\u00B7': '.', # Middle dot
|
|
48
72
|
'\u2022': '.', # Bullet
|
|
@@ -54,8 +78,8 @@ def normalize_text(text):
|
|
|
54
78
|
# Remove any remaining diacritical marks
|
|
55
79
|
text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
|
|
56
80
|
|
|
57
|
-
# Remove special characters
|
|
58
|
-
text = re.sub(r
|
|
81
|
+
# Remove special characters except apostrophes
|
|
82
|
+
text = re.sub(r"[^\w\s']", '', text)
|
|
59
83
|
|
|
60
84
|
# Normalize whitespace
|
|
61
85
|
text = re.sub(r'\s+', ' ', text).strip()
|
|
@@ -368,6 +392,9 @@ def clean_author_name(author):
|
|
|
368
392
|
# Normalize Unicode characters (e.g., combining diacritics)
|
|
369
393
|
author = unicodedata.normalize('NFKC', author)
|
|
370
394
|
|
|
395
|
+
# Normalize apostrophes first before other processing
|
|
396
|
+
author = normalize_apostrophes(author)
|
|
397
|
+
|
|
371
398
|
# Handle common Unicode escape sequences and LaTeX encodings
|
|
372
399
|
# Note: Order matters - process longer patterns first
|
|
373
400
|
unicode_replacements = [
|
|
@@ -703,8 +730,12 @@ def normalize_diacritics(text: str) -> str:
|
|
|
703
730
|
'José' -> 'jose'
|
|
704
731
|
'Łukasz' -> 'lukasz'
|
|
705
732
|
'J. Gl¨ uck' -> 'J. Gluck'
|
|
733
|
+
'D’Amato' -> 'D'Amato' (apostrophes normalized)
|
|
706
734
|
"""
|
|
707
|
-
# First
|
|
735
|
+
# First normalize apostrophes
|
|
736
|
+
text = normalize_apostrophes(text)
|
|
737
|
+
|
|
738
|
+
# Then handle special characters that don't decompose properly
|
|
708
739
|
# Including common transliterations
|
|
709
740
|
special_chars = {
|
|
710
741
|
'ł': 'l', 'Ł': 'L',
|
|
@@ -2224,7 +2255,8 @@ def format_author_for_display(author_name):
|
|
|
2224
2255
|
if not author_name:
|
|
2225
2256
|
return author_name
|
|
2226
2257
|
|
|
2227
|
-
|
|
2258
|
+
# Normalize apostrophes for consistent display
|
|
2259
|
+
author_name = normalize_apostrophes(author_name.strip())
|
|
2228
2260
|
|
|
2229
2261
|
# Check if it's in "Lastname, Firstname" format
|
|
2230
2262
|
if ',' in author_name:
|
|
@@ -3743,6 +3775,7 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
|
|
|
3743
3775
|
for abbrev in sorted(multi_word_abbrevs.keys(), key=len, reverse=True):
|
|
3744
3776
|
if abbrev in expanded_text:
|
|
3745
3777
|
expanded_text = expanded_text.replace(abbrev, multi_word_abbrevs[abbrev])
|
|
3778
|
+
break # Only apply the first (longest) matching abbreviation to avoid conflicts
|
|
3746
3779
|
|
|
3747
3780
|
# Second pass: handle single word abbreviations
|
|
3748
3781
|
words = expanded_text.split()
|
|
@@ -4137,8 +4170,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
|
|
|
4137
4170
|
return False
|
|
4138
4171
|
|
|
4139
4172
|
# Order-aware fuzzy matching - words should match in sequence
|
|
4140
|
-
|
|
4141
|
-
|
|
4173
|
+
# Sort to ensure deterministic order (set iteration is not guaranteed to be consistent)
|
|
4174
|
+
words1_list = sorted(list(words1))
|
|
4175
|
+
words2_list = sorted(list(words2))
|
|
4142
4176
|
|
|
4143
4177
|
# If word counts are very different, they're likely different venues
|
|
4144
4178
|
if len(words1) > 0 and len(words2) > 0:
|
|
File without changes
|
{academic_refchecker-1.2.38.dist-info → academic_refchecker-1.2.39.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{academic_refchecker-1.2.38.dist-info → academic_refchecker-1.2.39.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|