academic-refchecker 1.2.38__py3-none-any.whl → 1.2.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.38"
3
+ __version__ = "1.2.39"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.38
3
+ Version: 1.2.39
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,5 +1,5 @@
1
- __version__.py,sha256=9ez-UBx1mkgUvDMk-z63_XpqOh2QnPCeTrDEuricP1w,65
2
- academic_refchecker-1.2.38.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
1
+ __version__.py,sha256=63hU3Q1fGBiJ1GUnUQ-V6-S8pbWZ7bug_ZVu4V6eo9g,65
2
+ academic_refchecker-1.2.39.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
3
3
  checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
4
4
  checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
5
5
  checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
@@ -28,7 +28,7 @@ services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,942
28
28
  utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
29
29
  utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
30
30
  utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
31
- utils/biblatex_parser.py,sha256=Vznt-BfNtQQb4XQ6iPab2CgFcV2JIjva1OU33NzQ51g,20253
31
+ utils/biblatex_parser.py,sha256=JiO_tznsemhmGFs-pDM2qGuDlvT1ArIyc6bmsdwDOPQ,20452
32
32
  utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
33
33
  utils/bibtex_parser.py,sha256=jsQ87lkzmBmJO3VEN3itw22CJ1Hesei4IvM2sfsaFKI,12867
34
34
  utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
@@ -36,11 +36,11 @@ utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
36
36
  utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
37
37
  utils/error_utils.py,sha256=2qdRM3Bv4GvE3mlXgXp9jiQBfvB08qeg8vTgNVivcgk,5706
38
38
  utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
39
- utils/text_utils.py,sha256=KjNx_UJvVhz-oowu4CCdryEuN0hYLu4X8yVkjdYP8fM,189261
39
+ utils/text_utils.py,sha256=8luQsOBfcEBv3O16d3LlQmCuoEB0dEF0aQWGey-s3us,190502
40
40
  utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
41
41
  utils/url_utils.py,sha256=n0m5rXKV0-UrE8lI85VEx23KmfGwky57sI6gFPuu78I,7358
42
- academic_refchecker-1.2.38.dist-info/METADATA,sha256=7V0yEKZy9zao6s3_TBHPOg7Gi86h4lG2m_rhyhStq5w,22298
43
- academic_refchecker-1.2.38.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
- academic_refchecker-1.2.38.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
45
- academic_refchecker-1.2.38.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
46
- academic_refchecker-1.2.38.dist-info/RECORD,,
42
+ academic_refchecker-1.2.39.dist-info/METADATA,sha256=Uz4a9D0tfull6uDAZTafQJOem7p8IqPA6bjl_pYUf48,22298
43
+ academic_refchecker-1.2.39.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
+ academic_refchecker-1.2.39.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
45
+ academic_refchecker-1.2.39.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
46
+ academic_refchecker-1.2.39.dist-info/RECORD,,
utils/biblatex_parser.py CHANGED
@@ -261,11 +261,13 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
261
261
  else:
262
262
  # If no quoted title, look for title after author names
263
263
  # Pattern: "FirstAuthor et al. Title Goes Here. Year." or "Author. Title. Year."
264
+ # Order matters: more specific patterns first
264
265
  title_patterns = [
265
- r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}', # "Author et al. Title. Year"
266
- r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})', # "Authors. Title. URL/arXiv/Year" (flexible spacing)
267
- r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}', # "Name, Name. Title. Year"
266
+ r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})', # "Authors. Title. URL/arXiv/Year" (flexible spacing) - MOST SPECIFIC
268
267
  r'\.([A-Z][A-Za-z\s]+(?:\?|!)?)\.?\s+\d{4}', # ".Title. Year" - for cases where authors end without space
268
+ r'[A-Z][a-z]+\.([A-Z][A-Za-z\s\-&]+?)\.\s+\d{4}', # "Name.Title. Year" - missing space after period
269
+ r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}', # "Author et al. Title. Year" - LESS SPECIFIC
270
+ r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}', # "Name, Name. Title. Year"
269
271
  r'\b([A-Z][A-Za-z\s\-0-9]+)\s+\.\s+https', # "Title . https" - handle space before period
270
272
  ]
271
273
 
utils/text_utils.py CHANGED
@@ -11,6 +11,31 @@ from typing import List
11
11
  logger = logging.getLogger(__name__)
12
12
 
13
13
 
14
+ def normalize_apostrophes(text):
15
+ """
16
+ Normalize all apostrophe variants to standard ASCII apostrophe
17
+ """
18
+ if not text:
19
+ return text
20
+
21
+ # All known apostrophe variants
22
+ apostrophe_variants = [
23
+ "'", # U+0027 ASCII apostrophe
24
+ "’", # U+2019 Right single quotation mark (most common)
25
+ "‘", # U+2018 Left single quotation mark
26
+ "ʼ", # U+02BC Modifier letter apostrophe
27
+ "ˈ", # U+02C8 Modifier letter vertical line (primary stress)
28
+ "`", # U+0060 Grave accent (sometimes used as apostrophe)
29
+ "´", # U+00B4 Acute accent (sometimes used as apostrophe)
30
+ ]
31
+
32
+ # Replace all variants with standard ASCII apostrophe
33
+ for variant in apostrophe_variants:
34
+ text = text.replace(variant, "'")
35
+
36
+ return text
37
+
38
+
14
39
  def normalize_text(text):
15
40
  """
16
41
  Normalize text by removing diacritical marks and special characters
@@ -18,6 +43,9 @@ def normalize_text(text):
18
43
  if not text:
19
44
  return ""
20
45
 
46
+ # First normalize apostrophes to standard form
47
+ text = normalize_apostrophes(text)
48
+
21
49
  # Replace common special characters with their ASCII equivalents
22
50
  replacements = {
23
51
  'ä': 'a', 'ö': 'o', 'ü': 'u', 'ß': 'ss',
@@ -29,7 +57,7 @@ def normalize_text(text):
29
57
  'Ł': 'L', 'ł': 'l',
30
58
  '¨': '', '´': '', '`': '', '^': '', '~': '',
31
59
  '–': '-', '—': '-', '−': '-',
32
- '„': '"', '“': '"', '”': '"', '‘': "'", '’': "'",
60
+ '„': '"', '“': '"', '”': '"',
33
61
  '«': '"', '»': '"',
34
62
  '¡': '!', '¿': '?',
35
63
  '°': 'degrees', '©': '(c)', '®': '(r)', '™': '(tm)',
@@ -39,10 +67,6 @@ def normalize_text(text):
39
67
  '\u00A0': ' ', # Non-breaking space
40
68
  '\u2013': '-', # En dash
41
69
  '\u2014': '-', # Em dash
42
- '\u2018': "'", # Left single quotation mark
43
- '\u2019': "'", # Right single quotation mark
44
- '\u201C': '"', # Left double quotation mark
45
- '\u201D': '"', # Right double quotation mark
46
70
  '\u2026': '...', # Horizontal ellipsis
47
71
  '\u00B7': '.', # Middle dot
48
72
  '\u2022': '.', # Bullet
@@ -54,8 +78,8 @@ def normalize_text(text):
54
78
  # Remove any remaining diacritical marks
55
79
  text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
56
80
 
57
- # Remove special characters
58
- text = re.sub(r'[^\w\s]', '', text)
81
+ # Remove special characters except apostrophes
82
+ text = re.sub(r"[^\w\s']", '', text)
59
83
 
60
84
  # Normalize whitespace
61
85
  text = re.sub(r'\s+', ' ', text).strip()
@@ -368,6 +392,9 @@ def clean_author_name(author):
368
392
  # Normalize Unicode characters (e.g., combining diacritics)
369
393
  author = unicodedata.normalize('NFKC', author)
370
394
 
395
+ # Normalize apostrophes first before other processing
396
+ author = normalize_apostrophes(author)
397
+
371
398
  # Handle common Unicode escape sequences and LaTeX encodings
372
399
  # Note: Order matters - process longer patterns first
373
400
  unicode_replacements = [
@@ -703,8 +730,12 @@ def normalize_diacritics(text: str) -> str:
703
730
  'José' -> 'jose'
704
731
  'Łukasz' -> 'lukasz'
705
732
  'J. Gl¨ uck' -> 'J. Gluck'
733
+ 'D’Amato' -> 'D'Amato' (apostrophes normalized)
706
734
  """
707
- # First handle special characters that don't decompose properly
735
+ # First normalize apostrophes
736
+ text = normalize_apostrophes(text)
737
+
738
+ # Then handle special characters that don't decompose properly
708
739
  # Including common transliterations
709
740
  special_chars = {
710
741
  'ł': 'l', 'Ł': 'L',
@@ -2224,7 +2255,8 @@ def format_author_for_display(author_name):
2224
2255
  if not author_name:
2225
2256
  return author_name
2226
2257
 
2227
- author_name = author_name.strip()
2258
+ # Normalize apostrophes for consistent display
2259
+ author_name = normalize_apostrophes(author_name.strip())
2228
2260
 
2229
2261
  # Check if it's in "Lastname, Firstname" format
2230
2262
  if ',' in author_name:
@@ -3743,6 +3775,7 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
3743
3775
  for abbrev in sorted(multi_word_abbrevs.keys(), key=len, reverse=True):
3744
3776
  if abbrev in expanded_text:
3745
3777
  expanded_text = expanded_text.replace(abbrev, multi_word_abbrevs[abbrev])
3778
+ break # Only apply the first (longest) matching abbreviation to avoid conflicts
3746
3779
 
3747
3780
  # Second pass: handle single word abbreviations
3748
3781
  words = expanded_text.split()
@@ -4137,8 +4170,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
4137
4170
  return False
4138
4171
 
4139
4172
  # Order-aware fuzzy matching - words should match in sequence
4140
- words1_list = list(words1)
4141
- words2_list = list(words2)
4173
+ # Sort to ensure deterministic order (set iteration is not guaranteed to be consistent)
4174
+ words1_list = sorted(list(words1))
4175
+ words2_list = sorted(list(words2))
4142
4176
 
4143
4177
  # If word counts are very different, they're likely different venues
4144
4178
  if len(words1) > 0 and len(words2) > 0: