academic-refchecker 2.0.21__py3-none-any.whl → 2.0.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 2.0.21
3
+ Version: 2.0.22
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,4 +1,4 @@
1
- academic_refchecker-2.0.21.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
1
+ academic_refchecker-2.0.22.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
2
2
  backend/__init__.py,sha256=TFVkOx5tSp3abty15RzUbaSwQ9ZD0kfUn7PDh63xkYY,521
3
3
  backend/__main__.py,sha256=74V7yUMsRSZaaRyXYm-rZVc3TVUcUgwsoTQTUbV5EqM,211
4
4
  backend/cli.py,sha256=xV3l9M5OdNQQYOcrzj2d_7RmCgj7CXP_1oi0TPe6zNo,1672
@@ -19,7 +19,7 @@ backend/static/assets/index-DMZJNrR0.js,sha256=UhK5CQ8IufZmx6FTvXUCtkRxTqpGK7czS
19
19
  backend/static/assets/index-hk21nqxR.js,sha256=z2agP8ZFYw4AfYi-GJ5E_8_k-lPF-frXOJtPk-I0hDs,369533
20
20
  refchecker/__init__.py,sha256=Pg5MrtLxDBRcNYcI02N-bv3tzURVd1S3nQ8IyF7Zw7E,322
21
21
  refchecker/__main__.py,sha256=agBbT9iKN0g2xXtRNCoh29Nr7z2n5vU-r0MCVJKi4tI,232
22
- refchecker/__version__.py,sha256=XyBLo7S1kdSvGrt9yRtdTYCZwfWFfzPLcvw8uZes7kM,66
22
+ refchecker/__version__.py,sha256=X_6kKif5oE_F5mSm6cC1kZmg0Jh44PqY0vD3imYvOKY,66
23
23
  refchecker/checkers/__init__.py,sha256=-dR7HX0bfPq9YMXrnODoYbfNWFLqu706xoVsUdWHYRI,611
24
24
  refchecker/checkers/arxiv_citation.py,sha256=j_waQmQSP3iuZdVuBE92ghtiOdGFTCx09s6f4mHik6o,27777
25
25
  refchecker/checkers/crossref.py,sha256=88moAyTudBqf9SKqTQkNAq1yyuRe95f8r4EpmJznupQ,20937
@@ -59,11 +59,11 @@ refchecker/utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,
59
59
  refchecker/utils/doi_utils.py,sha256=_7YvQ0DTOQBMIujUE0SdJicjPiAR3VETLU668GIji24,6094
60
60
  refchecker/utils/error_utils.py,sha256=8TcfRUD6phZ7viPJrezQ4jKf_vE65lqEXZq5707eU6s,15425
61
61
  refchecker/utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
62
- refchecker/utils/text_utils.py,sha256=4sT6YKYqINLGCrRwKOkzrZ2t2cJorHgAXT5Gd3_hKCM,235856
62
+ refchecker/utils/text_utils.py,sha256=n6Ng1rWtQXNnE5G8MJSge0VrregKD6vcOMMqMTpiTew,237214
63
63
  refchecker/utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
64
64
  refchecker/utils/url_utils.py,sha256=7b0rWCQJSajzqOvD7ghsBZPejiq6mUIz6SGhvU_WGDs,9441
65
- academic_refchecker-2.0.21.dist-info/METADATA,sha256=D2y73gwXBF9-kvb2OHeE0bMNpcxx-Y1VksqkkUcrpyI,12443
66
- academic_refchecker-2.0.21.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
67
- academic_refchecker-2.0.21.dist-info/entry_points.txt,sha256=9cREsaKwlp05Ql0CBIjKrNHk5IG2cHY5LvJPsV2-SxA,108
68
- academic_refchecker-2.0.21.dist-info/top_level.txt,sha256=FfNvrvpj25gfpUBjW0epvz7Qrdejhups5Za_DBiSRu4,19
69
- academic_refchecker-2.0.21.dist-info/RECORD,,
65
+ academic_refchecker-2.0.22.dist-info/METADATA,sha256=ob1p0ikneyJv3z1_nLTk_ckZYdIa8YMBu7tpX8a7NqU,12443
66
+ academic_refchecker-2.0.22.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
67
+ academic_refchecker-2.0.22.dist-info/entry_points.txt,sha256=9cREsaKwlp05Ql0CBIjKrNHk5IG2cHY5LvJPsV2-SxA,108
68
+ academic_refchecker-2.0.22.dist-info/top_level.txt,sha256=FfNvrvpj25gfpUBjW0epvz7Qrdejhups5Za_DBiSRu4,19
69
+ academic_refchecker-2.0.22.dist-info/RECORD,,
refchecker/__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "2.0.21"
3
+ __version__ = "2.0.22"
@@ -6,7 +6,6 @@ Text processing utilities for ArXiv Reference Checker
6
6
  import re
7
7
  import logging
8
8
  import unicodedata
9
- import html
10
9
  from typing import List
11
10
 
12
11
  logger = logging.getLogger(__name__)
@@ -1373,6 +1372,15 @@ def is_name_match(name1: str, name2: str) -> bool:
1373
1372
  first_initial == first_name[0] and
1374
1373
  middle_initial == middle_name[0]):
1375
1374
  return True
1375
+ else:
1376
+ # Simple last name case: "W. R. Weimer" vs "Westley Weimer"
1377
+ # The cited name has an extra middle initial that the actual name doesn't have
1378
+ # Allow match if first initial and last name match (tolerate extra middle initial)
1379
+ # BUT: Exclude cases where first_name is just concatenated initials (like "gv")
1380
+ # which should require exact initial matching, not tolerance
1381
+ is_real_first_name = len(first_name) > 2 # "Westley" yes, "gv" no
1382
+ if is_real_first_name and last_name == compound_last and first_initial == first_name[0]:
1383
+ return True
1376
1384
 
1377
1385
  elif len(init_parts) == 3 and len(name_parts) == 3:
1378
1386
  # Check for "Last, First Middle" vs "First Middle Last" format
@@ -4290,6 +4298,7 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
4290
4298
  # Handle specific multi-word patterns and well-known acronyms
4291
4299
  'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
4292
4300
  'pnas': 'proceedings of the national academy of sciences',
4301
+ 'cacm': 'communications of the acm',
4293
4302
  # Special cases that don't follow standard acronym patterns
4294
4303
  'neurips': 'neural information processing systems', # Special case
4295
4304
  'nips': 'neural information processing systems', # old name for neurips
@@ -4426,6 +4435,8 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
4426
4435
  'neurips': 'neural information processing systems', # Special case: doesn't follow standard acronym rules
4427
4436
  'nips': 'neural information processing systems', # old name for neurips
4428
4437
  'nsdi': 'networked systems design and implementation', # USENIX NSDI
4438
+ 'cacm': 'communications of the acm',
4439
+ 'communications of the': 'communications of the acm',
4429
4440
  }
4430
4441
 
4431
4442
  # Apply abbreviation expansion - handle multi-word phrases first
@@ -5089,8 +5100,18 @@ def normalize_venue_for_display(venue: str) -> str:
5089
5100
 
5090
5101
  return text_lower
5091
5102
 
5092
- # Decode any HTML entities (e.g., "&amp;" -> "&") before further cleaning
5093
- venue_text = html.unescape(venue).strip()
5103
+ venue_text = venue.strip()
5104
+
5105
+ # Fix common truncated venues that lose their organization suffix during PDF extraction
5106
+ truncated_aliases = {
5107
+ "communications of the": "Communications of the ACM",
5108
+ }
5109
+
5110
+ # Allow trailing punctuation/whitespace while matching truncated forms
5111
+ normalized_candidate = re.sub(r"[\s.,;:]+$", "", venue_text, flags=re.IGNORECASE)
5112
+ alias = truncated_aliases.get(normalized_candidate.lower())
5113
+ if alias:
5114
+ return alias
5094
5115
 
5095
5116
  # Strip leading editor name lists like "..., editors, Venue ..." or "..., eds., Venue ..."
5096
5117
  # This prevents author/editor lists from being treated as venue
@@ -5152,7 +5173,8 @@ def normalize_venue_for_display(venue: str) -> str:
5152
5173
  if not re.match(r'ieee\s+transactions', venue_text, re.IGNORECASE):
5153
5174
  venue_text = re.sub(r'^(ieee|acm|aaai|usenix|sigcomm|sigkdd|sigmod|vldb|osdi|sosp|eurosys)\s+', '', venue_text, flags=re.IGNORECASE) # Remove org prefixes
5154
5175
  venue_text = re.sub(r'^ieee/\w+\s+', '', venue_text, flags=re.IGNORECASE) # Remove "IEEE/RSJ " etc
5155
- venue_text = re.sub(r'\s+(ieee|acm|aaai|usenix)\s*$', '', venue_text, flags=re.IGNORECASE) # Remove org suffixes
5176
+ # Remove org suffixes, but NOT when preceded by "of the" (e.g., "Communications of the ACM", "Journal of the ACM")
5177
+ venue_text = re.sub(r'(?<!of the)\s+(ieee|acm|aaai|usenix)\s*$', '', venue_text, flags=re.IGNORECASE) # Remove org suffixes
5156
5178
  venue_text = re.sub(r'/\w+\s+', ' ', venue_text) # Remove "/ACM " style org separators
5157
5179
 
5158
5180
  # IMPORTANT: Don't remove "Conference on" or "International" - they're needed for display