PyPI - academic-refchecker - Versions diffs - 1.2.38__py3-none-any.whl → 1.2.39__py3-none-any.whl - Mend

academic-refchecker 1.2.38__py3-none-any.whl → 1.2.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

__version__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 """Version information for RefChecker."""
-__version__ = "1.2.38"
+__version__ = "1.2.39"

{academic_refchecker-1.2.38.dist-info → academic_refchecker-1.2.39.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.38
+Version: 1.2.39
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT

{academic_refchecker-1.2.38.dist-info → academic_refchecker-1.2.39.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
-__version__.py,sha256=9ez-UBx1mkgUvDMk-z63_XpqOh2QnPCeTrDEuricP1w,65
-academic_refchecker-1.2.38.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
+__version__.py,sha256=63hU3Q1fGBiJ1GUnUQ-V6-S8pbWZ7bug_ZVu4V6eo9g,65
+academic_refchecker-1.2.39.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
 checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
 checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
 checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
@@ -28,7 +28,7 @@ services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,942
 utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
 utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
 utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
-utils/biblatex_parser.py,sha256=Vznt-BfNtQQb4XQ6iPab2CgFcV2JIjva1OU33NzQ51g,20253
+utils/biblatex_parser.py,sha256=JiO_tznsemhmGFs-pDM2qGuDlvT1ArIyc6bmsdwDOPQ,20452
 utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
 utils/bibtex_parser.py,sha256=jsQ87lkzmBmJO3VEN3itw22CJ1Hesei4IvM2sfsaFKI,12867
 utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
@@ -36,11 +36,11 @@ utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
 utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
 utils/error_utils.py,sha256=2qdRM3Bv4GvE3mlXgXp9jiQBfvB08qeg8vTgNVivcgk,5706
 utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
-utils/text_utils.py,sha256=KjNx_UJvVhz-oowu4CCdryEuN0hYLu4X8yVkjdYP8fM,189261
+utils/text_utils.py,sha256=8luQsOBfcEBv3O16d3LlQmCuoEB0dEF0aQWGey-s3us,190502
 utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
 utils/url_utils.py,sha256=n0m5rXKV0-UrE8lI85VEx23KmfGwky57sI6gFPuu78I,7358
-academic_refchecker-1.2.38.dist-info/METADATA,sha256=7V0yEKZy9zao6s3_TBHPOg7Gi86h4lG2m_rhyhStq5w,22298
-academic_refchecker-1.2.38.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-academic_refchecker-1.2.38.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
-academic_refchecker-1.2.38.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
-academic_refchecker-1.2.38.dist-info/RECORD,,
+academic_refchecker-1.2.39.dist-info/METADATA,sha256=Uz4a9D0tfull6uDAZTafQJOem7p8IqPA6bjl_pYUf48,22298
+academic_refchecker-1.2.39.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+academic_refchecker-1.2.39.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+academic_refchecker-1.2.39.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+academic_refchecker-1.2.39.dist-info/RECORD,,

utils/biblatex_parser.py CHANGED Viewed

@@ -261,11 +261,13 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
     else:
         # If no quoted title, look for title after author names
         # Pattern: "FirstAuthor et al. Title Goes Here. Year." or "Author. Title. Year."
+        # Order matters: more specific patterns first
         title_patterns = [
-            r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}',  # "Author et al. Title. Year"
-            r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})',  # "Authors. Title. URL/arXiv/Year" (flexible spacing)
-            r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}',  # "Name, Name. Title. Year"
+            r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})',  # "Authors. Title. URL/arXiv/Year" (flexible spacing) - MOST SPECIFIC
             r'\.([A-Z][A-Za-z\s]+(?:\?|!)?)\.?\s+\d{4}',  # ".Title. Year" - for cases where authors end without space
+            r'[A-Z][a-z]+\.([A-Z][A-Za-z\s\-&]+?)\.\s+\d{4}',  # "Name.Title. Year" - missing space after period
+            r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}',  # "Author et al. Title. Year" - LESS SPECIFIC
+            r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}',  # "Name, Name. Title. Year"
             r'\b([A-Z][A-Za-z\s\-0-9]+)\s+\.\s+https',  # "Title . https" - handle space before period
         ]

utils/text_utils.py CHANGED Viewed

@@ -11,6 +11,31 @@ from typing import List
 logger = logging.getLogger(__name__)
+def normalize_apostrophes(text):
+    """
+    Normalize all apostrophe variants to standard ASCII apostrophe
+    """
+    if not text:
+        return text
+    # All known apostrophe variants
+    apostrophe_variants = [
+        "'",      # U+0027 ASCII apostrophe
+        "’",      # U+2019 Right single quotation mark (most common)
+        "‘",      # U+2018 Left single quotation mark
+        "ʼ",      # U+02BC Modifier letter apostrophe
+        "ˈ",      # U+02C8 Modifier letter vertical line (primary stress)
+        "`",      # U+0060 Grave accent (sometimes used as apostrophe)
+        "´",      # U+00B4 Acute accent (sometimes used as apostrophe)
+    ]
+    # Replace all variants with standard ASCII apostrophe
+    for variant in apostrophe_variants:
+        text = text.replace(variant, "'")
+    return text
 def normalize_text(text):
     """
     Normalize text by removing diacritical marks and special characters
@@ -18,6 +43,9 @@ def normalize_text(text):
     if not text:
         return ""
+    # First normalize apostrophes to standard form
+    text = normalize_apostrophes(text)
     # Replace common special characters with their ASCII equivalents
     replacements = {
         'ä': 'a', 'ö': 'o', 'ü': 'u', 'ß': 'ss',
@@ -29,7 +57,7 @@ def normalize_text(text):
         'Ł': 'L', 'ł': 'l',
         '¨': '', '´': '', '`': '', '^': '', '~': '',
         '–': '-', '—': '-', '−': '-',
-        '„': '"', '“': '"', '”': '"', '‘': "'", '’': "'",
+        '„': '"', '“': '"', '”': '"',
         '«': '"', '»': '"',
         '¡': '!', '¿': '?',
         '°': 'degrees', '©': '(c)', '®': '(r)', '™': '(tm)',
@@ -39,10 +67,6 @@ def normalize_text(text):
         '\u00A0': ' ',  # Non-breaking space
         '\u2013': '-',  # En dash
         '\u2014': '-',  # Em dash
-        '\u2018': "'",  # Left single quotation mark
-        '\u2019': "'",  # Right single quotation mark
-        '\u201C': '"',  # Left double quotation mark
-        '\u201D': '"',  # Right double quotation mark
         '\u2026': '...',  # Horizontal ellipsis
         '\u00B7': '.',  # Middle dot
         '\u2022': '.',  # Bullet
@@ -54,8 +78,8 @@ def normalize_text(text):
     # Remove any remaining diacritical marks
     text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
-    # Remove special characters
-    text = re.sub(r'[^\w\s]', '', text)
+    # Remove special characters except apostrophes
+    text = re.sub(r"[^\w\s']", '', text)
     # Normalize whitespace
     text = re.sub(r'\s+', ' ', text).strip()
@@ -368,6 +392,9 @@ def clean_author_name(author):
     # Normalize Unicode characters (e.g., combining diacritics)
     author = unicodedata.normalize('NFKC', author)
+    # Normalize apostrophes first before other processing
+    author = normalize_apostrophes(author)
     # Handle common Unicode escape sequences and LaTeX encodings
     # Note: Order matters - process longer patterns first
     unicode_replacements = [
@@ -703,8 +730,12 @@ def normalize_diacritics(text: str) -> str:
         'José' -> 'jose'
         'Łukasz' -> 'lukasz'
         'J. Gl¨ uck' -> 'J. Gluck'
+        'D’Amato' -> 'D'Amato' (apostrophes normalized)
     """
-    # First handle special characters that don't decompose properly
+    # First normalize apostrophes
+    text = normalize_apostrophes(text)
+    # Then handle special characters that don't decompose properly
     # Including common transliterations
     special_chars = {
         'ł': 'l', 'Ł': 'L',
@@ -2224,7 +2255,8 @@ def format_author_for_display(author_name):
     if not author_name:
         return author_name
-    author_name = author_name.strip()
+    # Normalize apostrophes for consistent display
+    author_name = normalize_apostrophes(author_name.strip())
     # Check if it's in "Lastname, Firstname" format
     if ',' in author_name:
@@ -3743,6 +3775,7 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
         for abbrev in sorted(multi_word_abbrevs.keys(), key=len, reverse=True):
             if abbrev in expanded_text:
                 expanded_text = expanded_text.replace(abbrev, multi_word_abbrevs[abbrev])
+                break  # Only apply the first (longest) matching abbreviation to avoid conflicts
         # Second pass: handle single word abbreviations
         words = expanded_text.split()
@@ -4137,8 +4170,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
         return False
     # Order-aware fuzzy matching - words should match in sequence
-    words1_list = list(words1)
-    words2_list = list(words2)
+    # Sort to ensure deterministic order (set iteration is not guaranteed to be consistent)
+    words1_list = sorted(list(words1))
+    words2_list = sorted(list(words2))
     # If word counts are very different, they're likely different venues
     if len(words1) > 0 and len(words2) > 0:

{academic_refchecker-1.2.38.dist-info → academic_refchecker-1.2.39.dist-info}/WHEEL RENAMED Viewed

File without changes

{academic_refchecker-1.2.38.dist-info → academic_refchecker-1.2.39.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{academic_refchecker-1.2.38.dist-info → academic_refchecker-1.2.39.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{academic_refchecker-1.2.38.dist-info → academic_refchecker-1.2.39.dist-info}/top_level.txt RENAMED Viewed

File without changes

academic-refchecker 1.2.38__py3-none-any.whl → 1.2.39__py3-none-any.whl

academic-refchecker 1.2.38__py3-none-any.whl → 1.2.39__py3-none-any.whl