academic-refchecker 1.2.42-py3-none-any.whl → 1.2.43-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__version__.py CHANGED
@@ -1,3 +1,3 @@
  """Version information for RefChecker."""
 
- __version__ = "1.2.42"
+ __version__ = "1.2.43"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: academic-refchecker
- Version: 1.2.42
+ Version: 1.2.43
  Summary: A comprehensive tool for validating reference accuracy in academic papers
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
  License-Expression: MIT
@@ -1,5 +1,5 @@
- __version__.py,sha256=jrP5O1rb9OpfyEnz9IJjKo7ZhdOr-9_yzLGwvjDTLWA,65
- academic_refchecker-1.2.42.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
+ __version__.py,sha256=JbybFux4Juuafz1jN0cgsedPmzBO8U9DJ874tJu2saA,65
+ academic_refchecker-1.2.43.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
  checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
  checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
  checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
@@ -15,7 +15,7 @@ config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
  core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
  core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
  core/parallel_processor.py,sha256=5V2iJDBFwwryMCnCNU_oRt2u5he1wpy-_9qapC_6f00,17043
- core/refchecker.py,sha256=ElXgD1iPI-rDDFZmCPMZpkIP4UeX3nPAJVCfsVPNgcw,274640
+ core/refchecker.py,sha256=sVRg3PUzrs2vLFlEBoi4bxUy-TpO5iQHCkokGas-ygQ,273616
  database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
  database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
  llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -30,17 +30,17 @@ utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
  utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
  utils/biblatex_parser.py,sha256=OkHXQcjiBrEDuhBfEk0RtmAYxufu5lAxAjb8__DzMjI,25537
  utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
- utils/bibtex_parser.py,sha256=jsQ87lkzmBmJO3VEN3itw22CJ1Hesei4IvM2sfsaFKI,12867
+ utils/bibtex_parser.py,sha256=a89NLy_q2kwED4QFJgxWFgPQOJBV73bIUL3RS_Urmro,15231
  utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
  utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
  utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
  utils/error_utils.py,sha256=JqnRg4z-O9GcJ1eJGeTMzmOQwPWbWo2Lf6Duwj-ymHQ,6258
  utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
- utils/text_utils.py,sha256=F5o-37KUkkr-ie4sg6ld5om3-uDpAxPUSjDFxY0fsL4,203063
+ utils/text_utils.py,sha256=jPgCOBTVboLRJyypoOtL-dg1wBDQrKBux2ImvC6wL58,206296
  utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
- utils/url_utils.py,sha256=n0m5rXKV0-UrE8lI85VEx23KmfGwky57sI6gFPuu78I,7358
- academic_refchecker-1.2.42.dist-info/METADATA,sha256=k7fzk4fhb-kz-CdJE-gaeU2I5xM16D1rNNeEuer_9Hk,22298
- academic_refchecker-1.2.42.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- academic_refchecker-1.2.42.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
- academic_refchecker-1.2.42.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
- academic_refchecker-1.2.42.dist-info/RECORD,,
+ utils/url_utils.py,sha256=aq1hSYEA888bOKuBOGWRclgTFIjw32rpFdsBO_Ja8ZM,8402
+ academic_refchecker-1.2.43.dist-info/METADATA,sha256=ZsJhIw1n7Yjoug6mpV4zpAPf-eSW5xSMdd3Dl_WTOlI,22298
+ academic_refchecker-1.2.43.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ academic_refchecker-1.2.43.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+ academic_refchecker-1.2.43.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+ academic_refchecker-1.2.43.dist-info/RECORD,,
core/refchecker.py CHANGED
@@ -5407,25 +5407,6 @@ class ArxivReferenceChecker:
  if error_details:
  subreason = self._categorize_unverified_reason(error_details)
  print(f" Subreason: {subreason}")
-
- year_str = self._format_year_string(reference.get('year'))
-
- # Apply LaTeX cleaning and formatting to authors for display
- authors = reference.get('authors', [])
- if authors:
- from utils.text_utils import strip_latex_commands, format_authors_for_display
- cleaned_authors = [strip_latex_commands(author) for author in authors]
- authors_display = format_authors_for_display(cleaned_authors)
- else:
- authors_display = 'Unknown authors'
-
- # Only show URL if it exists and is different from reference_url
- ref_url = reference.get('url', '').strip()
- if ref_url and ref_url != reference_url:
- # Clean trailing punctuation from URL display
- from utils.url_utils import clean_url_punctuation
- clean_ref_url = clean_url_punctuation(ref_url)
- print(f" URL: {clean_ref_url}")
 
  def _categorize_unverified_reason(self, error_details):
  """Categorize the unverified error into checker error or not found"""
utils/bibtex_parser.py CHANGED
@@ -103,37 +103,99 @@ def parse_bibtex_entry_content(entry_type: str, entry_key: str, content: str) ->
  Returns:
  Dictionary with parsed entry data
  """
- # Extract fields using regex
  fields = {}
 
- # Pattern to match field = {value} or field = "value"
- # Handle nested braces properly
- field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
-
- for match in re.finditer(field_pattern, content, re.DOTALL):
- field_name = match.group(1).lower()
- field_value = match.group(2) or match.group(3) or ""
- # Strip outer quotes if present (handles cases like title = {"Some Title"})
- field_value = field_value.strip()
- if field_value.startswith('"') and field_value.endswith('"'):
- field_value = field_value[1:-1]
- fields[field_name] = field_value
+ # Use a more robust approach with manual parsing
+ i = 0
+ while i < len(content):
+ # Skip whitespace
+ while i < len(content) and content[i].isspace():
+ i += 1
+
+ if i >= len(content):
+ break
+
+ # Look for field name
+ field_start = i
+ while i < len(content) and (content[i].isalnum() or content[i] == '_'):
+ i += 1
+
+ if i == field_start:
+ i += 1 # Skip non-alphanumeric character
+ continue
+
+ field_name = content[field_start:i].lower()
+
+ # Skip whitespace
+ while i < len(content) and content[i].isspace():
+ i += 1
+
+ # Look for equals sign
+ if i >= len(content) or content[i] != '=':
+ continue
+ i += 1 # Skip '='
+
+ # Skip whitespace
+ while i < len(content) and content[i].isspace():
+ i += 1
+
+ if i >= len(content):
+ break
+
+ # Parse field value
+ field_value = ""
+ if content[i] == '"':
+ # Handle quoted strings
+ i += 1 # Skip opening quote
+ value_start = i
+ while i < len(content) and content[i] != '"':
+ i += 1
+ if i < len(content):
+ field_value = content[value_start:i]
+ i += 1 # Skip closing quote
+ elif content[i] == '{':
+ # Handle braced strings with proper nesting
+ brace_count = 0
+ value_start = i + 1 # Skip opening brace
+ i += 1
+ while i < len(content):
+ if content[i] == '{':
+ brace_count += 1
+ elif content[i] == '}':
+ if brace_count == 0:
+ break
+ brace_count -= 1
+ i += 1
+
+ if i < len(content):
+ field_value = content[value_start:i]
+ i += 1 # Skip closing brace
+
+ if field_value:
+ field_value = field_value.strip()
+ # Strip outer quotes if present (handles cases like title = {"Some Title"})
+ if field_value.startswith('"') and field_value.endswith('"'):
+ field_value = field_value[1:-1]
+ fields[field_name] = field_value
+
+ # Skip to next field (look for comma)
+ while i < len(content) and content[i] not in ',}':
+ i += 1
+ if i < len(content) and content[i] == ',':
+ i += 1
 
- # If field extraction failed, try a simpler approach
+ # Fallback to regex if manual parsing failed
  if not fields:
- logger.debug("Field extraction failed, trying line-by-line approach")
- lines = content.split('\n')
- for line in lines:
- line = line.strip()
- if '=' in line:
- field_match = re.match(r'(\w+)\s*=\s*[{"]([^{}"]*)[}"]', line)
- if field_match:
- field_name = field_match.group(1).lower()
- field_value = field_match.group(2).strip()
- # Strip outer quotes if present
- if field_value.startswith('"') and field_value.endswith('"'):
- field_value = field_value[1:-1]
- fields[field_name] = field_value
+ logger.debug("Manual parsing failed, trying regex approach")
+ field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
+
+ for match in re.finditer(field_pattern, content, re.DOTALL):
+ field_name = match.group(1).lower()
+ field_value = match.group(2) or match.group(3) or ""
+ field_value = field_value.strip()
+ if field_value.startswith('"') and field_value.endswith('"'):
+ field_value = field_value[1:-1]
+ fields[field_name] = field_value
 
  return {
  'type': entry_type,
@@ -216,6 +278,9 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
 
  # Extract journal/venue
  journal = fields.get('journal', fields.get('booktitle', fields.get('venue', '')))
+ # Remove braces from journal/venue names
+ if journal and journal.startswith('{') and journal.endswith('}'):
+ journal = journal[1:-1]
 
  # Extract DOI and construct URL
  doi = fields.get('doi', '')
@@ -225,6 +290,9 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
 
  # Extract other URLs
  url = fields.get('url', '')
+ if url:
+ from utils.url_utils import clean_url
+ url = clean_url(url)
 
  # Handle special @misc entries with only howpublished field
  if not title and not authors and entry_type == 'misc':
@@ -249,6 +317,10 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
  else:
  url = howpublished
 
+ # Clean the reconstructed URL
+ from utils.url_utils import clean_url
+ url = clean_url(url)
+
  # Generate title from domain/path
  if 'jailbreakchat.com' in domain:
  title = 'JailbreakChat Website'
@@ -275,6 +347,11 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
 
  if url.startswith('\\url{') and url.endswith('}'):
  url = url[5:-1] # Remove \url{...}
+
+ # Clean any URL we extracted
+ if url:
+ from utils.url_utils import clean_url
+ url = clean_url(url)
 
  # Construct ArXiv URL from eprint field if no URL present
  if not url and not doi_url:
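For readers skimming the diff, the gist of the new brace-aware field scanner added above can be summarized with a small standalone sketch. The function below is illustrative only (its name and simplified control flow are not the package's code); it shows the same idea the hunk implements: walk the entry character by character, read name = {value} or name = "value" pairs, and track brace depth so nested braces stay inside the value.

import re

def parse_fields_sketch(content: str) -> dict:
    """Illustrative brace-aware scan of 'name = {value}' / 'name = "value"' pairs."""
    fields = {}
    i = 0
    while i < len(content):
        # Locate the next 'name =' pair
        m = re.match(r'\s*([A-Za-z_]\w*)\s*=\s*', content[i:])
        if not m:
            break
        name = m.group(1).lower()
        i += m.end()
        if i >= len(content):
            break
        if content[i] == '"':                    # quoted value
            end = content.find('"', i + 1)
            if end == -1:
                break
            value, i = content[i + 1:end], end + 1
        elif content[i] == '{':                  # braced value: track nesting depth
            depth, j = 1, i + 1
            while j < len(content) and depth:
                depth += {'{': 1, '}': -1}.get(content[j], 0)
                j += 1
            value, i = content[i + 1:j - 1], j
        else:
            break
        fields[name] = value.strip()
        comma = content.find(',', i)             # advance to the next field
        if comma == -1:
            break
        i = comma + 1
    return fields

print(parse_fields_sketch('title = {A {Nested} Title}, year = {2020}, journal = "Example Journal"'))
# {'title': 'A {Nested} Title', 'year': '2020', 'journal': 'Example Journal'}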
utils/text_utils.py CHANGED
@@ -11,6 +11,69 @@ from typing import List
  logger = logging.getLogger(__name__)
 
 
+ def expand_abbreviations(text: str) -> str:
+ """
+ Generic abbreviation expansion using common academic patterns.
+
+ This function expands common academic abbreviations to their full forms
+ to improve venue name matching and comparison.
+
+ Args:
+ text: Text containing potential abbreviations
+
+ Returns:
+ Text with abbreviations expanded
+ """
+ if not text:
+ return text
+
+ common_abbrevs = {
+ # IEEE specific abbreviations (only expand with periods, not full words)
+ 'robot.': 'robotics', 'autom.': 'automation', 'lett.': 'letters',
+ 'trans.': 'transactions', 'syst.': 'systems', 'netw.': 'networks',
+ 'learn.': 'learning', 'ind.': 'industrial', 'electron.': 'electronics',
+ 'mechatron.': 'mechatronics', 'intell.': 'intelligence',
+ 'transp.': 'transportation', 'contr.': 'control', 'mag.': 'magazine',
+ # General academic abbreviations (only expand with periods)
+ 'int.': 'international', 'intl.': 'international', 'conf.': 'conference',
+ 'j.': 'journal', 'proc.': 'proceedings', 'assoc.': 'association',
+ 'comput.': 'computing', 'sci.': 'science', 'eng.': 'engineering',
+ 'tech.': 'technology', 'artif.': 'artificial', 'mach.': 'machine',
+ 'stat.': 'statistics', 'math.': 'mathematics', 'phys.': 'physics',
+ 'chem.': 'chemistry', 'bio.': 'biology', 'med.': 'medicine',
+ 'adv.': 'advances', 'ann.': 'annual', 'symp.': 'symposium',
+ 'workshop': 'workshop', 'worksh.': 'workshop',
+ 'natl.': 'national', 'acad.': 'academy', 'rev.': 'review',
+ # Physics journal abbreviations
+ 'phys.': 'physics', 'phys. rev.': 'physical review',
+ 'phys. rev. lett.': 'physical review letters',
+ 'phys. rev. a': 'physical review a', 'phys. rev. b': 'physical review b',
+ 'phys. rev. c': 'physical review c', 'phys. rev. d': 'physical review d',
+ 'phys. rev. e': 'physical review e', 'phys. lett.': 'physics letters',
+ 'phys. lett. b': 'physics letters b', 'nucl. phys.': 'nuclear physics',
+ 'nucl. phys. a': 'nuclear physics a', 'nucl. phys. b': 'nuclear physics b',
+ 'j. phys.': 'journal of physics', 'ann. phys.': 'annals of physics',
+ 'mod. phys. lett.': 'modern physics letters', 'eur. phys. j.': 'european physical journal',
+ # Nature journals
+ 'nature phys.': 'nature physics', 'sci. adv.': 'science advances',
+ # Handle specific multi-word patterns and well-known acronyms
+ 'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
+ 'pnas': 'proceedings of the national academy of sciences',
+ 'neurips': 'neural information processing systems',
+ }
+
+ # Sort by length (longest first) to ensure longer matches take precedence
+ for abbrev, expansion in sorted(common_abbrevs.items(), key=lambda x: len(x[0]), reverse=True):
+ # For abbreviations ending in period, use word boundary at start only
+ if abbrev.endswith('.'):
+ pattern = r'\b' + re.escape(abbrev)
+ else:
+ pattern = r'\b' + re.escape(abbrev) + r'\b'
+ text = re.sub(pattern, expansion, text)
+
+ return text
+
+
  def normalize_apostrophes(text):
  """
  Normalize all apostrophe variants to standard ASCII apostrophe
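Based purely on the mapping and regex logic in the hunk above, the new expand_abbreviations helper can be exercised as below. The abbreviation keys are lowercase and re.sub is used without IGNORECASE, so these examples assume the venue string has already been lowercased; the import path mirrors how the package imports its other text_utils helpers.

from utils.text_utils import expand_abbreviations

# Keys in the abbreviation map are lowercase and re.sub is case-sensitive,
# so venue strings are assumed to be lowercased before the call.
print(expand_abbreviations("ieee trans. robot. autom. lett."))
# -> "ieee transactions robotics automation letters"

print(expand_abbreviations("proc. natl. acad. sci."))
# -> "proceedings of the national academy of sciences"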
utils/url_utils.py CHANGED
@@ -209,7 +209,13 @@ def validate_url_format(url: str) -> bool:
 
  def clean_url(url: str) -> str:
  """
- Clean a URL by removing common issues like extra spaces, fragments, etc.
+ Clean a URL by removing common issues like extra spaces, fragments, malformed LaTeX, etc.
+
+ This function handles:
+ - Whitespace trimming
+ - Malformed LaTeX URL wrappers like \\url{https://...}
+ - Trailing punctuation from academic references
+ - DOI URL query parameter cleanup
 
  Args:
  url: URL to clean
@@ -223,6 +229,18 @@ def clean_url(url: str) -> str:
  # Remove leading/trailing whitespace
  url = url.strip()
 
+ # Handle malformed URLs that contain \url{} wrappers within the URL text
+ # e.g., "https://\url{https://www.example.com/}" -> "https://www.example.com/"
+ import re
+ url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
+ url_match = re.search(url_pattern, url)
+ if url_match:
+ url = url_match.group(1)
+
+ # Remove trailing punctuation that's commonly part of sentence structure
+ # but preserve legitimate URL characters
+ url = url.rstrip('.,;!?)')
+
  # Note: Preserving query parameters for all URLs now
  # Previously this function removed query parameters for non-DOI URLs,
  # but this was causing issues with OpenReview and other URLs that need their parameters
@@ -254,6 +272,14 @@ def clean_url_punctuation(url: str) -> str:
  # Remove leading/trailing whitespace
  url = url.strip()
 
+ # Handle malformed URLs that contain \\url{} wrappers within the URL text
+ # e.g., "https://\\url{https://www.example.com/}" -> "https://www.example.com/"
+ import re
+ url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
+ url_match = re.search(url_pattern, url)
+ if url_match:
+ url = url_match.group(1)
+
  # Remove trailing punctuation that's commonly part of sentence structure
  # but preserve legitimate URL characters
  url = url.rstrip('.,;!?)')
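Taken together, the two url_utils.py hunks mean that clean_url (and clean_url_punctuation) now unwrap a stray \url{...} embedded inside an http(s) prefix and strip trailing sentence punctuation. A quick sketch of the expected behaviour, assuming no other normalization in the full function alters these particular inputs:

from utils.url_utils import clean_url

# A URL field that ended up doubly wrapped, e.g. "https://\url{https://...}"
print(clean_url(r"https://\url{https://www.example.com/}"))
# -> "https://www.example.com/"

# Trailing sentence punctuation is stripped; legitimate path characters are kept
print(clean_url("https://www.example.com/refs)."))
# -> "https://www.example.com/refs"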