academic-refchecker 2.0.8-py3-none-any.whl → 2.0.10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 2.0.8
+Version: 2.0.10
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT
@@ -1,10 +1,10 @@
-academic_refchecker-2.0.8.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
+academic_refchecker-2.0.10.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
 backend/__init__.py,sha256=TFVkOx5tSp3abty15RzUbaSwQ9ZD0kfUn7PDh63xkYY,521
 backend/__main__.py,sha256=74V7yUMsRSZaaRyXYm-rZVc3TVUcUgwsoTQTUbV5EqM,211
 backend/cli.py,sha256=xV3l9M5OdNQQYOcrzj2d_7RmCgj7CXP_1oi0TPe6zNo,1672
 backend/concurrency.py,sha256=2KY9I_8dDkyl_HTGx27ZxU4rFXx2vqbGOlo5RrRbPjA,3223
 backend/database.py,sha256=1jLP1m9vNk5sEs4bh_xmX0T5ilZkUTX1c7nOVz5XnNc,30681
-backend/main.py,sha256=2ziCLwEmvPPtSiF6nuh2az2Lqg8JI9PytKWiow1V-4M,54586
+backend/main.py,sha256=cenE0Vxleh1LP45EOUqh4FTCXCS0OXbPOYWxaOLMfGE,54778
 backend/models.py,sha256=El2F-RTHgxQ7-WODmiYCpjsTFDpjwF9PBt-JDa_XipE,2591
 backend/refchecker_wrapper.py,sha256=ZOg5Rc0Mgac3ALwxA55pTCeqCL06AWOBZLQeTeZEJcY,52038
 backend/thumbnail.py,sha256=wPFXp3RlmcL9jVKZmSBRB7Pfy9Ti7nCnzNtL4osfNtM,17618
@@ -16,7 +16,7 @@ backend/static/assets/index-2P6L_39v.css,sha256=KC3Wa6jfD1qwmEoVpqTovlzf8fsn5oHY
 backend/static/assets/index-hk21nqxR.js,sha256=z2agP8ZFYw4AfYi-GJ5E_8_k-lPF-frXOJtPk-I0hDs,369533
 refchecker/__init__.py,sha256=Pg5MrtLxDBRcNYcI02N-bv3tzURVd1S3nQ8IyF7Zw7E,322
 refchecker/__main__.py,sha256=agBbT9iKN0g2xXtRNCoh29Nr7z2n5vU-r0MCVJKi4tI,232
-refchecker/__version__.py,sha256=Zjb1PH2--VphovcG6srpeLZmZ4Kukc7voiH8Phuvx7c,65
+refchecker/__version__.py,sha256=-Z5Qa0W7m3Azi2xuo3NQNPvyofIq7M771Vvd2YjQ1-4,66
 refchecker/checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
 refchecker/checkers/crossref.py,sha256=88moAyTudBqf9SKqTQkNAq1yyuRe95f8r4EpmJznupQ,20937
 refchecker/checkers/enhanced_hybrid_checker.py,sha256=2jIeUX7hankPok3M4de9o2bsJZ17ZomuLkdfdr9EV0s,28671
@@ -54,11 +54,11 @@ refchecker/utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,
 refchecker/utils/doi_utils.py,sha256=_7YvQ0DTOQBMIujUE0SdJicjPiAR3VETLU668GIji24,6094
 refchecker/utils/error_utils.py,sha256=8TcfRUD6phZ7viPJrezQ4jKf_vE65lqEXZq5707eU6s,15425
 refchecker/utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
-refchecker/utils/text_utils.py,sha256=v5beDt_fyx4ETfTXLYrDMp3CuUGoDoLs7-d1H2GdySE,228585
+refchecker/utils/text_utils.py,sha256=ZIdvP75F_4o_p2lB24CkuX_eEjB9x-BY2FlXsOiYjkQ,234082
 refchecker/utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
 refchecker/utils/url_utils.py,sha256=7b0rWCQJSajzqOvD7ghsBZPejiq6mUIz6SGhvU_WGDs,9441
-academic_refchecker-2.0.8.dist-info/METADATA,sha256=-hTJhL3BwqS2hvFrvt3AKnNqIncD9wU3ltrJxdsK1F0,26575
-academic_refchecker-2.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-academic_refchecker-2.0.8.dist-info/entry_points.txt,sha256=9cREsaKwlp05Ql0CBIjKrNHk5IG2cHY5LvJPsV2-SxA,108
-academic_refchecker-2.0.8.dist-info/top_level.txt,sha256=FfNvrvpj25gfpUBjW0epvz7Qrdejhups5Za_DBiSRu4,19
-academic_refchecker-2.0.8.dist-info/RECORD,,
+academic_refchecker-2.0.10.dist-info/METADATA,sha256=1cTa4-OOQW4VJHJbGuc9OWStOx9j8qKRKF0iDhe7vbk,26576
+academic_refchecker-2.0.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+academic_refchecker-2.0.10.dist-info/entry_points.txt,sha256=9cREsaKwlp05Ql0CBIjKrNHk5IG2cHY5LvJPsV2-SxA,108
+academic_refchecker-2.0.10.dist-info/top_level.txt,sha256=FfNvrvpj25gfpUBjW0epvz7Qrdejhups5Za_DBiSRu4,19
+academic_refchecker-2.0.10.dist-info/RECORD,,
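
Each RECORD entry above is `path,digest,size`, where the digest is an unpadded urlsafe-base64 SHA-256 per the wheel spec. The one-byte size change on refchecker/__version__.py (65 → 66) is just the extra digit in the version string. A minimal sketch for re-checking an entry against an unpacked wheel (the path argument is illustrative):

```python
import base64
import hashlib
from pathlib import Path

def record_digest(path: Path) -> str:
    """Wheel RECORD digest: 'sha256=' + urlsafe base64 of the raw hash, no '=' padding."""
    raw = hashlib.sha256(path.read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")

# Against an unpacked 2.0.10 wheel, this should reproduce the RECORD line above:
# record_digest(Path("refchecker/__version__.py"))
# -> 'sha256=-Z5Qa0W7m3Azi2xuo3NQNPvyofIq7M771Vvd2YjQ1-4'
```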
backend/main.py CHANGED
@@ -13,6 +13,7 @@ from fastapi.responses import FileResponse, HTMLResponse
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 import logging
+from refchecker.__version__ import __version__
 
 import aiosqlite
 from .database import db
@@ -136,6 +137,12 @@ async def health():
     return {"status": "healthy"}
 
 
+@app.get("/api/version")
+async def version():
+    """Return server/CLI version from refchecker package."""
+    return {"version": __version__}
+
+
 @app.websocket("/api/ws/{session_id}")
 async def websocket_endpoint(websocket: WebSocket, session_id: str):
     """WebSocket endpoint for real-time updates"""
refchecker/__version__.py CHANGED
@@ -1,3 +1,3 @@
 """Version information for RefChecker."""
 
-__version__ = "2.0.8"
+__version__ = "2.0.10"
refchecker/utils/text_utils.py CHANGED
@@ -3095,6 +3095,35 @@ def validate_parsed_references(references):
     }
 
 
+def is_access_note(text):
+    """
+    Check if text is an access note like '[Online; accessed DD-MM-YYYY]' or '[Accessed: YYYY-MM-DD]'.
+    These should not be treated as titles or venues.
+
+    Args:
+        text: Text to check
+
+    Returns:
+        True if text appears to be an access/retrieval note
+    """
+    if not text:
+        return False
+    text_clean = text.strip().rstrip('.')
+    # Common patterns for access notes
+    access_patterns = [
+        r'^\[Online;?\s*accessed\s+[\d\-/]+\]$',  # [Online; accessed 07-12-2024]
+        r'^\[Accessed:?\s+[\d\-/]+\]$',           # [Accessed: 2024-07-12]
+        r'^\[Online\]$',                          # [Online]
+        r'^\[accessed\s+[\d\-/]+\]$',             # [accessed 07-12-2024]
+        r'^\[Online,?\s+accessed\s+[\d\-/]+\]$',  # [Online, accessed 07-12-2024]
+        r'^Online;\s*accessed\s+[\d\-/]+$',       # Online; accessed 07-12-2024 (without brackets)
+    ]
+    for pattern in access_patterns:
+        if re.match(pattern, text_clean, re.IGNORECASE):
+            return True
+    return False
+
+
 def extract_latex_references(text, file_path=None):  # pylint: disable=unused-argument
     """
     Extract references from LaTeX content programmatically
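
To make the helper's intent concrete, here are inputs it should accept and reject, derived directly from the patterns above (note the case-insensitive match and the stripped trailing period):

```python
# Access notes: matched case-insensitively, optional trailing period.
for note in [
    "[Online; accessed 07-12-2024]",
    "[Accessed: 2024-07-12]",
    "[Online]",
    "[accessed 07-12-2024]",
    "Online; accessed 07-12-2024",
    "[online; accessed 07/12/2024].",
]:
    assert is_access_note(note)

# Real titles and venues fall through to normal parsing.
for text in ["The caida anonymized internet traces",
             "Proceedings of the ACM SIGCOMM Conference"]:
    assert not is_access_note(text)
```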
@@ -3220,191 +3249,244 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-argument
         # Clean and extract authors
         author_part_clean = strip_latex_commands(author_part).strip()
 
-        # Simple fix: just improve the organization detection without complex parsing
-        # Remove year pattern first - handle both parenthetical and standalone years
-        author_text_clean = re.sub(r'\s*\(\d{4}\)\.?$', '', author_part_clean).strip()
-        author_text_clean = re.sub(r'\s+\d{4}\.?$', '', author_text_clean).strip()
+        # Special case: Check if second part is just an access note like [Online; accessed ...]
+        # This indicates the reference has no authors, and the first part is actually the title
+        # e.g., "The caida anonymized internet traces.\n\newblock [Online; accessed 07-12-2024]."
+        first_part_is_title = False
+        if len(parts) >= 2:
+            second_part_clean = strip_latex_commands(parts[1]).strip()
+            if is_access_note(second_part_clean):
+                first_part_is_title = True
+                # Use first part as title, not authors
+                title_text_from_first = author_part_clean.rstrip('.')
+                if title_text_from_first and len(title_text_from_first) > 5:
+                    ref['title'] = title_text_from_first
+                # Don't set authors - this reference has none (or just a dataset name)
+
+        if first_part_is_title:
+            # Skip normal author/title parsing - already handled above
+            pass
+        else:
+            # Normal case: first part contains authors
+            # Simple fix: just improve the organization detection without complex parsing
+            # Remove year pattern first - handle both parenthetical and standalone years
+            author_text_clean = re.sub(r'\s*\(\d{4}\)\.?$', '', author_part_clean).strip()
+            author_text_clean = re.sub(r'\s+\d{4}\.?$', '', author_text_clean).strip()
 
-        # Better organization detection - check if it looks like multiple authors
-        is_multi_author = (
-            ', and ' in author_text_clean or  # "A, B, and C" format
-            ' and ' in author_text_clean or  # "A and B" format
-            re.search(r'\w+,\s+[A-Z]\.', author_text_clean) or  # "Last, F." patterns
-            (author_text_clean.count(',') >= 2 and len(author_text_clean) > 30)  # Multiple commas in longer text
-        )
+            # Better organization detection - check if it looks like multiple authors
+            is_multi_author = (
+                ', and ' in author_text_clean or  # "A, B, and C" format
+                ' and ' in author_text_clean or  # "A and B" format
+                re.search(r'\w+,\s+[A-Z]\.', author_text_clean) or  # "Last, F." patterns
+                (author_text_clean.count(',') >= 2 and len(author_text_clean) > 30)  # Multiple commas in longer text
+            )
 
-        if is_multi_author:
-            # Parse multiple authors - use existing logic from parse_authors_with_initials
-            try:
-                parsed_authors = parse_authors_with_initials(author_text_clean)
-                if parsed_authors and len(parsed_authors) > 1:
-                    # Clean up "and" prefixes, periods, and preserve "et al"
-                    cleaned_authors = []
-                    for author in parsed_authors:
-                        # Remove leading "and"
-                        author = re.sub(r'^and\s+', '', author.strip())
-                        # Remove trailing periods that shouldn't be there
-                        author = clean_author_name(author)
-                        # Preserve "et al" variants to enable proper author count handling
-                        if author.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'al., et', 'others', 'and others']:
-                            cleaned_authors.append('et al')  # Normalize to standard form
-                        else:
-                            cleaned_authors.append(author)
-                    if cleaned_authors:
-                        ref['authors'] = cleaned_authors
-                else:
-                    # Fallback: try once more with semicolon handling, then simple comma split
+            if is_multi_author:
+                # Parse multiple authors - use existing logic from parse_authors_with_initials
+                try:
+                    parsed_authors = parse_authors_with_initials(author_text_clean)
+                    if parsed_authors and len(parsed_authors) > 1:
+                        # Clean up "and" prefixes, periods, and preserve "et al"
+                        cleaned_authors = []
+                        for author in parsed_authors:
+                            # Remove leading "and"
+                            author = re.sub(r'^and\s+', '', author.strip())
+                            # Remove trailing periods that shouldn't be there
+                            author = clean_author_name(author)
+                            # Preserve "et al" variants to enable proper author count handling
+                            if author.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'al., et', 'others', 'and others']:
+                                cleaned_authors.append('et al')  # Normalize to standard form
+                            else:
+                                cleaned_authors.append(author)
+                        if cleaned_authors:
+                            ref['authors'] = cleaned_authors
+                    else:
+                        # Fallback: try once more with semicolon handling, then simple comma split
+                        simple_authors = []
+                        try:
+                            # Try parsing again with normalized separators
+                            normalized_text = re.sub(r';\s*and\s+', ', ', author_text_clean)
+                            fallback_authors = parse_authors_with_initials(normalized_text)
+                            if fallback_authors and len(fallback_authors) >= 2:
+                                simple_authors = fallback_authors
+                            else:
+                                raise ValueError("Fallback parsing failed")
+                        except:
+                            # Last resort: naive comma split
+                            for a in author_text_clean.split(','):
+                                a = a.strip()
+                                # Remove "and" prefix and skip short/empty entries
+                                a = re.sub(r'^and\s+', '', a)
+                                # Clean author name (remove unnecessary periods)
+                                a = clean_author_name(a)
+                                if a and len(a) > 2:
+                                    # Preserve "et al" variants to enable proper author count handling
+                                    if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+                                        simple_authors.append('et al')  # Normalize to standard form
+                                    else:
+                                        simple_authors.append(a)
+                                elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+                                    simple_authors.append('et al')  # Handle short "et al" variants
+
+                        if simple_authors:
+                            ref['authors'] = simple_authors
+                except Exception:
+                    # Fallback: simple comma split with cleanup
                     simple_authors = []
-                    try:
-                        # Try parsing again with normalized separators
-                        normalized_text = re.sub(r';\s*and\s+', ', ', author_text_clean)
-                        fallback_authors = parse_authors_with_initials(normalized_text)
-                        if fallback_authors and len(fallback_authors) >= 2:
-                            simple_authors = fallback_authors
-                        else:
-                            raise ValueError("Fallback parsing failed")
-                    except:
-                        # Last resort: naive comma split
-                        for a in author_text_clean.split(','):
-                            a = a.strip()
-                            # Remove "and" prefix and skip short/empty entries
-                            a = re.sub(r'^and\s+', '', a)
-                            # Clean author name (remove unnecessary periods)
-                            a = clean_author_name(a)
-                            if a and len(a) > 2:
-                                # Preserve "et al" variants to enable proper author count handling
-                                if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
-                                    simple_authors.append('et al')  # Normalize to standard form
-                                else:
-                                    simple_authors.append(a)
-                            elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
-                                simple_authors.append('et al')  # Handle short "et al" variants
-
+                    for a in author_text_clean.split(','):
+                        a = a.strip()
+                        # Remove "and" prefix and skip short/empty entries
+                        a = re.sub(r'^and\s+', '', a)
+                        # Clean author name (remove unnecessary periods)
+                        a = clean_author_name(a)
+                        if a and len(a) > 2:
+                            # Preserve "et al" variants to enable proper author count handling
+                            if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+                                simple_authors.append('et al')  # Normalize to standard form
+                            else:
+                                simple_authors.append(a)
+                        elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+                            simple_authors.append('et al')  # Handle short "et al" variants
                     if simple_authors:
                         ref['authors'] = simple_authors
-            except Exception:
-                # Fallback: simple comma split with cleanup
-                simple_authors = []
-                for a in author_text_clean.split(','):
-                    a = a.strip()
-                    # Remove "and" prefix and skip short/empty entries
-                    a = re.sub(r'^and\s+', '', a)
-                    # Clean author name (remove unnecessary periods)
-                    a = clean_author_name(a)
-                    if a and len(a) > 2:
-                        # Preserve "et al" variants to enable proper author count handling
-                        if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
-                            simple_authors.append('et al')  # Normalize to standard form
-                        else:
-                            simple_authors.append(a)
-                    elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
-                        simple_authors.append('et al')  # Handle short "et al" variants
-                if simple_authors:
-                    ref['authors'] = simple_authors
-        else:
-            # Single organization author
-            author_name = clean_author_name(author_text_clean)
-            if author_name and len(author_name) > 2:
-                ref['authors'] = [author_name]
+            else:
+                # Single organization author
+                author_name = clean_author_name(author_text_clean)
+                if author_name and len(author_name) > 2:
+                    ref['authors'] = [author_name]
 
         # Second part is usually title
-        if len(parts) >= 2:
+        if len(parts) >= 2 and not first_part_is_title:
             title_part = parts[1].strip()
 
-            # Handle \href{URL}{text} or \href {URL} {text} format
-            # Extract URL before stripping LaTeX commands
-            # We need to use balanced brace matching because titles can contain
-            # nested braces like {LLM} for capitalization protection
-            href_url = None
-            title_text = None
+            # Check if this is an access note - skip if so
+            title_part_clean = strip_latex_commands(title_part).strip()
+            if is_access_note(title_part_clean):
+                # This is just an access note, not a title
+                pass
+            else:
+                # Check if this is a URL-only part (common for @misc website references)
+                # Pattern: \url{...}, YEAR or just \url{...}
+                # In this case, use the author/organization name as the title instead
+                url_only_match = re.match(r'^\\url\{[^}]+\}(?:\s*,\s*\d{4})?\.?\s*$', title_part)
+                if url_only_match:
+                    # This is a URL-only block, not a title
+                    # For website/misc references, the org name IS the title
+                    # Use the author_part_clean as title if it looks like an org name
+                    if author_part_clean and not ref.get('title'):
+                        # Organization names are often in braces, clean them up
+                        org_title = author_part_clean.strip('{}.')
+                        if org_title and len(org_title) > 2:
+                            ref['title'] = org_title
+                    # Continue to extract URL below
 
-            href_start = title_part.find('\\href')
-            if href_start != -1:
-                # Find first opening brace (URL)
-                pos = href_start + 5  # Skip \href
-                while pos < len(title_part) and title_part[pos] in ' \t\n':
-                    pos += 1
+                # Handle \href{URL}{text} or \href {URL} {text} format
+                # Extract URL before stripping LaTeX commands
+                # We need to use balanced brace matching because titles can contain
+                # nested braces like {LLM} for capitalization protection
+                href_url = None
+                title_text = None
 
-                if pos < len(title_part) and title_part[pos] == '{':
-                    # Extract URL using balanced braces
-                    brace_count = 0
-                    url_start = pos + 1
-                    url_end = pos
-                    for i in range(pos, len(title_part)):
-                        if title_part[i] == '{':
-                            brace_count += 1
-                        elif title_part[i] == '}':
-                            brace_count -= 1
-                            if brace_count == 0:
-                                url_end = i
-                                break
+                href_start = title_part.find('\\href')
+                if href_start != -1:
+                    # Find first opening brace (URL)
+                    pos = href_start + 5  # Skip \href
+                    while pos < len(title_part) and title_part[pos] in ' \t\n':
+                        pos += 1
 
-                if url_end > url_start:
-                    href_url = title_part[url_start:url_end].strip()
-
-                    # Now find the second brace group (title text)
-                    pos = url_end + 1
-                    while pos < len(title_part) and title_part[pos] in ' \t\n':
-                        pos += 1
+                    if pos < len(title_part) and title_part[pos] == '{':
+                        # Extract URL using balanced braces
+                        brace_count = 0
+                        url_start = pos + 1
+                        url_end = pos
+                        for i in range(pos, len(title_part)):
+                            if title_part[i] == '{':
+                                brace_count += 1
+                            elif title_part[i] == '}':
+                                brace_count -= 1
+                                if brace_count == 0:
+                                    url_end = i
+                                    break
 
-                    if pos < len(title_part) and title_part[pos] == '{':
-                        # Extract title text using balanced braces
-                        brace_count = 0
-                        text_start = pos + 1
-                        text_end = pos
-                        for i in range(pos, len(title_part)):
-                            if title_part[i] == '{':
-                                brace_count += 1
-                            elif title_part[i] == '}':
-                                brace_count -= 1
-                                if brace_count == 0:
-                                    text_end = i
-                                    break
+                        if url_end > url_start:
+                            href_url = title_part[url_start:url_end].strip()
 
-                    if text_end > text_start:
-                        title_text = title_part[text_start:text_end].strip()
-
-            if href_url and title_text:
+                            # Now find the second brace group (title text)
+                            pos = url_end + 1
+                            while pos < len(title_part) and title_part[pos] in ' \t\n':
+                                pos += 1
+
+                            if pos < len(title_part) and title_part[pos] == '{':
+                                # Extract title text using balanced braces
+                                brace_count = 0
+                                text_start = pos + 1
+                                text_end = pos
+                                for i in range(pos, len(title_part)):
+                                    if title_part[i] == '{':
+                                        brace_count += 1
+                                    elif title_part[i] == '}':
+                                        brace_count -= 1
+                                        if brace_count == 0:
+                                            text_end = i
+                                            break
+
+                                if text_end > text_start:
+                                    title_text = title_part[text_start:text_end].strip()
 
-                # Extract DOI if it's a doi.org URL
-                if 'doi.org/' in href_url and not ref.get('doi'):
-                    doi_match = re.search(r'doi\.org/(.+)$', href_url)
-                    if doi_match:
-                        ref['doi'] = doi_match.group(1)
+                if href_url and title_text:
+
+                    # Extract DOI if it's a doi.org URL
+                    if 'doi.org/' in href_url and not ref.get('doi'):
+                        doi_match = re.search(r'doi\.org/(.+)$', href_url)
+                        if doi_match:
+                            ref['doi'] = doi_match.group(1)
+                        ref['url'] = href_url
+                    # Extract arXiv ID if it's an arxiv URL
+                    elif 'arxiv.org/' in href_url.lower() and not ref.get('url'):
+                        ref['url'] = href_url
+                    # Generic URL
+                    elif not ref.get('url'):
                         ref['url'] = href_url
-                # Extract arXiv ID if it's an arxiv URL
-                elif 'arxiv.org/' in href_url.lower() and not ref.get('url'):
-                    ref['url'] = href_url
-                # Generic URL
-                elif not ref.get('url'):
-                    ref['url'] = href_url
+
+                    # Use the title text (second part of href), not the URL
+                    title_clean = strip_latex_commands(title_text).strip()
+                elif not url_only_match:
+                    # Only extract title from this part if it's not a URL-only block
+                    title_clean = strip_latex_commands(title_part).strip()
+                else:
+                    # URL-only block - title already set from org name above
+                    title_clean = None
 
-                # Use the title text (second part of href), not the URL
-                title_clean = strip_latex_commands(title_text).strip()
-            else:
-                title_clean = strip_latex_commands(title_part).strip()
-
-            # Remove trailing dots and clean up
-            title_clean = title_clean.rstrip('.')
-            if title_clean and len(title_clean) > 5:  # Reasonable title length
-                ref['title'] = title_clean
+                # Remove trailing dots and clean up
+                if title_clean:
+                    title_clean = title_clean.rstrip('.')
+                    # Also remove leading comma and year pattern that may remain from URL stripping
+                    title_clean = re.sub(r'^,\s*\d{4}\s*$', '', title_clean).strip()
+                    title_clean = re.sub(r'^,\s*', '', title_clean).strip()
+                    if title_clean and len(title_clean) > 5:  # Reasonable title length
+                        ref['title'] = title_clean
 
         # Third part is usually venue/journal
         if len(parts) >= 3:
             venue_part = parts[2].strip()
             venue_clean = strip_latex_commands(venue_part).strip()
-            # Remove "In " prefix if present (common in bbl format)
-            venue_clean = re.sub(r'^In\s+', '', venue_clean)
-            # Remove trailing year only (at end of string), not year in the middle of venue name
-            # e.g., "2020 Conference on..." should keep the conference name
-            if ref['year']:
-                # Only remove year if it appears at the very end (possibly with punctuation)
-                venue_clean = re.sub(rf',?\s*{ref["year"]}\s*\.?\s*$', '', venue_clean)
-            venue_clean = venue_clean.rstrip(',. ')
-            # Filter out common non-venue patterns that shouldn't be treated as venues
-            non_venue_patterns = ['URL', 'url', 'http:', 'https:', 'DOI', 'doi:', 'ArXiv', 'arxiv:']
-            if venue_clean and not any(pattern in venue_clean for pattern in non_venue_patterns):
-                ref['journal'] = venue_clean
+
+            # Check if this is an access note - skip if so
+            if is_access_note(venue_clean):
+                pass  # Don't treat access notes as venues
+            else:
+                # Remove "In " prefix if present (common in bbl format)
+                venue_clean = re.sub(r'^In\s+', '', venue_clean)
+                # Remove trailing year only (at end of string), not year in the middle of venue name
+                # e.g., "2020 Conference on..." should keep the conference name
+                if ref['year']:
+                    # Only remove year if it appears at the very end (possibly with punctuation)
+                    venue_clean = re.sub(rf',?\s*{ref["year"]}\s*\.?\s*$', '', venue_clean)
+                venue_clean = venue_clean.rstrip(',. ')
+                # Filter out common non-venue patterns that shouldn't be treated as venues
+                non_venue_patterns = ['URL', 'url', 'http:', 'https:', 'DOI', 'doi:', 'ArXiv', 'arxiv:']
+                if venue_clean and not any(pattern in venue_clean for pattern in non_venue_patterns):
+                    ref['journal'] = venue_clean
 
         # Extract URL if present
         url_match = re.search(r'\\url\{([^}]+)\}', content)
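
Taken together, the restructuring means a \bibitem whose second \newblock is only an access note now yields a title and no phantom authors. A sketch of the expected behavior (assuming academic-refchecker 2.0.10 is installed; the exact return shape is the list of reference dicts built above):

```python
from refchecker.utils.text_utils import extract_latex_references

# The example cited in the diff's own comments: a reference with no authors,
# whose second \newblock is only an access note.
bbl = r"""\bibitem{caida}
The caida anonymized internet traces.
\newblock [Online; accessed 07-12-2024].
"""

refs = extract_latex_references(bbl)
# Expected with 2.0.10: the first part becomes the title and no authors are set,
# whereas 2.0.8 treated "The caida anonymized internet traces" as an author name.
print(refs[0].get('title'))    # 'The caida anonymized internet traces'
print(refs[0].get('authors'))  # None (no authors recorded)
```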