academic-refchecker 2.0.13.tar.gz → 2.0.14.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-2.0.13/academic_refchecker.egg-info → academic_refchecker-2.0.14}/PKG-INFO +1 -1
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14/academic_refchecker.egg-info}/PKG-INFO +1 -1
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/main.py +33 -5
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/refchecker_wrapper.py +42 -1
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/thumbnail.py +117 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/__version__.py +1 -1
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/arxiv_citation.py +181 -49
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/enhanced_hybrid_checker.py +117 -4
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/semantic_scholar.py +43 -1
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/author_utils.py +15 -2
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/bibliography_utils.py +2 -2
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/LICENSE +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/MANIFEST.in +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/README.md +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/SOURCES.txt +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/__main__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/cli.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/concurrency.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/database.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/models.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/static/assets/index-2P6L_39v.css +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/static/assets/index-hk21nqxR.js +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/static/favicon.svg +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/static/index.html +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/static/vite.svg +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/websocket_manager.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/pyproject.toml +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/requirements.txt +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/scripts/download_db.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/scripts/run_tests.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/setup.cfg +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/__main__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/crossref.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/github_checker.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/openalex.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/openreview_checker.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/pdf_paper_checker.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/webpage_checker.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/config/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/config/logging.conf +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/config/settings.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/core/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/core/db_connection_pool.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/core/parallel_processor.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/core/refchecker.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/database/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/llm/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/llm/base.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/llm/providers.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/scripts/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/services/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/services/pdf_processor.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/__init__.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/arxiv_rate_limiter.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/arxiv_utils.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/biblatex_parser.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/bibtex_parser.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/config_validator.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/db_utils.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/doi_utils.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/error_utils.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/mock_objects.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/text_utils.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/unicode_utils.py +0 -0
- {academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/url_utils.py +0 -0
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/main.py
RENAMED

@@ -27,6 +27,7 @@ from .thumbnail import (
     generate_pdf_thumbnail_async,
     generate_pdf_preview_async,
     get_text_thumbnail_async,
+    get_text_preview_async,
     get_thumbnail_cache_path,
     get_preview_cache_path
 )
@@ -220,12 +221,15 @@ async def start_check(
     elif source_type == "text":
         if not source_text:
             raise HTTPException(status_code=400, detail="No text provided")
+        # Normalize line endings - remove all \r to prevent double carriage returns
+        # Browser may send \r\n, and Windows file writing can add extra \r
+        normalized_text = source_text.replace('\r\n', '\n').replace('\r', '\n')
         # Save pasted text to a file for later retrieval and thumbnail generation
         text_dir = Path(tempfile.gettempdir()) / "refchecker_texts"
         text_dir.mkdir(parents=True, exist_ok=True)
         text_file_path = text_dir / f"pasted_{session_id}.txt"
-        with open(text_file_path, "w", encoding="utf-8") as f:
-            f.write(source_text)
+        with open(text_file_path, "w", encoding="utf-8", newline='\n') as f:
+            f.write(normalized_text)
         paper_source = str(text_file_path)
         paper_title = "Pasted Text"
     elif source_type == "url":
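A quick standalone sketch of why the normalization above matters (sample text and file name invented): Python's text mode on Windows translates every '\n' back to '\r\n' on write, so any '\r\n' left in the payload would round-trip as '\r\r\n'.

# Collapse all carriage-return variants to '\n', exactly as the patch does.
text = "line one\r\nline two\rline three"
normalized = text.replace('\r\n', '\n').replace('\r', '\n')
assert normalized == "line one\nline two\nline three"

# newline='\n' disables platform newline translation, so the saved file
# contains exactly the normalized bytes on every OS.
with open("pasted_demo.txt", "w", encoding="utf-8", newline='\n') as f:
    f.write(normalized)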
@@ -646,9 +650,33 @@ async def get_preview(check_id: int):
                 media_type="image/png",
                 headers={"Cache-Control": "public, max-age=86400"}  # Cache for 1 day
             )
-
-
-
+
+        # For text sources, generate a high-resolution text preview for overlay display
+        if source_type == 'text':
+            logger.info(f"Generating text preview for check {check_id}")
+            preview_path = await get_text_preview_async(check_id, "", paper_source)
+            if preview_path and os.path.exists(preview_path):
+                return FileResponse(
+                    preview_path,
+                    media_type="image/png",
+                    headers={"Cache-Control": "public, max-age=86400"}
+                )
+
+        # For non-PDF file uploads, also generate a text preview
+        if source_type == 'file' and not paper_source.lower().endswith('.pdf'):
+            logger.info(f"Generating text preview for uploaded file check {check_id}")
+            if os.path.exists(paper_source):
+                preview_path = await get_text_preview_async(check_id, "", paper_source)
+            else:
+                preview_path = await get_text_preview_async(check_id, "Uploaded file")
+            if preview_path and os.path.exists(preview_path):
+                return FileResponse(
+                    preview_path,
+                    media_type="image/png",
+                    headers={"Cache-Control": "public, max-age=86400"}
+                )
+
+        raise HTTPException(status_code=404, detail="Could not generate preview")
 
     except HTTPException:
         raise
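The patched endpoint now resolves a preview in a fixed order: cached PDF preview, text preview for pasted text, text preview for non-PDF uploads, then a 404. A hypothetical standalone sketch of that order (the function name and labels are invented; source_type and paper_source mirror the handler's variables):

def pick_preview_kind(source_type: str, paper_source: str) -> str:
    # Pasted text and non-PDF uploads get a rendered text preview.
    if source_type == 'text':
        return 'text-preview'
    if source_type == 'file' and not paper_source.lower().endswith('.pdf'):
        return 'text-preview'
    return 'pdf-preview'  # PDFs keep the existing rendering path

assert pick_preview_kind('text', '/tmp/pasted_1.txt') == 'text-preview'
assert pick_preview_kind('file', 'refs.bib') == 'text-preview'
assert pick_preview_kind('file', 'paper.pdf') == 'pdf-preview'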
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/refchecker_wrapper.py
RENAMED

@@ -3,6 +3,7 @@ Wrapper around refchecker library with progress callbacks for real-time updates
 """
 import sys
 import os
+import re
 import asyncio
 import logging
 import tempfile
@@ -238,6 +239,18 @@ class ProgressRefChecker:
         if not any(u.get('url') == doi_url for u in authoritative_urls):
             authoritative_urls.append({"type": "doi", "url": doi_url})
 
+        # Add Semantic Scholar URL if available
+        s2_paper_id = external_ids.get('S2PaperId')
+        if s2_paper_id:
+            s2_url = f"https://www.semanticscholar.org/paper/{s2_paper_id}"
+            if not any(u.get('url') == s2_url for u in authoritative_urls):
+                authoritative_urls.append({"type": "semantic_scholar", "url": s2_url})
+
+        # Also check for inline S2 URL (from merged data)
+        s2_inline_url = verified_data.get('_semantic_scholar_url')
+        if s2_inline_url and not any(u.get('url') == s2_inline_url for u in authoritative_urls):
+            authoritative_urls.append({"type": "semantic_scholar", "url": s2_inline_url})
+
         # Format errors, warnings, and suggestions
         formatted_errors = []
         formatted_warnings = []
@@ -462,11 +475,20 @@ class ProgressRefChecker:
                     raise ValueError("PDF extraction requires an LLM to be configured. Please configure an LLM provider in settings.")
                 pdf_processor = PDFProcessor()
                 paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, paper_source)
-            elif paper_source.lower().endswith(('.tex', '.txt')):
+            elif paper_source.lower().endswith(('.tex', '.txt', '.bib')):
                 def read_file():
                     with open(paper_source, 'r', encoding='utf-8') as f:
                         return f.read()
                 paper_text = await asyncio.to_thread(read_file)
+
+                # For .bib files, extract references directly using BibTeX parser
+                if paper_source.lower().endswith('.bib'):
+                    logger.info("Processing uploaded .bib file as BibTeX")
+                    refs_result = await self._extract_references_from_bibtex(paper_text)
+                    if refs_result and refs_result[0]:
+                        arxiv_source_references = refs_result[0]
+                        extraction_method = 'bib'
+                        logger.info(f"Extracted {len(arxiv_source_references)} references from .bib file")
             else:
                 raise ValueError(f"Unsupported file type: {paper_source}")
         elif source_type == "text":
@@ -494,6 +516,25 @@ class ProgressRefChecker:
                     arxiv_source_references = refs_result[0]
                     extraction_method = 'bbl'  # Mark as bbl extraction
                     logger.info(f"Extracted {len(arxiv_source_references)} references from pasted .bbl content")
+            # Check if the pasted text is BibTeX format (@article, @misc, @inproceedings, etc.)
+            elif re.search(r'@\s*(article|book|inproceedings|incollection|misc|techreport|phdthesis|mastersthesis|conference|inbook|proceedings)\s*\{', paper_text, re.IGNORECASE):
+                logger.info("Detected BibTeX format in pasted text")
+                refs_result = await self._extract_references_from_bibtex(paper_text)
+                if refs_result and refs_result[0]:
+                    arxiv_source_references = refs_result[0]
+                    extraction_method = 'bib'  # Mark as bib extraction
+                    logger.info(f"Extracted {len(arxiv_source_references)} references from pasted BibTeX content")
+            # Fallback: Try BibTeX parsing anyway for partial/malformed content
+            # This handles cases like incomplete paste, or BibTeX-like content without standard entry types
+            elif any(marker in paper_text for marker in ['title={', 'author={', 'year={', 'eprint={', '@']):
+                logger.info("Detected possible BibTeX-like content, attempting parse")
+                refs_result = await self._extract_references_from_bibtex(paper_text)
+                if refs_result and refs_result[0]:
+                    arxiv_source_references = refs_result[0]
+                    extraction_method = 'bib'
+                    logger.info(f"Extracted {len(arxiv_source_references)} references from partial BibTeX content")
+                else:
+                    logger.warning("BibTeX-like content detected but parsing failed, will try LLM extraction")
             # Don't update title for pasted text - keep the placeholder
         else:
             raise ValueError(f"Unsupported source type: {source_type}")
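Both detection stages can be exercised on their own; this sketch reuses the exact patterns from the patch (the sample pastes are invented):

import re

BIBTEX_ENTRY = re.compile(
    r'@\s*(article|book|inproceedings|incollection|misc|techreport|'
    r'phdthesis|mastersthesis|conference|inbook|proceedings)\s*\{',
    re.IGNORECASE)

# Stage 1: a standard entry type opens the paste.
pasted = "@InProceedings{smith2023,\n  title={An Example},\n  year={2023}\n}"
assert BIBTEX_ENTRY.search(pasted)

# Stage 2 fallback: bare field markers catch truncated or nonstandard pastes.
partial = "title={An Example},\n  author={J. Smith}"
assert any(m in partial for m in ['title={', 'author={', 'year={', 'eprint={', '@'])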
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/backend/thumbnail.py
RENAMED

@@ -416,6 +416,13 @@ def get_text_thumbnail(check_id: int, text_preview: str = "", text_file_path: st
            except Exception as e:
                logger.warning(f"Could not read text file: {e}")
 
+        # Clean up text content - remove excessive blank lines that cause rendering issues
+        if text_content:
+            # Normalize line endings and remove consecutive blank lines
+            lines = text_content.replace('\r\n', '\n').replace('\r', '\n').split('\n')
+            # Keep only non-empty lines
+            text_content = '\n'.join(line for line in lines if line.strip())
+
         # Create a document-like image with actual text content
         doc = fitz.open()
         page = doc.new_page(width=THUMBNAIL_WIDTH, height=int(THUMBNAIL_WIDTH * 1.4))
@@ -483,6 +490,116 @@ def get_text_thumbnail(check_id: int, text_preview: str = "", text_file_path: st
         return None
 
 
+def get_text_preview(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
+    """
+    Generate a high-resolution preview for pasted text showing actual content.
+
+    Creates a larger image (similar to PDF previews) with the text content.
+
+    Args:
+        check_id: Check ID for naming
+        text_preview: Optional first few lines of text to display
+        text_file_path: Optional path to the text file to read content from
+
+    Returns:
+        Path to the generated preview, or None if generation failed
+    """
+    try:
+        import fitz
+
+        output_path = get_preview_cache_path(f"text_{check_id}", check_id)
+
+        if output_path.exists():
+            return str(output_path)
+
+        # Try to read text content from file
+        text_content = text_preview
+        if text_file_path and os.path.exists(text_file_path):
+            try:
+                with open(text_file_path, 'r', encoding='utf-8') as f:
+                    text_content = f.read()
+            except Exception as e:
+                logger.warning(f"Could not read text file: {e}")
+
+        # Clean up text content - remove excessive blank lines that cause rendering issues
+        if text_content:
+            # Normalize line endings and remove consecutive blank lines
+            lines = text_content.replace('\r\n', '\n').replace('\r', '\n').split('\n')
+            # Keep only non-empty lines
+            text_content = '\n'.join(line for line in lines if line.strip())
+
+        # Create a document-like image with actual text content at high resolution
+        doc = fitz.open()
+        page = doc.new_page(width=PREVIEW_WIDTH, height=int(PREVIEW_WIDTH * 1.4))
+
+        # Fill with white/off-white background
+        page.draw_rect(page.rect, color=(0.9, 0.9, 0.9), fill=(0.98, 0.98, 0.98))
+
+        # Draw border
+        page.draw_rect(page.rect, color=(0.7, 0.7, 0.7), width=2)
+
+        # Draw actual text content if available
+        margin = 40
+        if text_content:
+            # Create a text box for the content
+            text_rect = fitz.Rect(margin, margin, PREVIEW_WIDTH - margin, int(PREVIEW_WIDTH * 1.4) - margin)
+
+            # Truncate to first ~4000 chars for preview
+            display_text = text_content[:4000]
+            if len(text_content) > 4000:
+                display_text += "\n\n..."
+
+            # Insert text with readable font size
+            page.insert_textbox(
+                text_rect,
+                display_text,
+                fontsize=14,
+                color=(0.15, 0.15, 0.15),
+                fontname="helv"
+            )
+        else:
+            # Fallback: Draw placeholder
+            header_rect = fitz.Rect(margin, margin, PREVIEW_WIDTH - margin, margin + 60)
+            page.insert_textbox(header_rect, "Pasted Text", fontsize=36, color=(0.3, 0.3, 0.5))
+
+            # Draw placeholder lines
+            line_height = 24
+            y = margin + 100
+
+            for i in range(20):
+                line_width = PREVIEW_WIDTH - 2 * margin
+                if i % 3 == 2:
+                    line_width = line_width * 0.7
+
+                page.draw_line(
+                    fitz.Point(margin, y),
+                    fitz.Point(margin + line_width, y),
+                    color=(0.7, 0.7, 0.7),
+                    width=3
+                )
+                y += line_height
+
+        # Render to pixmap and save
+        pix = page.get_pixmap(alpha=False)
+        pix.save(str(output_path))
+        doc.close()
+
+        logger.info(f"Generated text preview: {output_path}")
+        return str(output_path)
+
+    except ImportError:
+        logger.error("PyMuPDF (fitz) is not installed")
+        return None
+    except Exception as e:
+        logger.error(f"Error generating text preview: {e}")
+        return None
+
+
+async def get_text_preview_async(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
+    """Async wrapper for text preview generation."""
+    return await asyncio.to_thread(get_text_preview, check_id, text_preview, text_file_path)
+
+
 async def get_text_thumbnail_async(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
     """Async wrapper for text thumbnail generation."""
     return await asyncio.to_thread(get_text_thumbnail, check_id, text_preview, text_file_path)
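A usage sketch for the new helper, assuming the backend.thumbnail module layout shown above (check id and file path invented); because the PNG is cached, a second call returns the same path:

import asyncio
from backend.thumbnail import get_text_preview_async

async def demo():
    # Renders (or reuses) a high-resolution preview for a saved paste.
    path = await get_text_preview_async(42, "", "/tmp/refchecker_texts/pasted_42.txt")
    if path:
        print("preview written to", path)

asyncio.run(demo())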
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/arxiv_citation.py
RENAMED
@@ -8,8 +8,8 @@ for papers found on ArXiv, as it reflects the author-submitted metadata.
 
 Key features:
 - Fetches official BibTeX from https://arxiv.org/bibtex/{arxiv_id}
--
--
+- Checks reference against all historical versions when latest doesn't match
+- Annotates errors with version info when reference matches an older version
 - Parses BibTeX to extract normalized metadata matching refchecker schema
 
 Usage:
@@ -30,6 +30,7 @@ Usage:
 import re
 import logging
 import requests
+import html
 from typing import Dict, List, Tuple, Optional, Any
 
 import bibtexparser
@@ -88,6 +89,8 @@ class ArXivCitationChecker:
         # export.arxiv.org URLs
         r'export\.arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
         r'export\.arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+        # DOI format
+        r"(?:arxiv[:./])(\d{4}\.\d{4,5})(v\d+)?"
     ]
 
     def extract_arxiv_id(self, reference: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
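What the added pattern buys: DataCite-style arXiv DOIs carry the paper ID after an "arXiv." prefix. A small sketch with an invented DOI, assuming the pattern is applied case-insensitively like the checker's other URL patterns:

import re

ARXIV_IN_DOI = r"(?:arxiv[:./])(\d{4}\.\d{4,5})(v\d+)?"

m = re.search(ARXIV_IN_DOI, "10.48550/arXiv.2301.12345v2", re.IGNORECASE)
assert m and m.group(1) == "2301.12345" and m.group(2) == "v2"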
@@ -107,6 +110,8 @@
             reference.get('cited_url', ''),
             reference.get('raw_text', ''),
             reference.get('eprint', ''),  # BibTeX field
+            reference.get('journal', ''),
+            reference.get('doi', ''),  # DOI field (may contain arXiv ID)
         ]
 
         for source in sources:
@@ -324,35 +329,133 @@ class ArXivCitationChecker:
 
         return None
 
-    def
+    def is_arxiv_reference(self, reference: Dict[str, Any]) -> bool:
         """
-
+        Check if a reference is an ArXiv paper.
 
-
-
+        Args:
+            reference: Reference dictionary
+
+        Returns:
+            True if reference appears to be an ArXiv paper
+        """
+        arxiv_id, _ = self.extract_arxiv_id(reference)
+        return arxiv_id is not None
+
+    def _fetch_version_metadata_from_html(self, arxiv_id: str, version_num: int) -> Optional[Dict[str, Any]]:
+        """
+        Fetch and parse metadata for a specific version using HTML scraping.
 
         Args:
             arxiv_id: ArXiv ID without version
+            version_num: Version number to fetch (1, 2, 3, etc.)
 
         Returns:
-
+            Dictionary with version metadata or None if version doesn't exist
         """
-
-
-
-
-
+        version_str = f"v{version_num}"
+        url = f"{self.abs_url}/{arxiv_id}{version_str}"
+
+        self.rate_limiter.wait()
+        try:
+            logger.debug(f"Checking historical version: {url}")
+            response = requests.get(url, timeout=self.timeout)
+            if response.status_code == 404:
+                return None  # Version does not exist
+            response.raise_for_status()
+            html_content = response.text
+
+            # Parse meta tags for metadata
+            # Title
+            title_match = re.search(r'<meta name="citation_title" content="(.*?)"', html_content)
+            title = html.unescape(title_match.group(1)).strip() if title_match else ""
+
+            # Authors
+            authors = []
+            for auth in re.findall(r'<meta name="citation_author" content="(.*?)"', html_content):
+                authors.append(html.unescape(auth).strip())
+
+            # Date/Year
+            date_match = re.search(r'<meta name="citation_date" content="(.*?)"', html_content)
+            year = None
+            if date_match:
+                ym = re.search(r'^(\d{4})', date_match.group(1))
+                if ym:
+                    year = int(ym.group(1))
+
+            return {
+                'version': version_str,
+                'version_num': version_num,
+                'title': title,
+                'authors': [{'name': a} for a in authors],
+                'year': year,
+                'url': url,
+            }
+        except Exception as e:
+            logger.warning(f"Failed to fetch history {version_str}: {e}")
+            return None
+
+    def _get_latest_version_number(self, arxiv_id: str) -> Optional[int]:
         """
-
+        Get the latest version number by fetching the abstract page.
 
         Args:
-
+            arxiv_id: ArXiv ID without version
 
         Returns:
-
+            Latest version number as integer, or None if couldn't determine
         """
-
-
+        url = f"{self.abs_url}/{arxiv_id}"
+
+        self.rate_limiter.wait()
+        try:
+            response = requests.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            # Look for version links like "[v1]", "[v2]", etc.
+            versions = re.findall(r'\[v(\d+)\]', response.text)
+            if versions:
+                return max(int(v) for v in versions)
+            return None
+        except Exception as e:
+            logger.warning(f"Failed to get latest version for {arxiv_id}: {e}")
+            return None
+
+    def _compare_info_match(
+            self, cited_title: str, cited_authors: List[str], cited_year: Optional[int],
+            authoritative_title: str, authoritative_authors: List[str], authoritative_year: Optional[int]) -> bool:
+        """
+        Compare the information of a cited paper with the authoritative information.
+
+        Args:
+            cited_title: Title from the reference
+            cited_authors: Authors from the reference
+            cited_year: Year from the reference
+            authoritative_title: Title from ArXiv version
+            authoritative_authors: Authors from ArXiv version
+            authoritative_year: Year from ArXiv version
+
+        Returns:
+            True if the information matches, False otherwise.
+        """
+        # Compare title
+        if cited_title and authoritative_title:
+            title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
+            if title_similarity < SIMILARITY_THRESHOLD:
+                return False
+
+        # Compare authors
+        if cited_authors and authoritative_authors:
+            authors_match, _ = compare_authors(cited_authors, authoritative_authors)
+            if not authors_match:
+                return False
+
+        # Compare year
+        if cited_year and authoritative_year:
+            if cited_year != authoritative_year:
+                return False
+
+        return True
 
     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
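The two scrapers above lean on arXiv's citation_* meta tags and the "[vN]" version links. An offline sketch of the same parsing against a canned fragment (the HTML below is a made-up stand-in for an arXiv abstract page):

import html
import re

page = '''
<meta name="citation_title" content="A Study of Foo &amp; Bar"/>
<meta name="citation_author" content="Doe, Jane"/>
<meta name="citation_date" content="2023/05/01"/>
<a href="/abs/2301.12345v1">[v1]</a> <a href="/abs/2301.12345v2">[v2]</a>
'''

title = html.unescape(re.search(r'<meta name="citation_title" content="(.*?)"', page).group(1)).strip()
authors = [html.unescape(a).strip() for a in re.findall(r'<meta name="citation_author" content="(.*?)"', page)]
latest = max(int(v) for v in re.findall(r'\[v(\d+)\]', page))
assert title == "A Study of Foo & Bar" and authors == ["Doe, Jane"] and latest == 2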
@@ -360,10 +463,10 @@
 
         This method:
         1. Extracts the ArXiv ID from the reference
-        2. Fetches the official BibTeX from ArXiv (
-        3.
-        4.
-        5.
+        2. Fetches the official BibTeX from ArXiv (latest version)
+        3. Compares cited metadata against latest version
+        4. If errors found, checks historical versions to find a match
+        5. Annotates errors with version info if reference matches an older version
 
         Args:
             reference: Reference dictionary with title, authors, year, url, etc.
@@ -385,34 +488,26 @@
 
         logger.debug(f"ArXivCitationChecker: Verifying ArXiv paper {arxiv_id}")
 
-        #
+        # Extract information from reference for comparison
+        cited_title = reference.get('title', '').strip()
+        cited_authors = reference.get('authors', [])
+        cited_year = reference.get('year')
+
+        # Fetch authoritative BibTeX (latest version)
         bibtex_content = self.fetch_bibtex(arxiv_id)
 
         if not bibtex_content:
             logger.debug(f"ArXivCitationChecker: Could not fetch BibTeX for {arxiv_id}")
             return None, [{"error_type": "api_failure", "error_details": f"Could not fetch ArXiv BibTeX for {arxiv_id}"}], None
 
-
-        verified_data = self.parse_bibtex(bibtex_content)
+        latest_data = self.parse_bibtex(bibtex_content)
 
-        if not
+        if not latest_data:
             logger.debug(f"ArXivCitationChecker: Could not parse BibTeX for {arxiv_id}")
             return None, [], None
-
-        #
-
-        # ArXiv BibTeX always returns latest version metadata
-        # We don't know the actual latest version number without additional API call,
-        # but we can warn that a specific version was cited
-        errors.append({
-            'warning_type': 'version',
-            'warning_details': f"Reference cites ArXiv version {cited_version}, verified against latest version metadata",
-        })
-        logger.debug(f"ArXivCitationChecker: Cited version {cited_version} for {arxiv_id}")
-
-        # Compare title
-        cited_title = reference.get('title', '').strip()
-        authoritative_title = verified_data.get('title', '').strip()
+
+        # Compare against latest version
+        authoritative_title = latest_data.get('title', '').strip()
 
         if cited_title and authoritative_title:
             title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
@@ -426,9 +521,8 @@
             })
 
         # Compare authors
-        cited_authors = reference.get('authors', [])
         if cited_authors:
-            authoritative_authors =
+            authoritative_authors = latest_data.get('authors', [])
             authors_match, author_error = compare_authors(cited_authors, authoritative_authors)
 
             if not authors_match:
@@ -440,9 +534,7 @@
             })
 
         # Compare year
-
-        authoritative_year = verified_data.get('year')
-
+        authoritative_year = latest_data.get('year')
         year_warning = validate_year(
             cited_year=cited_year,
             paper_year=authoritative_year,
@@ -451,10 +543,50 @@
         )
         if year_warning:
             errors.append(year_warning)
-
-        # Build URL
+
         paper_url = f"https://arxiv.org/abs/{arxiv_id}"
 
-
+        # If no errors against latest version, we're done
+        if len(errors) == 0:
+            logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with no errors")
+            return latest_data, errors, paper_url
+
+        # Check if reference matches a historical version
+        # Get latest version number first
+        latest_version_num = self._get_latest_version_number(arxiv_id)
+
+        if latest_version_num and latest_version_num > 1:
+            # Check historical versions (1 to latest-1)
+            for version_num in range(1, latest_version_num):
+                version_data = self._fetch_version_metadata_from_html(arxiv_id, version_num)
+                if not version_data:
+                    continue
+
+                # Check if reference matches this historical version
+                if self._compare_info_match(
+                        cited_title, cited_authors, cited_year,
+                        version_data['title'], version_data['authors'], version_data['year']):
+
+                    logger.debug(f"ArXivCitationChecker: Reference matches historical version v{version_num}")
+
+                    # Convert errors to warnings with version update info
+                    # Version update issues are informational, not errors - the citation was correct for its time
+                    version_suffix = f" (v{version_num} vs v{latest_version_num} update)"
+                    warnings = []
+                    for error in errors:
+                        warning = {
+                            'warning_type': error.get('error_type', 'unknown') + version_suffix,
+                            'warning_details': error.get('error_details', ''),
+                        }
+                        # Preserve correction hints
+                        for key in ['ref_title_correct', 'ref_authors_correct', 'ref_year_correct']:
+                            if key in error:
+                                warning[key] = error[key]
+                        warnings.append(warning)
+
+                    # Return with warnings instead of errors - URL points to the matched version
+                    matched_url = f"https://arxiv.org/abs/{arxiv_id}v{version_num}"
+                    return latest_data, warnings, matched_url
 
-
+        logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")
+        return latest_data, errors, paper_url
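The error-to-warning rewrite is easy to trace by hand; this sketch applies the same loop to a single invented error dict:

error = {'error_type': 'author',
         'error_details': 'author list differs',
         'ref_authors_correct': 'J. Doe, R. Roe'}

version_suffix = " (v1 vs v3 update)"
warning = {'warning_type': error.get('error_type', 'unknown') + version_suffix,
           'warning_details': error.get('error_details', '')}
for key in ['ref_title_correct', 'ref_authors_correct', 'ref_year_correct']:
    if key in error:
        warning[key] = error[key]  # correction hints survive the rewrite

assert warning['warning_type'] == 'author (v1 vs v3 update)'
assert warning['ref_authors_correct'] == 'J. Doe, R. Roe'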
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/enhanced_hybrid_checker.py
RENAMED

@@ -257,6 +257,90 @@ class EnhancedHybridReferenceChecker:
 
         return True
 
+    def _merge_arxiv_with_semantic_scholar(
+        self,
+        arxiv_data: Dict[str, Any],
+        arxiv_errors: List[Dict[str, Any]],
+        arxiv_url: str,
+        ss_data: Dict[str, Any],
+        ss_errors: List[Dict[str, Any]],
+        ss_url: str,
+        reference: Dict[str, Any]
+    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
+        """
+        Merge ArXiv verification results with Semantic Scholar data.
+
+        ArXiv is authoritative for title/author/year, but Semantic Scholar
+        provides venue information and additional URLs (DOI, S2 page).
+
+        Args:
+            arxiv_data: Verified data from ArXiv
+            arxiv_errors: Errors/warnings from ArXiv verification
+            arxiv_url: ArXiv URL
+            ss_data: Data from Semantic Scholar
+            ss_errors: Errors from Semantic Scholar (used for venue checking)
+            ss_url: Semantic Scholar URL
+            reference: Original reference
+
+        Returns:
+            Tuple of (merged_data, merged_errors)
+        """
+        merged_data = dict(arxiv_data) if arxiv_data else {}
+        merged_errors = list(arxiv_errors) if arxiv_errors else []
+
+        if not ss_data:
+            return merged_data, merged_errors
+
+        # Add Semantic Scholar URL to external IDs
+        if 'externalIds' not in merged_data:
+            merged_data['externalIds'] = {}
+
+        ss_external_ids = ss_data.get('externalIds', {})
+
+        # Add S2 paper ID
+        if ss_data.get('paperId'):
+            merged_data['externalIds']['S2PaperId'] = ss_data['paperId']
+
+        # Add DOI if available from Semantic Scholar
+        if ss_external_ids.get('DOI') and not merged_data['externalIds'].get('DOI'):
+            merged_data['externalIds']['DOI'] = ss_external_ids['DOI']
+
+        # Store Semantic Scholar URL
+        merged_data['_semantic_scholar_url'] = ss_url
+
+        # Check for venue mismatch - if paper was published at a venue but citation only says arXiv
+        ss_venue = ss_data.get('venue', '')
+        cited_venue = reference.get('venue', reference.get('journal', '')).strip().lower()
+
+        # Normalize ArXiv venue names
+        is_cited_as_arxiv = (
+            not cited_venue or
+            cited_venue in ['arxiv', 'arxiv preprint', 'arxiv.org', 'preprint']
+        )
+
+        # Check if Semantic Scholar shows a real publication venue
+        if ss_venue and is_cited_as_arxiv:
+            # Ignore generic/empty venues
+            ss_venue_lower = ss_venue.lower().strip()
+            is_real_venue = (
+                ss_venue_lower and
+                ss_venue_lower not in ['arxiv', 'arxiv.org', 'preprint', ''] and
+                not ss_venue_lower.startswith('arxiv')
+            )
+
+            if is_real_venue:
+                # This paper was published at a venue but is only cited as arXiv
+                logger.debug(f"Enhanced Hybrid: Paper published at '{ss_venue}' but cited as arXiv")
+                merged_errors.append({
+                    'warning_type': 'venue',
+                    'warning_details': f"Paper was published at venue but cited as arXiv preprint:\n  cited: arXiv\n  actual: {ss_venue}",
+                    'ref_venue_correct': ss_venue
+                })
+                # Also add the venue to merged data
+                merged_data['venue'] = ss_venue
+
+        return merged_data, merged_errors
+
     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
         Verify a non-arXiv reference using multiple APIs in priority order
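A standalone recap of just the venue check (inputs invented): a paper cited only as an arXiv preprint while Semantic Scholar reports a real venue yields a venue warning, while an arXiv-only venue does not:

def venue_warning(cited_venue, ss_venue):
    cited = (cited_venue or '').strip().lower()
    cited_as_arxiv = not cited or cited in ['arxiv', 'arxiv preprint', 'arxiv.org', 'preprint']
    v = (ss_venue or '').strip().lower()
    is_real = bool(v) and v not in ['arxiv', 'arxiv.org', 'preprint'] and not v.startswith('arxiv')
    if cited_as_arxiv and is_real:
        return {'warning_type': 'venue', 'ref_venue_correct': ss_venue}
    return None

assert venue_warning('arXiv preprint', 'NeurIPS') == {'warning_type': 'venue', 'ref_venue_correct': 'NeurIPS'}
assert venue_warning('arXiv preprint', 'ArXiv') is None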
@@ -287,6 +371,9 @@ class EnhancedHybridReferenceChecker:
         # Track all APIs that failed and could be retried
         failed_apis = []
 
+        # Store ArXiv result for potential merging with Semantic Scholar
+        arxiv_result = None
+
         # PHASE 1: Try all APIs once in priority order
 
         # Strategy 0: For ArXiv papers, try ArXiv Citation checker first (authoritative source)
@@ -295,13 +382,15 @@ class EnhancedHybridReferenceChecker:
             logger.debug("Enhanced Hybrid: Reference appears to be ArXiv paper, trying ArXiv Citation checker first")
             verified_data, errors, url, success, failure_type = self._try_api('arxiv_citation', self.arxiv_citation, reference)
             if success:
-                logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded
-
+                logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded, also querying Semantic Scholar for venue/URLs")
+                arxiv_result = (verified_data, errors, url)
+                # Continue to Semantic Scholar to get venue and additional URLs
             if failure_type in ['throttled', 'timeout', 'server_error']:
                 failed_apis.append(('arxiv_citation', self.arxiv_citation, failure_type))
 
         # Strategy 1: Always try local database first (fastest)
-        if
+        # Skip if we already have ArXiv result - we'll go straight to Semantic Scholar for venue info
+        if self.local_db and not arxiv_result:
             verified_data, errors, url, success, failure_type = self._try_api('local_db', self.local_db, reference)
             if success:
                 return verified_data, errors, url
@@ -309,8 +398,9 @@ class EnhancedHybridReferenceChecker:
                 failed_apis.append(('local_db', self.local_db, failure_type))
 
         # Strategy 2: If reference has DOI, prioritize CrossRef
+        # Skip if we already have ArXiv result - we'll go straight to Semantic Scholar for venue info
         crossref_result = None
-        if self._should_try_doi_apis_first(reference) and self.crossref:
+        if self._should_try_doi_apis_first(reference) and self.crossref and not arxiv_result:
             verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
             if success:
                 # Check if the data is complete enough to use
@@ -327,11 +417,34 @@ class EnhancedHybridReferenceChecker:
         if self.semantic_scholar:
             verified_data, errors, url, success, failure_type = self._try_api('semantic_scholar', self.semantic_scholar, reference)
             if success:
+                # If we have ArXiv result, merge Semantic Scholar venue/URLs into it
+                if arxiv_result:
+                    # Check if SS data is valid and venue is not just arxiv
+                    # (skip merge if SS only found the arxiv version, no published venue)
+                    if verified_data:
+                        ss_venue = self.semantic_scholar.get_venue_from_paper_data(verified_data)
+                        if ss_venue and 'arxiv' in ss_venue.lower():
+                            # SS only found arxiv venue, skip merge and return arxiv result
+                            logger.debug("Enhanced Hybrid: Semantic Scholar only found ArXiv venue, skipping merge")
+                            return arxiv_result
+
+                    arxiv_data, arxiv_errors, arxiv_url = arxiv_result
+                    merged_data, merged_errors = self._merge_arxiv_with_semantic_scholar(
+                        arxiv_data, arxiv_errors, arxiv_url,
+                        verified_data, errors, url,
+                        reference
+                    )
+                    return merged_data, merged_errors, arxiv_url
                 return verified_data, errors, url
             # For Semantic Scholar, only retry retryable failures (not 'not_found')
             if failure_type in ['throttled', 'timeout', 'server_error']:
                 failed_apis.append(('semantic_scholar', self.semantic_scholar, failure_type))
 
+        # If ArXiv succeeded but Semantic Scholar failed, return ArXiv result
+        if arxiv_result:
+            logger.debug("Enhanced Hybrid: Returning ArXiv result (Semantic Scholar unavailable)")
+            return arxiv_result
+
         # Strategy 4: Try OpenAlex API (excellent reliability, replaces Google Scholar)
         openalex_result = None
         if self.openalex:
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/checkers/semantic_scholar.py
RENAMED

@@ -223,7 +223,49 @@ class NonArxivReferenceChecker:
         """
         return compare_authors(cited_authors, correct_authors)
 
-
+    def get_venue_from_paper_data(self, paper_data: Dict[str, Any]) -> Optional[str]:
+        """
+        Extract venue from paper data dictionary.
+
+        Checks multiple fields since Semantic Scholar returns venue info
+        in different fields depending on publication type.
+
+        Args:
+            paper_data: Paper data dictionary from Semantic Scholar
+
+        Returns:
+            Venue string or None if not found
+        """
+        if not paper_data:
+            return None
+
+        paper_venue = None
+
+        # First try the simple 'venue' field (string)
+        if paper_data.get('venue'):
+            paper_venue = paper_data.get('venue')
+
+        # If no venue, try publicationVenue object
+        if not paper_venue and paper_data.get('publicationVenue'):
+            pub_venue = paper_data.get('publicationVenue')
+            if isinstance(pub_venue, dict):
+                paper_venue = pub_venue.get('name', '')
+            elif isinstance(pub_venue, str):
+                paper_venue = pub_venue
+
+        # If still no venue, try journal object
+        if not paper_venue and paper_data.get('journal'):
+            journal = paper_data.get('journal')
+            if isinstance(journal, dict):
+                paper_venue = journal.get('name', '')
+            elif isinstance(journal, str):
+                paper_venue = journal
+
+        # Ensure paper_venue is a string
+        if paper_venue and not isinstance(paper_venue, str):
+            paper_venue = str(paper_venue)
+
+        return paper_venue if paper_venue else None
 
     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
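The helper exists because Semantic Scholar spreads venue information across three response fields. A condensed sketch of the same lookup order against invented sample payloads:

def first_venue(d):
    # venue (string), then publicationVenue.name, then journal.name
    v = d.get('venue')
    if not v and isinstance(d.get('publicationVenue'), dict):
        v = d['publicationVenue'].get('name', '')
    if not v and isinstance(d.get('journal'), dict):
        v = d['journal'].get('name', '')
    return v or None

assert first_venue({'venue': 'ICML'}) == 'ICML'
assert first_venue({'publicationVenue': {'name': 'ICML'}}) == 'ICML'
assert first_venue({'journal': {'name': 'JMLR'}}) == 'JMLR'
assert first_venue({}) is None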
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/author_utils.py
RENAMED
@@ -42,13 +42,26 @@ def compare_authors(cited_authors, correct_authors, threshold=0.8):
     Compare two author lists and return similarity metrics
 
     Args:
-        cited_authors: List of authors as cited
-        correct_authors: List of correct authors
+        cited_authors: List of authors as cited (can be strings or dicts with 'name' key)
+        correct_authors: List of correct authors (can be strings or dicts with 'name' key)
         threshold: Similarity threshold (0-1)
 
     Returns:
         Dictionary with comparison results
     """
+    # Normalize author lists to strings (handle dict format from APIs)
+    def normalize_author_list(authors):
+        result = []
+        for a in authors:
+            if isinstance(a, dict):
+                result.append(a.get('name', str(a)))
+            else:
+                result.append(str(a))
+        return result
+
+    cited_authors = normalize_author_list(cited_authors) if cited_authors else []
+    correct_authors = normalize_author_list(correct_authors) if correct_authors else []
+
     if not cited_authors or not correct_authors:
         return {
             'match': False,
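The practical effect of the normalization: author lists may now mix plain strings with the [{'name': ...}] dicts that API results (for example the HTML-scraped version metadata above) supply. A small sketch with invented names:

def normalize(authors):
    # Mirror of the patch's inner helper: dicts contribute their 'name'.
    return [a.get('name', str(a)) if isinstance(a, dict) else str(a) for a in authors]

cited = ['Jane Doe', 'Richard Roe']
correct = [{'name': 'Jane Doe'}, {'name': 'Richard Roe'}]
assert normalize(correct) == cited  # both sides end up as plain strings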
{academic_refchecker-2.0.13 → academic_refchecker-2.0.14}/src/refchecker/utils/bibliography_utils.py
RENAMED
@@ -164,8 +164,8 @@ def _parse_bibtex_references(bibliography_text):
     Returns:
         List of reference dictionaries
     """
-    from refchecker.utils.bibtex_parser import
-    return
+    from refchecker.utils.bibtex_parser import parse_bibtex_references
+    return parse_bibtex_references(bibliography_text)
 
 
 def _parse_biblatex_references(bibliography_text):