academic-refchecker 2.0.12__py3-none-any.whl → 2.0.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/METADATA +1 -1
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/RECORD +17 -17
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/WHEEL +1 -1
- backend/main.py +33 -5
- backend/refchecker_wrapper.py +42 -1
- backend/thumbnail.py +117 -0
- refchecker/__version__.py +1 -1
- refchecker/checkers/arxiv_citation.py +181 -49
- refchecker/checkers/enhanced_hybrid_checker.py +117 -4
- refchecker/checkers/semantic_scholar.py +43 -1
- refchecker/llm/base.py +1 -15
- refchecker/llm/providers.py +102 -113
- refchecker/utils/author_utils.py +15 -2
- refchecker/utils/bibliography_utils.py +2 -2
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/top_level.txt +0 -0
refchecker/checkers/arxiv_citation.py
CHANGED

@@ -8,8 +8,8 @@ for papers found on ArXiv, as it reflects the author-submitted metadata.

 Key features:
 - Fetches official BibTeX from https://arxiv.org/bibtex/{arxiv_id}
-
-
+- Checks reference against all historical versions when latest doesn't match
+- Annotates errors with version info when reference matches an older version
 - Parses BibTeX to extract normalized metadata matching refchecker schema

 Usage:
@@ -30,6 +30,7 @@ Usage:
 import re
 import logging
 import requests
+import html
 from typing import Dict, List, Tuple, Optional, Any

 import bibtexparser
@@ -88,6 +89,8 @@ class ArXivCitationChecker:
         # export.arxiv.org URLs
         r'export\.arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
         r'export\.arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+        # DOI format
+        r"(?:arxiv[:./])(\d{4}\.\d{4,5})(v\d+)?"
     ]

     def extract_arxiv_id(self, reference: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
@@ -107,6 +110,8 @@ class ArXivCitationChecker:
             reference.get('cited_url', ''),
             reference.get('raw_text', ''),
             reference.get('eprint', ''),  # BibTeX field
+            reference.get('journal', ''),
+            reference.get('doi', ''),  # DOI field (may contain arXiv ID)
         ]

         for source in sources:
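Taken together, the two additions above let the checker recover an arXiv ID from a reference whose only identifier is a DataCite-style DOI such as 10.48550/arXiv.NNNN.NNNNN. A minimal standalone sketch of that extraction, reusing the pattern added in this release; the helper name, the IGNORECASE flag, and the sample DOIs are illustrative assumptions rather than code from the package:

import re

# Pattern copied from the hunk above; IGNORECASE is assumed here so that
# "arXiv." inside a DOI matches (the package may normalize case differently).
ARXIV_IN_DOI = re.compile(r"(?:arxiv[:./])(\d{4}\.\d{4,5})(v\d+)?", re.IGNORECASE)

def extract_arxiv_id_from_doi(doi):
    """Return (arxiv_id, version) if the DOI embeds an arXiv identifier."""
    match = ARXIV_IN_DOI.search(doi)
    if not match:
        return None, None
    return match.group(1), match.group(2)

print(extract_arxiv_id_from_doi("10.48550/arXiv.2301.12345"))  # ('2301.12345', None)
print(extract_arxiv_id_from_doi("10.1145/3292500.3330701"))    # (None, None)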
@@ -324,35 +329,133 @@ class ArXivCitationChecker:

         return None

-    def
+    def is_arxiv_reference(self, reference: Dict[str, Any]) -> bool:
         """
-
+        Check if a reference is an ArXiv paper.

-
-
+        Args:
+            reference: Reference dictionary
+
+        Returns:
+            True if reference appears to be an ArXiv paper
+        """
+        arxiv_id, _ = self.extract_arxiv_id(reference)
+        return arxiv_id is not None
+
+    def _fetch_version_metadata_from_html(self, arxiv_id: str, version_num: int) -> Optional[Dict[str, Any]]:
+        """
+        Fetch and parse metadata for a specific version using HTML scraping.

         Args:
             arxiv_id: ArXiv ID without version
+            version_num: Version number to fetch (1, 2, 3, etc.)

         Returns:
-
+            Dictionary with version metadata or None if version doesn't exist
         """
-
-
-
-
-
+        version_str = f"v{version_num}"
+        url = f"{self.abs_url}/{arxiv_id}{version_str}"
+
+        self.rate_limiter.wait()
+        try:
+            logger.debug(f"Checking historical version: {url}")
+            response = requests.get(url, timeout=self.timeout)
+            if response.status_code == 404:
+                return None  # Version does not exist
+            response.raise_for_status()
+            html_content = response.text
+
+            # Parse meta tags for metadata
+            # Title
+            title_match = re.search(r'<meta name="citation_title" content="(.*?)"', html_content)
+            title = html.unescape(title_match.group(1)).strip() if title_match else ""
+
+            # Authors
+            authors = []
+            for auth in re.findall(r'<meta name="citation_author" content="(.*?)"', html_content):
+                authors.append(html.unescape(auth).strip())
+
+            # Date/Year
+            date_match = re.search(r'<meta name="citation_date" content="(.*?)"', html_content)
+            year = None
+            if date_match:
+                ym = re.search(r'^(\d{4})', date_match.group(1))
+                if ym:
+                    year = int(ym.group(1))
+
+            return {
+                'version': version_str,
+                'version_num': version_num,
+                'title': title,
+                'authors': [{'name': a} for a in authors],
+                'year': year,
+                'url': url,
+            }
+        except Exception as e:
+            logger.warning(f"Failed to fetch history {version_str}: {e}")
+            return None
+
+    def _get_latest_version_number(self, arxiv_id: str) -> Optional[int]:
         """
-
+        Get the latest version number by fetching the abstract page.

         Args:
-
+            arxiv_id: ArXiv ID without version

         Returns:
-
+            Latest version number as integer, or None if couldn't determine
         """
-
-
+        url = f"{self.abs_url}/{arxiv_id}"
+
+        self.rate_limiter.wait()
+        try:
+            response = requests.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            # Look for version links like "[v1]", "[v2]", etc.
+            versions = re.findall(r'\[v(\d+)\]', response.text)
+            if versions:
+                return max(int(v) for v in versions)
+            return None
+        except Exception as e:
+            logger.warning(f"Failed to get latest version for {arxiv_id}: {e}")
+            return None
+
+    def _compare_info_match(
+        self, cited_title: str, cited_authors: List[str], cited_year: Optional[int],
+        authoritative_title: str, authoritative_authors: List[str], authoritative_year: Optional[int]) -> bool:
+        """
+        Compare the information of a cited paper with the authoritative information.
+
+        Args:
+            cited_title: Title from the reference
+            cited_authors: Authors from the reference
+            cited_year: Year from the reference
+            authoritative_title: Title from ArXiv version
+            authoritative_authors: Authors from ArXiv version
+            authoritative_year: Year from ArXiv version
+
+        Returns:
+            True if the information matches, False otherwise.
+        """
+        # Compare title
+        if cited_title and authoritative_title:
+            title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
+            if title_similarity < SIMILARITY_THRESHOLD:
+                return False
+
+        # Compare authors
+        if cited_authors and authoritative_authors:
+            authors_match, _ = compare_authors(cited_authors, authoritative_authors)
+            if not authors_match:
+                return False
+
+        # Compare year
+        if cited_year and authoritative_year:
+            if cited_year != authoritative_year:
+                return False
+
+        return True

     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
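The new _fetch_version_metadata_from_html helper recovers a version's title, authors, and year from the citation_* meta tags on the abs page. A self-contained sketch of just that parsing step, run against an invented HTML fragment (the markup and values below are made up, not real arXiv output):

import html
import re

sample = '''
<meta name="citation_title" content="An Example Paper: Methods &amp; Results"/>
<meta name="citation_author" content="Doe, Jane"/>
<meta name="citation_author" content="Smith, John"/>
<meta name="citation_date" content="2023/05/12"/>
'''

# Same meta-tag patterns as the helper above, applied to the sample string
title_match = re.search(r'<meta name="citation_title" content="(.*?)"', sample)
title = html.unescape(title_match.group(1)).strip() if title_match else ""

authors = [html.unescape(a).strip()
           for a in re.findall(r'<meta name="citation_author" content="(.*?)"', sample)]

date_match = re.search(r'<meta name="citation_date" content="(.*?)"', sample)
year = int(re.match(r'(\d{4})', date_match.group(1)).group(1)) if date_match else None

print(title)    # An Example Paper: Methods & Results
print(authors)  # ['Doe, Jane', 'Smith, John']
print(year)     # 2023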
@@ -360,10 +463,10 @@ class ArXivCitationChecker:

         This method:
         1. Extracts the ArXiv ID from the reference
-        2. Fetches the official BibTeX from ArXiv (
-        3.
-        4.
-        5.
+        2. Fetches the official BibTeX from ArXiv (latest version)
+        3. Compares cited metadata against latest version
+        4. If errors found, checks historical versions to find a match
+        5. Annotates errors with version info if reference matches an older version

         Args:
             reference: Reference dictionary with title, authors, year, url, etc.
@@ -385,34 +488,26 @@ class ArXivCitationChecker:

         logger.debug(f"ArXivCitationChecker: Verifying ArXiv paper {arxiv_id}")

-        #
+        # Extract information from reference for comparison
+        cited_title = reference.get('title', '').strip()
+        cited_authors = reference.get('authors', [])
+        cited_year = reference.get('year')
+
+        # Fetch authoritative BibTeX (latest version)
         bibtex_content = self.fetch_bibtex(arxiv_id)

         if not bibtex_content:
             logger.debug(f"ArXivCitationChecker: Could not fetch BibTeX for {arxiv_id}")
             return None, [{"error_type": "api_failure", "error_details": f"Could not fetch ArXiv BibTeX for {arxiv_id}"}], None

-
-        verified_data = self.parse_bibtex(bibtex_content)
+        latest_data = self.parse_bibtex(bibtex_content)

-        if not
+        if not latest_data:
             logger.debug(f"ArXivCitationChecker: Could not parse BibTeX for {arxiv_id}")
             return None, [], None
-
-        #
-
-        # ArXiv BibTeX always returns latest version metadata
-        # We don't know the actual latest version number without additional API call,
-        # but we can warn that a specific version was cited
-        errors.append({
-            'warning_type': 'version',
-            'warning_details': f"Reference cites ArXiv version {cited_version}, verified against latest version metadata",
-        })
-        logger.debug(f"ArXivCitationChecker: Cited version {cited_version} for {arxiv_id}")
-
-        # Compare title
-        cited_title = reference.get('title', '').strip()
-        authoritative_title = verified_data.get('title', '').strip()
+
+        # Compare against latest version
+        authoritative_title = latest_data.get('title', '').strip()

         if cited_title and authoritative_title:
             title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
@@ -426,9 +521,8 @@ class ArXivCitationChecker:
                 })

         # Compare authors
-        cited_authors = reference.get('authors', [])
         if cited_authors:
-            authoritative_authors =
+            authoritative_authors = latest_data.get('authors', [])
             authors_match, author_error = compare_authors(cited_authors, authoritative_authors)

             if not authors_match:
@@ -440,9 +534,7 @@ class ArXivCitationChecker:
                 })

         # Compare year
-
-        authoritative_year = verified_data.get('year')
-
+        authoritative_year = latest_data.get('year')
         year_warning = validate_year(
             cited_year=cited_year,
             paper_year=authoritative_year,
@@ -451,10 +543,50 @@ class ArXivCitationChecker:
         )
         if year_warning:
             errors.append(year_warning)
-
-        # Build URL
+
         paper_url = f"https://arxiv.org/abs/{arxiv_id}"

-
+        # If no errors against latest version, we're done
+        if len(errors) == 0:
+            logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with no errors")
+            return latest_data, errors, paper_url
+
+        # Check if reference matches a historical version
+        # Get latest version number first
+        latest_version_num = self._get_latest_version_number(arxiv_id)
+
+        if latest_version_num and latest_version_num > 1:
+            # Check historical versions (1 to latest-1)
+            for version_num in range(1, latest_version_num):
+                version_data = self._fetch_version_metadata_from_html(arxiv_id, version_num)
+                if not version_data:
+                    continue
+
+                # Check if reference matches this historical version
+                if self._compare_info_match(
+                        cited_title, cited_authors, cited_year,
+                        version_data['title'], version_data['authors'], version_data['year']):
+
+                    logger.debug(f"ArXivCitationChecker: Reference matches historical version v{version_num}")
+
+                    # Convert errors to warnings with version update info
+                    # Version update issues are informational, not errors - the citation was correct for its time
+                    version_suffix = f" (v{version_num} vs v{latest_version_num} update)"
+                    warnings = []
+                    for error in errors:
+                        warning = {
+                            'warning_type': error.get('error_type', 'unknown') + version_suffix,
+                            'warning_details': error.get('error_details', ''),
+                        }
+                        # Preserve correction hints
+                        for key in ['ref_title_correct', 'ref_authors_correct', 'ref_year_correct']:
+                            if key in error:
+                                warning[key] = error[key]
+                        warnings.append(warning)
+
+                    # Return with warnings instead of errors - URL points to the matched version
+                    matched_url = f"https://arxiv.org/abs/{arxiv_id}v{version_num}"
+                    return latest_data, warnings, matched_url

-
+        logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")
+        return latest_data, errors, paper_url
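When a reference lines up with an older arXiv version, verify_reference now downgrades the accumulated errors into warnings tagged with the version gap while keeping the correction hints. A toy walk-through of that conversion (the error dicts and version numbers below are invented for demonstration):

# Pretend these errors were raised against the latest version
errors = [
    {"error_type": "title", "error_details": "title mismatch", "ref_title_correct": "Old Title"},
    {"error_type": "year", "error_details": "cited 2022, latest says 2023"},
]
version_num, latest_version_num = 1, 3

version_suffix = f" (v{version_num} vs v{latest_version_num} update)"
warnings = []
for error in errors:
    warning = {
        "warning_type": error.get("error_type", "unknown") + version_suffix,
        "warning_details": error.get("error_details", ""),
    }
    # Correction hints survive the conversion so a UI can still offer fixes
    for key in ("ref_title_correct", "ref_authors_correct", "ref_year_correct"):
        if key in error:
            warning[key] = error[key]
    warnings.append(warning)

print(warnings[0]["warning_type"])  # title (v1 vs v3 update)
print(warnings[0]["ref_title_correct"])  # Old Title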
refchecker/checkers/enhanced_hybrid_checker.py
CHANGED

@@ -257,6 +257,90 @@ class EnhancedHybridReferenceChecker:

         return True

+    def _merge_arxiv_with_semantic_scholar(
+        self,
+        arxiv_data: Dict[str, Any],
+        arxiv_errors: List[Dict[str, Any]],
+        arxiv_url: str,
+        ss_data: Dict[str, Any],
+        ss_errors: List[Dict[str, Any]],
+        ss_url: str,
+        reference: Dict[str, Any]
+    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
+        """
+        Merge ArXiv verification results with Semantic Scholar data.
+
+        ArXiv is authoritative for title/author/year, but Semantic Scholar
+        provides venue information and additional URLs (DOI, S2 page).
+
+        Args:
+            arxiv_data: Verified data from ArXiv
+            arxiv_errors: Errors/warnings from ArXiv verification
+            arxiv_url: ArXiv URL
+            ss_data: Data from Semantic Scholar
+            ss_errors: Errors from Semantic Scholar (used for venue checking)
+            ss_url: Semantic Scholar URL
+            reference: Original reference
+
+        Returns:
+            Tuple of (merged_data, merged_errors)
+        """
+        merged_data = dict(arxiv_data) if arxiv_data else {}
+        merged_errors = list(arxiv_errors) if arxiv_errors else []
+
+        if not ss_data:
+            return merged_data, merged_errors
+
+        # Add Semantic Scholar URL to external IDs
+        if 'externalIds' not in merged_data:
+            merged_data['externalIds'] = {}
+
+        ss_external_ids = ss_data.get('externalIds', {})
+
+        # Add S2 paper ID
+        if ss_data.get('paperId'):
+            merged_data['externalIds']['S2PaperId'] = ss_data['paperId']
+
+        # Add DOI if available from Semantic Scholar
+        if ss_external_ids.get('DOI') and not merged_data['externalIds'].get('DOI'):
+            merged_data['externalIds']['DOI'] = ss_external_ids['DOI']
+
+        # Store Semantic Scholar URL
+        merged_data['_semantic_scholar_url'] = ss_url
+
+        # Check for venue mismatch - if paper was published at a venue but citation only says arXiv
+        ss_venue = ss_data.get('venue', '')
+        cited_venue = reference.get('venue', reference.get('journal', '')).strip().lower()
+
+        # Normalize ArXiv venue names
+        is_cited_as_arxiv = (
+            not cited_venue or
+            cited_venue in ['arxiv', 'arxiv preprint', 'arxiv.org', 'preprint']
+        )
+
+        # Check if Semantic Scholar shows a real publication venue
+        if ss_venue and is_cited_as_arxiv:
+            # Ignore generic/empty venues
+            ss_venue_lower = ss_venue.lower().strip()
+            is_real_venue = (
+                ss_venue_lower and
+                ss_venue_lower not in ['arxiv', 'arxiv.org', 'preprint', ''] and
+                not ss_venue_lower.startswith('arxiv')
+            )
+
+            if is_real_venue:
+                # This paper was published at a venue but is only cited as arXiv
+                logger.debug(f"Enhanced Hybrid: Paper published at '{ss_venue}' but cited as arXiv")
+                merged_errors.append({
+                    'warning_type': 'venue',
+                    'warning_details': f"Paper was published at venue but cited as arXiv preprint:\n  cited: arXiv\n  actual: {ss_venue}",
+                    'ref_venue_correct': ss_venue
+                })
+                # Also add the venue to merged data
+                merged_data['venue'] = ss_venue
+
+        return merged_data, merged_errors
+
     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
         Verify a non-arXiv reference using multiple APIs in priority order
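The effect of the merge is easiest to see on toy inputs: ArXiv stays the source of record for title/author/year, while the Semantic Scholar hit contributes its paper ID, a DOI, and a published venue that becomes a venue warning when the citation only says arXiv. A condensed, standalone illustration of that behaviour (every dictionary below is an invented stand-in, not a real API response):

arxiv_data = {"title": "An Example Paper", "year": 2023, "externalIds": {"ArXiv": "2301.12345"}}
ss_data = {"paperId": "abc123", "venue": "ACL", "externalIds": {"DOI": "10.18653/v1/2023.acl-long.1"}}
reference = {"title": "An Example Paper", "journal": "arXiv preprint"}

# ArXiv data is the base; Semantic Scholar only adds identifiers and venue
merged = dict(arxiv_data)
merged.setdefault("externalIds", {})
merged["externalIds"]["S2PaperId"] = ss_data["paperId"]
merged["externalIds"].setdefault("DOI", ss_data["externalIds"]["DOI"])

warnings = []
cited_venue = reference.get("venue", reference.get("journal", "")).strip().lower()
if ss_data["venue"] and cited_venue in ("", "arxiv", "arxiv preprint", "arxiv.org", "preprint"):
    warnings.append({
        "warning_type": "venue",
        "warning_details": f"Published at {ss_data['venue']} but cited as arXiv preprint",
        "ref_venue_correct": ss_data["venue"],
    })
    merged["venue"] = ss_data["venue"]

print(merged["externalIds"])  # {'ArXiv': '2301.12345', 'S2PaperId': 'abc123', 'DOI': '10.18653/v1/2023.acl-long.1'}
print(warnings[0]["warning_type"])  # venue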
@@ -287,6 +371,9 @@ class EnhancedHybridReferenceChecker:
         # Track all APIs that failed and could be retried
         failed_apis = []

+        # Store ArXiv result for potential merging with Semantic Scholar
+        arxiv_result = None
+
         # PHASE 1: Try all APIs once in priority order

         # Strategy 0: For ArXiv papers, try ArXiv Citation checker first (authoritative source)
@@ -295,13 +382,15 @@ class EnhancedHybridReferenceChecker:
             logger.debug("Enhanced Hybrid: Reference appears to be ArXiv paper, trying ArXiv Citation checker first")
             verified_data, errors, url, success, failure_type = self._try_api('arxiv_citation', self.arxiv_citation, reference)
             if success:
-                logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded
-
+                logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded, also querying Semantic Scholar for venue/URLs")
+                arxiv_result = (verified_data, errors, url)
+                # Continue to Semantic Scholar to get venue and additional URLs
             if failure_type in ['throttled', 'timeout', 'server_error']:
                 failed_apis.append(('arxiv_citation', self.arxiv_citation, failure_type))

         # Strategy 1: Always try local database first (fastest)
-        if
+        # Skip if we already have ArXiv result - we'll go straight to Semantic Scholar for venue info
+        if self.local_db and not arxiv_result:
             verified_data, errors, url, success, failure_type = self._try_api('local_db', self.local_db, reference)
             if success:
                 return verified_data, errors, url
@@ -309,8 +398,9 @@ class EnhancedHybridReferenceChecker:
                 failed_apis.append(('local_db', self.local_db, failure_type))

         # Strategy 2: If reference has DOI, prioritize CrossRef
+        # Skip if we already have ArXiv result - we'll go straight to Semantic Scholar for venue info
         crossref_result = None
-        if self._should_try_doi_apis_first(reference) and self.crossref:
+        if self._should_try_doi_apis_first(reference) and self.crossref and not arxiv_result:
             verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
             if success:
                 # Check if the data is complete enough to use
@@ -327,11 +417,34 @@ class EnhancedHybridReferenceChecker:
         if self.semantic_scholar:
             verified_data, errors, url, success, failure_type = self._try_api('semantic_scholar', self.semantic_scholar, reference)
             if success:
+                # If we have ArXiv result, merge Semantic Scholar venue/URLs into it
+                if arxiv_result:
+                    # Check if SS data is valid and venue is not just arxiv
+                    # (skip merge if SS only found the arxiv version, no published venue)
+                    if verified_data:
+                        ss_venue = self.semantic_scholar.get_venue_from_paper_data(verified_data)
+                        if ss_venue and 'arxiv' in ss_venue.lower():
+                            # SS only found arxiv venue, skip merge and return arxiv result
+                            logger.debug("Enhanced Hybrid: Semantic Scholar only found ArXiv venue, skipping merge")
+                            return arxiv_result
+
+                    arxiv_data, arxiv_errors, arxiv_url = arxiv_result
+                    merged_data, merged_errors = self._merge_arxiv_with_semantic_scholar(
+                        arxiv_data, arxiv_errors, arxiv_url,
+                        verified_data, errors, url,
+                        reference
+                    )
+                    return merged_data, merged_errors, arxiv_url
                 return verified_data, errors, url
             # For Semantic Scholar, only retry retryable failures (not 'not_found')
             if failure_type in ['throttled', 'timeout', 'server_error']:
                 failed_apis.append(('semantic_scholar', self.semantic_scholar, failure_type))

+        # If ArXiv succeeded but Semantic Scholar failed, return ArXiv result
+        if arxiv_result:
+            logger.debug("Enhanced Hybrid: Returning ArXiv result (Semantic Scholar unavailable)")
+            return arxiv_result
+
         # Strategy 4: Try OpenAlex API (excellent reliability, replaces Google Scholar)
         openalex_result = None
         if self.openalex:
refchecker/checkers/semantic_scholar.py
CHANGED

@@ -223,7 +223,49 @@ class NonArxivReferenceChecker:
         """
         return compare_authors(cited_authors, correct_authors)

-
+    def get_venue_from_paper_data(self, paper_data: Dict[str, Any]) -> Optional[str]:
+        """
+        Extract venue from paper data dictionary.
+
+        Checks multiple fields since Semantic Scholar returns venue info
+        in different fields depending on publication type.
+
+        Args:
+            paper_data: Paper data dictionary from Semantic Scholar
+
+        Returns:
+            Venue string or None if not found
+        """
+        if not paper_data:
+            return None
+
+        paper_venue = None
+
+        # First try the simple 'venue' field (string)
+        if paper_data.get('venue'):
+            paper_venue = paper_data.get('venue')
+
+        # If no venue, try publicationVenue object
+        if not paper_venue and paper_data.get('publicationVenue'):
+            pub_venue = paper_data.get('publicationVenue')
+            if isinstance(pub_venue, dict):
+                paper_venue = pub_venue.get('name', '')
+            elif isinstance(pub_venue, str):
+                paper_venue = pub_venue
+
+        # If still no venue, try journal object
+        if not paper_venue and paper_data.get('journal'):
+            journal = paper_data.get('journal')
+            if isinstance(journal, dict):
+                paper_venue = journal.get('name', '')
+            elif isinstance(journal, str):
+                paper_venue = journal
+
+        # Ensure paper_venue is a string
+        if paper_venue and not isinstance(paper_venue, str):
+            paper_venue = str(paper_venue)
+
+        return paper_venue if paper_venue else None

     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
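get_venue_from_paper_data tries the flat venue string first, then the publicationVenue object, then the journal object. A standalone re-implementation of that lookup order for illustration; the sample payloads below are invented and only mimic the shape of Semantic Scholar responses:

from typing import Any, Dict, Optional

def venue_from_paper_data(paper_data: Dict[str, Any]) -> Optional[str]:
    # Same fallback chain as the new method, condensed for readability
    if not paper_data:
        return None
    venue = paper_data.get("venue")
    if not venue and isinstance(paper_data.get("publicationVenue"), dict):
        venue = paper_data["publicationVenue"].get("name")
    if not venue and isinstance(paper_data.get("journal"), dict):
        venue = paper_data["journal"].get("name")
    return str(venue) if venue else None

print(venue_from_paper_data({"venue": "NeurIPS"}))                            # NeurIPS
print(venue_from_paper_data({"publicationVenue": {"name": "ICML"}}))          # ICML
print(venue_from_paper_data({"journal": {"name": "Nature Communications"}}))  # Nature Communications
print(venue_from_paper_data({}))                                              # None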
refchecker/llm/base.py
CHANGED

@@ -110,21 +110,7 @@ class LLMProvider(ABC):

         logger.debug(f"Created {len(chunks)} balanced overlapping chunks for parallel processing")
         return chunks
-
-    def _parse_llm_response(self, response_text: str) -> List[str]:
-        """Parse LLM response and extract individual references"""
-        if not response_text:
-            return []
-
-        # Split by newlines and filter out empty lines
-        references = []
-        for line in response_text.strip().split('\n'):
-            line = line.strip()
-            if line and not line.startswith('#') and len(line) > 10:  # Basic filtering
-                references.append(line)
-
-        return references
-
+
     def extract_references_with_chunking(self, bibliography_text: str) -> List[str]:
         """
         Template method that handles chunking for all providers.