PyPI - academic-refchecker - Versions diffs - 1.2.47__tar.gz → 1.2.49__tar.gz - Mend

academic-refchecker 1.2.47.tar.gz → 1.2.49.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)

{academic_refchecker-1.2.47/src/academic_refchecker.egg-info → academic_refchecker-1.2.49}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.47
+Version: 1.2.49
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT

{academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/__version__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """Version information for RefChecker."""
-__version__ = "1.2.47"
+__version__ = "1.2.49"

{academic_refchecker-1.2.47 → academic_refchecker-1.2.49/src/academic_refchecker.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.47
+Version: 1.2.49
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT

{academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/checkers/enhanced_hybrid_checker.py RENAMED Viewed

@@ -312,12 +312,36 @@ class EnhancedHybridReferenceChecker:
         if (self.openreview and
             hasattr(self.openreview, 'is_openreview_reference') and
             self.openreview.is_openreview_reference(reference)):
+            logger.debug("Enhanced Hybrid: Trying OpenReview URL-based verification")
             verified_data, errors, url, success, failure_type = self._try_api('openreview', self.openreview, reference)
             if success:
                 return verified_data, errors, url
             if failure_type in ['throttled', 'timeout', 'server_error']:
                 failed_apis.append(('openreview', self.openreview, failure_type))
+        # Strategy 5b: Try OpenReview by search if venue suggests it might be there
+        elif (self.openreview and
+              hasattr(self.openreview, 'verify_reference_by_search')):
+            # Check if venue suggests this might be on OpenReview
+            venue = reference.get('venue', reference.get('journal', '')).lower()
+            openreview_venues = [
+                'iclr', 'icml', 'neurips', 'nips', 'aaai', 'ijcai',
+                'international conference on learning representations',
+                'international conference on machine learning',
+                'neural information processing systems'
+            ]
+            venue_suggests_openreview = any(or_venue in venue for or_venue in openreview_venues)
+            logger.debug(f"Enhanced Hybrid: OpenReview venue check - venue: '{venue}', suggests: {venue_suggests_openreview}")
+            if venue_suggests_openreview:
+                logger.debug("Enhanced Hybrid: Trying OpenReview search-based verification")
+                verified_data, errors, url, success, failure_type = self._try_openreview_search(reference)
+                if success:
+                    return verified_data, errors, url
+                if failure_type in ['throttled', 'timeout', 'server_error']:
+                    failed_apis.append(('openreview_search', self.openreview, failure_type))
         # Strategy 6: Try CrossRef if we haven't already (for non-DOI references)
         if not self._should_try_doi_apis_first(reference) and self.crossref:
             verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
@@ -399,6 +423,66 @@ class EnhancedHybridReferenceChecker:
             'error_details': 'Could not verify reference using any available API'
         }], None
+    def _try_openreview_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str], bool, str]:
+        """
+        Try to verify reference using OpenReview search
+        Returns:
+            Tuple of (verified_data, errors, url, success, failure_type)
+        """
+        if not self.openreview:
+            return None, [], None, False, 'none'
+        start_time = time.time()
+        failure_type = 'none'
+        try:
+            verified_data, errors, url = self.openreview.verify_reference_by_search(reference)
+            duration = time.time() - start_time
+            # Consider it successful if we found data or verification errors
+            success = verified_data is not None or len(errors) > 0
+            self._update_api_stats('openreview', success, duration)
+            if success:
+                logger.debug(f"Enhanced Hybrid: OpenReview search successful in {duration:.2f}s, URL: {url}")
+                return verified_data, errors, url, True, 'none'
+            else:
+                logger.debug(f"Enhanced Hybrid: OpenReview search found no results in {duration:.2f}s")
+                return None, [], None, False, 'not_found'
+        except requests.exceptions.Timeout as e:
+            duration = time.time() - start_time
+            self._update_api_stats('openreview', False, duration)
+            failure_type = 'timeout'
+            logger.debug(f"Enhanced Hybrid: OpenReview search timed out in {duration:.2f}s: {e}")
+            return None, [], None, False, failure_type
+        except requests.exceptions.RequestException as e:
+            duration = time.time() - start_time
+            self._update_api_stats('openreview', False, duration)
+            # Check if it's a rate limiting error
+            if hasattr(e, 'response') and e.response is not None:
+                if e.response.status_code in [429, 503]:
+                    failure_type = 'throttled'
+                elif e.response.status_code >= 500:
+                    failure_type = 'server_error'
+                else:
+                    failure_type = 'other'
+            else:
+                failure_type = 'other'
+            logger.debug(f"Enhanced Hybrid: OpenReview search failed in {duration:.2f}s: {type(e).__name__}: {e}")
+            return None, [], None, False, failure_type
+        except Exception as e:
+            duration = time.time() - start_time
+            self._update_api_stats('openreview', False, duration)
+            failure_type = 'other'
+            logger.debug(f"Enhanced Hybrid: OpenReview search error in {duration:.2f}s: {type(e).__name__}: {e}")
+            return None, [], None, False, failure_type
     def get_performance_stats(self) -> Dict[str, Any]:
         """
         Get performance statistics for all APIs

{academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/checkers/openreview_checker.py RENAMED Viewed

@@ -498,6 +498,160 @@ class OpenReviewReferenceChecker:
         logger.debug(f"OpenReview verification completed for: {openreview_url}")
         return verified_data, errors, openreview_url
+    def verify_by_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+        """
+        Verify a reference by searching OpenReview (when no URL is provided)
+        Args:
+            reference: Reference dictionary with title, authors, year, etc.
+        Returns:
+            Tuple of (verified_data, errors, paper_url) where:
+            - verified_data: Dict with verified OpenReview paper data or None
+            - errors: List of error/warning dictionaries
+            - paper_url: The OpenReview URL if found
+        """
+        logger.debug(f"Searching OpenReview for reference: {reference.get('title', 'Untitled')}")
+        title = reference.get('title', '').strip()
+        authors = reference.get('authors', [])
+        year = reference.get('year')
+        venue = reference.get('venue', '').strip()
+        if not title:
+            return None, [], None
+        # Check if venue suggests this might be on OpenReview
+        if not self._is_likely_openreview_venue(venue):
+            logger.debug(f"Venue '{venue}' doesn't suggest OpenReview, skipping search")
+            return None, [], None
+        # Search for matching papers
+        search_results = self.search_paper(title, authors, year)
+        if not search_results:
+            logger.debug("No matching papers found on OpenReview")
+            return None, [], None
+        # Use the best match (first result, as they're sorted by relevance)
+        best_match = search_results[0]
+        paper_url = best_match.get('forum_url')
+        logger.debug(f"Found OpenReview match: {best_match.get('title', 'Untitled')}")
+        # Verify the reference against the found paper
+        errors = []
+        # Check title match
+        cited_title = reference.get('title', '').strip()
+        paper_title = best_match.get('title', '').strip()
+        if cited_title and paper_title:
+            similarity = calculate_title_similarity(cited_title, paper_title)
+            if similarity < 0.8:  # Slightly higher threshold for search results
+                from utils.error_utils import format_title_mismatch
+                details = format_title_mismatch(cited_title, paper_title) + f" (similarity: {similarity:.2f})"
+                errors.append({
+                    "warning_type": "title",
+                    "warning_details": details
+                })
+        # Check authors
+        cited_authors = reference.get('authors', [])
+        paper_authors = best_match.get('authors', [])
+        if cited_authors and paper_authors:
+            # Convert to list format if needed
+            if isinstance(cited_authors, str):
+                cited_authors = [author.strip() for author in cited_authors.split(',')]
+            if isinstance(paper_authors, str):
+                paper_authors = [author.strip() for author in paper_authors.split(',')]
+            # Use the existing author comparison function
+            match, error_msg = compare_authors(cited_authors, paper_authors)
+            if not match and error_msg:
+                errors.append({
+                    "warning_type": "author",
+                    "warning_details": error_msg
+                })
+        # Check year
+        cited_year = reference.get('year')
+        paper_year = best_match.get('year')
+        if cited_year and paper_year:
+            try:
+                cited_year_int = int(cited_year)
+                paper_year_int = int(paper_year)
+                is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
+                if is_different and year_message:
+                    from utils.error_utils import format_year_mismatch
+                    errors.append({
+                        "warning_type": "year",
+                        "warning_details": format_year_mismatch(cited_year_int, paper_year_int)
+                    })
+            except (ValueError, TypeError):
+                pass  # Skip year validation if conversion fails
+        # Check venue if provided in reference
+        cited_venue = reference.get('venue', '').strip()
+        paper_venue = best_match.get('venue', '').strip()
+        if cited_venue and paper_venue:
+            if are_venues_substantially_different(cited_venue, paper_venue):
+                from utils.error_utils import format_venue_mismatch
+                errors.append({
+                    "warning_type": "venue",
+                    "warning_details": format_venue_mismatch(cited_venue, paper_venue)
+                })
+        # Create verified data structure
+        verified_data = {
+            'title': best_match.get('title', cited_title),
+            'authors': best_match.get('authors', cited_authors),
+            'year': best_match.get('year', cited_year),
+            'venue': best_match.get('venue', cited_venue),
+            'url': paper_url,
+            'abstract': best_match.get('abstract', ''),
+            'keywords': best_match.get('keywords', []),
+            'openreview_metadata': best_match,
+            'verification_source': 'OpenReview (search)'
+        }
+        logger.debug(f"OpenReview search verification completed for: {paper_url}")
+        return verified_data, errors, paper_url
+    def _is_likely_openreview_venue(self, venue: str) -> bool:
+        """
+        Check if a venue suggests the paper might be on OpenReview
+        Args:
+            venue: Venue string from reference
+        Returns:
+            True if venue suggests OpenReview
+        """
+        if not venue:
+            return False
+        venue_lower = venue.lower()
+        # Common venues that use OpenReview
+        openreview_venues = [
+            'iclr', 'international conference on learning representations',
+            'neurips', 'neural information processing systems', 'nips',
+            'icml', 'international conference on machine learning',
+            'iclr workshop', 'neurips workshop', 'icml workshop',
+            'aaai', 'ijcai', 'aistats'
+        ]
+        for or_venue in openreview_venues:
+            if or_venue in venue_lower:
+                return True
+        return False
     def search_paper(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
         """
         Search for papers on OpenReview by title, authors, and/or year
@@ -510,7 +664,316 @@ class OpenReviewReferenceChecker:
         Returns:
             List of matching paper metadata dictionaries
         """
-        # This would implement search functionality if needed
-        # For now, OpenReview verification is primarily URL-based
-        logger.debug(f"Search functionality not yet implemented for OpenReview")
-        return []
+        if not title or not title.strip():
+            return []
+        logger.debug(f"Searching OpenReview for: {title}")
+        # Clean title for search
+        search_title = clean_title_for_search(title)
+        # Try API search first
+        results = self._search_via_api(search_title, authors, year)
+        if results:
+            return results
+        # If API search fails, try web search as fallback
+        return self._search_via_web(search_title, authors, year)
+    def _search_via_api(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
+        """
+        Search using OpenReview API
+        Args:
+            title: Clean title to search for
+            authors: List of author names (optional)
+            year: Publication year (optional)
+        Returns:
+            List of matching paper dictionaries
+        """
+        try:
+            # The OpenReview API requires specific parameters
+            # We'll search by content.title or content.venue (for venue-based search)
+            search_params = {
+                'limit': 20,  # Limit results to avoid overwhelming the API
+                'details': 'directReplies'  # Get basic details
+            }
+            # Try searching by venue first if year suggests recent conferences
+            if year and year >= 2017:  # OpenReview started around 2017
+                venues_by_year = {
+                    2025: ['ICLR 2025'],
+                    2024: ['ICLR 2024', 'NeurIPS 2024', 'ICML 2024'],
+                    2023: ['ICLR 2023', 'NeurIPS 2023', 'ICML 2023'],
+                    2022: ['ICLR 2022', 'NeurIPS 2022', 'ICML 2022'],
+                    2021: ['ICLR 2021', 'NeurIPS 2021', 'ICML 2021'],
+                    2020: ['ICLR 2020', 'NeurIPS 2020', 'ICML 2020'],
+                    2019: ['ICLR 2019', 'NeurIPS 2019', 'ICML 2019'],
+                    2018: ['ICLR 2018', 'NeurIPS 2018', 'ICML 2018'],
+                    2017: ['ICLR 2017']
+                }
+                possible_venues = venues_by_year.get(year, [])
+                results = []
+                for venue in possible_venues:
+                    # Search by venue and then filter by title
+                    venue_params = search_params.copy()
+                    venue_params['content.venue'] = venue
+                    api_url = f"{self.api_url}/notes"
+                    response = self._respectful_request(api_url, params=venue_params)
+                    if response and response.status_code == 200:
+                        try:
+                            data = response.json()
+                            if 'notes' in data and data['notes']:
+                                for note in data['notes']:
+                                    try:
+                                        metadata = self._parse_api_response(note)
+                                        if metadata and self._is_good_match(metadata, title, authors, year):
+                                            results.append(metadata)
+                                            if len(results) >= 5:  # Limit results
+                                                break
+                                    except Exception as e:
+                                        logger.debug(f"Error parsing note: {e}")
+                                        continue
+                                if results:
+                                    break  # Found results, no need to search other venues
+                        except (json.JSONDecodeError, KeyError) as e:
+                            logger.debug(f"Failed to parse venue search response: {e}")
+                            continue
+                    else:
+                        logger.debug(f"Venue search failed for {venue}: {response.status_code if response else 'No response'}")
+                if results:
+                    logger.debug(f"OpenReview API search found {len(results)} matches via venue search")
+                    return results
+            # If venue search didn't work, try other approaches
+            # OpenReview API is quite restrictive, so we might need to fall back to web scraping
+            logger.debug("OpenReview API venue search returned no results, trying web search")
+            return []
+        except Exception as e:
+            logger.debug(f"OpenReview API search error: {e}")
+            return []
+    def _search_via_web(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
+        """
+        Search using OpenReview web interface (fallback)
+        Args:
+            title: Clean title to search for
+            authors: List of author names (optional)
+            year: Publication year (optional)
+        Returns:
+            List of matching paper dictionaries
+        """
+        try:
+            # Build search URL
+            search_query = title.replace(' ', '+')
+            search_url = f"{self.base_url}/search?term={search_query}"
+            response = self._respectful_request(search_url)
+            if not response or response.status_code != 200:
+                return []
+            # Parse search results page
+            soup = BeautifulSoup(response.text, 'html.parser')
+            # Look for paper links in search results
+            # OpenReview search results typically contain links to forum pages
+            results = []
+            # Find links that look like OpenReview paper URLs
+            for link in soup.find_all('a', href=True):
+                href = link.get('href', '')
+                if '/forum?id=' in href:
+                    paper_id = self.extract_paper_id(href)
+                    if paper_id:
+                        # Get full metadata for this paper
+                        metadata = self.get_paper_metadata(paper_id)
+                        if metadata and self._is_good_match(metadata, title, authors, year):
+                            results.append(metadata)
+                            if len(results) >= 5:  # Limit results
+                                break
+            logger.debug(f"OpenReview web search found {len(results)} matches")
+            return results
+        except Exception as e:
+            logger.debug(f"OpenReview web search error: {e}")
+            return []
+    def _is_good_match(self, metadata: Dict[str, Any], search_title: str, authors: List[str] = None, year: int = None) -> bool:
+        """
+        Check if the found paper is a good match for the search criteria
+        Args:
+            metadata: Paper metadata from OpenReview
+            search_title: Title we're searching for
+            authors: Authors we're looking for (optional)
+            year: Year we're looking for (optional)
+        Returns:
+            True if it's a good match
+        """
+        paper_title = metadata.get('title', '')
+        if not paper_title:
+            return False
+        # Check title similarity
+        title_similarity = calculate_title_similarity(search_title, paper_title)
+        if title_similarity < 0.7:  # Require at least 70% similarity
+            return False
+        # Check year if provided
+        if year:
+            paper_year = metadata.get('year')
+            if paper_year and abs(int(paper_year) - year) > 1:  # Allow 1 year difference
+                return False
+        # Check authors if provided
+        if authors and len(authors) > 0:
+            paper_authors = metadata.get('authors', [])
+            if paper_authors:
+                # Check if at least one author matches
+                author_match = False
+                for search_author in authors[:2]:  # Check first 2 authors
+                    for paper_author in paper_authors[:3]:  # Check first 3 paper authors
+                        if is_name_match(search_author, paper_author):
+                            author_match = True
+                            break
+                    if author_match:
+                        break
+                if not author_match:
+                    return False
+        return True
+    def search_by_title(self, title: str, max_results: int = 5) -> List[Dict[str, Any]]:
+        """
+        Search OpenReview for papers by title using the working search API.
+        Args:
+            title: Paper title to search for
+            max_results: Maximum number of results to return
+        Returns:
+            List of paper data dictionaries
+        """
+        try:
+            # Use OpenReview's search API with term parameter (this works!)
+            params = {
+                'term': title,
+                'limit': max_results
+            }
+            response = self._respectful_request(f"{self.api_url}/notes/search", params=params)
+            if not response or response.status_code != 200:
+                logger.debug(f"OpenReview search API failed with status {response.status_code if response else 'None'}")
+                return []
+            data = response.json()
+            papers = []
+            for note in data.get('notes', []):
+                # Filter to exact or close title matches
+                note_title = note.get('content', {}).get('title', '')
+                if self._is_title_match(title, note_title):
+                    paper_data = self._parse_api_response(note)
+                    if paper_data:
+                        papers.append(paper_data)
+            logger.debug(f"OpenReview search found {len(papers)} matching papers for '{title}'")
+            return papers
+        except Exception as e:
+            logger.error(f"Error searching OpenReview by title '{title}': {e}")
+            return []
+    def _is_title_match(self, search_title: str, found_title: str, threshold: float = 0.8) -> bool:
+        """
+        Check if two titles match closely enough.
+        Args:
+            search_title: Title we're searching for
+            found_title: Title found in search results
+            threshold: Similarity threshold (0.0 to 1.0)
+        Returns:
+            True if titles match closely enough
+        """
+        if not search_title or not found_title:
+            return False
+        # Exact match
+        if search_title.lower().strip() == found_title.lower().strip():
+            return True
+        # Check if one contains the other (for cases where one is longer)
+        search_clean = search_title.lower().strip()
+        found_clean = found_title.lower().strip()
+        if search_clean in found_clean or found_clean in search_clean:
+            return True
+        # Use similarity calculation from text_utils
+        try:
+            from utils.text_utils import calculate_title_similarity
+            similarity = calculate_title_similarity(search_title, found_title)
+            return similarity >= threshold
+        except ImportError:
+            # Fallback to simple word matching
+            search_words = set(search_clean.split())
+            found_words = set(found_clean.split())
+            if not search_words or not found_words:
+                return False
+            intersection = search_words.intersection(found_words)
+            union = search_words.union(found_words)
+            jaccard_similarity = len(intersection) / len(union) if union else 0
+            return jaccard_similarity >= threshold
+    def verify_reference_by_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+        """
+        Verify a reference by searching OpenReview (for papers without URLs).
+        Args:
+            reference: Reference data dictionary
+        Returns:
+            Tuple of (verified_data, errors_and_warnings, debug_info)
+        """
+        title = reference.get('title', '').strip()
+        if not title:
+            return None, [], "No title provided for search"
+        # Search for the paper
+        search_results = self.search_by_title(title)
+        if not search_results:
+            return None, [], f"No papers found on OpenReview for title: {title}"
+        # Take the best match (first result, as search is already filtered)
+        best_match = search_results[0]
+        # Use the existing verify_reference method with the found URL
+        forum_url = best_match.get('forum_url')
+        if forum_url:
+            # Create a reference with the OpenReview URL for verification
+            reference_with_url = reference.copy()
+            reference_with_url['url'] = forum_url
+            return self.verify_reference(reference_with_url)
+        # If no URL, return the metadata as verification
+        return best_match, [], f"Found on OpenReview: {best_match.get('title')}"

{academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/checkers/semantic_scholar.py RENAMED Viewed

@@ -583,8 +583,8 @@ class NonArxivReferenceChecker:
             if not (has_arxiv_url or has_arxiv_doi):
                 errors.append({
-                    'warning_type': 'url',
-                    'warning_details': f"Reference could include arXiv URL: {arxiv_url}",
+                    'info_type': 'url',
+                    'info_details': f"Reference could include arXiv URL: {arxiv_url}",
                     'ref_url_correct': arxiv_url
                 })

{academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/core/parallel_processor.py RENAMED Viewed

@@ -340,7 +340,7 @@ class ParallelReferenceProcessor:
         # Display errors and warnings
         if result.errors:
             # Check if there's an unverified error
-            has_unverified_error = any(e.get('error_type') == 'unverified' or e.get('warning_type') == 'unverified' for e in result.errors)
+            has_unverified_error = any(e.get('error_type') == 'unverified' or e.get('warning_type') == 'unverified' or e.get('info_type') == 'unverified' for e in result.errors)
             if has_unverified_error:
                 # Use the centralized unverified error display function from base checker
@@ -348,9 +348,9 @@ class ParallelReferenceProcessor:
             # Display all non-unverified errors and warnings
             for error in result.errors:
-                if error.get('error_type') != 'unverified' and error.get('warning_type') != 'unverified':
-                    error_type = error.get('error_type') or error.get('warning_type')
-                    error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
+                if error.get('error_type') != 'unverified' and error.get('warning_type') != 'unverified' and error.get('info_type') != 'unverified':
+                    error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type')
+                    error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', 'Unknown error')
                     from utils.error_utils import print_labeled_multiline
@@ -359,8 +359,10 @@ class ParallelReferenceProcessor:
                         print(f"      ❌ {error_details}")
                     elif 'error_type' in error:
                         print_labeled_multiline("❌ Error", error_details)
-                    else:
+                    elif 'warning_type' in error:
                         print_labeled_multiline("⚠️  Warning", error_details)
+                    else:
+                        print_labeled_multiline("ℹ️  Information", error_details)
         # Show timing info for slow references
         if result.processing_time > 5.0:

{academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/core/refchecker.py RENAMED Viewed

@@ -2033,6 +2033,9 @@ class ArxivReferenceChecker:
                     elif 'warning_type' in error:
                         formatted_error['warning_type'] = error['warning_type']
                         formatted_error['warning_details'] = error['warning_details']
+                    elif 'info_type' in error:
+                        formatted_error['info_type'] = error['info_type']
+                        formatted_error['info_details'] = error['info_details']
                     # Add correct information based on error type
                     if error.get('error_type') == 'author':
@@ -2042,6 +2045,8 @@ class ArxivReferenceChecker:
                     elif error.get('error_type') == 'doi':
                         from utils.doi_utils import construct_doi_url
                         formatted_error['ref_url_correct'] = construct_doi_url(error.get('ref_doi_correct', ''))
+                    elif error.get('info_type') == 'url':
+                        formatted_error['ref_url_correct'] = error.get('ref_url_correct', '')
                     formatted_errors.append(formatted_error)
@@ -2153,17 +2158,22 @@ class ArxivReferenceChecker:
             for error in errors:
                 formatted_error = {}
-                # Handle error_type and warning_type properly
+                # Handle error_type, warning_type, and info_type properly
                 if 'error_type' in error:
                     formatted_error['error_type'] = error['error_type']
                     formatted_error['error_details'] = error['error_details']
                 elif 'warning_type' in error:
                     formatted_error['warning_type'] = error['warning_type']
                     formatted_error['warning_details'] = error['warning_details']
+                elif 'info_type' in error:
+                    formatted_error['info_type'] = error['info_type']
+                    formatted_error['info_details'] = error['info_details']
                 # Add correct information based on error type
                 if error.get('warning_type') == 'year':
                     formatted_error['ref_year_correct'] = error.get('ref_year_correct', '')
+                elif error.get('info_type') == 'url':
+                    formatted_error['ref_url_correct'] = error.get('ref_url_correct', '')
                 formatted_errors.append(formatted_error)
@@ -2214,13 +2224,16 @@ class ArxivReferenceChecker:
             for error in errors:
                 formatted_error = {}
-                # Handle error_type and warning_type properly
+                # Handle error_type, warning_type, and info_type properly
                 if 'error_type' in error:
                     formatted_error['error_type'] = error['error_type']
                     formatted_error['error_details'] = error['error_details']
                 elif 'warning_type' in error:
                     formatted_error['warning_type'] = error['warning_type']
                     formatted_error['warning_details'] = error['warning_details']
+                elif 'info_type' in error:
+                    formatted_error['info_type'] = error['info_type']
+                    formatted_error['info_details'] = error['info_details']
                 formatted_errors.append(formatted_error)
@@ -2335,13 +2348,16 @@ class ArxivReferenceChecker:
             logger.debug(f"DEBUG: Error {i}: {error}")
             formatted_error = {}
-            # Handle error_type and warning_type properly
+            # Handle error_type, warning_type, and info_type properly
             if 'error_type' in error:
                 formatted_error['error_type'] = error['error_type']
                 formatted_error['error_details'] = error['error_details']
             elif 'warning_type' in error:
                 formatted_error['warning_type'] = error['warning_type']
                 formatted_error['warning_details'] = error['warning_details']
+            elif 'info_type' in error:
+                formatted_error['info_type'] = error['info_type']
+                formatted_error['info_details'] = error['info_details']
             # Add correct information based on error type
             if error.get('error_type') == 'author':
@@ -2637,9 +2653,19 @@ class ArxivReferenceChecker:
             # Generate corrected reference using all available corrections
             corrected_data = self._extract_corrected_data_from_error(consolidated_entry, verified_data)
-            corrected_format = format_corrected_reference(reference, corrected_data, consolidated_entry)
-            if corrected_format:
-                consolidated_entry['ref_corrected_format'] = corrected_format
+            # Generate all three formats for user convenience
+            from utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
+            plaintext_format = format_corrected_plaintext(reference, corrected_data, consolidated_entry)
+            bibtex_format = format_corrected_bibtex(reference, corrected_data, consolidated_entry)
+            bibitem_format = format_corrected_bibitem(reference, corrected_data, consolidated_entry)
+            if plaintext_format:
+                consolidated_entry['ref_corrected_plaintext'] = plaintext_format
+            if bibtex_format:
+                consolidated_entry['ref_corrected_bibtex'] = bibtex_format
+            if bibitem_format:
+                consolidated_entry['ref_corrected_bibitem'] = bibitem_format
             # Store the consolidated entry (write to file at end of run)
             self.errors.append(consolidated_entry)
@@ -2647,8 +2673,8 @@ class ArxivReferenceChecker:
         else:
             # Single error - handle as before
             error = errors[0]
-            error_type = error.get('error_type') or error.get('warning_type', 'unknown')
-            error_details = error.get('error_details') or error.get('warning_details', '')
+            error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type', 'unknown')
+            error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', '')
             error_entry = {
                 # Source paper metadata
@@ -2696,11 +2722,21 @@ class ArxivReferenceChecker:
             if error_type != 'unverified':
                 error_entry['ref_standard_format'] = self.format_standard_reference(error)
-                # Generate corrected reference in original format
+                # Generate corrected reference in all formats for user convenience
                 corrected_data = self._extract_corrected_data_from_error(error, verified_data)
-                corrected_format = format_corrected_reference(reference, corrected_data, error_entry)
-                if corrected_format:
-                    error_entry['ref_corrected_format'] = corrected_format
+                # Generate all three formats
+                from utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
+                plaintext_format = format_corrected_plaintext(reference, corrected_data, error_entry)
+                bibtex_format = format_corrected_bibtex(reference, corrected_data, error_entry)
+                bibitem_format = format_corrected_bibitem(reference, corrected_data, error_entry)
+                if plaintext_format:
+                    error_entry['ref_corrected_plaintext'] = plaintext_format
+                if bibtex_format:
+                    error_entry['ref_corrected_bibtex'] = bibtex_format
+                if bibitem_format:
+                    error_entry['ref_corrected_bibitem'] = bibitem_format
             else:
                 error_entry['ref_standard_format'] = None
@@ -2755,7 +2791,9 @@ class ArxivReferenceChecker:
                         emoji = "❓"
                     elif error_type in ['year', 'venue']:  # Warning types
                         emoji = "⚠️"
-                    else:  # Error types (title, author, doi, url, multiple, etc.)
+                    elif error_type == 'url':  # Info type (ArXiv URL suggestion)
+                        emoji = "ℹ️"
+                    else:  # Error types (title, author, doi, multiple, etc.)
                         emoji = "❌"
                     f.write(f"Type: {emoji} {error_entry['error_type']}\n")
@@ -2772,8 +2810,29 @@ class ArxivReferenceChecker:
                         f.write(f"  {error_entry['ref_verified_url']}\n")
                         f.write("\n")
-                    # Show corrected reference in original format if available
-                    if error_entry.get('ref_corrected_format'):
+                    # Show corrected reference in all formats if available
+                    formats_written = False
+                    # Plain text format
+                    if error_entry.get('ref_corrected_plaintext'):
+                        f.write("CORRECTED REFERENCE (Plain Text):\n")
+                        f.write(f"{error_entry['ref_corrected_plaintext']}\n\n")
+                        formats_written = True
+                    # BibTeX format
+                    if error_entry.get('ref_corrected_bibtex'):
+                        f.write("CORRECTED REFERENCE (BibTeX):\n")
+                        f.write(f"{error_entry['ref_corrected_bibtex']}\n\n")
+                        formats_written = True
+                    # Bibitem/LaTeX format
+                    if error_entry.get('ref_corrected_bibitem'):
+                        f.write("CORRECTED REFERENCE (LaTeX/Biblatex):\n")
+                        f.write(f"{error_entry['ref_corrected_bibitem']}\n\n")
+                        formats_written = True
+                    # Fallback to legacy format if no new formats available
+                    if not formats_written and error_entry.get('ref_corrected_format'):
                         f.write("CORRECTED REFERENCE:\n")
                         f.write(f"{error_entry['ref_corrected_format']}\n\n")
@@ -2865,8 +2924,10 @@ class ArxivReferenceChecker:
         self.total_references_processed = 0
         self.papers_with_errors = 0
         self.papers_with_warnings = 0
+        self.papers_with_info = 0
         self.total_errors_found = 0
         self.total_warnings_found = 0
+        self.total_info_found = 0
         self.total_arxiv_refs = 0
         self.total_non_arxiv_refs = 0
         self.total_other_refs = 0
@@ -3025,18 +3086,21 @@ class ArxivReferenceChecker:
                         # Separate actual errors from warnings for paper classification
                         actual_errors = [e for e in paper_errors if 'error_type' in e and e['error_type'] != 'unverified']
                         warnings_only = [e for e in paper_errors if 'warning_type' in e]
+                        info_only = [e for e in paper_errors if 'info_type' in e]
                         if self.single_paper_mode:
                             # Single paper mode - show simple summary
-                            if actual_errors or warnings_only:
+                            if actual_errors or warnings_only or info_only:
                                 summary_parts = []
                                 if actual_errors:
                                     summary_parts.append(f"{len(actual_errors)} errors")
                                 if warnings_only:
                                     summary_parts.append(f"{len(warnings_only)} warnings")
+                                if info_only:
+                                    summary_parts.append(f"{len(info_only)} information")
                         else:
                             # Multi-paper mode - track paper statistics
-                            if actual_errors or warnings_only:
+                            if actual_errors or warnings_only or info_only:
                                 summary_parts = []
                                 if actual_errors:
                                     summary_parts.append(f"{len(actual_errors)} errors")
@@ -3045,6 +3109,10 @@ class ArxivReferenceChecker:
                                     summary_parts.append(f"{len(warnings_only)} warnings")
                                     # Count as paper with warnings if it has warnings (regardless of errors)
                                     self.papers_with_warnings += 1
+                                if info_only:
+                                    summary_parts.append(f"{len(info_only)} information")
+                                    # Count as paper with info if it has info messages (regardless of errors/warnings)
+                                    self.papers_with_info += 1
                 except Exception as e:
                     logger.error(f"Error processing paper {paper_id}: {str(e)}")
@@ -3086,9 +3154,11 @@ class ArxivReferenceChecker:
                     print(f"❌ Total errors: {self.total_errors_found}")
                 if self.total_warnings_found > 0:
                     print(f"⚠️  Total warnings: {self.total_warnings_found}")
+                if self.total_info_found > 0:
+                    print(f"ℹ️  Total information: {self.total_info_found}")
                 if self.total_unverified_refs > 0:
                     print(f"❓ References that couldn't be verified: {self.total_unverified_refs}")
-                if self.total_errors_found == 0 and self.total_warnings_found == 0 and self.total_unverified_refs == 0:
+                if self.total_errors_found == 0 and self.total_warnings_found == 0 and self.total_info_found == 0 and self.total_unverified_refs == 0:
                     print(f"✅ All references verified successfully!")
                 # Show warning if unreliable extraction was used and there are many errors
@@ -3108,6 +3178,8 @@ class ArxivReferenceChecker:
                 print(f"         Total errors:   {self.total_errors_found}")
                 print(f"⚠️  Papers with warnings: {self.papers_with_warnings}")
                 print(f"         Total warnings: {self.total_warnings_found}")
+                print(f"ℹ️  Papers with information: {self.papers_with_info}")
+                print(f"         Total information: {self.total_info_found}")
                 print(f"❓ References that couldn't be verified: {self.total_unverified_refs}")
                 # Show warning if unreliable extraction was used and there are many errors
@@ -5307,7 +5379,7 @@ class ArxivReferenceChecker:
         # If errors found, add to dataset and optionally print details
         if errors:
             # Check if there's an unverified error among the errors
-            has_unverified_error = any(e.get('error_type') == 'unverified' or e.get('warning_type') == 'unverified' for e in errors)
+            has_unverified_error = any(e.get('error_type') == 'unverified' or e.get('warning_type') == 'unverified' or e.get('info_type') == 'unverified' for e in errors)
             if has_unverified_error:
                 self.total_unverified_refs += 1
@@ -5317,11 +5389,13 @@ class ArxivReferenceChecker:
             self.add_error_to_dataset(paper, reference, errors, reference_url, verified_data)
             paper_errors.extend(errors)
-            # Count errors vs warnings
+            # Count errors vs warnings vs info
             error_count = sum(1 for e in errors if 'error_type' in e and e['error_type'] != 'unverified')
             warning_count = sum(1 for e in errors if 'warning_type' in e)
+            info_count = sum(1 for e in errors if 'info_type' in e)
             self.total_errors_found += error_count
             self.total_warnings_found += warning_count
+            self.total_info_found += info_count
             # Display all non-unverified errors and warnings
             self._display_non_unverified_errors(errors, debug_mode, print_output)
@@ -5468,9 +5542,9 @@ class ArxivReferenceChecker:
         """Display all non-unverified errors and warnings"""
         if not debug_mode and print_output:
             for error in errors:
-                if error.get('error_type') != 'unverified' and error.get('warning_type') != 'unverified':
-                    error_type = error.get('error_type') or error.get('warning_type')
-                    error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
+                if error.get('error_type') != 'unverified' and error.get('warning_type') != 'unverified' and error.get('info_type') != 'unverified':
+                    error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type')
+                    error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', 'Unknown error')
                     from utils.error_utils import print_labeled_multiline
@@ -5478,8 +5552,10 @@ class ArxivReferenceChecker:
                         print(f"      ❌ {error_details}")
                     elif 'error_type' in error:
                         print_labeled_multiline("❌ Error", error_details)
-                    else:
+                    elif 'warning_type' in error:
                         print_labeled_multiline("⚠️  Warning", error_details)
+                    else:
+                        print_labeled_multiline("ℹ️  Information", error_details)
     def _output_reference_errors(self, reference, errors, url):
         """

{academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/arxiv_utils.py RENAMED Viewed

@@ -392,41 +392,43 @@ def get_bibtex_content(paper):
         logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
         tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
-        # Choose between .bib and .bbl files - .bbl files take priority when they contain entries
-        # .bbl files are processed biblatex output that reflects exactly what was cited
+        # Choose between .bib and .bbl files based on what the main TeX file actually uses
+        # Check the main TeX file to see if it uses \bibliography{...} (BibTeX) or not (BBL)
+        uses_bibtex = False
+        if tex_content:
+            # Look for \bibliography{...} commands in the main TeX file
+            bib_pattern = r'\\bibliography\{([^}]+)\}'
+            bib_matches = re.findall(bib_pattern, tex_content)
+            if bib_matches:
+                uses_bibtex = True
+                referenced_bibs = []
+                for match in bib_matches:
+                    bib_names = [name.strip() for name in match.split(',')]
+                    referenced_bibs.extend(bib_names)
+                logger.debug(f"Main TeX file references BibTeX files: {referenced_bibs}")
         if bib_content and bbl_content:
             # Count entries in both for logging
             bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
-            bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
+            bbl_entry_count = len(re.findall(r'\\bibitem[\[\{]', bbl_content))
             logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, .bib has {bib_entry_count} entries")
-            # Only use .bbl if it actually contains bibliography entries
-            if bbl_entry_count > 0:
-                logger.info(f"Using .bbl files from ArXiv source (biblatex takes priority over bibtex)")
+            if uses_bibtex and bib_entry_count > 0:
+                logger.info(f"Using .bib files from ArXiv source (main TeX uses \\bibliography{{...}})")
+                return bib_content
+            elif bbl_entry_count > 0:
+                logger.info(f"Using .bbl files from ArXiv source (main TeX doesn't use \\bibliography or .bib is empty)")
                 return bbl_content
-            else:
+            elif bib_entry_count > 0:
                 logger.info(f"Using .bib files from ArXiv source (.bbl file is empty)")
-                # If we have LaTeX content, filter BibTeX by cited keys
-                if tex_content:
-                    cited_keys = extract_cited_keys_from_tex({}, tex_content)
-                    if cited_keys:
-                        logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
-                        filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
-                        return filtered_content
                 return bib_content
+            else:
+                logger.warning(f"Both .bib and .bbl files appear to be empty")
+                return bib_content  # Default to bib_content as fallback
         elif bib_content:
             logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
-            # If we have LaTeX content, filter BibTeX by cited keys
-            if tex_content:
-                cited_keys = extract_cited_keys_from_tex({}, tex_content)
-                if cited_keys:
-                    logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
-                    filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
-                    return filtered_content
             return bib_content
         elif bbl_content:

{academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/error_utils.py RENAMED Viewed

@@ -294,6 +294,39 @@ def create_generic_warning(warning_type: str, warning_details: str, **kwargs) ->
     return warning_dict
+def create_generic_info(info_type: str, info_details: str, **kwargs) -> Dict[str, Any]:
+    """
+    Create a generic info dictionary with custom fields.
+    Args:
+        info_type: Type of info (e.g., 'url')
+        info_details: Description of the information
+        **kwargs: Additional fields to include in the info dictionary
+    Returns:
+        Standardized info dictionary
+    """
+    info_dict = {
+        'info_type': info_type,
+        'info_details': info_details
+    }
+    info_dict.update(kwargs)
+    return info_dict
+def create_info_message(reference, reason, arxiv_url=None):
+    """Create a standardized info message structure."""
+    info_msg = {
+        'info_type': 'arxiv_url_available',
+        'reference': reference,
+        'reason': reason
+    }
+    if arxiv_url:
+        info_msg['arxiv_url'] = arxiv_url
+    return info_msg
 def format_author_mismatch(author_number: int, cited_author: str, correct_author: str) -> str:
     """
     Format a three-line author mismatch message.

{academic_refchecker-1.2.47 → academic_refchecker-1.2.49}/src/utils/text_utils.py RENAMED Viewed

@@ -2102,7 +2102,7 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
                 # Use standardized three-line formatting for author mismatch
                 cited_display = format_author_for_display(cited_author)
                 full_author_list = ', '.join(correct_names)
-                error_msg = format_author_mismatch(i+1, f"{cited_display} (not found in author list - et al case)", f"Correct authors: {full_author_list}")
+                error_msg = format_author_mismatch(i+1, f"{cited_display} (not found in author list - et al case)", f"{full_author_list}")
                 return False, error_msg
         return True, f"Authors match (verified {len(cleaned_cited)} of {len(correct_names)} with et al)"
@@ -2337,6 +2337,9 @@ def format_author_for_display(author_name):
     if not author_name:
         return author_name
+    # First clean the author name to remove asterisks and other unwanted characters
+    author_name = clean_author_name(author_name)
     # Clean up any stray punctuation that might have been attached during parsing
     author_name = author_name.strip()
     # Remove trailing semicolons that sometimes get attached during bibliographic parsing