PyPI - academic-refchecker - Versions diffs - 2.0.7__py3-none-any.whl - Mend

academic-refchecker 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

academic_refchecker-2.0.7.dist-info/METADATA +738 -0
academic_refchecker-2.0.7.dist-info/RECORD +64 -0
academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
backend/__init__.py +21 -0
backend/__main__.py +11 -0
backend/cli.py +64 -0
backend/concurrency.py +100 -0
backend/database.py +711 -0
backend/main.py +1367 -0
backend/models.py +99 -0
backend/refchecker_wrapper.py +1126 -0
backend/static/assets/index-2P6L_39v.css +1 -0
backend/static/assets/index-hk21nqxR.js +25 -0
backend/static/favicon.svg +6 -0
backend/static/index.html +15 -0
backend/static/vite.svg +1 -0
backend/thumbnail.py +517 -0
backend/websocket_manager.py +104 -0
refchecker/__init__.py +13 -0
refchecker/__main__.py +11 -0
refchecker/__version__.py +3 -0
refchecker/checkers/__init__.py +17 -0
refchecker/checkers/crossref.py +541 -0
refchecker/checkers/enhanced_hybrid_checker.py +563 -0
refchecker/checkers/github_checker.py +326 -0
refchecker/checkers/local_semantic_scholar.py +540 -0
refchecker/checkers/openalex.py +513 -0
refchecker/checkers/openreview_checker.py +984 -0
refchecker/checkers/pdf_paper_checker.py +493 -0
refchecker/checkers/semantic_scholar.py +764 -0
refchecker/checkers/webpage_checker.py +938 -0
refchecker/config/__init__.py +1 -0
refchecker/config/logging.conf +36 -0
refchecker/config/settings.py +170 -0
refchecker/core/__init__.py +7 -0
refchecker/core/db_connection_pool.py +141 -0
refchecker/core/parallel_processor.py +415 -0
refchecker/core/refchecker.py +5838 -0
refchecker/database/__init__.py +6 -0
refchecker/database/download_semantic_scholar_db.py +1725 -0
refchecker/llm/__init__.py +0 -0
refchecker/llm/base.py +376 -0
refchecker/llm/providers.py +911 -0
refchecker/scripts/__init__.py +1 -0
refchecker/scripts/start_vllm_server.py +121 -0
refchecker/services/__init__.py +8 -0
refchecker/services/pdf_processor.py +268 -0
refchecker/utils/__init__.py +27 -0
refchecker/utils/arxiv_utils.py +462 -0
refchecker/utils/author_utils.py +179 -0
refchecker/utils/biblatex_parser.py +584 -0
refchecker/utils/bibliography_utils.py +332 -0
refchecker/utils/bibtex_parser.py +411 -0
refchecker/utils/config_validator.py +262 -0
refchecker/utils/db_utils.py +210 -0
refchecker/utils/doi_utils.py +190 -0
refchecker/utils/error_utils.py +482 -0
refchecker/utils/mock_objects.py +211 -0
refchecker/utils/text_utils.py +5057 -0
refchecker/utils/unicode_utils.py +335 -0
refchecker/utils/url_utils.py +307 -0

refchecker/utils/error_utils.py ADDED Viewed

@@ -0,0 +1,482 @@
+#!/usr/bin/env python3
+"""
+Error Utilities for Reference Checking
+This module provides standardized error and warning creation functions
+for reference checkers.
+"""
+from typing import Dict, List, Any, Optional
+def print_labeled_multiline(label: str, text: str) -> None:
+    """
+    Print a possibly multi-line message under a label.
+    The first line is printed as six spaces, the label and ": ", followed by the
+    first line of the text. Every later line is printed behind a fixed 15-space
+    indent rather than an indent computed from the label, so continuation lines
+    start in the same column regardless of emoji width differences in the labels.
+    Args:
+        label: The label (e.g., "❌ Error", "⚠️  Warning")
+        text: The text to print; None or an empty string prints the label
+            followed by an empty message
+    """
+    prefix = f"      {label}: "
+    # splitlines() yields [] for empty input; fall back to a single empty line
+    # so the label itself is always printed.
+    lines = (text or "").splitlines() or [""]
+    print(prefix + lines[0])
+    # 15 columns = 6 leading spaces + a 7-character label such as "❌ Error"
+    # + the 2-character ": " separator.
+    continuation_indent = " " * 15
+    for line in lines[1:]:
+        print(continuation_indent + line)
+def format_three_line_mismatch(mismatch_type: str, left: str, right: str) -> str:
+    """
+    Format a three-line mismatch message with fixed indentation.
+    The first line is the mismatch type terminated by exactly one colon; the
+    second and third lines carry the two compared values, inserted verbatim
+    behind a 7-space indent:
+    Example:
+    Title mismatch:
+           cited:  Cited Title
+           actual: Correct Title
+    Args:
+        mismatch_type: The type of mismatch (e.g., "Author 2 mismatch", "Title mismatch");
+            trailing whitespace is dropped and a colon is appended if missing
+        left: The cited/incorrect value
+        right: The correct value
+    Returns:
+        Three-line formatted mismatch message
+    """
+    # Strip trailing whitespace BEFORE testing for the colon; otherwise a value
+    # such as "Title mismatch: " would fail the check and end up with "::".
+    heading = mismatch_type.rstrip()
+    if not heading.endswith(":"):
+        heading += ":"
+    # Fixed 7-space indent keeps the "cited:" / "actual:" labels aligned
+    label_indent = " " * 7
+    return f"{heading}\n{label_indent}cited:  {left}\n{label_indent}actual: {right}"
+def format_title_mismatch(cited_title: str, verified_title: str) -> str:
+    """
+    Format a three-line title mismatch message.
+    Output format (produced by format_three_line_mismatch, values inserted verbatim):
+    Title mismatch:
+           cited:  Cited Title
+           actual: Correct Title
+    Args:
+        cited_title: The title as cited in the reference
+        verified_title: The title it is being compared against
+    Returns:
+        Three-line formatted title mismatch message
+    """
+    return format_three_line_mismatch("Title mismatch", cited_title, verified_title)
+def format_year_mismatch(cited_year: int | str, correct_year: int | str) -> str:
+    """
+    Build the three-line "Year mismatch" message for a cited and a correct year.
+    Both values are converted with str() first, so ints and strings are handled alike.
+    Args:
+        cited_year: Year as cited in the reference
+        correct_year: Year it is being compared against
+    Returns:
+        Three-line formatted year mismatch message
+    """
+    cited_text = str(cited_year)
+    correct_text = str(correct_year)
+    return format_three_line_mismatch("Year mismatch", cited_text, correct_text)
+def format_doi_mismatch(cited_doi: str, correct_doi: str) -> str:
+    """
+    Build the three-line "DOI mismatch" message for a cited and a correct DOI.
+    Both values are passed through str() before formatting.
+    Args:
+        cited_doi: DOI as cited in the reference
+        correct_doi: DOI it is being compared against
+    Returns:
+        Three-line formatted DOI mismatch message
+    """
+    cited_text = str(cited_doi)
+    correct_text = str(correct_doi)
+    return format_three_line_mismatch("DOI mismatch", cited_text, correct_text)
+def create_author_error(error_details: str, correct_authors: List[Dict[str, str]]) -> Dict[str, str]:
+    """
+    Create a standardized author error dictionary.
+    Args:
+        error_details: Description of the author error
+        correct_authors: List of correct author dictionaries; each entry's 'name'
+            value is used (an entry without 'name' contributes an empty string).
+            None or an empty list yields an empty author string.
+    Returns:
+        Dictionary with 'error_type' set to 'author', the given 'error_details',
+        and 'ref_authors_correct' holding the names joined with ", "
+    """
+    # Treat a missing author list as empty instead of raising TypeError while
+    # iterating over None.
+    names = [author.get('name', '') for author in (correct_authors or [])]
+    return {
+        'error_type': 'author',
+        'error_details': error_details,
+        'ref_authors_correct': ', '.join(names)
+    }
+def create_year_warning(cited_year: int, correct_year: int) -> Dict[str, Any]:
+    """
+    Build the warning dictionary for a year mismatch.
+    Args:
+        cited_year: Year as cited in the reference
+        correct_year: Correct year from database
+    Returns:
+        Dictionary with 'warning_type' set to 'year', 'warning_details' holding
+        the message from format_year_mismatch, and 'ref_year_correct' holding
+        correct_year unchanged
+    """
+    year_warning: Dict[str, Any] = {'warning_type': 'year'}
+    year_warning['warning_details'] = format_year_mismatch(cited_year, correct_year)
+    year_warning['ref_year_correct'] = correct_year
+    return year_warning
+def create_year_missing_error(correct_year: int) -> Dict[str, Any]:
+    """
+    Build the error dictionary for a reference that carries no year.
+    Args:
+        correct_year: Correct year from database
+    Returns:
+        Dictionary with 'error_type' set to 'year', an 'error_details' message
+        naming the year that should be included, and 'ref_year_correct'
+    """
+    message = f"Year missing: should include '{correct_year}'"
+    year_error: Dict[str, Any] = {'error_type': 'year'}
+    year_error['error_details'] = message
+    year_error['ref_year_correct'] = correct_year
+    return year_error
+def validate_year(cited_year: Optional[int], paper_year: Optional[int],
+                  year_tolerance: int = 1, use_flexible_validation: bool = False,
+                  context: Optional[Dict[str, Any]] = None) -> Optional[Dict[str, Any]]:
+    """
+    Check the cited year against the known year and report any problem found.
+    Outcomes, in the order they are tested:
+    - no known paper year: nothing can be validated, returns None
+    - no cited year: returns the dictionary from create_year_missing_error
+    - years differ (per the selected strategy): returns the dictionary from
+      create_year_warning
+    - otherwise: returns None
+    Args:
+        cited_year: Year as cited in the reference (may be None)
+        paper_year: Correct year from database/API (may be None)
+        year_tolerance: Maximum allowed difference between years (default 1);
+            only used by the simple strategy
+        use_flexible_validation: If True, delegate the decision to
+            is_year_substantially_different from refchecker.utils.text_utils
+        context: Optional context dict passed to the flexible check
+            (e.g., {'arxiv_match': True}); an empty dict is passed when omitted
+    Returns:
+        Warning or error dictionary if a year issue is found, None otherwise
+    """
+    if not paper_year:
+        # Without a reference value there is nothing to compare against
+        return None
+    if not cited_year:
+        # The paper has a year but the reference does not
+        return create_year_missing_error(paper_year)
+    if use_flexible_validation:
+        # Imported lazily, only when the flexible strategy is requested
+        from refchecker.utils.text_utils import is_year_substantially_different
+        differs, message = is_year_substantially_different(
+            cited_year, paper_year, context or {}
+        )
+        # The helper's message only gates the result; the returned warning
+        # text is always built by create_year_warning.
+        mismatch = differs and message
+    else:
+        mismatch = abs(cited_year - paper_year) > year_tolerance
+    if mismatch:
+        return create_year_warning(cited_year, paper_year)
+    return None
+def create_doi_error(cited_doi: str, correct_doi: str) -> Optional[Dict[str, str]]:
+    """
+    Build the DOI mismatch dictionary, as a warning or as an error.
+    The two DOIs are first compared with compare_dois; when it reports a match
+    nothing is returned. Otherwise validate_doi_resolves decides the severity:
+    a cited DOI that resolves is reported as a warning (papers can have several
+    valid DOIs, e.g., arXiv DOI vs conference DOI), one that does not resolve
+    is reported as an error.
+    Args:
+        cited_doi: DOI as cited in the reference
+        correct_doi: Correct DOI from database
+    Returns:
+        Warning dictionary ('warning_type'/'warning_details') or error dictionary
+        ('error_type'/'error_details'), each with 'ref_doi_correct'; None if
+        compare_dois considers the DOIs equal
+    """
+    from refchecker.utils.doi_utils import validate_doi_resolves, compare_dois
+    same_doi = compare_dois(cited_doi, correct_doi)
+    if same_doi:
+        return None
+    # A resolvable cited DOI is likely a valid alternate DOI, so it is only
+    # worth a warning.
+    cited_resolves = validate_doi_resolves(cited_doi)
+    details = format_doi_mismatch(cited_doi, correct_doi)
+    if not cited_resolves:
+        return {
+            'error_type': 'doi',
+            'error_details': details,
+            'ref_doi_correct': correct_doi
+        }
+    return {
+        'warning_type': 'doi',
+        'warning_details': details,
+        'ref_doi_correct': correct_doi
+    }
+def create_title_error(error_details: str, correct_title: str) -> Dict[str, str]:
+    """
+    Build the error dictionary for a title problem.
+    Args:
+        error_details: Description of the title error
+        correct_title: Correct title from database
+    Returns:
+        Dictionary with 'error_type' set to 'title', the given 'error_details',
+        and 'ref_title_correct' holding correct_title
+    """
+    title_error: Dict[str, str] = {'error_type': 'title'}
+    title_error['error_details'] = error_details
+    title_error['ref_title_correct'] = correct_title
+    return title_error
+def clean_venue_for_comparison(venue: str) -> str:
+    """
+    Clean a venue name by delegating to the shared display normalization.
+    This is a thin wrapper around normalize_venue_for_display from
+    refchecker.utils.text_utils, imported lazily on each call.
+    Args:
+        venue: Raw venue string
+    Returns:
+        Whatever normalize_venue_for_display returns for the venue
+    """
+    from refchecker.utils.text_utils import normalize_venue_for_display as _normalize
+    cleaned_venue = _normalize(venue)
+    return cleaned_venue
+def format_missing_venue(correct_venue: str) -> str:
+    """
+    Format a two-line "Missing venue" message showing only the actual venue.
+    Args:
+        correct_venue: The venue to display on the "actual:" line
+    Returns:
+        "Missing venue:" followed by a line with a 7-space indent, the label
+        "actual: " and the venue
+    """
+    indent = " " * 7
+    detail_line = f"{indent}actual: {correct_venue}"
+    return "\n".join(["Missing venue:", detail_line])
+def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]:
+    """
+    Build the dictionary describing a venue problem.
+    Both venues are cleaned with clean_venue_for_comparison for display. When
+    the cited venue cleans to an empty value while the correct one does not,
+    the result is a "missing venue" error instead of a mismatch warning.
+    Args:
+        cited_venue: Venue as cited in the reference
+        correct_venue: Correct venue from database
+    Returns:
+        Error dictionary ('error_type'/'error_details') for a missing venue, or
+        warning dictionary ('warning_type'/'warning_details') for a mismatch;
+        both carry the uncleaned correct_venue under 'ref_venue_correct'
+    """
+    shown_cited = clean_venue_for_comparison(cited_venue)
+    shown_correct = clean_venue_for_comparison(correct_venue)
+    cited_is_missing = not shown_cited and shown_correct
+    if cited_is_missing:
+        # Nothing usable was cited, so report the venue as missing
+        missing_details = format_missing_venue(shown_correct)
+        return {
+            'error_type': 'venue',
+            'error_details': missing_details,
+            'ref_venue_correct': correct_venue
+        }
+    mismatch_details = format_three_line_mismatch("Venue mismatch", shown_cited, shown_correct)
+    return {
+        'warning_type': 'venue',
+        'warning_details': mismatch_details,
+        'ref_venue_correct': correct_venue
+    }
+def format_venue_mismatch(cited_venue: str, verified_venue: str) -> str:
+    """
+    Format a three-line "Venue mismatch" message from cleaned venue names.
+    Args:
+        cited_venue: Venue as cited in the reference
+        verified_venue: Venue it is being compared against
+    Returns:
+        Three-line mismatch message built from both venues after each has been
+        passed through clean_venue_for_comparison
+    """
+    # Clean the cited venue first, then the verified one
+    cleaned = [clean_venue_for_comparison(name) for name in (cited_venue, verified_venue)]
+    return format_three_line_mismatch("Venue mismatch", *cleaned)
+def create_url_error(error_details: str, correct_url: Optional[str] = None) -> Dict[str, str]:
+    """
+    Build the error dictionary for a URL problem.
+    Args:
+        error_details: Description of the URL error
+        correct_url: Correct URL from database (optional)
+    Returns:
+        Dictionary with 'error_type' set to 'url' and the given 'error_details';
+        'ref_url_correct' is added only when correct_url is non-empty
+    """
+    url_error = {'error_type': 'url', 'error_details': error_details}
+    if not correct_url:
+        return url_error
+    url_error['ref_url_correct'] = correct_url
+    return url_error
+def create_generic_error(error_type: str, error_details: str, **kwargs) -> Dict[str, Any]:
+    """
+    Build an error dictionary with caller-supplied extra fields.
+    Args:
+        error_type: Type of error (e.g., 'author', 'doi', 'title')
+        error_details: Description of the error
+        **kwargs: Additional fields, added after the two standard keys
+    Returns:
+        Dictionary holding 'error_type', 'error_details' and every extra field
+    """
+    return {
+        'error_type': error_type,
+        'error_details': error_details,
+        **kwargs,
+    }
+def create_generic_warning(warning_type: str, warning_details: str, **kwargs) -> Dict[str, Any]:
+    """
+    Build a warning dictionary with caller-supplied extra fields.
+    Args:
+        warning_type: Type of warning (e.g., 'year', 'venue')
+        warning_details: Description of the warning
+        **kwargs: Additional fields, added after the two standard keys
+    Returns:
+        Dictionary holding 'warning_type', 'warning_details' and every extra field
+    """
+    return {
+        'warning_type': warning_type,
+        'warning_details': warning_details,
+        **kwargs,
+    }
+def create_generic_info(info_type: str, info_details: str, **kwargs) -> Dict[str, Any]:
+    """
+    Build an info dictionary with caller-supplied extra fields.
+    Args:
+        info_type: Type of info (e.g., 'url')
+        info_details: Description of the information
+        **kwargs: Additional fields, added after the two standard keys
+    Returns:
+        Dictionary holding 'info_type', 'info_details' and every extra field
+    """
+    return {
+        'info_type': info_type,
+        'info_details': info_details,
+        **kwargs,
+    }
+def create_info_message(reference, reason, arxiv_url=None):
+    """
+    Build an info message dictionary of type 'arxiv_url_available'.
+    Args:
+        reference: Value stored under 'reference'
+        reason: Value stored under 'reason'
+        arxiv_url: Optional value stored under 'arxiv_url'; the key is omitted
+            when the value is empty or None
+    Returns:
+        The info message dictionary
+    """
+    message = {
+        'info_type': 'arxiv_url_available',
+        'reference': reference,
+        'reason': reason
+    }
+    if not arxiv_url:
+        return message
+    message['arxiv_url'] = arxiv_url
+    return message
+def format_author_mismatch(author_number: int, cited_author: str, correct_author: str) -> str:
+    """
+    Format a three-line mismatch message for the author at a given position.
+    Args:
+        author_number: The author position (1-based), shown in the heading
+        cited_author: The cited author name
+        correct_author: The correct author name
+    Returns:
+        Three-line message headed "Author <author_number> mismatch"
+    """
+    heading = f"Author {author_number} mismatch"
+    return format_three_line_mismatch(heading, cited_author, correct_author)
+def format_first_author_mismatch(cited_author: str, correct_author: str) -> str:
+    """
+    Format a three-line mismatch message for the first author.
+    Args:
+        cited_author: The cited first author name
+        correct_author: The correct first author name
+    Returns:
+        Three-line message headed "First author mismatch"
+    """
+    heading = "First author mismatch"
+    return format_three_line_mismatch(heading, cited_author, correct_author)
+def format_author_count_mismatch(cited_count: int, correct_count: int, cited_authors: list, correct_authors: list) -> str:
+    """
+    Format an author count mismatch message listing both sets of authors.
+    The heading states both counts; the "cited" and "actual" lines list the
+    author names joined with ", ", or the word "None" for an empty list.
+    Args:
+        cited_count: Number of cited authors
+        correct_count: Number of correct authors
+        cited_authors: List of cited author names
+        correct_authors: List of correct author names
+    Returns:
+        Formatted multi-line author count mismatch message
+    """
+    def _joined(names: list) -> str:
+        # An empty or missing list is rendered as the literal text "None"
+        return ", ".join(names) if names else "None"
+    heading = f"Author count mismatch: {cited_count} cited vs {correct_count} correct"
+    cited_text = _joined(cited_authors)
+    correct_text = _joined(correct_authors)
+    # Same layout as every other mismatch message
+    return format_three_line_mismatch(heading, cited_text, correct_text)
+def format_authors_list(authors: List[Dict[str, str]]) -> str:
+    """
+    Join the names of a list of author dictionaries into one string.
+    Args:
+        authors: List of author data dictionaries; the 'name' value of each is
+            used, with an empty string for entries that have none
+    Returns:
+        The names joined with ", ", or an empty string when the list is empty
+        or None
+    """
+    if authors:
+        names = [entry.get('name', '') for entry in authors]
+        return ', '.join(names)
+    return ""
+def validate_error_dict(error_dict: Dict[str, Any], required_fields: List[str]) -> bool:
+    """
+    Check that an error dictionary contains every required field.
+    Only the presence of each key is tested, not its value.
+    Args:
+        error_dict: Error dictionary to validate
+        required_fields: List of required field names
+    Returns:
+        True if all required fields are present (including when none are
+        required), False otherwise
+    """
+    for name in required_fields:
+        if name not in error_dict:
+            return False
+    return True

refchecker/utils/mock_objects.py ADDED Viewed

@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+"""
+Mock objects and test utilities for ArXiv Reference Checker
+Provides shared mock objects for testing and development
+"""
+from typing import Dict, Any, List, Optional
+from dataclasses import dataclass
+@dataclass
+class MockPaper:
+    """Stand-in paper record for tests; only title and authors are required."""
+    title: str
+    authors: List[str]
+    abstract: str = ""
+    year: Optional[int] = None
+    venue: str = ""
+    url: str = ""
+    doi: str = ""
+    arxiv_id: str = ""
+    pdf_path: str = ""
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the fields as a dictionary keyed by field name (values are not copied)."""
+        keys = (
+            'title',
+            'authors',
+            'abstract',
+            'year',
+            'venue',
+            'url',
+            'doi',
+            'arxiv_id',
+            'pdf_path',
+        )
+        return {key: getattr(self, key) for key in keys}
+@dataclass
+class MockReference:
+    """
+    Mock reference object for testing.
+    Only raw_text is required. When authors is omitted or passed as None, each
+    instance receives its own new empty list.
+    """
+    raw_text: str
+    title: str = ""
+    # None is a placeholder default (a list literal cannot be a dataclass
+    # default); __post_init__ replaces it with a fresh list.
+    authors: Optional[List[str]] = None
+    venue: str = ""
+    year: Optional[int] = None
+    url: str = ""
+    doi: str = ""
+    arxiv_id: str = ""
+    def __post_init__(self):
+        """Replace a missing authors value with a new empty list."""
+        if self.authors is None:
+            self.authors = []
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary format, keyed by field name (values are not copied)."""
+        return {
+            'raw_text': self.raw_text,
+            'title': self.title,
+            'authors': self.authors,
+            'venue': self.venue,
+            'year': self.year,
+            'url': self.url,
+            'doi': self.doi,
+            'arxiv_id': self.arxiv_id
+        }
+class MockLLMProvider:
+    """Stand-in LLM provider that replays canned responses in order."""
+    def __init__(self, config: Dict[str, Any] = None):
+        # An omitted or empty config is stored as a new empty dict
+        self.config = config if config else {}
+        self.responses = []
+        self.call_count = 0
+    def set_responses(self, responses: List[List[str]]):
+        """Install the list of canned responses to be replayed."""
+        self.responses = responses
+    def extract_references(self, bibliography_text: str) -> List[str]:
+        """
+        Return the next canned response, or an empty list once they run out.
+        The bibliography text is ignored. The call counter advances only when
+        a canned response is actually served.
+        """
+        position = self.call_count
+        if position >= len(self.responses):
+            return []
+        served = self.responses[position]
+        self.call_count = position + 1
+        return served
+    def is_available(self) -> bool:
+        """Report the provider as available, unconditionally."""
+        return True
+class MockSemanticScholarAPI:
+    """Stand-in Semantic Scholar client that serves preset responses by key."""
+    def __init__(self):
+        self.responses = {}
+        self.call_count = 0
+    def set_response(self, query: str, response: Dict[str, Any]):
+        """Register the response to serve for a query or paper id."""
+        self.responses[query] = response
+    def search_papers(self, query: str) -> Dict[str, Any]:
+        """Count the call and return the preset response, or {'data': []} if none is set."""
+        self.call_count = self.call_count + 1
+        if query in self.responses:
+            return self.responses[query]
+        return {'data': []}
+    def get_paper_details(self, paper_id: str) -> Dict[str, Any]:
+        """Count the call and return the preset response, or {} if none is set."""
+        self.call_count = self.call_count + 1
+        if paper_id in self.responses:
+            return self.responses[paper_id]
+        return {}
+class MockArxivAPI:
+    """Stand-in ArXiv client that serves preset metadata by ArXiv ID."""
+    def __init__(self):
+        self.responses = {}
+        self.call_count = 0
+    def set_response(self, arxiv_id: str, response: Dict[str, Any]):
+        """Register the metadata to serve for an ArXiv ID."""
+        self.responses[arxiv_id] = response
+    def get_paper_metadata(self, arxiv_id: str) -> Dict[str, Any]:
+        """Count the call and return the preset metadata, or {} if none is set."""
+        self.call_count = self.call_count + 1
+        if arxiv_id in self.responses:
+            return self.responses[arxiv_id]
+        return {}
+def create_mock_config() -> Dict[str, Any]:
+    """
+    Build a fresh configuration dictionary for tests.
+    Returns:
+        Dictionary with three sections: 'llm' (mock provider settings),
+        'processing' (concurrency, delay and retry values) and 'apis'
+        (base URL and timeout for Semantic Scholar and ArXiv)
+    """
+    llm_section = {
+        'provider': 'mock',
+        'model': 'test-model',
+        'max_tokens': 1000,
+        'temperature': 0.1,
+        'timeout': 30
+    }
+    processing_section = {
+        'max_concurrent_requests': 5,
+        'request_delay': 0.1,
+        'retry_attempts': 3
+    }
+    semantic_scholar_api = {
+        'base_url': 'https://api.semanticscholar.org',
+        'timeout': 30
+    }
+    arxiv_api = {
+        'base_url': 'https://arxiv.org/api',
+        'timeout': 30
+    }
+    return {
+        'llm': llm_section,
+        'processing': processing_section,
+        'apis': {
+            'semantic_scholar': semantic_scholar_api,
+            'arxiv': arxiv_api
+        }
+    }
+def create_mock_paper(title: str = "Test Paper", authors: List[str] = None) -> MockPaper:
+    """
+    Build a MockPaper filled with fixed test values.
+    Args:
+        title: Paper title (default "Test Paper")
+        authors: Author names; when None, a new ["Test Author"] list is used
+    Returns:
+        MockPaper with the given title and authors and preset abstract, year,
+        venue, url, doi and arxiv_id (pdf_path keeps its class default)
+    """
+    paper_authors = ["Test Author"] if authors is None else authors
+    preset_fields = {
+        'abstract': "This is a test abstract.",
+        'year': 2023,
+        'venue': "Test Conference",
+        'url': "https://example.com/paper",
+        'doi': "10.1000/test",
+        'arxiv_id': "2023.12345",
+    }
+    return MockPaper(title=title, authors=paper_authors, **preset_fields)
+def create_mock_reference(raw_text: str = "Test Reference") -> MockReference:
+    """
+    Build a MockReference filled with fixed test values.
+    Args:
+        raw_text: Raw reference text (default "Test Reference")
+    Returns:
+        MockReference with the given raw text and preset title, authors, venue,
+        year, url and doi (arxiv_id keeps its class default)
+    """
+    preset_fields = {
+        'title': "Test Reference Title",
+        'authors': ["Test Author"],
+        'venue': "Test Journal",
+        'year': 2023,
+        'url': "https://example.com/reference",
+        'doi': "10.1000/test-ref",
+    }
+    return MockReference(raw_text=raw_text, **preset_fields)
+def create_mock_bibliography() -> str:
+    """
+    Return a four-entry numbered bibliography as one text block for tests.
+    Returns:
+        The entries "[1]" to "[4]", one per line, with a newline before the
+        first entry and after the last
+    """
+    entries = (
+        "[1] Smith, J., & Doe, J. (2023). A comprehensive study of machine learning. Journal of AI Research, 15(3), 123-145.",
+        "[2] Johnson, A. (2022). Deep learning fundamentals. In Proceedings of the International Conference on Neural Networks (pp. 67-89).",
+        "[3] Brown, M., Davis, K., & Wilson, L. (2023). Natural language processing advances. arXiv preprint arXiv:2023.45678.",
+        "[4] Taylor, R. (2021). Computer vision applications. IEEE Transactions on Pattern Analysis, 43(7), 1456-1478.",
+    )
+    # The text starts and ends with a newline, so wrap the joined entries
+    return "\n" + "\n".join(entries) + "\n"
+def create_mock_extracted_references() -> List[str]:
+    """
+    Return four reference strings for tests, without "[n]" numbering.
+    Returns:
+        A new list of four reference strings on every call
+    """
+    smith_doe = "Smith, J., & Doe, J. (2023). A comprehensive study of machine learning. Journal of AI Research, 15(3), 123-145."
+    johnson = "Johnson, A. (2022). Deep learning fundamentals. In Proceedings of the International Conference on Neural Networks (pp. 67-89)."
+    brown_et_al = "Brown, M., Davis, K., & Wilson, L. (2023). Natural language processing advances. arXiv preprint arXiv:2023.45678."
+    taylor = "Taylor, R. (2021). Computer vision applications. IEEE Transactions on Pattern Analysis, 43(7), 1456-1478."
+    return [smith_doe, johnson, brown_et_al, taylor]