academic-refchecker 1.2.51__py3-none-any.whl → 1.2.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __version__.py +1 -1
- {academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/METADATA +10 -1
- {academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/RECORD +10 -9
- checkers/pdf_paper_checker.py +493 -0
- checkers/webpage_checker.py +424 -1
- core/refchecker.py +94 -8
- {academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/top_level.txt +0 -0
__version__.py
CHANGED

(version string bumped from 1.2.51 to 1.2.52)

{academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.51
+Version: 1.2.52
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT
@@ -65,6 +65,14 @@ Dynamic: license-file

 A comprehensive tool for validating reference accuracy in academic papers, useful for both authors checking their bibliography and conference reviewers ensuring that paper references are authentic and accurate. This tool processes papers from various local and online sources including ArXiv, PDF files, LaTeX documents, and text files to verify the accuracy of references by comparing cited information against authoritative sources.

+## 🎥 Project Deep Dive
+
+Learn about RefChecker's design philosophy and development process in this detailed discussion between Mark Russinovich (RefChecker's author) and Scott Hanselman. Mark shares insights into how he leveraged AI coding assistants including Cursor, GitHub Copilot, and Claude to build this comprehensive academic reference validation tool.
+
+**[📺 Watch: "AI Coding with Mark Russinovich: Building RefChecker"](https://www.youtube.com/watch?v=n929Alz-fjo)**
+
+*This video provides valuable insights into modern AI-assisted development workflows and the technical decisions behind RefChecker's architecture.*
+
 ## 📊 Sample Output

 ```
@@ -117,6 +125,7 @@ A comprehensive tool for validating reference accuracy in academic papers, usefu

 ## 📋 Table of Contents

+- [🎥 Project Deep Dive](#-project-deep-dive)
 - [📊 Sample Output](#-sample-output)
 - [🎯 Features](#-features)
 - [🚀 Quick Start](#-quick-start)
{academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
-__version__.py,sha256=
-academic_refchecker-1.2.
+__version__.py,sha256=3kX5OAukU7mOMMYEni5E3TW6cnip3XwxplWJP4qANhU,65
+academic_refchecker-1.2.52.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
 checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
 checkers/crossref.py,sha256=cLYmSzE8ehJ5sNko_R3fEiGBGiPH5_HxLhFM-pCfDRM,20378
 checkers/enhanced_hybrid_checker.py,sha256=rbXkzpNkd0bn4e2OooX-CcdGTwwYpgmVaFvX_xCAFsA,27777
@@ -7,15 +7,16 @@ checkers/github_checker.py,sha256=BXJaBC3AloKze04j8EcQz0a79EhtVoi9_871ilV7t60,14
 checkers/local_semantic_scholar.py,sha256=D8py8-yMCgN1lvhXCiMUOEA4wBkH7AQvrkM4-3LCDsU,21015
 checkers/openalex.py,sha256=Fbc7iscZzmXjAZxH32PDX2r2Nwo9b5Ku-Sh1Ut9KpLA,19550
 checkers/openreview_checker.py,sha256=3ckn6U7TN5nQBjqPacr8W8mm2uMo6aWWB6gsxTDNCPk,40452
+checkers/pdf_paper_checker.py,sha256=L5HRHd3xpo0xDltZGTAA-Wk_arIS9bQV8ITeuxW0bNc,19893
 checkers/semantic_scholar.py,sha256=wk6e8DkYJM_O2nWsi-6EfJT53PzfL8KCmX1rS562KKc,34962
-checkers/webpage_checker.py,sha256=
+checkers/webpage_checker.py,sha256=REOotx7Qka86_xbOIMeYj5YVb9D1RVMb4Ye311-28cA,43620
 config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
 config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
 config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
 core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
 core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
 core/parallel_processor.py,sha256=cq_WfzXrF2EI6IKOtJd6_QcwvM1xT3J6a13teg-wSbM,17638
-core/refchecker.py,sha256=
+core/refchecker.py,sha256=rJ-CbCqN3dxzxCLr4DERq5UxWtVbErwCMyS3YUxdtuo,285500
 database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
 database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
 llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -39,8 +40,8 @@ utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
 utils/text_utils.py,sha256=T3PiiG9-BMPTbdCftG2zypyIeZJl6snuMCKQ0nEOQv0,217834
 utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
 utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
+academic_refchecker-1.2.52.dist-info/METADATA,sha256=PKCXz09omWTvIVLZGCgP3kt9yO_V-FjXDu-HHfedqUU,23256
+academic_refchecker-1.2.52.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+academic_refchecker-1.2.52.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+academic_refchecker-1.2.52.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+academic_refchecker-1.2.52.dist-info/RECORD,,
checkers/pdf_paper_checker.py
ADDED

@@ -0,0 +1,493 @@
#!/usr/bin/env python3
"""
PDF Paper Checker - Validates citations by extracting and analyzing PDF content
"""

import re
import io
import logging
from typing import Dict, List, Any, Optional, Tuple
from urllib.parse import urlparse

import requests
import pdfplumber
from pypdf import PdfReader
from fuzzywuzzy import fuzz
from bs4 import BeautifulSoup

from utils.text_utils import normalize_text, calculate_title_similarity

logger = logging.getLogger(__name__)


class PDFPaperChecker:
    """
    Checker that downloads and analyzes PDF documents to validate citations
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def can_check_reference(self, reference: Dict[str, Any]) -> bool:
        """
        Check if this reference can be validated by PDF analysis

        Args:
            reference: Reference dictionary containing url and other metadata

        Returns:
            True if reference has URL that likely points to a PDF
        """
        url = reference.get('url', '').strip()
        if not url:
            return False

        # Check if URL ends with .pdf
        if url.lower().endswith('.pdf'):
            return True

        # Check if URL path suggests PDF content
        pdf_indicators = ['/pdf/', '/document/', '/download/', '/file/', '/resource/']
        if any(indicator in url.lower() for indicator in pdf_indicators):
            return True

        # Check if URL is from domains that commonly serve PDFs directly
        domain = urlparse(url).netloc.lower()
        pdf_domains = [
            '.gov', '.edu', '.org',  # Common institutional domains
            'researchgate.net', 'academia.edu', 'arxiv.org',  # Academic platforms
            'oecd.org', 'who.int', 'unesco.org',  # International organizations
            'aecea.ca'  # Specific domain from the user's example
        ]

        if any(domain.endswith(pdf_domain) or pdf_domain in domain for pdf_domain in pdf_domains):
            return True

        return False

    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
        """
        Verify a reference by downloading and analyzing PDF content

        Args:
            reference: Reference dictionary with title, authors, year, url, etc.

        Returns:
            Tuple of (verified_data, errors, url) where:
            - verified_data: Dict with verified data if PDF validates citation, None otherwise
            - errors: List of error dictionaries
            - url: The URL that was checked
        """
        logger.debug(f"Verifying PDF reference: {reference.get('title', 'Untitled')}")

        url = reference.get('url', '').strip()
        if not url:
            return None, [{"error_type": "unverified", "error_details": "no URL provided"}], None

        try:
            # First try to download directly as PDF
            pdf_content = self._download_pdf(url)

            # If direct download fails, try to find PDF links in the page
            if not pdf_content:
                pdf_url = self._find_pdf_url_in_page(url)
                if pdf_url:
                    logger.debug(f"Found PDF link in page: {pdf_url}")
                    pdf_content = self._download_pdf(pdf_url)
                    url = pdf_url  # Update URL to the actual PDF URL

            if not pdf_content:
                return None, [{"error_type": "unverified", "error_details": "could not download PDF content"}], url

            # Extract text and metadata from PDF
            pdf_data = self._extract_pdf_data(pdf_content)
            if not pdf_data:
                return None, [{"error_type": "unverified", "error_details": "could not extract PDF content"}], url

            # Validate citation against PDF content
            is_valid, errors = self._validate_citation(reference, pdf_data)

            if is_valid:
                # Create verified data preserving original venue if provided
                venue = reference.get('journal') or reference.get('venue') or reference.get('booktitle') or 'PDF Document'

                verified_data = {
                    'title': reference.get('title', ''),
                    'authors': reference.get('authors', []),
                    'year': reference.get('year'),
                    'venue': venue,
                    'url': url,
                    'pdf_metadata': {
                        'extracted_title': pdf_data.get('title'),
                        'extracted_authors': pdf_data.get('authors'),
                        'extracted_text_preview': pdf_data.get('text', '')[:200] + '...' if pdf_data.get('text') else '',
                        'pdf_pages': pdf_data.get('page_count'),
                        'extraction_method': pdf_data.get('extraction_method')
                    }
                }
                logger.debug(f"PDF reference verified: {url}")
                return verified_data, errors, url
            else:
                return None, errors, url

        except Exception as e:
            logger.error(f"Error verifying PDF reference {url}: {e}")
            return None, [{"error_type": "unverified", "error_details": "PDF processing error"}], url

    def _download_pdf(self, url: str, timeout: int = 30) -> Optional[bytes]:
        """
        Download PDF content from URL

        Args:
            url: URL to download from
            timeout: Request timeout in seconds

        Returns:
            PDF content as bytes, or None if download failed
        """
        try:
            logger.debug(f"Downloading PDF from: {url}")

            response = self.session.get(url, timeout=timeout, stream=True)
            response.raise_for_status()

            # Check if content is actually a PDF
            content_type = response.headers.get('content-type', '').lower()
            if 'pdf' not in content_type and not url.lower().endswith('.pdf'):
                # Sometimes PDFs are served with generic content types, so we'll try anyway
                logger.debug(f"Content-Type '{content_type}' doesn't indicate PDF, but proceeding anyway")

            # Download content
            content = response.content

            # Basic PDF validation - check for PDF header
            if content.startswith(b'%PDF-'):
                logger.debug(f"Successfully downloaded PDF ({len(content)} bytes)")
                return content
            else:
                logger.debug("Downloaded content doesn't appear to be a valid PDF")
                return None

        except Exception as e:
            logger.error(f"Failed to download PDF from {url}: {e}")
            return None

    def _find_pdf_url_in_page(self, url: str) -> Optional[str]:
        """
        Look for PDF download links in a web page

        Args:
            url: URL of the web page to search

        Returns:
            URL of PDF document if found, None otherwise
        """
        try:
            logger.debug(f"Searching for PDF links in page: {url}")

            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            # Check if the response itself is a PDF (after redirects)
            content_type = response.headers.get('content-type', '').lower()
            if 'pdf' in content_type or response.content.startswith(b'%PDF-'):
                logger.debug("Page redirected directly to PDF")
                return response.url

            # Parse HTML to look for PDF links
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Look for links that might be PDFs
            pdf_links = []

            # Find all links
            for link in soup.find_all('a', href=True):
                href = link.get('href')
                link_text = link.get_text().lower().strip()

                # Check if link ends with .pdf
                if href and href.lower().endswith('.pdf'):
                    pdf_links.append(href)
                    continue

                # Check if link text suggests PDF
                if any(indicator in link_text for indicator in ['pdf', 'download', 'document', 'report', 'policy']):
                    pdf_links.append(href)
                    continue

                # Check if link has PDF-related attributes
                if link.get('type', '').lower() == 'application/pdf':
                    pdf_links.append(href)
                    continue

            # Look for PDF links in other elements
            for element in soup.find_all(attrs={'href': True}):
                href = element.get('href')
                if href and href.lower().endswith('.pdf'):
                    pdf_links.append(href)

            # Convert relative URLs to absolute
            from urllib.parse import urljoin
            absolute_pdf_links = []
            for link in pdf_links:
                if link:
                    absolute_url = urljoin(url, link)
                    absolute_pdf_links.append(absolute_url)

            # Remove duplicates
            absolute_pdf_links = list(set(absolute_pdf_links))

            if absolute_pdf_links:
                logger.debug(f"Found {len(absolute_pdf_links)} potential PDF links")
                # Return the first PDF link found
                return absolute_pdf_links[0]

            logger.debug("No PDF links found in page")
            return None

        except Exception as e:
            logger.error(f"Error searching for PDF links in {url}: {e}")
            return None

    def _extract_pdf_data(self, pdf_content: bytes) -> Optional[Dict[str, Any]]:
        """
        Extract text and metadata from PDF content

        Args:
            pdf_content: PDF file content as bytes

        Returns:
            Dictionary with extracted data including text, title, authors, etc.
        """
        pdf_data = {
            'text': '',
            'title': '',
            'authors': [],
            'page_count': 0,
            'extraction_method': 'none'
        }

        # Try multiple extraction methods
        try:
            # Method 1: Try pdfplumber (usually better for text extraction)
            pdf_data = self._extract_with_pdfplumber(pdf_content, pdf_data)
            if pdf_data['text']:
                pdf_data['extraction_method'] = 'pdfplumber'
                return pdf_data
        except Exception as e:
            logger.debug(f"pdfplumber extraction failed: {e}")

        try:
            # Method 2: Try pypdf (fallback)
            pdf_data = self._extract_with_pypdf(pdf_content, pdf_data)
            if pdf_data['text']:
                pdf_data['extraction_method'] = 'pypdf'
                return pdf_data
        except Exception as e:
            logger.debug(f"pypdf extraction failed: {e}")

        logger.debug("All PDF extraction methods failed")
        return None

    def _extract_with_pdfplumber(self, pdf_content: bytes, pdf_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract PDF data using pdfplumber"""
        with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
            pdf_data['page_count'] = len(pdf.pages)

            # Extract text from first few pages (usually contains title/author info)
            text_parts = []
            for i, page in enumerate(pdf.pages[:5]):  # First 5 pages should be enough
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text)

            pdf_data['text'] = '\n'.join(text_parts)

            # Try to extract title and author from first page
            if pdf.pages:
                first_page_text = pdf.pages[0].extract_text() or ''
                pdf_data['title'], pdf_data['authors'] = self._parse_title_and_authors(first_page_text)

        return pdf_data

    def _extract_with_pypdf(self, pdf_content: bytes, pdf_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract PDF data using pypdf"""
        reader = PdfReader(io.BytesIO(pdf_content))
        pdf_data['page_count'] = len(reader.pages)

        # Extract metadata
        if reader.metadata:
            if '/Title' in reader.metadata:
                pdf_data['title'] = str(reader.metadata['/Title'])
            if '/Author' in reader.metadata:
                pdf_data['authors'] = [str(reader.metadata['/Author'])]

        # Extract text from first few pages
        text_parts = []
        for i, page in enumerate(reader.pages[:5]):  # First 5 pages
            try:
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text)
            except Exception as e:
                logger.debug(f"Failed to extract text from page {i}: {e}")
                continue

        pdf_data['text'] = '\n'.join(text_parts)

        # If no metadata title/author, try to parse from text
        if not pdf_data['title'] and text_parts:
            title, authors = self._parse_title_and_authors(text_parts[0])
            if title and not pdf_data['title']:
                pdf_data['title'] = title
            if authors and not pdf_data['authors']:
                pdf_data['authors'] = authors

        return pdf_data

    def _parse_title_and_authors(self, text: str) -> Tuple[str, List[str]]:
        """
        Parse title and authors from PDF text

        Args:
            text: Text from first page of PDF

        Returns:
            Tuple of (title, authors_list)
        """
        lines = [line.strip() for line in text.split('\n') if line.strip()]

        if not lines:
            return '', []

        # The title is often the first meaningful line (after removing headers/footers)
        title = ''
        authors = []

        # Look for the title - usually first non-header line
        for i, line in enumerate(lines):
            # Skip obvious header/footer content
            if len(line) < 10 or any(skip in line.lower() for skip in ['page', 'doi:', 'http', 'www.', '@']):
                continue

            # Title is usually longer and on its own line
            if len(line) > 20 and not any(sep in line for sep in [',', ';']) and not line.endswith('.'):
                title = line

                # Authors often follow the title - look for patterns
                for j in range(i + 1, min(i + 5, len(lines))):
                    author_line = lines[j]

                    # Author lines often contain commas, "and", or institutional affiliations
                    if any(indicator in author_line.lower() for indicator in [',', ' and ', 'university', 'college', 'institute']):
                        # Clean up author line
                        author_text = re.sub(r'[0-9*†‡§¶#]', '', author_line)  # Remove superscript markers
                        if ',' in author_text:
                            authors.extend([name.strip() for name in author_text.split(',') if name.strip()])
                        else:
                            authors.append(author_text.strip())
                        break
                break

        return title, authors

    def _validate_citation(self, reference: Dict[str, Any], pdf_data: Dict[str, Any]) -> Tuple[bool, List[Dict[str, Any]]]:
        """
        Validate citation against extracted PDF data

        Args:
            reference: The citation being checked
            pdf_data: Extracted data from PDF

        Returns:
            Tuple of (is_valid, errors_list)
        """
        errors = []

        # Check title match
        cited_title = reference.get('title', '').strip()
        extracted_title = pdf_data.get('title', '').strip()
        pdf_text = pdf_data.get('text', '').lower()

        title_match = False

        if cited_title and extracted_title:
            # Compare titles directly
            similarity = calculate_title_similarity(cited_title, extracted_title)
            if similarity > 0.8:  # 80% similarity threshold
                title_match = True

        if not title_match and cited_title and pdf_text:
            # Check if cited title appears in PDF text
            cited_title_normalized = normalize_text(cited_title)
            if cited_title_normalized.lower() in pdf_text:
                title_match = True

        if not title_match:
            errors.append({
                "error_type": "unverified",
                "error_details": "title not found in PDF content"
            })

        # Check author match (more lenient since PDF author extraction is difficult)
        cited_authors = reference.get('authors', [])
        extracted_authors = pdf_data.get('authors', [])

        author_match = False

        if cited_authors and extracted_authors:
            # Check if any cited author appears in extracted authors
            for cited_author in cited_authors:
                for extracted_author in extracted_authors:
                    if self._authors_match(cited_author, extracted_author):
                        author_match = True
                        break
                if author_match:
                    break

        if not author_match and cited_authors and pdf_text:
            # Check if any cited author appears in PDF text
            for cited_author in cited_authors:
                author_normalized = normalize_text(cited_author)
                if author_normalized.lower() in pdf_text:
                    author_match = True
                    break

        # For PDF validation, we're more lenient with author matching since extraction is unreliable
        if not author_match and cited_authors:
            errors.append({
                "warning_type": "author",
                "warning_details": "authors not clearly identified in PDF content"
            })

        # A reference is valid if we found the title (author matching is optional due to extraction difficulties)
        is_valid = title_match

        return is_valid, errors

    def _authors_match(self, author1: str, author2: str) -> bool:
        """Check if two author names likely refer to the same person"""
        author1_norm = normalize_text(author1).lower()
        author2_norm = normalize_text(author2).lower()

        # Exact match
        if author1_norm == author2_norm:
            return True

        # Check similarity
        similarity = fuzz.ratio(author1_norm, author2_norm)
        if similarity > 85:  # 85% similarity threshold
            return True

        # Check if one name is contained in the other (handles "J. Smith" vs "John Smith")
        words1 = set(author1_norm.split())
        words2 = set(author2_norm.split())

        if words1.intersection(words2):
            return True

        return False
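For readers who want to try the new checker in isolation, here is a minimal, hypothetical usage sketch. It assumes the 1.2.52 wheel is installed so that the `checkers` and `utils` top-level packages from the RECORD above are importable, and that the PDF/HTML dependencies the module imports (pdfplumber, pypdf, fuzzywuzzy, beautifulsoup4, requests) are available; the reference fields and the URL are made-up placeholders, not data from the package.

```python
# Hypothetical exercise of the new PDFPaperChecker (sketch, not shipped code).
from checkers.pdf_paper_checker import PDFPaperChecker

reference = {
    'title': 'Example Policy Report on Early Learning',   # placeholder citation
    'authors': ['Jane Doe', 'John Smith'],                 # placeholder authors
    'year': 2023,
    'url': 'https://example.org/files/report.pdf',         # hypothetical PDF URL
}

checker = PDFPaperChecker()
if checker.can_check_reference(reference):
    verified_data, errors, checked_url = checker.verify_reference(reference)
    if verified_data:
        print(f"Verified via PDF at {checked_url}, venue: {verified_data['venue']}")
    else:
        # errors carries "unverified" entries and/or author warnings
        print(f"Not verified: {errors}")
```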
checkers/webpage_checker.py
CHANGED
@@ -512,4 +512,427 @@ class WebPageChecker:
                 "warning_details": "Access blocked by site but domain is trusted (likely bot protection)"
             })

-        return verified_data, errors, web_url
+        return verified_data, errors, web_url
+
+    def check_unverified_url_reference(self, reference: Dict[str, Any]) -> str:
+        """
+        Check a URL from an unverified reference to determine the specific unverified reason
+
+        Args:
+            reference: Reference dictionary with title, authors, year, url, etc.
+
+        Returns:
+            String with the specific unverified reason:
+            - "non-existent web page" if the page doesn't exist
+            - "paper not found and URL doesn't reference it" if page exists but doesn't contain title
+            - "paper not verified but URL references paper" if page exists and contains title
+        """
+        logger.debug(f"Checking unverified URL reference: {reference.get('title', 'Untitled')}")
+
+        # Extract URL from reference
+        web_url = reference.get('url', '').strip()
+        if not web_url:
+            return "paper not found and URL doesn't reference it"  # No URL to check
+
+        # Make request to check if page exists
+        response = self._respectful_request(web_url)
+        if response is None:
+            return "non-existent web page"
+
+        if response.status_code == 404:
+            return "non-existent web page"
+        elif response.status_code == 403:
+            # For blocked resources, we can't check content but assume page exists
+            return "paper not verified but URL references paper"
+        elif response.status_code != 200:
+            return "non-existent web page"
+
+        try:
+            # Parse HTML content to search for title
+            content_type = response.headers.get('content-type', '').lower()
+            if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
+                # For PDFs, we can't search content, so assume it's referenced if accessible
+                return "paper not verified but URL references paper"
+
+            # Parse HTML content
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Extract page content for searching
+            page_title = self._extract_page_title(soup)
+            page_description = self._extract_description(soup)
+
+            # Get the full page text for comprehensive searching
+            page_text = soup.get_text().lower()
+
+            # Get the reference title to search for
+            cited_title = reference.get('title', '').strip()
+            if not cited_title:
+                return "paper not found and URL doesn't reference it"
+
+            # Search for the title in various ways
+            cited_title_lower = cited_title.lower()
+
+            # Direct search in page text
+            if cited_title_lower in page_text:
+                return "paper not verified but URL references paper"
+
+            # Search for key words from the title
+            cited_words = set(word.strip('.,;:()[]{}') for word in cited_title_lower.split()
+                              if len(word.strip('.,;:()[]{}')) > 3)
+
+            # Check if significant portion of title words appear in page
+            page_words = set(word.strip('.,;:()[]{}') for word in page_text.split()
+                             if len(word.strip('.,;:()[]{}')) > 3)
+
+            common_words = cited_words.intersection(page_words)
+
+            # If most of the title words are found, consider it referenced
+            if len(common_words) >= max(1, len(cited_words) * 0.6):  # At least 60% of words match
+                return "paper not verified but URL references paper"
+
+            # Also check the extracted title and description specifically
+            if page_title:
+                if self._check_title_match(cited_title, page_title, page_description):
+                    return "paper not verified but URL references paper"
+
+            # Title not found in page content
+            return "paper not found and URL doesn't reference it"
+
+        except Exception as e:
+            logger.error(f"Error checking unverified URL {web_url}: {e}")
+            return "paper not found and URL doesn't reference it"
+
+    def verify_raw_url_for_unverified_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+        """
+        Verify a raw URL from an unverified reference - can return verified data if appropriate
+
+        Args:
+            reference: Reference dictionary with title, authors, year, url, etc.
+
+        Returns:
+            Tuple of (verified_data, errors, url) where:
+            - verified_data: Dict with verified data if URL should be considered verified, None otherwise
+            - errors: List of error dictionaries with specific unverified reasons
+            - url: The URL that was checked
+        """
+        logger.debug(f"Verifying raw URL for unverified reference: {reference.get('title', 'Untitled')}")
+
+        # Extract URL from reference
+        web_url = reference.get('url', '').strip()
+        if not web_url:
+            return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], None
+
+        # Make request to check if page exists
+        response = self._respectful_request(web_url)
+        if response is None:
+            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+
+        if response.status_code == 404:
+            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+        elif response.status_code == 403:
+            # For blocked resources, we can't check content but assume page exists
+            # If no venue, treat as verified since URL is accessible
+            if not reference.get('journal') and not reference.get('venue') and not reference.get('booktitle'):
+                verified_data = {
+                    'title': reference.get('title', ''),
+                    'authors': reference.get('authors', []),
+                    'year': reference.get('year'),
+                    'venue': 'Web Page',
+                    'url': web_url,
+                    'web_metadata': {
+                        'status_code': 403,
+                        'access_blocked': True
+                    }
+                }
+                return verified_data, [], web_url
+            else:
+                return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+        elif response.status_code != 200:
+            return None, [{"error_type": "unverified", "error_details": "non-existent web page"}], web_url
+
+        try:
+            # Parse HTML content to search for title
+            content_type = response.headers.get('content-type', '').lower()
+            if 'pdf' in content_type or web_url.lower().endswith('.pdf'):
+                # For PDFs, if no venue specified, treat as verified
+                if not reference.get('journal') and not reference.get('venue') and not reference.get('booktitle'):
+                    verified_data = {
+                        'title': reference.get('title', ''),
+                        'authors': reference.get('authors', []),
+                        'year': reference.get('year'),
+                        'venue': 'PDF Document',
+                        'url': web_url,
+                        'web_metadata': {
+                            'content_type': response.headers.get('content-type', ''),
+                            'status_code': response.status_code
+                        }
+                    }
+                    return verified_data, [], web_url
+                else:
+                    return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+
+            # Parse HTML content
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Extract page content for searching
+            page_title = self._extract_page_title(soup)
+            page_description = self._extract_description(soup)
+
+            # Get the full page text for comprehensive searching
+            page_text = soup.get_text().lower()
+
+            # Get the reference title to search for
+            cited_title = reference.get('title', '').strip()
+            if not cited_title:
+                return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
+            # Search for the title in various ways
+            cited_title_lower = cited_title.lower()
+            title_found = False
+
+            # Direct search in page text
+            if cited_title_lower in page_text:
+                title_found = True
+
+            # Search for key words from the title
+            if not title_found:
+                cited_words = set(word.strip('.,;:()[]{}') for word in cited_title_lower.split()
+                                  if len(word.strip('.,;:()[]{}')) > 3)
+
+                # Check if significant portion of title words appear in page
+                page_words = set(word.strip('.,;:()[]{}') for word in page_text.split()
+                                 if len(word.strip('.,;:()[]{}')) > 3)
+
+                common_words = cited_words.intersection(page_words)
+
+                # If most of the title words are found, consider it referenced
+                if len(common_words) >= max(1, len(cited_words) * 0.6):  # At least 60% of words match
+                    title_found = True
+
+            # Also check the extracted title and description specifically
+            if not title_found and page_title:
+                if self._check_title_match(cited_title, page_title, page_description):
+                    title_found = True
+
+            # Determine if this should be verified or unverified
+            if title_found:
+                # Check if reference should be verified based on venue type
+                venue_field = reference.get('journal') or reference.get('venue') or reference.get('booktitle')
+
+                if not venue_field:
+                    # No venue specified - verify with URL as venue
+                    site_info = self._extract_site_info(soup, web_url)
+                    venue = site_info.get('organization', 'Web Page') if site_info.get('organization') != site_info.get('domain') else 'Web Page'
+
+                    verified_data = {
+                        'title': reference.get('title', ''),
+                        'authors': reference.get('authors', []),
+                        'year': reference.get('year'),
+                        'venue': venue,
+                        'url': web_url,
+                        'web_metadata': {
+                            'page_title': page_title,
+                            'description': page_description,
+                            'site_info': site_info,
+                            'final_url': response.url,
+                            'status_code': response.status_code
+                        }
+                    }
+                    logger.debug(f"URL verified as valid source for reference without venue: {web_url}")
+                    return verified_data, [], web_url
+                elif self._is_web_content_venue(venue_field, web_url):
+                    # Has venue but it's a web content venue (news, blog, etc.) - verify it
+                    verified_data = {
+                        'title': reference.get('title', ''),
+                        'authors': reference.get('authors', []),
+                        'year': reference.get('year'),
+                        'venue': venue_field,  # Keep the original venue
+                        'url': web_url,
+                        'web_metadata': {
+                            'page_title': page_title,
+                            'description': page_description,
+                            'site_info': self._extract_site_info(soup, web_url),
+                            'final_url': response.url,
+                            'status_code': response.status_code
+                        }
+                    }
+                    logger.debug(f"URL verified as valid web content source: {web_url}")
+                    return verified_data, [], web_url
+                else:
+                    # Has academic venue but URL references paper - still unverified (needs proper paper verification)
+                    return None, [{"error_type": "unverified", "error_details": "paper not verified but URL references paper"}], web_url
+            else:
+                # Title not found in page content
+                return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
+        except Exception as e:
+            logger.error(f"Error checking raw URL {web_url}: {e}")
+            return None, [{"error_type": "unverified", "error_details": "paper not found and URL doesn't reference it"}], web_url
+
+    def _is_web_content_venue(self, venue: str, url: str) -> bool:
+        """
+        Determine if a venue represents web content rather than academic publication
+
+        Args:
+            venue: The venue string (journal, venue, or booktitle)
+            url: The URL being checked (for additional context)
+
+        Returns:
+            True if this represents web content that can be verified via URL
+        """
+        if not venue:
+            return False
+
+        venue_lower = venue.lower().strip()
+
+        # News organizations and media outlets
+        news_indicators = [
+            'news', 'cbc', 'bbc', 'cnn', 'reuters', 'associated press', 'ap news',
+            'npr', 'pbs', 'abc news', 'nbc news', 'fox news', 'guardian', 'times',
+            'post', 'herald', 'tribune', 'gazette', 'chronicle', 'observer',
+            'magazine', 'weekly', 'daily', 'today', 'report', 'wire', 'press'
+        ]
+
+        # Special case for Wall Street Journal
+        if any(word in venue_lower for word in ['wall street', 'wsj']):
+            news_indicators.append('journal')
+
+        # Technology and industry publications
+        tech_publications = [
+            'techcrunch', 'wired', 'ars technica', 'the verge', 'engadget',
+            'zdnet', 'cnet', 'computerworld', 'infoworld', 'pcmag', 'pcworld',
+            'ieee spectrum', 'mit technology review', 'scientific american'
+        ]
+
+        # Blogs and web platforms
+        blog_platforms = [
+            'blog', 'medium', 'substack', 'wordpress', 'blogspot', 'tumblr',
+            'linkedin', 'facebook', 'twitter', 'reddit', 'stack overflow',
+            'github pages', 'personal website', 'company blog'
+        ]
+
+        # Government and organizational websites
+        org_indicators = [
+            'government', 'gov', '.org', 'agency', 'department', 'ministry',
+            'commission', 'bureau', 'office', 'administration', 'institute',
+            'foundation', 'association', 'society', 'center', 'centre',
+            'council', 'committee', 'board', 'union', 'federation', 'alliance',
+            'coalition', 'consortium', 'network', 'group', 'organization',
+            'organisation', 'corp', 'corporation', 'company', 'ltd', 'inc'
+        ]
+
+        # Documentation and technical resources
+        tech_resources = [
+            'documentation', 'docs', 'api', 'reference', 'guide', 'tutorial',
+            'manual', 'readme', 'wiki', 'help', 'support', 'developer',
+            'technical', 'white paper', 'whitepaper', 'brief', 'overview',
+            'policy', 'strategy', 'report', 'study', 'analysis', 'research'
+        ]
+
+        # Check URL domain for additional context
+        url_lower = url.lower() if url else ''
+
+        # Known web content domains in URL
+        web_domains = [
+            'cbc.ca', 'bbc.com', 'cnn.com', 'reuters.com', 'npr.org', 'pbs.org',
+            'nytimes.com', 'washingtonpost.com', 'theguardian.com', 'wsj.com',
+            'techcrunch.com', 'wired.com', 'theverge.com', 'arstechnica.com',
+            'medium.com', 'substack.com', 'linkedin.com', 'github.io',
+            'readthedocs.io', 'stackoverflow.com', 'reddit.com'
+        ]
+
+        # Combine all indicators
+        all_indicators = news_indicators + tech_publications + blog_platforms + org_indicators + tech_resources
+
+        # Academic venue indicators that should NOT be considered web content
+        academic_indicators = [
+            'proceedings', 'conference', 'symposium', 'workshop', 'transactions',
+            'journal of', 'international journal', 'acm', 'ieee', 'springer',
+            'nature', 'science', 'cell', 'lancet', 'plos', 'arxiv', 'pubmed',
+            'artificial intelligence', 'machine learning', 'computer vision',
+            'neural', 'computing', 'robotics', 'bioinformatics'
+        ]
+
+        # Check if venue is clearly academic (should not be treated as web content)
+        is_academic = any(indicator in venue_lower for indicator in academic_indicators)
+        if is_academic:
+            return False
+
+        # Check if venue matches any web content indicators
+        venue_matches = any(indicator and indicator in venue_lower for indicator in all_indicators)
+
+        # Check if URL domain suggests web content
+        url_matches = any(domain in url_lower for domain in web_domains)
+
+        # Special case: if URL contains news/blog/docs indicators, lean towards web content
+        url_content_indicators = ['news', 'blog', 'post', 'article', 'docs', 'help', 'guide', 'resources', 'policy', 'strategy']
+        url_has_content_indicators = any(indicator in url_lower for indicator in url_content_indicators)
+
+        # Special case: Check if venue is an organizational acronym/name that matches the URL domain
+        # This handles cases like "AECEA" on aecea.ca domain
+        organizational_match = self._check_organizational_venue_match(venue, url_lower)
+
+        return venue_matches or url_matches or url_has_content_indicators or organizational_match
+
+    def _check_organizational_venue_match(self, venue: str, url_lower: str) -> bool:
+        """
+        Check if the venue represents an organization that matches the URL domain
+
+        Args:
+            venue: The venue string
+            url_lower: The lowercased URL
+
+        Returns:
+            True if venue appears to be the organization publishing on their own domain
+        """
+        if not venue or not url_lower:
+            return False
+
+        venue_lower = venue.lower().strip()
+
+        # Extract domain from URL
+        from urllib.parse import urlparse
+        try:
+            parsed_url = urlparse(url_lower)
+            domain = parsed_url.netloc.lower()
+
+            # Remove common prefixes
+            domain = domain.replace('www.', '')
+
+            # Check if venue is likely an acronym (short, all caps or mixed case)
+            is_likely_acronym = (len(venue) <= 10 and
+                                 (venue.isupper() or
+                                  any(c.isupper() for c in venue) and len(venue.split()) == 1))
+
+            # Check if venue appears in domain
+            venue_clean = ''.join(c for c in venue_lower if c.isalnum())
+
+            if venue_clean and venue_clean in domain:
+                return True
+
+            # For acronyms, check if the acronym could match the domain
+            if is_likely_acronym:
+                # Split venue into words and check if initials match domain
+                venue_words = venue_lower.replace('.', ' ').split()
+                if len(venue_words) == 1 and len(venue_words[0]) <= 6:
+                    # Single word acronym - check if it's in the domain
+                    if venue_words[0] in domain:
+                        return True
+
+            # Check for educational/professional associations with .ca, .org, .edu domains
+            if any(domain.endswith(tld) for tld in ['.ca', '.org', '.edu', '.gov']):
+                # These domains often host organizational content
+                if any(org_word in venue_lower for org_word in [
+                    'association', 'society', 'institute', 'foundation', 'center',
+                    'centre', 'council', 'committee', 'board', 'agency', 'department'
+                ]):
+                    return True
+
+                # Check if venue is a short organizational name/acronym
+                if is_likely_acronym:
+                    return True
+
+            return False
+
+        except Exception:
+            return False
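A small sketch of how the two new WebPageChecker entry points might be driven directly; the reference values and URL are invented for illustration, and the bare `WebPageChecker()` construction simply mirrors how `core/refchecker.py` instantiates it in the hunks below.

```python
# Sketch: probing the raw-URL helpers added to WebPageChecker in 1.2.52.
from checkers.webpage_checker import WebPageChecker

reference = {
    'title': 'Funding Update for Early Childhood Programs',   # placeholder title
    'authors': ['AECEA'],                                      # placeholder author
    'year': 2022,
    'venue': 'AECEA',                        # organizational venue on its own domain
    'url': 'https://aecea.ca/resources/funding-update',        # hypothetical URL
}

checker = WebPageChecker()

# 1) Classify why an unverified reference stayed unverified.
reason = checker.check_unverified_url_reference(reference)
print(reason)  # one of the three reason strings documented above

# 2) Try to promote the reference: returns verified_data when the page exists,
#    mentions the title, and the venue looks like web content.
verified_data, errors, url = checker.verify_raw_url_for_unverified_reference(reference)
print(verified_data is not None, errors)
```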
core/refchecker.py
CHANGED
@@ -2021,8 +2021,20 @@ class ArxivReferenceChecker:
             logger.debug(f"Database mode: Initial paper_url from database checker: {paper_url}")

             if not verified_data:
-                # Mark as unverified but
-
+                # Mark as unverified but check URL for more specific reason or verification
+                if reference.get('url', '').strip():
+                    # Use raw URL verifier to check if it can be verified or get specific reason
+                    url_verified_data, url_errors, url_checked = self.verify_raw_url_reference(reference)
+                    if url_verified_data:
+                        # URL verification succeeded - return as verified
+                        logger.debug(f"Database mode: URL verification succeeded for unverified reference")
+                        return None, url_checked, url_verified_data
+                    else:
+                        # URL verification failed - use specific error reason
+                        url_error_details = url_errors[0].get('error_details', 'Reference could not be verified in database') if url_errors else 'Reference could not be verified in database'
+                        return [{"error_type": "unverified", "error_details": url_error_details}], paper_url, None
+                else:
+                    return [{"error_type": "unverified", "error_details": "Reference could not be verified in database"}], paper_url, None

             # Convert database errors to our format
             formatted_errors = []
@@ -2118,7 +2130,29 @@ class ArxivReferenceChecker:
             return [{"error_type": "unverified", "error_details": "Database connection not available"}], None, None

         # For non-database mode, use the standard reference verification
-
+        errors, paper_url, verified_data = self.verify_reference_standard(source_paper, reference)
+
+        # If standard verification failed and the reference has a URL, try raw URL verification
+        if errors and verified_data is None:
+            # Check if there's an unverified error
+            unverified_errors = [e for e in errors if e.get('error_type') == 'unverified']
+            if unverified_errors and reference.get('url', '').strip():
+                # Use raw URL verifier to check if it can be verified or get specific reason
+                url_verified_data, url_errors, url_checked = self.verify_raw_url_reference(reference)
+                if url_verified_data:
+                    # URL verification succeeded - return as verified
+                    logger.debug(f"Non-database mode: URL verification succeeded for unverified reference")
+                    return None, url_checked, url_verified_data
+                else:
+                    # URL verification failed - use specific error reason
+                    url_error_details = url_errors[0].get('error_details', 'Reference could not be verified') if url_errors else 'Reference could not be verified'
+                    # Update the unverified error with the specific reason
+                    for error in errors:
+                        if error.get('error_type') == 'unverified':
+                            error['error_details'] = url_error_details
+                            break
+
+        return errors, paper_url, verified_data


     def verify_github_reference(self, reference):
@@ -2253,6 +2287,55 @@ class ArxivReferenceChecker:
             formatted_errors.append(formatted_error)
         return formatted_errors if formatted_errors else [{"error_type": "unverified", "error_details": "Web page could not be verified"}], page_url, None

+    def verify_raw_url_reference(self, reference):
+        """
+        Verify a raw URL from an unverified reference - can return verified data if appropriate
+
+        Args:
+            reference: The reference to verify (already determined to be unverified by paper validators)
+
+        Returns:
+            Tuple of (verified_data, errors, url) where:
+            - verified_data: Dict with verified data if URL should be considered verified, None otherwise
+            - errors: List of error dictionaries
+            - url: The URL that was checked
+        """
+        logger.debug(f"Checking raw URL for unverified reference: {reference.get('title', 'Untitled')}")
+
+        # Extract URL from reference
+        web_url = reference.get('url', '').strip()
+        if not web_url:
+            return None, [{"error_type": "unverified", "error_details": "Reference could not be verified"}], None
+
+        # First try PDF paper checker if URL appears to be a PDF
+        from checkers.pdf_paper_checker import PDFPaperChecker
+        pdf_checker = PDFPaperChecker()
+
+        if pdf_checker.can_check_reference(reference):
+            logger.debug(f"URL appears to be PDF, trying PDF verification: {web_url}")
+            try:
+                verified_data, errors, url = pdf_checker.verify_reference(reference)
+                if verified_data:
+                    logger.debug(f"PDF verification successful for: {reference.get('title', 'Untitled')}")
+                    return verified_data, errors, url
+                else:
+                    logger.debug(f"PDF verification failed, falling back to web page verification")
+            except Exception as e:
+                logger.error(f"Error in PDF verification: {e}")
+                logger.debug(f"PDF verification error, falling back to web page verification")
+
+        # Fall back to web page checker
+        from checkers.webpage_checker import WebPageChecker
+        webpage_checker = WebPageChecker()
+
+        try:
+            verified_data, errors, url = webpage_checker.verify_raw_url_for_unverified_reference(reference)
+            logger.debug(f"Raw URL verification result: verified_data={verified_data is not None}, errors={len(errors)}, url={url}")
+            return verified_data, errors, url
+        except Exception as e:
+            logger.error(f"Error checking raw URL: {e}")
+            return None, [{"error_type": "unverified", "error_details": "Reference could not be verified"}], web_url
+
     def verify_reference_standard(self, source_paper, reference):
         """
         Verify if a reference is accurate using GitHub, Semantic Scholar, or other checkers
@@ -2274,11 +2357,6 @@ class ArxivReferenceChecker:
         if github_result:
             return github_result

-        # Next, check if this is a web page reference
-        webpage_result = self.verify_webpage_reference(reference)
-        if webpage_result:
-            return webpage_result
-
         # Use the Semantic Scholar client to verify the reference
         verified_data, errors, paper_url = self.non_arxiv_checker.verify_reference(reference)

@@ -5515,6 +5593,14 @@ class ArxivReferenceChecker:
         """Categorize the unverified error into checker error or not found"""
         error_details_lower = error_details.lower()

+        # New specific URL-based unverified reasons
+        if error_details_lower == "non-existent web page":
+            return "Non-existent web page"
+        elif error_details_lower == "paper not found and url doesn't reference it":
+            return "Paper not found and URL doesn't reference it"
+        elif error_details_lower == "paper not verified but url references paper":
+            return "Paper not verified but URL references paper"
+
         # Checker/API errors
         api_error_patterns = [
             'api error', 'rate limit', 'http error', 'network error',
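Read together, the refchecker.py hunks wire the two new checkers into a single URL fallback for references that the database or standard checkers could not confirm. The outline below is an illustrative condensation of that flow, not the shipped code; it reuses only names that appear in the diff (verify_reference_standard, verify_raw_url_reference), while `verify_with_url_fallback` and the `checker` argument are hypothetical names introduced for this sketch.

```python
# Illustrative condensation of the new fallback path (not the shipped code).
# `checker` is assumed to be an ArxivReferenceChecker instance exposing the
# two methods shown in the hunks above.
def verify_with_url_fallback(checker, source_paper, reference):
    errors, paper_url, verified_data = checker.verify_reference_standard(source_paper, reference)

    still_unverified = verified_data is None and errors and any(
        e.get('error_type') == 'unverified' for e in errors)

    if still_unverified and reference.get('url', '').strip():
        # verify_raw_url_reference tries PDFPaperChecker first, then falls back
        # to WebPageChecker.verify_raw_url_for_unverified_reference.
        url_verified, url_errors, url_checked = checker.verify_raw_url_reference(reference)
        if url_verified:
            return None, url_checked, url_verified
        if url_errors:
            # Surface the more specific reason, e.g. "non-existent web page".
            for e in errors:
                if e.get('error_type') == 'unverified':
                    e['error_details'] = url_errors[0].get('error_details', e['error_details'])
                    break

    return errors, paper_url, verified_data
```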
{academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/WHEEL
RENAMED
File without changes
{academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/entry_points.txt
RENAMED
File without changes
{academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/licenses/LICENSE
RENAMED
File without changes
{academic_refchecker-1.2.51.dist-info → academic_refchecker-1.2.52.dist-info}/top_level.txt
RENAMED
File without changes