academic-refchecker 2.0.19__py3-none-any.whl → 2.0.21__py3-none-any.whl
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- {academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/METADATA +74 -32
- {academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/RECORD +14 -14
- backend/database.py +126 -5
- backend/main.py +450 -3
- backend/refchecker_wrapper.py +109 -19
- refchecker/__version__.py +1 -1
- refchecker/checkers/semantic_scholar.py +2 -2
- refchecker/core/refchecker.py +46 -0
- refchecker/services/pdf_processor.py +156 -1
- refchecker/utils/text_utils.py +3 -1
- {academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/WHEEL +0 -0
- {academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/top_level.txt +0 -0
backend/refchecker_wrapper.py
CHANGED
@@ -7,10 +7,18 @@ import re
 import asyncio
 import logging
 import tempfile
+import time
 from concurrent.futures import ThreadPoolExecutor
 from typing import List, Dict, Any, Optional, Callable
 from pathlib import Path
 
+# Debug file logging
+DEBUG_LOG_FILE = Path(tempfile.gettempdir()) / "refchecker_debug.log"
+def debug_log(msg: str):
+    from datetime import datetime
+    with open(DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
+        f.write(f"{datetime.now().strftime('%H:%M:%S.%f')[:12]} {msg}\n")
+
 # Add src to path to import refchecker when running from source
 # This is only needed when not installed as a package
 _src_path = str(Path(__file__).parent.parent / "src")
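The new `debug_log` helper appends timestamped lines to a file in the system temp directory. A minimal standalone sketch of the same pattern (the `refchecker_debug.log` filename is taken from the diff; everything else is standard library):

```python
import tempfile
from pathlib import Path
from datetime import datetime

DEBUG_LOG_FILE = Path(tempfile.gettempdir()) / "refchecker_debug.log"

def debug_log(msg: str) -> None:
    # Append a line like "14:03:07.123 message" to the shared debug file.
    # Opening in append mode per call keeps the helper usable from any
    # thread, at the cost of one open/close per message.
    with open(DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
        f.write(f"{datetime.now().strftime('%H:%M:%S.%f')[:12]} {msg}\n")

debug_log("[TIMING] example message")
print(f"wrote to {DEBUG_LOG_FILE}")
```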
@@ -89,7 +97,8 @@ class ProgressRefChecker:
                  cancel_event: Optional[asyncio.Event] = None,
                  check_id: Optional[int] = None,
                  title_update_callback: Optional[Callable] = None,
-                 bibliography_source_callback: Optional[Callable] = None):
+                 bibliography_source_callback: Optional[Callable] = None,
+                 semantic_scholar_api_key: Optional[str] = None):
         """
         Initialize the progress-aware refchecker
 
@@ -135,8 +144,12 @@ class ProgressRefChecker:
             logger.error(f"Failed to initialize LLM: {e}")
 
         # Initialize reference checker
+        # Use provided API key, fall back to environment variable
+        ss_api_key = semantic_scholar_api_key or os.getenv('SEMANTIC_SCHOLAR_API_KEY')
+        if ss_api_key:
+            logger.info("Semantic Scholar API key configured")
         self.checker = EnhancedHybridReferenceChecker(
-            semantic_scholar_api_key=os.getenv('SEMANTIC_SCHOLAR_API_KEY'),
+            semantic_scholar_api_key=ss_api_key,
             debug_mode=False
         )
 
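The key-resolution change follows a common pattern: prefer an explicitly passed credential, fall back to the environment. A minimal sketch (the function name here is illustrative, not from the package):

```python
import os
from typing import Optional

def resolve_api_key(explicit_key: Optional[str] = None) -> Optional[str]:
    # An explicitly provided key wins; otherwise consult the environment.
    # Returns None when neither is set, which the caller treats as
    # "use the API without authentication".
    return explicit_key or os.getenv("SEMANTIC_SCHOLAR_API_KEY")

print(resolve_api_key("sk-example"))  # -> "sk-example"
print(resolve_api_key())              # -> value of $SEMANTIC_SCHOLAR_API_KEY, or None
```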
@@ -291,7 +304,7 @@ class ProgressRefChecker:
             "authoritative_urls": authoritative_urls,
             "corrected_reference": None
         }
-        logger.debug(f"_format_verification_result output: suggestions={formatted_suggestions}, status={status}")
+        logger.info(f"_format_verification_result output: suggestions={formatted_suggestions}, status={status}")
         return result
 
     def _format_error_result(
@@ -394,16 +407,32 @@ class ProgressRefChecker:
 
                 await asyncio.to_thread(download_pdf_url)
 
-                # Extract title from PDF filename or URL
-                from urllib.parse import urlparse, unquote
-                url_path = urlparse(paper_source).path
-                pdf_filename = unquote(url_path.split('/')[-1])
-                paper_title = pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
-                await update_title_if_needed(paper_title)
-
                 extraction_method = 'pdf'
                 pdf_processor = PDFProcessor()
                 paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, pdf_path)
+
+                # Try to extract the paper title from the PDF content
+                try:
+                    extracted_title = await asyncio.to_thread(pdf_processor.extract_title_from_pdf, pdf_path)
+                    if extracted_title:
+                        paper_title = extracted_title
+                        await update_title_if_needed(paper_title)
+                        logger.info(f"Extracted title from PDF URL: {paper_title}")
+                    else:
+                        # Fallback to URL filename
+                        from urllib.parse import urlparse, unquote
+                        url_path = urlparse(paper_source).path
+                        pdf_filename = unquote(url_path.split('/')[-1])
+                        paper_title = pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
+                        await update_title_if_needed(paper_title)
+                except Exception as e:
+                    logger.warning(f"Could not extract title from PDF: {e}")
+                    # Fallback to URL filename
+                    from urllib.parse import urlparse, unquote
+                    url_path = urlparse(paper_source).path
+                    pdf_filename = unquote(url_path.split('/')[-1])
+                    paper_title = pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
+                    await update_title_if_needed(paper_title)
             else:
                 # Handle ArXiv URLs/IDs
                 arxiv_id = extract_arxiv_id_from_url(paper_source)
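The fallback branch derives a human-readable title from the URL path when PDF-based extraction fails. The transformation itself is small enough to sketch in isolation:

```python
from urllib.parse import urlparse, unquote

def title_from_pdf_url(url: str) -> str:
    # Take the last path segment, strip the extension, and turn
    # common filename separators back into spaces.
    url_path = urlparse(url).path
    pdf_filename = unquote(url_path.split('/')[-1])
    return pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')

print(title_from_pdf_url("https://example.org/papers/attention_is-all-you-need.pdf"))
# -> "attention is all you need"
```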
@@ -467,14 +496,22 @@ class ProgressRefChecker:
                 })
 
             # Handle uploaded file - run PDF processing in thread
-            # Note: paper_title is already set to the original filename in main.py
-            # so we don't update it here
             if paper_source.lower().endswith('.pdf'):
                 # PDF extraction requires LLM for reliable reference extraction
                 if not self.llm:
                     raise ValueError("PDF extraction requires an LLM to be configured. Please configure an LLM provider in settings.")
                 pdf_processor = PDFProcessor()
                 paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, paper_source)
+
+                # Try to extract the paper title from the PDF
+                try:
+                    extracted_title = await asyncio.to_thread(pdf_processor.extract_title_from_pdf, paper_source)
+                    if extracted_title:
+                        paper_title = extracted_title
+                        await update_title_if_needed(paper_title)
+                        logger.info(f"Extracted title from PDF: {paper_title}")
+                except Exception as e:
+                    logger.warning(f"Could not extract title from PDF: {e}")
             elif paper_source.lower().endswith(('.tex', '.txt', '.bib')):
                 def read_file():
                     with open(paper_source, 'r', encoding='utf-8') as f:
@@ -808,6 +845,11 @@ class ProgressRefChecker:
             return []
         if refs:
             logger.info(f"Extracted {len(refs)} references via CLI parser")
+            # DEBUG: Log problematic references where year looks like title
+            for idx, ref in enumerate(refs):
+                title = ref.get('title', '')
+                if title and (title.isdigit() or len(title) < 10):
+                    debug_log(f"PARSE ISSUE ref {idx+1}: title='{title}' authors={ref.get('authors', [])[:2]} year={ref.get('year')}")
             # Normalize field names (journal -> venue)
             refs = [_normalize_reference_fields(ref) for ref in refs]
             return refs
@@ -853,7 +895,16 @@ class ProgressRefChecker:
             try:
                 llm_refs = await asyncio.to_thread(cli_checker.llm_extractor.extract_references, bibtex_content)
                 if llm_refs:
+                    # DEBUG: Log raw LLM output
+                    debug_log(f"LLM raw output ({len(llm_refs)} refs):")
+                    for i, r in enumerate(llm_refs[:5]):
+                        debug_log(f"  [{i+1}] {str(r)[:150]}")
                     processed_refs = await asyncio.to_thread(cli_checker._process_llm_extracted_references, llm_refs)
+                    # DEBUG: Log processed refs with potential issues
+                    for idx, ref in enumerate(processed_refs):
+                        title = ref.get('title', '')
+                        if title and (title.isdigit() or len(title) < 10):
+                            debug_log(f"PARSE ISSUE after LLM ref {idx+1}: title='{title}' authors={ref.get('authors', [])[:2]}")
                     llm_validation = await asyncio.to_thread(validate_parsed_references, processed_refs)
                     if llm_validation['quality_score'] > validation['quality_score']:
                         logger.info(f"LLM extraction improved quality ({llm_validation['quality_score']:.2f})")
@@ -916,7 +967,11 @@ class ProgressRefChecker:
             # Run verification with timeout (handled by caller)
             verified_data, errors, url = self.checker.verify_reference(reference)
             return self._format_verification_result(reference, index, verified_data, errors, url)
-
+        except UnicodeEncodeError as e:
+            # Handle Windows encoding issues with special characters (e.g., Greek letters in titles)
+            logger.warning(f"Unicode encoding error checking reference {index}: {e}")
+            return self._format_error_result(reference, index,
+                Exception(f"Unicode encoding error - title may contain special characters"))
         except Exception as e:
             logger.error(f"Error checking reference {index}: {e}")
             return self._format_error_result(reference, index, e)
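Catching `UnicodeEncodeError` separately matters on Windows, where a cp1252 code path cannot represent characters such as Greek letters. A hedged illustration of the failure mode the new handler absorbs:

```python
# Illustration only: shows why titles with Greek letters can raise
# UnicodeEncodeError on a cp1252 code path (typical on Windows consoles).
title = "β-VAE: Learning Basic Visual Concepts"
try:
    title.encode("cp1252")
except UnicodeEncodeError as e:
    print(f"Unicode encoding error - title may contain special characters: {e}")
```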
@@ -938,13 +993,22 @@ class ProgressRefChecker:
         from .database import db
 
         # Check cache first
+        cache_start = time.time()
         cached_result = await db.get_cached_verification(reference)
+        cache_time = time.time() - cache_start
+        if cache_time > 0.1:
+            debug_log(f"[TIMING] Cache lookup for ref {idx + 1} took {cache_time:.3f}s")
         if cached_result:
             # Update the index to match current position
             cached_result['index'] = idx + 1
-
+            debug_log(f"Cache hit for reference {idx + 1} in {cache_time:.3f}s")
             return cached_result
 
+        # Log cache miss with details
+        title = reference.get('title', 'Unknown')[:60]
+        authors = reference.get('authors', [])[:2]
+        debug_log(f"CACHE MISS for ref {idx + 1}: title='{title}' authors={authors}")
+
         limiter = get_limiter()
 
         # Wait for a slot in the global queue
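The cache instrumentation only reports lookups that exceed a threshold, keeping the debug log quiet for fast hits. A minimal sketch of that wrap-and-threshold pattern (the 0.1-second value comes from the diff; the wrapper itself is illustrative):

```python
import time

SLOW_THRESHOLD = 0.1  # seconds; matches the threshold used in the diff

def timed_lookup(fn, *args):
    # Run any lookup and report it only when it exceeds the threshold.
    start = time.time()
    result = fn(*args)
    elapsed = time.time() - start
    if elapsed > SLOW_THRESHOLD:
        print(f"[TIMING] lookup took {elapsed:.3f}s")
    return result

timed_lookup(time.sleep, 0.2)  # deliberately slow, so the timing line prints
```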
@@ -961,7 +1025,6 @@ class ProgressRefChecker:
 
         try:
             # Run the sync check in a thread
-            # Use 240 second timeout to allow for ArXiv rate limiting with version checking
             result = await asyncio.wait_for(
                 loop.run_in_executor(
                     None,  # Use default executor
@@ -969,7 +1032,7 @@ class ProgressRefChecker:
                     reference,
                     idx + 1
                 ),
-                timeout=240.0
+                timeout=120.0  # 2 minute timeout per reference
             )
         except asyncio.TimeoutError:
             result = {
@@ -982,7 +1045,7 @@ class ProgressRefChecker:
                 "status": "error",
                 "errors": [{
                     "error_type": "timeout",
-                    "error_details": "Verification timed out after 240 seconds"
+                    "error_details": "Verification timed out after 120 seconds"
                 }],
                 "warnings": [],
                 "authoritative_urls": [],
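The timeout handling wraps each per-reference check in `asyncio.wait_for` and converts expiry into an error-shaped result. A self-contained sketch of that shape (the 120-second value comes from the diff; the worker here is a stand-in):

```python
import asyncio

async def check_reference(idx: int) -> dict:
    await asyncio.sleep(0.2)  # stand-in for a slow verification call
    return {"index": idx, "status": "verified"}

async def check_with_timeout(idx: int, timeout: float) -> dict:
    try:
        return await asyncio.wait_for(check_reference(idx), timeout=timeout)
    except asyncio.TimeoutError:
        # Shape mirrors the error result built in the diff.
        return {
            "index": idx,
            "status": "error",
            "errors": [{
                "error_type": "timeout",
                "error_details": f"Verification timed out after {int(timeout)} seconds",
            }],
        }

print(asyncio.run(check_with_timeout(1, timeout=0.05)))  # -> timeout error result
print(asyncio.run(check_with_timeout(2, timeout=1.0)))   # -> verified result
```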
@@ -1045,6 +1108,9 @@ class ProgressRefChecker:
 
         loop = asyncio.get_event_loop()
 
+        start_time = time.time()
+        debug_log(f"[TIMING] Starting parallel check of {total_refs} references")
+
         # Create tasks for all references - they will be rate-limited by the global semaphore
         tasks = []
         for idx, ref in enumerate(references):
@@ -1054,11 +1120,18 @@ class ProgressRefChecker:
             )
             tasks.append((idx, task))
 
+        task_creation_time = time.time()
+        debug_log(f"[TIMING] Tasks created in {task_creation_time - start_time:.3f}s")
+
         # Process results as they complete
         pending_tasks = {task for _, task in tasks}
         task_to_idx = {task: idx for idx, task in tasks}
 
+        iteration = 0
         while pending_tasks:
+            iteration += 1
+            iter_start = time.time()
+
             # Check for cancellation
             try:
                 await self._check_cancelled()
@@ -1068,13 +1141,15 @@ class ProgressRefChecker:
                     task.cancel()
                 raise
 
-            # Wait for some tasks to complete
+            # Wait for some tasks to complete - no timeout needed, just wait for first completed
             done, pending_tasks = await asyncio.wait(
                 pending_tasks,
-                timeout=0.5,
                 return_when=asyncio.FIRST_COMPLETED
             )
 
+            wait_time = time.time() - iter_start
+            debug_log(f"[TIMING] Iteration {iteration}: wait took {wait_time:.3f}s, {len(done)} done, {len(pending_tasks)} pending")
+
             for task in done:
                 idx = task_to_idx[task]
 
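Dropping the 0.5-second polling timeout relies on `asyncio.wait` returning as soon as any task finishes. A minimal sketch of that consume-as-completed loop:

```python
import asyncio
import random

async def worker(idx: int) -> int:
    await asyncio.sleep(random.uniform(0.01, 0.1))
    return idx

async def main() -> None:
    pending = {asyncio.create_task(worker(i)) for i in range(5)}
    while pending:
        # Wakes up as soon as at least one task finishes - no polling timeout.
        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            print(f"finished {task.result()}, {len(pending)} pending")

asyncio.run(main())
```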
@@ -1147,6 +1222,7 @@ class ProgressRefChecker:
                     refs_with_warnings_only += 1
 
                 # Emit result immediately
+                emit_start = time.time()
                 await self.emit_progress("reference_result", result)
                 await self.emit_progress("progress", {
                     "current": processed_count,
@@ -1165,6 +1241,20 @@ class ProgressRefChecker:
                     "refs_verified": refs_verified,
                     "progress_percent": round((processed_count / total_refs) * 100, 1)
                 })
+                emit_time = time.time() - emit_start
+                if emit_time > 0.1:
+                    debug_log(f"[TIMING] Emit for ref {idx + 1} took {emit_time:.3f}s")
+
+                # Yield to event loop to allow WebSocket messages to flush
+                # This prevents stalls when many cache hits complete rapidly
+                await asyncio.sleep(0)
+
+        total_time = time.time() - start_time
+        debug_log(f"[TIMING] Total parallel check completed in {total_time:.3f}s for {total_refs} refs")
+
+        # Small delay to ensure all WebSocket messages are sent before returning
+        # This prevents the 'completed' event from arriving before final progress updates
+        await asyncio.sleep(0.1)
 
         # Convert dict to ordered list
         results_list = [results.get(i) for i in range(total_refs)]
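`await asyncio.sleep(0)` suspends the coroutine for exactly one event-loop turn, letting queued callbacks (such as WebSocket sends) run between results. A small demonstration of the effect:

```python
import asyncio

async def sender(queue: asyncio.Queue) -> None:
    while True:
        msg = await queue.get()
        print(f"flushed: {msg}")

async def producer(queue: asyncio.Queue) -> None:
    for i in range(3):
        queue.put_nowait(f"progress {i}")
        # Without this yield, all three messages would sit in the queue
        # until the producer finished its loop.
        await asyncio.sleep(0)

async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    task = asyncio.create_task(sender(queue))
    await producer(queue)
    await asyncio.sleep(0.1)  # give the sender time to drain, as the diff does
    task.cancel()

asyncio.run(main())
```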
refchecker/__version__.py
CHANGED
-__version__ = "2.0.19"
+__version__ = "2.0.21"
refchecker/checkers/semantic_scholar.py
CHANGED
@@ -63,8 +63,8 @@ class NonArxivReferenceChecker:
 
         # Rate limiting parameters
         self.request_delay = 1.0  # Initial delay between requests (seconds)
-        self.max_retries = 5
-        self.backoff_factor = 2
+        self.max_retries = 3  # Reduced from 5 to limit timeout accumulation
+        self.backoff_factor = 1.5  # Reduced from 2 for faster retries
 
         # Track API failures for Enhanced Hybrid Checker
         self._api_failed = False
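With `max_retries = 3` and `backoff_factor = 1.5`, the worst-case sleep across retries shrinks substantially versus the old 5/2.0 settings. A quick check of the geometric sums, assuming `delay = request_delay * backoff_factor ** attempt`, which is the usual pattern; the exact formula inside the checker is not shown in this diff:

```python
def total_backoff(request_delay: float, backoff_factor: float, max_retries: int) -> float:
    # Sum of delays for attempts 0..max_retries-1 under exponential backoff.
    return sum(request_delay * backoff_factor ** attempt for attempt in range(max_retries))

print(f"old: {total_backoff(1.0, 2.0, 5):.2f}s")  # 1 + 2 + 4 + 8 + 16 = 31.00s
print(f"new: {total_backoff(1.0, 1.5, 3):.2f}s")  # 1 + 1.5 + 2.25 = 4.75s
```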
refchecker/core/refchecker.py
CHANGED
@@ -4887,6 +4887,52 @@ class ArxivReferenceChecker:
         title = clean_title(title) if title else ""
         title = title.rstrip(',').strip()
 
+        # FIX: Detect malformed parsing for standards documents
+        # When title is just a year (e.g., "2023") and authors contains what looks like a title
+        # (common for ISO/SAE/PAS standards), swap them
+        if title and re.match(r'^(19|20)\d{2}$', title):
+            # Title is just a year - check if authors contains the actual title
+            if authors and len(authors) > 0:
+                # Join all author parts (sometimes title is split into multiple "authors")
+                combined_authors = ' '.join(authors) if isinstance(authors, list) else str(authors)
+                first_author = authors[0] if isinstance(authors, list) else str(authors)
+                # If first "author" looks like a title (contains certain keywords or is long)
+                standard_keywords = ['iso', 'sae', 'pas ', 'asam', 'arp', 'standard', 'specification',
+                                     'road vehicles', 'driving automation', 'guidelines', 'taxonomy']
+                if any(kw in combined_authors.lower() for kw in standard_keywords):
+                    logger.debug(f"Fixing malformed standard reference: swapping title '{title}' with author '{combined_authors[:60]}...'")
+                    # Move year to year field, combined authors to actual title
+                    year = int(title)
+                    title = combined_authors
+                    authors = []  # Standards typically don't have authors
+                elif len(first_author) > 40:
+                    # Long first "author" is likely a title
+                    logger.debug(f"Fixing likely malformed reference: swapping title '{title}' with author '{combined_authors[:60]}...'")
+                    year = int(title)
+                    title = combined_authors
+                    authors = []
+
+        # FIX: Detect when title is a publisher/organization name and authors contains the actual title
+        # Common publishers for standards: SAE International, BSI Standards, ISO, Beuth Verlag, etc.
+        publisher_patterns = ['sae international', 'bsi standards', 'beuth verlag', 'iso/', 'ieee',
+                              'acm', 'springer', 'elsevier', 'wiley', 'oxford university press',
+                              'cambridge university press', 'mit press', 'verlag', 'förderung']
+        title_lower = title.lower() if title else ''
+        if authors and len(authors) > 0:
+            combined_authors = ' '.join(authors) if isinstance(authors, list) else str(authors)
+            # Check if title looks like a short publisher name and authors looks like a real title
+            is_publisher = any(pub in title_lower for pub in publisher_patterns)
+            is_short_title = len(title) < 30
+            authors_look_like_title = any(kw in combined_authors.lower() for kw in
+                ['iso', 'sae', 'pas ', 'asam', 'arp', 'standard', 'specification', 'road vehicles',
+                 'driving automation', 'guidelines', 'taxonomy', 'openodd'])
+
+            if (is_publisher or (is_short_title and authors_look_like_title)) and len(combined_authors) > 20:
+                logger.debug(f"Fixing publisher-as-title: '{title}' -> '{combined_authors[:60]}...'")
+                venue = title  # Publisher becomes venue
+                title = combined_authors
+                authors = []
+
         # Clean up venue
         # Clean up venue - if venue is just a year, null it
         if venue and venue.isdigit() and len(venue) == 4 and venue.startswith(('19', '20')):
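The heart of this fix is the `^(19|20)\d{2}$` test: a "title" that is only a four-digit year signals that fields were shuffled during parsing. A reduced sketch of the swap (trimmed keyword list; the full logic is in the hunk above):

```python
import re

def fix_year_as_title(title: str, authors: list) -> tuple:
    # When the parser put the year in the title slot (common for
    # ISO/SAE/PAS standards), recover: year from title, title from authors.
    if title and re.match(r'^(19|20)\d{2}$', title):
        combined = ' '.join(authors)
        keywords = ['iso', 'sae', 'standard', 'specification', 'taxonomy']
        if any(kw in combined.lower() for kw in keywords):
            return combined, [], int(title)   # title, authors, year
    return title, authors, None

print(fix_year_as_title('2021', ['SAE J3016: Taxonomy and Definitions for Terms',
                                 'Related to Driving Automation Systems']))
```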
refchecker/services/pdf_processor.py
CHANGED
@@ -265,4 +265,159 @@ class PDFProcessor:
     def clear_cache(self):
         """Clear the text extraction cache"""
         self.cache.clear()
-        logger.debug("PDF text cache cleared")
+        logger.debug("PDF text cache cleared")
+
+    def extract_title_from_pdf(self, pdf_path: str) -> Optional[str]:
+        """
+        Extract the title from a PDF file.
+
+        First tries PDF metadata, then falls back to heuristic extraction
+        from the first page text.
+
+        Args:
+            pdf_path: Path to PDF file
+
+        Returns:
+            Extracted title or None if not found
+        """
+        if not os.path.exists(pdf_path):
+            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
+
+        try:
+            import pypdf
+
+            with open(pdf_path, 'rb') as file:
+                pdf_reader = pypdf.PdfReader(file)
+
+                # Try PDF metadata first
+                metadata = pdf_reader.metadata
+                if metadata:
+                    title = metadata.get('/Title')
+                    if title and isinstance(title, str) and len(title.strip()) > 3:
+                        # Clean up the title
+                        title = title.strip()
+                        # Skip if it looks like a filename
+                        if not title.endswith(('.pdf', '.tex', '.dvi')) and title.lower() != 'untitled':
+                            logger.debug(f"Found title in PDF metadata: {title}")
+                            return title
+
+                # Fall back to extracting from first page text
+                if len(pdf_reader.pages) > 0:
+                    try:
+                        first_page_text = pdf_reader.pages[0].extract_text()
+                        if first_page_text:
+                            title = self._extract_title_from_text(first_page_text)
+                            if title:
+                                logger.debug(f"Extracted title from first page: {title}")
+                                return title
+                    except Exception as e:
+                        logger.warning(f"Error extracting title from first page: {e}")
+
+            return None
+
+        except ImportError:
+            logger.error("pypdf not installed. Install with: pip install pypdf")
+            raise
+        except Exception as e:
+            logger.warning(f"Error extracting title from PDF {pdf_path}: {e}")
+            return None
+
+    def _extract_title_from_text(self, text: str) -> Optional[str]:
+        """
+        Heuristically extract paper title from text (typically first page).
+
+        Academic papers typically have the title as one of the first prominent
+        text blocks, often followed by author names.
+
+        Args:
+            text: Text from first page of PDF
+
+        Returns:
+            Extracted title or None
+        """
+        if not text:
+            return None
+
+        import re
+
+        # Split into lines and clean
+        lines = [line.strip() for line in text.split('\n') if line.strip()]
+
+        if not lines:
+            return None
+
+        # Skip common header elements (conference names, page numbers, etc.)
+        header_patterns = [
+            r'^(proceedings|conference|journal|workshop|symposium)',
+            r'^(vol\.|volume|issue|no\.|number)',
+            r'^\d{1,4}\s*$',  # Page numbers
+            r'^(preprint|arxiv|draft)',
+            r'^(ieee|acm|springer|elsevier)',
+            r'^[a-z]+\s+\d{4}$',  # "January 2024" etc
+        ]
+
+        # Author indicators that typically follow the title
+        author_indicators = [
+            r'^[A-Z][a-z]+\s+[A-Z][a-z]+(\s*,|\s+and\s+)',  # "John Smith," or "John Smith and"
+            r'^[A-Z]\.\s*[A-Z][a-z]+',  # "J. Smith"
+            r'^[\w\s,]+@[\w\.-]+',  # Email addresses
+            r'^(university|department|institute|school|college)',
+            r'^\d+\s+[A-Z]',  # Addresses like "123 Main St"
+        ]
+
+        # Find potential title lines
+        title_candidates = []
+        for i, line in enumerate(lines[:15]):  # Only look at first 15 lines
+            # Skip empty or very short lines
+            if len(line) < 10:
+                continue
+
+            # Skip lines matching header patterns
+            is_header = any(re.search(pat, line, re.IGNORECASE) for pat in header_patterns)
+            if is_header:
+                continue
+
+            # Check if this looks like the start of author section
+            is_author_section = any(re.search(pat, line, re.IGNORECASE) for pat in author_indicators)
+            if is_author_section:
+                break  # Stop - we've passed the title
+
+            # Good candidate: reasonable length, not too long
+            if 15 <= len(line) <= 300:
+                title_candidates.append(line)
+
+                # If next line looks like authors, we found the title
+                if i + 1 < len(lines):
+                    next_line = lines[i + 1]
+                    if any(re.search(pat, next_line, re.IGNORECASE) for pat in author_indicators):
+                        break
+
+        if not title_candidates:
+            return None
+
+        # Take the first good candidate, or combine first few if they seem related
+        title = title_candidates[0]
+
+        # Sometimes titles span multiple lines - check if next line continues
+        if len(title_candidates) > 1:
+            second = title_candidates[1]
+            # If second line is short and starts with lowercase or continues sentence
+            if len(second) < 80 and (second[0].islower() or title.endswith(':')):
+                title = title + ' ' + second
+
+        # Clean up the title
+        title = re.sub(r'\s+', ' ', title).strip()
+
+        # Remove common artifacts
+        title = re.sub(r'^\d+\s*', '', title)  # Leading numbers
+        title = re.sub(r'\s*\*+\s*$', '', title)  # Trailing asterisks
+
+        # Validate: title should have reasonable characteristics
+        if len(title) < 15 or len(title) > 350:
+            return None
+
+        # Should have some letters (not just numbers/symbols)
+        if not re.search(r'[a-zA-Z]{3,}', title):
+            return None
+
+        return title
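Assuming the package layout implied by the diff, the new method can be exercised directly. A hedged usage sketch; the import path is inferred from the file location and may differ in the installed package:

```python
# Hypothetical usage - the import path is inferred from the diff's file layout.
from refchecker.services.pdf_processor import PDFProcessor

processor = PDFProcessor()
title = processor.extract_title_from_pdf("paper.pdf")  # metadata first, then first-page heuristics
if title:
    print(f"Title: {title}")
else:
    print("No title found; the caller falls back to the filename")
```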
refchecker/utils/text_utils.py
CHANGED
@@ -6,6 +6,7 @@ Text processing utilities for ArXiv Reference Checker
 import re
 import logging
 import unicodedata
+import html
 from typing import List
 
 logger = logging.getLogger(__name__)
@@ -5088,7 +5089,8 @@ def normalize_venue_for_display(venue: str) -> str:
 
         return text_lower
 
-    venue_text = venue.strip()
+    # Decode any HTML entities (e.g., "&amp;" -> "&") before further cleaning
+    venue_text = html.unescape(venue).strip()
 
     # Strip leading editor name lists like "..., editors, Venue ..." or "..., eds., Venue ..."
     # This prevents author/editor lists from being treated as venue
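`html.unescape` is a standard-library call that reverses entity encoding, which matters for venue strings scraped from HTML sources:

```python
import html

venue = "Advances in Neural Information Processing Systems &amp; Workshops"
print(html.unescape(venue).strip())
# -> "Advances in Neural Information Processing Systems & Workshops"
```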
{academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/WHEEL
RENAMED
File without changes
{academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/entry_points.txt
RENAMED
File without changes
{academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/licenses/LICENSE
RENAMED
File without changes
{academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/top_level.txt
RENAMED
File without changes