academic-refchecker 1.2.55__tar.gz → 1.2.57__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {academic_refchecker-1.2.55/src/academic_refchecker.egg-info → academic_refchecker-1.2.57}/PKG-INFO +1 -1
  2. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/requirements.txt +1 -0
  3. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
  4. academic_refchecker-1.2.57/src/refchecker/__version__.py +5 -0
  5. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/local_semantic_scholar.py +4 -5
  6. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/semantic_scholar.py +38 -24
  7. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/core/refchecker.py +13 -17
  8. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/llm/providers.py +17 -1
  9. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/services/pdf_processor.py +22 -2
  10. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/error_utils.py +8 -8
  11. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/url_utils.py +8 -5
  12. academic_refchecker-1.2.55/src/refchecker/__version__.py +0 -5
  13. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/LICENSE +0 -0
  14. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/MANIFEST.in +0 -0
  15. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/README.md +0 -0
  16. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/pyproject.toml +0 -0
  17. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/scripts/download_db.py +0 -0
  18. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/scripts/run_tests.py +0 -0
  19. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/scripts/start_vllm_server.py +0 -0
  20. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/setup.cfg +0 -0
  21. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
  22. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
  23. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
  24. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/requires.txt +0 -0
  25. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/top_level.txt +0 -0
  26. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/__init__.py +0 -0
  27. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/__main__.py +0 -0
  28. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/__init__.py +0 -0
  29. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/crossref.py +0 -0
  30. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/enhanced_hybrid_checker.py +0 -0
  31. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/github_checker.py +0 -0
  32. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/openalex.py +0 -0
  33. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/openreview_checker.py +0 -0
  34. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/pdf_paper_checker.py +0 -0
  35. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/webpage_checker.py +0 -0
  36. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/config/__init__.py +0 -0
  37. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/config/logging.conf +0 -0
  38. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/config/settings.py +0 -0
  39. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/core/__init__.py +0 -0
  40. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/core/db_connection_pool.py +0 -0
  41. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/core/parallel_processor.py +0 -0
  42. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/database/__init__.py +0 -0
  43. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/database/download_semantic_scholar_db.py +0 -0
  44. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/llm/__init__.py +0 -0
  45. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/llm/base.py +0 -0
  46. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/scripts/__init__.py +0 -0
  47. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/scripts/start_vllm_server.py +0 -0
  48. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/services/__init__.py +0 -0
  49. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/__init__.py +0 -0
  50. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/arxiv_utils.py +0 -0
  51. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/author_utils.py +0 -0
  52. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/biblatex_parser.py +0 -0
  53. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/bibliography_utils.py +0 -0
  54. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/bibtex_parser.py +0 -0
  55. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/config_validator.py +0 -0
  56. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/db_utils.py +0 -0
  57. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/doi_utils.py +0 -0
  58. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/mock_objects.py +0 -0
  59. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/text_utils.py +0 -0
  60. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/unicode_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.55
3
+ Version: 1.2.57
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -9,6 +9,7 @@ tqdm>=4.60.0
9
9
  colorama>=0.4.4
10
10
  fuzzywuzzy>=0.18.0
11
11
  python-Levenshtein>=0.12.0
12
+ cryptography>=42.0.0 # For API key encryption in web UI
12
13
 
13
14
  # Additional core dependencies found in codebase
14
15
  pandas>=1.3.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.55
3
+ Version: 1.2.57
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -0,0 +1,5 @@
1
+ """Version information for RefChecker."""
2
+
3
+ __version__ = "1.2.57"
4
+
5
+ __version__ = "1.2.57"
@@ -469,15 +469,14 @@ class LocalNonArxivReferenceChecker:
469
469
  # since this is a Semantic Scholar database checker
470
470
  external_ids = paper_data.get('externalIds', {})
471
471
 
472
- # First try to get the Semantic Scholar URL since that's what we used for verification
473
- if external_ids.get('CorpusId'):
474
- from refchecker.utils.url_utils import construct_semantic_scholar_url
475
- paper_url = construct_semantic_scholar_url(external_ids['CorpusId'])
472
+ # First try to get the Semantic Scholar URL using paperId (SHA hash)
473
+ if paper_data.get('paperId'):
474
+ paper_url = f"https://www.semanticscholar.org/paper/{paper_data['paperId']}"
476
475
  logger.debug(f"Using Semantic Scholar URL for verification: {paper_url}")
477
476
  else:
478
477
  # Fallback to best available URL if Semantic Scholar URL not available
479
478
  open_access_pdf = paper_data.get('openAccessPdf')
480
- paper_url = get_best_available_url(external_ids, open_access_pdf)
479
+ paper_url = get_best_available_url(external_ids, open_access_pdf, paper_data.get('paperId'))
481
480
  if paper_url:
482
481
  logger.debug(f"Using fallback URL: {paper_url}")
483
482
 
@@ -85,7 +85,7 @@ class NonArxivReferenceChecker:
85
85
  params = {
86
86
  "query": query,
87
87
  "limit": 10,
88
- "fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,journal",
88
+ "fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal",
89
89
  "sort": "relevance" # Ensure consistent ordering
90
90
  }
91
91
 
@@ -135,7 +135,7 @@ class NonArxivReferenceChecker:
135
135
  endpoint = f"{self.base_url}/paper/DOI:{doi}"
136
136
 
137
137
  params = {
138
- "fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,journal"
138
+ "fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal"
139
139
  }
140
140
 
141
141
  # Make the request with retries and backoff
@@ -260,7 +260,7 @@ class NonArxivReferenceChecker:
260
260
  corpus_id = corpus_match.group(1)
261
261
  # Try to get the paper directly by CorpusID
262
262
  endpoint = f"{self.base_url}/paper/CorpusId:{corpus_id}"
263
- params = {"fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,journal"}
263
+ params = {"fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal"}
264
264
 
265
265
  for attempt in range(self.max_retries):
266
266
  try:
@@ -537,12 +537,33 @@ class NonArxivReferenceChecker:
537
537
 
538
538
  # Verify venue
539
539
  cited_venue = reference.get('journal', '') or reference.get('venue', '')
540
- paper_venue = paper_data.get('venue') or paper_data.get('journal')
541
540
 
542
- # Ensure paper_venue is a string (sometimes it can be a dict)
543
- if isinstance(paper_venue, dict):
544
- paper_venue = paper_venue.get('name', '') if paper_venue else ''
545
- elif paper_venue and not isinstance(paper_venue, str):
541
+ # Extract venue from paper_data - check multiple fields since Semantic Scholar
542
+ # returns venue info in different fields depending on publication type
543
+ paper_venue = None
544
+
545
+ # First try the simple 'venue' field (string)
546
+ if paper_data.get('venue'):
547
+ paper_venue = paper_data.get('venue')
548
+
549
+ # If no venue, try publicationVenue object
550
+ if not paper_venue and paper_data.get('publicationVenue'):
551
+ pub_venue = paper_data.get('publicationVenue')
552
+ if isinstance(pub_venue, dict):
553
+ paper_venue = pub_venue.get('name', '')
554
+ elif isinstance(pub_venue, str):
555
+ paper_venue = pub_venue
556
+
557
+ # If still no venue, try journal object
558
+ if not paper_venue and paper_data.get('journal'):
559
+ journal = paper_data.get('journal')
560
+ if isinstance(journal, dict):
561
+ paper_venue = journal.get('name', '')
562
+ elif isinstance(journal, str):
563
+ paper_venue = journal
564
+
565
+ # Ensure paper_venue is a string
566
+ if paper_venue and not isinstance(paper_venue, str):
546
567
  paper_venue = str(paper_venue)
547
568
 
548
569
  # Check venue mismatches
@@ -552,18 +573,12 @@ class NonArxivReferenceChecker:
552
573
  from refchecker.utils.error_utils import create_venue_warning
553
574
  errors.append(create_venue_warning(cited_venue, paper_venue))
554
575
  elif not cited_venue and paper_venue:
555
- # Original reference has the venue in raw text but not parsed correctly
556
- raw_text = reference.get('raw_text', '')
557
- if raw_text and '#' in raw_text:
558
- # Check if venue might be in the raw text format (author#title#venue#year#url)
559
- parts = raw_text.split('#')
560
- if len(parts) >= 3 and parts[2].strip():
561
- # Venue is present in raw text but missing from parsed reference
562
- errors.append({
563
- 'warning_type': 'venue',
564
- 'warning_details': f"Venue missing: should include '{paper_venue}'",
565
- 'ref_venue_correct': paper_venue
566
- })
576
+ # Reference has no venue but paper has one - always warn about missing venue
577
+ errors.append({
578
+ 'warning_type': 'venue',
579
+ 'warning_details': f"Venue missing: should include '{paper_venue}'",
580
+ 'ref_venue_correct': paper_venue
581
+ })
567
582
 
568
583
  # Always check for missing arXiv URLs when paper has arXiv ID
569
584
  external_ids = paper_data.get('externalIds', {})
@@ -612,10 +627,9 @@ class NonArxivReferenceChecker:
612
627
  logger.debug(f"Semantic Scholar - Extracting URL from paper data: {list(paper_data.keys())}")
613
628
 
614
629
  # Return the Semantic Scholar URL that was actually used for verification
615
- # First priority: Semantic Scholar URL since that's what we used for verification
616
- if external_ids.get('CorpusId'):
617
- from refchecker.utils.url_utils import construct_semantic_scholar_url
618
- paper_url = construct_semantic_scholar_url(external_ids['CorpusId'])
630
+ # First priority: Semantic Scholar URL using paperId (SHA hash, works in web URLs)
631
+ if paper_data.get('paperId'):
632
+ paper_url = f"https://www.semanticscholar.org/paper/{paper_data['paperId']}"
619
633
  logger.debug(f"Using Semantic Scholar URL for verification: {paper_url}")
620
634
 
621
635
  # Second priority: DOI URL (if this was verified through DOI)
@@ -2089,14 +2089,12 @@ class ArxivReferenceChecker:
2089
2089
  if correct_paper_data:
2090
2090
  logger.debug(f"Database mode: Found correct paper: '{correct_paper_data.get('title', '')}'")
2091
2091
  # Use the CORRECT paper's Semantic Scholar URL
2092
- correct_external_ids = correct_paper_data.get('externalIds', {})
2093
- if correct_external_ids.get('CorpusId'):
2094
- from refchecker.utils.url_utils import construct_semantic_scholar_url
2095
- correct_paper_url = construct_semantic_scholar_url(correct_external_ids['CorpusId'])
2092
+ if correct_paper_data.get('paperId'):
2093
+ correct_paper_url = f"https://www.semanticscholar.org/paper/{correct_paper_data['paperId']}"
2096
2094
  paper_url = correct_paper_url # Update the main URL
2097
2095
  logger.debug(f"Database mode: Using correct paper's Semantic Scholar URL for ArXiv ID mismatch: {paper_url}")
2098
2096
  else:
2099
- logger.debug("Database mode: Correct paper found but no CorpusId available")
2097
+ logger.debug("Database mode: Correct paper found but no paperId available")
2100
2098
  else:
2101
2099
  logger.debug("Database mode: Could not find correct paper by title/authors")
2102
2100
  except Exception as e:
@@ -2117,12 +2115,11 @@ class ArxivReferenceChecker:
2117
2115
  formatted_errors.append(formatted_error)
2118
2116
 
2119
2117
  # Fallback to wrong paper's URL if we couldn't find the correct one
2120
- if not correct_paper_data and verified_data and verified_data.get('externalIds', {}).get('CorpusId'):
2121
- from refchecker.utils.url_utils import construct_semantic_scholar_url
2122
- paper_url = construct_semantic_scholar_url(verified_data['externalIds']['CorpusId'])
2118
+ if not correct_paper_data and verified_data and verified_data.get('paperId'):
2119
+ paper_url = f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
2123
2120
  logger.debug(f"Database mode: Fallback to wrong paper's Semantic Scholar URL: {paper_url}")
2124
2121
  elif not correct_paper_data:
2125
- logger.debug(f"Database mode: No CorpusId available for Semantic Scholar URL construction. verified_data keys: {list(verified_data.keys()) if verified_data else 'None'}")
2122
+ logger.debug(f"Database mode: No paperId available for Semantic Scholar URL construction. verified_data keys: {list(verified_data.keys()) if verified_data else 'None'}")
2126
2123
 
2127
2124
  return formatted_errors if formatted_errors else None, paper_url, verified_data
2128
2125
  else:
@@ -5521,10 +5518,9 @@ class ArxivReferenceChecker:
5521
5518
  if verified_data and verified_data.get('url') and 'arxiv.org' not in verified_data['url']:
5522
5519
  return verified_data['url']
5523
5520
 
5524
- # Second priority: Semantic Scholar URL from CorpusId (if no direct URL available)
5525
- if verified_data and verified_data.get('externalIds', {}).get('CorpusId'):
5526
- from refchecker.utils.url_utils import construct_semantic_scholar_url
5527
- return construct_semantic_scholar_url(verified_data['externalIds']['CorpusId'])
5521
+ # Second priority: Semantic Scholar URL from paperId (if no direct URL available)
5522
+ if verified_data and verified_data.get('paperId'):
5523
+ return f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
5528
5524
 
5529
5525
  # Third priority: DOI URL from verified data (more reliable than potentially wrong ArXiv URLs)
5530
5526
  if verified_data and verified_data.get('externalIds', {}).get('DOI'):
@@ -5576,11 +5572,11 @@ class ArxivReferenceChecker:
5576
5572
  # Non-ArXiv URL, probably safe to use
5577
5573
  return reference_url
5578
5574
 
5579
- def _get_fallback_url(self, external_ids):
5575
+ def _get_fallback_url(self, external_ids, verified_data=None):
5580
5576
  """Get fallback URL from external IDs (Semantic Scholar or DOI)"""
5581
- if external_ids.get('CorpusId'):
5582
- from refchecker.utils.url_utils import construct_semantic_scholar_url
5583
- return construct_semantic_scholar_url(external_ids['CorpusId'])
5577
+ # Prefer paperId for Semantic Scholar URLs
5578
+ if verified_data and verified_data.get('paperId'):
5579
+ return f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
5584
5580
  elif external_ids.get('DOI'):
5585
5581
  from refchecker.utils.doi_utils import construct_doi_url
5586
5582
  return construct_doi_url(external_ids['DOI'])
@@ -318,7 +318,23 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
318
318
  }
319
319
  )
320
320
 
321
- return response.text or ""
321
+ # Handle empty responses (content safety filter or other issues)
322
+ if not response.candidates:
323
+ logger.warning("Google API returned empty candidates (possibly content filtered)")
324
+ return ""
325
+
326
+ # Safely access the text
327
+ try:
328
+ return response.text or ""
329
+ except (ValueError, AttributeError) as e:
330
+ # response.text raises ValueError if multiple candidates or no text
331
+ logger.warning(f"Could not get text from Google response: {e}")
332
+ # Try to extract text from first candidate manually
333
+ if response.candidates and hasattr(response.candidates[0], 'content'):
334
+ content = response.candidates[0].content
335
+ if hasattr(content, 'parts') and content.parts:
336
+ return content.parts[0].text or ""
337
+ return ""
322
338
 
323
339
  except Exception as e:
324
340
  logger.error(f"Google API call failed: {e}")
@@ -69,10 +69,30 @@ class PDFProcessor:
69
69
  with open(pdf_path, 'rb') as file:
70
70
  pdf_reader = pypdf.PdfReader(file)
71
71
  text = ""
72
+ failed_pages = []
72
73
 
73
74
  for page_num in range(len(pdf_reader.pages)):
74
- page = pdf_reader.pages[page_num]
75
- text += page.extract_text() + "\n"
75
+ try:
76
+ page = pdf_reader.pages[page_num]
77
+ page_text = page.extract_text()
78
+ if page_text:
79
+ text += page_text + "\n"
80
+ except TypeError as e:
81
+ # Handle pypdf errors like "NumberObject is not iterable"
82
+ # which can occur with malformed PDF pages
83
+ failed_pages.append(page_num + 1) # 1-indexed for logging
84
+ logger.warning(f"Skipping page {page_num + 1} due to PDF parsing error: {e}")
85
+ continue
86
+ except Exception as e:
87
+ failed_pages.append(page_num + 1)
88
+ logger.warning(f"Error extracting text from page {page_num + 1}: {e}")
89
+ continue
90
+
91
+ if failed_pages:
92
+ logger.warning(f"Failed to extract text from {len(failed_pages)} pages: {failed_pages[:10]}{'...' if len(failed_pages) > 10 else ''}")
93
+
94
+ if not text.strip():
95
+ raise ValueError(f"No text could be extracted from any pages of {pdf_path}")
76
96
 
77
97
  # Cache the result
78
98
  self.cache[pdf_path] = text
@@ -42,8 +42,8 @@ def format_three_line_mismatch(mismatch_type: str, left: str, right: str) -> str
42
42
 
43
43
  Example:
44
44
  Title mismatch:
45
- 'Cited Title'
46
- vs: 'Correct Title'
45
+ cited: 'Cited Title'
46
+ actual: 'Correct Title'
47
47
 
48
48
  Args:
49
49
  mismatch_type: The type of mismatch (e.g., "Author 2 mismatch", "Title mismatch")
@@ -57,11 +57,10 @@ def format_three_line_mismatch(mismatch_type: str, left: str, right: str) -> str
57
57
  if not mismatch_type.endswith(":"):
58
58
  mismatch_type = mismatch_type.rstrip() + ":"
59
59
 
60
- # Use fixed indentation for clean, consistent alignment
61
- indent = "" # spaces for content indentation
62
- vs_indent = "" # vs: starts at column 0 for clear visual separation
60
+ # Use fixed indentation for labels, keeping detail column aligned
61
+ label_indent = " " # 7 spaces to indent labels
63
62
 
64
- return f"{mismatch_type}\n{indent}cited: '{left}'\n{vs_indent}actual: '{right}'"
63
+ return f"{mismatch_type}\n{label_indent}cited: {left}\n{label_indent}actual: {right}"
65
64
 
66
65
 
67
66
  def format_title_mismatch(cited_title: str, verified_title: str) -> str:
@@ -187,8 +186,9 @@ def format_missing_venue(correct_venue: str) -> str:
187
186
  """
188
187
  Format a missing venue message with only the actual value.
189
188
  """
190
- # Only show the actual venue; omit the empty cited line
191
- return f"Missing venue: '{correct_venue}'"
189
+ # Only show the actual venue with indented label
190
+ label_indent = " " # 7 spaces to indent labels
191
+ return f"Missing venue:\n{label_indent}actual: {correct_venue}"
192
192
 
193
193
 
194
194
  def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]:
@@ -102,7 +102,9 @@ def construct_semantic_scholar_url(paper_id: str) -> str:
102
102
  Construct a Semantic Scholar URL from a paper ID.
103
103
 
104
104
  Args:
105
- paper_id: Semantic Scholar paper ID
105
+ paper_id: Semantic Scholar paper ID (SHA hash, NOT CorpusId)
106
+ The paperId is the 40-character hex hash that works in web URLs.
107
+ CorpusId (numeric) does NOT work in web URLs.
106
108
 
107
109
  Returns:
108
110
  Full Semantic Scholar URL
@@ -151,7 +153,7 @@ def construct_pubmed_url(pmid: str) -> str:
151
153
  return f"https://pubmed.ncbi.nlm.nih.gov/{clean_pmid}/"
152
154
 
153
155
 
154
- def get_best_available_url(external_ids: dict, open_access_pdf: Optional[str] = None) -> Optional[str]:
156
+ def get_best_available_url(external_ids: dict, open_access_pdf: Optional[str] = None, paper_id: Optional[str] = None) -> Optional[str]:
155
157
  """
156
158
  Get the best available URL from a paper's external IDs and open access information.
157
159
  Priority: Open Access PDF > DOI > ArXiv > Semantic Scholar > OpenAlex > PubMed
@@ -159,6 +161,7 @@ def get_best_available_url(external_ids: dict, open_access_pdf: Optional[str] =
159
161
  Args:
160
162
  external_ids: Dictionary of external identifiers
161
163
  open_access_pdf: Open access PDF URL if available
164
+ paper_id: Semantic Scholar paperId (SHA hash) if available
162
165
 
163
166
  Returns:
164
167
  Best available URL or None if no valid URL found
@@ -175,9 +178,9 @@ def get_best_available_url(external_ids: dict, open_access_pdf: Optional[str] =
175
178
  if external_ids.get('ArXiv'):
176
179
  return construct_arxiv_url(external_ids['ArXiv'])
177
180
 
178
- # Priority 4: Semantic Scholar URL
179
- if external_ids.get('CorpusId'):
180
- return construct_semantic_scholar_url(external_ids['CorpusId'])
181
+ # Priority 4: Semantic Scholar URL (using paperId, not CorpusId)
182
+ if paper_id:
183
+ return construct_semantic_scholar_url(paper_id)
181
184
 
182
185
  # Priority 5: OpenAlex URL
183
186
  if external_ids.get('OpenAlex'):
@@ -1,5 +0,0 @@
1
- """Version information for RefChecker."""
2
-
3
- __version__ = "1.2.55"
4
-
5
- __version__ = "1.2.55"