academic-refchecker 1.2.55__tar.gz → 1.2.57__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {academic_refchecker-1.2.55/src/academic_refchecker.egg-info → academic_refchecker-1.2.57}/PKG-INFO +1 -1
  2. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/requirements.txt +1 -0
  3. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
  4. academic_refchecker-1.2.57/src/refchecker/__version__.py +5 -0
  5. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/local_semantic_scholar.py +4 -5
  6. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/semantic_scholar.py +38 -24
  7. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/core/refchecker.py +13 -17
  8. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/llm/providers.py +17 -1
  9. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/services/pdf_processor.py +22 -2
  10. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/error_utils.py +8 -8
  11. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/url_utils.py +8 -5
  12. academic_refchecker-1.2.55/src/refchecker/__version__.py +0 -5
  13. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/LICENSE +0 -0
  14. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/MANIFEST.in +0 -0
  15. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/README.md +0 -0
  16. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/pyproject.toml +0 -0
  17. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/scripts/download_db.py +0 -0
  18. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/scripts/run_tests.py +0 -0
  19. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/scripts/start_vllm_server.py +0 -0
  20. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/setup.cfg +0 -0
  21. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
  22. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
  23. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
  24. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/requires.txt +0 -0
  25. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/academic_refchecker.egg-info/top_level.txt +0 -0
  26. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/__init__.py +0 -0
  27. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/__main__.py +0 -0
  28. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/__init__.py +0 -0
  29. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/crossref.py +0 -0
  30. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/enhanced_hybrid_checker.py +0 -0
  31. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/github_checker.py +0 -0
  32. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/openalex.py +0 -0
  33. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/openreview_checker.py +0 -0
  34. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/pdf_paper_checker.py +0 -0
  35. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/checkers/webpage_checker.py +0 -0
  36. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/config/__init__.py +0 -0
  37. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/config/logging.conf +0 -0
  38. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/config/settings.py +0 -0
  39. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/core/__init__.py +0 -0
  40. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/core/db_connection_pool.py +0 -0
  41. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/core/parallel_processor.py +0 -0
  42. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/database/__init__.py +0 -0
  43. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/database/download_semantic_scholar_db.py +0 -0
  44. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/llm/__init__.py +0 -0
  45. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/llm/base.py +0 -0
  46. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/scripts/__init__.py +0 -0
  47. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/scripts/start_vllm_server.py +0 -0
  48. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/services/__init__.py +0 -0
  49. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/__init__.py +0 -0
  50. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/arxiv_utils.py +0 -0
  51. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/author_utils.py +0 -0
  52. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/biblatex_parser.py +0 -0
  53. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/bibliography_utils.py +0 -0
  54. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/bibtex_parser.py +0 -0
  55. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/config_validator.py +0 -0
  56. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/db_utils.py +0 -0
  57. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/doi_utils.py +0 -0
  58. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/mock_objects.py +0 -0
  59. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/text_utils.py +0 -0
  60. {academic_refchecker-1.2.55 → academic_refchecker-1.2.57}/src/refchecker/utils/unicode_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.55
3
+ Version: 1.2.57
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -9,6 +9,7 @@ tqdm>=4.60.0
9
9
  colorama>=0.4.4
10
10
  fuzzywuzzy>=0.18.0
11
11
  python-Levenshtein>=0.12.0
12
+ cryptography>=42.0.0 # For API key encryption in web UI
12
13
 
13
14
  # Additional core dependencies found in codebase
14
15
  pandas>=1.3.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.55
3
+ Version: 1.2.57
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -0,0 +1,5 @@
1
+ """Version information for RefChecker."""
2
+
3
+ __version__ = "1.2.57"
4
+
5
+ __version__ = "1.2.57"
@@ -469,15 +469,14 @@ class LocalNonArxivReferenceChecker:
469
469
  # since this is a Semantic Scholar database checker
470
470
  external_ids = paper_data.get('externalIds', {})
471
471
 
472
- # First try to get the Semantic Scholar URL since that's what we used for verification
473
- if external_ids.get('CorpusId'):
474
- from refchecker.utils.url_utils import construct_semantic_scholar_url
475
- paper_url = construct_semantic_scholar_url(external_ids['CorpusId'])
472
+ # First try to get the Semantic Scholar URL using paperId (SHA hash)
473
+ if paper_data.get('paperId'):
474
+ paper_url = f"https://www.semanticscholar.org/paper/{paper_data['paperId']}"
476
475
  logger.debug(f"Using Semantic Scholar URL for verification: {paper_url}")
477
476
  else:
478
477
  # Fallback to best available URL if Semantic Scholar URL not available
479
478
  open_access_pdf = paper_data.get('openAccessPdf')
480
- paper_url = get_best_available_url(external_ids, open_access_pdf)
479
+ paper_url = get_best_available_url(external_ids, open_access_pdf, paper_data.get('paperId'))
481
480
  if paper_url:
482
481
  logger.debug(f"Using fallback URL: {paper_url}")
483
482
 
@@ -85,7 +85,7 @@ class NonArxivReferenceChecker:
85
85
  params = {
86
86
  "query": query,
87
87
  "limit": 10,
88
- "fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,journal",
88
+ "fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal",
89
89
  "sort": "relevance" # Ensure consistent ordering
90
90
  }
91
91
 
@@ -135,7 +135,7 @@ class NonArxivReferenceChecker:
135
135
  endpoint = f"{self.base_url}/paper/DOI:{doi}"
136
136
 
137
137
  params = {
138
- "fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,journal"
138
+ "fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal"
139
139
  }
140
140
 
141
141
  # Make the request with retries and backoff
@@ -260,7 +260,7 @@ class NonArxivReferenceChecker:
260
260
  corpus_id = corpus_match.group(1)
261
261
  # Try to get the paper directly by CorpusID
262
262
  endpoint = f"{self.base_url}/paper/CorpusId:{corpus_id}"
263
- params = {"fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,journal"}
263
+ params = {"fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal"}
264
264
 
265
265
  for attempt in range(self.max_retries):
266
266
  try:
@@ -537,12 +537,33 @@ class NonArxivReferenceChecker:
537
537
 
538
538
  # Verify venue
539
539
  cited_venue = reference.get('journal', '') or reference.get('venue', '')
540
- paper_venue = paper_data.get('venue') or paper_data.get('journal')
541
540
 
542
- # Ensure paper_venue is a string (sometimes it can be a dict)
543
- if isinstance(paper_venue, dict):
544
- paper_venue = paper_venue.get('name', '') if paper_venue else ''
545
- elif paper_venue and not isinstance(paper_venue, str):
541
+ # Extract venue from paper_data - check multiple fields since Semantic Scholar
542
+ # returns venue info in different fields depending on publication type
543
+ paper_venue = None
544
+
545
+ # First try the simple 'venue' field (string)
546
+ if paper_data.get('venue'):
547
+ paper_venue = paper_data.get('venue')
548
+
549
+ # If no venue, try publicationVenue object
550
+ if not paper_venue and paper_data.get('publicationVenue'):
551
+ pub_venue = paper_data.get('publicationVenue')
552
+ if isinstance(pub_venue, dict):
553
+ paper_venue = pub_venue.get('name', '')
554
+ elif isinstance(pub_venue, str):
555
+ paper_venue = pub_venue
556
+
557
+ # If still no venue, try journal object
558
+ if not paper_venue and paper_data.get('journal'):
559
+ journal = paper_data.get('journal')
560
+ if isinstance(journal, dict):
561
+ paper_venue = journal.get('name', '')
562
+ elif isinstance(journal, str):
563
+ paper_venue = journal
564
+
565
+ # Ensure paper_venue is a string
566
+ if paper_venue and not isinstance(paper_venue, str):
546
567
  paper_venue = str(paper_venue)
547
568
 
548
569
  # Check venue mismatches
@@ -552,18 +573,12 @@ class NonArxivReferenceChecker:
552
573
  from refchecker.utils.error_utils import create_venue_warning
553
574
  errors.append(create_venue_warning(cited_venue, paper_venue))
554
575
  elif not cited_venue and paper_venue:
555
- # Original reference has the venue in raw text but not parsed correctly
556
- raw_text = reference.get('raw_text', '')
557
- if raw_text and '#' in raw_text:
558
- # Check if venue might be in the raw text format (author#title#venue#year#url)
559
- parts = raw_text.split('#')
560
- if len(parts) >= 3 and parts[2].strip():
561
- # Venue is present in raw text but missing from parsed reference
562
- errors.append({
563
- 'warning_type': 'venue',
564
- 'warning_details': f"Venue missing: should include '{paper_venue}'",
565
- 'ref_venue_correct': paper_venue
566
- })
576
+ # Reference has no venue but paper has one - always warn about missing venue
577
+ errors.append({
578
+ 'warning_type': 'venue',
579
+ 'warning_details': f"Venue missing: should include '{paper_venue}'",
580
+ 'ref_venue_correct': paper_venue
581
+ })
567
582
 
568
583
  # Always check for missing arXiv URLs when paper has arXiv ID
569
584
  external_ids = paper_data.get('externalIds', {})
@@ -612,10 +627,9 @@ class NonArxivReferenceChecker:
612
627
  logger.debug(f"Semantic Scholar - Extracting URL from paper data: {list(paper_data.keys())}")
613
628
 
614
629
  # Return the Semantic Scholar URL that was actually used for verification
615
- # First priority: Semantic Scholar URL since that's what we used for verification
616
- if external_ids.get('CorpusId'):
617
- from refchecker.utils.url_utils import construct_semantic_scholar_url
618
- paper_url = construct_semantic_scholar_url(external_ids['CorpusId'])
630
+ # First priority: Semantic Scholar URL using paperId (SHA hash, works in web URLs)
631
+ if paper_data.get('paperId'):
632
+ paper_url = f"https://www.semanticscholar.org/paper/{paper_data['paperId']}"
619
633
  logger.debug(f"Using Semantic Scholar URL for verification: {paper_url}")
620
634
 
621
635
  # Second priority: DOI URL (if this was verified through DOI)
@@ -2089,14 +2089,12 @@ class ArxivReferenceChecker:
2089
2089
  if correct_paper_data:
2090
2090
  logger.debug(f"Database mode: Found correct paper: '{correct_paper_data.get('title', '')}'")
2091
2091
  # Use the CORRECT paper's Semantic Scholar URL
2092
- correct_external_ids = correct_paper_data.get('externalIds', {})
2093
- if correct_external_ids.get('CorpusId'):
2094
- from refchecker.utils.url_utils import construct_semantic_scholar_url
2095
- correct_paper_url = construct_semantic_scholar_url(correct_external_ids['CorpusId'])
2092
+ if correct_paper_data.get('paperId'):
2093
+ correct_paper_url = f"https://www.semanticscholar.org/paper/{correct_paper_data['paperId']}"
2096
2094
  paper_url = correct_paper_url # Update the main URL
2097
2095
  logger.debug(f"Database mode: Using correct paper's Semantic Scholar URL for ArXiv ID mismatch: {paper_url}")
2098
2096
  else:
2099
- logger.debug("Database mode: Correct paper found but no CorpusId available")
2097
+ logger.debug("Database mode: Correct paper found but no paperId available")
2100
2098
  else:
2101
2099
  logger.debug("Database mode: Could not find correct paper by title/authors")
2102
2100
  except Exception as e:
@@ -2117,12 +2115,11 @@ class ArxivReferenceChecker:
2117
2115
  formatted_errors.append(formatted_error)
2118
2116
 
2119
2117
  # Fallback to wrong paper's URL if we couldn't find the correct one
2120
- if not correct_paper_data and verified_data and verified_data.get('externalIds', {}).get('CorpusId'):
2121
- from refchecker.utils.url_utils import construct_semantic_scholar_url
2122
- paper_url = construct_semantic_scholar_url(verified_data['externalIds']['CorpusId'])
2118
+ if not correct_paper_data and verified_data and verified_data.get('paperId'):
2119
+ paper_url = f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
2123
2120
  logger.debug(f"Database mode: Fallback to wrong paper's Semantic Scholar URL: {paper_url}")
2124
2121
  elif not correct_paper_data:
2125
- logger.debug(f"Database mode: No CorpusId available for Semantic Scholar URL construction. verified_data keys: {list(verified_data.keys()) if verified_data else 'None'}")
2122
+ logger.debug(f"Database mode: No paperId available for Semantic Scholar URL construction. verified_data keys: {list(verified_data.keys()) if verified_data else 'None'}")
2126
2123
 
2127
2124
  return formatted_errors if formatted_errors else None, paper_url, verified_data
2128
2125
  else:
@@ -5521,10 +5518,9 @@ class ArxivReferenceChecker:
5521
5518
  if verified_data and verified_data.get('url') and 'arxiv.org' not in verified_data['url']:
5522
5519
  return verified_data['url']
5523
5520
 
5524
- # Second priority: Semantic Scholar URL from CorpusId (if no direct URL available)
5525
- if verified_data and verified_data.get('externalIds', {}).get('CorpusId'):
5526
- from refchecker.utils.url_utils import construct_semantic_scholar_url
5527
- return construct_semantic_scholar_url(verified_data['externalIds']['CorpusId'])
5521
+ # Second priority: Semantic Scholar URL from paperId (if no direct URL available)
5522
+ if verified_data and verified_data.get('paperId'):
5523
+ return f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
5528
5524
 
5529
5525
  # Third priority: DOI URL from verified data (more reliable than potentially wrong ArXiv URLs)
5530
5526
  if verified_data and verified_data.get('externalIds', {}).get('DOI'):
@@ -5576,11 +5572,11 @@ class ArxivReferenceChecker:
5576
5572
  # Non-ArXiv URL, probably safe to use
5577
5573
  return reference_url
5578
5574
 
5579
- def _get_fallback_url(self, external_ids):
5575
+ def _get_fallback_url(self, external_ids, verified_data=None):
5580
5576
  """Get fallback URL from external IDs (Semantic Scholar or DOI)"""
5581
- if external_ids.get('CorpusId'):
5582
- from refchecker.utils.url_utils import construct_semantic_scholar_url
5583
- return construct_semantic_scholar_url(external_ids['CorpusId'])
5577
+ # Prefer paperId for Semantic Scholar URLs
5578
+ if verified_data and verified_data.get('paperId'):
5579
+ return f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
5584
5580
  elif external_ids.get('DOI'):
5585
5581
  from refchecker.utils.doi_utils import construct_doi_url
5586
5582
  return construct_doi_url(external_ids['DOI'])
@@ -318,7 +318,23 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
318
318
  }
319
319
  )
320
320
 
321
- return response.text or ""
321
+ # Handle empty responses (content safety filter or other issues)
322
+ if not response.candidates:
323
+ logger.warning("Google API returned empty candidates (possibly content filtered)")
324
+ return ""
325
+
326
+ # Safely access the text
327
+ try:
328
+ return response.text or ""
329
+ except (ValueError, AttributeError) as e:
330
+ # response.text raises ValueError if multiple candidates or no text
331
+ logger.warning(f"Could not get text from Google response: {e}")
332
+ # Try to extract text from first candidate manually
333
+ if response.candidates and hasattr(response.candidates[0], 'content'):
334
+ content = response.candidates[0].content
335
+ if hasattr(content, 'parts') and content.parts:
336
+ return content.parts[0].text or ""
337
+ return ""
322
338
 
323
339
  except Exception as e:
324
340
  logger.error(f"Google API call failed: {e}")
@@ -69,10 +69,30 @@ class PDFProcessor:
69
69
  with open(pdf_path, 'rb') as file:
70
70
  pdf_reader = pypdf.PdfReader(file)
71
71
  text = ""
72
+ failed_pages = []
72
73
 
73
74
  for page_num in range(len(pdf_reader.pages)):
74
- page = pdf_reader.pages[page_num]
75
- text += page.extract_text() + "\n"
75
+ try:
76
+ page = pdf_reader.pages[page_num]
77
+ page_text = page.extract_text()
78
+ if page_text:
79
+ text += page_text + "\n"
80
+ except TypeError as e:
81
+ # Handle pypdf errors like "NumberObject is not iterable"
82
+ # which can occur with malformed PDF pages
83
+ failed_pages.append(page_num + 1) # 1-indexed for logging
84
+ logger.warning(f"Skipping page {page_num + 1} due to PDF parsing error: {e}")
85
+ continue
86
+ except Exception as e:
87
+ failed_pages.append(page_num + 1)
88
+ logger.warning(f"Error extracting text from page {page_num + 1}: {e}")
89
+ continue
90
+
91
+ if failed_pages:
92
+ logger.warning(f"Failed to extract text from {len(failed_pages)} pages: {failed_pages[:10]}{'...' if len(failed_pages) > 10 else ''}")
93
+
94
+ if not text.strip():
95
+ raise ValueError(f"No text could be extracted from any pages of {pdf_path}")
76
96
 
77
97
  # Cache the result
78
98
  self.cache[pdf_path] = text
@@ -42,8 +42,8 @@ def format_three_line_mismatch(mismatch_type: str, left: str, right: str) -> str
42
42
 
43
43
  Example:
44
44
  Title mismatch:
45
- 'Cited Title'
46
- vs: 'Correct Title'
45
+ cited: 'Cited Title'
46
+ actual: 'Correct Title'
47
47
 
48
48
  Args:
49
49
  mismatch_type: The type of mismatch (e.g., "Author 2 mismatch", "Title mismatch")
@@ -57,11 +57,10 @@ def format_three_line_mismatch(mismatch_type: str, left: str, right: str) -> str
57
57
  if not mismatch_type.endswith(":"):
58
58
  mismatch_type = mismatch_type.rstrip() + ":"
59
59
 
60
- # Use fixed indentation for clean, consistent alignment
61
- indent = "" # spaces for content indentation
62
- vs_indent = "" # vs: starts at column 0 for clear visual separation
60
+ # Use fixed indentation for labels, keeping detail column aligned
61
+ label_indent = " " # 7 spaces to indent labels
63
62
 
64
- return f"{mismatch_type}\n{indent}cited: '{left}'\n{vs_indent}actual: '{right}'"
63
+ return f"{mismatch_type}\n{label_indent}cited: {left}\n{label_indent}actual: {right}"
65
64
 
66
65
 
67
66
  def format_title_mismatch(cited_title: str, verified_title: str) -> str:
@@ -187,8 +186,9 @@ def format_missing_venue(correct_venue: str) -> str:
187
186
  """
188
187
  Format a missing venue message with only the actual value.
189
188
  """
190
- # Only show the actual venue; omit the empty cited line
191
- return f"Missing venue: '{correct_venue}'"
189
+ # Only show the actual venue with indented label
190
+ label_indent = " " # 7 spaces to indent labels
191
+ return f"Missing venue:\n{label_indent}actual: {correct_venue}"
192
192
 
193
193
 
194
194
  def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]:
@@ -102,7 +102,9 @@ def construct_semantic_scholar_url(paper_id: str) -> str:
102
102
  Construct a Semantic Scholar URL from a paper ID.
103
103
 
104
104
  Args:
105
- paper_id: Semantic Scholar paper ID
105
+ paper_id: Semantic Scholar paper ID (SHA hash, NOT CorpusId)
106
+ The paperId is the 40-character hex hash that works in web URLs.
107
+ CorpusId (numeric) does NOT work in web URLs.
106
108
 
107
109
  Returns:
108
110
  Full Semantic Scholar URL
@@ -151,7 +153,7 @@ def construct_pubmed_url(pmid: str) -> str:
151
153
  return f"https://pubmed.ncbi.nlm.nih.gov/{clean_pmid}/"
152
154
 
153
155
 
154
- def get_best_available_url(external_ids: dict, open_access_pdf: Optional[str] = None) -> Optional[str]:
156
+ def get_best_available_url(external_ids: dict, open_access_pdf: Optional[str] = None, paper_id: Optional[str] = None) -> Optional[str]:
155
157
  """
156
158
  Get the best available URL from a paper's external IDs and open access information.
157
159
  Priority: Open Access PDF > DOI > ArXiv > Semantic Scholar > OpenAlex > PubMed
@@ -159,6 +161,7 @@ def get_best_available_url(external_ids: dict, open_access_pdf: Optional[str] =
159
161
  Args:
160
162
  external_ids: Dictionary of external identifiers
161
163
  open_access_pdf: Open access PDF URL if available
164
+ paper_id: Semantic Scholar paperId (SHA hash) if available
162
165
 
163
166
  Returns:
164
167
  Best available URL or None if no valid URL found
@@ -175,9 +178,9 @@ def get_best_available_url(external_ids: dict, open_access_pdf: Optional[str] =
175
178
  if external_ids.get('ArXiv'):
176
179
  return construct_arxiv_url(external_ids['ArXiv'])
177
180
 
178
- # Priority 4: Semantic Scholar URL
179
- if external_ids.get('CorpusId'):
180
- return construct_semantic_scholar_url(external_ids['CorpusId'])
181
+ # Priority 4: Semantic Scholar URL (using paperId, not CorpusId)
182
+ if paper_id:
183
+ return construct_semantic_scholar_url(paper_id)
181
184
 
182
185
  # Priority 5: OpenAlex URL
183
186
  if external_ids.get('OpenAlex'):
@@ -1,5 +0,0 @@
1
- """Version information for RefChecker."""
2
-
3
- __version__ = "1.2.55"
4
-
5
- __version__ = "1.2.55"