academic-refchecker 1.2.54-py3-none-any.whl → 1.2.56-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {academic_refchecker-1.2.54.dist-info → academic_refchecker-1.2.56.dist-info}/METADATA +23 -23
  2. academic_refchecker-1.2.56.dist-info/RECORD +49 -0
  3. academic_refchecker-1.2.56.dist-info/entry_points.txt +2 -0
  4. academic_refchecker-1.2.56.dist-info/top_level.txt +1 -0
  5. refchecker/__init__.py +13 -0
  6. refchecker/__main__.py +11 -0
  7. refchecker/__version__.py +5 -0
  8. {checkers → refchecker/checkers}/crossref.py +5 -5
  9. {checkers → refchecker/checkers}/enhanced_hybrid_checker.py +1 -1
  10. {checkers → refchecker/checkers}/github_checker.py +4 -4
  11. {checkers → refchecker/checkers}/local_semantic_scholar.py +10 -11
  12. {checkers → refchecker/checkers}/openalex.py +6 -6
  13. {checkers → refchecker/checkers}/openreview_checker.py +8 -8
  14. {checkers → refchecker/checkers}/pdf_paper_checker.py +1 -1
  15. {checkers → refchecker/checkers}/semantic_scholar.py +47 -33
  16. {checkers → refchecker/checkers}/webpage_checker.py +3 -3
  17. {core → refchecker/core}/parallel_processor.py +6 -6
  18. {core → refchecker/core}/refchecker.py +72 -76
  19. {llm → refchecker/llm}/providers.py +17 -1
  20. {services → refchecker/services}/pdf_processor.py +22 -2
  21. {utils → refchecker/utils}/arxiv_utils.py +3 -3
  22. {utils → refchecker/utils}/biblatex_parser.py +4 -4
  23. {utils → refchecker/utils}/bibliography_utils.py +5 -5
  24. {utils → refchecker/utils}/bibtex_parser.py +5 -5
  25. {utils → refchecker/utils}/error_utils.py +9 -9
  26. {utils → refchecker/utils}/text_utils.py +10 -10
  27. {utils → refchecker/utils}/url_utils.py +8 -5
  28. __version__.py +0 -3
  29. academic_refchecker-1.2.54.dist-info/RECORD +0 -47
  30. academic_refchecker-1.2.54.dist-info/entry_points.txt +0 -2
  31. academic_refchecker-1.2.54.dist-info/top_level.txt +0 -9
  32. {academic_refchecker-1.2.54.dist-info → academic_refchecker-1.2.56.dist-info}/WHEEL +0 -0
  33. {academic_refchecker-1.2.54.dist-info → academic_refchecker-1.2.56.dist-info}/licenses/LICENSE +0 -0
  34. {checkers → refchecker/checkers}/__init__.py +0 -0
  35. {config → refchecker/config}/__init__.py +0 -0
  36. {config → refchecker/config}/logging.conf +0 -0
  37. {config → refchecker/config}/settings.py +0 -0
  38. {core → refchecker/core}/__init__.py +0 -0
  39. {core → refchecker/core}/db_connection_pool.py +0 -0
  40. {database → refchecker/database}/__init__.py +0 -0
  41. {database → refchecker/database}/download_semantic_scholar_db.py +0 -0
  42. {llm → refchecker/llm}/__init__.py +0 -0
  43. {llm → refchecker/llm}/base.py +0 -0
  44. {scripts → refchecker/scripts}/__init__.py +0 -0
  45. {scripts → refchecker/scripts}/start_vllm_server.py +0 -0
  46. {services → refchecker/services}/__init__.py +0 -0
  47. {utils → refchecker/utils}/__init__.py +0 -0
  48. {utils → refchecker/utils}/author_utils.py +0 -0
  49. {utils → refchecker/utils}/config_validator.py +0 -0
  50. {utils → refchecker/utils}/db_utils.py +0 -0
  51. {utils → refchecker/utils}/doi_utils.py +0 -0
  52. {utils → refchecker/utils}/mock_objects.py +0 -0
  53. {utils → refchecker/utils}/unicode_utils.py +0 -0
@@ -7,7 +7,7 @@ from urllib.parse import urlparse, urljoin
  from typing import Dict, Optional, Tuple, List, Any
  from bs4 import BeautifulSoup
  import time
- from utils.text_utils import strip_latex_commands
+ from refchecker.utils.text_utils import strip_latex_commands
 
  logger = logging.getLogger(__name__)
 
@@ -185,7 +185,7 @@ class WebPageChecker:
  # Check title match
  if cited_title and page_title:
  if not self._check_title_match(cited_title, page_title, page_description):
- from utils.error_utils import format_title_mismatch
+ from refchecker.utils.error_utils import format_title_mismatch
  # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
  clean_cited_title = strip_latex_commands(cited_title)
  errors.append({
@@ -207,7 +207,7 @@ class WebPageChecker:
  if cited_authors:
  author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
  if not self._check_author_match(author_str, site_info, web_url):
- from utils.error_utils import format_three_line_mismatch
+ from refchecker.utils.error_utils import format_three_line_mismatch
  left = author_str
  right = site_info.get('organization', 'unknown')
  details = format_three_line_mismatch("Author/organization mismatch", left, right)
@@ -13,7 +13,7 @@ from threading import Thread, Lock
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from dataclasses import dataclass
  from typing import List, Dict, Any, Optional, Tuple, Callable
- from utils.text_utils import deduplicate_urls
+ from refchecker.utils.text_utils import deduplicate_urls
 
  logger = logging.getLogger(__name__)
 
@@ -277,15 +277,15 @@ class ParallelReferenceProcessor:
  # Print reference info in the same format as sequential mode
  raw_title = reference.get('title', 'Untitled')
  # Clean LaTeX commands from title for display
- from utils.text_utils import strip_latex_commands
+ from refchecker.utils.text_utils import strip_latex_commands
  title = strip_latex_commands(raw_title)
- from utils.text_utils import format_authors_for_display
+ from refchecker.utils.text_utils import format_authors_for_display
  authors = format_authors_for_display(reference.get('authors', []))
  year = reference.get('year', '')
  # Get venue from either 'venue' or 'journal' field and clean it up
  venue = reference.get('venue', '') or reference.get('journal', '')
  if venue:
- from utils.error_utils import clean_venue_for_comparison
+ from refchecker.utils.error_utils import clean_venue_for_comparison
  venue = clean_venue_for_comparison(venue)
  url = reference.get('url', '')
  doi = reference.get('doi', '')
@@ -331,7 +331,7 @@ class ParallelReferenceProcessor:
 
  # Show DOI URL if available and different from what's already shown
  if external_ids.get('DOI'):
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  doi_url = construct_doi_url(external_ids['DOI'])
  if doi_url != verified_url_to_show and doi_url != url:
  print(f" DOI URL: {doi_url}")
@@ -355,7 +355,7 @@ class ParallelReferenceProcessor:
  error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type')
  error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', 'Unknown error')
 
- from utils.error_utils import print_labeled_multiline
+ from refchecker.utils.error_utils import print_labeled_multiline
 
  if error_type == 'arxiv_id':
  # Keep existing style for arXiv ID errors
@@ -11,7 +11,7 @@ For arXiv references, it uses the arXiv API to verify metadata.
  For non-arXiv references, it uses the local Semantic Scholar database for verification.
 
  Usage:
- python refchecker.py --paper PAPER_SPEC [--db-path PATH] [--output-file [PATH]] [--debug]
+ python run_refchecker.py --paper PAPER_SPEC [--db-path PATH] [--output-file [PATH]] [--debug]
 
  Options:
  --paper PAPER_SPEC Validate a specific paper by:
@@ -44,23 +44,23 @@ import argparse
  import sys
  import json
  import random
- from checkers.local_semantic_scholar import LocalNonArxivReferenceChecker
- from utils.text_utils import (clean_author_name, clean_title, clean_title_basic,
+ from refchecker.checkers.local_semantic_scholar import LocalNonArxivReferenceChecker
+ from refchecker.utils.text_utils import (clean_author_name, clean_title, clean_title_basic,
  extract_arxiv_id_from_url, normalize_text as common_normalize_text,
  detect_latex_bibliography_format, extract_latex_references,
  detect_standard_acm_natbib_format, strip_latex_commands,
  format_corrected_reference, is_name_match, enhanced_name_match,
  calculate_title_similarity, normalize_arxiv_url, deduplicate_urls,
  compare_authors)
- from utils.config_validator import ConfigValidator
- from services.pdf_processor import PDFProcessor
- from checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker
- from core.parallel_processor import ParallelReferenceProcessor
- from core.db_connection_pool import ThreadSafeLocalChecker
+ from refchecker.utils.config_validator import ConfigValidator
+ from refchecker.services.pdf_processor import PDFProcessor
+ from refchecker.checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker
+ from refchecker.core.parallel_processor import ParallelReferenceProcessor
+ from refchecker.core.db_connection_pool import ThreadSafeLocalChecker
 
  # Import version
- from __version__ import __version__
- from llm.base import create_llm_provider, ReferenceExtractor
+ from refchecker.__version__ import __version__
+ from refchecker.llm.base import create_llm_provider, ReferenceExtractor
 
  def get_llm_api_key_interactive(provider: str) -> str:
  """
@@ -453,7 +453,7 @@ class ArxivReferenceChecker:
  def extract_arxiv_id_from_url(self, url):
  """
  Extract ArXiv ID from a URL or text containing ArXiv reference.
- Uses the common extraction function from utils.url_utils.
+ Uses the common extraction function from refchecker.utils.url_utils.
  """
  return extract_arxiv_id_from_url(url)
 
@@ -1189,7 +1189,7 @@ class ArxivReferenceChecker:
  last_author = and_parts[1].strip()
 
  # Split the main list by commas, handling initials properly
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials
  authors = parse_authors_with_initials(main_list)
 
  # Add the last author
@@ -1197,7 +1197,7 @@ class ArxivReferenceChecker:
  authors.append(last_author)
  else:
  # No "and" found, use smart comma parsing for initials
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials
  authors = parse_authors_with_initials(authors_text)
 
  # Clean up each author name
@@ -1679,7 +1679,7 @@ class ArxivReferenceChecker:
  if not title and not authors_text:
  # Try to detect a list of names
  if re.match(r'^[A-Z][a-zA-Z\-\.]+(,\s*[A-Z][a-zA-Z\-\.]+)+$', cleaned_ref):
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials
  authors = parse_authors_with_initials(cleaned_ref)
  return authors, ""
 
@@ -1693,7 +1693,7 @@ class ArxivReferenceChecker:
 
  # Final fallback: if the reference is just a list of names, return as authors
  if not title and cleaned_ref and re.match(r'^[A-Z][a-zA-Z\-\.]+(,\s*[A-Z][a-zA-Z\-\.]+)+$', cleaned_ref):
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials
  authors = parse_authors_with_initials(cleaned_ref)
  return authors, ""
 
@@ -1901,7 +1901,7 @@ class ArxivReferenceChecker:
  db_title = self.non_arxiv_checker.normalize_paper_title(paper_data.get('title'))
 
  if normalized_title != db_title:
- from utils.error_utils import format_title_mismatch
+ from refchecker.utils.error_utils import format_title_mismatch
  # Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
  clean_cited_title = strip_latex_commands(title)
  logger.debug(f"DB Verification: Title mismatch - cited: '{title}', actual: '{paper_data.get('title')}'")
@@ -1940,7 +1940,7 @@ class ArxivReferenceChecker:
  # Only flag as mismatch if the difference is greater than tolerance
  if abs(year - paper_year) > year_tolerance:
  logger.debug(f"DB Verification: Year mismatch - cited: {year}, actual: {paper_year}")
- from utils.error_utils import format_year_mismatch
+ from refchecker.utils.error_utils import format_year_mismatch
  errors.append({
  'warning_type': 'year',
  'warning_details': format_year_mismatch(year, paper_year),
@@ -1949,7 +1949,7 @@ class ArxivReferenceChecker:
 
  # Verify DOI
  if doi and external_ids.get('DOI'):
- from utils.doi_utils import compare_dois, normalize_doi
+ from refchecker.utils.doi_utils import compare_dois, normalize_doi
 
  # Use proper DOI comparison first
  if not compare_dois(doi, external_ids['DOI']):
@@ -1962,7 +1962,7 @@ class ArxivReferenceChecker:
  # Only flag as error if it's not a reasonable partial match
  if not actual_doi_normalized.startswith(cited_doi_normalized.rstrip('.')):
  logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
- from utils.error_utils import format_doi_mismatch
+ from refchecker.utils.error_utils import format_doi_mismatch
  errors.append({
  'error_type': 'doi',
  'error_details': format_doi_mismatch(doi, external_ids['DOI']),
@@ -2058,7 +2058,7 @@ class ArxivReferenceChecker:
  elif error.get('error_type') == 'year' or error.get('warning_type') == 'year':
  formatted_error['ref_year_correct'] = error.get('ref_year_correct', '')
  elif error.get('error_type') == 'doi':
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  formatted_error['ref_url_correct'] = construct_doi_url(error.get('ref_doi_correct', ''))
  elif error.get('info_type') == 'url':
  formatted_error['ref_url_correct'] = error.get('ref_url_correct', '')
@@ -2089,14 +2089,12 @@ class ArxivReferenceChecker:
  if correct_paper_data:
  logger.debug(f"Database mode: Found correct paper: '{correct_paper_data.get('title', '')}'")
  # Use the CORRECT paper's Semantic Scholar URL
- correct_external_ids = correct_paper_data.get('externalIds', {})
- if correct_external_ids.get('CorpusId'):
- from utils.url_utils import construct_semantic_scholar_url
- correct_paper_url = construct_semantic_scholar_url(correct_external_ids['CorpusId'])
+ if correct_paper_data.get('paperId'):
+ correct_paper_url = f"https://www.semanticscholar.org/paper/{correct_paper_data['paperId']}"
  paper_url = correct_paper_url # Update the main URL
  logger.debug(f"Database mode: Using correct paper's Semantic Scholar URL for ArXiv ID mismatch: {paper_url}")
  else:
- logger.debug("Database mode: Correct paper found but no CorpusId available")
+ logger.debug("Database mode: Correct paper found but no paperId available")
  else:
  logger.debug("Database mode: Could not find correct paper by title/authors")
  except Exception as e:
@@ -2117,12 +2115,11 @@ class ArxivReferenceChecker:
  formatted_errors.append(formatted_error)
 
  # Fallback to wrong paper's URL if we couldn't find the correct one
- if not correct_paper_data and verified_data and verified_data.get('externalIds', {}).get('CorpusId'):
- from utils.url_utils import construct_semantic_scholar_url
- paper_url = construct_semantic_scholar_url(verified_data['externalIds']['CorpusId'])
+ if not correct_paper_data and verified_data and verified_data.get('paperId'):
+ paper_url = f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
  logger.debug(f"Database mode: Fallback to wrong paper's Semantic Scholar URL: {paper_url}")
  elif not correct_paper_data:
- logger.debug(f"Database mode: No CorpusId available for Semantic Scholar URL construction. verified_data keys: {list(verified_data.keys()) if verified_data else 'None'}")
+ logger.debug(f"Database mode: No paperId available for Semantic Scholar URL construction. verified_data keys: {list(verified_data.keys()) if verified_data else 'None'}")
 
  return formatted_errors if formatted_errors else None, paper_url, verified_data
  else:
@@ -2184,7 +2181,7 @@ class ArxivReferenceChecker:
  logger.debug(f"Detected GitHub URL, using GitHub verification: {github_url}")
 
  # Import and use GitHub checker
- from checkers.github_checker import GitHubChecker
+ from refchecker.checkers.github_checker import GitHubChecker
  github_checker = GitHubChecker()
  verified_data, errors, paper_url = github_checker.verify_reference(reference)
 
@@ -2244,7 +2241,7 @@ class ArxivReferenceChecker:
  return None # No URL to check
 
  # Import and use web page checker
- from checkers.webpage_checker import WebPageChecker
+ from refchecker.checkers.webpage_checker import WebPageChecker
  webpage_checker = WebPageChecker()
 
  if not webpage_checker.is_web_page_url(web_url):
@@ -2308,7 +2305,7 @@ class ArxivReferenceChecker:
  return None, [{"error_type": "unverified", "error_details": "Reference could not be verified"}], None
 
  # First try PDF paper checker if URL appears to be a PDF
- from checkers.pdf_paper_checker import PDFPaperChecker
+ from refchecker.checkers.pdf_paper_checker import PDFPaperChecker
  pdf_checker = PDFPaperChecker()
 
  if pdf_checker.can_check_reference(reference):
@@ -2325,7 +2322,7 @@ class ArxivReferenceChecker:
  logger.debug(f"PDF verification error, falling back to web page verification")
 
  # Fall back to web page checker
- from checkers.pdf_paper_checker import PDFPaperChecker
+ from refchecker.checkers.pdf_paper_checker import PDFPaperChecker
  pdf_checker = PDFPaperChecker()
 
  if pdf_checker.can_check_reference(reference):
@@ -2342,7 +2339,7 @@ class ArxivReferenceChecker:
  logger.debug(f"PDF verification error, falling back to web page verification")
 
  # Fall back to web page checker
- from checkers.webpage_checker import WebPageChecker
+ from refchecker.checkers.webpage_checker import WebPageChecker
  webpage_checker = WebPageChecker()
 
  try:
@@ -2463,7 +2460,7 @@ class ArxivReferenceChecker:
  elif error.get('error_type') == 'year' or error.get('warning_type') == 'year':
  formatted_error['ref_year_correct'] = error.get('ref_year_correct', '')
  elif error.get('error_type') == 'doi':
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  formatted_error['ref_url_correct'] = construct_doi_url(error.get('ref_doi_correct', ''))
 
  formatted_errors.append(formatted_error)
@@ -2753,7 +2750,7 @@ class ArxivReferenceChecker:
  corrected_data = self._extract_corrected_data_from_error(consolidated_entry, verified_data)
 
  # Generate all three formats for user convenience
- from utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
+ from refchecker.utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
  plaintext_format = format_corrected_plaintext(reference, corrected_data, consolidated_entry)
  bibtex_format = format_corrected_bibtex(reference, corrected_data, consolidated_entry)
  bibitem_format = format_corrected_bibitem(reference, corrected_data, consolidated_entry)
@@ -2824,7 +2821,7 @@ class ArxivReferenceChecker:
  corrected_data = self._extract_corrected_data_from_error(error, verified_data)
 
  # Generate all three formats
- from utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
+ from refchecker.utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
  plaintext_format = format_corrected_plaintext(reference, corrected_data, error_entry)
  bibtex_format = format_corrected_bibtex(reference, corrected_data, error_entry)
  bibitem_format = format_corrected_bibitem(reference, corrected_data, error_entry)
@@ -3326,7 +3323,7 @@ class ArxivReferenceChecker:
 
  if authors:
  # Limit to first 3 authors for readability
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials
  author_list = parse_authors_with_initials(authors)
  if len(author_list) > 3:
  formatted += ", ".join(author_list[:3]) + " et al."
@@ -3568,7 +3565,7 @@ class ArxivReferenceChecker:
  return self._parse_standard_acm_natbib_references(bibliography_text)
 
  # Check if this is BibTeX format
- from utils.bibtex_parser import detect_bibtex_format
+ from refchecker.utils.bibtex_parser import detect_bibtex_format
  if detect_bibtex_format(bibliography_text):
  logger.info("Detected BibTeX format, using BibTeX parser")
  self.used_regex_extraction = True
@@ -3576,7 +3573,7 @@ class ArxivReferenceChecker:
  return self._parse_bibtex_references(bibliography_text)
 
  # Check if this is biblatex format
- from utils.biblatex_parser import detect_biblatex_format
+ from refchecker.utils.biblatex_parser import detect_biblatex_format
  if detect_biblatex_format(bibliography_text):
  logger.debug("Detected biblatex format")
  self.used_regex_extraction = True
@@ -3686,7 +3683,7 @@ class ArxivReferenceChecker:
  if author_field_match:
  author_content = author_field_match.group(1)
  # Find all \bibinfo{person}{Name} entries using balanced brace extraction
- from utils.text_utils import extract_bibinfo_person_content
+ from refchecker.utils.text_utils import extract_bibinfo_person_content
  person_matches = extract_bibinfo_person_content(author_content)
  if person_matches:
  authors = []
@@ -3700,7 +3697,7 @@ class ArxivReferenceChecker:
  ref['authors'] = authors
 
  # Import balanced brace extraction function
- from utils.text_utils import extract_bibinfo_field_content
+ from refchecker.utils.text_utils import extract_bibinfo_field_content
 
  # Extract title from \bibinfo{title}{Title} using balanced brace extraction
  title_content = extract_bibinfo_field_content(content, 'title')
@@ -3758,7 +3755,7 @@ class ArxivReferenceChecker:
  author_part_clean = strip_latex_commands(author_part).strip()
  if author_part_clean and not author_part_clean.startswith('\\'):
  # Parse author names using the robust author parsing function
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials
  author_names = parse_authors_with_initials(author_part_clean)
 
  # Clean up author names
@@ -3812,14 +3809,14 @@ class ArxivReferenceChecker:
  self.used_regex_extraction = True
 
  # Check if this is BibTeX format first
- from utils.bibtex_parser import detect_bibtex_format
+ from refchecker.utils.bibtex_parser import detect_bibtex_format
  if detect_bibtex_format(bibliography_text):
  logger.debug("Detected BibTeX format, using BibTeX-specific parsing")
  # BibTeX parsing is robust, so we don't set used_unreliable_extraction
  return self._parse_bibtex_references(bibliography_text)
 
  # Check if this is biblatex format
- from utils.biblatex_parser import detect_biblatex_format
+ from refchecker.utils.biblatex_parser import detect_biblatex_format
  if detect_biblatex_format(bibliography_text):
  logger.debug("Detected biblatex format, using biblatex-specific parsing")
  # biblatex parsing is also robust, so we don't set used_unreliable_extraction
@@ -4105,7 +4102,7 @@ class ArxivReferenceChecker:
  if doi_match:
  doi = clean_doi(doi_match.group(1))
  if doi:
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  url = construct_doi_url(doi)
  else:
  url = ''
@@ -4265,7 +4262,7 @@ class ArxivReferenceChecker:
  List of structured reference dictionaries
  """
  # Use the dedicated BibTeX parser
- from utils.bibtex_parser import parse_bibtex_references
+ from refchecker.utils.bibtex_parser import parse_bibtex_references
 
  # Extract references using the BibTeX parser
  references = parse_bibtex_references(bibliography_text)
@@ -4284,7 +4281,7 @@ class ArxivReferenceChecker:
  List of structured reference dictionaries
  """
  # Use the dedicated biblatex parser
- from utils.biblatex_parser import parse_biblatex_references
+ from refchecker.utils.biblatex_parser import parse_biblatex_references
 
  # Extract references using the biblatex parser
  references = parse_biblatex_references(bibliography_text)
@@ -4484,7 +4481,7 @@ class ArxivReferenceChecker:
  return True
 
  # Also check if authors have significant overlap (at least 50% of the shorter author list)
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials
 
  if '*' in seg1['author']:
  author1_parts = seg1['author'].split('*')
@@ -4553,7 +4550,7 @@ class ArxivReferenceChecker:
  parsed_authors = []
  for author in raw_authors:
  # Clean up the author entry and strip LaTeX commands
- from utils.text_utils import strip_latex_commands
+ from refchecker.utils.text_utils import strip_latex_commands
  author_cleaned = strip_latex_commands(author.rstrip('.'))
 
  # Skip special indicators like "others", "et al", etc.
@@ -4571,14 +4568,14 @@ class ArxivReferenceChecker:
  return parsed_authors
  else:
  # Fallback to original logic for backward compatibility
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials
 
  cleaned_text = author_text.rstrip('.')
  authors = parse_authors_with_initials(cleaned_text)
  authors = [a.rstrip('.').strip() for a in authors if a.strip()]
 
  # Handle "others" and similar indicators in fallback logic too
- from utils.text_utils import strip_latex_commands
+ from refchecker.utils.text_utils import strip_latex_commands
  processed_authors = []
  for author in authors:
  # Apply LaTeX cleaning to each author
@@ -4706,7 +4703,7 @@ class ArxivReferenceChecker:
  if '*' in doi:
  doi = doi.split('*')[0]
 
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  url = construct_doi_url(doi)
  break
 
@@ -4714,7 +4711,7 @@ class ArxivReferenceChecker:
  if not url and not arxiv_url:
  url_match = re.search(r'https?://(?!arxiv\.org)[^\s,]+', ref_text)
  if url_match:
- from utils.url_utils import clean_url_punctuation
+ from refchecker.utils.url_utils import clean_url_punctuation
  url = clean_url_punctuation(url_match.group(0))
 
  # Extract year - will be determined from structured parts below
@@ -4808,7 +4805,7 @@ class ArxivReferenceChecker:
  if 'arxiv' in url_part.lower():
  arxiv_url = url_part
  else:
- from utils.url_utils import clean_url_punctuation
+ from refchecker.utils.url_utils import clean_url_punctuation
  url = clean_url_punctuation(url_part)
  else:
  # Fallback for other formats or malformed input
@@ -4829,7 +4826,7 @@ class ArxivReferenceChecker:
  if 'arxiv' in url_part.lower():
  arxiv_url = url_part
  else:
- from utils.url_utils import clean_url_punctuation
+ from refchecker.utils.url_utils import clean_url_punctuation
  url = clean_url_punctuation(url_part)
  if len(parts) > 5:
  # For cases with more than 5 parts, combine the remaining parts as additional info
@@ -4966,7 +4963,7 @@ class ArxivReferenceChecker:
  if '*' in doi:
  doi = doi.split('*')[0]
 
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  url = construct_doi_url(doi)
  break
 
@@ -4974,7 +4971,7 @@ class ArxivReferenceChecker:
  if not url and not arxiv_url:
  url_match = re.search(r'https?://(?!arxiv\.org)[^\s,\)]+', ref_text)
  if url_match:
- from utils.url_utils import clean_url_punctuation
+ from refchecker.utils.url_utils import clean_url_punctuation
  url = clean_url_punctuation(url_match.group(0))
 
  # Extract year
@@ -5023,7 +5020,7 @@ class ArxivReferenceChecker:
  logger.debug(f"Extracting bibliography for paper {paper_id}: {paper.title}")
 
  # Check if we can get BibTeX content for this paper (ArXiv or other sources)
- from utils.arxiv_utils import get_bibtex_content
+ from refchecker.utils.arxiv_utils import get_bibtex_content
  bibtex_content = get_bibtex_content(paper)
  if bibtex_content:
  logger.debug(f"Found BibTeX content for {paper_id}, using structured bibliography")
@@ -5047,7 +5044,7 @@ class ArxivReferenceChecker:
  references = extract_latex_references(bibtex_content, None)
 
  # Validate the parsed references and fallback to LLM if needed
- from utils.text_utils import validate_parsed_references
+ from refchecker.utils.text_utils import validate_parsed_references
  validation = validate_parsed_references(references)
 
  if not validation['is_valid']:
@@ -5372,9 +5369,9 @@ class ArxivReferenceChecker:
  # Print reference info in non-debug mode (improved formatting)
  raw_title = reference.get('title', 'Untitled')
  # Clean LaTeX commands from title for display
- from utils.text_utils import strip_latex_commands
+ from refchecker.utils.text_utils import strip_latex_commands
  title = strip_latex_commands(raw_title)
- from utils.text_utils import format_authors_for_display
+ from refchecker.utils.text_utils import format_authors_for_display
  authors = format_authors_for_display(reference.get('authors', []))
  year = reference.get('year', '')
  venue = reference.get('venue', '') or reference.get('journal', '')
@@ -5424,7 +5421,7 @@ class ArxivReferenceChecker:
 
  # Show DOI URL if available and different from what's already shown
  if external_ids.get('DOI'):
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  doi_url = construct_doi_url(external_ids['DOI'])
  if doi_url != verified_url_to_show and doi_url != url:
  print(f" DOI URL: {doi_url}")
@@ -5521,21 +5518,20 @@ class ArxivReferenceChecker:
  if verified_data and verified_data.get('url') and 'arxiv.org' not in verified_data['url']:
  return verified_data['url']
 
- # Second priority: Semantic Scholar URL from CorpusId (if no direct URL available)
- if verified_data and verified_data.get('externalIds', {}).get('CorpusId'):
- from utils.url_utils import construct_semantic_scholar_url
- return construct_semantic_scholar_url(verified_data['externalIds']['CorpusId'])
+ # Second priority: Semantic Scholar URL from paperId (if no direct URL available)
+ if verified_data and verified_data.get('paperId'):
+ return f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
 
  # Third priority: DOI URL from verified data (more reliable than potentially wrong ArXiv URLs)
  if verified_data and verified_data.get('externalIds', {}).get('DOI'):
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  return construct_doi_url(verified_data['externalIds']['DOI'])
 
  # Fourth priority: ArXiv URL from verified data (but only if there's no ArXiv ID error)
  if verified_data and verified_data.get('externalIds', {}).get('ArXiv'):
  # Only show ArXiv URL as verified URL if there's no ArXiv ID mismatch
  if not self._has_arxiv_id_error(errors):
- from utils.url_utils import construct_arxiv_url
+ from refchecker.utils.url_utils import construct_arxiv_url
  correct_arxiv_id = verified_data['externalIds']['ArXiv']
  return construct_arxiv_url(correct_arxiv_id)
 
@@ -5556,7 +5552,7 @@ class ArxivReferenceChecker:
  external_ids = verified_data.get('externalIds', {})
  if external_ids.get('ArXiv'):
  # Extract ArXiv ID from the URL using shared utility
- from utils.url_utils import extract_arxiv_id_from_url
+ from refchecker.utils.url_utils import extract_arxiv_id_from_url
  url_arxiv_id = extract_arxiv_id_from_url(reference_url)
  if url_arxiv_id:
  correct_arxiv_id = external_ids['ArXiv']
@@ -5576,13 +5572,13 @@ class ArxivReferenceChecker:
  # Non-ArXiv URL, probably safe to use
  return reference_url
 
- def _get_fallback_url(self, external_ids):
+ def _get_fallback_url(self, external_ids, verified_data=None):
  """Get fallback URL from external IDs (Semantic Scholar or DOI)"""
- if external_ids.get('CorpusId'):
- from utils.url_utils import construct_semantic_scholar_url
- return construct_semantic_scholar_url(external_ids['CorpusId'])
+ # Prefer paperId for Semantic Scholar URLs
+ if verified_data and verified_data.get('paperId'):
+ return f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
  elif external_ids.get('DOI'):
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  return construct_doi_url(external_ids['DOI'])
  return None
 
@@ -5660,7 +5656,7 @@ class ArxivReferenceChecker:
  error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type')
  error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', 'Unknown error')
 
- from utils.error_utils import print_labeled_multiline
+ from refchecker.utils.error_utils import print_labeled_multiline
 
  if error_type == 'arxiv_id':
  print(f" ❌ {error_details}")
@@ -318,7 +318,23 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
  }
  )
 
- return response.text or ""
+ # Handle empty responses (content safety filter or other issues)
+ if not response.candidates:
+ logger.warning("Google API returned empty candidates (possibly content filtered)")
+ return ""
+
+ # Safely access the text
+ try:
+ return response.text or ""
+ except (ValueError, AttributeError) as e:
+ # response.text raises ValueError if multiple candidates or no text
+ logger.warning(f"Could not get text from Google response: {e}")
+ # Try to extract text from first candidate manually
+ if response.candidates and hasattr(response.candidates[0], 'content'):
+ content = response.candidates[0].content
+ if hasattr(content, 'parts') and content.parts:
+ return content.parts[0].text or ""
+ return ""
 
  except Exception as e:
  logger.error(f"Google API call failed: {e}")
@@ -69,10 +69,30 @@ class PDFProcessor:
  with open(pdf_path, 'rb') as file:
  pdf_reader = pypdf.PdfReader(file)
  text = ""
+ failed_pages = []
 
  for page_num in range(len(pdf_reader.pages)):
- page = pdf_reader.pages[page_num]
- text += page.extract_text() + "\n"
+ try:
+ page = pdf_reader.pages[page_num]
+ page_text = page.extract_text()
+ if page_text:
+ text += page_text + "\n"
+ except TypeError as e:
+ # Handle pypdf errors like "NumberObject is not iterable"
+ # which can occur with malformed PDF pages
+ failed_pages.append(page_num + 1) # 1-indexed for logging
+ logger.warning(f"Skipping page {page_num + 1} due to PDF parsing error: {e}")
+ continue
+ except Exception as e:
+ failed_pages.append(page_num + 1)
+ logger.warning(f"Error extracting text from page {page_num + 1}: {e}")
+ continue
+
+ if failed_pages:
+ logger.warning(f"Failed to extract text from {len(failed_pages)} pages: {failed_pages[:10]}{'...' if len(failed_pages) > 10 else ''}")
+
+ if not text.strip():
+ raise ValueError(f"No text could be extracted from any pages of {pdf_path}")
 
  # Cache the result
  self.cache[pdf_path] = text