academic-refchecker 1.2.53-py3-none-any.whl → 1.2.55-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {academic_refchecker-1.2.53.dist-info → academic_refchecker-1.2.55.dist-info}/METADATA +23 -23
  2. academic_refchecker-1.2.55.dist-info/RECORD +49 -0
  3. academic_refchecker-1.2.55.dist-info/entry_points.txt +2 -0
  4. academic_refchecker-1.2.55.dist-info/top_level.txt +1 -0
  5. refchecker/__init__.py +13 -0
  6. refchecker/__main__.py +11 -0
  7. refchecker/__version__.py +5 -0
  8. {checkers → refchecker/checkers}/crossref.py +5 -5
  9. {checkers → refchecker/checkers}/enhanced_hybrid_checker.py +1 -1
  10. {checkers → refchecker/checkers}/github_checker.py +4 -4
  11. {checkers → refchecker/checkers}/local_semantic_scholar.py +7 -7
  12. {checkers → refchecker/checkers}/openalex.py +6 -6
  13. {checkers → refchecker/checkers}/openreview_checker.py +8 -8
  14. {checkers → refchecker/checkers}/pdf_paper_checker.py +1 -1
  15. {checkers → refchecker/checkers}/semantic_scholar.py +10 -10
  16. {checkers → refchecker/checkers}/webpage_checker.py +3 -3
  17. {core → refchecker/core}/parallel_processor.py +6 -6
  18. {core → refchecker/core}/refchecker.py +63 -63
  19. {utils → refchecker/utils}/arxiv_utils.py +3 -3
  20. {utils → refchecker/utils}/biblatex_parser.py +4 -4
  21. {utils → refchecker/utils}/bibliography_utils.py +5 -5
  22. {utils → refchecker/utils}/bibtex_parser.py +5 -5
  23. {utils → refchecker/utils}/error_utils.py +1 -1
  24. {utils → refchecker/utils}/text_utils.py +62 -13
  25. __version__.py +0 -3
  26. academic_refchecker-1.2.53.dist-info/RECORD +0 -47
  27. academic_refchecker-1.2.53.dist-info/entry_points.txt +0 -2
  28. academic_refchecker-1.2.53.dist-info/top_level.txt +0 -9
  29. {academic_refchecker-1.2.53.dist-info → academic_refchecker-1.2.55.dist-info}/WHEEL +0 -0
  30. {academic_refchecker-1.2.53.dist-info → academic_refchecker-1.2.55.dist-info}/licenses/LICENSE +0 -0
  31. {checkers → refchecker/checkers}/__init__.py +0 -0
  32. {config → refchecker/config}/__init__.py +0 -0
  33. {config → refchecker/config}/logging.conf +0 -0
  34. {config → refchecker/config}/settings.py +0 -0
  35. {core → refchecker/core}/__init__.py +0 -0
  36. {core → refchecker/core}/db_connection_pool.py +0 -0
  37. {database → refchecker/database}/__init__.py +0 -0
  38. {database → refchecker/database}/download_semantic_scholar_db.py +0 -0
  39. {llm → refchecker/llm}/__init__.py +0 -0
  40. {llm → refchecker/llm}/base.py +0 -0
  41. {llm → refchecker/llm}/providers.py +0 -0
  42. {scripts → refchecker/scripts}/__init__.py +0 -0
  43. {scripts → refchecker/scripts}/start_vllm_server.py +0 -0
  44. {services → refchecker/services}/__init__.py +0 -0
  45. {services → refchecker/services}/pdf_processor.py +0 -0
  46. {utils → refchecker/utils}/__init__.py +0 -0
  47. {utils → refchecker/utils}/author_utils.py +0 -0
  48. {utils → refchecker/utils}/config_validator.py +0 -0
  49. {utils → refchecker/utils}/db_utils.py +0 -0
  50. {utils → refchecker/utils}/doi_utils.py +0 -0
  51. {utils → refchecker/utils}/mock_objects.py +0 -0
  52. {utils → refchecker/utils}/unicode_utils.py +0 -0
  53. {utils → refchecker/utils}/url_utils.py +0 -0
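The substantive change between 1.2.53 and 1.2.55 is a packaging restructure: the modules that previously sat at the top level of site-packages (checkers, config, core, database, llm, scripts, services, utils, plus __version__.py) now live under a single refchecker package, and the new package root gains __init__.py, __main__.py, and __version__.py. The hunks below are almost entirely the mechanical import rewrites this move requires. For downstream code the practical effect is a one-prefix change; a minimal sketch, using construct_doi_url because it appears in the diff (the argument and printed result are illustrative):

    # 1.2.53 flat layout (old, now removed):
    #   from utils.doi_utils import construct_doi_url
    # 1.2.55 namespaced layout:
    from refchecker.utils.doi_utils import construct_doi_url

    # Presumably returns a resolver URL for the DOI; the exact format is not shown in the diff.
    print(construct_doi_url("10.1234/example"))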
{core → refchecker/core}/refchecker.py
@@ -11,7 +11,7 @@ For arXiv references, it uses the arXiv API to verify metadata.
  For non-arXiv references, it uses the local Semantic Scholar database for verification.

  Usage:
- python refchecker.py --paper PAPER_SPEC [--db-path PATH] [--output-file [PATH]] [--debug]
+ python run_refchecker.py --paper PAPER_SPEC [--db-path PATH] [--output-file [PATH]] [--debug]

  Options:
  --paper PAPER_SPEC Validate a specific paper by:
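The usage line now names run_refchecker.py instead of refchecker.py. Together with the added refchecker/__main__.py and the new entry_points.txt, this suggests, though the diff body does not show it, that the tool can also be launched as a module. A sketch under that assumption (the arXiv-ID PAPER_SPEC is illustrative):

    # Assumption: refchecker/__main__.py makes the package runnable via -m.
    import subprocess
    import sys

    subprocess.run(
        [sys.executable, "-m", "refchecker", "--paper", "2301.00001", "--debug"],
        check=True,
    )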
@@ -44,23 +44,23 @@ import argparse
  import sys
  import json
  import random
- from checkers.local_semantic_scholar import LocalNonArxivReferenceChecker
- from utils.text_utils import (clean_author_name, clean_title, clean_title_basic,
+ from refchecker.checkers.local_semantic_scholar import LocalNonArxivReferenceChecker
+ from refchecker.utils.text_utils import (clean_author_name, clean_title, clean_title_basic,
  extract_arxiv_id_from_url, normalize_text as common_normalize_text,
  detect_latex_bibliography_format, extract_latex_references,
  detect_standard_acm_natbib_format, strip_latex_commands,
  format_corrected_reference, is_name_match, enhanced_name_match,
  calculate_title_similarity, normalize_arxiv_url, deduplicate_urls,
  compare_authors)
- from utils.config_validator import ConfigValidator
- from services.pdf_processor import PDFProcessor
- from checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker
- from core.parallel_processor import ParallelReferenceProcessor
- from core.db_connection_pool import ThreadSafeLocalChecker
+ from refchecker.utils.config_validator import ConfigValidator
+ from refchecker.services.pdf_processor import PDFProcessor
+ from refchecker.checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker
+ from refchecker.core.parallel_processor import ParallelReferenceProcessor
+ from refchecker.core.db_connection_pool import ThreadSafeLocalChecker

  # Import version
- from __version__ import __version__
- from llm.base import create_llm_provider, ReferenceExtractor
+ from refchecker.__version__ import __version__
+ from refchecker.llm.base import create_llm_provider, ReferenceExtractor

  def get_llm_api_key_interactive(provider: str) -> str:
  """
@@ -453,7 +453,7 @@ class ArxivReferenceChecker:
  def extract_arxiv_id_from_url(self, url):
  """
  Extract ArXiv ID from a URL or text containing ArXiv reference.
- Uses the common extraction function from utils.url_utils.
+ Uses the common extraction function from refchecker.utils.url_utils.
  """
  return extract_arxiv_id_from_url(url)

@@ -1189,7 +1189,7 @@ class ArxivReferenceChecker:
  last_author = and_parts[1].strip()

  # Split the main list by commas, handling initials properly
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials
  authors = parse_authors_with_initials(main_list)

  # Add the last author
@@ -1197,7 +1197,7 @@ class ArxivReferenceChecker:
  authors.append(last_author)
  else:
  # No "and" found, use smart comma parsing for initials
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials
  authors = parse_authors_with_initials(authors_text)

  # Clean up each author name
@@ -1679,7 +1679,7 @@ class ArxivReferenceChecker:
  if not title and not authors_text:
  # Try to detect a list of names
  if re.match(r'^[A-Z][a-zA-Z\-\.]+(,\s*[A-Z][a-zA-Z\-\.]+)+$', cleaned_ref):
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials
  authors = parse_authors_with_initials(cleaned_ref)
  return authors, ""

@@ -1693,7 +1693,7 @@ class ArxivReferenceChecker:

  # Final fallback: if the reference is just a list of names, return as authors
  if not title and cleaned_ref and re.match(r'^[A-Z][a-zA-Z\-\.]+(,\s*[A-Z][a-zA-Z\-\.]+)+$', cleaned_ref):
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials
  authors = parse_authors_with_initials(cleaned_ref)
  return authors, ""

@@ -1901,7 +1901,7 @@ class ArxivReferenceChecker:
  db_title = self.non_arxiv_checker.normalize_paper_title(paper_data.get('title'))

  if normalized_title != db_title:
- from utils.error_utils import format_title_mismatch
+ from refchecker.utils.error_utils import format_title_mismatch
  # Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
  clean_cited_title = strip_latex_commands(title)
  logger.debug(f"DB Verification: Title mismatch - cited: '{title}', actual: '{paper_data.get('title')}'")
@@ -1940,7 +1940,7 @@ class ArxivReferenceChecker:
  # Only flag as mismatch if the difference is greater than tolerance
  if abs(year - paper_year) > year_tolerance:
  logger.debug(f"DB Verification: Year mismatch - cited: {year}, actual: {paper_year}")
- from utils.error_utils import format_year_mismatch
+ from refchecker.utils.error_utils import format_year_mismatch
  errors.append({
  'warning_type': 'year',
  'warning_details': format_year_mismatch(year, paper_year),
@@ -1949,7 +1949,7 @@ class ArxivReferenceChecker:

  # Verify DOI
  if doi and external_ids.get('DOI'):
- from utils.doi_utils import compare_dois, normalize_doi
+ from refchecker.utils.doi_utils import compare_dois, normalize_doi

  # Use proper DOI comparison first
  if not compare_dois(doi, external_ids['DOI']):
@@ -1962,7 +1962,7 @@ class ArxivReferenceChecker:
  # Only flag as error if it's not a reasonable partial match
  if not actual_doi_normalized.startswith(cited_doi_normalized.rstrip('.')):
  logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
- from utils.error_utils import format_doi_mismatch
+ from refchecker.utils.error_utils import format_doi_mismatch
  errors.append({
  'error_type': 'doi',
  'error_details': format_doi_mismatch(doi, external_ids['DOI']),
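The two DOI hunks above preserve the existing verification order: compare_dois first, then a normalized prefix test so a truncated cited DOI is not flagged as a mismatch. Condensed into a sketch (the helper bodies are not in the diff, so their exact semantics are assumed):

    from refchecker.utils.doi_utils import compare_dois, normalize_doi

    def doi_mismatch(cited_doi: str, actual_doi: str) -> bool:
        # Proper DOI comparison first, as in the hunk at line 1952.
        if compare_dois(cited_doi, actual_doi):
            return False
        cited_n = normalize_doi(cited_doi)
        actual_n = normalize_doi(actual_doi)
        # Tolerate a cited DOI that is a reasonable partial match (prefix)
        # of the actual DOI, mirroring the startswith check at line 1963.
        return not actual_n.startswith(cited_n.rstrip('.'))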
@@ -2058,7 +2058,7 @@ class ArxivReferenceChecker:
  elif error.get('error_type') == 'year' or error.get('warning_type') == 'year':
  formatted_error['ref_year_correct'] = error.get('ref_year_correct', '')
  elif error.get('error_type') == 'doi':
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  formatted_error['ref_url_correct'] = construct_doi_url(error.get('ref_doi_correct', ''))
  elif error.get('info_type') == 'url':
  formatted_error['ref_url_correct'] = error.get('ref_url_correct', '')
@@ -2091,7 +2091,7 @@ class ArxivReferenceChecker:
  # Use the CORRECT paper's Semantic Scholar URL
  correct_external_ids = correct_paper_data.get('externalIds', {})
  if correct_external_ids.get('CorpusId'):
- from utils.url_utils import construct_semantic_scholar_url
+ from refchecker.utils.url_utils import construct_semantic_scholar_url
  correct_paper_url = construct_semantic_scholar_url(correct_external_ids['CorpusId'])
  paper_url = correct_paper_url # Update the main URL
  logger.debug(f"Database mode: Using correct paper's Semantic Scholar URL for ArXiv ID mismatch: {paper_url}")
@@ -2118,7 +2118,7 @@ class ArxivReferenceChecker:

  # Fallback to wrong paper's URL if we couldn't find the correct one
  if not correct_paper_data and verified_data and verified_data.get('externalIds', {}).get('CorpusId'):
- from utils.url_utils import construct_semantic_scholar_url
+ from refchecker.utils.url_utils import construct_semantic_scholar_url
  paper_url = construct_semantic_scholar_url(verified_data['externalIds']['CorpusId'])
  logger.debug(f"Database mode: Fallback to wrong paper's Semantic Scholar URL: {paper_url}")
  elif not correct_paper_data:
@@ -2184,7 +2184,7 @@ class ArxivReferenceChecker:
  logger.debug(f"Detected GitHub URL, using GitHub verification: {github_url}")

  # Import and use GitHub checker
- from checkers.github_checker import GitHubChecker
+ from refchecker.checkers.github_checker import GitHubChecker
  github_checker = GitHubChecker()
  verified_data, errors, paper_url = github_checker.verify_reference(reference)

@@ -2244,7 +2244,7 @@ class ArxivReferenceChecker:
  return None # No URL to check

  # Import and use web page checker
- from checkers.webpage_checker import WebPageChecker
+ from refchecker.checkers.webpage_checker import WebPageChecker
  webpage_checker = WebPageChecker()

  if not webpage_checker.is_web_page_url(web_url):
@@ -2308,7 +2308,7 @@ class ArxivReferenceChecker:
  return None, [{"error_type": "unverified", "error_details": "Reference could not be verified"}], None

  # First try PDF paper checker if URL appears to be a PDF
- from checkers.pdf_paper_checker import PDFPaperChecker
+ from refchecker.checkers.pdf_paper_checker import PDFPaperChecker
  pdf_checker = PDFPaperChecker()

  if pdf_checker.can_check_reference(reference):
@@ -2325,7 +2325,7 @@ class ArxivReferenceChecker:
  logger.debug(f"PDF verification error, falling back to web page verification")

  # Fall back to web page checker
- from checkers.pdf_paper_checker import PDFPaperChecker
+ from refchecker.checkers.pdf_paper_checker import PDFPaperChecker
  pdf_checker = PDFPaperChecker()

  if pdf_checker.can_check_reference(reference):
@@ -2342,7 +2342,7 @@ class ArxivReferenceChecker:
  logger.debug(f"PDF verification error, falling back to web page verification")

  # Fall back to web page checker
- from checkers.webpage_checker import WebPageChecker
+ from refchecker.checkers.webpage_checker import WebPageChecker
  webpage_checker = WebPageChecker()

  try:
@@ -2463,7 +2463,7 @@ class ArxivReferenceChecker:
  elif error.get('error_type') == 'year' or error.get('warning_type') == 'year':
  formatted_error['ref_year_correct'] = error.get('ref_year_correct', '')
  elif error.get('error_type') == 'doi':
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  formatted_error['ref_url_correct'] = construct_doi_url(error.get('ref_doi_correct', ''))

  formatted_errors.append(formatted_error)
@@ -2753,7 +2753,7 @@ class ArxivReferenceChecker:
  corrected_data = self._extract_corrected_data_from_error(consolidated_entry, verified_data)

  # Generate all three formats for user convenience
- from utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
+ from refchecker.utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
  plaintext_format = format_corrected_plaintext(reference, corrected_data, consolidated_entry)
  bibtex_format = format_corrected_bibtex(reference, corrected_data, consolidated_entry)
  bibitem_format = format_corrected_bibitem(reference, corrected_data, consolidated_entry)
@@ -2824,7 +2824,7 @@ class ArxivReferenceChecker:
  corrected_data = self._extract_corrected_data_from_error(error, verified_data)

  # Generate all three formats
- from utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
+ from refchecker.utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
  plaintext_format = format_corrected_plaintext(reference, corrected_data, error_entry)
  bibtex_format = format_corrected_bibtex(reference, corrected_data, error_entry)
  bibitem_format = format_corrected_bibitem(reference, corrected_data, error_entry)
@@ -3326,7 +3326,7 @@ class ArxivReferenceChecker:

  if authors:
  # Limit to first 3 authors for readability
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials
  author_list = parse_authors_with_initials(authors)
  if len(author_list) > 3:
  formatted += ", ".join(author_list[:3]) + " et al."
@@ -3568,7 +3568,7 @@ class ArxivReferenceChecker:
  return self._parse_standard_acm_natbib_references(bibliography_text)

  # Check if this is BibTeX format
- from utils.bibtex_parser import detect_bibtex_format
+ from refchecker.utils.bibtex_parser import detect_bibtex_format
  if detect_bibtex_format(bibliography_text):
  logger.info("Detected BibTeX format, using BibTeX parser")
  self.used_regex_extraction = True
@@ -3576,7 +3576,7 @@ class ArxivReferenceChecker:
  return self._parse_bibtex_references(bibliography_text)

  # Check if this is biblatex format
- from utils.biblatex_parser import detect_biblatex_format
+ from refchecker.utils.biblatex_parser import detect_biblatex_format
  if detect_biblatex_format(bibliography_text):
  logger.debug("Detected biblatex format")
  self.used_regex_extraction = True
@@ -3686,7 +3686,7 @@ class ArxivReferenceChecker:
  if author_field_match:
  author_content = author_field_match.group(1)
  # Find all \bibinfo{person}{Name} entries using balanced brace extraction
- from utils.text_utils import extract_bibinfo_person_content
+ from refchecker.utils.text_utils import extract_bibinfo_person_content
  person_matches = extract_bibinfo_person_content(author_content)
  if person_matches:
  authors = []
@@ -3700,7 +3700,7 @@ class ArxivReferenceChecker:
  ref['authors'] = authors

  # Import balanced brace extraction function
- from utils.text_utils import extract_bibinfo_field_content
+ from refchecker.utils.text_utils import extract_bibinfo_field_content

  # Extract title from \bibinfo{title}{Title} using balanced brace extraction
  title_content = extract_bibinfo_field_content(content, 'title')
@@ -3758,7 +3758,7 @@ class ArxivReferenceChecker:
  author_part_clean = strip_latex_commands(author_part).strip()
  if author_part_clean and not author_part_clean.startswith('\\'):
  # Parse author names using the robust author parsing function
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials
  author_names = parse_authors_with_initials(author_part_clean)

  # Clean up author names
@@ -3812,14 +3812,14 @@ class ArxivReferenceChecker:
  self.used_regex_extraction = True

  # Check if this is BibTeX format first
- from utils.bibtex_parser import detect_bibtex_format
+ from refchecker.utils.bibtex_parser import detect_bibtex_format
  if detect_bibtex_format(bibliography_text):
  logger.debug("Detected BibTeX format, using BibTeX-specific parsing")
  # BibTeX parsing is robust, so we don't set used_unreliable_extraction
  return self._parse_bibtex_references(bibliography_text)

  # Check if this is biblatex format
- from utils.biblatex_parser import detect_biblatex_format
+ from refchecker.utils.biblatex_parser import detect_biblatex_format
  if detect_biblatex_format(bibliography_text):
  logger.debug("Detected biblatex format, using biblatex-specific parsing")
  # biblatex parsing is also robust, so we don't set used_unreliable_extraction
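The bibliography-parsing hunks between lines 3568 and 3825 all touch the same dispatch logic: detect BibTeX first, then biblatex, and only fall back to regex heuristics when neither structured format matches. The detection order as a sketch (the string returns are stand-ins for the _parse_* methods visible in the surrounding context):

    from refchecker.utils.bibtex_parser import detect_bibtex_format
    from refchecker.utils.biblatex_parser import detect_biblatex_format

    def choose_parser(bibliography_text: str) -> str:
        if detect_bibtex_format(bibliography_text):
            return "bibtex"    # structured, robust parse
        if detect_biblatex_format(bibliography_text):
            return "biblatex"  # also robust
        return "regex"         # heuristic fallback, flagged as unreliable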
@@ -4105,7 +4105,7 @@ class ArxivReferenceChecker:
  if doi_match:
  doi = clean_doi(doi_match.group(1))
  if doi:
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  url = construct_doi_url(doi)
  else:
  url = ''
@@ -4265,7 +4265,7 @@ class ArxivReferenceChecker:
  List of structured reference dictionaries
  """
  # Use the dedicated BibTeX parser
- from utils.bibtex_parser import parse_bibtex_references
+ from refchecker.utils.bibtex_parser import parse_bibtex_references

  # Extract references using the BibTeX parser
  references = parse_bibtex_references(bibliography_text)
@@ -4284,7 +4284,7 @@ class ArxivReferenceChecker:
  List of structured reference dictionaries
  """
  # Use the dedicated biblatex parser
- from utils.biblatex_parser import parse_biblatex_references
+ from refchecker.utils.biblatex_parser import parse_biblatex_references

  # Extract references using the biblatex parser
  references = parse_biblatex_references(bibliography_text)
@@ -4484,7 +4484,7 @@ class ArxivReferenceChecker:
  return True

  # Also check if authors have significant overlap (at least 50% of the shorter author list)
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials

  if '*' in seg1['author']:
  author1_parts = seg1['author'].split('*')
@@ -4553,7 +4553,7 @@ class ArxivReferenceChecker:
  parsed_authors = []
  for author in raw_authors:
  # Clean up the author entry and strip LaTeX commands
- from utils.text_utils import strip_latex_commands
+ from refchecker.utils.text_utils import strip_latex_commands
  author_cleaned = strip_latex_commands(author.rstrip('.'))

  # Skip special indicators like "others", "et al", etc.
@@ -4571,14 +4571,14 @@ class ArxivReferenceChecker:
  return parsed_authors
  else:
  # Fallback to original logic for backward compatibility
- from utils.text_utils import parse_authors_with_initials
+ from refchecker.utils.text_utils import parse_authors_with_initials

  cleaned_text = author_text.rstrip('.')
  authors = parse_authors_with_initials(cleaned_text)
  authors = [a.rstrip('.').strip() for a in authors if a.strip()]

  # Handle "others" and similar indicators in fallback logic too
- from utils.text_utils import strip_latex_commands
+ from refchecker.utils.text_utils import strip_latex_commands
  processed_authors = []
  for author in authors:
  # Apply LaTeX cleaning to each author
@@ -4706,7 +4706,7 @@ class ArxivReferenceChecker:
  if '*' in doi:
  doi = doi.split('*')[0]

- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  url = construct_doi_url(doi)
  break

@@ -4714,7 +4714,7 @@ class ArxivReferenceChecker:
  if not url and not arxiv_url:
  url_match = re.search(r'https?://(?!arxiv\.org)[^\s,]+', ref_text)
  if url_match:
- from utils.url_utils import clean_url_punctuation
+ from refchecker.utils.url_utils import clean_url_punctuation
  url = clean_url_punctuation(url_match.group(0))

  # Extract year - will be determined from structured parts below
@@ -4808,7 +4808,7 @@ class ArxivReferenceChecker:
  if 'arxiv' in url_part.lower():
  arxiv_url = url_part
  else:
- from utils.url_utils import clean_url_punctuation
+ from refchecker.utils.url_utils import clean_url_punctuation
  url = clean_url_punctuation(url_part)
  else:
  # Fallback for other formats or malformed input
@@ -4829,7 +4829,7 @@ class ArxivReferenceChecker:
  if 'arxiv' in url_part.lower():
  arxiv_url = url_part
  else:
- from utils.url_utils import clean_url_punctuation
+ from refchecker.utils.url_utils import clean_url_punctuation
  url = clean_url_punctuation(url_part)
  if len(parts) > 5:
  # For cases with more than 5 parts, combine the remaining parts as additional info
@@ -4966,7 +4966,7 @@ class ArxivReferenceChecker:
  if '*' in doi:
  doi = doi.split('*')[0]

- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  url = construct_doi_url(doi)
  break

@@ -4974,7 +4974,7 @@ class ArxivReferenceChecker:
  if not url and not arxiv_url:
  url_match = re.search(r'https?://(?!arxiv\.org)[^\s,\)]+', ref_text)
  if url_match:
- from utils.url_utils import clean_url_punctuation
+ from refchecker.utils.url_utils import clean_url_punctuation
  url = clean_url_punctuation(url_match.group(0))

  # Extract year
@@ -5023,7 +5023,7 @@ class ArxivReferenceChecker:
  logger.debug(f"Extracting bibliography for paper {paper_id}: {paper.title}")

  # Check if we can get BibTeX content for this paper (ArXiv or other sources)
- from utils.arxiv_utils import get_bibtex_content
+ from refchecker.utils.arxiv_utils import get_bibtex_content
  bibtex_content = get_bibtex_content(paper)
  if bibtex_content:
  logger.debug(f"Found BibTeX content for {paper_id}, using structured bibliography")
@@ -5047,7 +5047,7 @@ class ArxivReferenceChecker:
  references = extract_latex_references(bibtex_content, None)

  # Validate the parsed references and fallback to LLM if needed
- from utils.text_utils import validate_parsed_references
+ from refchecker.utils.text_utils import validate_parsed_references
  validation = validate_parsed_references(references)

  if not validation['is_valid']:
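This hunk shows the safety net around regex-based extraction: parsed references are validated, and a failed validation hands the bibliography to the LLM extractor instead. A sketch assuming only what the diff shows (the 'is_valid' key is visible; the fallback call itself is hypothetical):

    from refchecker.utils.text_utils import extract_latex_references, validate_parsed_references

    references = extract_latex_references(bibtex_content, None)
    validation = validate_parsed_references(references)
    if not validation['is_valid']:
        # Hypothetical fallback: ReferenceExtractor is imported from
        # refchecker.llm.base per the import hunk at the top of the file.
        references = reference_extractor.extract(bibtex_content)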
@@ -5372,9 +5372,9 @@ class ArxivReferenceChecker:
  # Print reference info in non-debug mode (improved formatting)
  raw_title = reference.get('title', 'Untitled')
  # Clean LaTeX commands from title for display
- from utils.text_utils import strip_latex_commands
+ from refchecker.utils.text_utils import strip_latex_commands
  title = strip_latex_commands(raw_title)
- from utils.text_utils import format_authors_for_display
+ from refchecker.utils.text_utils import format_authors_for_display
  authors = format_authors_for_display(reference.get('authors', []))
  year = reference.get('year', '')
  venue = reference.get('venue', '') or reference.get('journal', '')
@@ -5424,7 +5424,7 @@ class ArxivReferenceChecker:

  # Show DOI URL if available and different from what's already shown
  if external_ids.get('DOI'):
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  doi_url = construct_doi_url(external_ids['DOI'])
  if doi_url != verified_url_to_show and doi_url != url:
  print(f" DOI URL: {doi_url}")
@@ -5523,19 +5523,19 @@ class ArxivReferenceChecker:

  # Second priority: Semantic Scholar URL from CorpusId (if no direct URL available)
  if verified_data and verified_data.get('externalIds', {}).get('CorpusId'):
- from utils.url_utils import construct_semantic_scholar_url
+ from refchecker.utils.url_utils import construct_semantic_scholar_url
  return construct_semantic_scholar_url(verified_data['externalIds']['CorpusId'])

  # Third priority: DOI URL from verified data (more reliable than potentially wrong ArXiv URLs)
  if verified_data and verified_data.get('externalIds', {}).get('DOI'):
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  return construct_doi_url(verified_data['externalIds']['DOI'])

  # Fourth priority: ArXiv URL from verified data (but only if there's no ArXiv ID error)
  if verified_data and verified_data.get('externalIds', {}).get('ArXiv'):
  # Only show ArXiv URL as verified URL if there's no ArXiv ID mismatch
  if not self._has_arxiv_id_error(errors):
- from utils.url_utils import construct_arxiv_url
+ from refchecker.utils.url_utils import construct_arxiv_url
  correct_arxiv_id = verified_data['externalIds']['ArXiv']
  return construct_arxiv_url(correct_arxiv_id)

@@ -5556,7 +5556,7 @@ class ArxivReferenceChecker:
  external_ids = verified_data.get('externalIds', {})
  if external_ids.get('ArXiv'):
  # Extract ArXiv ID from the URL using shared utility
- from utils.url_utils import extract_arxiv_id_from_url
+ from refchecker.utils.url_utils import extract_arxiv_id_from_url
  url_arxiv_id = extract_arxiv_id_from_url(reference_url)
  if url_arxiv_id:
  correct_arxiv_id = external_ids['ArXiv']
@@ -5579,10 +5579,10 @@ class ArxivReferenceChecker:
  def _get_fallback_url(self, external_ids):
  """Get fallback URL from external IDs (Semantic Scholar or DOI)"""
  if external_ids.get('CorpusId'):
- from utils.url_utils import construct_semantic_scholar_url
+ from refchecker.utils.url_utils import construct_semantic_scholar_url
  return construct_semantic_scholar_url(external_ids['CorpusId'])
  elif external_ids.get('DOI'):
- from utils.doi_utils import construct_doi_url
+ from refchecker.utils.doi_utils import construct_doi_url
  return construct_doi_url(external_ids['DOI'])
  return None

@@ -5660,7 +5660,7 @@ class ArxivReferenceChecker:
  error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type')
  error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', 'Unknown error')

- from utils.error_utils import print_labeled_multiline
+ from refchecker.utils.error_utils import print_labeled_multiline

  if error_type == 'arxiv_id':
  print(f" ❌ {error_details}")
{utils → refchecker/utils}/arxiv_utils.py
@@ -32,7 +32,7 @@ def extract_arxiv_id_from_paper(paper):

  if hasattr(paper, 'pdf_url') and paper.pdf_url:
  # Try to extract ArXiv ID from the PDF URL
- from utils.url_utils import extract_arxiv_id_from_url
+ from refchecker.utils.url_utils import extract_arxiv_id_from_url
  arxiv_id = extract_arxiv_id_from_url(paper.pdf_url)
  elif hasattr(paper, 'get_short_id'):
  # Check if the paper ID itself is an ArXiv ID
@@ -316,7 +316,7 @@ def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
  return bib_content

  # Parse BibTeX entries and filter
- from utils.bibtex_parser import parse_bibtex_entries
+ from refchecker.utils.bibtex_parser import parse_bibtex_entries
  entries = parse_bibtex_entries(bib_content)

  # Filter entries to only cited ones and remove duplicates
@@ -481,7 +481,7 @@ def get_bibtex_content(paper):

  elif tex_content:
  # Check for embedded bibliography in LaTeX
- from utils.text_utils import detect_latex_bibliography_format
+ from refchecker.utils.text_utils import detect_latex_bibliography_format
  latex_format = detect_latex_bibliography_format(tex_content)
  if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
  logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
{utils → refchecker/utils}/biblatex_parser.py
@@ -200,8 +200,8 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
  List of structured reference dictionaries, or empty list if
  parsing quality is poor (to trigger LLM fallback)
  """
- from utils.text_utils import parse_authors_with_initials, clean_title
- from utils.doi_utils import construct_doi_url, is_valid_doi_format
+ from refchecker.utils.text_utils import parse_authors_with_initials, clean_title
+ from refchecker.utils.doi_utils import construct_doi_url, is_valid_doi_format

  if not text or not detect_biblatex_format(text):
  return []
@@ -300,8 +300,8 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
  Returns:
  Dictionary with parsed entry data
  """
- from utils.text_utils import parse_authors_with_initials, clean_title
- from utils.doi_utils import construct_doi_url, is_valid_doi_format
+ from refchecker.utils.text_utils import parse_authors_with_initials, clean_title
+ from refchecker.utils.doi_utils import construct_doi_url, is_valid_doi_format

  # Initialize default values
  title = ""
{utils → refchecker/utils}/bibliography_utils.py
@@ -164,7 +164,7 @@ def _parse_bibtex_references(bibliography_text):
  Returns:
  List of reference dictionaries
  """
- from utils.bibtex_parser import parse_bibtex_entries
+ from refchecker.utils.bibtex_parser import parse_bibtex_entries
  return parse_bibtex_entries(bibliography_text)


@@ -178,7 +178,7 @@ def _parse_biblatex_references(bibliography_text):
  Returns:
  List of reference dictionaries
  """
- from utils.text_utils import extract_latex_references
+ from refchecker.utils.text_utils import extract_latex_references
  return extract_latex_references(bibliography_text)


@@ -186,7 +186,7 @@ def _parse_standard_acm_natbib_references(bibliography_text):
  """
  Parse references using regex for standard ACM/natbib format (both ACM Reference Format and simple natbib)
  """
- from utils.text_utils import detect_standard_acm_natbib_format
+ from refchecker.utils.text_utils import detect_standard_acm_natbib_format

  references = []

@@ -230,7 +230,7 @@ def _parse_simple_natbib_format(ref_num, content, label):
  Returns:
  Dictionary containing parsed reference information
  """
- from utils.text_utils import extract_url_from_reference, extract_year_from_reference
+ from refchecker.utils.text_utils import extract_url_from_reference, extract_year_from_reference

  # Basic parsing - this could be enhanced with more sophisticated NLP
  reference = {
@@ -288,7 +288,7 @@ def _parse_references_regex(bibliography_text):
  }

  # Basic information extraction
- from utils.text_utils import extract_url_from_reference, extract_year_from_reference
+ from refchecker.utils.text_utils import extract_url_from_reference, extract_year_from_reference

  url = extract_url_from_reference(ref_content)
  if url:
{utils → refchecker/utils}/bibtex_parser.py
@@ -214,8 +214,8 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
  Returns:
  List of structured reference dictionaries
  """
- from utils.text_utils import parse_authors_with_initials, clean_title
- from utils.doi_utils import construct_doi_url, is_valid_doi_format
+ from refchecker.utils.text_utils import parse_authors_with_initials, clean_title
+ from refchecker.utils.doi_utils import construct_doi_url, is_valid_doi_format

  entries = parse_bibtex_entries(bibliography_text)
  references = []
@@ -291,7 +291,7 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
  # Extract other URLs
  url = fields.get('url', '')
  if url:
- from utils.url_utils import clean_url
+ from refchecker.utils.url_utils import clean_url
  url = clean_url(url)

  # Handle special @misc entries with only howpublished field
@@ -318,7 +318,7 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
  url = howpublished

  # Clean the reconstructed URL
- from utils.url_utils import clean_url
+ from refchecker.utils.url_utils import clean_url
  url = clean_url(url)

  # Generate title from domain/path
@@ -350,7 +350,7 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:

  # Clean any URL we extracted
  if url:
- from utils.url_utils import clean_url
+ from refchecker.utils.url_utils import clean_url
  url = clean_url(url)

  # Construct ArXiv URL from eprint field if no URL present
{utils → refchecker/utils}/error_utils.py
@@ -179,7 +179,7 @@ def clean_venue_for_comparison(venue: str) -> str:
  Returns:
  Cleaned venue name suitable for display
  """
- from utils.text_utils import normalize_venue_for_display
+ from refchecker.utils.text_utils import normalize_venue_for_display
  return normalize_venue_for_display(venue)
