academic-refchecker 1.2.45__py3-none-any.whl → 1.2.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.45"
3
+ __version__ = "1.2.46"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.45
3
+ Version: 1.2.46
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,5 +1,5 @@
1
- __version__.py,sha256=8vdrigO4-YfHufQMfh_RQ9NlN5btmqndss2dAOLxa1Q,65
2
- academic_refchecker-1.2.45.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
1
+ __version__.py,sha256=PuEdWOtQEDRwfL2m1mk5h6WKAoScQu-kbHu9VkBS764,65
2
+ academic_refchecker-1.2.46.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
3
3
  checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
4
4
  checkers/crossref.py,sha256=cLYmSzE8ehJ5sNko_R3fEiGBGiPH5_HxLhFM-pCfDRM,20378
5
5
  checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
@@ -7,8 +7,8 @@ checkers/github_checker.py,sha256=GoepG4aRRUqAomkM4HgOSNf20BPxQgocZEpsk0ZTZZU,14
7
7
  checkers/local_semantic_scholar.py,sha256=DgGMjmR_w_saz2UTMatEhfXbhUED9tUmDG3vlZAIzc4,20428
8
8
  checkers/openalex.py,sha256=Fbc7iscZzmXjAZxH32PDX2r2Nwo9b5Ku-Sh1Ut9KpLA,19550
9
9
  checkers/openreview_checker.py,sha256=EYRcVu7sZzrFjqdOEKp72vd0S3sXSSjTgI9jsAVpdD0,20524
10
- checkers/semantic_scholar.py,sha256=0LcVahf3twyHqaD7bQ2eJiTyg-AQ9NGvVohb9nqaHdA,34884
11
- checkers/webpage_checker.py,sha256=woY8mNgZ4Lr9Ug53CN-Xo_2P62BTpR2u_FZyUPgTEuA,21833
10
+ checkers/semantic_scholar.py,sha256=99KOHLAiYs31nSdx-gcMR_TWIlV8G4juNL0bmV4AoUs,34768
11
+ checkers/webpage_checker.py,sha256=Ivzhu0xcpeWZcCTlAt9C06Lfsz5gKEiSko9gJ7EyQEw,22324
12
12
  config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
13
13
  config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
14
14
  config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
@@ -26,7 +26,7 @@ scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,
26
26
  services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
27
27
  services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
28
28
  utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
29
- utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
29
+ utils/arxiv_utils.py,sha256=MxyD3Q0EzrmE0xORMJw8wdVtZ4Fp-ux_cn6jLMQimV8,18168
30
30
  utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
31
31
  utils/biblatex_parser.py,sha256=OkHXQcjiBrEDuhBfEk0RtmAYxufu5lAxAjb8__DzMjI,25537
32
32
  utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
@@ -36,11 +36,11 @@ utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
36
36
  utils/doi_utils.py,sha256=qAf2slTLkgBwwnkYsWMkwMSDf6q8wNqd2YwNxhQudEc,3995
37
37
  utils/error_utils.py,sha256=wrLQhVJIIWUzcTSOLVvrvCAtMRNXt_1YYL5OXYnYf0A,11085
38
38
  utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
39
- utils/text_utils.py,sha256=T67Y-HSNokj-mOcdCtOcULNviBxyaG9xTjRd_l9titI,210088
39
+ utils/text_utils.py,sha256=gFI-qu6g-9Lo1s3w1OjgBZ9SvdPufL1mMg-05l0BwD0,210269
40
40
  utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
41
41
  utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
42
- academic_refchecker-1.2.45.dist-info/METADATA,sha256=mY4M9FRaDKcyS5yOFvR3X0Y0bj47_YmZeMayvrrpS38,22576
43
- academic_refchecker-1.2.45.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
- academic_refchecker-1.2.45.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
45
- academic_refchecker-1.2.45.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
46
- academic_refchecker-1.2.45.dist-info/RECORD,,
42
+ academic_refchecker-1.2.46.dist-info/METADATA,sha256=lP_pMHS9uI4hhXEsox_yGlSnT6pSl1_6W58CtzZEbDM,22576
43
+ academic_refchecker-1.2.46.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
+ academic_refchecker-1.2.46.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
45
+ academic_refchecker-1.2.46.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
46
+ academic_refchecker-1.2.46.dist-info/RECORD,,
@@ -543,49 +543,50 @@ class NonArxivReferenceChecker:
543
543
  elif paper_venue and not isinstance(paper_venue, str):
544
544
  paper_venue = str(paper_venue)
545
545
 
546
+ # Check venue mismatches
546
547
  if cited_venue and paper_venue:
547
548
  # Use the utility function to check if venues are substantially different
548
549
  if are_venues_substantially_different(cited_venue, paper_venue):
549
550
  from utils.error_utils import create_venue_warning
550
551
  errors.append(create_venue_warning(cited_venue, paper_venue))
551
552
  elif not cited_venue and paper_venue:
552
- # Check if this is an arXiv paper first
553
- external_ids = paper_data.get('externalIds', {})
554
- arxiv_id = external_ids.get('ArXiv') if external_ids else None
555
-
556
- if arxiv_id:
557
- # For arXiv papers, suggest including the arXiv URL instead of venue
558
- arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
559
-
560
- # Check if the reference already includes this ArXiv URL or equivalent DOI
561
- reference_url = reference.get('url', '')
562
-
563
- # Check for direct arXiv URL match
564
- has_arxiv_url = arxiv_url in reference_url
565
-
566
- # Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
567
- arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
568
- has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
569
-
570
- if not (has_arxiv_url or has_arxiv_doi):
553
+ # Original reference has the venue in raw text but not parsed correctly
554
+ raw_text = reference.get('raw_text', '')
555
+ if raw_text and '#' in raw_text:
556
+ # Check if venue might be in the raw text format (author#title#venue#year#url)
557
+ parts = raw_text.split('#')
558
+ if len(parts) >= 3 and parts[2].strip():
559
+ # Venue is present in raw text but missing from parsed reference
571
560
  errors.append({
572
561
  'warning_type': 'venue',
573
- 'warning_details': f"Reference should include arXiv URL: {arxiv_url}",
574
- 'ref_url_correct': arxiv_url
562
+ 'warning_details': f"Venue missing: should include '{paper_venue}'",
563
+ 'ref_venue_correct': paper_venue
575
564
  })
576
- else:
577
- # Original reference has the venue in raw text but not parsed correctly
578
- raw_text = reference.get('raw_text', '')
579
- if raw_text and '#' in raw_text:
580
- # Check if venue might be in the raw text format (author#title#venue#year#url)
581
- parts = raw_text.split('#')
582
- if len(parts) >= 3 and parts[2].strip():
583
- # Venue is present in raw text but missing from parsed reference
584
- errors.append({
585
- 'warning_type': 'venue',
586
- 'warning_details': f"Venue missing: should include '{paper_venue}'",
587
- 'ref_venue_correct': paper_venue
588
- })
565
+
566
+ # Always check for missing arXiv URLs when paper has arXiv ID
567
+ external_ids = paper_data.get('externalIds', {})
568
+ arxiv_id = external_ids.get('ArXiv') if external_ids else None
569
+
570
+ if arxiv_id:
571
+ # For arXiv papers, check if reference includes the arXiv URL
572
+ arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
573
+
574
+ # Check if the reference already includes this ArXiv URL or equivalent DOI
575
+ reference_url = reference.get('url', '')
576
+
577
+ # Check for direct arXiv URL match
578
+ has_arxiv_url = arxiv_url in reference_url
579
+
580
+ # Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
581
+ arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
582
+ has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
583
+
584
+ if not (has_arxiv_url or has_arxiv_doi):
585
+ errors.append({
586
+ 'warning_type': 'url',
587
+ 'warning_details': f"Reference could include arXiv URL: {arxiv_url}",
588
+ 'ref_url_correct': arxiv_url
589
+ })
589
590
 
590
591
  # Verify DOI
591
592
  paper_doi = None
@@ -71,7 +71,8 @@ class WebPageChecker:
71
71
  doc_indicators = [
72
72
  'docs', 'documentation', 'readthedocs.io', 'help', 'guide', 'tutorial',
73
73
  'reference', 'manual', 'wiki', 'blog', 'api', 'developer', 'platform',
74
- 'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop'
74
+ 'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop',
75
+ 'posts' # For blog posts and forum posts like LessWrong
75
76
  ]
76
77
 
77
78
  return any(indicator in url.lower() for indicator in doc_indicators) or self._is_likely_webpage(url)
@@ -84,7 +85,8 @@ class WebPageChecker:
84
85
  doc_domains = [
85
86
  'pytorch.org', 'tensorflow.org', 'readthedocs.io', 'onnxruntime.ai',
86
87
  'deepspeed.ai', 'huggingface.co', 'openai.com', 'microsoft.com',
87
- 'google.com', 'nvidia.com', 'intel.com', 'langchain.com'
88
+ 'google.com', 'nvidia.com', 'intel.com', 'langchain.com',
89
+ 'lesswrong.com' # LessWrong rationality and AI safety blog platform
88
90
  ]
89
91
 
90
92
  return any(domain in parsed.netloc for domain in doc_domains)
@@ -395,6 +397,14 @@ class WebPageChecker:
395
397
  organization = site_info.get('organization', '').lower()
396
398
  domain = site_info.get('domain', '').lower()
397
399
 
400
+ # Accept generic web resource terms - these are valid for any web URL
401
+ generic_web_terms = [
402
+ 'web resource', 'web site', 'website', 'online resource',
403
+ 'online', 'web', 'internet resource', 'web page', 'webpage'
404
+ ]
405
+ if cited_lower in generic_web_terms:
406
+ return True
407
+
398
408
  # Direct matches
399
409
  if cited_lower in organization or organization in cited_lower:
400
410
  return True
utils/arxiv_utils.py CHANGED
@@ -392,32 +392,29 @@ def get_bibtex_content(paper):
392
392
  logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
393
393
  tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
394
394
 
395
- # Choose between .bib and .bbl files based on content richness
396
- # Prioritize .bbl if it has more references than filtered .bib, otherwise prefer .bib
395
+ # Choose between .bib and .bbl files - .bbl files take priority when they contain entries
396
+ # .bbl files are processed biblatex output that reflects exactly what was cited
397
397
  if bib_content and bbl_content:
398
- # Count entries in both
398
+ # Count entries in both for logging
399
399
  bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
400
400
  bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
401
401
 
402
- # If we have LaTeX content, get filtered BibTeX count
403
- filtered_bib_count = bib_entry_count
404
- filtered_content = bib_content
405
- if tex_content:
406
- cited_keys = extract_cited_keys_from_tex({}, tex_content)
407
- if cited_keys:
408
- logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
409
- filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
410
- filtered_bib_count = len(re.findall(r'@\w+\s*\{', filtered_content))
411
-
412
- logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, filtered .bib has {filtered_bib_count} entries")
402
+ logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, .bib has {bib_entry_count} entries")
413
403
 
414
- # Prioritize .bbl if it has significantly more entries
415
- if bbl_entry_count > filtered_bib_count * 1.5: # 50% more entries threshold
416
- logger.info(f"Using .bbl files from ArXiv source")
404
+ # Only use .bbl if it actually contains bibliography entries
405
+ if bbl_entry_count > 0:
406
+ logger.info(f"Using .bbl files from ArXiv source (biblatex takes priority over bibtex)")
417
407
  return bbl_content
418
408
  else:
419
- logger.info(f"Using filtered .bib files")
420
- return filtered_content
409
+ logger.info(f"Using .bib files from ArXiv source (.bbl file is empty)")
410
+ # If we have LaTeX content, filter BibTeX by cited keys
411
+ if tex_content:
412
+ cited_keys = extract_cited_keys_from_tex({}, tex_content)
413
+ if cited_keys:
414
+ logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
415
+ filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
416
+ return filtered_content
417
+ return bib_content
421
418
 
422
419
  elif bib_content:
423
420
  logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
utils/text_utils.py CHANGED
@@ -2111,7 +2111,9 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2111
2111
  # Allow minor flexibility (1 author difference) but not more
2112
2112
  if abs(len(cleaned_cited) - len(correct_names)) > 1:
2113
2113
  from utils.error_utils import format_author_count_mismatch
2114
- error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), cleaned_cited, correct_names)
2114
+ # Convert cited names to display format (First Last) before showing in error
2115
+ display_cited = [format_author_for_display(author) for author in cleaned_cited]
2116
+ error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
2115
2117
  return False, error_msg
2116
2118
 
2117
2119
  # Use the shorter list for comparison