academic-refchecker 1.2.45__py3-none-any.whl → 1.2.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __version__.py +1 -1
- {academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/METADATA +1 -1
- {academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/RECORD +11 -11
- checkers/semantic_scholar.py +35 -34
- checkers/webpage_checker.py +12 -2
- utils/arxiv_utils.py +16 -19
- utils/text_utils.py +3 -1
- {academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/top_level.txt +0 -0
__version__.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
__version__.py,sha256=
|
|
2
|
-
academic_refchecker-1.2.
|
|
1
|
+
__version__.py,sha256=PuEdWOtQEDRwfL2m1mk5h6WKAoScQu-kbHu9VkBS764,65
|
|
2
|
+
academic_refchecker-1.2.46.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
|
|
3
3
|
checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
|
|
4
4
|
checkers/crossref.py,sha256=cLYmSzE8ehJ5sNko_R3fEiGBGiPH5_HxLhFM-pCfDRM,20378
|
|
5
5
|
checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
|
|
@@ -7,8 +7,8 @@ checkers/github_checker.py,sha256=GoepG4aRRUqAomkM4HgOSNf20BPxQgocZEpsk0ZTZZU,14
|
|
|
7
7
|
checkers/local_semantic_scholar.py,sha256=DgGMjmR_w_saz2UTMatEhfXbhUED9tUmDG3vlZAIzc4,20428
|
|
8
8
|
checkers/openalex.py,sha256=Fbc7iscZzmXjAZxH32PDX2r2Nwo9b5Ku-Sh1Ut9KpLA,19550
|
|
9
9
|
checkers/openreview_checker.py,sha256=EYRcVu7sZzrFjqdOEKp72vd0S3sXSSjTgI9jsAVpdD0,20524
|
|
10
|
-
checkers/semantic_scholar.py,sha256=
|
|
11
|
-
checkers/webpage_checker.py,sha256=
|
|
10
|
+
checkers/semantic_scholar.py,sha256=99KOHLAiYs31nSdx-gcMR_TWIlV8G4juNL0bmV4AoUs,34768
|
|
11
|
+
checkers/webpage_checker.py,sha256=Ivzhu0xcpeWZcCTlAt9C06Lfsz5gKEiSko9gJ7EyQEw,22324
|
|
12
12
|
config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
|
|
13
13
|
config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
|
|
14
14
|
config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
|
|
@@ -26,7 +26,7 @@ scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,
|
|
|
26
26
|
services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
|
|
27
27
|
services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
|
|
28
28
|
utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
|
|
29
|
-
utils/arxiv_utils.py,sha256=
|
|
29
|
+
utils/arxiv_utils.py,sha256=MxyD3Q0EzrmE0xORMJw8wdVtZ4Fp-ux_cn6jLMQimV8,18168
|
|
30
30
|
utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
|
|
31
31
|
utils/biblatex_parser.py,sha256=OkHXQcjiBrEDuhBfEk0RtmAYxufu5lAxAjb8__DzMjI,25537
|
|
32
32
|
utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
|
|
@@ -36,11 +36,11 @@ utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
|
|
|
36
36
|
utils/doi_utils.py,sha256=qAf2slTLkgBwwnkYsWMkwMSDf6q8wNqd2YwNxhQudEc,3995
|
|
37
37
|
utils/error_utils.py,sha256=wrLQhVJIIWUzcTSOLVvrvCAtMRNXt_1YYL5OXYnYf0A,11085
|
|
38
38
|
utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
|
|
39
|
-
utils/text_utils.py,sha256=
|
|
39
|
+
utils/text_utils.py,sha256=gFI-qu6g-9Lo1s3w1OjgBZ9SvdPufL1mMg-05l0BwD0,210269
|
|
40
40
|
utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
|
|
41
41
|
utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
|
|
42
|
-
academic_refchecker-1.2.
|
|
43
|
-
academic_refchecker-1.2.
|
|
44
|
-
academic_refchecker-1.2.
|
|
45
|
-
academic_refchecker-1.2.
|
|
46
|
-
academic_refchecker-1.2.
|
|
42
|
+
academic_refchecker-1.2.46.dist-info/METADATA,sha256=lP_pMHS9uI4hhXEsox_yGlSnT6pSl1_6W58CtzZEbDM,22576
|
|
43
|
+
academic_refchecker-1.2.46.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
44
|
+
academic_refchecker-1.2.46.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
|
|
45
|
+
academic_refchecker-1.2.46.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
|
|
46
|
+
academic_refchecker-1.2.46.dist-info/RECORD,,
|
checkers/semantic_scholar.py
CHANGED
|
@@ -543,49 +543,50 @@ class NonArxivReferenceChecker:
|
|
|
543
543
|
elif paper_venue and not isinstance(paper_venue, str):
|
|
544
544
|
paper_venue = str(paper_venue)
|
|
545
545
|
|
|
546
|
+
# Check venue mismatches
|
|
546
547
|
if cited_venue and paper_venue:
|
|
547
548
|
# Use the utility function to check if venues are substantially different
|
|
548
549
|
if are_venues_substantially_different(cited_venue, paper_venue):
|
|
549
550
|
from utils.error_utils import create_venue_warning
|
|
550
551
|
errors.append(create_venue_warning(cited_venue, paper_venue))
|
|
551
552
|
elif not cited_venue and paper_venue:
|
|
552
|
-
#
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
# Check if the reference already includes this ArXiv URL or equivalent DOI
|
|
561
|
-
reference_url = reference.get('url', '')
|
|
562
|
-
|
|
563
|
-
# Check for direct arXiv URL match
|
|
564
|
-
has_arxiv_url = arxiv_url in reference_url
|
|
565
|
-
|
|
566
|
-
# Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
|
|
567
|
-
arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
|
|
568
|
-
has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
|
|
569
|
-
|
|
570
|
-
if not (has_arxiv_url or has_arxiv_doi):
|
|
553
|
+
# Original reference has the venue in raw text but not parsed correctly
|
|
554
|
+
raw_text = reference.get('raw_text', '')
|
|
555
|
+
if raw_text and '#' in raw_text:
|
|
556
|
+
# Check if venue might be in the raw text format (author#title#venue#year#url)
|
|
557
|
+
parts = raw_text.split('#')
|
|
558
|
+
if len(parts) >= 3 and parts[2].strip():
|
|
559
|
+
# Venue is present in raw text but missing from parsed reference
|
|
571
560
|
errors.append({
|
|
572
561
|
'warning_type': 'venue',
|
|
573
|
-
'warning_details': f"
|
|
574
|
-
'
|
|
562
|
+
'warning_details': f"Venue missing: should include '{paper_venue}'",
|
|
563
|
+
'ref_venue_correct': paper_venue
|
|
575
564
|
})
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
565
|
+
|
|
566
|
+
# Always check for missing arXiv URLs when paper has arXiv ID
|
|
567
|
+
external_ids = paper_data.get('externalIds', {})
|
|
568
|
+
arxiv_id = external_ids.get('ArXiv') if external_ids else None
|
|
569
|
+
|
|
570
|
+
if arxiv_id:
|
|
571
|
+
# For arXiv papers, check if reference includes the arXiv URL
|
|
572
|
+
arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
|
|
573
|
+
|
|
574
|
+
# Check if the reference already includes this ArXiv URL or equivalent DOI
|
|
575
|
+
reference_url = reference.get('url', '')
|
|
576
|
+
|
|
577
|
+
# Check for direct arXiv URL match
|
|
578
|
+
has_arxiv_url = arxiv_url in reference_url
|
|
579
|
+
|
|
580
|
+
# Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
|
|
581
|
+
arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
|
|
582
|
+
has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
|
|
583
|
+
|
|
584
|
+
if not (has_arxiv_url or has_arxiv_doi):
|
|
585
|
+
errors.append({
|
|
586
|
+
'warning_type': 'url',
|
|
587
|
+
'warning_details': f"Reference could include arXiv URL: {arxiv_url}",
|
|
588
|
+
'ref_url_correct': arxiv_url
|
|
589
|
+
})
|
|
589
590
|
|
|
590
591
|
# Verify DOI
|
|
591
592
|
paper_doi = None
|
checkers/webpage_checker.py
CHANGED
|
@@ -71,7 +71,8 @@ class WebPageChecker:
|
|
|
71
71
|
doc_indicators = [
|
|
72
72
|
'docs', 'documentation', 'readthedocs.io', 'help', 'guide', 'tutorial',
|
|
73
73
|
'reference', 'manual', 'wiki', 'blog', 'api', 'developer', 'platform',
|
|
74
|
-
'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop'
|
|
74
|
+
'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop',
|
|
75
|
+
'posts' # For blog posts and forum posts like LessWrong
|
|
75
76
|
]
|
|
76
77
|
|
|
77
78
|
return any(indicator in url.lower() for indicator in doc_indicators) or self._is_likely_webpage(url)
|
|
@@ -84,7 +85,8 @@ class WebPageChecker:
|
|
|
84
85
|
doc_domains = [
|
|
85
86
|
'pytorch.org', 'tensorflow.org', 'readthedocs.io', 'onnxruntime.ai',
|
|
86
87
|
'deepspeed.ai', 'huggingface.co', 'openai.com', 'microsoft.com',
|
|
87
|
-
'google.com', 'nvidia.com', 'intel.com', 'langchain.com'
|
|
88
|
+
'google.com', 'nvidia.com', 'intel.com', 'langchain.com',
|
|
89
|
+
'lesswrong.com' # LessWrong rationality and AI safety blog platform
|
|
88
90
|
]
|
|
89
91
|
|
|
90
92
|
return any(domain in parsed.netloc for domain in doc_domains)
|
|
@@ -395,6 +397,14 @@ class WebPageChecker:
|
|
|
395
397
|
organization = site_info.get('organization', '').lower()
|
|
396
398
|
domain = site_info.get('domain', '').lower()
|
|
397
399
|
|
|
400
|
+
# Accept generic web resource terms - these are valid for any web URL
|
|
401
|
+
generic_web_terms = [
|
|
402
|
+
'web resource', 'web site', 'website', 'online resource',
|
|
403
|
+
'online', 'web', 'internet resource', 'web page', 'webpage'
|
|
404
|
+
]
|
|
405
|
+
if cited_lower in generic_web_terms:
|
|
406
|
+
return True
|
|
407
|
+
|
|
398
408
|
# Direct matches
|
|
399
409
|
if cited_lower in organization or organization in cited_lower:
|
|
400
410
|
return True
|
utils/arxiv_utils.py
CHANGED
|
@@ -392,32 +392,29 @@ def get_bibtex_content(paper):
|
|
|
392
392
|
logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
|
|
393
393
|
tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
|
|
394
394
|
|
|
395
|
-
# Choose between .bib and .bbl files
|
|
396
|
-
#
|
|
395
|
+
# Choose between .bib and .bbl files - .bbl files take priority when they contain entries
|
|
396
|
+
# .bbl files are processed biblatex output that reflects exactly what was cited
|
|
397
397
|
if bib_content and bbl_content:
|
|
398
|
-
# Count entries in both
|
|
398
|
+
# Count entries in both for logging
|
|
399
399
|
bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
|
|
400
400
|
bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
|
|
401
401
|
|
|
402
|
-
|
|
403
|
-
filtered_bib_count = bib_entry_count
|
|
404
|
-
filtered_content = bib_content
|
|
405
|
-
if tex_content:
|
|
406
|
-
cited_keys = extract_cited_keys_from_tex({}, tex_content)
|
|
407
|
-
if cited_keys:
|
|
408
|
-
logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
|
|
409
|
-
filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
|
|
410
|
-
filtered_bib_count = len(re.findall(r'@\w+\s*\{', filtered_content))
|
|
411
|
-
|
|
412
|
-
logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, filtered .bib has {filtered_bib_count} entries")
|
|
402
|
+
logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, .bib has {bib_entry_count} entries")
|
|
413
403
|
|
|
414
|
-
#
|
|
415
|
-
if bbl_entry_count >
|
|
416
|
-
logger.info(f"Using .bbl files from ArXiv source")
|
|
404
|
+
# Only use .bbl if it actually contains bibliography entries
|
|
405
|
+
if bbl_entry_count > 0:
|
|
406
|
+
logger.info(f"Using .bbl files from ArXiv source (biblatex takes priority over bibtex)")
|
|
417
407
|
return bbl_content
|
|
418
408
|
else:
|
|
419
|
-
logger.info(f"Using
|
|
420
|
-
|
|
409
|
+
logger.info(f"Using .bib files from ArXiv source (.bbl file is empty)")
|
|
410
|
+
# If we have LaTeX content, filter BibTeX by cited keys
|
|
411
|
+
if tex_content:
|
|
412
|
+
cited_keys = extract_cited_keys_from_tex({}, tex_content)
|
|
413
|
+
if cited_keys:
|
|
414
|
+
logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
|
|
415
|
+
filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
|
|
416
|
+
return filtered_content
|
|
417
|
+
return bib_content
|
|
421
418
|
|
|
422
419
|
elif bib_content:
|
|
423
420
|
logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
|
utils/text_utils.py
CHANGED
|
@@ -2111,7 +2111,9 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
|
|
|
2111
2111
|
# Allow minor flexibility (1 author difference) but not more
|
|
2112
2112
|
if abs(len(cleaned_cited) - len(correct_names)) > 1:
|
|
2113
2113
|
from utils.error_utils import format_author_count_mismatch
|
|
2114
|
-
|
|
2114
|
+
# Convert cited names to display format (First Last) before showing in error
|
|
2115
|
+
display_cited = [format_author_for_display(author) for author in cleaned_cited]
|
|
2116
|
+
error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
|
|
2115
2117
|
return False, error_msg
|
|
2116
2118
|
|
|
2117
2119
|
# Use the shorter list for comparison
|
|
{academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/WHEEL
RENAMED
|
File without changes
|
{academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
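{academic_refchecker-1.2.45.dist-info → academic_refchecker-1.2.46.dist-info}/top_level.txt
RENAMED
|
File without changes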
|