academic-refchecker 1.2.45__tar.gz → 1.2.47__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-1.2.45/src/academic_refchecker.egg-info → academic_refchecker-1.2.47}/PKG-INFO +1 -1
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/__version__.py +1 -1
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/local_semantic_scholar.py +15 -4
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/semantic_scholar.py +35 -34
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/webpage_checker.py +12 -2
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/core/refchecker.py +20 -9
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/arxiv_utils.py +16 -19
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/doi_utils.py +23 -5
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/error_utils.py +17 -1
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/text_utils.py +27 -4
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/LICENSE +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/MANIFEST.in +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/README.md +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/pyproject.toml +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/requirements.txt +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/scripts/download_db.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/scripts/run_tests.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/setup.cfg +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/__init__.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/__init__.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/crossref.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/enhanced_hybrid_checker.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/github_checker.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/openalex.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/openreview_checker.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/config/__init__.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/config/logging.conf +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/config/settings.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/core/__init__.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/core/db_connection_pool.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/core/parallel_processor.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/database/__init__.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/llm/__init__.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/llm/base.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/llm/providers.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/scripts/__init__.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/services/__init__.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/services/pdf_processor.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/__init__.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/author_utils.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/biblatex_parser.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/bibliography_utils.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/bibtex_parser.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/config_validator.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/db_utils.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/mock_objects.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/unicode_utils.py +0 -0
- {academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/utils/url_utils.py +0 -0
{academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/local_semantic_scholar.py
RENAMED
|
@@ -430,11 +430,22 @@ class LocalNonArxivReferenceChecker:
|
|
|
430
430
|
logger.debug(f"Local DB: Author mismatch - {author_error}")
|
|
431
431
|
errors.append(create_author_error(author_error, paper_data.get('authors', [])))
|
|
432
432
|
|
|
433
|
-
# Verify year
|
|
433
|
+
# Verify year (with tolerance)
|
|
434
434
|
paper_year = paper_data.get('year')
|
|
435
|
-
if year and paper_year
|
|
436
|
-
|
|
437
|
-
|
|
435
|
+
if year and paper_year:
|
|
436
|
+
# Get year tolerance from config (default to 1 if not available)
|
|
437
|
+
year_tolerance = 1 # Default tolerance
|
|
438
|
+
try:
|
|
439
|
+
from config.settings import get_config
|
|
440
|
+
config = get_config()
|
|
441
|
+
year_tolerance = config.get('text_processing', {}).get('year_tolerance', 1)
|
|
442
|
+
except (ImportError, Exception):
|
|
443
|
+
pass # Use default if config not available
|
|
444
|
+
|
|
445
|
+
# Only flag as mismatch if the difference is greater than tolerance
|
|
446
|
+
if abs(year - paper_year) > year_tolerance:
|
|
447
|
+
logger.debug(f"Local DB: Year mismatch - cited: {year}, actual: {paper_year}")
|
|
448
|
+
errors.append(create_year_warning(year, paper_year))
|
|
438
449
|
|
|
439
450
|
# Verify DOI
|
|
440
451
|
paper_doi = None
|
|
@@ -543,49 +543,50 @@ class NonArxivReferenceChecker:
|
|
|
543
543
|
elif paper_venue and not isinstance(paper_venue, str):
|
|
544
544
|
paper_venue = str(paper_venue)
|
|
545
545
|
|
|
546
|
+
# Check venue mismatches
|
|
546
547
|
if cited_venue and paper_venue:
|
|
547
548
|
# Use the utility function to check if venues are substantially different
|
|
548
549
|
if are_venues_substantially_different(cited_venue, paper_venue):
|
|
549
550
|
from utils.error_utils import create_venue_warning
|
|
550
551
|
errors.append(create_venue_warning(cited_venue, paper_venue))
|
|
551
552
|
elif not cited_venue and paper_venue:
|
|
552
|
-
#
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
# Check if the reference already includes this ArXiv URL or equivalent DOI
|
|
561
|
-
reference_url = reference.get('url', '')
|
|
562
|
-
|
|
563
|
-
# Check for direct arXiv URL match
|
|
564
|
-
has_arxiv_url = arxiv_url in reference_url
|
|
565
|
-
|
|
566
|
-
# Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
|
|
567
|
-
arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
|
|
568
|
-
has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
|
|
569
|
-
|
|
570
|
-
if not (has_arxiv_url or has_arxiv_doi):
|
|
553
|
+
# Original reference has the venue in raw text but not parsed correctly
|
|
554
|
+
raw_text = reference.get('raw_text', '')
|
|
555
|
+
if raw_text and '#' in raw_text:
|
|
556
|
+
# Check if venue might be in the raw text format (author#title#venue#year#url)
|
|
557
|
+
parts = raw_text.split('#')
|
|
558
|
+
if len(parts) >= 3 and parts[2].strip():
|
|
559
|
+
# Venue is present in raw text but missing from parsed reference
|
|
571
560
|
errors.append({
|
|
572
561
|
'warning_type': 'venue',
|
|
573
|
-
'warning_details': f"
|
|
574
|
-
'
|
|
562
|
+
'warning_details': f"Venue missing: should include '{paper_venue}'",
|
|
563
|
+
'ref_venue_correct': paper_venue
|
|
575
564
|
})
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
565
|
+
|
|
566
|
+
# Always check for missing arXiv URLs when paper has arXiv ID
|
|
567
|
+
external_ids = paper_data.get('externalIds', {})
|
|
568
|
+
arxiv_id = external_ids.get('ArXiv') if external_ids else None
|
|
569
|
+
|
|
570
|
+
if arxiv_id:
|
|
571
|
+
# For arXiv papers, check if reference includes the arXiv URL
|
|
572
|
+
arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
|
|
573
|
+
|
|
574
|
+
# Check if the reference already includes this ArXiv URL or equivalent DOI
|
|
575
|
+
reference_url = reference.get('url', '')
|
|
576
|
+
|
|
577
|
+
# Check for direct arXiv URL match
|
|
578
|
+
has_arxiv_url = arxiv_url in reference_url
|
|
579
|
+
|
|
580
|
+
# Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
|
|
581
|
+
arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
|
|
582
|
+
has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
|
|
583
|
+
|
|
584
|
+
if not (has_arxiv_url or has_arxiv_doi):
|
|
585
|
+
errors.append({
|
|
586
|
+
'warning_type': 'url',
|
|
587
|
+
'warning_details': f"Reference could include arXiv URL: {arxiv_url}",
|
|
588
|
+
'ref_url_correct': arxiv_url
|
|
589
|
+
})
|
|
589
590
|
|
|
590
591
|
# Verify DOI
|
|
591
592
|
paper_doi = None
|
|
@@ -71,7 +71,8 @@ class WebPageChecker:
|
|
|
71
71
|
doc_indicators = [
|
|
72
72
|
'docs', 'documentation', 'readthedocs.io', 'help', 'guide', 'tutorial',
|
|
73
73
|
'reference', 'manual', 'wiki', 'blog', 'api', 'developer', 'platform',
|
|
74
|
-
'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop'
|
|
74
|
+
'index', 'research', 'news', 'insights', 'whitepaper', 'brief', 'develop',
|
|
75
|
+
'posts' # For blog posts and forum posts like LessWrong
|
|
75
76
|
]
|
|
76
77
|
|
|
77
78
|
return any(indicator in url.lower() for indicator in doc_indicators) or self._is_likely_webpage(url)
|
|
@@ -84,7 +85,8 @@ class WebPageChecker:
|
|
|
84
85
|
doc_domains = [
|
|
85
86
|
'pytorch.org', 'tensorflow.org', 'readthedocs.io', 'onnxruntime.ai',
|
|
86
87
|
'deepspeed.ai', 'huggingface.co', 'openai.com', 'microsoft.com',
|
|
87
|
-
'google.com', 'nvidia.com', 'intel.com', 'langchain.com'
|
|
88
|
+
'google.com', 'nvidia.com', 'intel.com', 'langchain.com',
|
|
89
|
+
'lesswrong.com' # LessWrong rationality and AI safety blog platform
|
|
88
90
|
]
|
|
89
91
|
|
|
90
92
|
return any(domain in parsed.netloc for domain in doc_domains)
|
|
@@ -395,6 +397,14 @@ class WebPageChecker:
|
|
|
395
397
|
organization = site_info.get('organization', '').lower()
|
|
396
398
|
domain = site_info.get('domain', '').lower()
|
|
397
399
|
|
|
400
|
+
# Accept generic web resource terms - these are valid for any web URL
|
|
401
|
+
generic_web_terms = [
|
|
402
|
+
'web resource', 'web site', 'website', 'online resource',
|
|
403
|
+
'online', 'web', 'internet resource', 'web page', 'webpage'
|
|
404
|
+
]
|
|
405
|
+
if cited_lower in generic_web_terms:
|
|
406
|
+
return True
|
|
407
|
+
|
|
398
408
|
# Direct matches
|
|
399
409
|
if cited_lower in organization or organization in cited_lower:
|
|
400
410
|
return True
|
|
@@ -1922,16 +1922,27 @@ class ArxivReferenceChecker:
|
|
|
1922
1922
|
'ref_authors_correct': ', '.join(correct_names)
|
|
1923
1923
|
})
|
|
1924
1924
|
|
|
1925
|
-
# Verify year
|
|
1925
|
+
# Verify year (with tolerance)
|
|
1926
1926
|
paper_year = paper_data.get('year')
|
|
1927
|
-
if year and paper_year
|
|
1928
|
-
|
|
1929
|
-
|
|
1930
|
-
|
|
1931
|
-
|
|
1932
|
-
|
|
1933
|
-
'
|
|
1934
|
-
|
|
1927
|
+
if year and paper_year:
|
|
1928
|
+
# Get year tolerance from config (default to 1 if not available)
|
|
1929
|
+
year_tolerance = 1 # Default tolerance
|
|
1930
|
+
try:
|
|
1931
|
+
from config.settings import get_config
|
|
1932
|
+
config = get_config()
|
|
1933
|
+
year_tolerance = config.get('text_processing', {}).get('year_tolerance', 1)
|
|
1934
|
+
except (ImportError, Exception):
|
|
1935
|
+
pass # Use default if config not available
|
|
1936
|
+
|
|
1937
|
+
# Only flag as mismatch if the difference is greater than tolerance
|
|
1938
|
+
if abs(year - paper_year) > year_tolerance:
|
|
1939
|
+
logger.debug(f"DB Verification: Year mismatch - cited: {year}, actual: {paper_year}")
|
|
1940
|
+
from utils.error_utils import format_year_mismatch
|
|
1941
|
+
errors.append({
|
|
1942
|
+
'warning_type': 'year',
|
|
1943
|
+
'warning_details': format_year_mismatch(year, paper_year),
|
|
1944
|
+
'ref_year_correct': paper_year
|
|
1945
|
+
})
|
|
1935
1946
|
|
|
1936
1947
|
# Verify DOI
|
|
1937
1948
|
if doi and external_ids.get('DOI'):
|
|
@@ -392,32 +392,29 @@ def get_bibtex_content(paper):
|
|
|
392
392
|
logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
|
|
393
393
|
tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
|
|
394
394
|
|
|
395
|
-
# Choose between .bib and .bbl files
|
|
396
|
-
#
|
|
395
|
+
# Choose between .bib and .bbl files - .bbl files take priority when they contain entries
|
|
396
|
+
# .bbl files are processed biblatex output that reflects exactly what was cited
|
|
397
397
|
if bib_content and bbl_content:
|
|
398
|
-
# Count entries in both
|
|
398
|
+
# Count entries in both for logging
|
|
399
399
|
bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
|
|
400
400
|
bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
|
|
401
401
|
|
|
402
|
-
|
|
403
|
-
filtered_bib_count = bib_entry_count
|
|
404
|
-
filtered_content = bib_content
|
|
405
|
-
if tex_content:
|
|
406
|
-
cited_keys = extract_cited_keys_from_tex({}, tex_content)
|
|
407
|
-
if cited_keys:
|
|
408
|
-
logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
|
|
409
|
-
filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
|
|
410
|
-
filtered_bib_count = len(re.findall(r'@\w+\s*\{', filtered_content))
|
|
411
|
-
|
|
412
|
-
logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, filtered .bib has {filtered_bib_count} entries")
|
|
402
|
+
logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, .bib has {bib_entry_count} entries")
|
|
413
403
|
|
|
414
|
-
#
|
|
415
|
-
if bbl_entry_count >
|
|
416
|
-
logger.info(f"Using .bbl files from ArXiv source")
|
|
404
|
+
# Only use .bbl if it actually contains bibliography entries
|
|
405
|
+
if bbl_entry_count > 0:
|
|
406
|
+
logger.info(f"Using .bbl files from ArXiv source (biblatex takes priority over bibtex)")
|
|
417
407
|
return bbl_content
|
|
418
408
|
else:
|
|
419
|
-
logger.info(f"Using
|
|
420
|
-
|
|
409
|
+
logger.info(f"Using .bib files from ArXiv source (.bbl file is empty)")
|
|
410
|
+
# If we have LaTeX content, filter BibTeX by cited keys
|
|
411
|
+
if tex_content:
|
|
412
|
+
cited_keys = extract_cited_keys_from_tex({}, tex_content)
|
|
413
|
+
if cited_keys:
|
|
414
|
+
logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
|
|
415
|
+
filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
|
|
416
|
+
return filtered_content
|
|
417
|
+
return bib_content
|
|
421
418
|
|
|
422
419
|
elif bib_content:
|
|
423
420
|
logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
|
|
@@ -99,9 +99,8 @@ def compare_dois(doi1: str, doi2: str) -> bool:
|
|
|
99
99
|
"""
|
|
100
100
|
Compare two DOIs for equality, handling different formats and prefixes.
|
|
101
101
|
|
|
102
|
-
This function performs exact matching after normalization,
|
|
103
|
-
|
|
104
|
-
prefixes, case differences, and punctuation.
|
|
102
|
+
This function performs exact matching after normalization, with support
|
|
103
|
+
for partial DOI citations where a shorter DOI is a valid prefix of a longer one.
|
|
105
104
|
|
|
106
105
|
Args:
|
|
107
106
|
doi1: First DOI to compare
|
|
@@ -117,8 +116,27 @@ def compare_dois(doi1: str, doi2: str) -> bool:
|
|
|
117
116
|
norm_doi1 = normalize_doi(doi1)
|
|
118
117
|
norm_doi2 = normalize_doi(doi2)
|
|
119
118
|
|
|
120
|
-
#
|
|
121
|
-
|
|
119
|
+
# First try exact match
|
|
120
|
+
if norm_doi1 == norm_doi2:
|
|
121
|
+
return True
|
|
122
|
+
|
|
123
|
+
# Handle partial DOI citations - if one DOI is a prefix of the other, consider it a match
|
|
124
|
+
# This handles cases like "10.1007" being cited instead of the full "10.1007/s10458-025-09691-y"
|
|
125
|
+
if len(norm_doi1) != len(norm_doi2):
|
|
126
|
+
shorter_doi = norm_doi1 if len(norm_doi1) < len(norm_doi2) else norm_doi2
|
|
127
|
+
longer_doi = norm_doi2 if len(norm_doi1) < len(norm_doi2) else norm_doi1
|
|
128
|
+
|
|
129
|
+
# Only consider it a valid partial match if:
|
|
130
|
+
# 1. The shorter DOI is at least 7 characters (e.g., "10.1007")
|
|
131
|
+
# 2. The longer DOI starts with the shorter DOI
|
|
132
|
+
# 3. The next character in the longer DOI is '/' or '.' (valid DOI separators)
|
|
133
|
+
if (len(shorter_doi) >= 7 and
|
|
134
|
+
longer_doi.startswith(shorter_doi) and
|
|
135
|
+
len(longer_doi) > len(shorter_doi) and
|
|
136
|
+
longer_doi[len(shorter_doi)] in ['/', '.']):
|
|
137
|
+
return True
|
|
138
|
+
|
|
139
|
+
return False
|
|
122
140
|
|
|
123
141
|
|
|
124
142
|
def construct_doi_url(doi: str) -> str:
|
|
@@ -183,6 +183,14 @@ def clean_venue_for_comparison(venue: str) -> str:
|
|
|
183
183
|
return normalize_venue_for_display(venue)
|
|
184
184
|
|
|
185
185
|
|
|
186
|
+
def format_missing_venue(correct_venue: str) -> str:
|
|
187
|
+
"""
|
|
188
|
+
Format a missing venue message with only the actual value.
|
|
189
|
+
"""
|
|
190
|
+
# Only show the actual venue; omit the empty cited line
|
|
191
|
+
return f"Missing venue: '{correct_venue}'"
|
|
192
|
+
|
|
193
|
+
|
|
186
194
|
def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]:
|
|
187
195
|
"""
|
|
188
196
|
Create a standardized venue warning dictionary.
|
|
@@ -197,7 +205,15 @@ def create_venue_warning(cited_venue: str, correct_venue: str) -> Dict[str, str]
|
|
|
197
205
|
# Clean both venues for display in the warning
|
|
198
206
|
clean_cited = clean_venue_for_comparison(cited_venue)
|
|
199
207
|
clean_correct = clean_venue_for_comparison(correct_venue)
|
|
200
|
-
|
|
208
|
+
|
|
209
|
+
# If cited venue cleans to empty, treat as missing venue instead of mismatch
|
|
210
|
+
if not clean_cited and clean_correct:
|
|
211
|
+
return {
|
|
212
|
+
'warning_type': 'venue',
|
|
213
|
+
'warning_details': format_missing_venue(clean_correct),
|
|
214
|
+
'ref_venue_correct': correct_venue
|
|
215
|
+
}
|
|
216
|
+
|
|
201
217
|
return {
|
|
202
218
|
'warning_type': 'venue',
|
|
203
219
|
'warning_details': format_three_line_mismatch("Venue mismatch", clean_cited, clean_correct),
|
|
@@ -506,8 +506,10 @@ def clean_author_name(author):
|
|
|
506
506
|
# Fix spacing around periods in initials (e.g., "Y . Li" -> "Y. Li")
|
|
507
507
|
author = re.sub(r'(\w)\s+\.', r'\1.', author)
|
|
508
508
|
|
|
509
|
-
# Remove common prefixes
|
|
510
|
-
|
|
509
|
+
# Remove common honorific prefixes only when they are standalone at the start (require trailing whitespace)
|
|
510
|
+
# Previous pattern falsely removed the leading "Mr" from names like "Mrinmaya" due to optional whitespace.
|
|
511
|
+
# Anchor to start and require at least one space after the title to avoid stripping inside longer names.
|
|
512
|
+
author = re.sub(r'^(?:Dr|Prof|Professor|Mr|Ms|Mrs)\.?\s+', '', author, flags=re.IGNORECASE)
|
|
511
513
|
|
|
512
514
|
# Remove email addresses
|
|
513
515
|
author = re.sub(r'\S+@\S+\.\S+', '', author)
|
|
@@ -2111,7 +2113,9 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
|
|
|
2111
2113
|
# Allow minor flexibility (1 author difference) but not more
|
|
2112
2114
|
if abs(len(cleaned_cited) - len(correct_names)) > 1:
|
|
2113
2115
|
from utils.error_utils import format_author_count_mismatch
|
|
2114
|
-
|
|
2116
|
+
# Convert cited names to display format (First Last) before showing in error
|
|
2117
|
+
display_cited = [format_author_for_display(author) for author in cleaned_cited]
|
|
2118
|
+
error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
|
|
2115
2119
|
return False, error_msg
|
|
2116
2120
|
|
|
2117
2121
|
# Use the shorter list for comparison
|
|
@@ -3586,6 +3590,12 @@ def calculate_title_similarity(title1: str, title2: str) -> float:
|
|
|
3586
3590
|
# Normalize titles for comparison
|
|
3587
3591
|
t1 = title1.lower().strip()
|
|
3588
3592
|
t2 = title2.lower().strip()
|
|
3593
|
+
|
|
3594
|
+
# Remove trailing year suffixes like ", 2024" or " 2024" for robust matching
|
|
3595
|
+
def strip_trailing_year(s: str) -> str:
|
|
3596
|
+
return re.sub(r"[,\s]*\b(19|20)\d{2}\b\s*$", "", s).strip()
|
|
3597
|
+
t1 = strip_trailing_year(t1)
|
|
3598
|
+
t2 = strip_trailing_year(t2)
|
|
3589
3599
|
|
|
3590
3600
|
# Exact match
|
|
3591
3601
|
if t1 == t2:
|
|
@@ -4674,6 +4684,13 @@ def normalize_venue_for_display(venue: str) -> str:
|
|
|
4674
4684
|
|
|
4675
4685
|
venue_text = venue.strip()
|
|
4676
4686
|
|
|
4687
|
+
# Strip leading editor name lists like "..., editors, Venue ..." or "..., eds., Venue ..."
|
|
4688
|
+
# This prevents author/editor lists from being treated as venue
|
|
4689
|
+
# Match 'editors,' 'editor,' or 'eds.,' possibly after a comma; capture the remainder as venue
|
|
4690
|
+
editors_match = re.search(r"(?:^|,)\s*(?:editors?|eds?\.?|editor)\s*,\s*(.+)$", venue_text, re.IGNORECASE)
|
|
4691
|
+
if editors_match:
|
|
4692
|
+
venue_text = editors_match.group(1).strip()
|
|
4693
|
+
|
|
4677
4694
|
# Extract venue from complex editor strings (e.g. "In Smith, J.; and Doe, K., eds., Conference Name, volume 1")
|
|
4678
4695
|
# This handles patterns like "In [authors], eds., [venue], [optional metadata]" (case-insensitive)
|
|
4679
4696
|
editor_match = re.search(r'in\s+[^,]+(?:,\s*[^,]*)*,\s*eds?\.,\s*(.+?)(?:,\s*volume\s*\d+|,\s*pp?\.|$)', venue_text, re.IGNORECASE)
|
|
@@ -4700,7 +4717,9 @@ def normalize_venue_for_display(venue: str) -> str:
|
|
|
4700
4717
|
prefixes_to_remove = [
|
|
4701
4718
|
r'^\d{4}\s+\d+(st|nd|rd|th)\s+', # "2012 IEEE/RSJ"
|
|
4702
4719
|
r'^\d{4}\s+', # "2024 "
|
|
4703
|
-
|
|
4720
|
+
# Remove 'Proceedings of [the] [ORG]* [ordinal]*' only when followed by at least one word
|
|
4721
|
+
# This avoids cutting a venue down to just 'Proceedings of the'
|
|
4722
|
+
r'^proceedings\s+of\s+(?!the\s*$)(?:the\s+)?(?:(?:acm|ieee|usenix|aaai|sigcomm|sigkdd|sigmod|sigops|vldb|osdi|sosp|eurosys)\s+)*(?:\d+(?:st|nd|rd|th)\s+)?',
|
|
4704
4723
|
r'^proc\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?', # "Proc. of the IEEE" (require "of")
|
|
4705
4724
|
r'^procs\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?', # "Procs. of the IEEE" (require "of")
|
|
4706
4725
|
r'^in\s+',
|
|
@@ -4739,4 +4758,8 @@ def normalize_venue_for_display(venue: str) -> str:
|
|
|
4739
4758
|
venue_text = re.sub(r'\s+', ' ', venue_text) # Normalize whitespace
|
|
4740
4759
|
venue_text = venue_text.strip()
|
|
4741
4760
|
|
|
4761
|
+
# If what's left is too generic (e.g., just 'Proceedings of the'), treat as no venue
|
|
4762
|
+
if venue_text.lower() in {"proceedings of the", "proceedings of"}:
|
|
4763
|
+
return ""
|
|
4764
|
+
|
|
4742
4765
|
return venue_text
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/enhanced_hybrid_checker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.45 → academic_refchecker-1.2.47}/src/checkers/openreview_checker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|