academic-refchecker 1.2.50-py3-none-any.whl → 1.2.52-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __version__.py +1 -1
- {academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/METADATA +10 -1
- {academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/RECORD +15 -14
- checkers/github_checker.py +4 -1
- checkers/openreview_checker.py +10 -5
- checkers/pdf_paper_checker.py +493 -0
- checkers/semantic_scholar.py +8 -6
- checkers/webpage_checker.py +428 -2
- core/parallel_processor.py +4 -1
- core/refchecker.py +172 -75
- utils/text_utils.py +134 -13
- {academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/top_level.txt +0 -0
core/refchecker.py
CHANGED
@@ -50,7 +50,8 @@ from utils.text_utils import (clean_author_name, clean_title, clean_title_basic,
     detect_latex_bibliography_format, extract_latex_references,
     detect_standard_acm_natbib_format, strip_latex_commands,
     format_corrected_reference, is_name_match, enhanced_name_match,
-    calculate_title_similarity, normalize_arxiv_url, deduplicate_urls)
+    calculate_title_similarity, normalize_arxiv_url, deduplicate_urls,
+    compare_authors)
 from utils.config_validator import ConfigValidator
 from services.pdf_processor import PDFProcessor
 from checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker
@@ -1789,7 +1790,7 @@ class ArxivReferenceChecker:
             if authors:
                 db_authors = [author.get('name', '') for author in check_paper_data['authors']]
 
-                authors_match, author_error = self.compare_authors(authors, db_authors)
+                authors_match, author_error = compare_authors(authors, db_authors)
                 if authors_match:
                     paper_data = check_paper_data
                     search_strategy = "Normalized title with author match"
@@ -1901,10 +1902,12 @@ class ArxivReferenceChecker:
 
         if normalized_title != db_title:
             from utils.error_utils import format_title_mismatch
+            # Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
+            clean_cited_title = strip_latex_commands(title)
             logger.debug(f"DB Verification: Title mismatch - cited: '{title}', actual: '{paper_data.get('title')}'")
             errors.append({
                 'error_type': 'title',
-                'error_details': format_title_mismatch(title, paper_data.get('title')),
+                'error_details': format_title_mismatch(clean_cited_title, paper_data.get('title')),
                 'ref_title_correct': paper_data.get('title')
             })
 
@@ -1912,7 +1915,7 @@ class ArxivReferenceChecker:
         if authors and paper_data.get('authors'):
             # Extract author names from database data
             correct_names = [author.get('name', '') for author in paper_data['authors']]
-            authors_match, author_error = self.compare_authors(authors, correct_names)
+            authors_match, author_error = compare_authors(authors, correct_names)
 
             if not authors_match:
                 logger.debug(f"DB Verification: Author mismatch - {author_error}")
@@ -2018,8 +2021,20 @@ class ArxivReferenceChecker:
             logger.debug(f"Database mode: Initial paper_url from database checker: {paper_url}")
 
             if not verified_data:
-                # Mark as unverified but
-                return [{"error_type": "unverified", "error_details": "Reference could not be verified in database"}], paper_url, None
+                # Mark as unverified but check URL for more specific reason or verification
+                if reference.get('url', '').strip():
+                    # Use raw URL verifier to check if it can be verified or get specific reason
+                    url_verified_data, url_errors, url_checked = self.verify_raw_url_reference(reference)
+                    if url_verified_data:
+                        # URL verification succeeded - return as verified
+                        logger.debug(f"Database mode: URL verification succeeded for unverified reference")
+                        return None, url_checked, url_verified_data
+                    else:
+                        # URL verification failed - use specific error reason
+                        url_error_details = url_errors[0].get('error_details', 'Reference could not be verified in database') if url_errors else 'Reference could not be verified in database'
+                        return [{"error_type": "unverified", "error_details": url_error_details}], paper_url, None
+                else:
+                    return [{"error_type": "unverified", "error_details": "Reference could not be verified in database"}], paper_url, None
 
             # Convert database errors to our format
             formatted_errors = []
@@ -2115,7 +2130,29 @@ class ArxivReferenceChecker:
             return [{"error_type": "unverified", "error_details": "Database connection not available"}], None, None
 
         # For non-database mode, use the standard reference verification
-        return self.verify_reference_standard(source_paper, reference)
+        errors, paper_url, verified_data = self.verify_reference_standard(source_paper, reference)
+
+        # If standard verification failed and the reference has a URL, try raw URL verification
+        if errors and verified_data is None:
+            # Check if there's an unverified error
+            unverified_errors = [e for e in errors if e.get('error_type') == 'unverified']
+            if unverified_errors and reference.get('url', '').strip():
+                # Use raw URL verifier to check if it can be verified or get specific reason
+                url_verified_data, url_errors, url_checked = self.verify_raw_url_reference(reference)
+                if url_verified_data:
+                    # URL verification succeeded - return as verified
+                    logger.debug(f"Non-database mode: URL verification succeeded for unverified reference")
+                    return None, url_checked, url_verified_data
+                else:
+                    # URL verification failed - use specific error reason
+                    url_error_details = url_errors[0].get('error_details', 'Reference could not be verified') if url_errors else 'Reference could not be verified'
+                    # Update the unverified error with the specific reason
+                    for error in errors:
+                        if error.get('error_type') == 'unverified':
+                            error['error_details'] = url_error_details
+                            break
+
+        return errors, paper_url, verified_data
 
 
     def verify_github_reference(self, reference):
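
Both the database-mode and non-database-mode call sites share one fallback rule. A standalone sketch of it (hypothetical helper name, mirroring the diff's logic rather than calling the package):

    def apply_url_fallback(errors, reference, verify_raw_url):
        """Mirror of the new fallback: rescue or sharpen 'unverified' results."""
        # Only fires when verification failed outright and the entry carries a URL
        if not errors or not reference.get('url', '').strip():
            return errors, None
        if not any(e.get('error_type') == 'unverified' for e in errors):
            return errors, None
        verified, url_errors, _url = verify_raw_url(reference)
        if verified:
            return None, verified  # the URL check rescued the reference
        # Otherwise replace the generic message with the URL-specific reason
        detail = (url_errors[0].get('error_details', 'Reference could not be verified')
                  if url_errors else 'Reference could not be verified')
        for e in errors:
            if e.get('error_type') == 'unverified':
                e['error_details'] = detail
                break
        return errors, None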
@@ -2250,6 +2287,55 @@ class ArxivReferenceChecker:
             formatted_errors.append(formatted_error)
         return formatted_errors if formatted_errors else [{"error_type": "unverified", "error_details": "Web page could not be verified"}], page_url, None
 
+    def verify_raw_url_reference(self, reference):
+        """
+        Verify a raw URL from an unverified reference - can return verified data if appropriate
+
+        Args:
+            reference: The reference to verify (already determined to be unverified by paper validators)
+
+        Returns:
+            Tuple of (verified_data, errors, url) where:
+            - verified_data: Dict with verified data if URL should be considered verified, None otherwise
+            - errors: List of error dictionaries
+            - url: The URL that was checked
+        """
+        logger.debug(f"Checking raw URL for unverified reference: {reference.get('title', 'Untitled')}")
+
+        # Extract URL from reference
+        web_url = reference.get('url', '').strip()
+        if not web_url:
+            return None, [{"error_type": "unverified", "error_details": "Reference could not be verified"}], None
+
+        # First try PDF paper checker if URL appears to be a PDF
+        from checkers.pdf_paper_checker import PDFPaperChecker
+        pdf_checker = PDFPaperChecker()
+
+        if pdf_checker.can_check_reference(reference):
+            logger.debug(f"URL appears to be PDF, trying PDF verification: {web_url}")
+            try:
+                verified_data, errors, url = pdf_checker.verify_reference(reference)
+                if verified_data:
+                    logger.debug(f"PDF verification successful for: {reference.get('title', 'Untitled')}")
+                    return verified_data, errors, url
+                else:
+                    logger.debug(f"PDF verification failed, falling back to web page verification")
+            except Exception as e:
+                logger.error(f"Error in PDF verification: {e}")
+                logger.debug(f"PDF verification error, falling back to web page verification")
+
+        # Fall back to web page checker
+        from checkers.webpage_checker import WebPageChecker
+        webpage_checker = WebPageChecker()
+
+        try:
+            verified_data, errors, url = webpage_checker.verify_raw_url_for_unverified_reference(reference)
+            logger.debug(f"Raw URL verification result: verified_data={verified_data is not None}, errors={len(errors)}, url={url}")
+            return verified_data, errors, url
+        except Exception as e:
+            logger.error(f"Error checking raw URL: {e}")
+            return None, [{"error_type": "unverified", "error_details": "Reference could not be verified"}], web_url
+
     def verify_reference_standard(self, source_paper, reference):
         """
         Verify if a reference is accurate using GitHub, Semantic Scholar, or other checkers
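
One design note: verify_raw_url_reference returns (verified_data, errors, url), the reverse of the (errors, paper_url, verified_data) triple its callers ultimately return, which is why the call sites unpack it as url_verified_data, url_errors, url_checked and re-order before returning. The imports of PDFPaperChecker and WebPageChecker are kept local to the method, so the new checkers/pdf_paper_checker.py module is only loaded when a raw URL actually needs checking.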
@@ -2271,11 +2357,6 @@ class ArxivReferenceChecker:
         if github_result:
             return github_result
 
-        # Next, check if this is a web page reference
-        webpage_result = self.verify_webpage_reference(reference)
-        if webpage_result:
-            return webpage_result
-
         # Use the Semantic Scholar client to verify the reference
         verified_data, errors, paper_url = self.non_arxiv_checker.verify_reference(reference)
 
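
With this removal, verify_reference_standard no longer consults web-page URLs up front; they are now reached only through the raw-URL fallback above, after the GitHub and Semantic Scholar checkers have failed to verify the reference.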
@@ -3054,6 +3135,13 @@ class ArxivReferenceChecker:
         try:
             # Extract bibliography
             bibliography = self.extract_bibliography(paper, debug_mode)
+
+            # Apply deduplication to all bibliography sources (not just LLM-extracted)
+            if len(bibliography) > 1:  # Only deduplicate if we have multiple references
+                original_count = len(bibliography)
+                bibliography = self._deduplicate_bibliography_entries(bibliography)
+                if len(bibliography) < original_count:
+                    logger.debug(f"Deduplicated {original_count} references to {len(bibliography)} unique references")
 
             # Update statistics
             self.total_papers_processed += 1
@@ -3493,8 +3581,9 @@ class ArxivReferenceChecker:
                 except Exception as e:
                     logger.error(f"LLM fallback failed: {e}")
                     return []
-
-
+            if len(biblatex_refs) > 0:
+                logger.debug("Using biblatex file")
+                return biblatex_refs
 
         # For non-standard formats, try LLM-based extraction if available
         if self.llm_extractor:
@@ -4284,9 +4373,9 @@ class ArxivReferenceChecker:
             # If either has no title, can't reliably determine if duplicate
             return False
 
-        # If titles match exactly, consider them duplicates
-        # This handles the case where the same paper appears multiple times
-        if seg1['title'] == seg2['title']:
+        # If titles match exactly (case-insensitive), consider them duplicates
+        # This handles the case where the same paper appears multiple times with different capitalization
+        if seg1['title'].lower() == seg2['title'].lower():
             return True
 
         # Special case: Check if one title is an arXiv identifier and the other is a real title
@@ -4299,16 +4388,54 @@ class ArxivReferenceChecker:
         author1 = seg1['author']
         author2 = seg2['author']
 
-        if author1 and author2 and author1 == author2:
+        if author1 and author2 and author1.lower() == author2.lower():
             # Same authors - check if one title is substring of other or significant similarity
-            title1 = seg1['title']
-            title2 = seg2['title']
+            title1 = seg1['title'].lower()
+            title2 = seg2['title'].lower()
 
             if (title1 in title2 or title2 in title1):
                 return True
 
         return False
 
+    def _deduplicate_bibliography_entries(self, bibliography):
+        """
+        Deduplicate bibliography entries using title and author comparison.
+
+        This works with structured reference dictionaries from BibTeX/LaTeX parsing,
+        as opposed to _deduplicate_references_with_segment_matching which works with raw text.
+
+        Args:
+            bibliography: List of reference dictionaries with 'title', 'authors', etc.
+
+        Returns:
+            List of unique reference dictionaries
+        """
+        if len(bibliography) <= 1:
+            return bibliography
+
+        unique_refs = []
+        seen_titles = set()
+
+        for ref in bibliography:
+            title = ref.get('title', '').strip()
+            if not title:
+                # Keep references without titles (they can't be deduplicated)
+                unique_refs.append(ref)
+                continue
+
+            # Normalize title for comparison (case-insensitive, basic cleanup)
+            normalized_title = title.lower().strip()
+
+            # Check if we've seen this title before (case-insensitive)
+            if normalized_title in seen_titles:
+                logger.debug(f"Skipping duplicate reference: '{title}'")
+            else:
+                unique_refs.append(ref)
+                seen_titles.add(normalized_title)
+
+        return unique_refs
+
     def _is_arxiv_identifier_title_mismatch(self, seg1, seg2):
         """
         Check if one reference has an arXiv identifier as title while the other has a real title,
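
Note that, despite the docstring's mention of author comparison, the helper keys on the normalized title alone. A minimal standalone sketch of the rule it applies (not the package API):

    def dedupe_by_title(bibliography):
        seen, unique = set(), []
        for ref in bibliography:
            key = ref.get('title', '').strip().lower()
            if not key:
                unique.append(ref)  # untitled entries are always kept
            elif key not in seen:
                unique.append(ref)
                seen.add(key)
        return unique

    refs = [{'title': 'Attention Is All You Need'},
            {'title': 'ATTENTION IS ALL YOU NEED'},
            {'title': ''}]
    print(len(dedupe_by_title(refs)))  # 2: the case-variant duplicate is dropped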
@@ -5087,60 +5214,6 @@ class ArxivReferenceChecker:
 
         return references
 
-    def compare_authors(self, cited_authors, correct_authors):
-        """
-        Compare author lists to check if they match using improved name matching.
-        Uses the utility function is_name_match for robust author name comparison.
-        """
-        # Clean up author names
-        cleaned_cited = []
-        for author in cited_authors:
-            # Remove reference numbers (e.g., "[1]")
-            author = re.sub(r'^\[\d+\]', '', author)
-            # Remove line breaks
-            author = author.replace('\n', ' ')
-
-            # Handle "et al" cases properly
-            author_clean = author.strip()
-            if author_clean.lower() == 'et al':
-                # Skip pure "et al" entries
-                continue
-            elif 'et al' in author_clean.lower():
-                # Remove "et al" from the author name (e.g., "S. M. Lundberg et al" -> "S. M. Lundberg")
-                author_clean = re.sub(r'\s+et\s+al\.?', '', author_clean, flags=re.IGNORECASE).strip()
-                if author_clean:  # Only add if something remains
-                    cleaned_cited.append(author_clean)
-            else:
-                cleaned_cited.append(author_clean)
-
-        if not cleaned_cited:
-            return True, "No authors to compare"
-
-        # Handle "et al" cases and length mismatches
-        has_et_al = any('et al' in a.lower() for a in cited_authors)
-
-        if len(cleaned_cited) < len(correct_authors) and (has_et_al or len(cleaned_cited) <= 3):
-            # Only compare the authors that are listed
-            correct_authors = correct_authors[:len(cleaned_cited)]
-        elif len(cleaned_cited) > len(correct_authors) and len(correct_authors) >= 3:
-            # Use available correct authors
-            cleaned_cited = cleaned_cited[:len(correct_authors)]
-
-        # If there's a big count mismatch and no "et al", it's likely an error
-        if abs(len(cleaned_cited) - len(correct_authors)) > 3 and not has_et_al:
-            return False, "Author count mismatch"
-
-        # Compare first author (most important) using the improved utility function
-        if cleaned_cited and correct_authors:
-            # Use raw names for comparison (is_name_match handles normalization internally)
-            cited_first = cleaned_cited[0]
-            correct_first = correct_authors[0]
-
-            if not enhanced_name_match(cited_first, correct_first):
-                from utils.error_utils import format_first_author_mismatch
-                return False, format_first_author_mismatch(cited_first, correct_first)
-
-        return True, "Authors match"
 
     def normalize_text(self, text):
         """
@@ -5251,6 +5324,19 @@ class ArxivReferenceChecker:
             return False
         return True
 
+    def compare_authors(self, authors1, authors2):
+        """
+        Compare authors using the text_utils compare_authors function.
+
+        Args:
+            authors1: First list of authors
+            authors2: Second list of authors
+
+        Returns:
+            Tuple of (match_result, error_message)
+        """
+        return compare_authors(authors1, authors2)
+
     def _verify_references_sequential(self, paper, bibliography, paper_errors, error_types, unverified_count, debug_mode):
         """
         Sequential reference verification (original implementation)
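
The class method is now a thin delegate, so matching behavior is defined entirely by utils.text_utils.compare_authors. A usage sketch, assuming the wheel's top-level modules are importable as in the diff's own imports; the exact message text may differ:

    from utils.text_utils import compare_authors

    match, detail = compare_authors(["S. M. Lundberg et al"],
                                    ["Scott M. Lundberg", "Su-In Lee"])
    # "et al" handling should let this match on the first author alone
    print(match, detail)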
@@ -5267,7 +5353,10 @@ class ArxivReferenceChecker:
                 ref_id = self.extract_arxiv_id_from_url(reference['url'])
 
             # Print reference info in non-debug mode (improved formatting)
-            title = reference.get('title', 'Untitled')
+            raw_title = reference.get('title', 'Untitled')
+            # Clean LaTeX commands from title for display
+            from utils.text_utils import strip_latex_commands
+            title = strip_latex_commands(raw_title)
             from utils.text_utils import format_authors_for_display
             authors = format_authors_for_display(reference.get('authors', []))
             year = reference.get('year', '')
@@ -5504,6 +5593,14 @@ class ArxivReferenceChecker:
         """Categorize the unverified error into checker error or not found"""
         error_details_lower = error_details.lower()
 
+        # New specific URL-based unverified reasons
+        if error_details_lower == "non-existent web page":
+            return "Non-existent web page"
+        elif error_details_lower == "paper not found and url doesn't reference it":
+            return "Paper not found and URL doesn't reference it"
+        elif error_details_lower == "paper not verified but url references paper":
+            return "Paper not verified but URL references paper"
+
         # Checker/API errors
         api_error_patterns = [
             'api error', 'rate limit', 'http error', 'network error',
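
These three early returns match the whole lowercased error string exactly, so any drift in the wording emitted by the URL checkers would fall through to the generic pattern lists below. A table-driven sketch of the same mapping:

    SPECIFIC_URL_REASONS = {
        "non-existent web page": "Non-existent web page",
        "paper not found and url doesn't reference it": "Paper not found and URL doesn't reference it",
        "paper not verified but url references paper": "Paper not verified but URL references paper",
    }

    def categorize(error_details):
        return SPECIFIC_URL_REASONS.get(error_details.lower())  # None -> fall through

    print(categorize("Non-existent web page"))  # -> Non-existent web page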
utils/text_utils.py
CHANGED
@@ -580,6 +580,9 @@ def clean_title_for_search(title):
     if not isinstance(title, str):
         return str(title) if title is not None else ''
 
+    # Strip LaTeX commands to handle math formatting and other LaTeX markup
+    title = strip_latex_commands(title)
+
     # Clean up newlines and normalize whitespace (but preserve other structure)
     title = title.replace('\n', ' ').strip()
     title = re.sub(r'\s+', ' ', title)  # Normalize whitespace only
@@ -753,8 +756,11 @@ def normalize_paper_title(title: str) -> str:
     if not title:
         return ""
 
+    # Strip LaTeX commands first to handle math formatting consistently
+    normalized = strip_latex_commands(title)
+
     # Convert to lowercase
-    normalized = title.lower()
+    normalized = normalized.lower()
 
     # Remove common prefixes that don't affect the actual title content
     prefixes_to_remove = [
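
With LaTeX stripped before lowercasing, brace-protected acronyms should normalize the same as their plain forms. A quick check (a sketch, assuming the wheel's utils package is importable):

    from utils.text_utils import normalize_paper_title

    print(normalize_paper_title("Evaluating {LLM}s") ==
          normalize_paper_title("Evaluating LLMs"))  # expected: True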
@@ -2107,21 +2113,37 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=None):
 
         return True, f"Authors match (verified {len(cleaned_cited)} of {len(correct_names)} with et al)"
 
+    # Detect if cited authors look like parsing fragments
+    # (many short single-word entries that might be first/last name fragments)
+    def looks_like_fragments(authors_list):
+        if len(authors_list) < 4:  # Need at least 4 to detect fragment pattern
+            return False
+        single_word_count = sum(1 for author in authors_list if len(author.strip().split()) == 1)
+        return single_word_count >= len(authors_list) * 0.7  # 70% or more are single words
+
     # Normal case without "et al" - compare all authors
     if len(cleaned_cited) != len(correct_names):
-
-        #
-        if
+
+        # Check if cited authors look like parsing fragments
+        if looks_like_fragments(cleaned_cited):
             from utils.error_utils import format_author_count_mismatch
-            # Convert cited names to display format (First Last) before showing in error
             display_cited = [format_author_for_display(author) for author in cleaned_cited]
             error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
             return False, error_msg
 
-        #
-
-
-
+        # For all count mismatches, show the count mismatch error
+        if len(cleaned_cited) < len(correct_names):
+            from utils.error_utils import format_author_count_mismatch
+            display_cited = [format_author_for_display(author) for author in cleaned_cited]
+            error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
+            return False, error_msg
+
+        # For cases where cited > correct, also show count mismatch
+        elif len(cleaned_cited) > len(correct_names):
+            from utils.error_utils import format_author_count_mismatch
+            display_cited = [format_author_for_display(author) for author in cleaned_cited]
+            error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
+            return False, error_msg
     else:
         comparison_cited = cleaned_cited
         comparison_correct = correct_names
@@ -2484,8 +2506,64 @@ def strip_latex_commands(text):
     # Remove font size commands
     text = re.sub(r'\\(tiny|scriptsize|footnotesize|small|normalsize|large|Large|LARGE|huge|Huge)\b', '', text)
 
-    #
-
+    # Handle complex math mode patterns first
+    # Pattern like $\{$$\mu$second-scale$\}$ should become μsecond-scale
+    def process_complex_math(match):
+        content = match.group(1)
+        # Handle common Greek letters
+        content = re.sub(r'\\mu\b', 'μ', content)  # \mu -> μ
+        content = re.sub(r'\\alpha\b', 'α', content)  # \alpha -> α
+        content = re.sub(r'\\beta\b', 'β', content)  # \beta -> β
+        content = re.sub(r'\\gamma\b', 'γ', content)  # \gamma -> γ
+        content = re.sub(r'\\delta\b', 'δ', content)  # \delta -> δ
+        content = re.sub(r'\\epsilon\b', 'ε', content)  # \epsilon -> ε
+        content = re.sub(r'\\lambda\b', 'λ', content)  # \lambda -> λ
+        content = re.sub(r'\\pi\b', 'π', content)  # \pi -> π
+        content = re.sub(r'\\sigma\b', 'σ', content)  # \sigma -> σ
+        content = re.sub(r'\\theta\b', 'θ', content)  # \theta -> θ
+        # Remove any remaining LaTeX commands and braces from inside math
+        content = re.sub(r'\\[a-zA-Z]+\b', '', content)
+        content = re.sub(r'[{}]', '', content)
+        # Clean up any remaining $ signs
+        content = re.sub(r'\$+', '', content)
+        return content
+
+    # Handle complex nested math patterns first
+    # Pattern like $\{$$\mu$second-scale$\}$ should become μsecond-scale
+    def process_nested_math_specifically(match):
+        content = match.group(0)
+        # Handle the specific pattern: $\{$$\mu$second-scale$\}$
+        # Extract the meaningful parts
+        if r'\mu' in content:
+            # Replace \mu with μ and extract the surrounding text
+            content = re.sub(r'\\mu\b', 'μ', content)
+        # Remove all LaTeX math markup
+        content = re.sub(r'[\$\{\}\\]+', '', content)
+        return content
+
+    # Handle the specific problematic pattern
+    text = re.sub(r'\$\\\{[^}]*\\\}\$', process_nested_math_specifically, text)
+
+    # Handle Greek letters in math mode before removing delimiters
+    def process_standard_math(match):
+        content = match.group(1)
+        # Handle common Greek letters - content has single backslashes
+        content = re.sub(r'\\mu\b', 'μ', content)
+        content = re.sub(r'\\alpha\b', 'α', content)
+        content = re.sub(r'\\beta\b', 'β', content)
+        content = re.sub(r'\\gamma\b', 'γ', content)
+        content = re.sub(r'\\delta\b', 'δ', content)
+        content = re.sub(r'\\epsilon\b', 'ε', content)
+        content = re.sub(r'\\lambda\b', 'λ', content)
+        content = re.sub(r'\\pi\b', 'π', content)
+        content = re.sub(r'\\sigma\b', 'σ', content)
+        content = re.sub(r'\\theta\b', 'θ', content)
+        # Remove any remaining LaTeX commands
+        content = re.sub(r'\\[a-zA-Z]+\b', '', content)
+        return content
+
+    # Remove standard math mode delimiters with Greek letter processing
+    text = re.sub(r'\$([^$]*)\$', process_standard_math, text)
     text = re.sub(r'\\begin\{equation\}.*?\\end\{equation\}', '', text, flags=re.DOTALL)
     text = re.sub(r'\\begin\{align\}.*?\\end\{align\}', '', text, flags=re.DOTALL)
 
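
The motivating case named in the comments is $\mu$second-scale, which previously survived as raw math markup. Note that process_complex_math is defined here but never passed to a re.sub call in this hunk; only the nested-specific and standard handlers are wired up. A quick check of the intended behavior (expected output taken from the diff's own comments, not re-verified here):

    from utils.text_utils import strip_latex_commands

    print(strip_latex_commands(r"Achieving $\mu$second-scale tail latency"))
    # expected: Achieving μsecond-scale tail latency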
@@ -3369,7 +3447,18 @@ def _extract_corrected_reference_data(error_entry: dict, corrected_data: dict) -
     """
     # Get the corrected information
     correct_title = error_entry.get('ref_title_correct') or corrected_data.get('title', '')
-    correct_authors = error_entry.get('ref_authors_correct') or corrected_data.get('authors', '')
+
+    # Handle authors - can be string or list of dicts from API
+    authors_raw = error_entry.get('ref_authors_correct') or corrected_data.get('authors', '')
+    if isinstance(authors_raw, list):
+        # Convert list of author dicts to comma-separated string
+        if authors_raw and isinstance(authors_raw[0], dict):
+            correct_authors = ', '.join([author.get('name', '') for author in authors_raw])
+        else:
+            correct_authors = ', '.join(authors_raw)
+    else:
+        correct_authors = str(authors_raw) if authors_raw else ''
+
     correct_year = error_entry.get('ref_year_correct') or corrected_data.get('year', '')
 
     # Prioritize the verified URL that was actually used for verification
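
The branch accepts the three author shapes that show up in practice: API-style lists of dicts, plain lists of name strings, and a pre-joined string. A standalone mirror (not the package API):

    def authors_to_string(authors_raw):
        if isinstance(authors_raw, list):
            if authors_raw and isinstance(authors_raw[0], dict):
                # API-style: [{'name': ...}, ...]
                return ', '.join(a.get('name', '') for a in authors_raw)
            return ', '.join(authors_raw)  # plain list of name strings
        return str(authors_raw) if authors_raw else ''

    print(authors_to_string([{'name': 'Ada Lovelace'}, {'name': 'Alan Turing'}]))
    # -> Ada Lovelace, Alan Turing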
@@ -3573,7 +3662,39 @@ def format_corrected_plaintext(original_reference, corrected_data, error_entry):
     if correct_url:
         citation_parts.append(f"{correct_url}")
 
-    return '. '.join(citation_parts) + '.'
+    citation_text = '. '.join(citation_parts) + '.'
+
+    # Add citation key information if available (for easy copying)
+    citation_key = original_reference.get('bibtex_key') or original_reference.get('bibitem_key')
+    if citation_key and citation_key != 'unknown':
+        bibtex_type = original_reference.get('bibtex_type', 'misc')
+        citation_text += f"\n\n% Citation key for BibTeX: @{bibtex_type}{{{citation_key}, ...}}"
+
+    return citation_text
+
+
+def compare_titles_with_latex_cleaning(cited_title: str, database_title: str) -> float:
+    """
+    Compare two titles with proper LaTeX cleaning for accurate similarity scoring.
+
+    This function ensures both titles are cleaned of LaTeX commands before comparison
+    to avoid false mismatches due to formatting differences like {LLM}s vs LLMs.
+
+    Args:
+        cited_title: Title from cited reference (may contain LaTeX)
+        database_title: Title from database (usually already clean)
+
+    Returns:
+        Similarity score between 0 and 1
+    """
+    if not cited_title or not database_title:
+        return 0.0
+
+    # Clean LaTeX commands from cited title to match database format
+    clean_cited = strip_latex_commands(cited_title)
+
+    # Calculate similarity using cleaned titles
+    return calculate_title_similarity(clean_cited, database_title)
 
 
 def calculate_title_similarity(title1: str, title2: str) -> float:
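
A usage sketch for the new helper; the exact score depends on calculate_title_similarity, but titles identical after LaTeX cleaning should score at or near 1.0:

    from utils.text_utils import compare_titles_with_latex_cleaning

    score = compare_titles_with_latex_cleaning("Evaluating {LLM}s at Scale",
                                               "Evaluating LLMs at Scale")
    print(score)  # expected: ~1.0 once the braces are stripped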
{academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/WHEEL
RENAMED
File without changes
{academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/entry_points.txt
RENAMED
File without changes
{academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/licenses/LICENSE
RENAMED
File without changes
{academic_refchecker-1.2.50.dist-info → academic_refchecker-1.2.52.dist-info}/top_level.txt
RENAMED
File without changes