academic-refchecker 1.2.49__tar.gz → 1.2.51__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-1.2.49/src/academic_refchecker.egg-info → academic_refchecker-1.2.51}/PKG-INFO +1 -1
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/__version__.py +1 -1
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/github_checker.py +4 -1
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/openreview_checker.py +10 -5
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/semantic_scholar.py +8 -6
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/webpage_checker.py +4 -1
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/core/parallel_processor.py +4 -1
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/core/refchecker.py +78 -67
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/arxiv_utils.py +98 -54
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/text_utils.py +137 -13
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/LICENSE +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/MANIFEST.in +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/README.md +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/pyproject.toml +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/requirements.txt +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/scripts/download_db.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/scripts/run_tests.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/setup.cfg +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/crossref.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/enhanced_hybrid_checker.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/openalex.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/config/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/config/logging.conf +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/config/settings.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/core/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/core/db_connection_pool.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/database/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/llm/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/llm/base.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/llm/providers.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/scripts/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/services/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/services/pdf_processor.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/author_utils.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/biblatex_parser.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/bibliography_utils.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/bibtex_parser.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/config_validator.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/db_utils.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/doi_utils.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/error_utils.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/mock_objects.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/unicode_utils.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/utils/url_utils.py +0 -0
|
@@ -5,6 +5,7 @@ import re
|
|
|
5
5
|
import logging
|
|
6
6
|
from urllib.parse import urlparse
|
|
7
7
|
from typing import Dict, Optional, Tuple, List, Any
|
|
8
|
+
from utils.text_utils import strip_latex_commands
|
|
8
9
|
|
|
9
10
|
logger = logging.getLogger(__name__)
|
|
10
11
|
|
|
@@ -170,7 +171,9 @@ class GitHubChecker:
|
|
|
170
171
|
title_match = self._check_title_match(cited_title, actual_name, actual_description)
|
|
171
172
|
if not title_match:
|
|
172
173
|
from utils.error_utils import format_title_mismatch
|
|
173
|
-
|
|
174
|
+
# Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
|
|
175
|
+
clean_cited_title = strip_latex_commands(cited_title)
|
|
176
|
+
details = format_title_mismatch(clean_cited_title, actual_name)
|
|
174
177
|
if actual_description:
|
|
175
178
|
snippet = actual_description[:100] + ('...' if len(actual_description) > 100 else '')
|
|
176
179
|
details += f" ({snippet})"
|
{academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/openreview_checker.py
RENAMED
|
@@ -36,7 +36,8 @@ from utils.text_utils import (
|
|
|
36
36
|
normalize_text, clean_title_basic, is_name_match,
|
|
37
37
|
calculate_title_similarity, compare_authors,
|
|
38
38
|
clean_title_for_search, are_venues_substantially_different,
|
|
39
|
-
is_year_substantially_different
|
|
39
|
+
is_year_substantially_different, strip_latex_commands,
|
|
40
|
+
compare_titles_with_latex_cleaning
|
|
40
41
|
)
|
|
41
42
|
|
|
42
43
|
# Set up logging
|
|
@@ -423,10 +424,12 @@ class OpenReviewReferenceChecker:
|
|
|
423
424
|
paper_title = paper_data.get('title', '').strip()
|
|
424
425
|
|
|
425
426
|
if cited_title and paper_title:
|
|
426
|
-
similarity =
|
|
427
|
+
similarity = compare_titles_with_latex_cleaning(cited_title, paper_title)
|
|
427
428
|
if similarity < 0.7: # Using a reasonable threshold
|
|
428
429
|
from utils.error_utils import format_title_mismatch
|
|
429
|
-
|
|
430
|
+
# Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
|
|
431
|
+
clean_cited_title = strip_latex_commands(cited_title)
|
|
432
|
+
details = format_title_mismatch(clean_cited_title, paper_title) + f" (similarity: {similarity:.2f})"
|
|
430
433
|
errors.append({
|
|
431
434
|
"warning_type": "title",
|
|
432
435
|
"warning_details": details
|
|
@@ -547,10 +550,12 @@ class OpenReviewReferenceChecker:
|
|
|
547
550
|
paper_title = best_match.get('title', '').strip()
|
|
548
551
|
|
|
549
552
|
if cited_title and paper_title:
|
|
550
|
-
similarity =
|
|
553
|
+
similarity = compare_titles_with_latex_cleaning(cited_title, paper_title)
|
|
551
554
|
if similarity < 0.8: # Slightly higher threshold for search results
|
|
552
555
|
from utils.error_utils import format_title_mismatch
|
|
553
|
-
|
|
556
|
+
# Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
|
|
557
|
+
clean_cited_title = strip_latex_commands(cited_title)
|
|
558
|
+
details = format_title_mismatch(clean_cited_title, paper_title) + f" (similarity: {similarity:.2f})"
|
|
554
559
|
errors.append({
|
|
555
560
|
"warning_type": "title",
|
|
556
561
|
"warning_details": details
|
|
@@ -28,7 +28,7 @@ import time
|
|
|
28
28
|
import logging
|
|
29
29
|
import re
|
|
30
30
|
from typing import Dict, List, Tuple, Optional, Any, Union
|
|
31
|
-
from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search
|
|
31
|
+
from utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search, strip_latex_commands, compare_titles_with_latex_cleaning
|
|
32
32
|
from utils.error_utils import format_title_mismatch
|
|
33
33
|
from config.settings import get_config
|
|
34
34
|
|
|
@@ -353,7 +353,7 @@ class NonArxivReferenceChecker:
|
|
|
353
353
|
cited_title = title.strip()
|
|
354
354
|
|
|
355
355
|
if cited_title and result_title:
|
|
356
|
-
title_similarity =
|
|
356
|
+
title_similarity = compare_titles_with_latex_cleaning(cited_title, result_title)
|
|
357
357
|
logger.debug(f"Semantic Scholar ArXiv search title similarity: {title_similarity:.3f}")
|
|
358
358
|
logger.debug(f"Cited title: '{cited_title}'")
|
|
359
359
|
logger.debug(f"Found title: '{result_title}'")
|
|
@@ -385,7 +385,7 @@ class NonArxivReferenceChecker:
|
|
|
385
385
|
logger.debug(f"DEBUG: cited_title='{cited_title}', arxiv_title='{arxiv_title}'")
|
|
386
386
|
|
|
387
387
|
if cited_title and arxiv_title:
|
|
388
|
-
title_similarity =
|
|
388
|
+
title_similarity = compare_titles_with_latex_cleaning(cited_title, arxiv_title)
|
|
389
389
|
logger.debug(f"ArXiv API title similarity: {title_similarity:.3f}")
|
|
390
390
|
logger.debug(f"Cited title: '{cited_title}'")
|
|
391
391
|
logger.debug(f"ArXiv title: '{arxiv_title}'")
|
|
@@ -419,7 +419,7 @@ class NonArxivReferenceChecker:
|
|
|
419
419
|
arxiv_title_check = arxiv_paper_check.get('title', '').strip()
|
|
420
420
|
cited_title_check = title.strip()
|
|
421
421
|
if cited_title_check and arxiv_title_check:
|
|
422
|
-
title_similarity_check =
|
|
422
|
+
title_similarity_check = compare_titles_with_latex_cleaning(cited_title_check, arxiv_title_check)
|
|
423
423
|
if title_similarity_check < SIMILARITY_THRESHOLD:
|
|
424
424
|
logger.debug(f"Detected ArXiv ID mismatch before raw text search - skipping unnecessary searches")
|
|
425
425
|
arxiv_id_mismatch_detected = True
|
|
@@ -468,11 +468,13 @@ class NonArxivReferenceChecker:
|
|
|
468
468
|
return None, [], None
|
|
469
469
|
|
|
470
470
|
# Check title using similarity function to handle formatting differences
|
|
471
|
-
title_similarity =
|
|
471
|
+
title_similarity = compare_titles_with_latex_cleaning(title, found_title) if found_title else 0.0
|
|
472
472
|
if found_title and title_similarity < SIMILARITY_THRESHOLD:
|
|
473
|
+
# Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
|
|
474
|
+
clean_cited_title = strip_latex_commands(title)
|
|
473
475
|
errors.append({
|
|
474
476
|
'error_type': 'title',
|
|
475
|
-
'error_details': format_title_mismatch(
|
|
477
|
+
'error_details': format_title_mismatch(clean_cited_title, found_title),
|
|
476
478
|
'ref_title_correct': paper_data.get('title', '')
|
|
477
479
|
})
|
|
478
480
|
|
|
@@ -7,6 +7,7 @@ from urllib.parse import urlparse, urljoin
|
|
|
7
7
|
from typing import Dict, Optional, Tuple, List, Any
|
|
8
8
|
from bs4 import BeautifulSoup
|
|
9
9
|
import time
|
|
10
|
+
from utils.text_utils import strip_latex_commands
|
|
10
11
|
|
|
11
12
|
logger = logging.getLogger(__name__)
|
|
12
13
|
|
|
@@ -185,9 +186,11 @@ class WebPageChecker:
|
|
|
185
186
|
if cited_title and page_title:
|
|
186
187
|
if not self._check_title_match(cited_title, page_title, page_description):
|
|
187
188
|
from utils.error_utils import format_title_mismatch
|
|
189
|
+
# Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
|
|
190
|
+
clean_cited_title = strip_latex_commands(cited_title)
|
|
188
191
|
errors.append({
|
|
189
192
|
"warning_type": "title",
|
|
190
|
-
"warning_details": format_title_mismatch(
|
|
193
|
+
"warning_details": format_title_mismatch(clean_cited_title, page_title)
|
|
191
194
|
})
|
|
192
195
|
|
|
193
196
|
# Check if this is a documentation page for the cited topic
|
|
@@ -275,7 +275,10 @@ class ParallelReferenceProcessor:
|
|
|
275
275
|
reference = result.reference
|
|
276
276
|
|
|
277
277
|
# Print reference info in the same format as sequential mode
|
|
278
|
-
|
|
278
|
+
raw_title = reference.get('title', 'Untitled')
|
|
279
|
+
# Clean LaTeX commands from title for display
|
|
280
|
+
from utils.text_utils import strip_latex_commands
|
|
281
|
+
title = strip_latex_commands(raw_title)
|
|
279
282
|
from utils.text_utils import format_authors_for_display
|
|
280
283
|
authors = format_authors_for_display(reference.get('authors', []))
|
|
281
284
|
year = reference.get('year', '')
|
|
@@ -50,7 +50,8 @@ from utils.text_utils import (clean_author_name, clean_title, clean_title_basic,
|
|
|
50
50
|
detect_latex_bibliography_format, extract_latex_references,
|
|
51
51
|
detect_standard_acm_natbib_format, strip_latex_commands,
|
|
52
52
|
format_corrected_reference, is_name_match, enhanced_name_match,
|
|
53
|
-
calculate_title_similarity, normalize_arxiv_url, deduplicate_urls
|
|
53
|
+
calculate_title_similarity, normalize_arxiv_url, deduplicate_urls,
|
|
54
|
+
compare_authors)
|
|
54
55
|
from utils.config_validator import ConfigValidator
|
|
55
56
|
from services.pdf_processor import PDFProcessor
|
|
56
57
|
from checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker
|
|
@@ -1789,7 +1790,7 @@ class ArxivReferenceChecker:
|
|
|
1789
1790
|
if authors:
|
|
1790
1791
|
db_authors = [author.get('name', '') for author in check_paper_data['authors']]
|
|
1791
1792
|
|
|
1792
|
-
authors_match, author_error =
|
|
1793
|
+
authors_match, author_error = compare_authors(authors, db_authors)
|
|
1793
1794
|
if authors_match:
|
|
1794
1795
|
paper_data = check_paper_data
|
|
1795
1796
|
search_strategy = "Normalized title with author match"
|
|
@@ -1901,10 +1902,12 @@ class ArxivReferenceChecker:
|
|
|
1901
1902
|
|
|
1902
1903
|
if normalized_title != db_title:
|
|
1903
1904
|
from utils.error_utils import format_title_mismatch
|
|
1905
|
+
# Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
|
|
1906
|
+
clean_cited_title = strip_latex_commands(title)
|
|
1904
1907
|
logger.debug(f"DB Verification: Title mismatch - cited: '{title}', actual: '{paper_data.get('title')}'")
|
|
1905
1908
|
errors.append({
|
|
1906
1909
|
'error_type': 'title',
|
|
1907
|
-
'error_details': format_title_mismatch(
|
|
1910
|
+
'error_details': format_title_mismatch(clean_cited_title, paper_data.get('title')),
|
|
1908
1911
|
'ref_title_correct': paper_data.get('title')
|
|
1909
1912
|
})
|
|
1910
1913
|
|
|
@@ -1912,7 +1915,7 @@ class ArxivReferenceChecker:
|
|
|
1912
1915
|
if authors and paper_data.get('authors'):
|
|
1913
1916
|
# Extract author names from database data
|
|
1914
1917
|
correct_names = [author.get('name', '') for author in paper_data['authors']]
|
|
1915
|
-
authors_match, author_error =
|
|
1918
|
+
authors_match, author_error = compare_authors(authors, correct_names)
|
|
1916
1919
|
|
|
1917
1920
|
if not authors_match:
|
|
1918
1921
|
logger.debug(f"DB Verification: Author mismatch - {author_error}")
|
|
@@ -3054,6 +3057,13 @@ class ArxivReferenceChecker:
|
|
|
3054
3057
|
try:
|
|
3055
3058
|
# Extract bibliography
|
|
3056
3059
|
bibliography = self.extract_bibliography(paper, debug_mode)
|
|
3060
|
+
|
|
3061
|
+
# Apply deduplication to all bibliography sources (not just LLM-extracted)
|
|
3062
|
+
if len(bibliography) > 1: # Only deduplicate if we have multiple references
|
|
3063
|
+
original_count = len(bibliography)
|
|
3064
|
+
bibliography = self._deduplicate_bibliography_entries(bibliography)
|
|
3065
|
+
if len(bibliography) < original_count:
|
|
3066
|
+
logger.debug(f"Deduplicated {original_count} references to {len(bibliography)} unique references")
|
|
3057
3067
|
|
|
3058
3068
|
# Update statistics
|
|
3059
3069
|
self.total_papers_processed += 1
|
|
@@ -3493,8 +3503,9 @@ class ArxivReferenceChecker:
|
|
|
3493
3503
|
except Exception as e:
|
|
3494
3504
|
logger.error(f"LLM fallback failed: {e}")
|
|
3495
3505
|
return []
|
|
3496
|
-
|
|
3497
|
-
|
|
3506
|
+
if len(biblatex_refs) > 0:
|
|
3507
|
+
logger.debug("Using biblatex file")
|
|
3508
|
+
return biblatex_refs
|
|
3498
3509
|
|
|
3499
3510
|
# For non-standard formats, try LLM-based extraction if available
|
|
3500
3511
|
if self.llm_extractor:
|
|
@@ -4284,9 +4295,9 @@ class ArxivReferenceChecker:
|
|
|
4284
4295
|
# If either has no title, can't reliably determine if duplicate
|
|
4285
4296
|
return False
|
|
4286
4297
|
|
|
4287
|
-
# If titles match exactly, consider them duplicates
|
|
4288
|
-
# This handles the case where the same paper appears multiple times
|
|
4289
|
-
if seg1['title'] == seg2['title']:
|
|
4298
|
+
# If titles match exactly (case-insensitive), consider them duplicates
|
|
4299
|
+
# This handles the case where the same paper appears multiple times with different capitalization
|
|
4300
|
+
if seg1['title'].lower() == seg2['title'].lower():
|
|
4290
4301
|
return True
|
|
4291
4302
|
|
|
4292
4303
|
# Special case: Check if one title is an arXiv identifier and the other is a real title
|
|
@@ -4299,16 +4310,54 @@ class ArxivReferenceChecker:
|
|
|
4299
4310
|
author1 = seg1['author']
|
|
4300
4311
|
author2 = seg2['author']
|
|
4301
4312
|
|
|
4302
|
-
if author1 and author2 and author1 == author2:
|
|
4313
|
+
if author1 and author2 and author1.lower() == author2.lower():
|
|
4303
4314
|
# Same authors - check if one title is substring of other or significant similarity
|
|
4304
|
-
title1 = seg1['title']
|
|
4305
|
-
title2 = seg2['title']
|
|
4315
|
+
title1 = seg1['title'].lower()
|
|
4316
|
+
title2 = seg2['title'].lower()
|
|
4306
4317
|
|
|
4307
4318
|
if (title1 in title2 or title2 in title1):
|
|
4308
4319
|
return True
|
|
4309
4320
|
|
|
4310
4321
|
return False
|
|
4311
4322
|
|
|
4323
|
+
def _deduplicate_bibliography_entries(self, bibliography):
|
|
4324
|
+
"""
|
|
4325
|
+
Deduplicate bibliography entries using title and author comparison.
|
|
4326
|
+
|
|
4327
|
+
This works with structured reference dictionaries from BibTeX/LaTeX parsing,
|
|
4328
|
+
as opposed to _deduplicate_references_with_segment_matching which works with raw text.
|
|
4329
|
+
|
|
4330
|
+
Args:
|
|
4331
|
+
bibliography: List of reference dictionaries with 'title', 'authors', etc.
|
|
4332
|
+
|
|
4333
|
+
Returns:
|
|
4334
|
+
List of unique reference dictionaries
|
|
4335
|
+
"""
|
|
4336
|
+
if len(bibliography) <= 1:
|
|
4337
|
+
return bibliography
|
|
4338
|
+
|
|
4339
|
+
unique_refs = []
|
|
4340
|
+
seen_titles = set()
|
|
4341
|
+
|
|
4342
|
+
for ref in bibliography:
|
|
4343
|
+
title = ref.get('title', '').strip()
|
|
4344
|
+
if not title:
|
|
4345
|
+
# Keep references without titles (they can't be deduplicated)
|
|
4346
|
+
unique_refs.append(ref)
|
|
4347
|
+
continue
|
|
4348
|
+
|
|
4349
|
+
# Normalize title for comparison (case-insensitive, basic cleanup)
|
|
4350
|
+
normalized_title = title.lower().strip()
|
|
4351
|
+
|
|
4352
|
+
# Check if we've seen this title before (case-insensitive)
|
|
4353
|
+
if normalized_title in seen_titles:
|
|
4354
|
+
logger.debug(f"Skipping duplicate reference: '{title}'")
|
|
4355
|
+
else:
|
|
4356
|
+
unique_refs.append(ref)
|
|
4357
|
+
seen_titles.add(normalized_title)
|
|
4358
|
+
|
|
4359
|
+
return unique_refs
|
|
4360
|
+
|
|
4312
4361
|
def _is_arxiv_identifier_title_mismatch(self, seg1, seg2):
|
|
4313
4362
|
"""
|
|
4314
4363
|
Check if one reference has an arXiv identifier as title while the other has a real title,
|
|
@@ -5087,60 +5136,6 @@ class ArxivReferenceChecker:
|
|
|
5087
5136
|
|
|
5088
5137
|
return references
|
|
5089
5138
|
|
|
5090
|
-
def compare_authors(self, cited_authors, correct_authors):
|
|
5091
|
-
"""
|
|
5092
|
-
Compare author lists to check if they match using improved name matching.
|
|
5093
|
-
Uses the utility function is_name_match for robust author name comparison.
|
|
5094
|
-
"""
|
|
5095
|
-
# Clean up author names
|
|
5096
|
-
cleaned_cited = []
|
|
5097
|
-
for author in cited_authors:
|
|
5098
|
-
# Remove reference numbers (e.g., "[1]")
|
|
5099
|
-
author = re.sub(r'^\[\d+\]', '', author)
|
|
5100
|
-
# Remove line breaks
|
|
5101
|
-
author = author.replace('\n', ' ')
|
|
5102
|
-
|
|
5103
|
-
# Handle "et al" cases properly
|
|
5104
|
-
author_clean = author.strip()
|
|
5105
|
-
if author_clean.lower() == 'et al':
|
|
5106
|
-
# Skip pure "et al" entries
|
|
5107
|
-
continue
|
|
5108
|
-
elif 'et al' in author_clean.lower():
|
|
5109
|
-
# Remove "et al" from the author name (e.g., "S. M. Lundberg et al" -> "S. M. Lundberg")
|
|
5110
|
-
author_clean = re.sub(r'\s+et\s+al\.?', '', author_clean, flags=re.IGNORECASE).strip()
|
|
5111
|
-
if author_clean: # Only add if something remains
|
|
5112
|
-
cleaned_cited.append(author_clean)
|
|
5113
|
-
else:
|
|
5114
|
-
cleaned_cited.append(author_clean)
|
|
5115
|
-
|
|
5116
|
-
if not cleaned_cited:
|
|
5117
|
-
return True, "No authors to compare"
|
|
5118
|
-
|
|
5119
|
-
# Handle "et al" cases and length mismatches
|
|
5120
|
-
has_et_al = any('et al' in a.lower() for a in cited_authors)
|
|
5121
|
-
|
|
5122
|
-
if len(cleaned_cited) < len(correct_authors) and (has_et_al or len(cleaned_cited) <= 3):
|
|
5123
|
-
# Only compare the authors that are listed
|
|
5124
|
-
correct_authors = correct_authors[:len(cleaned_cited)]
|
|
5125
|
-
elif len(cleaned_cited) > len(correct_authors) and len(correct_authors) >= 3:
|
|
5126
|
-
# Use available correct authors
|
|
5127
|
-
cleaned_cited = cleaned_cited[:len(correct_authors)]
|
|
5128
|
-
|
|
5129
|
-
# If there's a big count mismatch and no "et al", it's likely an error
|
|
5130
|
-
if abs(len(cleaned_cited) - len(correct_authors)) > 3 and not has_et_al:
|
|
5131
|
-
return False, "Author count mismatch"
|
|
5132
|
-
|
|
5133
|
-
# Compare first author (most important) using the improved utility function
|
|
5134
|
-
if cleaned_cited and correct_authors:
|
|
5135
|
-
# Use raw names for comparison (is_name_match handles normalization internally)
|
|
5136
|
-
cited_first = cleaned_cited[0]
|
|
5137
|
-
correct_first = correct_authors[0]
|
|
5138
|
-
|
|
5139
|
-
if not enhanced_name_match(cited_first, correct_first):
|
|
5140
|
-
from utils.error_utils import format_first_author_mismatch
|
|
5141
|
-
return False, format_first_author_mismatch(cited_first, correct_first)
|
|
5142
|
-
|
|
5143
|
-
return True, "Authors match"
|
|
5144
5139
|
|
|
5145
5140
|
def normalize_text(self, text):
|
|
5146
5141
|
"""
|
|
@@ -5251,6 +5246,19 @@ class ArxivReferenceChecker:
|
|
|
5251
5246
|
return False
|
|
5252
5247
|
return True
|
|
5253
5248
|
|
|
5249
|
+
def compare_authors(self, authors1, authors2):
|
|
5250
|
+
"""
|
|
5251
|
+
Compare authors using the text_utils compare_authors function.
|
|
5252
|
+
|
|
5253
|
+
Args:
|
|
5254
|
+
authors1: First list of authors
|
|
5255
|
+
authors2: Second list of authors
|
|
5256
|
+
|
|
5257
|
+
Returns:
|
|
5258
|
+
Tuple of (match_result, error_message)
|
|
5259
|
+
"""
|
|
5260
|
+
return compare_authors(authors1, authors2)
|
|
5261
|
+
|
|
5254
5262
|
def _verify_references_sequential(self, paper, bibliography, paper_errors, error_types, unverified_count, debug_mode):
|
|
5255
5263
|
"""
|
|
5256
5264
|
Sequential reference verification (original implementation)
|
|
@@ -5267,7 +5275,10 @@ class ArxivReferenceChecker:
|
|
|
5267
5275
|
ref_id = self.extract_arxiv_id_from_url(reference['url'])
|
|
5268
5276
|
|
|
5269
5277
|
# Print reference info in non-debug mode (improved formatting)
|
|
5270
|
-
|
|
5278
|
+
raw_title = reference.get('title', 'Untitled')
|
|
5279
|
+
# Clean LaTeX commands from title for display
|
|
5280
|
+
from utils.text_utils import strip_latex_commands
|
|
5281
|
+
title = strip_latex_commands(raw_title)
|
|
5271
5282
|
from utils.text_utils import format_authors_for_display
|
|
5272
5283
|
authors = format_authors_for_display(reference.get('authors', []))
|
|
5273
5284
|
year = reference.get('year', '')
|
|
@@ -111,56 +111,8 @@ def download_arxiv_source(arxiv_id):
|
|
|
111
111
|
main_tex_content = largest_file[1]
|
|
112
112
|
logger.debug(f"Using largest tex file: {largest_file[0]}")
|
|
113
113
|
|
|
114
|
-
#
|
|
115
|
-
bib_content =
|
|
116
|
-
if bib_files and main_tex_content:
|
|
117
|
-
# Extract bibliography references from main tex file
|
|
118
|
-
referenced_bibs = []
|
|
119
|
-
bib_pattern = r'\\bibliography\{([^}]+)\}'
|
|
120
|
-
matches = re.findall(bib_pattern, main_tex_content)
|
|
121
|
-
|
|
122
|
-
for match in matches:
|
|
123
|
-
# Handle multiple bib files separated by commas
|
|
124
|
-
bib_names = [name.strip() for name in match.split(',')]
|
|
125
|
-
for bib_name in bib_names:
|
|
126
|
-
# Add .bib extension if not present
|
|
127
|
-
if not bib_name.endswith('.bib'):
|
|
128
|
-
bib_name += '.bib'
|
|
129
|
-
referenced_bibs.append(bib_name)
|
|
130
|
-
|
|
131
|
-
# Use only referenced .bib files, or all if no references found
|
|
132
|
-
if referenced_bibs:
|
|
133
|
-
used_bibs = []
|
|
134
|
-
for bib_name in referenced_bibs:
|
|
135
|
-
if bib_name in bib_files:
|
|
136
|
-
used_bibs.append(bib_files[bib_name])
|
|
137
|
-
logger.debug(f"Using referenced .bib file: {bib_name}")
|
|
138
|
-
else:
|
|
139
|
-
logger.debug(f"Referenced .bib file not found: {bib_name}")
|
|
140
|
-
|
|
141
|
-
if used_bibs:
|
|
142
|
-
raw_bib_content = '\n\n'.join(used_bibs)
|
|
143
|
-
|
|
144
|
-
# Filter BibTeX to only include cited references
|
|
145
|
-
bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
|
|
146
|
-
|
|
147
|
-
logger.debug(f"Found {len(used_bibs)} referenced .bib files out of {len(bib_files)} total")
|
|
148
|
-
else:
|
|
149
|
-
# Fallback to all bib files if none of the referenced ones found
|
|
150
|
-
raw_bib_content = '\n\n'.join(bib_files.values())
|
|
151
|
-
bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
|
|
152
|
-
logger.debug(f"No referenced .bib files found, using all {len(bib_files)} .bib files")
|
|
153
|
-
else:
|
|
154
|
-
# No \bibliography command found, use all bib files
|
|
155
|
-
raw_bib_content = '\n\n'.join(bib_files.values())
|
|
156
|
-
bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
|
|
157
|
-
logger.debug(f"No \\bibliography command found, using all {len(bib_files)} .bib files")
|
|
158
|
-
elif bib_files:
|
|
159
|
-
# No main tex file but have bib files
|
|
160
|
-
raw_bib_content = '\n\n'.join(bib_files.values())
|
|
161
|
-
# Can't filter without tex files, so use original content
|
|
162
|
-
bib_content = raw_bib_content
|
|
163
|
-
logger.debug(f"Found {len(bib_files)} .bib files (no main tex to filter)")
|
|
114
|
+
# Process .bib files using shared logic
|
|
115
|
+
bib_content = select_and_filter_bib_files(bib_files, main_tex_content, tex_files)
|
|
164
116
|
|
|
165
117
|
# Combine all bbl file contents
|
|
166
118
|
bbl_content = None
|
|
@@ -219,6 +171,78 @@ def download_arxiv_bibtex(arxiv_id):
|
|
|
219
171
|
return None
|
|
220
172
|
|
|
221
173
|
|
|
174
|
+
def select_and_filter_bib_files(bib_files, main_tex_content, tex_files):
|
|
175
|
+
"""
|
|
176
|
+
Select appropriate .bib files based on main TeX file references and filter by citations.
|
|
177
|
+
|
|
178
|
+
Args:
|
|
179
|
+
bib_files: Dict of .bib files {filename: content}
|
|
180
|
+
main_tex_content: Content of main tex file
|
|
181
|
+
tex_files: Dict of all tex files {filename: content} (for filtering)
|
|
182
|
+
|
|
183
|
+
Returns:
|
|
184
|
+
Filtered BibTeX content or None if no files available
|
|
185
|
+
"""
|
|
186
|
+
import re
|
|
187
|
+
|
|
188
|
+
if not bib_files:
|
|
189
|
+
return None
|
|
190
|
+
|
|
191
|
+
if main_tex_content:
|
|
192
|
+
# Extract bibliography references from main tex file
|
|
193
|
+
referenced_bibs = []
|
|
194
|
+
bib_pattern = r'\\bibliography\{([^}]+)\}'
|
|
195
|
+
matches = re.findall(bib_pattern, main_tex_content)
|
|
196
|
+
|
|
197
|
+
for match in matches:
|
|
198
|
+
# Handle multiple bib files separated by commas
|
|
199
|
+
bib_names = [name.strip() for name in match.split(',')]
|
|
200
|
+
for bib_name in bib_names:
|
|
201
|
+
# Add .bib extension if not present
|
|
202
|
+
if not bib_name.endswith('.bib'):
|
|
203
|
+
bib_name += '.bib'
|
|
204
|
+
referenced_bibs.append(bib_name)
|
|
205
|
+
|
|
206
|
+
# Use only referenced .bib files, or all if no references found
|
|
207
|
+
if referenced_bibs:
|
|
208
|
+
used_bibs = []
|
|
209
|
+
seen_bib_names = set() # Track which bib files we've already added
|
|
210
|
+
for bib_name in referenced_bibs:
|
|
211
|
+
if bib_name in bib_files and bib_name not in seen_bib_names:
|
|
212
|
+
used_bibs.append(bib_files[bib_name])
|
|
213
|
+
seen_bib_names.add(bib_name)
|
|
214
|
+
logger.debug(f"Using referenced .bib file: {bib_name}")
|
|
215
|
+
elif bib_name in seen_bib_names:
|
|
216
|
+
logger.debug(f"Skipping duplicate .bib file: {bib_name}")
|
|
217
|
+
else:
|
|
218
|
+
logger.debug(f"Referenced .bib file not found: {bib_name}")
|
|
219
|
+
|
|
220
|
+
if used_bibs:
|
|
221
|
+
raw_bib_content = '\n\n'.join(used_bibs)
|
|
222
|
+
# Filter BibTeX to only include cited references
|
|
223
|
+
filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
|
|
224
|
+
logger.debug(f"Found {len(used_bibs)} referenced .bib files out of {len(bib_files)} total")
|
|
225
|
+
return filtered_content
|
|
226
|
+
else:
|
|
227
|
+
# Fallback to all bib files if none of the referenced ones found
|
|
228
|
+
raw_bib_content = '\n\n'.join(bib_files.values())
|
|
229
|
+
filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
|
|
230
|
+
logger.debug(f"No referenced .bib files found, using all {len(bib_files)} .bib files")
|
|
231
|
+
return filtered_content
|
|
232
|
+
else:
|
|
233
|
+
# No \bibliography command found, use all bib files
|
|
234
|
+
raw_bib_content = '\n\n'.join(bib_files.values())
|
|
235
|
+
filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
|
|
236
|
+
logger.debug(f"No \\bibliography command found, using all {len(bib_files)} .bib files")
|
|
237
|
+
return filtered_content
|
|
238
|
+
else:
|
|
239
|
+
# No main tex file but have bib files
|
|
240
|
+
raw_bib_content = '\n\n'.join(bib_files.values())
|
|
241
|
+
# Can't filter without tex files, so use original content
|
|
242
|
+
logger.debug(f"Found {len(bib_files)} .bib files (no main tex to filter)")
|
|
243
|
+
return raw_bib_content
|
|
244
|
+
|
|
245
|
+
|
|
222
246
|
def extract_cited_keys_from_tex(tex_files, main_tex_content):
|
|
223
247
|
"""
|
|
224
248
|
Extract all citation keys from TeX files.
|
|
@@ -261,7 +285,11 @@ def is_reference_used(reference_key, cited_keys):
|
|
|
261
285
|
Returns:
|
|
262
286
|
True if the reference is cited, False otherwise
|
|
263
287
|
"""
|
|
264
|
-
|
|
288
|
+
result = reference_key in cited_keys
|
|
289
|
+
# Add debugging for the first few mismatches to understand the issue
|
|
290
|
+
if not result and len([k for k in cited_keys if k.startswith('a')]) < 3: # Limit debug output
|
|
291
|
+
logger.debug(f"Key '{reference_key}' not found in cited_keys")
|
|
292
|
+
return result
|
|
265
293
|
|
|
266
294
|
|
|
267
295
|
def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
|
|
@@ -291,14 +319,30 @@ def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
|
|
|
291
319
|
from utils.bibtex_parser import parse_bibtex_entries
|
|
292
320
|
entries = parse_bibtex_entries(bib_content)
|
|
293
321
|
|
|
294
|
-
# Filter entries to only cited ones
|
|
322
|
+
# Filter entries to only cited ones and remove duplicates
|
|
295
323
|
cited_entries = []
|
|
324
|
+
seen_keys = set()
|
|
325
|
+
not_cited_count = 0
|
|
326
|
+
duplicate_count = 0
|
|
327
|
+
|
|
296
328
|
for entry in entries:
|
|
297
329
|
entry_key = entry.get('key', '')
|
|
298
330
|
if is_reference_used(entry_key, cited_keys):
|
|
299
|
-
|
|
331
|
+
if entry_key not in seen_keys:
|
|
332
|
+
cited_entries.append(entry)
|
|
333
|
+
seen_keys.add(entry_key)
|
|
334
|
+
else:
|
|
335
|
+
duplicate_count += 1
|
|
336
|
+
logger.debug(f"Skipping duplicate entry: '{entry_key}'")
|
|
337
|
+
else:
|
|
338
|
+
not_cited_count += 1
|
|
339
|
+
# Log first few entries that are NOT cited for debugging
|
|
340
|
+
if not_cited_count <= 5:
|
|
341
|
+
logger.debug(f"Entry NOT cited: '{entry_key}'")
|
|
300
342
|
|
|
301
|
-
logger.debug(f"Filtered BibTeX: {len(entries)} total -> {len(cited_entries)} cited")
|
|
343
|
+
logger.debug(f"Filtered BibTeX: {len(entries)} total -> {len(cited_entries)} cited (removed {duplicate_count} duplicates)")
|
|
344
|
+
logger.debug(f"Citation keys found: {len(cited_keys)} keys")
|
|
345
|
+
logger.debug(f"Sample cited keys: {list(cited_keys)[:10]}")
|
|
302
346
|
|
|
303
347
|
# Reconstruct BibTeX content from cited entries
|
|
304
348
|
if not cited_entries:
|
|
@@ -580,6 +580,9 @@ def clean_title_for_search(title):
|
|
|
580
580
|
if not isinstance(title, str):
|
|
581
581
|
return str(title) if title is not None else ''
|
|
582
582
|
|
|
583
|
+
# Strip LaTeX commands to handle math formatting and other LaTeX markup
|
|
584
|
+
title = strip_latex_commands(title)
|
|
585
|
+
|
|
583
586
|
# Clean up newlines and normalize whitespace (but preserve other structure)
|
|
584
587
|
title = title.replace('\n', ' ').strip()
|
|
585
588
|
title = re.sub(r'\s+', ' ', title) # Normalize whitespace only
|
|
@@ -753,8 +756,11 @@ def normalize_paper_title(title: str) -> str:
|
|
|
753
756
|
if not title:
|
|
754
757
|
return ""
|
|
755
758
|
|
|
759
|
+
# Strip LaTeX commands first to handle math formatting consistently
|
|
760
|
+
normalized = strip_latex_commands(title)
|
|
761
|
+
|
|
756
762
|
# Convert to lowercase
|
|
757
|
-
normalized =
|
|
763
|
+
normalized = normalized.lower()
|
|
758
764
|
|
|
759
765
|
# Remove common prefixes that don't affect the actual title content
|
|
760
766
|
prefixes_to_remove = [
|
|
@@ -2107,21 +2113,37 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
|
|
|
2107
2113
|
|
|
2108
2114
|
return True, f"Authors match (verified {len(cleaned_cited)} of {len(correct_names)} with et al)"
|
|
2109
2115
|
|
|
2116
|
+
# Detect if cited authors look like parsing fragments
|
|
2117
|
+
# (many short single-word entries that might be first/last name fragments)
|
|
2118
|
+
def looks_like_fragments(authors_list, min_authors=4, single_word_ratio=0.7):
    """Report whether a list of author strings is dominated by single-word entries.

    Args:
        authors_list: Author name strings to inspect.
        min_authors: Lists shorter than this are never flagged (default 4).
        single_word_ratio: Fraction of entries that must consist of exactly one
            whitespace-separated word for the list to be flagged (default 0.7).

    Returns:
        True when the list has at least ``min_authors`` entries and the share of
        single-word entries reaches ``single_word_ratio``; False otherwise.
    """
    # Too few entries to establish a pattern
    if len(authors_list) < min_authors:
        return False

    # Empty / whitespace-only entries split into zero words and are not counted
    single_word_count = sum(
        1 for author in authors_list if len(author.strip().split()) == 1
    )
    return single_word_count >= len(authors_list) * single_word_ratio
|
|
2123
|
+
|
|
2110
2124
|
# Normal case without "et al" - compare all authors
|
|
2111
2125
|
if len(cleaned_cited) != len(correct_names):
|
|
2112
|
-
|
|
2113
|
-
#
|
|
2114
|
-
if
|
|
2126
|
+
|
|
2127
|
+
# Check if cited authors look like parsing fragments
|
|
2128
|
+
if looks_like_fragments(cleaned_cited):
|
|
2115
2129
|
from utils.error_utils import format_author_count_mismatch
|
|
2116
|
-
# Convert cited names to display format (First Last) before showing in error
|
|
2117
2130
|
display_cited = [format_author_for_display(author) for author in cleaned_cited]
|
|
2118
2131
|
error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
|
|
2119
2132
|
return False, error_msg
|
|
2120
2133
|
|
|
2121
|
-
#
|
|
2122
|
-
|
|
2123
|
-
|
|
2124
|
-
|
|
2134
|
+
# For all count mismatches, show the count mismatch error
|
|
2135
|
+
if len(cleaned_cited) < len(correct_names):
|
|
2136
|
+
from utils.error_utils import format_author_count_mismatch
|
|
2137
|
+
display_cited = [format_author_for_display(author) for author in cleaned_cited]
|
|
2138
|
+
error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
|
|
2139
|
+
return False, error_msg
|
|
2140
|
+
|
|
2141
|
+
# For cases where cited > correct, also show count mismatch
|
|
2142
|
+
elif len(cleaned_cited) > len(correct_names):
|
|
2143
|
+
from utils.error_utils import format_author_count_mismatch
|
|
2144
|
+
display_cited = [format_author_for_display(author) for author in cleaned_cited]
|
|
2145
|
+
error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
|
|
2146
|
+
return False, error_msg
|
|
2125
2147
|
else:
|
|
2126
2148
|
comparison_cited = cleaned_cited
|
|
2127
2149
|
comparison_correct = correct_names
|
|
@@ -2484,8 +2506,64 @@ def strip_latex_commands(text):
|
|
|
2484
2506
|
# Remove font size commands
|
|
2485
2507
|
text = re.sub(r'\\(tiny|scriptsize|footnotesize|small|normalsize|large|Large|LARGE|huge|Huge)\b', '', text)
|
|
2486
2508
|
|
|
2487
|
-
#
|
|
2488
|
-
|
|
2509
|
+
# Handle complex math mode patterns first
|
|
2510
|
+
# Pattern like $\{$$\mu$second-scale$\}$ should become μsecond-scale
|
|
2511
|
+
def process_complex_math(match):
    """Convert the captured interior of a math span to plain text.

    Args:
        match: Regex match object; group 1 holds the math content.

    Returns:
        The content with the supported Greek-letter commands replaced by their
        Unicode characters, every other LaTeX control word removed, and all
        braces and dollar signs stripped.
    """
    greek_letters = {
        'mu': 'μ',
        'alpha': 'α',
        'beta': 'β',
        'gamma': 'γ',
        'delta': 'δ',
        'epsilon': 'ε',
        'lambda': 'λ',
        'pi': 'π',
        'sigma': 'σ',
        'theta': 'θ',
    }

    def replace_command(command):
        # Known Greek letters become Unicode; any other control word is dropped
        return greek_letters.get(command.group(1), '')

    # Match the whole run of letters after the backslash instead of relying on
    # \b: a word boundary does not exist between a letter and a following digit
    # or underscore, so "\mu_1" or "\alpha2" would otherwise be left untouched.
    content = re.sub(r'\\([a-zA-Z]+)', replace_command, match.group(1))

    # Strip grouping braces and any remaining $ signs
    return re.sub(r'[{}$]', '', content)
|
|
2530
|
+
|
|
2531
|
+
# Handle complex nested math patterns first
|
|
2532
|
+
# Pattern like $\{$$\mu$second-scale$\}$ should become μsecond-scale
|
|
2533
|
+
def process_nested_math_specifically(match):
    """Flatten a nested math span such as ``$\\{$$\\mu$second-scale$\\}$`` to text.

    Args:
        match: Regex match object; the whole match (group 0) is processed.

    Returns:
        The matched text with the supported Greek-letter commands replaced by
        their Unicode characters and every ``$``, ``{``, ``}`` and backslash
        removed. Other control words keep their name as plain text because only
        their backslash is stripped.
    """
    greek_letters = {
        'mu': 'μ',
        'alpha': 'α',
        'beta': 'β',
        'gamma': 'γ',
        'delta': 'δ',
        'epsilon': 'ε',
        'lambda': 'λ',
        'pi': 'π',
        'sigma': 'σ',
        'theta': 'θ',
    }

    def to_unicode(command):
        # Unknown control words are returned unchanged; only their backslash is
        # removed by the markup-stripping step below.
        return greek_letters.get(command.group(1), command.group(0))

    # Translate every supported Greek command, not just \mu, so that e.g.
    # "\alpha" does not degrade to the literal text "alpha". Matching the full
    # letter run (rather than using \b) also handles "\mu_s" and "\mu2".
    content = re.sub(r'\\([a-zA-Z]+)', to_unicode, match.group(0))

    # Remove all LaTeX math markup characters
    return re.sub(r'[\$\{\}\\]+', '', content)
|
|
2543
|
+
|
|
2544
|
+
# Handle the specific problematic pattern
|
|
2545
|
+
text = re.sub(r'\$\\\{[^}]*\\\}\$', process_nested_math_specifically, text)
|
|
2546
|
+
|
|
2547
|
+
# Handle Greek letters in math mode before removing delimiters
|
|
2548
|
+
def process_standard_math(match):
    """Convert the interior of a ``$...$`` span to plain text.

    Args:
        match: Regex match object; group 1 holds the text between the dollars.

    Returns:
        The content with the supported Greek-letter commands replaced by their
        Unicode characters and every other LaTeX control word removed. Braces
        and other characters are left in place.
    """
    greek_letters = {
        'mu': 'μ',
        'alpha': 'α',
        'beta': 'β',
        'gamma': 'γ',
        'delta': 'δ',
        'epsilon': 'ε',
        'lambda': 'λ',
        'pi': 'π',
        'sigma': 'σ',
        'theta': 'θ',
    }

    def replace_command(command):
        # Known Greek letters become Unicode; any other control word is dropped
        return greek_letters.get(command.group(1), '')

    # Match the whole run of letters after the backslash instead of relying on
    # \b: a word boundary does not exist between a letter and a following digit
    # or underscore, so "\alpha_1" or "\beta2" would otherwise survive verbatim.
    return re.sub(r'\\([a-zA-Z]+)', replace_command, match.group(1))
|
|
2564
|
+
|
|
2565
|
+
# Remove standard math mode delimiters with Greek letter processing
|
|
2566
|
+
text = re.sub(r'\$([^$]*)\$', process_standard_math, text)
|
|
2489
2567
|
text = re.sub(r'\\begin\{equation\}.*?\\end\{equation\}', '', text, flags=re.DOTALL)
|
|
2490
2568
|
text = re.sub(r'\\begin\{align\}.*?\\end\{align\}', '', text, flags=re.DOTALL)
|
|
2491
2569
|
|
|
@@ -3369,7 +3447,18 @@ def _extract_corrected_reference_data(error_entry: dict, corrected_data: dict) -
|
|
|
3369
3447
|
"""
|
|
3370
3448
|
# Get the corrected information
|
|
3371
3449
|
correct_title = error_entry.get('ref_title_correct') or corrected_data.get('title', '')
|
|
3372
|
-
|
|
3450
|
+
|
|
3451
|
+
# Handle authors - can be string or list of dicts from API
|
|
3452
|
+
authors_raw = error_entry.get('ref_authors_correct') or corrected_data.get('authors', '')
|
|
3453
|
+
if isinstance(authors_raw, list):
|
|
3454
|
+
# Convert list of author dicts to comma-separated string
|
|
3455
|
+
if authors_raw and isinstance(authors_raw[0], dict):
|
|
3456
|
+
correct_authors = ', '.join([author.get('name', '') for author in authors_raw])
|
|
3457
|
+
else:
|
|
3458
|
+
correct_authors = ', '.join(authors_raw)
|
|
3459
|
+
else:
|
|
3460
|
+
correct_authors = str(authors_raw) if authors_raw else ''
|
|
3461
|
+
|
|
3373
3462
|
correct_year = error_entry.get('ref_year_correct') or corrected_data.get('year', '')
|
|
3374
3463
|
|
|
3375
3464
|
# Prioritize the verified URL that was actually used for verification
|
|
@@ -3573,7 +3662,39 @@ def format_corrected_plaintext(original_reference, corrected_data, error_entry):
|
|
|
3573
3662
|
if correct_url:
|
|
3574
3663
|
citation_parts.append(f"{correct_url}")
|
|
3575
3664
|
|
|
3576
|
-
|
|
3665
|
+
citation_text = '. '.join(citation_parts) + '.'
|
|
3666
|
+
|
|
3667
|
+
# Add citation key information if available (for easy copying)
|
|
3668
|
+
citation_key = original_reference.get('bibtex_key') or original_reference.get('bibitem_key')
|
|
3669
|
+
if citation_key and citation_key != 'unknown':
|
|
3670
|
+
bibtex_type = original_reference.get('bibtex_type', 'misc')
|
|
3671
|
+
citation_text += f"\n\n% Citation key for BibTeX: @{bibtex_type}{{{citation_key}, ...}}"
|
|
3672
|
+
|
|
3673
|
+
return citation_text
|
|
3674
|
+
|
|
3675
|
+
|
|
3676
|
+
def compare_titles_with_latex_cleaning(cited_title: str, database_title: str) -> float:
    """
    Score the similarity of a cited title against a database title.

    The cited title is passed through strip_latex_commands before scoring so
    that LaTeX markup (e.g. {LLM}s vs LLMs) does not cause a false mismatch.
    The database title is compared as given.

    Args:
        cited_title: Title from the cited reference (may contain LaTeX)
        database_title: Title from the database

    Returns:
        The value returned by calculate_title_similarity for the cleaned cited
        title and the database title, or 0.0 when either title is empty/None
    """
    if cited_title and database_title:
        # Only the cited side is stripped of LaTeX; the database side is used as-is
        return calculate_title_similarity(
            strip_latex_commands(cited_title),
            database_title,
        )

    # Nothing to compare when either title is missing
    return 0.0
|
|
3577
3698
|
|
|
3578
3699
|
|
|
3579
3700
|
def calculate_title_similarity(title1: str, title2: str) -> float:
|
|
@@ -3902,6 +4023,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
|
|
|
3902
4023
|
# Handle specific multi-word patterns and well-known acronyms
|
|
3903
4024
|
'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
|
|
3904
4025
|
'pnas': 'proceedings of the national academy of sciences',
|
|
4026
|
+
# Special cases that don't follow standard acronym patterns
|
|
4027
|
+
'neurips': 'neural information processing systems', # Special case
|
|
4028
|
+
'nips': 'neural information processing systems', # old name for neurips
|
|
3905
4029
|
}
|
|
3906
4030
|
# Sort by length (longest first) to ensure longer matches take precedence
|
|
3907
4031
|
for abbrev, expansion in sorted(common_abbrevs.items(), key=lambda x: len(x[0]), reverse=True):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/enhanced_hybrid_checker.py
RENAMED
|
File without changes
|
{academic_refchecker-1.2.49 → academic_refchecker-1.2.51}/src/checkers/local_semantic_scholar.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|