academic-refchecker 1.2.65__py3-none-any.whl → 1.2.66__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only.
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/METADATA +72 -7
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/RECORD +28 -18
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/entry_points.txt +1 -0
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/top_level.txt +1 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +56 -0
- backend/concurrency.py +100 -0
- backend/database.py +686 -0
- backend/main.py +1266 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__version__.py +2 -2
- refchecker/checkers/crossref.py +15 -6
- refchecker/checkers/enhanced_hybrid_checker.py +18 -4
- refchecker/checkers/local_semantic_scholar.py +2 -2
- refchecker/checkers/openalex.py +15 -6
- refchecker/checkers/semantic_scholar.py +15 -6
- refchecker/core/refchecker.py +17 -6
- refchecker/utils/__init__.py +2 -1
- refchecker/utils/arxiv_utils.py +18 -60
- refchecker/utils/doi_utils.py +32 -1
- refchecker/utils/error_utils.py +20 -9
- refchecker/utils/text_utils.py +143 -27
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/licenses/LICENSE +0 -0
@@ -37,9 +37,9 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from refchecker.utils.doi_utils import extract_doi_from_url, compare_dois, construct_doi_url
 from refchecker.utils.error_utils import create_author_error, create_year_warning, create_doi_error
-from refchecker.utils.text_utils import normalize_author_name, normalize_paper_title, is_name_match, compare_authors, calculate_title_similarity
+from refchecker.utils.text_utils import normalize_author_name, normalize_paper_title, is_name_match, compare_authors, calculate_title_similarity
+from refchecker.utils.url_utils import extract_arxiv_id_from_url, get_best_available_url
 from refchecker.utils.db_utils import process_semantic_scholar_result, process_semantic_scholar_results
-from refchecker.utils.url_utils import get_best_available_url
 from refchecker.config.settings import get_config
 
 # Set up logging
refchecker/checkers/openalex.py
CHANGED

@@ -460,13 +460,22 @@ class OpenAlexReferenceChecker:
 
         if doi and work_doi:
             # Compare DOIs using the proper comparison function
-            from refchecker.utils.doi_utils import compare_dois
+            from refchecker.utils.doi_utils import compare_dois, validate_doi_resolves
             if not compare_dois(doi, work_doi):
-                errors.append({
-                    'error_type': 'doi',
-                    'error_details': format_doi_mismatch(doi, work_doi),
-                    'ref_doi_correct': work_doi
-                })
+                # If cited DOI resolves, it's likely a valid alternate DOI (e.g., arXiv vs conference)
+                # Treat as warning instead of error
+                if validate_doi_resolves(doi):
+                    errors.append({
+                        'warning_type': 'doi',
+                        'warning_details': format_doi_mismatch(doi, work_doi),
+                        'ref_doi_correct': work_doi
+                    })
+                else:
+                    errors.append({
+                        'error_type': 'doi',
+                        'error_details': format_doi_mismatch(doi, work_doi),
+                        'ref_doi_correct': work_doi
+                    })
 
         # Extract URL from work data
         work_url = self.extract_url_from_work(work_data)
@@ -612,14 +612,23 @@ class NonArxivReferenceChecker:
                     paper_doi = external_ids['DOI']
 
                     # Compare DOIs using the proper comparison function
-                    from refchecker.utils.doi_utils import compare_dois
+                    from refchecker.utils.doi_utils import compare_dois, validate_doi_resolves
                     if doi and paper_doi and not compare_dois(doi, paper_doi):
                         from refchecker.utils.error_utils import format_doi_mismatch
-                        errors.append({
-                            'error_type': 'doi',
-                            'error_details': format_doi_mismatch(doi, paper_doi),
-                            'ref_doi_correct': paper_doi
-                        })
+                        # If cited DOI resolves, it's likely a valid alternate DOI (e.g., arXiv vs conference)
+                        # Treat as warning instead of error
+                        if validate_doi_resolves(doi):
+                            errors.append({
+                                'warning_type': 'doi',
+                                'warning_details': format_doi_mismatch(doi, paper_doi),
+                                'ref_doi_correct': paper_doi
+                            })
+                        else:
+                            errors.append({
+                                'error_type': 'doi',
+                                'error_details': format_doi_mismatch(doi, paper_doi),
+                                'ref_doi_correct': paper_doi
+                            })
 
                     # Extract URL from paper data - prioritize arXiv URLs when available
                     paper_url = None
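Each checker applies the same resolve-then-classify pattern. A minimal self-contained sketch of that logic — `validate_doi_resolves` is stubbed here, `format_doi_mismatch` is replaced by an inline message, and the DOIs are made up; only the dict shapes follow the diff:

```python
# Sketch of the warning-vs-error classification introduced in this release.
# The real validate_doi_resolves (see refchecker/utils/doi_utils.py below)
# performs an HTTP HEAD against doi.org.
def validate_doi_resolves(doi: str) -> bool:
    return doi.startswith("10.")  # stub for illustration only

def classify_doi_mismatch(cited_doi: str, correct_doi: str) -> dict:
    details = f"DOI mismatch: cited {cited_doi}, correct {correct_doi}"
    if validate_doi_resolves(cited_doi):
        # A resolving DOI is likely a valid alternate (e.g., arXiv vs conference)
        return {'warning_type': 'doi', 'warning_details': details,
                'ref_doi_correct': correct_doi}
    return {'error_type': 'doi', 'error_details': details,
            'ref_doi_correct': correct_doi}

# Made-up DOIs, purely illustrative
print(classify_doi_mismatch("10.48550/arXiv.2301.00001", "10.1145/1234567"))
```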
refchecker/core/refchecker.py
CHANGED

@@ -46,12 +46,13 @@ import json
 import random
 from refchecker.checkers.local_semantic_scholar import LocalNonArxivReferenceChecker
 from refchecker.utils.text_utils import (clean_author_name, clean_title, clean_title_basic,
-                                         extract_arxiv_id_from_url, normalize_text as common_normalize_text,
+                                         normalize_text as common_normalize_text,
                                          detect_latex_bibliography_format, extract_latex_references,
                                          detect_standard_acm_natbib_format, strip_latex_commands,
                                          format_corrected_reference, is_name_match, enhanced_name_match,
                                          calculate_title_similarity, normalize_arxiv_url, deduplicate_urls,
                                          compare_authors)
+from refchecker.utils.url_utils import extract_arxiv_id_from_url
 from refchecker.utils.config_validator import ConfigValidator
 from refchecker.services.pdf_processor import PDFProcessor
 from refchecker.checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker

@@ -1963,11 +1964,21 @@ class ArxivReferenceChecker:
                     if not actual_doi_normalized.startswith(cited_doi_normalized.rstrip('.')):
                         logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
                         from refchecker.utils.error_utils import format_doi_mismatch
-                        errors.append({
-                            'error_type': 'doi',
-                            'error_details': format_doi_mismatch(doi, external_ids['DOI']),
-                            'ref_doi_correct': external_ids['DOI']
-                        })
+                        from refchecker.utils.doi_utils import validate_doi_resolves
+                        # If cited DOI resolves, it's likely a valid alternate DOI (e.g., arXiv vs conference)
+                        # Treat as warning instead of error
+                        if validate_doi_resolves(doi):
+                            errors.append({
+                                'warning_type': 'doi',
+                                'warning_details': format_doi_mismatch(doi, external_ids['DOI']),
+                                'ref_doi_correct': external_ids['DOI']
+                            })
+                        else:
+                            errors.append({
+                                'error_type': 'doi',
+                                'error_details': format_doi_mismatch(doi, external_ids['DOI']),
+                                'ref_doi_correct': external_ids['DOI']
+                            })
                     else:
                         logger.debug(f"DB Verification: DOI partial match - cited: {doi}, actual: {external_ids['DOI']} (acceptable)")
 
refchecker/utils/__init__.py
CHANGED

@@ -4,9 +4,10 @@ Utility functions for text processing, author comparison, mocking, and configura
 
 from .text_utils import (
     clean_author_name, clean_title, normalize_text,
-    extract_arxiv_id_from_url, clean_conference_markers_from_title,
+    clean_conference_markers_from_title,
    remove_year_from_title
 )
+from .url_utils import extract_arxiv_id_from_url
 from .author_utils import compare_authors, levenshtein_distance, extract_authors_list
 from .mock_objects import (
     MockPaper, MockReference, MockLLMProvider, MockSemanticScholarAPI, MockArxivAPI,
refchecker/utils/arxiv_utils.py
CHANGED

@@ -422,6 +422,11 @@ def get_bibtex_content(paper):
     """
     Try to get BibTeX content for a paper from various sources.
 
+    For ArXiv papers, only use .bbl files (compiled bibliography).
+    The .bbl file contains only the actually-cited references, while .bib files
+    are unreliable - they may contain entire bibliography databases (e.g., full
+    ACL Anthology with 80k+ entries) or unfiltered reference collections.
+
     Args:
         paper: Paper object
 

@@ -433,71 +438,24 @@ def get_bibtex_content(paper):
     # Try ArXiv source if it's an ArXiv paper
     arxiv_id = extract_arxiv_id_from_paper(paper)
     if arxiv_id:
-        logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for
+        logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for .bbl bibliography")
         tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
 
-        #
-
-        uses_bibtex = False
-        if tex_content:
-            # Look for \bibliography{...} commands in the main TeX file
-            bib_pattern = r'\\bibliography\{([^}]+)\}'
-            bib_matches = re.findall(bib_pattern, tex_content)
-            if bib_matches:
-                uses_bibtex = True
-                referenced_bibs = []
-                for match in bib_matches:
-                    bib_names = [name.strip() for name in match.split(',')]
-                    referenced_bibs.extend(bib_names)
-                logger.debug(f"Main TeX file references BibTeX files: {referenced_bibs}")
-
-        if bib_content and bbl_content:
-            # Count entries in both for logging
-            bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
+        # Only use .bbl files for ArXiv papers (.bib files are unreliable)
+        if bbl_content:
             bbl_entry_count = len(re.findall(r'\\bibitem[\[\{]', bbl_content))
-
-
-
-            # IMPORTANT: Prefer .bbl when .bib is excessively large (e.g., includes full ACL Anthology)
-            # The .bbl file contains only the actually-cited references, while .bib may contain
-            # entire bibliography databases. Parsing 80k+ entries would cause the tool to hang.
-            # Use .bbl if: (1) .bbl has entries AND (2) .bib has >10x more entries than .bbl OR >1000 entries
-            excessive_bib = bib_entry_count > 1000 or (bbl_entry_count > 0 and bib_entry_count > bbl_entry_count * 10)
-
-            if bbl_entry_count > 0 and excessive_bib:
-                logger.info(f"Using .bbl files from ArXiv source (.bib has {bib_entry_count} entries which is excessive, .bbl has {bbl_entry_count})")
-                return bbl_content
-            elif uses_bibtex and bib_entry_count > 0 and not excessive_bib:
-                logger.info(f"Using .bib files from ArXiv source (main TeX uses \\bibliography{{...}})")
-                return bib_content
-            elif bbl_entry_count > 0:
-                logger.info(f"Using .bbl files from ArXiv source (main TeX doesn't use \\bibliography or .bib is empty)")
+            if bbl_entry_count > 0:
+                logger.info(f"Using .bbl files from ArXiv source ({bbl_entry_count} entries)")
                 return bbl_content
-            elif bib_entry_count > 0:
-                logger.info(f"Using .bib files from ArXiv source (.bbl file is empty)")
-                return bib_content
             else:
-                logger.
-
-
-
-
-
-
-
-                logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
-                return bbl_content
-
-        elif tex_content:
-            # Check for embedded bibliography in LaTeX
-            from refchecker.utils.text_utils import detect_latex_bibliography_format
-            latex_format = detect_latex_bibliography_format(tex_content)
-            if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
-                logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
-                # Skip embedded bibliography and return None to trigger fallback methods
-                return None
-
-        # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
+                logger.debug(f"Found .bbl file but it appears empty")
+
+        # No .bbl available - return None to trigger PDF fallback
+        if bib_content:
+            bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
+            logger.debug(f"Skipping .bib file ({bib_entry_count} entries) - unreliable, falling back to PDF extraction")
+
+        logger.debug(f"No usable .bbl file found for ArXiv paper {arxiv_id}")
 
     return None
 
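The .bbl gate boils down to a single regex count. A runnable sketch with an illustrative .bbl fragment (the regex is taken from the diff; the sample text is not):

```python
import re

# Illustrative .bbl fragment with two compiled entries
bbl_content = r"""
\bibitem[Smith et~al.(2020)]{smith2020}
J. Smith et~al. \newblock A Paper Title. \newblock In Proc. of X, 2020.

\bibitem{jones2019}
A. Jones. \newblock Another Title. \newblock Journal Y, 2019.
"""

# The same count that gates the .bbl path in get_bibtex_content
bbl_entry_count = len(re.findall(r'\\bibitem[\[\{]', bbl_content))
print(bbl_entry_count)  # 2 -> non-empty, so the .bbl is used;
                        # with 0 entries the function falls back to PDF extraction
```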
refchecker/utils/doi_utils.py
CHANGED

@@ -156,4 +156,35 @@ def construct_doi_url(doi: str) -> str:
     normalized_doi = normalize_doi(doi)
 
     # Construct URL
-    return f"https://doi.org/{normalized_doi}"
+    return f"https://doi.org/{normalized_doi}"
+
+
+def validate_doi_resolves(doi: str, timeout: float = 5.0) -> bool:
+    """
+    Validate that a DOI resolves by checking if doi.org returns a redirect.
+
+    This is useful for determining if a DOI is valid, even if it's different
+    from what a verification source has stored (e.g., arXiv DOI vs conference DOI).
+
+    Args:
+        doi: DOI string to validate
+        timeout: Request timeout in seconds
+
+    Returns:
+        True if DOI resolves (returns 302/301/200), False otherwise
+    """
+    if not doi or not is_valid_doi_format(normalize_doi(doi)):
+        return False
+
+    try:
+        import requests
+        url = construct_doi_url(doi)
+        # Use HEAD request first (faster), fall back to GET if needed
+        response = requests.head(url, allow_redirects=False, timeout=timeout)
+        # DOI.org returns 302 for valid DOIs that redirect to the paper
+        # Some may return 301 (permanent redirect) or 200 (direct response)
+        return response.status_code in (200, 301, 302, 303, 307, 308)
+    except Exception:
+        # On any error (timeout, connection error, etc.), assume DOI might be valid
+        # to avoid false negatives due to network issues
+        return True
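Assuming the import path above, usage looks like this; note the deliberate fail-open behavior on network errors, which the diff's comments call out (any exception returns True):

```python
from refchecker.utils.doi_utils import validate_doi_resolves

# A well-known registered DOI should yield a doi.org redirect (301/302)
print(validate_doi_resolves("10.1103/PhysRev.47.777"))  # expected: True

# Strings that fail the DOI format check are rejected without any request
print(validate_doi_resolves("not-a-doi"))  # False

# A timeout or connection error also returns True (fail open), so transient
# network problems never escalate a warning into an error
```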
refchecker/utils/error_utils.py
CHANGED

@@ -126,28 +126,39 @@ def create_year_warning(cited_year: int, correct_year: int) -> Dict[str, Any]:
 
 def create_doi_error(cited_doi: str, correct_doi: str) -> Optional[Dict[str, str]]:
     """
-    Create a standardized DOI error dictionary.
+    Create a standardized DOI error or warning dictionary.
+
+    If the cited DOI resolves (is valid), this returns a warning instead of an error,
+    since papers can have multiple valid DOIs (e.g., arXiv DOI vs conference DOI).
 
     Args:
         cited_doi: DOI as cited in the reference
         correct_doi: Correct DOI from database
 
     Returns:
-        Standardized error dictionary if DOIs differ, None if they match after cleaning
+        Standardized error/warning dictionary if DOIs differ, None if they match after cleaning
     """
-
-
-
+    from refchecker.utils.doi_utils import validate_doi_resolves, compare_dois
+
+    # Use compare_dois which handles normalization (case, prefixes, trailing punctuation)
+    if compare_dois(cited_doi, correct_doi):
+        return None
 
-    #
-
+    # DOIs are different - determine if this should be error or warning
+    # If cited DOI resolves, it's likely a valid alternate DOI
+    # Treat as warning instead of error
+    if validate_doi_resolves(cited_doi):
+        return {
+            'warning_type': 'doi',
+            'warning_details': format_doi_mismatch(cited_doi, correct_doi),
+            'ref_doi_correct': correct_doi
+        }
+    else:
        return {
            'error_type': 'doi',
            'error_details': format_doi_mismatch(cited_doi, correct_doi),
            'ref_doi_correct': correct_doi
        }
-
-    return None
 
 
 def create_title_error(error_details: str, correct_title: str) -> Dict[str, str]:
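With this change a caller of `create_doi_error` can receive three shapes: `None` for a match, a warning dict when the cited DOI resolves, or an error dict. A usage sketch (example DOIs are illustrative, and the prefix normalization is per the docstring above):

```python
from refchecker.utils.error_utils import create_doi_error

# Same DOI after normalization -> None (compare_dois strips prefixes)
print(create_doi_error("https://doi.org/10.1145/1234567", "10.1145/1234567"))

# Differing DOIs -> {'warning_type': 'doi', ...} if the cited one resolves,
# otherwise {'error_type': 'doi', ...}
result = create_doi_error("10.48550/arXiv.2106.00001", "10.1145/1234567")
print('warning_type' in result or 'error_type' in result)  # True
```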
refchecker/utils/text_utils.py
CHANGED

@@ -676,22 +676,6 @@ def clean_title(title):
     return title
 
 
-def extract_arxiv_id_from_url(url):
-    """
-    Extract ArXiv ID from URL or text containing ArXiv reference.
-
-    This function is deprecated. Use utils.url_utils.extract_arxiv_id_from_url instead.
-    Kept for backwards compatibility.
-
-    Args:
-        url: URL string or text containing arXiv reference
-
-    Returns:
-        ArXiv ID or None if not found
-    """
-    from refchecker.utils.url_utils import extract_arxiv_id_from_url as common_extract
-    return common_extract(url)
-
 def extract_year_from_text(text):
     """
     Extract a 4-digit year from text
@@ -808,6 +792,9 @@ def normalize_paper_title(title: str) -> str:
     # Strip LaTeX commands first to handle math formatting consistently
     normalized = strip_latex_commands(title)
 
+    # Normalize diacritics (ü -> u, é -> e, etc.) for consistent comparison
+    normalized = normalize_diacritics(normalized)
+
     # Convert to lowercase
     normalized = normalized.lower()
 
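`normalize_diacritics` itself is not shown in this diff; a typical implementation (an assumption, not the package's verbatim code) decomposes to NFKD and drops the combining marks:

```python
import unicodedata

def normalize_diacritics(text):
    # Decompose accented characters, then drop the combining marks:
    # 'Müller' -> 'Muller', 'Éléonore' -> 'Eleonore'
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

print(normalize_diacritics("Müller, Éléonore"))  # Muller, Eleonore
```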
@@ -2343,6 +2330,17 @@ def detect_latex_bibliography_format(text):
             'details': details
         }
 
+    # Check for standalone \bibitem entries (common in .bbl files without full environment wrapper)
+    # This handles cases where the \begin{thebibliography} wrapper is missing
+    bibitem_matches = re.findall(r'\\bibitem(?:\[[^\]]*\])?\{[^}]+\}', text)
+    if bibitem_matches:
+        details['bibitem_count'] = len(bibitem_matches)
+        return {
+            'is_latex': True,
+            'format_type': 'thebibliography',
+            'details': details
+        }
+
     # Check for \bibliography{} command
     bibcommand_pattern = r'\\bibliography\{([^}]+)\}'
     bibcommand_match = re.search(bibcommand_pattern, text, re.IGNORECASE)
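The detection branch can be exercised in isolation; the pattern below is copied from the diff, the sample text is illustrative:

```python
import re

# A bare .bbl body with no \begin{thebibliography} wrapper (illustrative)
text = r"\bibitem[Doe(2021)]{doe2021} J. Doe. \newblock Some Title."

# Same pattern as the new detection branch
bibitem_matches = re.findall(r'\\bibitem(?:\[[^\]]*\])?\{[^}]+\}', text)
print(len(bibitem_matches))  # 1 -> classified as 'thebibliography' format
```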
@@ -3125,7 +3123,8 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-argument
     # Parse \bibitem entries (improved for .bbl files with ACM-Reference-Format)
     # Handle both simple \bibitem{key} and complex \bibitem[label]{key} formats
     # Also handle line continuation with % and various spacing patterns
-    bibitem_pattern = r'\\bibitem(?:\[([^\]]*)\])?\s*%?\s*\n?\s*\{([^}]+)\}\s*(.*?)(?=\\bibitem|\\end\{thebibliography\})'
+    # Updated to also match end-of-string ($) for standalone bibitem entries
+    bibitem_pattern = r'\\bibitem(?:\[([^\]]*)\])?\s*%?\s*\n?\s*\{([^}]+)\}\s*(.*?)(?=\\bibitem|\\end\{thebibliography\}|$)'
 
     matches = re.finditer(bibitem_pattern, text, re.DOTALL | re.IGNORECASE)
 
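The only change to the pattern is the trailing `|$` alternative in the lookahead, which lets the final entry terminate at end-of-string when no `\end{thebibliography}` follows. A quick check (sample entries are illustrative):

```python
import re

bibitem_pattern = (r'\\bibitem(?:\[([^\]]*)\])?\s*%?\s*\n?\s*\{([^}]+)\}'
                   r'\s*(.*?)(?=\\bibitem|\\end\{thebibliography\}|$)')

# Standalone entries with no \end{thebibliography}
text = r"\bibitem{key1} First entry. \bibitem{key2} Second entry."
for m in re.finditer(bibitem_pattern, text, re.DOTALL | re.IGNORECASE):
    print(m.group(2), '->', m.group(3).strip())
# key1 -> First entry.
# key2 -> Second entry.
```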
@@ -3196,10 +3195,21 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-argument
             if label_year_match:
                 ref['year'] = int(label_year_match.group(1))
             else:
-                # Try to extract from content
-
-
-
+                # Try to extract from content - be careful to avoid ArXiv IDs like 1907.10641
+                # Look for year at end of content or after a comma (typical citation format)
+                # Pattern: standalone year after comma/space, not followed by a dot and more digits (ArXiv ID)
+                year_patterns = [
+                    r',\s*((?:19|20)\d{2})\s*\.$',   # Year at end after comma: ", 2019."
+                    r',\s*((?:19|20)\d{2})\s*$',     # Year at end after comma: ", 2019"
+                    r'\s+((?:19|20)\d{2})\s*\.$',    # Year at end after space: " 2019."
+                    r'\s+((?:19|20)\d{2})\s*$',      # Year at end after space: " 2019"
+                    r'\b((?:19|20)\d{2})(?!\.\d)',   # Year not followed by decimal (avoid ArXiv IDs)
+                ]
+                for pattern in year_patterns:
+                    content_year_match = re.search(pattern, content)
+                    if content_year_match:
+                        ref['year'] = int(content_year_match.group(1))
+                        break
 
             # Parse natbib format: usually has author line, then \newblock title, then \newblock venue
             parts = re.split(r'\\newblock', content, flags=re.IGNORECASE)
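The ordered patterns, tried first-to-last, prefer a trailing year and never match the leading digits of an ArXiv ID. A runnable check (the sample reference string is illustrative):

```python
import re

# Patterns copied from the diff, tried in order
year_patterns = [
    r',\s*((?:19|20)\d{2})\s*\.$',
    r',\s*((?:19|20)\d{2})\s*$',
    r'\s+((?:19|20)\d{2})\s*\.$',
    r'\s+((?:19|20)\d{2})\s*$',
    r'\b((?:19|20)\d{2})(?!\.\d)',
]

def extract_year(content):
    for pattern in year_patterns:
        m = re.search(pattern, content)
        if m:
            return int(m.group(1))
    return None

# The (?!\.\d) lookahead keeps 1907 of the ArXiv ID from matching,
# and the trailing-year patterns pick up 2019 first anyway
print(extract_year("WinoGrande. arXiv preprint arXiv:1907.10641, 2019."))  # 2019
```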
@@ -3300,7 +3310,80 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-argument
         # Second part is usually title
         if len(parts) >= 2:
             title_part = parts[1].strip()
-            title_clean = strip_latex_commands(title_part).strip()
+
+            # Handle \href{URL}{text} or \href {URL} {text} format
+            # Extract URL before stripping LaTeX commands
+            # We need to use balanced brace matching because titles can contain
+            # nested braces like {LLM} for capitalization protection
+            href_url = None
+            title_text = None
+
+            href_start = title_part.find('\\href')
+            if href_start != -1:
+                # Find first opening brace (URL)
+                pos = href_start + 5  # Skip \href
+                while pos < len(title_part) and title_part[pos] in ' \t\n':
+                    pos += 1
+
+                if pos < len(title_part) and title_part[pos] == '{':
+                    # Extract URL using balanced braces
+                    brace_count = 0
+                    url_start = pos + 1
+                    url_end = pos
+                    for i in range(pos, len(title_part)):
+                        if title_part[i] == '{':
+                            brace_count += 1
+                        elif title_part[i] == '}':
+                            brace_count -= 1
+                            if brace_count == 0:
+                                url_end = i
+                                break
+
+                    if url_end > url_start:
+                        href_url = title_part[url_start:url_end].strip()
+
+                        # Now find the second brace group (title text)
+                        pos = url_end + 1
+                        while pos < len(title_part) and title_part[pos] in ' \t\n':
+                            pos += 1
+
+                        if pos < len(title_part) and title_part[pos] == '{':
+                            # Extract title text using balanced braces
+                            brace_count = 0
+                            text_start = pos + 1
+                            text_end = pos
+                            for i in range(pos, len(title_part)):
+                                if title_part[i] == '{':
+                                    brace_count += 1
+                                elif title_part[i] == '}':
+                                    brace_count -= 1
+                                    if brace_count == 0:
+                                        text_end = i
+                                        break
+
+                            if text_end > text_start:
+                                title_text = title_part[text_start:text_end].strip()
+
+            if href_url and title_text:
+
+                # Extract DOI if it's a doi.org URL
+                if 'doi.org/' in href_url and not ref.get('doi'):
+                    doi_match = re.search(r'doi\.org/(.+)$', href_url)
+                    if doi_match:
+                        ref['doi'] = doi_match.group(1)
+                    ref['url'] = href_url
+                # Extract arXiv ID if it's an arxiv URL
+                elif 'arxiv.org/' in href_url.lower() and not ref.get('url'):
+                    ref['url'] = href_url
+                # Generic URL
+                elif not ref.get('url'):
+                    ref['url'] = href_url
+
+                # Use the title text (second part of href), not the URL
+                title_clean = strip_latex_commands(title_text).strip()
+            else:
+                title_clean = strip_latex_commands(title_part).strip()
+
             # Remove trailing dots and clean up
             title_clean = title_clean.rstrip('.')
             if title_clean and len(title_clean) > 5:  # Reasonable title length
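The reason for manual brace counting: a regex with `[^}]+` truncates titles that contain protective braces. A condensed sketch of the same balanced-brace approach (the helper name `balanced_group` and the sample line are hypothetical, not from the diff):

```python
import re

title_part = r'\href {https://doi.org/10.1145/1234567} {Scaling {LLM} Inference}'

# Naive regex: [^}]+ stops at the first closing brace inside the title
naive = re.search(r'\\href\s*\{([^}]+)\}\s*\{([^}]+)\}', title_part)
print(naive.group(2))  # 'Scaling {LLM' -- truncated at the inner brace

def balanced_group(s, start):
    """Return (contents, index of closing brace) for the group at s[start] == '{'."""
    depth = 0
    for i in range(start, len(s)):
        if s[i] == '{':
            depth += 1
        elif s[i] == '}':
            depth -= 1
            if depth == 0:
                return s[start + 1:i], i
    raise ValueError("unbalanced braces")

# Assumes an \href is present, as in the diff's guarded code path
pos = title_part.find('\\href') + len('\\href')
while title_part[pos].isspace():
    pos += 1
url, end = balanced_group(title_part, pos)
pos = end + 1
while title_part[pos].isspace():
    pos += 1
title, _ = balanced_group(title_part, pos)
print(url)    # https://doi.org/10.1145/1234567
print(title)  # Scaling {LLM} Inference
```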
@@ -3310,9 +3393,13 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-argument
         if len(parts) >= 3:
             venue_part = parts[2].strip()
             venue_clean = strip_latex_commands(venue_part).strip()
-            # Remove
+            # Remove "In " prefix if present (common in bbl format)
+            venue_clean = re.sub(r'^In\s+', '', venue_clean)
+            # Remove trailing year only (at end of string), not year in the middle of venue name
+            # e.g., "2020 Conference on..." should keep the conference name
             if ref['year']:
-
+                # Only remove year if it appears at the very end (possibly with punctuation)
+                venue_clean = re.sub(rf',?\s*{ref["year"]}\s*\.?\s*$', '', venue_clean)
             venue_clean = venue_clean.rstrip(',. ')
             # Filter out common non-venue patterns that shouldn't be treated as venues
             non_venue_patterns = ['URL', 'url', 'http:', 'https:', 'DOI', 'doi:', 'ArXiv', 'arxiv:']
@@ -3387,11 +3474,24 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-argument
                 from refchecker.utils.url_utils import clean_url_punctuation
                 ref['url'] = clean_url_punctuation(url_match.group(1))
 
-            # Extract DOI from \href{https://doi.org/...}
+            # Extract DOI from \href{https://doi.org/...} or \href {URL} {text} with spaces
             if not ref.get('doi'):
-
+                # Handle both \href{URL}{text} and \href {URL} {text} formats
+                doi_match = re.search(r'\\href\s*\{(https?://doi\.org/[^}]+)\}', content)
                 if doi_match:
-
+                    doi_url = doi_match.group(1)
+                    # Extract DOI from the URL
+                    doi_id_match = re.search(r'doi\.org/(.+)$', doi_url)
+                    if doi_id_match:
+                        ref['doi'] = doi_id_match.group(1)
+                        if not ref.get('url'):
+                            ref['url'] = doi_url
+
+            # Extract URL from \href{URL}{text} if not already set (for non-DOI URLs like arXiv)
+            if not ref.get('url'):
+                href_url_match = re.search(r'\\href\s*\{([^}]+)\}\s*\{[^}]*\}', content)
+                if href_url_match:
+                    ref['url'] = href_url_match.group(1).strip()
 
             # Extract arXiv ID from \showeprint[arxiv]{...} (ACM format) or from content (natbib format)
             arxiv_match = re.search(r'\\showeprint\[arxiv\]\{([^}]+)\}', content)
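The relaxed `\\href\s*\{` pattern now tolerates the space some .bbl styles emit between `\href` and its arguments. A quick demonstration (sample line and DOI are illustrative):

```python
import re

# Illustrative .bbl line; note the spaces after \href
content = r'\newblock \href {https://doi.org/10.1234/example.5678} {An Example Paper}.'

doi_match = re.search(r'\\href\s*\{(https?://doi\.org/[^}]+)\}', content)
if doi_match:
    doi_url = doi_match.group(1)
    doi_id = re.search(r'doi\.org/(.+)$', doi_url).group(1)
    print(doi_id)  # 10.1234/example.5678
```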
@@ -4020,6 +4120,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
     Returns:
         True if venues are substantially different, False if they match/overlap
     """
+    # Import here to avoid circular dependency
+    from refchecker.utils.url_utils import extract_arxiv_id_from_url
+
     if not venue1 or not venue2:
         return bool(venue1 != venue2)
 
@@ -4088,6 +4191,19 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
 
     venue_lower = expand_abbreviations(venue_lower)
 
+    # Strip page numbers (e.g., "pages 38--55", "pp. 123-456", "page 42")
+    venue_lower = re.sub(r',?\s*pages?\s*\d+\s*[-–—]+\s*\d+', '', venue_lower)
+    venue_lower = re.sub(r',?\s*pp\.?\s*\d+\s*[-–—]+\s*\d+', '', venue_lower)
+    venue_lower = re.sub(r',?\s*pages?\s*\d+', '', venue_lower)
+    venue_lower = re.sub(r',?\s*pp\.?\s*\d+', '', venue_lower)
+
+    # Strip publisher names that are commonly appended
+    publishers = ['springer', 'elsevier', 'wiley', 'acm', 'ieee', 'mit press',
+                  'cambridge university press', 'oxford university press',
+                  'morgan kaufmann', 'addison-wesley', 'prentice hall']
+    for publisher in publishers:
+        venue_lower = re.sub(rf',?\s*{re.escape(publisher)}\s*$', '', venue_lower, flags=re.IGNORECASE)
+
     # Remove punctuation and normalize spacing for comparison
     venue_lower = re.sub(r'[.,;:]', '', venue_lower)  # Remove punctuation
     venue_lower = re.sub(r'\s+on\s+', ' ', venue_lower)  # Remove "on" preposition
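The effect of the new stripping passes, shown on an illustrative venue string (only the two relevant substitutions are applied here):

```python
import re

venue_lower = "proceedings of machine learning and systems, pages 38--55, 2020. springer"

# Strip a "pages N--M" range, then a trailing publisher, as in the new code
venue_lower = re.sub(r',?\s*pages?\s*\d+\s*[-–—]+\s*\d+', '', venue_lower)
venue_lower = re.sub(r',?\s*springer\s*$', '', venue_lower)
print(venue_lower)  # proceedings of machine learning and systems, 2020.
```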
{academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/WHEEL
RENAMED
File without changes

{academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.66.dist-info}/licenses/LICENSE
RENAMED
File without changes