academic-refchecker 2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_refchecker-2.0.7.dist-info/METADATA +738 -0
- academic_refchecker-2.0.7.dist-info/RECORD +64 -0
- academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
- academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
- academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
- academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +64 -0
- backend/concurrency.py +100 -0
- backend/database.py +711 -0
- backend/main.py +1367 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/static/assets/index-2P6L_39v.css +1 -0
- backend/static/assets/index-hk21nqxR.js +25 -0
- backend/static/favicon.svg +6 -0
- backend/static/index.html +15 -0
- backend/static/vite.svg +1 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +3 -0
- refchecker/checkers/__init__.py +17 -0
- refchecker/checkers/crossref.py +541 -0
- refchecker/checkers/enhanced_hybrid_checker.py +563 -0
- refchecker/checkers/github_checker.py +326 -0
- refchecker/checkers/local_semantic_scholar.py +540 -0
- refchecker/checkers/openalex.py +513 -0
- refchecker/checkers/openreview_checker.py +984 -0
- refchecker/checkers/pdf_paper_checker.py +493 -0
- refchecker/checkers/semantic_scholar.py +764 -0
- refchecker/checkers/webpage_checker.py +938 -0
- refchecker/config/__init__.py +1 -0
- refchecker/config/logging.conf +36 -0
- refchecker/config/settings.py +170 -0
- refchecker/core/__init__.py +7 -0
- refchecker/core/db_connection_pool.py +141 -0
- refchecker/core/parallel_processor.py +415 -0
- refchecker/core/refchecker.py +5838 -0
- refchecker/database/__init__.py +6 -0
- refchecker/database/download_semantic_scholar_db.py +1725 -0
- refchecker/llm/__init__.py +0 -0
- refchecker/llm/base.py +376 -0
- refchecker/llm/providers.py +911 -0
- refchecker/scripts/__init__.py +1 -0
- refchecker/scripts/start_vllm_server.py +121 -0
- refchecker/services/__init__.py +8 -0
- refchecker/services/pdf_processor.py +268 -0
- refchecker/utils/__init__.py +27 -0
- refchecker/utils/arxiv_utils.py +462 -0
- refchecker/utils/author_utils.py +179 -0
- refchecker/utils/biblatex_parser.py +584 -0
- refchecker/utils/bibliography_utils.py +332 -0
- refchecker/utils/bibtex_parser.py +411 -0
- refchecker/utils/config_validator.py +262 -0
- refchecker/utils/db_utils.py +210 -0
- refchecker/utils/doi_utils.py +190 -0
- refchecker/utils/error_utils.py +482 -0
- refchecker/utils/mock_objects.py +211 -0
- refchecker/utils/text_utils.py +5057 -0
- refchecker/utils/unicode_utils.py +335 -0
- refchecker/utils/url_utils.py +307 -0
refchecker/utils/arxiv_utils.py
@@ -0,0 +1,462 @@
"""
ArXiv utility functions for downloading and processing ArXiv papers.

This module provides functions for:
- Downloading ArXiv LaTeX source files
- Downloading ArXiv BibTeX citations
- Extracting ArXiv IDs from URLs or paper identifiers
- Processing ArXiv source files for bibliography content
"""

import os
import re
import logging
import requests
import tempfile
import tarfile

logger = logging.getLogger(__name__)

def extract_arxiv_id_from_paper(paper):
    """
    Extract ArXiv ID from a paper object.

    Args:
        paper: Paper object with potential ArXiv ID in URL or short_id

    Returns:
        str: ArXiv ID if found, None otherwise
    """
    arxiv_id = None

    if hasattr(paper, 'pdf_url') and paper.pdf_url:
        # Try to extract the ArXiv ID from the PDF URL
        from refchecker.utils.url_utils import extract_arxiv_id_from_url
        arxiv_id = extract_arxiv_id_from_url(paper.pdf_url)
    elif hasattr(paper, 'get_short_id'):
        # Check if the paper ID itself is an ArXiv ID
        short_id = paper.get_short_id()
        if re.match(r'^\d{4}\.\d{4,5}(v\d+)?$', short_id):
            arxiv_id = short_id

    return arxiv_id

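
# Editor's usage sketch (not part of the released package): any object that
# exposes `pdf_url` or `get_short_id` works, since the function only probes
# those attributes. `_FakePaper` is a hypothetical stand-in for the real
# Paper object used elsewhere in refchecker.
class _FakePaper:
    def get_short_id(self):
        return "1706.03762v5"

# extract_arxiv_id_from_paper(_FakePaper()) returns "1706.03762v5", because
# the short id matches the ArXiv id pattern above.
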
def download_arxiv_source(arxiv_id):
    """
    Download LaTeX source files from ArXiv for a given ArXiv ID.

    Args:
        arxiv_id: ArXiv identifier (e.g., "1706.03762")

    Returns:
        Tuple of (main_tex_content, bib_files_content, bbl_files_content),
        or (None, None, None) if the download fails
    """
    try:
        source_url = f"https://arxiv.org/e-print/{arxiv_id}"
        logger.debug(f"Downloading ArXiv source from: {source_url}")

        response = requests.get(source_url, timeout=60)
        response.raise_for_status()

        # Save to a temporary file and extract
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(response.content)
            temp_path = temp_file.name

        try:
            # Extract the tar.gz file
            with tarfile.open(temp_path, 'r:gz') as tar:
                extracted_files = {}

                for member in tar.getmembers():
                    if member.isfile():
                        try:
                            content = tar.extractfile(member)
                            if content:
                                # Read once, then try to decode as text;
                                # latin-1 accepts any byte sequence, so it
                                # serves as the fallback decoding
                                raw = content.read()
                                try:
                                    extracted_files[member.name] = raw.decode('utf-8')
                                except UnicodeDecodeError:
                                    extracted_files[member.name] = raw.decode('latin-1')
                        except Exception as e:
                            logger.debug(f"Could not extract {member.name}: {e}")
                            continue

                # Find the main .tex file, .bib files, and .bbl files
                tex_files = {name: content for name, content in extracted_files.items() if name.endswith('.tex')}
                bib_files = {name: content for name, content in extracted_files.items() if name.endswith('.bib')}
                bbl_files = {name: content for name, content in extracted_files.items() if name.endswith('.bbl')}

                # Find the main tex file (usually the one with \documentclass, or the largest file)
                main_tex_content = None
                if tex_files:
                    # Look for a file with \documentclass
                    for name, content in tex_files.items():
                        if '\\documentclass' in content:
                            main_tex_content = content
                            logger.debug(f"Found main tex file: {name}")
                            break

                    # If no \documentclass was found, take the largest file
                    if not main_tex_content:
                        largest_file = max(tex_files.items(), key=lambda x: len(x[1]))
                        main_tex_content = largest_file[1]
                        logger.debug(f"Using largest tex file: {largest_file[0]}")

                # Process .bib files using the shared selection/filtering logic
                bib_content = select_and_filter_bib_files(bib_files, main_tex_content, tex_files)

                # Combine all .bbl file contents
                bbl_content = None
                if bbl_files:
                    bbl_content = '\n\n'.join(bbl_files.values())
                    logger.debug(f"Found {len(bbl_files)} .bbl files")

                if main_tex_content or bib_content or bbl_content:
                    logger.info(f"Successfully downloaded ArXiv source for {arxiv_id}")
                    return main_tex_content, bib_content, bbl_content
                else:
                    logger.debug(f"No usable tex, bib, or bbl files found in ArXiv source for {arxiv_id}")
                    return None, None, None

        finally:
            # Clean up the temporary file
            try:
                os.unlink(temp_path)
            except OSError:
                pass

    except Exception as e:
        logger.debug(f"Failed to download ArXiv source for {arxiv_id}: {e}")
        return None, None, None

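
# Editor's sketch of the expected return shape (not part of the package;
# calling it hits the live ArXiv export endpoint, so it is wrapped in a
# function instead of running at import time):
def _example_download_source():
    tex, bib, bbl = download_arxiv_source("1706.03762")
    if bbl is not None:
        print(f"Got a compiled bibliography of {len(bbl)} characters")
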
def download_arxiv_bibtex(arxiv_id):
    """
    Download BibTeX data directly from ArXiv for a given ArXiv ID.

    Note: this returns a BibTeX entry for CITING the paper itself, not the
    paper's bibliography.

    Args:
        arxiv_id: ArXiv identifier (e.g., "1706.03762")

    Returns:
        BibTeX content as a string, or None if the download fails
    """
    try:
        bibtex_url = f"https://arxiv.org/bibtex/{arxiv_id}"
        logger.debug(f"Downloading ArXiv BibTeX from: {bibtex_url}")

        response = requests.get(bibtex_url, timeout=30)
        response.raise_for_status()

        bibtex_content = response.text.strip()
        if bibtex_content and bibtex_content.startswith('@'):
            logger.info(f"Successfully downloaded citation BibTeX for ArXiv paper {arxiv_id}")
            return bibtex_content
        else:
            logger.debug(f"Invalid BibTeX response for ArXiv paper {arxiv_id}")
            return None

    except Exception as e:
        logger.debug(f"Failed to download BibTeX for ArXiv paper {arxiv_id}: {e}")
        return None

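
# Editor's sketch (not part of the package): the returned entry is for
# citing the paper itself, so a typical ArXiv export starts with something
# like "@misc{...". Network access is required.
def _example_download_bibtex():
    entry = download_arxiv_bibtex("1706.03762")
    if entry:
        print(entry.splitlines()[0])  # header line of the citing entry
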
def select_and_filter_bib_files(bib_files, main_tex_content, tex_files):
    """
    Select the appropriate .bib files based on main TeX file references and
    filter them down to the cited entries.

    Args:
        bib_files: Dict of .bib files {filename: content}
        main_tex_content: Content of the main tex file
        tex_files: Dict of all tex files {filename: content} (for filtering)

    Returns:
        Filtered BibTeX content, or None if no files are available
    """
    if not bib_files:
        return None

    if main_tex_content:
        # Extract bibliography references from the main tex file
        referenced_bibs = []
        bib_pattern = r'\\bibliography\{([^}]+)\}'
        matches = re.findall(bib_pattern, main_tex_content)

        for match in matches:
            # Handle multiple bib files separated by commas
            bib_names = [name.strip() for name in match.split(',')]
            for bib_name in bib_names:
                # Add the .bib extension if not present
                if not bib_name.endswith('.bib'):
                    bib_name += '.bib'
                referenced_bibs.append(bib_name)

        # Use only the referenced .bib files, or all of them if no references were found
        if referenced_bibs:
            used_bibs = []
            seen_bib_names = set()  # Track which bib files we've already added
            for bib_name in referenced_bibs:
                if bib_name in bib_files and bib_name not in seen_bib_names:
                    used_bibs.append(bib_files[bib_name])
                    seen_bib_names.add(bib_name)
                    logger.debug(f"Using referenced .bib file: {bib_name}")
                elif bib_name in seen_bib_names:
                    logger.debug(f"Skipping duplicate .bib file: {bib_name}")
                else:
                    logger.debug(f"Referenced .bib file not found: {bib_name}")

            if used_bibs:
                raw_bib_content = '\n\n'.join(used_bibs)
                # Filter the BibTeX down to the cited references
                filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
                logger.debug(f"Found {len(used_bibs)} referenced .bib files out of {len(bib_files)} total")
                return filtered_content
            else:
                # Fall back to all bib files if none of the referenced ones were found
                raw_bib_content = '\n\n'.join(bib_files.values())
                filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
                logger.debug(f"No referenced .bib files found, using all {len(bib_files)} .bib files")
                return filtered_content
        else:
            # No \bibliography command found, use all bib files
            raw_bib_content = '\n\n'.join(bib_files.values())
            filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
            logger.debug(f"No \\bibliography command found, using all {len(bib_files)} .bib files")
            return filtered_content
    else:
        # No main tex file, but we do have bib files; we can't filter
        # without tex content, so return the original content
        raw_bib_content = '\n\n'.join(bib_files.values())
        logger.debug(f"Found {len(bib_files)} .bib files (no main tex to filter)")
        return raw_bib_content

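
# Editor's sketch with toy inputs (hypothetical file names and entries, not
# part of the package). Filtering delegates to refchecker's own
# bibtex_parser, exactly as select_and_filter_bib_files does internally.
def _example_select_bibs():
    tex = r"\documentclass{article} \cite{smith2020} \bibliography{refs}"
    bibs = {
        "refs.bib": "@article{smith2020, title={A}}",
        "unused.bib": "@article{doe2019, title={B}}",
    }
    # Only refs.bib is referenced, and only smith2020 is cited
    return select_and_filter_bib_files(bibs, tex, {"main.tex": tex})
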
def extract_cited_keys_from_tex(tex_files, main_tex_content):
    """
    Extract all citation keys from TeX files.

    Args:
        tex_files: Dict of all tex files {filename: content}
        main_tex_content: Content of the main tex file

    Returns:
        Set of cited reference keys
    """
    cited_keys = set()

    # Combine all tex content
    all_tex_content = main_tex_content or ""
    for tex_content in tex_files.values():
        all_tex_content += "\n" + tex_content

    # Find all \cite{...} commands (note: this matches only the plain \cite
    # command, not variants such as \citep or \citet)
    cite_pattern = r'\\cite\{([^}]+)\}'
    matches = re.findall(cite_pattern, all_tex_content)

    for match in matches:
        # Handle multiple citations: \cite{key1,key2,key3}
        keys = [key.strip() for key in match.split(',')]
        cited_keys.update(keys)

    logger.debug(f"Found {len(cited_keys)} unique cited references")
    return cited_keys

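
# Editor's sketch (not part of the package): keys are split on commas
# inside a single \cite command, and duplicates collapse in the set.
def _example_cited_keys():
    keys = extract_cited_keys_from_tex({}, r"Intro \cite{a,b} and \cite{c}.")
    assert keys == {"a", "b", "c"}
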
def is_reference_used(reference_key, cited_keys):
    """
    Check if a specific reference key is used/cited.

    Args:
        reference_key: The BibTeX key to check
        cited_keys: Set of all cited reference keys

    Returns:
        True if the reference is cited, False otherwise
    """
    result = reference_key in cited_keys
    # Log the first few mismatches to help debug key-format issues
    if not result and len([k for k in cited_keys if k.startswith('a')]) < 3:  # Limit debug output
        logger.debug(f"Key '{reference_key}' not found in cited_keys")
    return result

def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
    """
    Filter BibTeX content to only include references that are actually cited.

    Args:
        bib_content: Full BibTeX content
        tex_files: Dict of all tex files {filename: content}
        main_tex_content: Content of the main tex file

    Returns:
        Filtered BibTeX content with only the cited references
    """
    if not bib_content:
        return bib_content

    try:
        # Extract all citation keys from the tex files
        cited_keys = extract_cited_keys_from_tex(tex_files, main_tex_content)

        if not cited_keys:
            logger.debug("No citations found, returning full BibTeX content")
            return bib_content

        # Parse BibTeX entries and filter
        from refchecker.utils.bibtex_parser import parse_bibtex_entries
        entries = parse_bibtex_entries(bib_content)

        # Filter entries to only cited ones and remove duplicates
        cited_entries = []
        seen_keys = set()
        not_cited_count = 0
        duplicate_count = 0

        for entry in entries:
            entry_key = entry.get('key', '')
            if is_reference_used(entry_key, cited_keys):
                if entry_key not in seen_keys:
                    cited_entries.append(entry)
                    seen_keys.add(entry_key)
                else:
                    duplicate_count += 1
                    logger.debug(f"Skipping duplicate entry: '{entry_key}'")
            else:
                not_cited_count += 1
                # Log the first few entries that are NOT cited, for debugging
                if not_cited_count <= 5:
                    logger.debug(f"Entry NOT cited: '{entry_key}'")

        logger.debug(f"Filtered BibTeX: {len(entries)} total -> {len(cited_entries)} cited (removed {duplicate_count} duplicates)")
        logger.debug(f"Citation keys found: {len(cited_keys)} keys")
        logger.debug(f"Sample cited keys: {list(cited_keys)[:10]}")

        # Reconstruct BibTeX content from the cited entries
        if not cited_entries:
            logger.debug("No cited entries found in BibTeX, returning original content")
            return bib_content

        # Convert the entries back to BibTeX format
        filtered_content = reconstruct_bibtex_content(cited_entries, bib_content)
        return filtered_content

    except Exception as e:
        logger.debug(f"Error filtering BibTeX by citations: {e}")
        return bib_content  # Fall back to the original content

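
# Editor's sketch (not part of the package): uncited entries are dropped,
# cited ones are kept verbatim. Relies on refchecker's bibtex_parser, as
# the function above does.
def _example_filter_bibtex():
    bib = "@article{a, title={A}}\n\n@article{b, title={B}}"
    tex = {"main.tex": r"Only \cite{a} here."}
    return filter_bibtex_by_citations(bib, tex, None)
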
def reconstruct_bibtex_content(cited_entries, original_content):
    """
    Reconstruct BibTeX content from filtered entries by extracting the
    original text of each entry.

    Args:
        cited_entries: List of cited entry dictionaries
        original_content: Original BibTeX content

    Returns:
        Reconstructed BibTeX content with only the cited entries
    """
    cited_keys = {entry.get('key', '') for entry in cited_entries}

    # Extract the original BibTeX entries by locating their text blocks and
    # counting braces, which is robust to nested braces in field values
    filtered_parts = []

    # Find all entry starts
    entry_starts = []
    for match in re.finditer(r'@\w+\s*\{', original_content, re.IGNORECASE):
        entry_starts.append(match.start())

    # For each entry start, find the complete entry by counting braces
    for start_pos in entry_starts:
        # Extract the key from the entry header
        key_match = re.search(r'@\w+\s*\{\s*([^,\s}]+)', original_content[start_pos:start_pos + 200])
        if not key_match:
            continue

        entry_key = key_match.group(1).strip()
        if entry_key not in cited_keys:
            continue

        # Find the complete entry by counting braces
        brace_count = 0
        pos = start_pos
        entry_start_found = False

        while pos < len(original_content):
            char = original_content[pos]
            if char == '{':
                if not entry_start_found:
                    entry_start_found = True
                brace_count += 1
            elif char == '}':
                brace_count -= 1
                if entry_start_found and brace_count == 0:
                    entry_end = pos + 1
                    entry_text = original_content[start_pos:entry_end]
                    filtered_parts.append(entry_text)
                    break
            pos += 1

    if not filtered_parts:
        logger.debug("Could not reconstruct BibTeX entries, returning original")
        return original_content

    return '\n\n'.join(filtered_parts) + '\n'

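
# Editor's sketch (not part of the package): brace counting keeps entries
# whose field values contain nested braces intact.
def _example_reconstruct():
    bib = "@article{a, title={The {LaTeX} Companion}}\n@article{b, title={B}}"
    out = reconstruct_bibtex_content([{'key': 'a'}], bib)
    assert '{LaTeX}' in out and '@article{b' not in out
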
def get_bibtex_content(paper):
    """
    Try to get BibTeX content for a paper from various sources.

    For ArXiv papers, only .bbl files (the compiled bibliography) are used.
    The .bbl file contains only the actually-cited references, while .bib
    files are unreliable - they may contain entire bibliography databases
    (e.g., the full ACL Anthology with 80k+ entries) or unfiltered reference
    collections.

    Args:
        paper: Paper object

    Returns:
        str: BibTeX content if found, None otherwise
    """
    # Try the ArXiv source if it's an ArXiv paper
    arxiv_id = extract_arxiv_id_from_paper(paper)
    if arxiv_id:
        logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for .bbl bibliography")
        tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)

        # Only use .bbl files for ArXiv papers (.bib files are unreliable)
        if bbl_content:
            bbl_entry_count = len(re.findall(r'\\bibitem[\[\{]', bbl_content))
            if bbl_entry_count > 0:
                logger.info(f"Using .bbl files from ArXiv source ({bbl_entry_count} entries)")
                return bbl_content
            else:
                logger.debug("Found .bbl file but it appears empty")

        # No usable .bbl available - return None to trigger the PDF fallback
        if bib_content:
            bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
            logger.debug(f"Skipping .bib file ({bib_entry_count} entries) - unreliable, falling back to PDF extraction")

        logger.debug(f"No usable .bbl file found for ArXiv paper {arxiv_id}")

    return None
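
# Editor's sketch of how this entry point is presumably driven (not part of
# the package; SimpleNamespace stands in for refchecker's real Paper class,
# and the call requires network access):
def _example_get_bibtex():
    from types import SimpleNamespace
    paper = SimpleNamespace(pdf_url="https://arxiv.org/pdf/1706.03762")
    return get_bibtex_content(paper)  # None unless a non-empty .bbl is found
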

refchecker/utils/author_utils.py
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
"""
Author comparison utilities for ArXiv Reference Checker
"""

import re
import logging
from .text_utils import normalize_text

logger = logging.getLogger(__name__)

def levenshtein_distance(s1, s2):
    """
    Calculate the Levenshtein distance between two strings.

    Args:
        s1, s2: Strings to compare

    Returns:
        Integer distance
    """
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)

    if len(s2) == 0:
        return len(s1)

    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]

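
# Editor's sketch (not part of the package): one substitution separates
# "Smith" from "Smyth", one insertion separates "Jon" from "John".
def _example_levenshtein():
    assert levenshtein_distance("Smith", "Smyth") == 1
    assert levenshtein_distance("Jon", "John") == 1
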
def compare_authors(cited_authors, correct_authors, threshold=0.8):
    """
    Compare two author lists and return similarity metrics.

    Args:
        cited_authors: List of authors as cited
        correct_authors: List of correct authors
        threshold: Similarity threshold (0-1)

    Returns:
        Dictionary with comparison results
    """
    if not cited_authors or not correct_authors:
        return {
            'match': False,
            'similarity': 0.0,
            'details': 'One or both author lists empty'
        }

    # Handle "et al." cases
    cited_has_et_al = any('et al' in author.lower() for author in cited_authors)
    correct_has_et_al = len(correct_authors) > 3

    if cited_has_et_al or correct_has_et_al:
        # Compare only the first few authors
        cited_main = [a for a in cited_authors if 'et al' not in a.lower()][:3]
        correct_main = correct_authors[:3]

        if len(cited_main) == 0:
            return {
                'match': True,  # "et al." without specific authors
                'similarity': 0.9,
                'details': 'Et al. reference'
            }
    else:
        cited_main = cited_authors
        correct_main = correct_authors

    # Calculate similarities for each cited author
    similarities = []
    matched_authors = 0

    for cited_author in cited_main:
        cited_norm = normalize_text(cited_author)
        best_similarity = 0.0
        best_match = ''

        for correct_author in correct_main:
            correct_norm = normalize_text(correct_author)

            # Calculate similarity
            if cited_norm == correct_norm:
                similarity = 1.0
            elif cited_norm in correct_norm or correct_norm in cited_norm:
                similarity = 0.9
            else:
                # Use the Levenshtein distance
                max_len = max(len(cited_norm), len(correct_norm))
                if max_len == 0:
                    similarity = 1.0
                else:
                    distance = levenshtein_distance(cited_norm, correct_norm)
                    similarity = 1.0 - (distance / max_len)

            if similarity > best_similarity:
                best_similarity = similarity
                best_match = correct_author

        similarities.append({
            'cited': cited_author,
            'matched': best_match,
            'similarity': best_similarity
        })

        if best_similarity >= threshold:
            matched_authors += 1

    # Calculate the overall match
    if len(cited_main) == 0:
        overall_similarity = 0.0
    else:
        overall_similarity = matched_authors / len(cited_main)

    # Determine whether it's a match
    is_match = overall_similarity >= threshold

    # Handle author count mismatch
    count_penalty = 0
    if len(cited_main) != len(correct_main):
        count_diff = abs(len(cited_main) - len(correct_main))
        count_penalty = min(0.1 * count_diff, 0.3)  # Max 30% penalty
        overall_similarity = max(0, overall_similarity - count_penalty)

    details = f"Matched {matched_authors}/{len(cited_main)} authors"
    if count_penalty > 0:
        details += f", count mismatch penalty: {count_penalty:.1f}"

    return {
        'match': is_match,
        'similarity': overall_similarity,
        'details': details,
        'author_matches': similarities
    }

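
# Editor's sketch (not part of the package): "et al." on the cited side
# truncates the comparison to the first three authors of each list. The
# exact score depends on text_utils.normalize_text, so no value is asserted.
def _example_compare_authors():
    result = compare_authors(
        ["A. Vaswani", "N. Shazeer", "et al."],
        ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Jakob Uszkoreit"],
    )
    print(result['match'], round(result['similarity'], 2))
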
def extract_authors_list(authors_text):
    """
    Extract a list of authors from text.

    Args:
        authors_text: String containing author names

    Returns:
        List of cleaned author names
    """
    if not isinstance(authors_text, str):
        return []

    # Remove common prefixes
    authors_text = re.sub(r'^(by|authors?:)\s*', '', authors_text, flags=re.IGNORECASE)

    # Split on common separators
    separators = [',', ';', ' and ', ' & ', '\n']
    authors = [authors_text]

    for sep in separators:
        new_authors = []
        for author in authors:
            new_authors.extend([a.strip() for a in author.split(sep)])
        authors = new_authors

    # Clean each author name
    from .text_utils import clean_author_name
    cleaned_authors = []
    for author in authors:
        if author.strip():  # Skip empty strings
            cleaned = clean_author_name(author)
            if cleaned:  # Only add non-empty cleaned names
                cleaned_authors.append(cleaned)

    return cleaned_authors
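
# Editor's sketch (not part of the package): the "by" prefix is stripped and
# every separator pass applies, so mixed comma/"and" lists come apart; the
# exact output depends on text_utils.clean_author_name.
def _example_extract_authors():
    print(extract_authors_list("by Jane Doe, John Smith and Alice Jones"))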