academic_refchecker-2.0.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_refchecker-2.0.7.dist-info/METADATA +738 -0
- academic_refchecker-2.0.7.dist-info/RECORD +64 -0
- academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
- academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
- academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
- academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +64 -0
- backend/concurrency.py +100 -0
- backend/database.py +711 -0
- backend/main.py +1367 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/static/assets/index-2P6L_39v.css +1 -0
- backend/static/assets/index-hk21nqxR.js +25 -0
- backend/static/favicon.svg +6 -0
- backend/static/index.html +15 -0
- backend/static/vite.svg +1 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +3 -0
- refchecker/checkers/__init__.py +17 -0
- refchecker/checkers/crossref.py +541 -0
- refchecker/checkers/enhanced_hybrid_checker.py +563 -0
- refchecker/checkers/github_checker.py +326 -0
- refchecker/checkers/local_semantic_scholar.py +540 -0
- refchecker/checkers/openalex.py +513 -0
- refchecker/checkers/openreview_checker.py +984 -0
- refchecker/checkers/pdf_paper_checker.py +493 -0
- refchecker/checkers/semantic_scholar.py +764 -0
- refchecker/checkers/webpage_checker.py +938 -0
- refchecker/config/__init__.py +1 -0
- refchecker/config/logging.conf +36 -0
- refchecker/config/settings.py +170 -0
- refchecker/core/__init__.py +7 -0
- refchecker/core/db_connection_pool.py +141 -0
- refchecker/core/parallel_processor.py +415 -0
- refchecker/core/refchecker.py +5838 -0
- refchecker/database/__init__.py +6 -0
- refchecker/database/download_semantic_scholar_db.py +1725 -0
- refchecker/llm/__init__.py +0 -0
- refchecker/llm/base.py +376 -0
- refchecker/llm/providers.py +911 -0
- refchecker/scripts/__init__.py +1 -0
- refchecker/scripts/start_vllm_server.py +121 -0
- refchecker/services/__init__.py +8 -0
- refchecker/services/pdf_processor.py +268 -0
- refchecker/utils/__init__.py +27 -0
- refchecker/utils/arxiv_utils.py +462 -0
- refchecker/utils/author_utils.py +179 -0
- refchecker/utils/biblatex_parser.py +584 -0
- refchecker/utils/bibliography_utils.py +332 -0
- refchecker/utils/bibtex_parser.py +411 -0
- refchecker/utils/config_validator.py +262 -0
- refchecker/utils/db_utils.py +210 -0
- refchecker/utils/doi_utils.py +190 -0
- refchecker/utils/error_utils.py +482 -0
- refchecker/utils/mock_objects.py +211 -0
- refchecker/utils/text_utils.py +5057 -0
- refchecker/utils/unicode_utils.py +335 -0
- refchecker/utils/url_utils.py +307 -0
refchecker/utils/bibliography_utils.py

@@ -0,0 +1,332 @@
+#!/usr/bin/env python3
+"""
+Bibliography extraction and parsing utilities.
+
+This module provides utilities for extracting and parsing bibliographies from
+academic papers in various formats (LaTeX, BibTeX, PDF text, etc.).
+"""
+
+import re
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+
+def extract_text_from_latex(latex_file_path):
+    """
+    Extract text from a LaTeX file
+
+    Args:
+        latex_file_path: Path to the LaTeX file
+
+    Returns:
+        String containing the LaTeX file content
+    """
+    try:
+        logger.info(f"Reading LaTeX file: {latex_file_path}")
+        with open(latex_file_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        logger.info(f"Successfully read LaTeX file with {len(content)} characters")
+        return content
+    except UnicodeDecodeError:
+        # Try with latin-1 encoding if utf-8 fails
+        try:
+            logger.warning(f"UTF-8 encoding failed for {latex_file_path}, trying latin-1")
+            with open(latex_file_path, 'r', encoding='latin-1') as f:
+                content = f.read()
+            logger.info(f"Successfully read LaTeX file with latin-1 encoding")
+            return content
+        except Exception as e:
+            logger.error(f"Failed to read LaTeX file {latex_file_path}: {e}")
+            return None
+    except Exception as e:
+        logger.error(f"Failed to read LaTeX file {latex_file_path}: {e}")
+        return None
+
+
+def find_bibliography_section(text):
+    """
+    Find the bibliography section in the text
+    """
+    if not text:
+        logger.warning("No text provided to find_bibliography_section")
+        return None
+
+    # Log a sample of the text for debugging
+    text_sample = text[:500] + "..." if len(text) > 500 else text
+    logger.debug(f"Text sample: {text_sample}")
+
+    # Common section titles for bibliography
+    section_patterns = [
+        # Patterns for numbered sections with potential spacing issues from PDF extraction
+        r'(?i)\d+\s*ref\s*er\s*ences\s*\n', # "12 Refer ences" with spaces
+        r'(?i)\d+\s*references\s*\n', # "12References" or "12 References"
+        r'(?i)^\s*\d+\.\s*references\s*$', # Numbered section: "7. References"
+        r'(?i)\d+\s+references\s*\.', # "9 References." format used in Georgia Tech paper
+        # Standard reference patterns
+        r'(?i)references\s*\n',
+        r'(?i)bibliography\s*\n',
+        r'(?i)works cited\s*\n',
+        r'(?i)literature cited\s*\n',
+        r'(?i)references\s*$', # End of document
+        r'(?i)\[\s*references\s*\]', # [References]
+        r'(?i)^\s*references\s*$', # References as a standalone line
+        r'(?i)^\s*bibliography\s*$', # Bibliography as a standalone line
+        r'(?i)references\s*and\s*citations', # References and Citations
+        r'(?i)cited\s*references', # Cited References
+        r'(?i)reference\s*list', # Reference List
+    ]
+
+    bibliography_start = None
+    matched_pattern = None
+
+    for pattern in section_patterns:
+        matches = re.search(pattern, text, re.MULTILINE)
+        if matches:
+            bibliography_start = matches.end()
+            matched_pattern = pattern
+            logger.debug(f"Bibliography section found using pattern: {pattern}")
+            break
+
+    if bibliography_start is None:
+        logger.debug("No bibliography section header found, trying end-of-document approach")
+        # Try to find bibliography at the end of the document without explicit headers
+        lines = text.split('\n')
+        for i in range(len(lines) - 1, max(0, len(lines) - 100), -1): # Check last 100 lines
+            line = lines[i].strip()
+            if re.match(r'^\[\d+\]', line) or re.match(r'^\d+\.', line):
+                # Found what looks like reference entries
+                bibliography_start = text.rfind('\n'.join(lines[i:]))
+                logger.debug(f"Bibliography section found at end of document starting with: {line[:50]}")
+                break
+
+    if bibliography_start is not None:
+        bibliography_text = text[bibliography_start:].strip()
+        logger.debug(f"Bibliography text length: {len(bibliography_text)}")
+
+        # Optional: Try to find the end of the bibliography section
+        # This is challenging because it might go to the end of the document
+        # or be followed by appendices, acknowledgments, etc.
+
+        return bibliography_text
+
+    logger.debug("Bibliography section not found")
+    return None
+
+
+def parse_references(bibliography_text):
+    """
+    Parse references from bibliography text using multiple parsing strategies.
+
+    Args:
+        bibliography_text: String containing bibliography content
+
+    Returns:
+        List of parsed reference dictionaries
+    """
+    if not bibliography_text:
+        logger.warning("No bibliography text provided to parse_references")
+        return []
+
+    # Try different parsing strategies in order of preference
+    parsing_strategies = [
+        ('BibTeX', _parse_bibtex_references),
+        ('biblatex', _parse_biblatex_references),
+        ('ACM/natbib', _parse_standard_acm_natbib_references),
+        ('regex-based', _parse_references_regex)
+    ]
+
+    for strategy_name, parse_func in parsing_strategies:
+        try:
+            logger.debug(f"Attempting {strategy_name} parsing")
+            references = parse_func(bibliography_text)
+            if references and len(references) > 0:
+                logger.info(f"Successfully parsed {len(references)} references using {strategy_name} format")
+                return references
+            else:
+                logger.debug(f"{strategy_name} parsing returned no references")
+        except Exception as e:
+            logger.debug(f"{strategy_name} parsing failed: {e}")
+            continue
+
+    logger.warning("All parsing strategies failed to extract references")
+    return []
+
+
+def _parse_bibtex_references(bibliography_text):
+    """
+    Parse BibTeX formatted references like @inproceedings{...}, @article{...}, etc.
+
+    Args:
+        bibliography_text: String containing BibTeX entries
+
+    Returns:
+        List of reference dictionaries
+    """
+    from refchecker.utils.bibtex_parser import parse_bibtex_entries
+    return parse_bibtex_entries(bibliography_text)
+
+
+def _parse_biblatex_references(bibliography_text):
+    """
+    Parse biblatex formatted references like [1] Author. "Title". In: Venue. Year.
+
+    Args:
+        bibliography_text: String containing biblatex .bbl entries
+
+    Returns:
+        List of reference dictionaries
+    """
+    from refchecker.utils.text_utils import extract_latex_references
+    return extract_latex_references(bibliography_text)
+
+
+def _parse_standard_acm_natbib_references(bibliography_text):
+    """
+    Parse references using regex for standard ACM/natbib format (both ACM Reference Format and simple natbib)
+    """
+    from refchecker.utils.text_utils import detect_standard_acm_natbib_format
+
+    references = []
+
+    # Check if this is standard ACM natbib format
+    format_info = detect_standard_acm_natbib_format(bibliography_text)
+    if format_info['is_acm_natbib']:
+        logger.debug("Detected standard ACM natbib format")
+
+        # Split by reference entries
+        ref_pattern = r'\[(\d+)\]\s*'
+        entries = re.split(ref_pattern, bibliography_text)[1:] # Skip first empty element
+
+        for i in range(0, len(entries), 2):
+            if i + 1 < len(entries):
+                ref_num = entries[i]
+                ref_content = entries[i + 1].strip()
+
+                try:
+                    reference = _parse_simple_natbib_format(int(ref_num), ref_content, f"[{ref_num}]")
+                    if reference:
+                        references.append(reference)
+                        logger.debug(f"Parsed reference {ref_num}: {reference.get('title', 'No title')[:50]}...")
+                except Exception as e:
+                    logger.debug(f"Error parsing reference {ref_num}: {e}")
+                    continue
+
+        logger.debug(f"ACM natbib parsing extracted {len(references)} references")
+
+    return references
+
+
+def _parse_simple_natbib_format(ref_num, content, label):
+    """
+    Parse a simple natbib format reference entry.
+
+    Args:
+        ref_num: Reference number
+        content: Reference content text
+        label: Reference label (e.g., "[1]")
+
+    Returns:
+        Dictionary containing parsed reference information
+    """
+    from refchecker.utils.text_utils import extract_url_from_reference, extract_year_from_reference
+
+    # Basic parsing - this could be enhanced with more sophisticated NLP
+    reference = {
+        'raw_text': content,
+        'label': label,
+        'type': 'unknown'
+    }
+
+    # Try to extract basic information
+    # This is a simplified parser - real parsing would be much more complex
+
+    # Look for URL
+    url = extract_url_from_reference(content)
+    if url:
+        reference['url'] = url
+
+    # Look for year
+    year = extract_year_from_reference(content)
+    if year:
+        reference['year'] = year
+
+    # Try to identify the type based on content
+    content_lower = content.lower()
+    if 'proceedings' in content_lower or 'conference' in content_lower:
+        reference['type'] = 'inproceedings'
+    elif 'journal' in content_lower or 'trans.' in content_lower:
+        reference['type'] = 'article'
+    elif 'arxiv' in content_lower:
+        reference['type'] = 'misc'
+        reference['note'] = 'arXiv preprint'
+
+    return reference
+
+
+def _parse_references_regex(bibliography_text):
+    """
+    Parse references using regex-based approach (original implementation)
+    """
+    references = []
+
+    # Split bibliography into individual references
+    # Look for patterns like [1], [2], etc.
+    ref_pattern = r'\[(\d+)\](.*?)(?=\[\d+\]|$)'
+    matches = re.findall(ref_pattern, bibliography_text, re.DOTALL)
+
+    for ref_num, ref_content in matches:
+        ref_content = ref_content.strip()
+        if not ref_content:
+            continue
+
+        reference = {
+            'raw_text': ref_content,
+            'label': f"[{ref_num}]",
+            'type': 'unknown'
+        }
+
+        # Basic information extraction
+        from refchecker.utils.text_utils import extract_url_from_reference, extract_year_from_reference
+
+        url = extract_url_from_reference(ref_content)
+        if url:
+            reference['url'] = url
+
+        year = extract_year_from_reference(ref_content)
+        if year:
+            reference['year'] = year
+
+        references.append(reference)
+
+    return references
+
+
+def _is_bibtex_surname_given_format(surname_part, given_part):
+    """
+    Check if this appears to be a BibTeX "Surname, Given" format.
+
+    Args:
+        surname_part: The part before the comma
+        given_part: The part after the comma
+
+    Returns:
+        Boolean indicating if this looks like BibTeX name format
+    """
+    # Simple heuristics to detect BibTeX format
+    if not surname_part or not given_part:
+        return False
+
+    # Check if surname looks like a surname (capitalized, not too long)
+    if not re.match(r'^[A-Z][a-zA-Z\s\-\']+$', surname_part.strip()):
+        return False
+
+    # Check if given part looks like given names (often abbreviated)
+    given_clean = given_part.strip()
+    if re.match(r'^[A-Z](\.\s*[A-Z]\.?)*$', given_clean): # Like "J. R." or "M. K."
+        return True
+    if re.match(r'^[A-Z][a-z]+(\s+[A-Z][a-z]*)*$', given_clean): # Like "John Robert"
+        return True
+
+    return False