academic-refchecker 2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_refchecker-2.0.7.dist-info/METADATA +738 -0
- academic_refchecker-2.0.7.dist-info/RECORD +64 -0
- academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
- academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
- academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
- academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +64 -0
- backend/concurrency.py +100 -0
- backend/database.py +711 -0
- backend/main.py +1367 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/static/assets/index-2P6L_39v.css +1 -0
- backend/static/assets/index-hk21nqxR.js +25 -0
- backend/static/favicon.svg +6 -0
- backend/static/index.html +15 -0
- backend/static/vite.svg +1 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +3 -0
- refchecker/checkers/__init__.py +17 -0
- refchecker/checkers/crossref.py +541 -0
- refchecker/checkers/enhanced_hybrid_checker.py +563 -0
- refchecker/checkers/github_checker.py +326 -0
- refchecker/checkers/local_semantic_scholar.py +540 -0
- refchecker/checkers/openalex.py +513 -0
- refchecker/checkers/openreview_checker.py +984 -0
- refchecker/checkers/pdf_paper_checker.py +493 -0
- refchecker/checkers/semantic_scholar.py +764 -0
- refchecker/checkers/webpage_checker.py +938 -0
- refchecker/config/__init__.py +1 -0
- refchecker/config/logging.conf +36 -0
- refchecker/config/settings.py +170 -0
- refchecker/core/__init__.py +7 -0
- refchecker/core/db_connection_pool.py +141 -0
- refchecker/core/parallel_processor.py +415 -0
- refchecker/core/refchecker.py +5838 -0
- refchecker/database/__init__.py +6 -0
- refchecker/database/download_semantic_scholar_db.py +1725 -0
- refchecker/llm/__init__.py +0 -0
- refchecker/llm/base.py +376 -0
- refchecker/llm/providers.py +911 -0
- refchecker/scripts/__init__.py +1 -0
- refchecker/scripts/start_vllm_server.py +121 -0
- refchecker/services/__init__.py +8 -0
- refchecker/services/pdf_processor.py +268 -0
- refchecker/utils/__init__.py +27 -0
- refchecker/utils/arxiv_utils.py +462 -0
- refchecker/utils/author_utils.py +179 -0
- refchecker/utils/biblatex_parser.py +584 -0
- refchecker/utils/bibliography_utils.py +332 -0
- refchecker/utils/bibtex_parser.py +411 -0
- refchecker/utils/config_validator.py +262 -0
- refchecker/utils/db_utils.py +210 -0
- refchecker/utils/doi_utils.py +190 -0
- refchecker/utils/error_utils.py +482 -0
- refchecker/utils/mock_objects.py +211 -0
- refchecker/utils/text_utils.py +5057 -0
- refchecker/utils/unicode_utils.py +335 -0
- refchecker/utils/url_utils.py +307 -0
refchecker/utils/biblatex_parser.py
@@ -0,0 +1,584 @@
#!/usr/bin/env python3
"""
Biblatex format parser utility

Handles parsing of biblatex .bbl format references like:
[1] Author et al. "Title". In: Venue. Year.
[43] Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez.
Gorilla: Large Language Model Connected with Massive APIs. 2023. arXiv: 2305.15334 [cs.CL].
"""

import re
import logging
from typing import List, Dict, Any

logger = logging.getLogger(__name__)


def _handle_hyphenated_line_breaks(content: str) -> str:
    """
    Intelligently handle hyphenated words split across lines.

    Distinguishes between:
    - Syllable breaks: "Christo-\npher" -> "Christopher" (remove hyphen)
    - Compound words: "Browser-\nassisted" -> "Browser-assisted" (keep hyphen)

    Args:
        content: Text content with potential hyphenated line breaks

    Returns:
        Content with appropriate hyphen handling
    """
    # Find all hyphen + line break patterns
    hyphen_matches = list(re.finditer(r'(\w+)-\s*\n\s*(\w+)', content))

    # Process matches in reverse order to avoid offset issues
    for match in reversed(hyphen_matches):
        before_word = match.group(1)
        after_word = match.group(2)

        # Determine if this is a syllable break or a compound word
        if _is_syllable_break(before_word, after_word):
            # Remove hyphen for syllable breaks
            replacement = before_word + after_word
        else:
            # Keep hyphen for compound words
            replacement = before_word + '-' + after_word

        # Replace in content
        start, end = match.span()
        content = content[:start] + replacement + content[end:]

    return content
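

# Example (illustrative): the two docstring cases above, traced through
# _handle_hyphenated_line_breaks.
#
#   >>> _handle_hyphenated_line_breaks("Christo-\npher")
#   'Christopher'
#   >>> _handle_hyphenated_line_breaks("Browser-\nassisted")
#   'Browser-assisted'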


def _is_syllable_break(before_word: str, after_word: str) -> bool:
    """
    Determine if a hyphen represents a syllable break vs a compound word.

    Args:
        before_word: Word part before the hyphen
        after_word: Word part after the hyphen

    Returns:
        True if this appears to be a syllable break, False if compound word
    """
    # Convert to lowercase for analysis
    before_lower = before_word.lower()
    after_lower = after_word.lower()

    # Common patterns that indicate syllable breaks (should remove hyphen)
    syllable_break_patterns = [
        # Name patterns - first part looks like a truncated first name, second part like a surname
        (len(before_lower) <= 8 and before_word[0].isupper() and
         len(after_lower) >= 3 and after_word[0].islower()),

        # Common word ending/beginning patterns for syllable breaks
        (before_lower.endswith(('ing', 'tion', 'sion', 'ness', 'ment', 'ful', 'less', 'ity', 'ies', 'ly', 'ed')) and
         len(after_lower) <= 4),

        # Short fragments that are likely syllable breaks
        (len(before_lower) <= 4 and len(after_lower) <= 4),

        # Common prefixes that typically form single words/suffixes
        (before_lower in ['pre', 'post', 'anti', 'co', 'sub', 'out', 'up', 'non', 'dis', 'mis', 'un', 'in', 're'] or
         after_lower.startswith(('ing', 'ed', 'er', 'est', 'ly', 'ness', 'ment', 'ful', 'less', 'ism', 'ist', 'ity'))),
    ]

    # Common patterns that indicate compound words (should keep hyphen)
    compound_word_patterns = [
        # Both parts are substantial words (likely compound)
        (len(before_lower) >= 5 and len(after_lower) >= 5),

        # Technical/academic compound words
        (before_lower in ['browser', 'question', 'self', 'multi', 'cross', 'inter', 'state', 'real', 'end'] or
         after_lower in ['assisted', 'answering', 'aware', 'based', 'driven', 'oriented', 'time', 'world', 'user']),

        # Common compound word patterns
        (before_lower.endswith('er') and len(before_lower) >= 4 and len(after_lower) >= 6),

        # Both words start with a capital (likely proper nouns or technical terms)
        (before_word[0].isupper() and after_word[0].isupper() and
         len(before_word) >= 4 and len(after_word) >= 4),
    ]

    # Check compound word patterns first (more specific)
    for pattern in compound_word_patterns:
        if pattern:
            return False  # Keep hyphen (compound word)

    # Check syllable break patterns
    for pattern in syllable_break_patterns:
        if pattern:
            return True  # Remove hyphen (syllable break)

    # Default: if uncertain, lean towards compound word to preserve meaning
    # This is safer than incorrectly joining compound words
    return False
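

# Note (illustrative): each "pattern" above is a boolean evaluated when the
# list is built, so the loops amount to any(); compound-word checks win
# because they run first.
#
#   >>> _is_syllable_break("Christo", "pher")      # truncated name + lowercase tail
#   True
#   >>> _is_syllable_break("Browser", "assisted")  # two substantial words
#   False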


def detect_biblatex_format(text: str) -> bool:
    """
    Detect if text contains biblatex .bbl format references

    Args:
        text: Text to analyze

    Returns:
        True if biblatex format detected, False otherwise
    """
    # Look for biblatex patterns like [1] Author. "Title".
    # This is different from BibTeX (@article{}) and standard numbered lists

    # Must have the biblatex auxiliary file marker or a numbered reference pattern
    has_biblatex_marker = 'biblatex auxiliary file' in text
    has_numbered_refs = bool(re.search(r'^\[\d+\]\s+[A-Z]', text, re.MULTILINE))

    return has_biblatex_marker or has_numbered_refs
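

# Example (illustrative): detection keys on a line-initial "[n] Capital"
# reference or on the 'biblatex auxiliary file' marker found at the top of
# generated .bbl files.
#
#   >>> detect_biblatex_format('[1] A. Author. "A Title". In: Venue. 2023.')
#   True
#   >>> detect_biblatex_format('@article{key, title={A Title}}')
#   False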


def _validate_parsing_quality(references: List[Dict[str, Any]]) -> bool:
    """
    Validate that biblatex parsing results are of acceptable quality.
    If quality is poor, we should fall back to LLM parsing instead.

    Args:
        references: List of parsed reference dictionaries

    Returns:
        True if parsing quality is acceptable, False if we should fall back to the LLM
    """
    if not references:
        return False

    # Count problematic entries
    unknown_authors = 0
    unknown_titles = 0
    total_entries = len(references)

    for ref in references:
        authors = ref.get('authors', [])
        title = ref.get('title', '')

        # Check for "Unknown Author" entries
        if not authors or authors == ['Unknown Author']:
            unknown_authors += 1

        # Check for "Unknown Title" entries
        if not title or title == 'Unknown Title':
            unknown_titles += 1

    # Calculate failure rates
    author_failure_rate = unknown_authors / total_entries
    title_failure_rate = unknown_titles / total_entries

    # Quality threshold - if more than 20% of entries have parsing failures,
    # fall back to the LLM, which is more robust
    MAX_ACCEPTABLE_FAILURE_RATE = 0.2

    if author_failure_rate > MAX_ACCEPTABLE_FAILURE_RATE:
        logger.debug(f"Biblatex parsing quality poor: {author_failure_rate:.1%} unknown authors (>{MAX_ACCEPTABLE_FAILURE_RATE:.0%}). Falling back to LLM.")
        return False

    if title_failure_rate > MAX_ACCEPTABLE_FAILURE_RATE:
        logger.debug(f"Biblatex parsing quality poor: {title_failure_rate:.1%} unknown titles (>{MAX_ACCEPTABLE_FAILURE_RATE:.0%}). Falling back to LLM.")
        return False

    logger.debug(f"Biblatex parsing quality acceptable: {author_failure_rate:.1%} unknown authors, {title_failure_rate:.1%} unknown titles")
    return True
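

# Example (illustrative): the 20% gate is strict, so one failure in five
# entries still passes while one in four does not.
#
#   >>> ok = [{'title': 'T', 'authors': ['A. Author']}] * 4
#   >>> bad = [{'title': 'Unknown Title', 'authors': ['Unknown Author']}]
#   >>> _validate_parsing_quality(ok + bad)      # 1/5 = 20%, not > 20%
#   True
#   >>> _validate_parsing_quality(ok[:3] + bad)  # 1/4 = 25% > 20%
#   False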


def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
    """
    Parse biblatex formatted references into a structured format

    Args:
        text: String containing biblatex .bbl entries

    Returns:
        List of structured reference dictionaries, or an empty list if
        parsing quality is poor (to trigger the LLM fallback)
    """
    if not text or not detect_biblatex_format(text):
        return []

    references = []

    # First split by entries to handle them individually
    # This is more robust than a single regex for the entire text
    # Use ^ to ensure we only match entries at the start of a line (bibliography entries)
    entry_starts = []
    for match in re.finditer(r'^\[(\d+)\]', text, re.MULTILINE):
        entry_starts.append((int(match.group(1)), match.start(), match.end()))

    # Sort by entry number to ensure correct order
    entry_starts.sort()

    matches = []
    for i, (entry_num, start, end) in enumerate(entry_starts):
        # Find the content between this entry and the next (or the end of the text)
        if i + 1 < len(entry_starts):
            next_start = entry_starts[i + 1][1]
            raw_content = text[end:next_start].strip()
        else:
            # Last entry - take everything to the end, but be smart about stopping
            remaining = text[end:].strip()
            # Stop at obvious document structure markers
            stop_patterns = [
                r'\n\d+\n',  # Page numbers
                r'\nChecklist\n',
                r'\nA Additional Details',
                r'\nAppendix',
                r'\n\d+\. For all authors',
            ]

            min_stop = len(remaining)
            for pattern in stop_patterns:
                match = re.search(pattern, remaining)
                if match and match.start() < min_stop:
                    min_stop = match.start()

            raw_content = remaining[:min_stop].strip()

        # Clean up content - handle cases where the entry might be incomplete or malformed
        if raw_content:
            # Remove stray closing brackets or incomplete markers
            content = raw_content
            # Remove a trailing "]" if it's the only thing on the last line
            lines = content.split('\n')
            if len(lines) > 1 and lines[-1].strip() == ']':
                content = '\n'.join(lines[:-1]).strip()
            elif content.strip() == ']':
                # If the content is only "]", skip this entry as it's incomplete
                continue

            matches.append((entry_num, content))

    for entry_num, content in matches:
        if not content:
            continue

        # The content should already be clean from the improved extraction
        # Just do minimal cleaning - remove any obvious appendix content, but don't be too aggressive

        # Debug logging for specific entries
        if entry_num == 74:
            logger.debug(f"Entry [74] content being parsed: {repr(content[:200])}...")

        # Parse the biblatex entry content
        parsed_ref = parse_biblatex_entry_content(str(entry_num), content)

        # Debug logging for results
        if entry_num == 74 and parsed_ref:
            logger.debug(f"Entry [74] parsing result: title={repr(parsed_ref.get('title'))}, authors={len(parsed_ref.get('authors', []))}")

        if parsed_ref:
            references.append(parsed_ref)

    logger.debug(f"Extracted {len(references)} biblatex references")

    # Validate parsing quality - if poor, return an empty list to trigger the LLM fallback
    if not _validate_parsing_quality(references):
        return []

    return references
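

# End-to-end sketch (illustrative; assumes the refchecker package and its
# text_utils/doi_utils helpers are importable, since the entry parser
# imports them):
#
#   bbl = (
#       '[1] Jane Doe and John Smith. "A Study of Parsing". In: Proc. of XYZ. 2021.\n'
#       '[2] Andrej Karpathy. Intro to Large Language Models. https://example.org 2023.'
#   )
#   refs = parse_biblatex_references(bbl)
#   # -> a list of dicts with keys such as 'title', 'authors', 'year' and
#   #    'url', or [] when the quality gate rejects the parse so the caller
#   #    can fall back to LLM-based extraction.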


def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]:
    """
    Parse the content of a single biblatex entry

    Args:
        entry_num: The reference number (e.g., "1", "43")
        content: The full content after the [number]

    Returns:
        Dictionary with parsed entry data
    """
    from refchecker.utils.text_utils import parse_authors_with_initials, clean_title
    from refchecker.utils.doi_utils import construct_doi_url, is_valid_doi_format

    # Initialize default values
    title = ""
    authors = []
    year = None
    journal = ""
    doi = ""
    url = ""

    # Normalize whitespace and remove line breaks
    # Handle hyphenated words split across lines with intelligence to distinguish
    # between syllable breaks (remove hyphen) and compound words (keep hyphen)
    content = _handle_hyphenated_line_breaks(content)
    # Then normalize all other whitespace
    content = re.sub(r'\s+', ' ', content.strip())

    # Pattern matching for different biblatex formats:

    # 1. Try to extract the title - can be in quotes or appear as capitalized text after the authors
    # Handle both regular quotes (") and smart quotes (“, ”)
    title_match = re.search(r'["\u201c\u201d]([^"\u201c\u201d]+)["\u201c\u201d]', content)
    if title_match:
        raw_title = title_match.group(1)
        title = clean_title(raw_title)
    else:
        # If there is no quoted title, look for a title after the author names
        # Pattern: "FirstAuthor et al. Title Goes Here. Year." or "Author. Title. Year."
        # Order matters: more specific patterns first
        title_patterns = [
            # Pattern for unquoted books: "Author1 and Author2, Title: Subtitle. Location: Publisher, Year."
            r'(?:and\s+[A-Z][^,]*),\s+([A-Z][^.]*?:\s*[^.]*?)\.\s+[A-Z][^:]*:\s*[^,]*,\s*\d{4}',
            r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})',  # "Authors. Title. URL/arXiv/Year" (flexible spacing) - MOST SPECIFIC
            r'\.([A-Z][A-Za-z\s]+(?:\?|!)?)\.?\s+\d{4}',  # ".Title. Year" - for cases where the authors end without a space
            r'[A-Z][a-z]+\.([A-Z][A-Za-z\s\-&]+?)\.\s+\d{4}',  # "Name.Title. Year" - missing space after the period
            r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}',  # "Author et al. Title. Year" - LESS SPECIFIC
            r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}',  # "Name, Name. Title. Year"
            r'\b([A-Z][A-Za-z\s\-0-9]+)\s+\.\s+https',  # "Title . https" - handle a space before the period
        ]

        for pattern in title_patterns:
            title_match = re.search(pattern, content)
            if title_match:
                potential_title = title_match.group(1)
                # Make sure it looks like a title and not author names
                # Be specific about author name patterns - should be "Surname, Initial", not "Word, Word"
                author_like_pattern = r'^[A-Z][a-z]+,\s*[A-Z]\.?$'  # "Smith, J." or "Smith, J"
                multi_word_author = r'^[A-Z][a-z]+,\s*[A-Z][a-z]+$'  # "Smith, John" - but still reject this

                is_author_like = (re.match(author_like_pattern, potential_title) or
                                  re.match(multi_word_author, potential_title))

                if len(potential_title) > 2 and not is_author_like:
                    title = clean_title(potential_title)
                    break

    # 2. Extract the year - prioritize a year in parentheses over arXiv IDs
    year_patterns = [
        r'\((\d{4})\)',  # Year in parentheses like "(2024)" - most reliable
        r'\b(\d{4})\.$',  # Year at the end of a sentence like "2024."
        r'\b(20\d{2})\b',  # Recent years (2000-2099) - avoids arXiv IDs like "2403"
        r'\b(\d{4})\b',  # Any 4-digit number as a fallback
    ]

    for pattern in year_patterns:
        year_match = re.search(pattern, content)
        if year_match:
            try:
                potential_year = int(year_match.group(1))
                # Validate that it's a reasonable publication year
                if 1900 <= potential_year <= 2030:
                    year = potential_year
                    break
            except ValueError:
                continue

    # 3. Extract the DOI
    # Handle DOIs that may be split across lines or contain spaces
    doi_match = re.search(r'DOI\s*:\s*(10\.\d+/[^\s.]+(?:\.\s*\d+)*)', content, re.IGNORECASE)
    if doi_match:
        doi = doi_match.group(1)
        # Clean up the DOI - remove spaces and trailing periods
        doi = re.sub(r'\s+', '', doi).rstrip('.')
        if is_valid_doi_format(doi):
            url = construct_doi_url(doi)

    # 4. Extract an arXiv ID and construct a URL
    if not url:
        arxiv_match = re.search(r'arXiv:\s*(\d{4}\.\d{4,5}(?:v\d+)?)', content, re.IGNORECASE)
        if arxiv_match:
            arxiv_id = re.sub(r'v\d+$', '', arxiv_match.group(1))  # Remove the version suffix
            url = f"https://arxiv.org/abs/{arxiv_id}"

    # 5. Extract a URL if present
    if not url:
        url_match = re.search(r'https?://[^\s]+', content)
        if url_match:
            url = url_match.group(0).rstrip('.,')  # Remove trailing punctuation

    # 6. Extract the authors - improved to handle various biblatex patterns
    authors_text = ""

    # The key insight is that authors come first, then the title (often in quotes), then venue/year
    # Examples we need to handle:
    # 'Egor Zverev, Sahar Abdelnabi, Mario Fritz, and Christoph H Lampert. "Title". In: venue (year).'
    # 'Andrej Karpathy. Intro to Large Language Models. https://... year.'
    # 'A. Author and B. Coauthor, "Title",' <- handle this format too

    # Try multiple patterns to extract authors
    # Order matters - more specific patterns first!
    author_patterns = [
        # Pattern 1: Authors followed by a quoted title (handle both regular and smart quotes)
        r'^([^"\u201c\u201d]+?),\s*["\u201c\u201d]',  # 'Authors, "Title"' - more restrictive, requires a comma before the quote
        r'^([^"\u201c\u201d]+)\.\s*["\u201c\u201d]',  # 'Authors. "Title"' or smart quotes

        # Pattern 2: Authors followed by an unquoted title for books: "Author1 and Author2, Title:"
        r'^([^,]+(?:\s+and\s+[^,]+)?),\s+([A-Z][^.]*?):\s*([^.]*?)\.',  # "Author1 and Author2, Title: Subtitle." - book format

        # Pattern 3: Authors ending with a period, no space, then the title (missing-space case) - MORE SPECIFIC
        r'^([^.]+?)\.([A-Z][^.]*)\.',  # "Authors.Title." - missing space after the period

        # Pattern 4: Authors followed by the title, then a period, then year or venue (also captures the title)
        r'^(.+?)\.\s*([A-Z][^.]+)\.\s+(?:In:|https?://|\d{4})',  # "Authors. Title. In:/URL/Year" (allow no space after the period)

        # Pattern 5: Authors ending with a period followed by a capital letter (simpler fallback) - LEAST SPECIFIC
        r'^([^.]+?)\.\s*[A-Z]',  # Allow no space after the period
    ]

    for i, pattern in enumerate(author_patterns):
        author_match = re.search(pattern, content)
        if author_match:
            potential_authors = author_match.group(1).strip()

            # For patterns that also capture the title, extract it
            if i == 2 and not title and len(author_match.groups()) > 2:
                # Pattern 2 (book format) captures authors, title, and subtitle
                title_part = author_match.group(2).strip()
                subtitle_part = author_match.group(3).strip()
                combined_title = f"{title_part}: {subtitle_part}" if subtitle_part else title_part
                if len(combined_title) > 2:
                    title = clean_title(combined_title)
            elif (i == 3 or i == 4) and not title and len(author_match.groups()) > 1:
                # Pattern 3 (missing space, index 3) and Pattern 4 (with space, index 4) capture both authors and title
                potential_title = author_match.group(2).strip()
                if len(potential_title) > 2 and not re.match(r'^[A-Z][a-z]+,', potential_title):
                    title = clean_title(potential_title)

            # Validate that this looks like authors
            if (potential_authors and
                not potential_authors.startswith(('http', 'DOI', 'arXiv', 'In:')) and
                len(potential_authors) < 300 and
                # Should contain at least one name-like pattern
                re.search(r'[A-Z][a-z]+', potential_authors)):
                authors_text = potential_authors
                break

    # Remove trailing punctuation and clean up
    authors_text = re.sub(r'[.,;:]$', '', authors_text.strip())

    # Parse the authors
    if authors_text:
        try:
            authors = parse_authors_with_initials(authors_text)
            # Filter out overly long "authors" that are probably not just names
            authors = [a for a in authors if len(a) < 100 and not re.search(r'\b(http|www|doi|arxiv)\b', a.lower())]

            # Clean up "and" prefixes from authors (common in biblatex format)
            cleaned_authors = []
            for author in authors:
                cleaned_author = re.sub(r'^and\s+', '', author.strip())
                if cleaned_author and len(cleaned_author) > 2:
                    cleaned_authors.append(cleaned_author)

            # If we got reasonable results, use them
            if cleaned_authors and all(len(a) > 2 for a in cleaned_authors):
                authors = cleaned_authors
            else:
                authors = []  # Reset to try the fallback

        except Exception as e:
            logger.debug(f"Author parsing failed for '{authors_text}': {e}")
            authors = []

        # Fallback: split by common patterns if parse_authors_with_initials failed
        if not authors:
            if 'et al' in authors_text.lower():
                # Handle the "FirstAuthor et al." case - separate the base author from "et al"
                base_author = authors_text.split(' et al')[0].strip()
                if base_author:
                    authors = [base_author, 'et al']
            elif ' and ' in authors_text:
                # Handle the "Author1 and Author2 and Author3" format
                author_parts = [p.strip() for p in authors_text.split(' and ')]
                authors = []
                for part in author_parts:
                    part = part.strip(' ,.')
                    if part and len(part) > 2:
                        authors.append(part)
            else:
                # Try the sophisticated parsing one more time with relaxed constraints
                try:
                    # Remove "and" connectors for cleaner parsing
                    clean_text = re.sub(r'\s+and\s+', ', ', authors_text)
                    fallback_authors = parse_authors_with_initials(clean_text)
                    if fallback_authors and len(fallback_authors) >= 1:
                        authors = fallback_authors
                    else:
                        raise ValueError("Fallback parsing failed")
                except Exception:
                    # Last resort: naive comma separation for "Author1, Author2, Author3"
                    # This should rarely be reached now
                    author_parts = [p.strip() for p in authors_text.split(',')]
                    authors = []
                    for part in author_parts:
                        part = part.strip(' .')
                        # Remove an "and" prefix if present
                        if part.startswith('and '):
                            part = part[4:].strip()
                        # Skip parts that are too short or look like initials only
                        if (part and len(part) > 2 and
                            not re.search(r'\b(http|www|doi|arxiv|proceedings)\b', part.lower())):
                            authors.append(part)

    # 7. Extract the journal/venue - look for patterns like "In: Conference" or remaining text
    # Also handle cases like 'Tasks,"Adv. Neural' where a space is missing after the quote-comma
    journal_patterns = [
        r'In:\s*([^.]+?)(?:\.|$)',  # "In: Conference Name"
        r'"[^"]*,"([A-Z][^,]*?\. [A-Z][^,]*)',  # Quote-comma-venue like 'Tasks,"Adv. Neural Inf. Process. Syst.'
        r'["\u201c\u201d]([A-Z][^.]*(?:Adv\.|Proc\.|IEEE|Journal)[^.]*)',  # Missing space after a quote like 'Tasks"Adv. Neural'
        r'([A-Z][^.]*(?:Conference|Workshop|Journal|Proceedings)[^.]*)',  # Conference/journal names
    ]

    for pattern in journal_patterns:
        journal_match = re.search(pattern, content)
        if journal_match:
            potential_journal = journal_match.group(1).strip()
            # Make sure it's not just author names or a year
            if not re.match(r'^[A-Z][a-z]+,\s*[A-Z]', potential_journal) and not potential_journal.isdigit():
                journal = potential_journal
                break

    # Apply defaults if needed
    if not title:
        # Try to extract a title from the content if no quotes were found
        # Look for capitalized text that could be a title
        title_fallback_match = re.search(r'([A-Z][^.]*[a-z][^.]*)', content)
        if title_fallback_match:
            potential_title = title_fallback_match.group(1)
            # Make sure it doesn't look like author names
            if not re.search(r'[A-Z][a-z]+,\s*[A-Z]', potential_title):
                title = clean_title(potential_title)

    if not title:
        title = "Unknown Title"

    if not authors:
        authors = ["Unknown Author"]

    # Determine the reference type: an arXiv URL or an arXiv mention in the
    # title marks the entry as arXiv; otherwise any URL or DOI makes it non-arXiv
    ref_type = 'other'
    if (url and 'arxiv' in url.lower()) or 'arxiv' in title.lower():
        ref_type = 'arxiv'
    elif url or doi:
        ref_type = 'non-arxiv'

    # Create the structured reference (matching the refchecker expected format)
    reference = {
        'title': title,
        'authors': authors,
        'year': year,
        'journal': journal,
        'doi': doi,
        'url': url,
        'type': ref_type,
        'bibtex_key': f"ref{entry_num}",  # Generate a key since biblatex entries have no explicit keys
        'bibtex_type': 'biblatex',
        'raw_text': f"[{entry_num}] {content}",
        'entry_number': int(entry_num)
    }

    return reference
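

# Shape sketch (illustrative): the Gorilla entry from the module docstring,
# run through the entry parser; the exact title/author strings depend on
# clean_title and parse_authors_with_initials.
#
#   parse_biblatex_entry_content(
#       "43",
#       'Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez. '
#       'Gorilla: Large Language Model Connected with Massive APIs. 2023. '
#       'arXiv: 2305.15334 [cs.CL].'
#   )
#   # -> {'title': 'Gorilla: Large Language Model Connected with Massive APIs',
#   #     'authors': [...], 'year': 2023, 'url': 'https://arxiv.org/abs/2305.15334',
#   #     'type': 'arxiv', 'bibtex_key': 'ref43', 'bibtex_type': 'biblatex',
#   #     'entry_number': 43, ...}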