academic-refchecker 2.0.7 (academic_refchecker-2.0.7-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
@@ -0,0 +1,584 @@
+ #!/usr/bin/env python3
+ """
+ Biblatex format parser utility
+
+ Handles parsing of biblatex .bbl format references like:
+ [1] Author et al. "Title". In: Venue. Year.
+ [43] Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez.
+ Gorilla: Large Language Model Connected with Massive APIs. 2023. arXiv: 2305.15334 [cs.CL].
+ """
+
+ import re
+ import logging
+ from typing import List, Dict, Any
+
+ logger = logging.getLogger(__name__)
+
+
+ def _handle_hyphenated_line_breaks(content: str) -> str:
+     """
+     Intelligently handle hyphenated words split across lines.
+
+     Distinguishes between:
+     - Syllable breaks: "Christo-\npher" -> "Christopher" (remove hyphen)
+     - Compound words: "Browser-\nassisted" -> "Browser-assisted" (keep hyphen)
+
+     Args:
+         content: Text content with potential hyphenated line breaks
+
+     Returns:
+         Content with appropriate hyphen handling
+     """
+     # Find all hyphen + line break patterns
+     hyphen_matches = list(re.finditer(r'(\w+)-\s*\n\s*(\w+)', content))
+
+     # Process matches in reverse order to avoid offset issues
+     for match in reversed(hyphen_matches):
+         before_word = match.group(1)
+         after_word = match.group(2)
+         full_match = match.group(0)
+
+         # Determine if this is a syllable break or compound word
+         if _is_syllable_break(before_word, after_word):
+             # Remove hyphen for syllable breaks
+             replacement = before_word + after_word
+         else:
+             # Keep hyphen for compound words
+             replacement = before_word + '-' + after_word
+
+         # Replace in content
+         start, end = match.span()
+         content = content[:start] + replacement + content[end:]
+
+     return content
+
+
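+ # Usage sketch (editor's illustration, mirroring the docstring examples
+ # above; not part of the packaged module):
+ #   _handle_hyphenated_line_breaks('Christo-\npher')      # -> 'Christopher'
+ #   _handle_hyphenated_line_breaks('Browser-\nassisted')  # -> 'Browser-assisted'
+
+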
+ def _is_syllable_break(before_word: str, after_word: str) -> bool:
+     """
+     Determine if a hyphen represents a syllable break vs compound word.
+
+     Args:
+         before_word: Word part before the hyphen
+         after_word: Word part after the hyphen
+
+     Returns:
+         True if this appears to be a syllable break, False if compound word
+     """
+     # Convert to lowercase for analysis
+     before_lower = before_word.lower()
+     after_lower = after_word.lower()
+
+     # Common patterns that indicate syllable breaks (should remove hyphen)
+     syllable_break_patterns = [
+         # Name patterns - first part looks like truncated first name, second part like surname
+         (len(before_lower) <= 8 and before_word[0].isupper() and
+          len(after_lower) >= 3 and after_word[0].islower()),
+
+         # Common word ending/beginning patterns for syllable breaks
+         (before_lower.endswith(('ing', 'tion', 'sion', 'ness', 'ment', 'ful', 'less', 'ity', 'ies', 'ly', 'ed')) and
+          len(after_lower) <= 4),
+
+         # Short fragments that are likely syllable breaks
+         (len(before_lower) <= 4 and len(after_lower) <= 4),
+
+         # Common prefixes and suffixes that typically form single words
+         (before_lower in ['pre', 'post', 'anti', 'co', 'sub', 'out', 'up', 'non', 'dis', 'mis', 'un', 'in', 're'] or
+          after_lower.startswith(('ing', 'ed', 'er', 'est', 'ly', 'ness', 'ment', 'ful', 'less', 'ism', 'ist', 'ity'))),
+     ]
+
+     # Common patterns that indicate compound words (should keep hyphen)
+     compound_word_patterns = [
+         # Both parts are substantial words (likely compound)
+         (len(before_lower) >= 5 and len(after_lower) >= 5),
+
+         # Technical/academic compound words
+         (before_lower in ['browser', 'question', 'self', 'multi', 'cross', 'inter', 'state', 'real', 'end'] or
+          after_lower in ['assisted', 'answering', 'aware', 'based', 'driven', 'oriented', 'time', 'world', 'user']),
+
+         # Common compound word patterns
+         (before_lower.endswith('er') and len(before_lower) >= 4 and len(after_lower) >= 6),
+
+         # Both words start with capital (likely proper nouns or technical terms)
+         (before_word[0].isupper() and after_word[0].isupper() and
+          len(before_word) >= 4 and len(after_word) >= 4),
+     ]
+
+     # Check compound word patterns first (more specific)
+     for pattern in compound_word_patterns:
+         if pattern:
+             return False  # Keep hyphen (compound word)
+
+     # Check syllable break patterns
+     for pattern in syllable_break_patterns:
+         if pattern:
+             return True  # Remove hyphen (syllable break)
+
+     # Default: if uncertain, lean towards compound word to preserve meaning.
+     # This is safer than incorrectly joining compound words.
+     return False
+
+
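+ # Expected behaviour of the heuristics above (editor's illustration):
+ #   _is_syllable_break('Christo', 'pher')      # -> True  (join: "Christopher")
+ #   _is_syllable_break('pre', 'trained')       # -> True  (prefix list: "pretrained")
+ #   _is_syllable_break('Browser', 'assisted')  # -> False (compound list: keep hyphen)
+
+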
+ def detect_biblatex_format(text: str) -> bool:
+     """
+     Detect if text contains biblatex .bbl format references
+
+     Args:
+         text: Text to analyze
+
+     Returns:
+         True if biblatex format detected, False otherwise
+     """
+     # Look for biblatex patterns like [1] Author. "Title".
+     # This is different from BibTeX (@article{}) and standard numbered lists
+
+     # Must have the biblatex auxiliary file marker or numbered reference pattern
+     has_biblatex_marker = 'biblatex auxiliary file' in text
+     has_numbered_refs = bool(re.search(r'^\[\d+\]\s+[A-Z]', text, re.MULTILINE))
+
+     return has_biblatex_marker or has_numbered_refs
+
+
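+ # Detection sketch (editor's illustration): a line such as
+ # '[1] A. Author. "Title". In: Venue. 2023.' satisfies the
+ # ^\[\d+\]\s+[A-Z] pattern, so detect_biblatex_format returns True,
+ # while a bare BibTeX '@article{...}' block alone does not.
+
+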
+ def _validate_parsing_quality(references: List[Dict[str, Any]]) -> bool:
+     """
+     Validate that biblatex parsing results are of acceptable quality.
+     If quality is poor, we should fall back to LLM parsing instead.
+
+     Args:
+         references: List of parsed reference dictionaries
+
+     Returns:
+         True if parsing quality is acceptable, False if we should fall back to LLM
+     """
+     if not references:
+         return False
+
+     # Count problematic entries
+     unknown_authors = 0
+     unknown_titles = 0
+     total_entries = len(references)
+
+     for ref in references:
+         authors = ref.get('authors', [])
+         title = ref.get('title', '')
+
+         # Check for "Unknown Author" entries
+         if not authors or authors == ['Unknown Author']:
+             unknown_authors += 1
+
+         # Check for "Unknown Title" entries
+         if not title or title == 'Unknown Title':
+             unknown_titles += 1
+
+     # Calculate failure rates
+     author_failure_rate = unknown_authors / total_entries
+     title_failure_rate = unknown_titles / total_entries
+
+     # Quality threshold - if more than 20% of entries have parsing failures,
+     # fall back to the LLM, which is more robust
+     MAX_ACCEPTABLE_FAILURE_RATE = 0.2
+
+     if author_failure_rate > MAX_ACCEPTABLE_FAILURE_RATE:
+         logger.debug(f"Biblatex parsing quality poor: {author_failure_rate:.1%} unknown authors (>{MAX_ACCEPTABLE_FAILURE_RATE:.0%}). Falling back to LLM.")
+         return False
+
+     if title_failure_rate > MAX_ACCEPTABLE_FAILURE_RATE:
+         logger.debug(f"Biblatex parsing quality poor: {title_failure_rate:.1%} unknown titles (>{MAX_ACCEPTABLE_FAILURE_RATE:.0%}). Falling back to LLM.")
+         return False
+
+     logger.debug(f"Biblatex parsing quality acceptable: {author_failure_rate:.1%} unknown authors, {title_failure_rate:.1%} unknown titles")
+     return True
+
+
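+ # Threshold arithmetic (editor's illustration): with 4 parsed entries of
+ # which 1 has authors == ['Unknown Author'], the author failure rate is
+ # 1/4 = 25% > 20%, so _validate_parsing_quality returns False and the
+ # caller falls back to LLM parsing.
+
+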
+ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
+     """
+     Parse biblatex-formatted references into a structured format
+
+     Args:
+         text: String containing biblatex .bbl entries
+
+     Returns:
+         List of structured reference dictionaries, or empty list if
+         parsing quality is poor (to trigger LLM fallback)
+     """
+     from refchecker.utils.text_utils import parse_authors_with_initials, clean_title
+     from refchecker.utils.doi_utils import construct_doi_url, is_valid_doi_format
+
+     if not text or not detect_biblatex_format(text):
+         return []
+
+     references = []
+
+     # First split by entries to handle them individually
+     # This is more robust than a single regex for the entire text
+     # Use ^ to ensure we only match entries at start of line (bibliography entries)
+     entry_starts = []
+     for match in re.finditer(r'^\[(\d+)\]', text, re.MULTILINE):
+         entry_starts.append((int(match.group(1)), match.start(), match.end()))
+
+     # Sort by entry number to ensure correct order
+     entry_starts.sort()
+
+     matches = []
+     for i, (entry_num, start, end) in enumerate(entry_starts):
+         # Find the content between this entry and the next (or end of text)
+         if i + 1 < len(entry_starts):
+             next_start = entry_starts[i + 1][1]
+             raw_content = text[end:next_start].strip()
+         else:
+             # Last entry - take everything to the end, but be smart about stopping
+             remaining = text[end:].strip()
+             # Stop at obvious document structure markers
+             stop_patterns = [
+                 r'\n\d+\n',  # Page numbers
+                 r'\nChecklist\n',
+                 r'\nA Additional Details',
+                 r'\nAppendix',
+                 r'\n\d+\. For all authors',
+             ]
+
+             min_stop = len(remaining)
+             for pattern in stop_patterns:
+                 match = re.search(pattern, remaining)
+                 if match and match.start() < min_stop:
+                     min_stop = match.start()
+
+             raw_content = remaining[:min_stop].strip()
+
+         # Clean up content - handle cases where the entry might be incomplete or malformed
+         if raw_content:
+             # Remove stray closing brackets or incomplete markers
+             content = raw_content
+             # Remove a trailing "]" if it's the only thing on the last line
+             lines = content.split('\n')
+             if len(lines) > 1 and lines[-1].strip() == ']':
+                 content = '\n'.join(lines[:-1]).strip()
+             elif content.strip() == ']':
+                 # If the content is only "]", skip this entry as it's incomplete
+                 continue
+
+             matches.append((entry_num, content))
+
+     for entry_num, content in matches:
+
+         if not content:
+             continue
+
+         # The content should already be clean from the improved extraction
+         # Just do minimal cleaning - remove any obvious appendix content but don't be too aggressive
+
+         # Debug logging for specific entries
+         if entry_num == 74:
+             logger.debug(f"Entry [74] content being parsed: {repr(content[:200])}...")
+
+         # Parse the biblatex entry content
+         parsed_ref = parse_biblatex_entry_content(str(entry_num), content)
+
+         # Debug logging for results
+         if entry_num == 74 and parsed_ref:
+             logger.debug(f"Entry [74] parsing result: title={repr(parsed_ref.get('title'))}, authors={len(parsed_ref.get('authors', []))}")
+
+         if parsed_ref:
+             references.append(parsed_ref)
+
+     logger.debug(f"Extracted {len(references)} biblatex references")
+
+     # Validate parsing quality - if poor, return an empty list to trigger LLM fallback
+     if not _validate_parsing_quality(references):
+         return []
+
+     return references
+
+
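+ # End-to-end sketch (editor's illustration): given a .bbl-style block,
+ # parse_biblatex_references returns dictionaries shaped as built in
+ # parse_biblatex_entry_content below, e.g.
+ #   refs = parse_biblatex_references('[1] A. Author. "A Title". In: Venue. 2023.')
+ #   refs[0]['title'], refs[0]['year'], refs[0]['bibtex_key']  # 'A Title', 2023, 'ref1'
+
+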
+ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]:
+     """
+     Parse the content of a single biblatex entry
+
+     Args:
+         entry_num: The reference number (e.g., "1", "43")
+         content: The full content after the [number]
+
+     Returns:
+         Dictionary with parsed entry data
+     """
+     from refchecker.utils.text_utils import parse_authors_with_initials, clean_title
+     from refchecker.utils.doi_utils import construct_doi_url, is_valid_doi_format
+
+     # Initialize default values
+     title = ""
+     authors = []
+     year = None
+     journal = ""
+     doi = ""
+     url = ""
+
+     # Normalize whitespace and remove line breaks
+     # Handle hyphenated words split across lines with intelligence to distinguish
+     # between syllable breaks (remove hyphen) and compound words (keep hyphen)
+     content = _handle_hyphenated_line_breaks(content)
+     # Then normalize all other whitespace
+     content = re.sub(r'\s+', ' ', content.strip())
+
+     # Pattern matching for different biblatex formats:
+
+     # 1. Try to extract title - can be in quotes or as capitalized text after authors
+     # Handle both regular quotes (") and smart quotes (“, ”)
+     title_match = re.search(r'["\u201c\u201d]([^"\u201c\u201d]+)["\u201c\u201d]', content)
+     if title_match:
+         raw_title = title_match.group(1)
+         title = clean_title(raw_title)
+     else:
+         # If no quoted title, look for a title after the author names
+         # Pattern: "FirstAuthor et al. Title Goes Here. Year." or "Author. Title. Year."
+         # Order matters: more specific patterns first
+         title_patterns = [
+             # Pattern for unquoted books: "Author1 and Author2, Title: Subtitle. Location: Publisher, Year."
+             r'(?:and\s+[A-Z][^,]*),\s+([A-Z][^.]*?:\s*[^.]*?)\.\s+[A-Z][^:]*:\s*[^,]*,\s*\d{4}',
+             r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})',  # "Authors. Title. URL/arXiv/Year" (flexible spacing) - MOST SPECIFIC
+             r'\.([A-Z][A-Za-z\s]+(?:\?|!)?)\.?\s+\d{4}',  # ".Title. Year" - for cases where authors end without space
+             r'[A-Z][a-z]+\.([A-Z][A-Za-z\s\-&]+?)\.\s+\d{4}',  # "Name.Title. Year" - missing space after period
+             r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}',  # "Author et al. Title. Year" - LESS SPECIFIC
+             r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}',  # "Name, Name. Title. Year"
+             r'\b([A-Z][A-Za-z\s\-0-9]+)\s+\.\s+https',  # "Title . https" - handle space before period
+         ]
+
+         for pattern in title_patterns:
+             title_match = re.search(pattern, content)
+             if title_match:
+                 potential_title = title_match.group(1)
+                 # Make sure it looks like a title and not author names
+                 # Be more specific about author name patterns - should be "Surname, Initial" not "Word, Word"
+                 author_like_pattern = r'^[A-Z][a-z]+,\s*[A-Z]\.?$'  # "Smith, J." or "Smith, J"
+                 multi_word_author = r'^[A-Z][a-z]+,\s*[A-Z][a-z]+$'  # "Smith, John" - but still reject this
+
+                 is_author_like = (re.match(author_like_pattern, potential_title) or
+                                   re.match(multi_word_author, potential_title))
+
+                 if len(potential_title) > 2 and not is_author_like:
+                     title = clean_title(potential_title)
+                     break
+
+     # 2. Extract year - prioritize a year in parentheses over arXiv IDs
+     year_patterns = [
+         r'\((\d{4})\)',  # Year in parentheses like "(2024)" - most reliable
+         r'\b(\d{4})\.$',  # Year at end of sentence like "2024."
+         r'\b(20\d{2})\b',  # Recent years (2000-2099) - avoid arXiv IDs like "2403"
+         r'\b(\d{4})\b',  # Any 4-digit number as fallback
+     ]
+
+     for pattern in year_patterns:
+         year_match = re.search(pattern, content)
+         if year_match:
+             try:
+                 potential_year = int(year_match.group(1))
+                 # Validate that it's a reasonable publication year
+                 if 1900 <= potential_year <= 2030:
+                     year = potential_year
+                     break
+             except ValueError:
+                 continue
+
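+     # Ordering note (editor's illustration): given text containing both an
+     # arXiv ID "2403.12345" and a year "(2024)", the parenthesised pattern
+     # matches first, so year = 2024 and the bare \b(\d{4})\b fallback is
+     # never consulted.
+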
+     # 3. Extract DOI
+     # Handle DOIs that may be split across lines or have spaces
+     doi_match = re.search(r'DOI\s*:\s*(10\.\d+/[^\s.]+(?:\.\s*\d+)*)', content, re.IGNORECASE)
+     if doi_match:
+         doi = doi_match.group(1)
+         # Clean up DOI - remove spaces and trailing periods
+         doi = re.sub(r'\s+', '', doi).rstrip('.')
+         if is_valid_doi_format(doi):
+             url = construct_doi_url(doi)
+
+     # 4. Extract arXiv ID and construct URL
+     if not url:
+         arxiv_match = re.search(r'arXiv:\s*(\d{4}\.\d{4,5}(?:v\d+)?)', content, re.IGNORECASE)
+         if arxiv_match:
+             arxiv_id = re.sub(r'v\d+$', '', arxiv_match.group(1))  # Remove version
+             url = f"https://arxiv.org/abs/{arxiv_id}"
+
+     # 5. Extract URL if present
+     if not url:
+         url_match = re.search(r'https?://[^\s]+', content)
+         if url_match:
+             url = url_match.group(0).rstrip('.,')  # Remove trailing punctuation
+
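+     # Cleanup sketch (editor's illustration with a hypothetical DOI): an
+     # entry containing "DOI: 10.1145/3576915. 3616715" (split by a line
+     # break) is captured by the pattern above and normalised to
+     # "10.1145/3576915.3616715" before URL construction.
+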
+     # 6. Extract authors - improved to handle various biblatex patterns
+     authors_text = ""
+
+     # The key insight is that authors come first, then title (often in quotes), then venue/year
+     # Examples we need to handle:
+     # "Egor Zverev, Sahar Abdelnabi, Mario Fritz, and Christoph H Lampert. \"Title\". In: venue (year)."
+     # "Andrej Karpathy. Intro to Large Language Models. https://... year."
+     # "A. Author and B. Coauthor, \"Title\"," <- handle this format
+
+     # Try multiple patterns to extract authors
+     # Order matters - more specific patterns first!
+     author_patterns = [
+         # Pattern 1: Authors followed by quoted title (handle both regular and smart quotes)
+         r'^([^"\u201c\u201d]+?),\s*["\u201c\u201d]',  # "Authors, \"Title\"" - more restrictive, requires comma before quote
+         r'^([^"\u201c\u201d]+)\.\s*["\u201c\u201d]',  # "Authors. \"Title\"" or smart quotes
+
+         # Pattern 2: Authors followed by unquoted title for books: "Author1 and Author2, Title:"
+         r'^([^,]+(?:\s+and\s+[^,]+)?),\s+([A-Z][^.]*?):\s*([^.]*?)\.',  # "Author1 and Author2, Title: Subtitle." - book format
+
+         # Pattern 3: Authors ending with period, no space, then title (missing space case) - MORE SPECIFIC
+         r'^([^.]+?)\.([A-Z][^.]*)\.',  # "Authors.Title." - missing space after period
+
+         # Pattern 4: Authors followed by title, then period, then year or venue (with extracted title)
+         r'^(.+?)\.\s*([A-Z][^.]+)\.\s+(?:In:|https?://|\d{4})',  # "Authors. Title. In:/URL/Year" (allow no space after period)
+
+         # Pattern 5: Authors ending with period followed by capital letter (simpler fallback) - LEAST SPECIFIC
+         r'^([^.]+?)\.\s*[A-Z]',  # Allow no space after period
+     ]
+
+     for i, pattern in enumerate(author_patterns):
+         author_match = re.search(pattern, content)
+         if author_match:
+             potential_authors = author_match.group(1).strip()
+
+             # For patterns that also capture the title, extract it
+             if i == 2 and not title and len(author_match.groups()) > 2:
+                 # Pattern 2 (book format) captures authors, title, and subtitle
+                 title_part = author_match.group(2).strip()
+                 subtitle_part = author_match.group(3).strip()
+                 combined_title = f"{title_part}: {subtitle_part}" if subtitle_part else title_part
+                 if len(combined_title) > 2:
+                     title = clean_title(combined_title)
+             elif (i == 3 or i == 4) and not title and len(author_match.groups()) > 1:
+                 # Pattern 3 (missing space, index 3) and Pattern 4 (with space, index 4) capture both authors and title
+                 potential_title = author_match.group(2).strip()
+                 if len(potential_title) > 2 and not re.match(r'^[A-Z][a-z]+,', potential_title):
+                     title = clean_title(potential_title)
+
+             # Validate that this looks like authors
+             if (potential_authors and
+                     not potential_authors.startswith(('http', 'DOI', 'arXiv', 'In:')) and
+                     len(potential_authors) < 300 and
+                     # Should contain at least one name-like pattern
+                     re.search(r'[A-Z][a-z]+', potential_authors)):
+                 authors_text = potential_authors
+                 break
+
+     # Remove trailing punctuation and clean up
+     authors_text = re.sub(r'[.,;:]$', '', authors_text.strip())
+
+     # Parse authors
+     if authors_text:
+         try:
+             authors = parse_authors_with_initials(authors_text)
+             # Filter out overly long "authors" that are probably not just names
+             authors = [a for a in authors if len(a) < 100 and not re.search(r'\b(http|www|doi|arxiv)\b', a.lower())]
+
+             # Clean up "and" prefixes from authors (common in biblatex format)
+             cleaned_authors = []
+             for author in authors:
+                 cleaned_author = re.sub(r'^and\s+', '', author.strip())
+                 if cleaned_author and len(cleaned_author) > 2:
+                     cleaned_authors.append(cleaned_author)
+
+             # If we got reasonable results, use them
+             if cleaned_authors and all(len(a) > 2 for a in cleaned_authors):
+                 authors = cleaned_authors
+             else:
+                 authors = []  # Reset to try fallback
+
+         except Exception as e:
+             logger.debug(f"Author parsing failed for '{authors_text}': {e}")
+             authors = []
+
+         # Fallback: split by common patterns if parse_authors_with_initials failed
+         if not authors:
+             if 'et al' in authors_text.lower():
+                 # Handle "FirstAuthor et al." case - separate the base author from "et al"
+                 base_author = authors_text.split(' et al')[0].strip()
+                 if base_author:
+                     authors = [base_author, 'et al']
+             elif ' and ' in authors_text:
+                 # Handle "Author1 and Author2 and Author3" format
+                 author_parts = [p.strip() for p in authors_text.split(' and ')]
+                 authors = []
+                 for part in author_parts:
+                     part = part.strip(' ,.')
+                     if part and len(part) > 2:
+                         authors.append(part)
+             else:
+                 # Try sophisticated parsing one more time with relaxed constraints
+                 try:
+                     # Remove "and" connectors for cleaner parsing
+                     clean_text = re.sub(r'\s+and\s+', ', ', authors_text)
+                     fallback_authors = parse_authors_with_initials(clean_text)
+                     if fallback_authors and len(fallback_authors) >= 1:
+                         authors = fallback_authors
+                     else:
+                         raise ValueError("Fallback parsing failed")
+                 except Exception:
+                     # Last resort: naive comma separation for "Author1, Author2, Author3"
+                     # This should rarely be reached now
+                     author_parts = [p.strip() for p in authors_text.split(',')]
+                     authors = []
+                     for part in author_parts:
+                         part = part.strip(' .')
+                         # Remove "and" prefix if present
+                         if part.startswith('and '):
+                             part = part[4:].strip()
+                         # Skip parts that are too short or look like initials only
+                         if (part and len(part) > 2 and
+                                 not re.search(r'\b(http|www|doi|arxiv|proceedings)\b', part.lower())):
+                             authors.append(part)
+
+     # 7. Extract journal/venue - look for patterns like "In: Conference" or remaining text
+     # Also handle cases like "Tasks,"Adv. Neural" where there's a missing space after the quote-comma
+     journal_patterns = [
+         r'In:\s*([^.]+?)(?:\.|$)',  # "In: Conference Name"
+         r'"[^"]*,"([A-Z][^,]*?\. [A-Z][^,]*)',  # Quote-comma-venue like "Tasks,"Adv. Neural Inf. Process. Syst."
+         r'["\u201c\u201d]([A-Z][^.]*(?:Adv\.|Proc\.|IEEE|Journal)[^.]*)',  # Missing space after quote like "Tasks"Adv. Neural"
+         r'([A-Z][^.]*(?:Conference|Workshop|Journal|Proceedings)[^.]*)',  # Conference/journal names
+     ]
+
+     for pattern in journal_patterns:
+         journal_match = re.search(pattern, content)
+         if journal_match:
+             potential_journal = journal_match.group(1).strip()
+             # Make sure it's not just author names or a year
+             if not re.match(r'^[A-Z][a-z]+,\s*[A-Z]', potential_journal) and not potential_journal.isdigit():
+                 journal = potential_journal
+                 break
+
+     # Apply defaults if needed
+     if not title:
+         # Try to extract a title from the content if no quotes were found
+         # Look for capitalized text that could be a title
+         title_fallback_match = re.search(r'([A-Z][^.]*[a-z][^.]*)', content)
+         if title_fallback_match:
+             potential_title = title_fallback_match.group(1)
+             # Make sure it doesn't look like author names
+             if not re.search(r'[A-Z][a-z]+,\s*[A-Z]', potential_title):
+                 title = clean_title(potential_title)
+
+     if not title:
+         title = "Unknown Title"
+
+     if not authors:
+         authors = ["Unknown Author"]
+
+     # Determine reference type
+     ref_type = 'other'
+     if (url and 'arxiv' in url.lower()) or 'arxiv' in title.lower():
+         ref_type = 'arxiv'
+     elif url or doi:
+         ref_type = 'non-arxiv'
+
+     # Create structured reference (matching refchecker's expected format)
+     reference = {
+         'title': title,
+         'authors': authors,
+         'year': year,
+         'journal': journal,
+         'doi': doi,
+         'url': url,
+         'type': ref_type,
+         'bibtex_key': f"ref{entry_num}",  # Generate a key since biblatex doesn't have explicit keys
+         'bibtex_type': 'biblatex',
+         'raw_text': f"[{entry_num}] {content}",
+         'entry_number': int(entry_num)
+     }
+
+     return reference
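+
+
+ # Worked example (editor's illustration), using entry [43] from the module
+ # docstring: parse_biblatex_entry_content('43', 'Shishir G. Patil, Tianjun
+ # Zhang, Xin Wang, and Joseph E. Gonzalez. Gorilla: Large Language Model
+ # Connected with Massive APIs. 2023. arXiv: 2305.15334 [cs.CL].') is
+ # expected to yield title 'Gorilla: Large Language Model Connected with
+ # Massive APIs', year 2023, url 'https://arxiv.org/abs/2305.15334',
+ # type 'arxiv', and bibtex_key 'ref43'.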