academic-refchecker 1.2.65__py3-none-any.whl → 1.2.67__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.67.dist-info}/METADATA +72 -7
  2. {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.67.dist-info}/RECORD +33 -18
  3. {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.67.dist-info}/entry_points.txt +1 -0
  4. {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.67.dist-info}/top_level.txt +1 -0
  5. backend/__init__.py +21 -0
  6. backend/__main__.py +11 -0
  7. backend/cli.py +64 -0
  8. backend/concurrency.py +100 -0
  9. backend/database.py +711 -0
  10. backend/main.py +1367 -0
  11. backend/models.py +99 -0
  12. backend/refchecker_wrapper.py +1126 -0
  13. backend/static/assets/index-2P6L_39v.css +1 -0
  14. backend/static/assets/index-hk21nqxR.js +25 -0
  15. backend/static/favicon.svg +6 -0
  16. backend/static/index.html +15 -0
  17. backend/static/vite.svg +1 -0
  18. backend/thumbnail.py +517 -0
  19. backend/websocket_manager.py +104 -0
  20. refchecker/__version__.py +2 -2
  21. refchecker/checkers/crossref.py +15 -6
  22. refchecker/checkers/enhanced_hybrid_checker.py +18 -4
  23. refchecker/checkers/local_semantic_scholar.py +2 -2
  24. refchecker/checkers/openalex.py +15 -6
  25. refchecker/checkers/semantic_scholar.py +15 -6
  26. refchecker/core/refchecker.py +17 -6
  27. refchecker/utils/__init__.py +2 -1
  28. refchecker/utils/arxiv_utils.py +18 -60
  29. refchecker/utils/doi_utils.py +32 -1
  30. refchecker/utils/error_utils.py +20 -9
  31. refchecker/utils/text_utils.py +143 -27
  32. {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.67.dist-info}/WHEEL +0 -0
  33. {academic_refchecker-1.2.65.dist-info → academic_refchecker-1.2.67.dist-info}/licenses/LICENSE +0 -0
backend/refchecker_wrapper.py (new file)
@@ -0,0 +1,1126 @@
+ """
+ Wrapper around refchecker library with progress callbacks for real-time updates
+ """
+ import sys
+ import os
+ import asyncio
+ import logging
+ import tempfile
+ from concurrent.futures import ThreadPoolExecutor
+ from typing import List, Dict, Any, Optional, Callable
+ from pathlib import Path
+
+ # Add src to path to import refchecker when running from source
+ # This is only needed when not installed as a package
+ _src_path = str(Path(__file__).parent.parent / "src")
+ if _src_path not in sys.path and os.path.exists(_src_path):
+     sys.path.insert(0, _src_path)
+
+ from backend.concurrency import get_limiter
+
+ from refchecker.utils.text_utils import extract_latex_references
+ from refchecker.utils.url_utils import extract_arxiv_id_from_url
+ from refchecker.services.pdf_processor import PDFProcessor
+ from refchecker.llm.base import create_llm_provider, ReferenceExtractor
+ from refchecker.checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker
+ from refchecker.core.refchecker import ArxivReferenceChecker
+ from refchecker.utils.arxiv_utils import get_bibtex_content
+ import arxiv
+
+ logger = logging.getLogger(__name__)
+
+
+ def _process_llm_references_cli_style(references: List[Any]) -> List[Dict[str, Any]]:
+     """Use the CLI's post-processing logic to structure LLM references.
+
+     We intentionally reuse the exact methods from the CLI's ArxivReferenceChecker
+     (without running its heavy __init__) to avoid diverging behavior between
+     CLI and Web extraction.
+     """
+     cli_checker = ArxivReferenceChecker.__new__(ArxivReferenceChecker)
+     return cli_checker._process_llm_extracted_references(references)
+
+
+ def _make_cli_checker(llm_provider):
+     """Create a lightweight ArxivReferenceChecker instance for parsing only.
+
+     We bypass __init__ to avoid heavy setup and set just the fields needed for
+     bibliography finding and reference parsing so that logic/order matches CLI.
+     """
+     cli_checker = ArxivReferenceChecker.__new__(ArxivReferenceChecker)
+     cli_checker.llm_extractor = ReferenceExtractor(llm_provider) if llm_provider else None
+     cli_checker.llm_enabled = bool(llm_provider)
+     cli_checker.used_regex_extraction = False
+     cli_checker.used_unreliable_extraction = False
+     cli_checker.fatal_error = False
+     return cli_checker
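Both helpers above use the same trick: calling `ArxivReferenceChecker.__new__` allocates an instance without running `__init__`, so none of the CLI checker's heavy setup executes. A self-contained sketch of the pattern (the `Heavy` class is illustrative, not part of this package):

    class Heavy:
        def __init__(self):
            self.ready = True  # stands in for expensive setup

    h = Heavy.__new__(Heavy)    # allocate only; __init__ never runs
    print(hasattr(h, "ready"))  # False - attributes must be assigned by hand
    h.ready = False             # set just the fields the borrowed methods need

The cost is fragility: any attribute the borrowed methods expect must be assigned manually, which is exactly what `_make_cli_checker` does.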
+
+
+ def _normalize_reference_fields(ref: Dict[str, Any]) -> Dict[str, Any]:
+     """Normalize reference field names for consistency.
+
+     The parser uses 'journal' but the rest of the pipeline expects 'venue'.
+     This function normalizes field names for consistent handling.
+     """
+     # Map 'journal' to 'venue' if venue is not set
+     if ref.get('journal') and not ref.get('venue'):
+         ref['venue'] = ref['journal']
+     return ref
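A quick illustration of the mapping above, with a made-up entry:

    ref = {"title": "Attention Is All You Need", "journal": "NeurIPS 2017"}
    ref = _normalize_reference_fields(ref)
    assert ref["venue"] == "NeurIPS 2017"    # 'journal' copied into the expected 'venue' key
    assert ref["journal"] == "NeurIPS 2017"  # the original key is left in place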
+
+
+ # Default max concurrent reference checks (similar to CLI default)
+ # This value is now managed by the global concurrency limiter
+ DEFAULT_MAX_CONCURRENT_CHECKS = 6
+
+
+ class ProgressRefChecker:
+     """
+     RefChecker wrapper with progress callbacks for real-time updates
+     """
+
+     def __init__(self,
+                  llm_provider: Optional[str] = None,
+                  llm_model: Optional[str] = None,
+                  api_key: Optional[str] = None,
+                  use_llm: bool = True,
+                  progress_callback: Optional[Callable] = None,
+                  cancel_event: Optional[asyncio.Event] = None,
+                  check_id: Optional[int] = None,
+                  title_update_callback: Optional[Callable] = None,
+                  bibliography_source_callback: Optional[Callable] = None):
+         """
+         Initialize the progress-aware refchecker
+
+         Args:
+             llm_provider: LLM provider (anthropic, openai, google, etc.)
+             llm_model: Specific model to use
+             api_key: API key for the LLM provider
+             use_llm: Whether to use LLM for reference extraction
+             progress_callback: Async callback for progress updates
+             cancel_event: Event that, when set, cancels the in-progress check
+             check_id: Database ID for this check (for updating title)
+             title_update_callback: Async callback to update title in DB
+             bibliography_source_callback: Async callback to save bibliography source content
+         """
+         self.llm_provider = llm_provider
+         self.llm_model = llm_model
+         self.api_key = api_key
+         self.use_llm = use_llm
+         self.progress_callback = progress_callback
+         self.cancel_event = cancel_event
+         self.check_id = check_id
+         self.title_update_callback = title_update_callback
+         self.bibliography_source_callback = bibliography_source_callback
+
+         # Initialize LLM if requested
+         self.llm = None
+         if use_llm and llm_provider:
+             try:
+                 # Build config dict for the LLM provider
+                 llm_config = {}
+                 if llm_model:
+                     llm_config['model'] = llm_model
+                 if api_key:
+                     llm_config['api_key'] = api_key
+                 self.llm = create_llm_provider(
+                     provider_name=llm_provider,
+                     config=llm_config
+                 )
+                 logger.info(f"Initialized LLM provider: {llm_provider}")
+             except Exception as e:
+                 logger.error(f"Failed to initialize LLM: {e}")
+
+         # Initialize reference checker
+         self.checker = EnhancedHybridReferenceChecker(
+             semantic_scholar_api_key=os.getenv('SEMANTIC_SCHOLAR_API_KEY'),
+             debug_mode=False
+         )
+
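A hypothetical construction of the wrapper as configured above; the callback signature mirrors `emit_progress`, and the provider name and environment variable are placeholders, not values from the diff:

    import asyncio
    import os

    async def on_progress(event_type, data):
        # receives events such as "started", "extracting", "progress", "completed"
        print(event_type, data.get("message", ""))

    rc = ProgressRefChecker(
        llm_provider="anthropic",                # one of the providers named in the docstring
        api_key=os.getenv("ANTHROPIC_API_KEY"),  # assumed env var, for illustration
        progress_callback=on_progress,
        cancel_event=asyncio.Event(),
    )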
+     def _format_verification_result(
+         self,
+         reference: Dict[str, Any],
+         index: int,
+         verified_data: Optional[Dict[str, Any]],
+         errors: List[Dict[str, Any]],
+         url: Optional[str]
+     ) -> Dict[str, Any]:
+         """
+         Format verification result into a standardized response.
+
+         Shared by both async and sync verification methods.
+         """
+         # Normalize errors to align with CLI behavior
+         logger.info(f"_format_verification_result: raw errors={errors}")
+         sanitized = []
+         for err in errors:
+             e_type = err.get('error_type') or err.get('warning_type') or err.get('info_type')
+             details = err.get('error_details') or err.get('warning_details') or err.get('info_details')
+             if not e_type and not details:
+                 continue
+             # Track if this was originally an info_type (suggestion, not error)
+             is_info = 'info_type' in err
+             # Track if this was originally a warning_type (warning, not error)
+             is_warning = 'warning_type' in err
+             logger.info(f"Sanitizing error: e_type={e_type}, is_info={is_info}, is_warning={is_warning}, keys={list(err.keys())}")
+             sanitized.append({
+                 # If it was info_type, store as 'info' to ensure proper categorization
+                 "error_type": 'info' if is_info else (e_type or 'unknown'),
+                 "error_details": details or '',
+                 "cited_value": err.get('cited_value'),
+                 "actual_value": err.get('actual_value'),
+                 "is_suggestion": is_info,  # Preserve info_type as suggestion flag
+                 "is_warning": is_warning,  # Preserve warning_type as warning flag
+             })
+
+         # Determine status - items with warning_type or certain error types are warnings, not errors
+         warning_types = ['year', 'venue', 'author']
+         # Items originally from info_type are suggestions, not errors
+         # Items originally from warning_type are warnings, not errors
+         has_errors = any(
+             e.get('error_type') not in ['unverified', 'info'] + warning_types
+             and not e.get('is_suggestion')
+             and not e.get('is_warning')
+             for e in sanitized
+         )
+         has_warnings = any(
+             (e.get('error_type') in warning_types or e.get('is_warning'))
+             and not e.get('is_suggestion')
+             for e in sanitized
+         )
+         has_suggestions = any(e.get('is_suggestion') or e.get('error_type') == 'info' for e in sanitized)
+         is_unverified = any(e.get('error_type') == 'unverified' for e in sanitized)
+
+         if has_errors:
+             status = 'error'
+         elif has_warnings:
+             status = 'warning'
+         elif has_suggestions:
+             status = 'suggestion'
+         elif is_unverified:
+             status = 'unverified'
+         else:
+             status = 'verified'
+
+         # Extract authoritative URLs with proper type detection
+         authoritative_urls = []
+         if url:
+             url_type = "other"
+             if "semanticscholar.org" in url:
+                 url_type = "semantic_scholar"
+             elif "openalex.org" in url:
+                 url_type = "openalex"
+             elif "crossref.org" in url or "doi.org" in url:
+                 url_type = "doi"
+             elif "openreview.net" in url:
+                 url_type = "openreview"
+             elif "arxiv.org" in url:
+                 url_type = "arxiv"
+             authoritative_urls.append({"type": url_type, "url": url})
+
+         # Extract external IDs from verified data (Semantic Scholar format)
+         if verified_data:
+             external_ids = verified_data.get('externalIds', {})
+
+             # Add ArXiv URL if available
+             arxiv_id = external_ids.get('ArXiv') or verified_data.get('arxiv_id')
+             if arxiv_id:
+                 arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
+                 if not any(u.get('url') == arxiv_url for u in authoritative_urls):
+                     authoritative_urls.append({"type": "arxiv", "url": arxiv_url})
+
+             # Add DOI URL if available
+             doi = external_ids.get('DOI') or verified_data.get('doi')
+             if doi:
+                 doi_url = f"https://doi.org/{doi}"
+                 if not any(u.get('url') == doi_url for u in authoritative_urls):
+                     authoritative_urls.append({"type": "doi", "url": doi_url})
+
+         # Format errors, warnings, and suggestions
+         formatted_errors = []
+         formatted_warnings = []
+         formatted_suggestions = []
+         for err in sanitized:
+             err_obj = {
+                 "error_type": err.get('error_type', 'unknown'),
+                 "error_details": err.get('error_details', ''),
+                 "cited_value": err.get('cited_value'),
+                 "actual_value": err.get('actual_value')
+             }
+             # Check is_suggestion flag (set when original had info_type)
+             if err.get('is_suggestion') or err.get('error_type') == 'info':
+                 # Store as suggestion with full details
+                 formatted_suggestions.append({
+                     "suggestion_type": err.get('error_type') or 'info',
+                     "suggestion_details": err.get('error_details', '')
+                 })
+             elif err.get('is_warning') or err.get('error_type') in ['year', 'venue', 'author']:
+                 # Items with is_warning flag or known warning types go to warnings
+                 formatted_warnings.append(err_obj)
+             elif err.get('error_type') == 'unverified':
+                 formatted_errors.append({**err_obj, "error_type": 'unverified'})
+             else:
+                 formatted_errors.append(err_obj)
+
+         result = {
+             "index": index,
+             "title": reference.get('title') or reference.get('cited_url') or reference.get('url') or 'Unknown Title',
+             "authors": reference.get('authors', []),
+             "year": reference.get('year'),
+             "venue": reference.get('venue'),
+             "cited_url": reference.get('cited_url') or reference.get('url'),
+             "status": status,
+             "errors": formatted_errors,
+             "warnings": formatted_warnings,
+             "suggestions": formatted_suggestions,
+             "authoritative_urls": authoritative_urls,
+             "corrected_reference": None
+         }
+         logger.info(f"_format_verification_result output: suggestions={formatted_suggestions}, status={status}")
+         return result
+
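To make the categorization concrete, a sketch of how two raw checker items would flow through the method above (`rc` is a constructed wrapper and `ref` a parsed reference; both placeholders):

    errors = [
        # warning_type 'year' -> formatted_warnings, drives status to 'warning'
        {"warning_type": "year", "warning_details": "cited 2020, found 2021",
         "cited_value": "2020", "actual_value": "2021"},
        # info_type -> formatted_suggestions, never raises the severity
        {"info_type": "info", "info_details": "DOI available but not cited"},
    ]
    result = rc._format_verification_result(ref, 1, None, errors, None)
    assert result["status"] == "warning"
    assert len(result["warnings"]) == 1 and len(result["suggestions"]) == 1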
+     def _format_error_result(
+         self,
+         reference: Dict[str, Any],
+         index: int,
+         error: Exception
+     ) -> Dict[str, Any]:
+         """Format an error result when verification fails."""
+         return {
+             "index": index,
+             "title": reference.get('title') or reference.get('cited_url') or reference.get('url') or 'Unknown',
+             "authors": reference.get('authors', []),
+             "year": reference.get('year'),
+             "venue": reference.get('venue'),
+             "cited_url": reference.get('cited_url') or reference.get('url'),
+             "status": "error",
+             "errors": [{
+                 "error_type": "check_failed",
+                 "error_details": str(error)
+             }],
+             "warnings": [],
+             "suggestions": [],
+             "authoritative_urls": [],
+             "corrected_reference": None
+         }
+
+     async def emit_progress(self, event_type: str, data: Dict[str, Any]):
+         """Emit progress event to callback"""
+         logger.info(f"Emitting progress: {event_type} - {str(data)[:200]}")
+         if self.progress_callback:
+             await self.progress_callback(event_type, data)
+
+     async def _check_cancelled(self):
+         if self.cancel_event and self.cancel_event.is_set():
+             raise asyncio.CancelledError()
+
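Cancellation is cooperative: whoever holds the `asyncio.Event` sets it, and the next `_check_cancelled()` checkpoint inside the worker raises `CancelledError`. A hypothetical wiring (the arXiv ID is a placeholder):

    cancel = asyncio.Event()
    rc = ProgressRefChecker(use_llm=False, cancel_event=cancel)

    async def run_then_cancel():
        task = asyncio.create_task(rc.check_paper("2301.00001", "url"))
        await asyncio.sleep(5)
        cancel.set()  # observed at the next _check_cancelled() checkpoint
        try:
            await task
        except asyncio.CancelledError:
            print("check cancelled")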
+     async def check_paper(self, paper_source: str, source_type: str) -> Dict[str, Any]:
+         """
+         Check a paper and emit progress updates
+
+         Args:
+             paper_source: URL, ArXiv ID, or file path
+             source_type: 'url', 'file', or 'text'
+
+         Returns:
+             Dictionary with paper title, references, and results
+         """
+         try:
+             # Step 1: Get paper content
+             await self.emit_progress("started", {
+                 "message": "Starting reference check...",
+                 "source": paper_source
+             })
+
+             paper_title = "Unknown Paper"
+             paper_text = ""
+             title_updated = False
+
+             async def update_title_if_needed(title: str):
+                 nonlocal title_updated
+                 if not title_updated and title and title != "Unknown Paper":
+                     title_updated = True
+                     if self.title_update_callback and self.check_id:
+                         await self.title_update_callback(self.check_id, title)
+                     # Also emit via WebSocket so frontend can update
+                     await self.emit_progress("title_updated", {"paper_title": title})
+
+             await self._check_cancelled()
+             # Track if we got references from ArXiv source files and the extraction method
+             arxiv_source_references = None
+             extraction_method = None  # 'bbl', 'bib', 'pdf', 'file', 'text', 'llm', or None
+
+             if source_type == "url":
+                 # Check if this is a direct PDF URL (not arXiv)
+                 is_direct_pdf_url = (
+                     paper_source.lower().endswith('.pdf') and
+                     'arxiv.org' not in paper_source.lower()
+                 )
+
+                 if is_direct_pdf_url:
+                     # Handle direct PDF URLs (e.g., Microsoft Research PDFs)
+                     # PDF extraction requires LLM for reliable reference extraction
+                     if not self.llm:
+                         raise ValueError("PDF extraction requires an LLM to be configured. Please configure an LLM provider in settings.")
+
+                     await self.emit_progress("extracting", {
+                         "message": "Downloading PDF from URL..."
+                     })
+
+                     # Download PDF from URL
+                     import urllib.request
+                     import hashlib
+                     pdf_hash = hashlib.md5(paper_source.encode()).hexdigest()[:12]
+                     pdf_path = os.path.join(tempfile.gettempdir(), f"refchecker_pdf_{pdf_hash}.pdf")
+
+                     def download_pdf_url():
+                         urllib.request.urlretrieve(paper_source, pdf_path)
+                         return pdf_path
+
+                     await asyncio.to_thread(download_pdf_url)
+
+                     # Extract title from PDF filename or URL
+                     from urllib.parse import urlparse, unquote
+                     url_path = urlparse(paper_source).path
+                     pdf_filename = unquote(url_path.split('/')[-1])
+                     paper_title = pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
+                     await update_title_if_needed(paper_title)
+
+                     extraction_method = 'pdf'
+                     pdf_processor = PDFProcessor()
+                     paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, pdf_path)
+                 else:
+                     # Handle ArXiv URLs/IDs
+                     arxiv_id = extract_arxiv_id_from_url(paper_source)
+                     if not arxiv_id:
+                         arxiv_id = paper_source  # Assume it's already an ID
+
+                     await self.emit_progress("extracting", {
+                         "message": f"Fetching ArXiv paper {arxiv_id}..."
+                     })
+
+                     # Download from ArXiv - run in thread to avoid blocking event loop
+                     def fetch_arxiv():
+                         search = arxiv.Search(id_list=[arxiv_id])
+                         return next(search.results())
+
+                     paper = await asyncio.to_thread(fetch_arxiv)
+                     paper_title = paper.title
+                     await update_title_if_needed(paper_title)
+
+                     # Try to get BibTeX content from ArXiv source files first
+                     # This uses the .bbl file preference logic for papers with large .bib files
+                     await self.emit_progress("extracting", {
+                         "message": f"Checking ArXiv source for bibliography files..."
+                     })
+
+                     bibtex_content = await asyncio.to_thread(get_bibtex_content, paper)
+
+                     if bibtex_content:
+                         logger.info(f"Found BibTeX/BBL content from ArXiv source for {arxiv_id}")
+                         # Save the bibliography content for later viewing
+                         if self.bibliography_source_callback and self.check_id:
+                             await self.bibliography_source_callback(self.check_id, bibtex_content, arxiv_id)
+                         # Extract references from the BibTeX content (returns tuple)
+                         result = await self._extract_references_from_bibtex(bibtex_content)
+                         arxiv_source_references, extraction_method = result
+                         if arxiv_source_references:
+                             logger.info(f"Extracted {len(arxiv_source_references)} references from ArXiv source files (method: {extraction_method})")
+                         else:
+                             logger.warning("Could not extract references from ArXiv source, falling back to PDF")
+
+                     # Fall back to PDF extraction if no references from source files
+                     if not arxiv_source_references:
+                         # PDF extraction requires LLM for reliable reference extraction
+                         if not self.llm:
+                             raise ValueError("PDF extraction requires an LLM to be configured. Please configure an LLM provider in settings or provide a paper with BibTeX/LaTeX source files.")
+                         extraction_method = 'pdf'
+                         # Download PDF - run in thread (use cross-platform temp directory)
+                         pdf_path = os.path.join(tempfile.gettempdir(), f"arxiv_{arxiv_id}.pdf")
+                         await asyncio.to_thread(paper.download_pdf, filename=pdf_path)
+
+                         # Extract text from PDF - run in thread
+                         pdf_processor = PDFProcessor()
+                         paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, pdf_path)
+                     else:
+                         paper_text = ""  # Not needed since we have references
+
+             elif source_type == "file":
+                 extraction_method = 'file'
+                 await self.emit_progress("extracting", {
+                     "message": "Extracting text from file..."
+                 })
+
+                 # Handle uploaded file - run PDF processing in thread
+                 # Note: paper_title is already set to the original filename in main.py
+                 # so we don't update it here
+                 if paper_source.lower().endswith('.pdf'):
+                     # PDF extraction requires LLM for reliable reference extraction
+                     if not self.llm:
+                         raise ValueError("PDF extraction requires an LLM to be configured. Please configure an LLM provider in settings.")
+                     pdf_processor = PDFProcessor()
+                     paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, paper_source)
+                 elif paper_source.lower().endswith(('.tex', '.txt')):
+                     def read_file():
+                         with open(paper_source, 'r', encoding='utf-8') as f:
+                             return f.read()
+                     paper_text = await asyncio.to_thread(read_file)
+                 else:
+                     raise ValueError(f"Unsupported file type: {paper_source}")
+             elif source_type == "text":
+                 await self.emit_progress("extracting", {
+                     "message": "Preparing pasted text..."
+                 })
+                 # paper_source is now a file path - read the actual text content
+                 if os.path.exists(paper_source):
+                     def read_text_file():
+                         with open(paper_source, 'r', encoding='utf-8') as f:
+                             return f.read()
+                     paper_text = await asyncio.to_thread(read_text_file)
+                 else:
+                     # Fallback: paper_source is the actual text (legacy behavior)
+                     paper_text = paper_source
+                 paper_title = "Pasted Text"
+                 extraction_method = 'text'
+
+                 # Check if the pasted text is LaTeX thebibliography format (.bbl)
+                 if '\\begin{thebibliography}' in paper_text and '\\bibitem' in paper_text:
+                     logger.info("Detected LaTeX thebibliography format in pasted text")
+                     # Use the BibTeX extraction method instead
+                     refs_result = await self._extract_references_from_bibtex(paper_text)
+                     if refs_result and refs_result[0]:
+                         arxiv_source_references = refs_result[0]
+                         extraction_method = 'bbl'  # Mark as bbl extraction
+                         logger.info(f"Extracted {len(arxiv_source_references)} references from pasted .bbl content")
+                 # Don't update title for pasted text - keep the placeholder
+             else:
+                 raise ValueError(f"Unsupported source type: {source_type}")
+
+             # Step 2: Extract references
+             await self.emit_progress("extracting", {
+                 "message": "Extracting references from paper...",
+                 "paper_title": paper_title,
+                 "extraction_method": extraction_method
+             })
+
+             # Use ArXiv source references if available, otherwise extract from text
+             if arxiv_source_references:
+                 references = arxiv_source_references
+                 logger.info(f"Using {len(references)} references from ArXiv source files (method: {extraction_method})")
+             else:
+                 references = await self._extract_references(paper_text)
+                 # If we used PDF/file extraction and LLM was configured, mark as LLM-assisted
+                 if self.llm and extraction_method in ('pdf', 'file', 'text'):
+                     extraction_method = 'llm'
+
+             if not references:
+                 return {
+                     "paper_title": paper_title,
+                     "paper_source": paper_source,
+                     "extraction_method": extraction_method,
+                     "references": [],
+                     "summary": {
+                         "total_refs": 0,
+                         "errors_count": 0,
+                         "warnings_count": 0,
+                         "suggestions_count": 0,
+                         "unverified_count": 0,
+                         "verified_count": 0
+                     }
+                 }
+
+             # Step 3: Check references in parallel (like CLI)
+             total_refs = len(references)
+             await self.emit_progress("references_extracted", {
+                 "total_refs": total_refs,
+                 "extraction_method": extraction_method,
+                 "references": [
+                     {
+                         "index": idx,
+                         "title": ref.get("title") or ref.get("cited_url") or ref.get("url") or "Unknown Title",
+                         "authors": ref.get("authors", []),
+                         "year": ref.get("year"),
+                         "venue": ref.get("venue"),
+                         "cited_url": ref.get("cited_url") or ref.get("url")
+                     }
+                     for idx, ref in enumerate(references, 1)
+                 ]
+             })
+             limiter = get_limiter()
+             await self.emit_progress("progress", {
+                 "current": 0,
+                 "total": total_refs,
+                 "message": f"Checking {total_refs} references (max {limiter.max_concurrent} concurrent)..."
+             })
+
+             # Process references in parallel
+             results, errors_count, warnings_count, suggestions_count, unverified_count, verified_count, refs_with_errors, refs_with_warnings_only, refs_verified = \
+                 await self._check_references_parallel(references, total_refs)
+
+             # Step 4: Return final results
+             final_result = {
+                 "paper_title": paper_title,
+                 "paper_source": paper_source,
+                 "extraction_method": extraction_method,
+                 "references": results,
+                 "summary": {
+                     "total_refs": total_refs,
+                     "processed_refs": total_refs,
+                     "errors_count": errors_count,
+                     "warnings_count": warnings_count,
+                     "suggestions_count": suggestions_count,
+                     "unverified_count": unverified_count,
+                     "verified_count": verified_count,
+                     "refs_with_errors": refs_with_errors,
+                     "refs_with_warnings_only": refs_with_warnings_only,
+                     "refs_verified": refs_verified,
+                     "progress_percent": 100.0,
+                     "extraction_method": extraction_method
+                 }
+             }
+
+             await self.emit_progress("completed", final_result["summary"])
+
+             return final_result
+
+         except Exception as e:
+             logger.error(f"Error checking paper: {e}", exc_info=True)
+             await self.emit_progress("error", {
+                 "message": str(e),
+                 "details": type(e).__name__
+             })
+             raise
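End to end, a minimal driver for `check_paper` (the arXiv URL is an example; with `use_llm=False` it only succeeds when the paper's source tarball yields a usable .bbl/.bib, since the PDF fallback requires an LLM):

    import asyncio

    async def main():
        rc = ProgressRefChecker(use_llm=False)
        report = await rc.check_paper("https://arxiv.org/abs/1706.03762", "url")
        print(report["paper_title"], report["summary"])

    asyncio.run(main())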
+
+     def _parse_llm_reference(self, ref_string: str) -> Optional[Dict[str, Any]]:
+         """Parse a single LLM reference string into a structured dict.
+
+         LLM returns strings in format: Authors#Title#Venue#Year#URL
+         Authors are separated by asterisks (*).
+         Also handles plain text references that don't follow the format.
+         """
+         import re
+
+         if not ref_string:
+             return None
+
+         # If it's already a dict, return as-is
+         if isinstance(ref_string, dict):
+             return ref_string
+
+         if not isinstance(ref_string, str):
+             ref_string = str(ref_string)
+
+         ref_string = ref_string.strip()
+         if not ref_string:
+             return None
+
+         # Skip LLM explanatory responses (not actual references)
+         skip_patterns = [
+             r'^I cannot extract',
+             r'^No valid.*references',
+             r'^This text (does not|doesn\'t) contain',
+             r'^The (provided|given) text',
+             r'^I was unable to',
+             r'^There are no.*references',
+             r'^I don\'t see any',
+             r'^Unable to extract',
+             r'^No references found',
+             r'^This appears to be',
+             r'^This section',
+             r'^The text (appears|seems) to',
+         ]
+         for pattern in skip_patterns:
+             if re.match(pattern, ref_string, re.IGNORECASE):
+                 logger.debug(f"Skipping LLM explanatory text: {ref_string[:60]}...")
+                 return None
+
+         # Check if this looks like a citation key (e.g., "JLZ+22", "ZNIS23")
+         # Citation keys are typically short alphanumeric strings, possibly with + or -
+         citation_key_pattern = r'^[A-Za-z]+[+\-]?\d{2,4}$'
+         is_citation_key = bool(re.match(citation_key_pattern, ref_string.replace('#', '').replace(' ', '')))
+
+         # Check if it follows the # format
+         parts = ref_string.split('#')
+
+         if len(parts) >= 2:
+             # Parse parts: Authors#Title#Venue#Year#URL
+             authors_str = parts[0].strip() if len(parts) > 0 else ''
+             title = parts[1].strip() if len(parts) > 1 else ''
+             venue = parts[2].strip() if len(parts) > 2 else ''
+             year_str = parts[3].strip() if len(parts) > 3 else ''
+             url = parts[4].strip() if len(parts) > 4 else ''
+
+             # Check if this is a malformed reference (citation key with empty fields)
+             # If most fields are empty and authors looks like a citation key, skip it
+             non_empty_fields = sum(1 for f in [title, venue, year_str, url] if f)
+             authors_is_citation_key = bool(re.match(citation_key_pattern, authors_str.replace(' ', '')))
+
+             if non_empty_fields == 0 and authors_is_citation_key:
+                 # This is just a citation key, not a real reference - skip it
+                 logger.debug(f"Skipping malformed reference (citation key only): {ref_string}")
+                 return None
+
+             # Also skip if title is just a citation key or year
+             if title and re.match(citation_key_pattern, title.replace(' ', '')):
+                 logger.debug(f"Skipping reference with citation key as title: {ref_string}")
+                 return None
+
+             # Skip if title looks like it's just a year
+             if title and re.match(r'^\d{4}$', title.strip()):
+                 logger.debug(f"Skipping reference with year as title: {ref_string}")
+                 return None
+
+             # Parse authors (separated by *)
+             authors = []
+             if authors_str:
+                 # Don't treat citation keys as authors
+                 if not authors_is_citation_key:
+                     authors = [a.strip() for a in authors_str.split('*') if a.strip()]
+
+             # Parse year as integer
+             year_int = None
+             if year_str:
+                 year_match = re.search(r'\b(19|20)\d{2}\b', year_str)
+                 if year_match:
+                     year_int = int(year_match.group())
+
+             # Ensure we have a valid title - don't use the raw string if it's mostly separators
+             if not title:
+                 # If there's no title and no meaningful content, skip this reference
+                 if non_empty_fields == 0:
+                     return None
+                 # Otherwise try to clean up the raw string for display
+                 clean_raw = ref_string.replace('#', ' ').strip()
+                 clean_raw = re.sub(r'\s+', ' ', clean_raw)
+                 title = clean_raw[:100] if len(clean_raw) > 100 else clean_raw
+
+             return {
+                 'title': title,
+                 'authors': authors,
+                 'year': year_int,
+                 'venue': venue or None,
+                 'url': url or None,
+                 'raw_text': ref_string
+             }
+         else:
+             # Not in expected format, parse as plain text reference
+
+             # Skip very short strings (likely citation keys or garbage)
+             if len(ref_string) < 15:
+                 logger.debug(f"Skipping short string: {ref_string}")
+                 return None
+
+             # Try to extract structured data from plain text
+             title = ref_string
+             authors = []
+             year_int = None
+             venue = None
+             url = None
+
+             # Try to extract year from plain text
+             year_match = re.search(r'\b(19|20)\d{2}\b', ref_string)
+             if year_match:
+                 year_int = int(year_match.group())
+
+             # Try to extract URL from plain text
+             url_match = re.search(r'https?://[^\s]+', ref_string)
+             if url_match:
+                 url = url_match.group()
+
+             # Clean up title - remove year and URL if found
+             if year_match:
+                 title = title.replace(year_match.group(), '').strip()
+             if url_match:
+                 title = title.replace(url_match.group(), '').strip()
+
+             # Remove common delimiters from start/end
+             title = title.strip('.,;:-() ')
+
+             return {
+                 'title': title if title else ref_string[:100],
+                 'authors': authors,
+                 'year': year_int,
+                 'venue': venue,
+                 'url': url,
+                 'raw_text': ref_string
+             }
+
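The `Authors#Title#Venue#Year#URL` contract in action, with a made-up line (`rc` is a constructed wrapper):

    line = ("A. Vaswani*N. Shazeer*N. Parmar"
            "#Attention Is All You Need#NeurIPS#2017#https://arxiv.org/abs/1706.03762")
    parsed = rc._parse_llm_reference(line)
    assert parsed["authors"] == ["A. Vaswani", "N. Shazeer", "N. Parmar"]
    assert parsed["year"] == 2017 and parsed["venue"] == "NeurIPS"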
+     async def _extract_references(self, paper_text: str) -> List[Dict[str, Any]]:
+         """Extract references using the same pipeline/order as the CLI."""
+         try:
+             cli_checker = _make_cli_checker(self.llm)
+
+             # Step 1: find bibliography section (CLI logic) - run in thread
+             bib_section = await asyncio.to_thread(cli_checker.find_bibliography_section, paper_text)
+             if not bib_section:
+                 logger.warning("Could not find bibliography section in paper")
+                 return []
+
+             logger.info(f"Found bibliography section ({len(bib_section)} chars)")
+
+             # Step 2: parse references (CLI logic, including LLM and post-processing) - run in thread
+             refs = await asyncio.to_thread(cli_checker.parse_references, bib_section)
+             if cli_checker.fatal_error:
+                 logger.error("Reference parsing failed (CLI fatal_error)")
+                 return []
+             if refs:
+                 logger.info(f"Extracted {len(refs)} references via CLI parser")
+                 # Normalize field names (journal -> venue)
+                 refs = [_normalize_reference_fields(ref) for ref in refs]
+                 return refs
+
+             logger.warning("No references could be extracted")
+             return []
+         except Exception as e:
+             error_msg = str(e)
+             logger.error(f"Error extracting references: {error_msg}")
+             # Emit error to frontend
+             await self.emit_progress("error", {
+                 "message": f"Failed to extract references: {error_msg}",
+                 "details": type(e).__name__
+             })
+             raise
+
+     async def _extract_references_from_bibtex(self, bibtex_content: str) -> tuple:
+         """Extract references from BibTeX/BBL content (from ArXiv source files).
+
+         This mirrors the CLI's extract_bibliography logic for handling BibTeX content.
+
+         Returns:
+             Tuple of (references list, extraction_method string)
+             extraction_method is one of: 'bbl', 'bib', 'llm', or None if extraction failed
+         """
+         try:
+             cli_checker = _make_cli_checker(self.llm)
+
+             # Check if this is LaTeX thebibliography format (e.g., from .bbl files)
+             if '\\begin{thebibliography}' in bibtex_content and '\\bibitem' in bibtex_content:
+                 logger.info("Detected LaTeX thebibliography format from .bbl file")
+                 # Use extract_latex_references for .bbl format
+                 refs = await asyncio.to_thread(extract_latex_references, bibtex_content, None)
+
+                 if refs:
+                     # Validate the parsed references
+                     from refchecker.utils.text_utils import validate_parsed_references
+                     validation = await asyncio.to_thread(validate_parsed_references, refs)
+
+                     if not validation['is_valid'] and self.llm:
+                         logger.debug(f"LaTeX parsing validation failed (quality: {validation['quality_score']:.2f}), trying LLM fallback")
+                         # Try LLM fallback
+                         try:
+                             llm_refs = await asyncio.to_thread(cli_checker.llm_extractor.extract_references, bibtex_content)
+                             if llm_refs:
+                                 processed_refs = await asyncio.to_thread(cli_checker._process_llm_extracted_references, llm_refs)
+                                 llm_validation = await asyncio.to_thread(validate_parsed_references, processed_refs)
+                                 if llm_validation['quality_score'] > validation['quality_score']:
+                                     logger.info(f"LLM extraction improved quality ({llm_validation['quality_score']:.2f})")
+                                     # Normalize field names (journal -> venue)
+                                     processed_refs = [_normalize_reference_fields(ref) for ref in processed_refs]
+                                     return (processed_refs, 'llm')
+                         except Exception as e:
+                             logger.warning(f"LLM fallback failed: {e}")
+
+                     logger.info(f"Extracted {len(refs)} references from .bbl content")
+                     # Normalize field names (journal -> venue)
+                     refs = [_normalize_reference_fields(ref) for ref in refs]
+                     return (refs, 'bbl')
+             else:
+                 # Parse as BibTeX format
+                 logger.info("Detected BibTeX format from .bib file")
+                 refs = await asyncio.to_thread(cli_checker.parse_references, bibtex_content)
+                 if cli_checker.fatal_error:
+                     logger.error("BibTeX parsing failed")
+                     return ([], None)
+                 if refs:
+                     logger.info(f"Extracted {len(refs)} references from .bib content")
+                     # Normalize field names (journal -> venue)
+                     refs = [_normalize_reference_fields(ref) for ref in refs]
+                     return (refs, 'bib')
+
+             return ([], None)
+         except Exception as e:
+             logger.error(f"Error extracting references from BibTeX: {e}")
+             return ([], None)
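For reference, the smallest kind of input that the detection above routes to the .bbl path rather than the BibTeX branch (fragment invented for illustration):

    bbl = r"""
    \begin{thebibliography}{1}
    \bibitem{vaswani2017}
    A. Vaswani et al.
    \newblock Attention is all you need.
    \newblock In {\em NeurIPS}, 2017.
    \end{thebibliography}
    """
    assert "\\begin{thebibliography}" in bbl and "\\bibitem" in bbl
    # refs, method = await rc._extract_references_from_bibtex(bbl)  # -> method == 'bbl'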
+
+     async def _check_reference(self, reference: Dict[str, Any], index: int) -> Dict[str, Any]:
+         """Check a single reference and format result"""
+         try:
+             # Use the hybrid checker with timeout protection
+             import asyncio
+             loop = asyncio.get_event_loop()
+
+             # Run verification in a thread with timeout
+             try:
+                 verified_data, errors, url = await asyncio.wait_for(
+                     loop.run_in_executor(None, self.checker.verify_reference, reference),
+                     timeout=60.0  # 60 second timeout per reference
+                 )
+             except asyncio.TimeoutError:
+                 logger.warning(f"Reference {index} verification timed out")
+                 verified_data = None
+                 errors = [{"error_type": "timeout", "error_details": "Verification timed out after 60 seconds"}]
+                 url = None
+
+             return self._format_verification_result(reference, index, verified_data, errors, url)
+
+         except Exception as e:
+             logger.error(f"Error checking reference {index}: {e}")
+             return self._format_error_result(reference, index, e)
+
+     def _check_reference_sync(self, reference: Dict[str, Any], index: int) -> Dict[str, Any]:
+         """Synchronous version of reference checking for thread pool"""
+         try:
+             # Run verification with timeout (handled by caller)
+             verified_data, errors, url = self.checker.verify_reference(reference)
+             return self._format_verification_result(reference, index, verified_data, errors, url)
+
+         except Exception as e:
+             logger.error(f"Error checking reference {index}: {e}")
+             return self._format_error_result(reference, index, e)
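The thread-plus-timeout idiom both methods rely on, isolated into a runnable sketch (`slow_lookup` stands in for the blocking `verify_reference` call):

    import asyncio
    import time

    def slow_lookup():
        time.sleep(0.1)  # stands in for a blocking network call
        return "ok"

    async def main():
        loop = asyncio.get_event_loop()
        try:
            value = await asyncio.wait_for(
                loop.run_in_executor(None, slow_lookup),  # run in the default thread pool
                timeout=60.0,
            )
        except asyncio.TimeoutError:
            value = None  # same fallback shape as the timeout branch above
        print(value)

    asyncio.run(main())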
+
+     async def _check_single_reference_with_limit(
+         self,
+         reference: Dict[str, Any],
+         idx: int,
+         total_refs: int,
+         loop: asyncio.AbstractEventLoop
+     ) -> Dict[str, Any]:
+         """
+         Check a single reference with global concurrency limiting.
+
+         First checks the verification cache for a previous result.
+         Acquires a slot from the global limiter before starting the check,
+         and releases it when done. Stores result in cache on success.
+         """
+         from .database import db
+
+         # Check cache first
+         cached_result = await db.get_cached_verification(reference)
+         if cached_result:
+             # Update the index to match current position
+             cached_result['index'] = idx + 1
+             logger.info(f"Cache hit for reference {idx + 1}: {reference.get('title', 'Unknown')[:50]}")
+             return cached_result
+
+         limiter = get_limiter()
+
+         # Wait for a slot in the global queue
+         async with limiter:
+             # Check for cancellation before starting
+             await self._check_cancelled()
+
+             # Emit that this reference is now being checked
+             await self.emit_progress("checking_reference", {
+                 "index": idx + 1,
+                 "title": reference.get("title") or reference.get("cited_url") or reference.get("url") or "Unknown Title",
+                 "total": total_refs
+             })
+
+             try:
+                 # Run the sync check in a thread
+                 result = await asyncio.wait_for(
+                     loop.run_in_executor(
+                         None,  # Use default executor
+                         self._check_reference_sync,
+                         reference,
+                         idx + 1
+                     ),
+                     timeout=120.0  # 2 minute timeout per reference
+                 )
+             except asyncio.TimeoutError:
+                 result = {
+                     "index": idx + 1,
+                     "title": reference.get('title') or reference.get('cited_url') or reference.get('url') or 'Unknown',
+                     "authors": reference.get('authors', []),
+                     "year": reference.get('year'),
+                     "venue": reference.get('venue'),
+                     "cited_url": reference.get('cited_url') or reference.get('url'),
+                     "status": "error",
+                     "errors": [{
+                         "error_type": "timeout",
+                         "error_details": "Verification timed out after 120 seconds"
+                     }],
+                     "warnings": [],
+                     "authoritative_urls": [],
+                     "corrected_reference": None
+                 }
+             except asyncio.CancelledError:
+                 raise  # Re-raise cancellation
+             except Exception as e:
+                 logger.error(f"Error checking reference {idx + 1}: {e}")
+                 result = {
+                     "index": idx + 1,
+                     "title": reference.get('title', 'Unknown'),
+                     "authors": reference.get('authors', []),
+                     "year": reference.get('year'),
+                     "venue": reference.get('venue'),
+                     "cited_url": reference.get('url'),
+                     "status": "error",
+                     "errors": [{
+                         "error_type": "check_failed",
+                         "error_details": str(e)
+                     }],
+                     "warnings": [],
+                     "authoritative_urls": [],
+                     "corrected_reference": None
+                 }
+
+             # Store successful results in cache (db.store_cached_verification filters out errors)
+             try:
+                 await db.store_cached_verification(reference, result)
+             except Exception as cache_error:
+                 logger.warning(f"Failed to cache verification result: {cache_error}")
+
+             return result
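`get_limiter()` lives in backend/concurrency.py, which this hunk does not show; the usage above (`async with limiter` plus a `max_concurrent` attribute) is consistent with a small semaphore wrapper along these lines, offered as an assumption rather than the package's actual code:

    import asyncio

    class GlobalLimiter:
        # Process-wide cap on concurrent reference checks (assumed shape).

        def __init__(self, max_concurrent: int = 6):
            self.max_concurrent = max_concurrent
            self._semaphore = asyncio.Semaphore(max_concurrent)

        async def __aenter__(self):
            await self._semaphore.acquire()  # wait for a free slot
            return self

        async def __aexit__(self, exc_type, exc, tb):
            self._semaphore.release()        # free the slot, even on error

    _limiter = GlobalLimiter()

    def get_limiter() -> GlobalLimiter:
        return _limiter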
+
+     async def _check_references_parallel(
+         self,
+         references: List[Dict[str, Any]],
+         total_refs: int
+     ) -> tuple:
+         """
+         Check references in parallel using global concurrency limiting.
+
+         All papers share the same global limit, so if you have 3 papers checking
+         and concurrency is 6, each paper gets a share of the 6 slots.
+
+         Emits progress updates as results come in.
+         Only marks references as 'checking' when they actually start.
+         Returns results list and counts.
+         """
+         results = {}
+         errors_count = 0
+         warnings_count = 0
+         suggestions_count = 0
+         unverified_count = 0
+         verified_count = 0
+         refs_with_errors = 0
+         refs_with_warnings_only = 0
+         refs_verified = 0
+         processed_count = 0
+
+         loop = asyncio.get_event_loop()
+
+         # Create tasks for all references - they will be rate-limited by the global semaphore
+         tasks = []
+         for idx, ref in enumerate(references):
+             task = asyncio.create_task(
+                 self._check_single_reference_with_limit(ref, idx, total_refs, loop),
+                 name=f"ref-check-{idx}"
+             )
+             tasks.append((idx, task))
+
+         # Process results as they complete
+         pending_tasks = {task for _, task in tasks}
+         task_to_idx = {task: idx for idx, task in tasks}
+
+         while pending_tasks:
+             # Check for cancellation
+             try:
+                 await self._check_cancelled()
+             except asyncio.CancelledError:
+                 # Cancel all pending tasks
+                 for task in pending_tasks:
+                     task.cancel()
+                 raise
+
+             # Wait for some tasks to complete
+             done, pending_tasks = await asyncio.wait(
+                 pending_tasks,
+                 timeout=0.5,
+                 return_when=asyncio.FIRST_COMPLETED
+             )
+
+             for task in done:
+                 idx = task_to_idx[task]
+
+                 try:
+                     result = task.result()
+                 except asyncio.CancelledError:
+                     # Task was cancelled, create cancelled result
+                     result = {
+                         "index": idx + 1,
+                         "title": references[idx].get('title', 'Unknown'),
+                         "authors": references[idx].get('authors', []),
+                         "year": references[idx].get('year'),
+                         "venue": references[idx].get('venue'),
+                         "cited_url": references[idx].get('url'),
+                         "status": "cancelled",
+                         "errors": [],
+                         "warnings": [],
+                         "authoritative_urls": [],
+                         "corrected_reference": None
+                     }
+                 except Exception as e:
+                     logger.error(f"Unexpected error for reference {idx + 1}: {e}")
+                     result = {
+                         "index": idx + 1,
+                         "title": references[idx].get('title', 'Unknown'),
+                         "authors": references[idx].get('authors', []),
+                         "year": references[idx].get('year'),
+                         "venue": references[idx].get('venue'),
+                         "cited_url": references[idx].get('url'),
+                         "status": "error",
+                         "errors": [{
+                             "error_type": "unexpected_error",
+                             "error_details": str(e)
+                         }],
+                         "warnings": [],
+                         "authoritative_urls": [],
+                         "corrected_reference": None
+                     }
+
+                 # Store result
+                 results[idx] = result
+                 processed_count += 1
+
+                 # Count individual issues (not just references)
+                 # Exclude 'unverified' from error count since it has its own category
+                 real_errors = [e for e in result.get('errors', []) if e.get('error_type') != 'unverified']
+                 num_errors = len(real_errors)
+                 num_warnings = len(result.get('warnings', []))
+                 num_suggestions = len(result.get('suggestions', []))
+
+                 errors_count += num_errors
+                 warnings_count += num_warnings
+                 suggestions_count += num_suggestions
+
+                 # Count references by status for filtering
+                 if result['status'] == 'unverified':
+                     unverified_count += 1
+                 elif result['status'] == 'verified':
+                     verified_count += 1
+                     refs_verified += 1
+                 elif result['status'] == 'suggestion':
+                     # Suggestion-only refs are considered verified (no errors or warnings)
+                     verified_count += 1
+                     refs_verified += 1
+
+                 # Track references by issue type (excluding unverified from error check)
+                 if result['status'] == 'error' or num_errors > 0:
+                     refs_with_errors += 1
+                 elif result['status'] == 'warning' or num_warnings > 0:
+                     refs_with_warnings_only += 1
+
+                 # Emit result immediately
+                 await self.emit_progress("reference_result", result)
+                 await self.emit_progress("progress", {
+                     "current": processed_count,
+                     "total": total_refs
+                 })
+                 await self.emit_progress("summary_update", {
+                     "total_refs": total_refs,
+                     "processed_refs": processed_count,
+                     "errors_count": errors_count,
+                     "warnings_count": warnings_count,
+                     "suggestions_count": suggestions_count,
+                     "unverified_count": unverified_count,
+                     "verified_count": verified_count,
+                     "refs_with_errors": refs_with_errors,
+                     "refs_with_warnings_only": refs_with_warnings_only,
+                     "refs_verified": refs_verified,
+                     "progress_percent": round((processed_count / total_refs) * 100, 1)
+                 })
+
+         # Convert dict to ordered list
+         results_list = [results.get(i) for i in range(total_refs)]
+
+         return results_list, errors_count, warnings_count, suggestions_count, unverified_count, verified_count, refs_with_errors, refs_with_warnings_only, refs_verified