academic_refchecker-2.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
refchecker/core/parallel_processor.py
@@ -0,0 +1,415 @@
+ """
+ Parallel reference processing system for RefChecker.
+
+ This module provides parallelized reference verification with ordered result output.
+ It maintains the same error detection quality as sequential processing while
+ dramatically improving performance for large bibliographies.
+ """
+
+ import time
+ import logging
+ from queue import Queue
+ from threading import Thread, Lock
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from dataclasses import dataclass
+ from typing import List, Dict, Any, Optional, Tuple, Callable
+ from refchecker.utils.text_utils import deduplicate_urls
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class ReferenceWorkItem:
+     """Work item for the reference verification queue."""
+     index: int
+     source_paper: Any
+     reference: Dict[str, Any]
+     timestamp: float
+
+
+ @dataclass
+ class ReferenceResult:
+     """Result of a reference verification."""
+     index: int
+     errors: Optional[List[Dict[str, Any]]]
+     url: Optional[str]
+     processing_time: float
+     reference: Dict[str, Any]
+     verified_data: Optional[Dict[str, Any]] = None
+
+
+ class ParallelReferenceProcessor:
+     """
+     Parallel reference verification processor with ordered output.
+
+     This class manages a pool of worker threads that verify references independently
+     while ensuring results are printed in the original order (1, 2, 3...).
+     """
+
+     def __init__(self, base_checker: Any, max_workers: int = 6, enable_progress: bool = True):
+         """
+         Initialize the parallel processor.
+
+         Args:
+             base_checker: The base reference checker instance
+             max_workers: Maximum number of worker threads
+             enable_progress: Whether to show progress indicators
+         """
+         self.base_checker = base_checker
+         self.max_workers = max_workers
+         self.enable_progress = enable_progress
+
+         # Threading components
+         self.work_queue = Queue()
+         self.result_queue = Queue()
+         self.result_buffer = {}  # index -> ReferenceResult
+         self.buffer_lock = Lock()
+
+         # State tracking
+         self.next_print_index = 0
+         self.total_references = 0
+         self.completed_count = 0
+         self.start_time = 0
+
+         # Statistics
+         self.processing_stats = {
+             'total_processed': 0,
+             'total_errors': 0,
+             'avg_processing_time': 0,
+             'fastest_time': float('inf'),
+             'slowest_time': 0
+         }
+
+     def verify_references_parallel(self, source_paper: Any, bibliography: List[Dict[str, Any]],
+                                    result_callback: Optional[Callable] = None) -> Dict[str, Any]:
+         """
+         Verify references in parallel with ordered output.
+
+         Args:
+             source_paper: The source paper containing the references
+             bibliography: List of references to verify
+             result_callback: Optional callback for each completed result
+
+         Returns:
+             Dictionary with processing statistics
+         """
+         if not bibliography:
+             logger.info("No references to verify")
+             return self._get_stats()
+
+         self.total_references = len(bibliography)
+         self.start_time = time.time()
+         self.next_print_index = 0
+         self.completed_count = 0
+         self.result_buffer.clear()
+
+         logger.debug(f"Starting parallel verification of {self.total_references} references with {self.max_workers} workers")
+
+         # Populate work queue
+         for i, reference in enumerate(bibliography):
+             work_item = ReferenceWorkItem(
+                 index=i,
+                 source_paper=source_paper,
+                 reference=reference,
+                 timestamp=time.time()
+             )
+             self.work_queue.put(work_item)
+
+         # Add sentinel values to signal workers to stop (one per worker)
+         for _ in range(self.max_workers):
+             self.work_queue.put(None)  # None signals end of work
+
+         # Start result printer thread
+         printer_thread = Thread(target=self._ordered_result_printer, args=(result_callback,))
+         printer_thread.daemon = True
+         printer_thread.start()
+
+         # Start worker threads
+         with ThreadPoolExecutor(max_workers=self.max_workers, thread_name_prefix="RefWorker") as executor:
+             # Submit worker tasks
+             futures = []
+             for worker_id in range(self.max_workers):
+                 future = executor.submit(self._worker_loop, worker_id)
+                 futures.append(future)
+
+             # Wait for all workers to complete
+             for future in as_completed(futures):
+                 try:
+                     future.result()
+                 except Exception as e:
+                     logger.error(f"Worker thread failed: {e}")
+
+         # Wait for printer to finish
+         printer_thread.join()
+
+         # Final stats printing disabled
+         # if self.enable_progress:
+         #     self._print_final_stats()
+
+         return self._get_stats()
+
+     def _worker_loop(self, worker_id: int) -> None:
+         """
+         Main loop for worker threads: blocks on the work queue and exits when a
+         sentinel value (None) is received.
+
+         Args:
+             worker_id: Unique identifier for this worker
+         """
+         processed_count = 0
+         logger.debug(f"Worker {worker_id} started")
+
+         while True:
+             try:
+                 # Get work item - blocks until available
+                 work_item = self.work_queue.get(block=True)
+
+                 # Check for sentinel value (signals end of work)
+                 if work_item is None:
+                     self.work_queue.task_done()
+                     break
+
+                 try:
+                     # Perform reference verification using base checker
+                     start_time = time.time()
+                     errors, url, verified_data = self.base_checker.verify_reference(
+                         work_item.source_paper,
+                         work_item.reference
+                     )
+                     processing_time = time.time() - start_time
+
+                     # Create result
+                     result = ReferenceResult(
+                         index=work_item.index,
+                         errors=errors,
+                         url=url,
+                         processing_time=processing_time,
+                         reference=work_item.reference,
+                         verified_data=verified_data
+                     )
+
+                     # Put result in queue
+                     self.result_queue.put(result)
+                     processed_count += 1
+
+                     logger.debug(f"Worker {worker_id} completed reference {work_item.index} in {processing_time:.2f}s")
+
+                 except Exception as e:
+                     # Handle verification errors gracefully
+                     logger.error(f"Worker {worker_id} failed to verify reference {work_item.index}: {e}")
+
+                     error_result = ReferenceResult(
+                         index=work_item.index,
+                         errors=[{"error_type": "processing_failed", "error_details": f"Internal processing error: {str(e)}"}],
+                         url=None,
+                         processing_time=time.time() - work_item.timestamp,
+                         reference=work_item.reference
+                     )
+                     self.result_queue.put(error_result)
+
+                 finally:
+                     self.work_queue.task_done()
+
+             except Exception as e:
+                 logger.error(f"Worker {worker_id} encountered unexpected error: {e}")
+                 break
+
+         logger.debug(f"Worker {worker_id} finished after processing {processed_count} items")
+
+     def _ordered_result_printer(self, result_callback: Optional[Callable] = None) -> None:
+         """
+         Print results in order and handle callbacks.
+
+         Args:
+             result_callback: Optional callback function for each result
+         """
+         logger.debug("Result printer started")
+
+         while self.next_print_index < self.total_references:
+             try:
+                 # Get result - blocks until available
+                 result = self.result_queue.get(block=True)
+
+                 # Store result in buffer
+                 with self.buffer_lock:
+                     self.result_buffer[result.index] = result
+                     self._update_stats(result)
+
+                 # Print any consecutive results starting from next_print_index
+                 with self.buffer_lock:
+                     while self.next_print_index in self.result_buffer:
+                         current_result = self.result_buffer[self.next_print_index]
+
+                         # Print the result using base checker's output methods
+                         self._print_reference_result(current_result)
+
+                         # Call callback if provided
+                         if result_callback:
+                             try:
+                                 result_callback(current_result)
+                             except Exception as e:
+                                 logger.error(f"Result callback failed for reference {current_result.index}: {e}")
+
+                         # Clean up and advance
+                         del self.result_buffer[self.next_print_index]
+                         self.next_print_index += 1
+                         self.completed_count += 1
+
+                         # Show progress (disabled)
+                         # if self.enable_progress and self.completed_count % 10 == 0:
+                         #     self._print_progress()
+
+             except Exception as e:
+                 logger.error(f"Result printer error: {e}")
+                 continue
+
+         logger.debug("Result printer finished")
+
+
+     def _print_reference_result(self, result: ReferenceResult) -> None:
+         """
+         Print a single reference result using the base checker's format.
+
+         Args:
+             result: The reference result to print
+         """
+         reference = result.reference
+
+         # Print reference info in the same format as sequential mode
+         raw_title = reference.get('title', 'Untitled')
+         # Clean LaTeX commands from title for display
+         from refchecker.utils.text_utils import strip_latex_commands
+         title = strip_latex_commands(raw_title)
+         from refchecker.utils.text_utils import format_authors_for_display
+         authors = format_authors_for_display(reference.get('authors', []))
+         year = reference.get('year', '')
+         # Get venue from either 'venue' or 'journal' field and clean it up
+         venue = reference.get('venue', '') or reference.get('journal', '')
+         if venue:
+             from refchecker.utils.error_utils import clean_venue_for_comparison
+             venue = clean_venue_for_comparison(venue)
+         url = reference.get('url', '')
+         doi = reference.get('doi', '')
+
+         # Extract actual reference number from raw text for accurate display
+         import re
+         raw_text = reference.get('raw_text', '')
+         match = re.match(r'\[(\d+)\]', raw_text)
+         ref_num = match.group(1) if match else str(result.index + 1)
+         print(f"[{ref_num}/{self.total_references}] {title}")
+         if authors:
+             print(f" {authors}")
+         if venue:
+             print(f" {venue}")
+         if year:
+             print(f" {year}")
+         if doi:
+             print(f" {doi}")
+         # Show cited URL if available
+         if url:
+             print(f" {url}")
+
+         # Get the appropriate verified URL using shared logic from base checker
+         verified_url_to_show = self.base_checker._get_verified_url(result.verified_data, result.url, result.errors)
+
+         # Show the verified URL with appropriate label
+         print("")
+         if verified_url_to_show:
+             print(f" Verified URL: {verified_url_to_show}")
+
+         # Show correct ArXiv URL if available from verified data and different from cited
+         if result.verified_data:
+             external_ids = result.verified_data.get('externalIds', {})
+             if external_ids.get('ArXiv'):
+                 correct_arxiv_url = f"https://arxiv.org/abs/{external_ids['ArXiv']}"
+                 # Only show if it's different from the cited URL
+                 if correct_arxiv_url != url:
+                     print(f" ArXiv URL: {correct_arxiv_url}")
+
+         # Show additional external ID URLs if available and different
+         if result.verified_data:
+             external_ids = result.verified_data.get('externalIds', {})
+
+             # Show DOI URL if available and different from what's already shown
+             if external_ids.get('DOI'):
+                 from refchecker.utils.doi_utils import construct_doi_url
+                 doi_url = construct_doi_url(external_ids['DOI'])
+                 if doi_url != verified_url_to_show and doi_url != url:
+                     print(f" DOI URL: {doi_url}")
+
+             # Show any other URL from verified data if different
+             if result.verified_data.get('url') and result.verified_data['url'] != verified_url_to_show and result.verified_data['url'] != url:
+                 print(f" {result.verified_data['url']}")
+
+         # Display errors and warnings
+         if result.errors:
+             # Check if there's an unverified error
+             has_unverified_error = any(e.get('error_type') == 'unverified' or e.get('warning_type') == 'unverified' or e.get('info_type') == 'unverified' for e in result.errors)
+
+             if has_unverified_error:
+                 # Use the centralized unverified error display function from base checker
+                 self.base_checker._display_unverified_error_with_subreason(reference, result.url, result.errors, debug_mode=False, print_output=True)
+
+             # Display all non-unverified errors and warnings
+             for error in result.errors:
+                 if error.get('error_type') != 'unverified' and error.get('warning_type') != 'unverified' and error.get('info_type') != 'unverified':
+                     error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type')
+                     error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', 'Unknown error')
+
+                     from refchecker.utils.error_utils import print_labeled_multiline
+
+                     if error_type == 'arxiv_id':
+                         # Keep existing style for arXiv ID errors
+                         print(f" ❌ {error_details}")
+                     elif 'error_type' in error:
+                         print_labeled_multiline("❌ Error", error_details)
+                     elif 'warning_type' in error:
+                         print_labeled_multiline("⚠️ Warning", error_details)
+                     else:
+                         print_labeled_multiline("ℹ️ Information", error_details)
+
+         # Show timing info for slow references
+         if result.processing_time > 5.0:
+             logger.debug(f"Reference {result.index + 1} took {result.processing_time:.2f}s to verify: {title}")
+             logger.debug(f"Raw text: {reference.get('raw_text', '')}")
+
+     def _update_stats(self, result: ReferenceResult) -> None:
+         """Update processing statistics."""
+         self.processing_stats['total_processed'] += 1
+
+         if result.errors:
+             self.processing_stats['total_errors'] += len(result.errors)
+
+         # Update timing stats
+         proc_time = result.processing_time
+         self.processing_stats['fastest_time'] = min(self.processing_stats['fastest_time'], proc_time)
+         self.processing_stats['slowest_time'] = max(self.processing_stats['slowest_time'], proc_time)
+
+         # Update average
+         total = self.processing_stats['total_processed']
+         current_avg = self.processing_stats['avg_processing_time']
+         self.processing_stats['avg_processing_time'] = ((current_avg * (total - 1)) + proc_time) / total
+
+     def _print_progress(self) -> None:
+         """Print progress information."""
+         # Progress printing disabled to avoid noise
+         pass
+
+     def _print_final_stats(self) -> None:
+         """Print final processing statistics."""
+         # Final stats printing disabled to avoid noise
+         pass
+
+     def _get_stats(self) -> Dict[str, Any]:
+         """Get processing statistics."""
+         total_time = time.time() - self.start_time if self.start_time > 0 else 0
+
+         return {
+             'total_references': self.total_references,
+             'total_time': total_time,
+             'references_per_second': self.total_references / total_time if total_time > 0 else 0,
+             'total_errors': self.processing_stats['total_errors'],
+             'avg_processing_time': self.processing_stats['avg_processing_time'],
+             'fastest_time': self.processing_stats['fastest_time'] if self.processing_stats['fastest_time'] != float('inf') else 0,
+             'slowest_time': self.processing_stats['slowest_time']
+         }
+
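
For orientation, here is a minimal driver sketch for the new ParallelReferenceProcessor. It is illustrative only and not shipped in the package: StubChecker is a hypothetical stand-in for the real base checker, implementing just the contract visible in the diff, namely verify_reference(source_paper, reference) returning (errors, url, verified_data), plus the two private display helpers the ordered printer calls.

# Hypothetical usage sketch (not part of academic-refchecker); assumes the
# package is installed so refchecker.core.parallel_processor is importable.
import random
import time

from refchecker.core.parallel_processor import ParallelReferenceProcessor


class StubChecker:
    """Stand-in for the real base checker; simulates variable network latency."""

    def verify_reference(self, source_paper, reference):
        time.sleep(random.uniform(0.05, 0.3))  # force out-of-order completion
        return None, None, None  # (errors, url, verified_data): nothing to report

    def _get_verified_url(self, verified_data, url, errors):
        return url  # the real checker picks the best URL; the stub just echoes

    def _display_unverified_error_with_subreason(self, *args, **kwargs):
        pass  # only invoked when an 'unverified' error is present


bibliography = [
    {'title': f'Paper {i}', 'authors': [f'Author {i}'], 'raw_text': f'[{i + 1}] Paper {i}'}
    for i in range(20)
]

processor = ParallelReferenceProcessor(StubChecker(), max_workers=4)
stats = processor.verify_references_parallel(source_paper=None, bibliography=bibliography)
print(f"{stats['total_references']} refs in {stats['total_time']:.2f}s "
      f"({stats['references_per_second']:.1f}/s)")

The sketch exercises the module's two design points: one None sentinel per worker drains the pool without polling or timeouts, and the index-keyed buffer in _ordered_result_printer holds out-of-order results until the next expected index arrives, so output follows bibliography order regardless of completion order.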