academic-refchecker 2.0.7 (academic_refchecker-2.0.7-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
refchecker/llm/base.py ADDED
@@ -0,0 +1,376 @@
+"""
+Base classes for LLM-based reference extraction
+"""
+
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any, Optional
+import logging
+import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import time
+
+logger = logging.getLogger(__name__)
+
+
+class LLMProvider(ABC):
+    """Abstract base class for LLM providers"""
+
+    def __init__(self, config: Dict[str, Any]):
+        self.config = config
+        self.model = config.get("model")
+        self.max_tokens = config.get("max_tokens", 4000)
+        self.temperature = config.get("temperature", 0.1)
+
+    @abstractmethod
+    def extract_references(self, bibliography_text: str) -> List[str]:
+        """
+        Extract references from bibliography text using LLM
+
+        Args:
+            bibliography_text: Raw bibliography text
+
+        Returns:
+            List of extracted references
+        """
+        pass
+
+    @abstractmethod
+    def is_available(self) -> bool:
+        """Check if the LLM provider is properly configured and available"""
+        pass
+
+    def _create_extraction_prompt(self, bibliography_text: str) -> str:
+        """Create the prompt for reference extraction - should be overridden by subclasses"""
+        raise NotImplementedError("Subclasses must implement _create_extraction_prompt")
+
+    def _call_llm(self, prompt: str) -> str:
+        """Make the actual LLM API call and return the response text - should be overridden by subclasses"""
+        raise NotImplementedError("Subclasses must implement _call_llm")
+
+    def _chunk_bibliography(self, bibliography_text: str, max_tokens: int = 2000) -> List[str]:
+        """Split bibliography into balanced overlapping chunks to prevent reference loss at boundaries"""
+
+        # Calculate target chunk size in characters (rough estimate: 1 token ≈ 4 characters)
+        target_chunk_size = max_tokens * 4
+        total_length = len(bibliography_text)
+
+        # Calculate how many chunks we need for balanced processing
+        num_chunks = max(1, (total_length + target_chunk_size - 1) // target_chunk_size)
+
+        # Use overlap of ~10% of chunk size to ensure references aren't lost
+        overlap_size = target_chunk_size // 10
+
+        # Calculate actual chunk size for balanced distribution
+        effective_chunk_size = (total_length + num_chunks - 1) // num_chunks
+
+        logger.debug(f"Bibliography length: {total_length} chars, target: {target_chunk_size}, "
+                     f"creating {num_chunks} balanced chunks of ~{effective_chunk_size} chars with {overlap_size} overlap")
+
+        chunks = []
+        start = 0
+
+        for i in range(num_chunks):
+            if i == num_chunks - 1:
+                # Last chunk gets all remaining content
+                chunk = bibliography_text[start:].strip()
+                if chunk and len(chunk) > 50:
+                    chunks.append(chunk)
+                    logger.debug(f"Chunk {len(chunks)} (final): {len(chunk)} characters")
+                break
+
+            # Calculate end position for this chunk
+            end = min(start + effective_chunk_size, total_length)
+
+            # Look for reference boundaries within reasonable distance
+            search_window = effective_chunk_size // 5  # Look within 20% of target size
+            search_start = max(start, end - search_window)
+            search_end = min(total_length, end + search_window)
+
+            text_section = bibliography_text[search_start:search_end]
+
+            # Find the latest reference start pattern like "\n[32]"
+            best_break = end
+            ref_boundary_matches = list(re.finditer(r'\n\[\d+\]', text_section))
+            if ref_boundary_matches:
+                # Use the last reference boundary found within the search window
+                last_match = ref_boundary_matches[-1]
+                best_break = search_start + last_match.start() + 1  # +1 to include the \n
+
+            # Extract chunk
+            chunk = bibliography_text[start:best_break].strip()
+
+            if chunk and len(chunk) > 50:
+                chunks.append(chunk)
+                logger.debug(f"Chunk {len(chunks)}: {len(chunk)} characters, starts with: {chunk[:60]}...")
+
+            # For next chunk, start with fixed overlap size
+            next_start = max(0, best_break - overlap_size)
+
+            start = next_start
+
+        logger.debug(f"Created {len(chunks)} balanced overlapping chunks for parallel processing")
+        return chunks
+
+    def _parse_llm_response(self, response_text: str) -> List[str]:
+        """Parse LLM response and extract individual references"""
+        if not response_text:
+            return []
+
+        # Split by newlines and filter out empty lines
+        references = []
+        for line in response_text.strip().split('\n'):
+            line = line.strip()
+            if line and not line.startswith('#') and len(line) > 10:  # Basic filtering
+                references.append(line)
+
+        return references
+
+    def extract_references_with_chunking(self, bibliography_text: str) -> List[str]:
+        """
+        Template method that handles chunking for all providers.
+        Subclasses should implement _call_llm instead of extract_references.
+        """
+        if not self.is_available():
+            raise Exception(f"{self.__class__.__name__} not available")
+
+        # Get model's max_tokens from configuration - try to get provider-specific config
+        from config.settings import get_config
+        config = get_config()
+
+        # Try to get provider-specific max_tokens, fall back to general config
+        provider_name = self.__class__.__name__.lower().replace('provider', '')
+        model_max_tokens = config.get('llm', {}).get(provider_name, {}).get('max_tokens', self.max_tokens)
+
+        # Check if bibliography is too long and needs chunking
+        estimated_tokens = len(bibliography_text) // 4  # Rough estimate
+
+        # Account for prompt overhead
+        prompt_overhead = 300  # Conservative estimate for prompt template and system messages
+        # Ensure prompt is < 1/2 the model's total token limit to leave room for response
+        max_input_tokens = (model_max_tokens // 2) - prompt_overhead
+
+        logger.debug(f"Using model max_tokens: {model_max_tokens}, max_input_tokens: {max_input_tokens}")
+
+        if estimated_tokens > max_input_tokens:
+            logger.debug(f"Bibliography too long ({estimated_tokens} estimated tokens), splitting into chunks")
+            chunks = self._chunk_bibliography(bibliography_text, max_input_tokens)
+
+            # Process chunks in parallel
+            all_references = self._process_chunks_parallel(chunks)
+
+            # Remove duplicates while preserving order based on reference numbers
+            seen_ref_nums = set()
+            unique_references = []
+            for ref in all_references:
+                # Extract reference number for more robust deduplication
+                ref_num_match = re.search(r'\[(\d+)\]', ref)
+                if ref_num_match:
+                    ref_num = ref_num_match.group(1)
+                    if ref_num not in seen_ref_nums:
+                        seen_ref_nums.add(ref_num)
+                        unique_references.append(ref)
+                    else:
+                        logger.debug(f"Skipping duplicate reference [{ref_num}]: {ref[:100]}...")
+                else:
+                    # Fallback to segment-based deduplication for references without numbers
+                    # Split into segments separated by '#' and compare first two (author list and title)
+                    segments = ref.split('#')
+                    if len(segments) >= 2:
+                        # Normalize author names by removing spaces around periods in initials
+                        # This handles cases like "D.Iosifidis" vs "D. Iosifidis"
+                        author_normalized = re.sub(r'\s*\.\s*', '.', segments[0].strip().lower())
+                        title_normalized = segments[1].strip().lower()
+
+                        author_title_key = (author_normalized, title_normalized)
+                        if author_title_key not in seen_ref_nums:
+                            seen_ref_nums.add(author_title_key)
+                            unique_references.append(ref)
+                        else:
+                            logger.debug(f"Skipping duplicate reference (same author+title): {ref[:100]}...")
+                    else:
+                        # No segments, fallback to full text deduplication
+                        ref_normalized = ref.strip().lower()
+                        if ref_normalized not in seen_ref_nums:
+                            seen_ref_nums.add(ref_normalized)
+                            unique_references.append(ref)
+
+            logger.debug(f"Extracted {len(unique_references)} unique references from {len(chunks)} chunks")
+            return unique_references
+        else:
+            # Process normally for short bibliographies
+            prompt = self._create_extraction_prompt(bibliography_text)
+            response_text = self._call_llm(prompt)
+            return self._parse_llm_response(response_text)
+
+    def _process_chunks_parallel(self, chunks: List[str]) -> List[str]:
+        """
+        Process chunks in parallel using ThreadPoolExecutor
+
+        Args:
+            chunks: List of bibliography text chunks to process
+
+        Returns:
+            List of all extracted references from all chunks
+        """
+        # Get configuration for parallel processing
+        from config.settings import get_config
+        config = get_config()
+
+        # Check if parallel processing is enabled
+        llm_config = config.get('llm', {})
+        parallel_enabled = llm_config.get('parallel_chunks', True)
+        max_workers = llm_config.get('max_chunk_workers', 4)
+
+        # If parallel processing is disabled, fall back to sequential
+        if not parallel_enabled:
+            logger.info("Parallel chunk processing disabled, using sequential processing")
+            return self._process_chunks_sequential(chunks)
+
+        # Limit max_workers based on number of chunks
+        effective_workers = min(max_workers, len(chunks))
+        logger.info(f"Processing {len(chunks)} chunks in parallel with {effective_workers} workers")
+
+        start_time = time.time()
+        all_references = []
+
+        def process_single_chunk(chunk_data):
+            """Process a single chunk and return results"""
+            chunk_index, chunk_text = chunk_data
+            try:
+                logger.debug(f"Processing chunk {chunk_index + 1}/{len(chunks)}")
+                prompt = self._create_extraction_prompt(chunk_text)
+                response_text = self._call_llm(prompt)
+                chunk_references = self._parse_llm_response(response_text)
+                logger.debug(f"Chunk {chunk_index + 1} extracted {len(chunk_references)} references")
+                return chunk_index, chunk_references
+            except Exception as e:
+                logger.error(f"Failed to process chunk {chunk_index + 1}: {e}")
+                return chunk_index, []
+
+        # Create indexed chunks for processing
+        indexed_chunks = [(i, chunk) for i, chunk in enumerate(chunks)]
+
+        # Process chunks in parallel
+        with ThreadPoolExecutor(max_workers=effective_workers, thread_name_prefix="LLMChunk") as executor:
+            # Submit all chunks for processing
+            future_to_chunk = {
+                executor.submit(process_single_chunk, chunk_data): chunk_data[0]
+                for chunk_data in indexed_chunks
+            }
+
+            # Collect results as they complete
+            chunk_results = {}
+            for future in as_completed(future_to_chunk):
+                chunk_index = future_to_chunk[future]
+                try:
+                    result_index, references = future.result()
+                    chunk_results[result_index] = references
+                    logger.debug(f"Completed chunk {result_index + 1}/{len(chunks)}")
+                except Exception as e:
+                    logger.error(f"Chunk {chunk_index + 1} processing failed: {e}")
+                    chunk_results[chunk_index] = []
+
+        # Combine results in original order
+        for i in range(len(chunks)):
+            if i in chunk_results:
+                all_references.extend(chunk_results[i])
+
+        processing_time = time.time() - start_time
+        logger.debug(f"Parallel chunk processing completed in {processing_time:.2f}s, "
+                     f"extracted {len(all_references)} total references")
+
+        return all_references
+
+    def _process_chunks_sequential(self, chunks: List[str]) -> List[str]:
+        """
+        Process chunks sequentially (fallback method)
+
+        Args:
+            chunks: List of bibliography text chunks to process
+
+        Returns:
+            List of all extracted references from all chunks
+        """
+        logger.info(f"Processing {len(chunks)} chunks sequentially")
+        start_time = time.time()
+
+        all_references = []
+        for i, chunk in enumerate(chunks):
+            logger.info(f"Processing chunk {i+1}/{len(chunks)}")
+            try:
+                prompt = self._create_extraction_prompt(chunk)
+                response_text = self._call_llm(prompt)
+                chunk_references = self._parse_llm_response(response_text)
+                all_references.extend(chunk_references)
+                logger.debug(f"Chunk {i+1} extracted {len(chunk_references)} references")
+            except Exception as e:
+                logger.error(f"Failed to process chunk {i+1}: {e}")
+
+        processing_time = time.time() - start_time
+        logger.info(f"Sequential chunk processing completed in {processing_time:.2f}s, "
+                    f"extracted {len(all_references)} total references")
+
+        return all_references
+
+
+class ReferenceExtractor:
+    """Main class for LLM-based reference extraction with fallback"""
+
+    def __init__(self, llm_provider: Optional[LLMProvider] = None, fallback_enabled: bool = True):
+        self.llm_provider = llm_provider
+        self.fallback_enabled = fallback_enabled
+        self.logger = logging.getLogger(__name__)
+
+    def extract_references(self, bibliography_text: str, fallback_func=None) -> List[str]:
+        """
+        Extract references with LLM and fallback to regex if needed
+
+        Args:
+            bibliography_text: Raw bibliography text
+            fallback_func: Function to call if LLM extraction fails
+
+        Returns:
+            List of extracted references
+        """
+        if not bibliography_text:
+            return []
+
+        # Try LLM extraction first
+        if self.llm_provider and self.llm_provider.is_available():
+            try:
+                model_name = self.llm_provider.model or "unknown"
+                self.logger.info(f"Attempting LLM-based reference extraction using {model_name}")
+                references = self.llm_provider.extract_references(bibliography_text)
+                if references:
+                    return references
+                else:
+                    self.logger.warning("LLM returned no references")
+            except Exception as e:
+                self.logger.error(f"LLM reference extraction failed: {e}")
+
+        # If LLM was specified but failed, don't fallback - that's terminal
+        self.logger.error("LLM-based reference extraction failed and fallback is disabled")
+        return []
+
+
+def create_llm_provider(provider_name: str, config: Dict[str, Any]) -> Optional[LLMProvider]:
+    """Factory function to create LLM provider instances"""
+    from .providers import OpenAIProvider, AnthropicProvider, GoogleProvider, AzureProvider, vLLMProvider
+
+    providers = {
+        "openai": OpenAIProvider,
+        "anthropic": AnthropicProvider,
+        "google": GoogleProvider,
+        "azure": AzureProvider,
+        "vllm": vLLMProvider,
+    }
+
+    if provider_name not in providers:
+        logger.error(f"Unknown LLM provider: {provider_name}")
+        return None
+
+    try:
+        return providers[provider_name](config)
+    except Exception as e:
+        logger.error(f"Failed to create {provider_name} provider: {e}")
+        return None
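
For orientation, the sketch below shows how a concrete provider could plug into these base classes. It is illustrative only and not part of the packaged code: EchoProvider, the "demo-model" config value, and the sample reference string are invented for the example, and the provider simply echoes its input instead of calling a real LLM API. In the package itself, concrete providers such as OpenAIProvider or AnthropicProvider are obtained through the create_llm_provider factory above.

from typing import Any, Dict, List

from refchecker.llm.base import LLMProvider, ReferenceExtractor


class EchoProvider(LLMProvider):
    """Hypothetical provider: echoes its input rather than calling an LLM API."""

    def is_available(self) -> bool:
        return True

    def _create_extraction_prompt(self, bibliography_text: str) -> str:
        # A real prompt would wrap the text in extraction instructions.
        return bibliography_text

    def _call_llm(self, prompt: str) -> str:
        # A real provider would send the prompt to its API here.
        return prompt

    def extract_references(self, bibliography_text: str) -> List[str]:
        # Reuse the base-class prompt/parse helpers; long inputs could instead
        # go through extract_references_with_chunking().
        response = self._call_llm(self._create_extraction_prompt(bibliography_text))
        return self._parse_llm_response(response)


config: Dict[str, Any] = {"model": "demo-model", "max_tokens": 4000}
extractor = ReferenceExtractor(llm_provider=EchoProvider(config))
print(extractor.extract_references("[1] A. Author. An Example Paper. 2024."))
# -> ['[1] A. Author. An Example Paper. 2024.']

A real subclass would put its API call in _call_llm and, for long bibliographies, route extract_references through extract_references_with_chunking so that chunking, parallel processing, and deduplication are handled by the base class.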