academic-refchecker 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
@@ -0,0 +1,563 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Enhanced Hybrid Reference Checker with Multiple API Sources
4
+
5
+ This module provides an improved hybrid reference checker that intelligently combines
6
+ multiple API sources for optimal reliability and performance. It replaces Google Scholar
7
+ with more reliable alternatives while maintaining backward compatibility.
8
+
9
+ New API Integration Priority:
10
+ 1. Local Semantic Scholar Database (fastest, offline)
11
+ 2. Semantic Scholar API (reliable, good coverage)
12
+ 3. OpenAlex API (excellent reliability, replaces Google Scholar)
13
+ 4. CrossRef API (best for DOI-based verification)
14
+ 5. Google Scholar (final fallback, kept for legacy support)
15
+
16
+ Usage:
17
+ from enhanced_hybrid_checker import EnhancedHybridReferenceChecker
18
+
19
+ checker = EnhancedHybridReferenceChecker(
20
+ semantic_scholar_api_key="your_key",
21
+ db_path="path/to/db.sqlite",
22
+ contact_email="your@email.com"
23
+ )
24
+
25
+ verified_data, errors, url = checker.verify_reference(reference)
26
+ """
27
+
28
+ import logging
29
+ import random
30
+ import requests
31
+ import time
32
+ from typing import Dict, List, Tuple, Optional, Any
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
class EnhancedHybridReferenceChecker:
    """
    Enhanced hybrid reference checker with multiple API sources for improved reliability.

    Verification sources are tried in priority order:
        1. Local Semantic Scholar database (fastest, offline)
        2. CrossRef (tried early when the reference appears to carry a DOI)
        3. Semantic Scholar API (reliable, good coverage)
        4. OpenAlex API (excellent reliability, replaces Google Scholar)
        5. OpenReview (URL-based or venue-suggested search)

    APIs that fail with retryable conditions (throttling, timeouts, server
    errors) are retried in a second phase with jittered backoff delays.
    Incomplete-but-nonempty results from CrossRef/OpenAlex are kept as a
    last-resort fallback.
    """

    def __init__(self, semantic_scholar_api_key: Optional[str] = None,
                 db_path: Optional[str] = None,
                 contact_email: Optional[str] = None,
                 enable_openalex: bool = True,
                 enable_crossref: bool = True,
                 debug_mode: bool = False):
        """
        Initialize the enhanced hybrid reference checker.

        Args:
            semantic_scholar_api_key: Optional API key for Semantic Scholar
            db_path: Optional path to local Semantic Scholar database
            contact_email: Email for polite pool access to APIs
            enable_openalex: Whether to use OpenAlex API
            enable_crossref: Whether to use CrossRef API
            debug_mode: Whether to enable debug logging

        Every sub-checker initialization is wrapped in try/except so that a
        missing or broken backend degrades gracefully to ``None`` instead of
        failing construction.
        """
        self.contact_email = contact_email
        self.debug_mode = debug_mode

        # Initialize local database checker if available (optional, non-fatal).
        self.local_db = None
        if db_path:
            try:
                from .local_semantic_scholar import LocalNonArxivReferenceChecker
                self.local_db = LocalNonArxivReferenceChecker(db_path=db_path)
                logger.debug(f"Enhanced Hybrid: Local database enabled at {db_path}")
            except Exception as e:
                logger.warning(f"Enhanced Hybrid: Failed to initialize local database: {e}")
                self.local_db = None

        # Initialize Semantic Scholar API.
        try:
            from .semantic_scholar import NonArxivReferenceChecker
            self.semantic_scholar = NonArxivReferenceChecker(api_key=semantic_scholar_api_key)
            logger.debug("Enhanced Hybrid: Semantic Scholar API initialized")
        except Exception as e:
            logger.error(f"Enhanced Hybrid: Failed to initialize Semantic Scholar: {e}")
            self.semantic_scholar = None

        # Initialize OpenAlex API.
        self.openalex = None
        if enable_openalex:
            try:
                from .openalex import OpenAlexReferenceChecker
                self.openalex = OpenAlexReferenceChecker(email=contact_email)
                logger.debug("Enhanced Hybrid: OpenAlex API initialized")
            except Exception as e:
                logger.warning(f"Enhanced Hybrid: Failed to initialize OpenAlex: {e}")

        # Initialize CrossRef API.
        self.crossref = None
        if enable_crossref:
            try:
                from .crossref import CrossRefReferenceChecker
                self.crossref = CrossRefReferenceChecker(email=contact_email)
                logger.debug("Enhanced Hybrid: CrossRef API initialized")
            except Exception as e:
                logger.warning(f"Enhanced Hybrid: Failed to initialize CrossRef: {e}")

        # Initialize OpenReview checker.
        self.openreview = None
        try:
            from .openreview_checker import OpenReviewReferenceChecker
            self.openreview = OpenReviewReferenceChecker()
            logger.debug("Enhanced Hybrid: OpenReview checker initialized")
        except Exception as e:
            logger.warning(f"Enhanced Hybrid: Failed to initialize OpenReview: {e}")
            self.openreview = None

        # Google Scholar removed - using more reliable APIs only.

        # Track API performance for adaptive selection.
        self.api_stats = {
            'local_db': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
            'semantic_scholar': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
            'openalex': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
            'crossref': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
            'openreview': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0}
        }

        # Retry configuration for throttled/failed APIs (Phase 2 of
        # verify_reference).
        self.retry_base_delay = 1        # Base delay before retrying (seconds)
        self.retry_backoff_factor = 1.5  # Exponential backoff multiplier
        self.max_retry_delay = 20        # Maximum delay cap in seconds

    def _update_api_stats(self, api_name: str, success: bool, duration: float):
        """
        Update API performance statistics.

        Args:
            api_name: Stats key; unknown names are ignored silently.
            success: Whether the call produced a usable result.
            duration: Wall-clock duration of the call in seconds; folded into
                a simple moving average.
        """
        if api_name in self.api_stats:
            stats = self.api_stats[api_name]
            if success:
                stats['success'] += 1
            else:
                stats['failure'] += 1

            # Update average time (simple moving average).
            total_calls = stats['success'] + stats['failure']
            stats['avg_time'] = ((stats['avg_time'] * (total_calls - 1)) + duration) / total_calls

    def _record_throttle(self, api_name: str):
        """Increment the throttled counter for a known API name.

        Guarded lookup fixes a latent KeyError: the retry phase calls
        _try_api with the ad-hoc name 'openreview_search', which is not a
        key of self.api_stats.
        """
        if api_name in self.api_stats:
            self.api_stats[api_name]['throttled'] += 1

    def _try_api(self, api_name: str, api_instance: Any, reference: Dict[str, Any], is_retry: bool = False) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str], bool, str]:
        """
        Try to verify reference with a specific API and track performance.

        Args:
            api_name: Key used for statistics/logging.
            api_instance: Checker object exposing ``verify_reference``.
            reference: Reference data dictionary.
            is_retry: Whether this is a Phase-2 retry (log annotation only).

        Returns:
            Tuple of (verified_data, errors, url, success, failure_type);
            failure_type is one of 'none', 'not_found', 'throttled',
            'timeout', 'server_error', 'other'.
        """
        if not api_instance:
            return None, [], None, False, 'none'

        start_time = time.time()
        failure_type = 'none'

        try:
            verified_data, errors, url = api_instance.verify_reference(reference)
            duration = time.time() - start_time

            # Errors tagged 'api_failure' signal a transient API problem, not
            # a verification verdict; classify as throttled so the retry
            # phase picks this API up again.
            api_failure_errors = [err for err in errors if err.get('error_type') == 'api_failure']
            if api_failure_errors:
                self._update_api_stats(api_name, False, duration)
                # Fix: count these as throttled events too (previously only
                # HTTP 429 responses incremented the counter).
                self._record_throttle(api_name)
                logger.debug(f"Enhanced Hybrid: {api_name} API failed in {duration:.2f}s: {api_failure_errors[0].get('error_details', 'unknown')}")
                return None, [], None, False, 'throttled'

            # Consider it successful if we found data or verification errors
            # (i.e., we could verify something).
            success = verified_data is not None or len(errors) > 0
            self._update_api_stats(api_name, success, duration)

            if success:
                retry_info = " (retry)" if is_retry else ""
                logger.debug(f"Enhanced Hybrid: {api_name} successful in {duration:.2f}s{retry_info}, URL: {url}")
                return verified_data, errors, url, True, 'none'
            else:
                logger.debug(f"Enhanced Hybrid: {api_name} found no results in {duration:.2f}s")
                return None, [], None, False, 'not_found'

        except requests.exceptions.Timeout as e:
            duration = time.time() - start_time
            self._update_api_stats(api_name, False, duration)
            failure_type = 'timeout'
            logger.debug(f"Enhanced Hybrid: {api_name} timed out in {duration:.2f}s: {e}")
            return None, [], None, False, failure_type

        except requests.exceptions.RequestException as e:
            duration = time.time() - start_time
            self._update_api_stats(api_name, False, duration)

            # Classify rate limiting vs server errors so the caller can
            # decide whether a retry is worthwhile.
            error_str = str(e).lower()
            status_code = getattr(e.response, 'status_code', None) if hasattr(e, 'response') and e.response else None

            if (status_code == 429) or "429" in str(e) or "rate limit" in error_str:
                failure_type = 'throttled'
                # Fix: guarded increment (was an unguarded dict access that
                # raised KeyError for names absent from api_stats).
                self._record_throttle(api_name)
                logger.debug(f"Enhanced Hybrid: {api_name} rate limited in {duration:.2f}s: {e}")
            elif (status_code and status_code >= 500) or "500" in str(e) or "502" in str(e) or "503" in str(e) or "server error" in error_str or "service unavailable" in error_str:
                failure_type = 'server_error'
                logger.debug(f"Enhanced Hybrid: {api_name} server error in {duration:.2f}s: {e}")
            else:
                failure_type = 'other'
                logger.debug(f"Enhanced Hybrid: {api_name} failed in {duration:.2f}s: {e}")
            return None, [], None, False, failure_type

        except Exception as e:
            duration = time.time() - start_time
            self._update_api_stats(api_name, False, duration)
            failure_type = 'other'
            logger.debug(f"Enhanced Hybrid: {api_name} failed in {duration:.2f}s: {e}")
            return None, [], None, False, failure_type

    def _should_try_doi_apis_first(self, reference: Dict[str, Any]) -> bool:
        """
        Determine if we should prioritize DOI-based APIs (CrossRef) for this reference.

        Returns a truthy value when the reference has a 'doi' field, a
        doi.org/doi: URL, or mentions 'doi' in its raw text.
        """
        has_doi = (reference.get('doi') or
                   (reference.get('url') and ('doi.org' in reference['url'] or 'doi:' in reference['url'])) or
                   (reference.get('raw_text') and ('doi' in reference['raw_text'].lower())))
        return has_doi

    def _is_data_complete(self, verified_data: Dict[str, Any], reference: Dict[str, Any]) -> bool:
        """
        Check if the verified data is sufficiently complete for the reference verification.

        Args:
            verified_data: Paper data returned by API
            reference: Original reference data

        Returns:
            True if data is complete enough to use, False if incomplete
            (currently: False when the citation names authors but the API
            result has none).
        """
        if not verified_data:
            return False

        cited_authors = reference.get('authors', [])
        found_authors = verified_data.get('authors', [])

        # If we cited authors but found none, the data is incomplete.
        if cited_authors and not found_authors:
            logger.debug(f"Enhanced Hybrid: Data incomplete - cited authors {cited_authors} but found none")
            return False

        return True

    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
        """
        Verify a non-arXiv reference using multiple APIs in priority order.

        Phase 1 tries all configured APIs once; Phase 2 retries APIs that
        failed for retryable reasons (throttled/timeout/server_error) with
        jittered delays; Phase 3 falls back to any incomplete CrossRef or
        OpenAlex result collected along the way.

        Args:
            reference: Reference data dictionary

        Returns:
            Tuple of (verified_data, errors, url)
        """
        # Check if this is a URL-only reference (should skip verification).
        authors = reference.get('authors', [])
        if authors and "URL Reference" in authors:
            # Skip verification for URL references - they're just links, not papers.
            logger.debug("Enhanced Hybrid: Skipping verification for URL reference")
            return None, [], reference.get('cited_url') or reference.get('url')

        # Also check if it looks like a URL-only reference (no title, just URL).
        # Fix: tolerate an explicit None title (previously None.strip() raised).
        title = (reference.get('title') or '').strip()
        cited_url = reference.get('cited_url') or reference.get('url')
        if not title and cited_url:
            logger.debug(f"Enhanced Hybrid: Skipping verification for URL-only reference: {cited_url}")
            return None, [], cited_url

        # Track all APIs that failed and could be retried.
        failed_apis = []

        # PHASE 1: Try all APIs once in priority order.

        # Strategy 1: Always try local database first (fastest).
        if self.local_db:
            verified_data, errors, url, success, failure_type = self._try_api('local_db', self.local_db, reference)
            if success:
                return verified_data, errors, url
            if failure_type in ['throttled', 'timeout', 'server_error']:
                failed_apis.append(('local_db', self.local_db, failure_type))

        # Strategy 2: If reference has DOI, prioritize CrossRef.
        crossref_result = None
        if self._should_try_doi_apis_first(reference) and self.crossref:
            verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
            if success:
                if self._is_data_complete(verified_data, reference):
                    return verified_data, errors, url
                else:
                    # Data is incomplete, save it as fallback and continue.
                    crossref_result = (verified_data, errors, url)
                    logger.debug("Enhanced Hybrid: CrossRef data incomplete, continuing with other APIs")
            if failure_type in ['throttled', 'timeout', 'server_error']:
                failed_apis.append(('crossref', self.crossref, failure_type))

        # Strategy 3: Try Semantic Scholar API (reliable, good coverage).
        if self.semantic_scholar:
            verified_data, errors, url, success, failure_type = self._try_api('semantic_scholar', self.semantic_scholar, reference)
            if success:
                return verified_data, errors, url
            # Only retry retryable failures (not 'not_found').
            if failure_type in ['throttled', 'timeout', 'server_error']:
                failed_apis.append(('semantic_scholar', self.semantic_scholar, failure_type))

        # Strategy 4: Try OpenAlex API (excellent reliability).
        openalex_result = None
        if self.openalex:
            verified_data, errors, url, success, failure_type = self._try_api('openalex', self.openalex, reference)
            if success:
                if self._is_data_complete(verified_data, reference):
                    return verified_data, errors, url
                else:
                    # Data is incomplete, save it as fallback and continue.
                    openalex_result = (verified_data, errors, url)
                    logger.debug("Enhanced Hybrid: OpenAlex data incomplete, continuing with other APIs")
            if failure_type in ['throttled', 'timeout', 'server_error']:
                failed_apis.append(('openalex', self.openalex, failure_type))

        # Strategy 5: Try OpenReview if URL suggests it's an OpenReview paper.
        if (self.openreview and
            hasattr(self.openreview, 'is_openreview_reference') and
            self.openreview.is_openreview_reference(reference)):
            logger.debug("Enhanced Hybrid: Trying OpenReview URL-based verification")
            verified_data, errors, url, success, failure_type = self._try_api('openreview', self.openreview, reference)
            if success:
                return verified_data, errors, url
            if failure_type in ['throttled', 'timeout', 'server_error']:
                failed_apis.append(('openreview', self.openreview, failure_type))

        # Strategy 5b: Try OpenReview by search if venue suggests it might be there.
        elif (self.openreview and
              hasattr(self.openreview, 'verify_reference_by_search')):
            # Fix: tolerate None venue/journal values (previously
            # None.lower() raised AttributeError).
            venue = (reference.get('venue') or reference.get('journal') or '').lower()
            openreview_venues = [
                'iclr', 'icml', 'neurips', 'nips', 'aaai', 'ijcai',
                'international conference on learning representations',
                'international conference on machine learning',
                'neural information processing systems'
            ]

            venue_suggests_openreview = any(or_venue in venue for or_venue in openreview_venues)
            logger.debug(f"Enhanced Hybrid: OpenReview venue check - venue: '{venue}', suggests: {venue_suggests_openreview}")

            if venue_suggests_openreview:
                logger.debug("Enhanced Hybrid: Trying OpenReview search-based verification")
                verified_data, errors, url, success, failure_type = self._try_openreview_search(reference)
                if success:
                    return verified_data, errors, url
                if failure_type in ['throttled', 'timeout', 'server_error']:
                    failed_apis.append(('openreview_search', self.openreview, failure_type))

        # Strategy 6: Try CrossRef if we haven't already (for non-DOI references).
        if not self._should_try_doi_apis_first(reference) and self.crossref:
            verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
            if success:
                if self._is_data_complete(verified_data, reference):
                    return verified_data, errors, url
                else:
                    # Data is incomplete, save it as fallback.
                    if not crossref_result:  # Only save if we don't already have one.
                        crossref_result = (verified_data, errors, url)
                    logger.debug("Enhanced Hybrid: CrossRef data incomplete (non-DOI), continuing with other APIs")
            if failure_type in ['throttled', 'timeout', 'server_error']:
                failed_apis.append(('crossref', self.crossref, failure_type))

        # PHASE 2: If no API succeeded in Phase 1, retry failed APIs.
        if failed_apis:
            logger.debug(f"Enhanced Hybrid: Phase 1 complete, no success. Retrying {len(failed_apis)} failed APIs")

            # Retry other APIs first, then Semantic Scholar (which gets
            # additional, more aggressive retries below).
            semantic_scholar_retries = [api for api in failed_apis if api[0] == 'semantic_scholar']
            other_retries = [api for api in failed_apis if api[0] != 'semantic_scholar']
            retry_order = other_retries + semantic_scholar_retries

            for api_name, api_instance, failure_type in retry_order:
                # Use base delay for first retry of each API.
                delay = min(self.retry_base_delay, self.max_retry_delay)

                # Add jitter to prevent thundering herd (±25% randomization).
                jitter = delay * 0.25 * (2 * random.random() - 1)
                final_delay = max(0.5, delay + jitter)

                logger.debug(f"Enhanced Hybrid: Waiting {final_delay:.1f}s before retrying {api_name} after {failure_type} failure")
                time.sleep(final_delay)

                logger.debug(f"Enhanced Hybrid: Retrying {api_name}")
                verified_data, errors, url, success, _ = self._try_api(api_name, api_instance, reference, is_retry=True)
                if success:
                    logger.debug(f"Enhanced Hybrid: {api_name} succeeded on retry after {failure_type} (delay: {final_delay:.1f}s)")
                    return verified_data, errors, url

                # For Semantic Scholar, try additional retries with increasing delays.
                if api_name == 'semantic_scholar' and not success:
                    for retry_attempt in range(2):  # Additional 2 retries.
                        retry_delay = delay * (self.retry_backoff_factor ** (retry_attempt + 1))
                        retry_delay = min(retry_delay, self.max_retry_delay)
                        retry_jitter = retry_delay * 0.25 * (2 * random.random() - 1)
                        final_retry_delay = max(1.0, retry_delay + retry_jitter)

                        logger.debug(f"Enhanced Hybrid: Additional Semantic Scholar retry {retry_attempt + 2} after {final_retry_delay:.1f}s")
                        time.sleep(final_retry_delay)

                        verified_data, errors, url, success, _ = self._try_api(api_name, api_instance, reference, is_retry=True)
                        if success:
                            logger.debug(f"Enhanced Hybrid: {api_name} succeeded on retry {retry_attempt + 2} (delay: {final_retry_delay:.1f}s)")
                            return verified_data, errors, url

        # PHASE 3: Use best available incomplete data as fallback.
        incomplete_results = [r for r in [crossref_result, openalex_result] if r is not None]
        if incomplete_results:
            # Prefer CrossRef over OpenAlex for incomplete data (usually more reliable).
            best_incomplete = crossref_result if crossref_result else openalex_result
            logger.debug("Enhanced Hybrid: No complete data found, using incomplete data as fallback")
            return best_incomplete

        # If all APIs failed, return unverified.
        failed_count = len(failed_apis)
        total_attempted = (1 if self.local_db else 0) + (1 if self.semantic_scholar else 0) + (1 if self.openalex else 0) + (1 if self.crossref else 0)

        if failed_count > 0:
            logger.debug(f"Enhanced Hybrid: All {total_attempted} APIs failed to verify reference ({failed_count} retried)")
        else:
            logger.debug("Enhanced Hybrid: All available APIs failed to verify reference")

        return None, [{
            'error_type': 'unverified',
            'error_details': 'Could not verify reference using any available API'
        }], None

    def _try_openreview_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str], bool, str]:
        """
        Try to verify reference using OpenReview search.

        Returns:
            Tuple of (verified_data, errors, url, success, failure_type);
            same failure_type values as _try_api.
        """
        if not self.openreview:
            return None, [], None, False, 'none'

        start_time = time.time()
        failure_type = 'none'

        try:
            verified_data, errors, url = self.openreview.verify_reference_by_search(reference)
            duration = time.time() - start_time

            # Consider it successful if we found data or verification errors.
            success = verified_data is not None or len(errors) > 0
            self._update_api_stats('openreview', success, duration)

            if success:
                logger.debug(f"Enhanced Hybrid: OpenReview search successful in {duration:.2f}s, URL: {url}")
                return verified_data, errors, url, True, 'none'
            else:
                logger.debug(f"Enhanced Hybrid: OpenReview search found no results in {duration:.2f}s")
                return None, [], None, False, 'not_found'

        except requests.exceptions.Timeout as e:
            duration = time.time() - start_time
            self._update_api_stats('openreview', False, duration)
            failure_type = 'timeout'
            logger.debug(f"Enhanced Hybrid: OpenReview search timed out in {duration:.2f}s: {e}")
            return None, [], None, False, failure_type

        except requests.exceptions.RequestException as e:
            duration = time.time() - start_time
            self._update_api_stats('openreview', False, duration)

            # Check if it's a rate limiting or server error.
            if hasattr(e, 'response') and e.response is not None:
                if e.response.status_code in [429, 503]:
                    failure_type = 'throttled'
                elif e.response.status_code >= 500:
                    failure_type = 'server_error'
                else:
                    failure_type = 'other'
            else:
                failure_type = 'other'

            logger.debug(f"Enhanced Hybrid: OpenReview search failed in {duration:.2f}s: {type(e).__name__}: {e}")
            return None, [], None, False, failure_type

        except Exception as e:
            duration = time.time() - start_time
            self._update_api_stats('openreview', False, duration)
            failure_type = 'other'
            logger.debug(f"Enhanced Hybrid: OpenReview search error in {duration:.2f}s: {type(e).__name__}: {e}")
            return None, [], None, False, failure_type

    def get_performance_stats(self) -> Dict[str, Any]:
        """
        Get performance statistics for all APIs.

        Returns:
            Dictionary keyed by API name with success_rate, total_calls,
            avg_time, success_count and failure_count (zeros for unused APIs).
        """
        stats = {}
        for api_name, api_stats in self.api_stats.items():
            total_calls = api_stats['success'] + api_stats['failure']
            if total_calls > 0:
                success_rate = api_stats['success'] / total_calls
                stats[api_name] = {
                    'success_rate': success_rate,
                    'total_calls': total_calls,
                    'avg_time': api_stats['avg_time'],
                    'success_count': api_stats['success'],
                    'failure_count': api_stats['failure']
                }
            else:
                stats[api_name] = {
                    'success_rate': 0,
                    'total_calls': 0,
                    'avg_time': 0,
                    'success_count': 0,
                    'failure_count': 0
                }
        return stats

    def log_performance_summary(self):
        """Log a summary of API performance statistics (only if debug mode is enabled)."""
        if not self.debug_mode:
            return

        stats = self.get_performance_stats()
        logger.info("Enhanced Hybrid API Performance Summary:")
        for api_name, api_stats in stats.items():
            if api_stats['total_calls'] > 0:
                logger.info(f"  {api_name}: {api_stats['success_rate']:.2%} success rate, "
                            f"{api_stats['total_calls']} calls, {api_stats['avg_time']:.2f}s avg")
            else:
                logger.info(f"  {api_name}: not used")

    def normalize_paper_title(self, title: str) -> str:
        """
        Normalize paper title for comparison (delegates to Semantic Scholar checker).
        """
        if self.semantic_scholar:
            return self.semantic_scholar.normalize_paper_title(title)
        else:
            # Use the centralized normalization function from text_utils.
            from refchecker.utils.text_utils import normalize_paper_title as normalize_title
            return normalize_title(title)

    def compare_authors(self, cited_authors: List[str], correct_authors: List[Any]) -> Tuple[bool, str]:
        """
        Compare author lists (delegates to shared utility).
        """
        from refchecker.utils.text_utils import compare_authors
        return compare_authors(cited_authors, correct_authors)
561
+
562
# Legacy alias: older code imported HybridReferenceChecker; keep it pointing
# at the enhanced implementation.
HybridReferenceChecker = EnhancedHybridReferenceChecker