academic-refchecker 2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_refchecker-2.0.7.dist-info/METADATA +738 -0
- academic_refchecker-2.0.7.dist-info/RECORD +64 -0
- academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
- academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
- academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
- academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +64 -0
- backend/concurrency.py +100 -0
- backend/database.py +711 -0
- backend/main.py +1367 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/static/assets/index-2P6L_39v.css +1 -0
- backend/static/assets/index-hk21nqxR.js +25 -0
- backend/static/favicon.svg +6 -0
- backend/static/index.html +15 -0
- backend/static/vite.svg +1 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +3 -0
- refchecker/checkers/__init__.py +17 -0
- refchecker/checkers/crossref.py +541 -0
- refchecker/checkers/enhanced_hybrid_checker.py +563 -0
- refchecker/checkers/github_checker.py +326 -0
- refchecker/checkers/local_semantic_scholar.py +540 -0
- refchecker/checkers/openalex.py +513 -0
- refchecker/checkers/openreview_checker.py +984 -0
- refchecker/checkers/pdf_paper_checker.py +493 -0
- refchecker/checkers/semantic_scholar.py +764 -0
- refchecker/checkers/webpage_checker.py +938 -0
- refchecker/config/__init__.py +1 -0
- refchecker/config/logging.conf +36 -0
- refchecker/config/settings.py +170 -0
- refchecker/core/__init__.py +7 -0
- refchecker/core/db_connection_pool.py +141 -0
- refchecker/core/parallel_processor.py +415 -0
- refchecker/core/refchecker.py +5838 -0
- refchecker/database/__init__.py +6 -0
- refchecker/database/download_semantic_scholar_db.py +1725 -0
- refchecker/llm/__init__.py +0 -0
- refchecker/llm/base.py +376 -0
- refchecker/llm/providers.py +911 -0
- refchecker/scripts/__init__.py +1 -0
- refchecker/scripts/start_vllm_server.py +121 -0
- refchecker/services/__init__.py +8 -0
- refchecker/services/pdf_processor.py +268 -0
- refchecker/utils/__init__.py +27 -0
- refchecker/utils/arxiv_utils.py +462 -0
- refchecker/utils/author_utils.py +179 -0
- refchecker/utils/biblatex_parser.py +584 -0
- refchecker/utils/bibliography_utils.py +332 -0
- refchecker/utils/bibtex_parser.py +411 -0
- refchecker/utils/config_validator.py +262 -0
- refchecker/utils/db_utils.py +210 -0
- refchecker/utils/doi_utils.py +190 -0
- refchecker/utils/error_utils.py +482 -0
- refchecker/utils/mock_objects.py +211 -0
- refchecker/utils/text_utils.py +5057 -0
- refchecker/utils/unicode_utils.py +335 -0
- refchecker/utils/url_utils.py +307 -0
|
@@ -0,0 +1,563 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Enhanced Hybrid Reference Checker with Multiple API Sources
|
|
4
|
+
|
|
5
|
+
This module provides an improved hybrid reference checker that intelligently combines
|
|
6
|
+
multiple API sources for optimal reliability and performance. It replaces Google Scholar
|
|
7
|
+
with more reliable alternatives while maintaining backward compatibility.
|
|
8
|
+
|
|
9
|
+
New API Integration Priority:
|
|
10
|
+
1. Local Semantic Scholar Database (fastest, offline)
|
|
11
|
+
2. Semantic Scholar API (reliable, good coverage)
|
|
12
|
+
3. OpenAlex API (excellent reliability, replaces Google Scholar)
|
|
13
|
+
4. CrossRef API (best for DOI-based verification)
|
|
14
|
+
5. OpenReview (OpenReview-hosted papers; Google Scholar support has been removed)
|
|
15
|
+
|
|
16
|
+
Usage:
|
|
17
|
+
from enhanced_hybrid_checker import EnhancedHybridReferenceChecker
|
|
18
|
+
|
|
19
|
+
checker = EnhancedHybridReferenceChecker(
|
|
20
|
+
semantic_scholar_api_key="your_key",
|
|
21
|
+
db_path="path/to/db.sqlite",
|
|
22
|
+
contact_email="your@email.com"
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
verified_data, errors, url = checker.verify_reference(reference)
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
import logging
|
|
29
|
+
import random
|
|
30
|
+
import requests
|
|
31
|
+
import time
|
|
32
|
+
from typing import Dict, List, Tuple, Optional, Any
|
|
33
|
+
|
|
34
|
+
logger = logging.getLogger(__name__)
|
|
35
|
+
|
|
36
|
+
class EnhancedHybridReferenceChecker:
|
|
37
|
+
"""
|
|
38
|
+
Enhanced hybrid reference checker with multiple API sources for improved reliability
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
def __init__(self, semantic_scholar_api_key: Optional[str] = None,
             db_path: Optional[str] = None,
             contact_email: Optional[str] = None,
             enable_openalex: bool = True,
             enable_crossref: bool = True,
             debug_mode: bool = False):
    """
    Initialize the enhanced hybrid reference checker

    Each backend checker is constructed independently and failures are
    tolerated: a backend that cannot be initialized is left as None and is
    simply skipped during verification.

    Args:
        semantic_scholar_api_key: Optional API key for Semantic Scholar
        db_path: Optional path to local Semantic Scholar database
        contact_email: Email for polite pool access to APIs
        enable_openalex: Whether to use OpenAlex API
        enable_crossref: Whether to use CrossRef API
        debug_mode: Whether to enable debug logging
    """
    self.contact_email = contact_email
    self.debug_mode = debug_mode

    # Initialize local database checker if available
    self.local_db = None
    if db_path:
        try:
            from .local_semantic_scholar import LocalNonArxivReferenceChecker
            self.local_db = LocalNonArxivReferenceChecker(db_path=db_path)
            logger.debug(f"Enhanced Hybrid: Local database enabled at {db_path}")
        except Exception as e:
            logger.warning(f"Enhanced Hybrid: Failed to initialize local database: {e}")
            self.local_db = None

    # Initialize Semantic Scholar API.
    # Note: failure here is logged at error level (unlike the other
    # backends, which warn) — Semantic Scholar is the primary online source.
    try:
        from .semantic_scholar import NonArxivReferenceChecker
        self.semantic_scholar = NonArxivReferenceChecker(api_key=semantic_scholar_api_key)
        logger.debug("Enhanced Hybrid: Semantic Scholar API initialized")
    except Exception as e:
        logger.error(f"Enhanced Hybrid: Failed to initialize Semantic Scholar: {e}")
        self.semantic_scholar = None

    # Initialize OpenAlex API
    self.openalex = None
    if enable_openalex:
        try:
            from .openalex import OpenAlexReferenceChecker
            self.openalex = OpenAlexReferenceChecker(email=contact_email)
            logger.debug("Enhanced Hybrid: OpenAlex API initialized")
        except Exception as e:
            logger.warning(f"Enhanced Hybrid: Failed to initialize OpenAlex: {e}")

    # Initialize CrossRef API
    self.crossref = None
    if enable_crossref:
        try:
            from .crossref import CrossRefReferenceChecker
            self.crossref = CrossRefReferenceChecker(email=contact_email)
            logger.debug("Enhanced Hybrid: CrossRef API initialized")
        except Exception as e:
            logger.warning(f"Enhanced Hybrid: Failed to initialize CrossRef: {e}")

    # Initialize OpenReview checker (always attempted; no enable flag)
    self.openreview = None
    try:
        from .openreview_checker import OpenReviewReferenceChecker
        self.openreview = OpenReviewReferenceChecker()
        logger.debug("Enhanced Hybrid: OpenReview checker initialized")
    except Exception as e:
        logger.warning(f"Enhanced Hybrid: Failed to initialize OpenReview: {e}")
        self.openreview = None

    # Google Scholar removed - using more reliable APIs only

    # Track API performance for adaptive selection.
    # Per-API counters consumed by _update_api_stats / get_performance_stats.
    self.api_stats = {
        'local_db': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
        'semantic_scholar': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
        'openalex': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
        'crossref': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0},
        'openreview': {'success': 0, 'failure': 0, 'avg_time': 0, 'throttled': 0}
    }

    # Track failed API calls for retry logic - OPTIMIZED CONFIGURATION
    self.retry_base_delay = 1  # Base delay for retrying throttled APIs (seconds)
    self.retry_backoff_factor = 1.5  # Exponential backoff multiplier
    self.max_retry_delay = 20  # Maximum delay cap in seconds
def _update_api_stats(self, api_name: str, success: bool, duration: float):
|
|
128
|
+
"""Update API performance statistics"""
|
|
129
|
+
if api_name in self.api_stats:
|
|
130
|
+
stats = self.api_stats[api_name]
|
|
131
|
+
if success:
|
|
132
|
+
stats['success'] += 1
|
|
133
|
+
else:
|
|
134
|
+
stats['failure'] += 1
|
|
135
|
+
|
|
136
|
+
# Update average time (simple moving average)
|
|
137
|
+
total_calls = stats['success'] + stats['failure']
|
|
138
|
+
stats['avg_time'] = ((stats['avg_time'] * (total_calls - 1)) + duration) / total_calls
|
|
139
|
+
|
|
140
|
+
def _try_api(self, api_name: str, api_instance: Any, reference: Dict[str, Any], is_retry: bool = False) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str], bool, str]:
    """
    Try to verify reference with a specific API and track performance

    Args:
        api_name: Key into self.api_stats used for bookkeeping and logging
        api_instance: Checker object exposing verify_reference(reference)
        reference: Reference data dictionary
        is_retry: True when this is a Phase-2 retry (affects logging only)

    Returns:
        Tuple of (verified_data, errors, url, success, failure_type)
        failure_type can be: 'none', 'not_found', 'throttled', 'timeout', 'other'
        ('server_error' is also produced for HTTP 5xx-style failures)
    """
    if not api_instance:
        return None, [], None, False, 'none'

    start_time = time.time()
    failure_type = 'none'

    try:
        verified_data, errors, url = api_instance.verify_reference(reference)
        duration = time.time() - start_time

        # Check if we got API failure errors indicating retryable failure
        api_failure_errors = [err for err in errors if err.get('error_type') == 'api_failure']
        if api_failure_errors:
            # This is a retryable API failure, not a verification result
            self._update_api_stats(api_name, False, duration)
            logger.debug(f"Enhanced Hybrid: {api_name} API failed in {duration:.2f}s: {api_failure_errors[0].get('error_details', 'unknown')}")
            return None, [], None, False, 'throttled'  # Treat API failures as throttling for retry logic

        # Consider it successful if we found data or verification errors (i.e., we could verify something)
        success = verified_data is not None or len(errors) > 0
        self._update_api_stats(api_name, success, duration)

        if success:
            retry_info = " (retry)" if is_retry else ""
            logger.debug(f"Enhanced Hybrid: {api_name} successful in {duration:.2f}s{retry_info}, URL: {url}")
            return verified_data, errors, url, True, 'none'
        else:
            logger.debug(f"Enhanced Hybrid: {api_name} found no results in {duration:.2f}s")
            return None, [], None, False, 'not_found'

    except requests.exceptions.Timeout as e:
        duration = time.time() - start_time
        self._update_api_stats(api_name, False, duration)
        failure_type = 'timeout'
        logger.debug(f"Enhanced Hybrid: {api_name} timed out in {duration:.2f}s: {e}")
        return None, [], None, False, failure_type

    except requests.exceptions.RequestException as e:
        duration = time.time() - start_time
        self._update_api_stats(api_name, False, duration)

        # Check if it's a rate limiting or server error that should be retried
        error_str = str(e).lower()
        status_code = getattr(e.response, 'status_code', None) if hasattr(e, 'response') and e.response else None

        if (status_code == 429) or "429" in str(e) or "rate limit" in error_str:
            failure_type = 'throttled'
            # NOTE(review): direct index into api_stats — verify_reference also
            # calls _try_api with api_name 'openreview_search', which is not an
            # api_stats key and would raise KeyError here. TODO confirm.
            self.api_stats[api_name]['throttled'] += 1
            logger.debug(f"Enhanced Hybrid: {api_name} rate limited in {duration:.2f}s: {e}")
        elif (status_code and status_code >= 500) or "500" in str(e) or "502" in str(e) or "503" in str(e) or "server error" in error_str or "service unavailable" in error_str:
            failure_type = 'server_error'
            logger.debug(f"Enhanced Hybrid: {api_name} server error in {duration:.2f}s: {e}")
        else:
            failure_type = 'other'
            logger.debug(f"Enhanced Hybrid: {api_name} failed in {duration:.2f}s: {e}")
        return None, [], None, False, failure_type

    except Exception as e:
        # Catch-all: any non-requests error is a generic non-retryable failure.
        duration = time.time() - start_time
        self._update_api_stats(api_name, False, duration)
        failure_type = 'other'
        logger.debug(f"Enhanced Hybrid: {api_name} failed in {duration:.2f}s: {e}")
        return None, [], None, False, failure_type
def _should_try_doi_apis_first(self, reference: Dict[str, Any]) -> bool:
|
|
213
|
+
"""
|
|
214
|
+
Determine if we should prioritize DOI-based APIs (CrossRef) for this reference
|
|
215
|
+
"""
|
|
216
|
+
# Check if reference has DOI information
|
|
217
|
+
has_doi = (reference.get('doi') or
|
|
218
|
+
(reference.get('url') and ('doi.org' in reference['url'] or 'doi:' in reference['url'])) or
|
|
219
|
+
(reference.get('raw_text') and ('doi' in reference['raw_text'].lower())))
|
|
220
|
+
return has_doi
|
|
221
|
+
|
|
222
|
+
def _is_data_complete(self, verified_data: Dict[str, Any], reference: Dict[str, Any]) -> bool:
|
|
223
|
+
"""
|
|
224
|
+
Check if the verified data is sufficiently complete for the reference verification
|
|
225
|
+
|
|
226
|
+
Args:
|
|
227
|
+
verified_data: Paper data returned by API
|
|
228
|
+
reference: Original reference data
|
|
229
|
+
|
|
230
|
+
Returns:
|
|
231
|
+
True if data is complete enough to use, False if incomplete
|
|
232
|
+
"""
|
|
233
|
+
if not verified_data:
|
|
234
|
+
return False
|
|
235
|
+
|
|
236
|
+
# If the reference has authors, the verified data should also have authors
|
|
237
|
+
cited_authors = reference.get('authors', [])
|
|
238
|
+
found_authors = verified_data.get('authors', [])
|
|
239
|
+
|
|
240
|
+
# If we cited authors but found none, the data is incomplete
|
|
241
|
+
if cited_authors and not found_authors:
|
|
242
|
+
logger.debug(f"Enhanced Hybrid: Data incomplete - cited authors {cited_authors} but found none")
|
|
243
|
+
return False
|
|
244
|
+
|
|
245
|
+
return True
|
|
246
|
+
|
|
247
|
+
def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
    """
    Verify a non-arXiv reference using multiple APIs in priority order

    First tries all APIs once, then retries failed APIs if no success.

    Phases:
      1. Try each available backend once, in priority order (local DB,
         CrossRef-if-DOI, Semantic Scholar, OpenAlex, OpenReview,
         CrossRef-if-no-DOI). A backend that succeeds with complete data
         short-circuits the whole method.
      2. Retry backends that failed with a retryable failure type
         ('throttled', 'timeout', 'server_error'), Semantic Scholar last
         and with extra backoff attempts.
      3. Fall back to any incomplete data saved in Phase 1, preferring
         CrossRef over OpenAlex; otherwise return an 'unverified' error.

    Args:
        reference: Reference data dictionary

    Returns:
        Tuple of (verified_data, errors, url)
    """
    # Check if this is a URL-only reference (should skip verification)
    authors = reference.get('authors', [])
    if authors and "URL Reference" in authors:
        # Skip verification for URL references - they're just links, not papers
        logger.debug("Enhanced Hybrid: Skipping verification for URL reference")
        return None, [], reference.get('cited_url') or reference.get('url')

    # Also check if it looks like a URL-only reference (no title, just URL)
    title = reference.get('title', '').strip()
    cited_url = reference.get('cited_url') or reference.get('url')
    if not title and cited_url:
        # This is a URL-only reference without a title
        logger.debug(f"Enhanced Hybrid: Skipping verification for URL-only reference: {cited_url}")
        return None, [], cited_url

    # Track all APIs that failed and could be retried
    failed_apis = []

    # PHASE 1: Try all APIs once in priority order

    # Strategy 1: Always try local database first (fastest)
    if self.local_db:
        verified_data, errors, url, success, failure_type = self._try_api('local_db', self.local_db, reference)
        if success:
            return verified_data, errors, url
        if failure_type in ['throttled', 'timeout', 'server_error']:
            failed_apis.append(('local_db', self.local_db, failure_type))

    # Strategy 2: If reference has DOI, prioritize CrossRef
    crossref_result = None
    if self._should_try_doi_apis_first(reference) and self.crossref:
        verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
        if success:
            # Check if the data is complete enough to use
            if self._is_data_complete(verified_data, reference):
                return verified_data, errors, url
            else:
                # Data is incomplete, save it as fallback and continue with other APIs
                crossref_result = (verified_data, errors, url)
                logger.debug("Enhanced Hybrid: CrossRef data incomplete, continuing with other APIs")
        if failure_type in ['throttled', 'timeout', 'server_error']:
            failed_apis.append(('crossref', self.crossref, failure_type))

    # Strategy 3: Try Semantic Scholar API (reliable, good coverage)
    if self.semantic_scholar:
        verified_data, errors, url, success, failure_type = self._try_api('semantic_scholar', self.semantic_scholar, reference)
        if success:
            return verified_data, errors, url
        # For Semantic Scholar, only retry retryable failures (not 'not_found')
        if failure_type in ['throttled', 'timeout', 'server_error']:
            failed_apis.append(('semantic_scholar', self.semantic_scholar, failure_type))

    # Strategy 4: Try OpenAlex API (excellent reliability, replaces Google Scholar)
    openalex_result = None
    if self.openalex:
        verified_data, errors, url, success, failure_type = self._try_api('openalex', self.openalex, reference)
        if success:
            # Check if the data is complete enough to use
            if self._is_data_complete(verified_data, reference):
                return verified_data, errors, url
            else:
                # Data is incomplete, save it as fallback and continue with other APIs
                openalex_result = (verified_data, errors, url)
                logger.debug("Enhanced Hybrid: OpenAlex data incomplete, continuing with other APIs")
        if failure_type in ['throttled', 'timeout', 'server_error']:
            failed_apis.append(('openalex', self.openalex, failure_type))

    # Strategy 5: Try OpenReview if URL suggests it's an OpenReview paper
    if (self.openreview and
        hasattr(self.openreview, 'is_openreview_reference') and
        self.openreview.is_openreview_reference(reference)):
        logger.debug("Enhanced Hybrid: Trying OpenReview URL-based verification")
        verified_data, errors, url, success, failure_type = self._try_api('openreview', self.openreview, reference)
        if success:
            return verified_data, errors, url
        if failure_type in ['throttled', 'timeout', 'server_error']:
            failed_apis.append(('openreview', self.openreview, failure_type))

    # Strategy 5b: Try OpenReview by search if venue suggests it might be there
    elif (self.openreview and
          hasattr(self.openreview, 'verify_reference_by_search')):
        # Check if venue suggests this might be on OpenReview
        venue = reference.get('venue', reference.get('journal', '')).lower()
        openreview_venues = [
            'iclr', 'icml', 'neurips', 'nips', 'aaai', 'ijcai',
            'international conference on learning representations',
            'international conference on machine learning',
            'neural information processing systems'
        ]

        venue_suggests_openreview = any(or_venue in venue for or_venue in openreview_venues)
        logger.debug(f"Enhanced Hybrid: OpenReview venue check - venue: '{venue}', suggests: {venue_suggests_openreview}")

        if venue_suggests_openreview:
            logger.debug("Enhanced Hybrid: Trying OpenReview search-based verification")
            verified_data, errors, url, success, failure_type = self._try_openreview_search(reference)
            if success:
                return verified_data, errors, url
            if failure_type in ['throttled', 'timeout', 'server_error']:
                # NOTE(review): retries below go through _try_api, which calls
                # verify_reference (not verify_reference_by_search), and
                # 'openreview_search' is not an api_stats key — confirm intent.
                failed_apis.append(('openreview_search', self.openreview, failure_type))

    # Strategy 6: Try CrossRef if we haven't already (for non-DOI references)
    if not self._should_try_doi_apis_first(reference) and self.crossref:
        verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
        if success:
            # Check if the data is complete enough to use
            if self._is_data_complete(verified_data, reference):
                return verified_data, errors, url
            else:
                # Data is incomplete, save it as fallback
                if not crossref_result:  # Only save if we don't already have one
                    crossref_result = (verified_data, errors, url)
                logger.debug("Enhanced Hybrid: CrossRef data incomplete (non-DOI), continuing with other APIs")
        if failure_type in ['throttled', 'timeout', 'server_error']:
            failed_apis.append(('crossref', self.crossref, failure_type))

    # PHASE 2: If no API succeeded in Phase 1, retry failed APIs
    if failed_apis:
        logger.debug(f"Enhanced Hybrid: Phase 1 complete, no success. Retrying {len(failed_apis)} failed APIs")

        # Sort failed APIs to prioritize Semantic Scholar retries
        semantic_scholar_retries = [api for api in failed_apis if api[0] == 'semantic_scholar']
        other_retries = [api for api in failed_apis if api[0] != 'semantic_scholar']

        # Try other APIs first, then Semantic Scholar with more aggressive retries
        retry_order = other_retries + semantic_scholar_retries

        for api_name, api_instance, failure_type in retry_order:
            # Use base delay for first retry of each API
            delay = min(self.retry_base_delay, self.max_retry_delay)

            # Add jitter to prevent thundering herd (±25% randomization)
            jitter = delay * 0.25 * (2 * random.random() - 1)
            final_delay = max(0.5, delay + jitter)

            logger.debug(f"Enhanced Hybrid: Waiting {final_delay:.1f}s before retrying {api_name} after {failure_type} failure")
            time.sleep(final_delay)

            logger.debug(f"Enhanced Hybrid: Retrying {api_name}")
            verified_data, errors, url, success, _ = self._try_api(api_name, api_instance, reference, is_retry=True)
            if success:
                logger.debug(f"Enhanced Hybrid: {api_name} succeeded on retry after {failure_type} (delay: {final_delay:.1f}s)")
                return verified_data, errors, url

            # For Semantic Scholar, try additional retries with increasing delays
            if api_name == 'semantic_scholar' and not success:
                for retry_attempt in range(2):  # Additional 2 retries for Semantic Scholar
                    retry_delay = delay * (self.retry_backoff_factor ** (retry_attempt + 1))
                    retry_delay = min(retry_delay, self.max_retry_delay)
                    retry_jitter = retry_delay * 0.25 * (2 * random.random() - 1)
                    final_retry_delay = max(1.0, retry_delay + retry_jitter)

                    logger.debug(f"Enhanced Hybrid: Additional Semantic Scholar retry {retry_attempt + 2} after {final_retry_delay:.1f}s")
                    time.sleep(final_retry_delay)

                    verified_data, errors, url, success, _ = self._try_api(api_name, api_instance, reference, is_retry=True)
                    if success:
                        logger.debug(f"Enhanced Hybrid: {api_name} succeeded on retry {retry_attempt + 2} (delay: {final_retry_delay:.1f}s)")
                        return verified_data, errors, url

    # PHASE 3: If all APIs failed or returned incomplete data, use best available incomplete data as fallback
    incomplete_results = [r for r in [crossref_result, openalex_result] if r is not None]
    if incomplete_results:
        # Prefer CrossRef over OpenAlex for incomplete data (usually more reliable)
        best_incomplete = crossref_result if crossref_result else openalex_result
        logger.debug("Enhanced Hybrid: No complete data found, using incomplete data as fallback")
        return best_incomplete

    # If all APIs failed, return unverified
    failed_count = len(failed_apis)
    total_attempted = (1 if self.local_db else 0) + (1 if self.semantic_scholar else 0) + (1 if self.openalex else 0) + (1 if self.crossref else 0)

    if failed_count > 0:
        logger.debug(f"Enhanced Hybrid: All {total_attempted} APIs failed to verify reference ({failed_count} retried)")
    else:
        logger.debug("Enhanced Hybrid: All available APIs failed to verify reference")

    return None, [{
        'error_type': 'unverified',
        'error_details': 'Could not verify reference using any available API'
    }], None
def _try_openreview_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str], bool, str]:
    """
    Try to verify reference using OpenReview search

    Mirrors _try_api but calls verify_reference_by_search on the OpenReview
    checker and always records stats under the 'openreview' key.

    Args:
        reference: Reference data dictionary

    Returns:
        Tuple of (verified_data, errors, url, success, failure_type)
    """
    if not self.openreview:
        return None, [], None, False, 'none'

    start_time = time.time()
    failure_type = 'none'

    try:
        verified_data, errors, url = self.openreview.verify_reference_by_search(reference)
        duration = time.time() - start_time

        # Consider it successful if we found data or verification errors
        success = verified_data is not None or len(errors) > 0
        self._update_api_stats('openreview', success, duration)

        if success:
            logger.debug(f"Enhanced Hybrid: OpenReview search successful in {duration:.2f}s, URL: {url}")
            return verified_data, errors, url, True, 'none'
        else:
            logger.debug(f"Enhanced Hybrid: OpenReview search found no results in {duration:.2f}s")
            return None, [], None, False, 'not_found'

    except requests.exceptions.Timeout as e:
        duration = time.time() - start_time
        self._update_api_stats('openreview', False, duration)
        failure_type = 'timeout'
        logger.debug(f"Enhanced Hybrid: OpenReview search timed out in {duration:.2f}s: {e}")
        return None, [], None, False, failure_type

    except requests.exceptions.RequestException as e:
        duration = time.time() - start_time
        self._update_api_stats('openreview', False, duration)

        # Check if it's a rate limiting error.
        # 429/503 count as throttled; other 5xx as server_error; rest 'other'.
        if hasattr(e, 'response') and e.response is not None:
            if e.response.status_code in [429, 503]:
                failure_type = 'throttled'
            elif e.response.status_code >= 500:
                failure_type = 'server_error'
            else:
                failure_type = 'other'
        else:
            failure_type = 'other'

        logger.debug(f"Enhanced Hybrid: OpenReview search failed in {duration:.2f}s: {type(e).__name__}: {e}")
        return None, [], None, False, failure_type

    except Exception as e:
        # Catch-all for non-network errors from the OpenReview checker.
        duration = time.time() - start_time
        self._update_api_stats('openreview', False, duration)
        failure_type = 'other'
        logger.debug(f"Enhanced Hybrid: OpenReview search error in {duration:.2f}s: {type(e).__name__}: {e}")
        return None, [], None, False, failure_type
def get_performance_stats(self) -> Dict[str, Any]:
    """
    Get performance statistics for all APIs

    Returns:
        Dictionary keyed by API name; each value holds success_rate,
        total_calls, avg_time, success_count and failure_count. APIs that
        were never called report all-zero entries.
    """
    summary = {}
    for name, raw in self.api_stats.items():
        calls = raw['success'] + raw['failure']
        if calls == 0:
            # Never called: report an all-zero entry.
            summary[name] = {
                'success_rate': 0,
                'total_calls': 0,
                'avg_time': 0,
                'success_count': 0,
                'failure_count': 0
            }
            continue
        summary[name] = {
            'success_rate': raw['success'] / calls,
            'total_calls': calls,
            'avg_time': raw['avg_time'],
            'success_count': raw['success'],
            'failure_count': raw['failure']
        }
    return summary
def log_performance_summary(self):
    """Log a summary of API performance statistics (only if debug mode is enabled)"""
    if not self.debug_mode:
        return

    logger.info("Enhanced Hybrid API Performance Summary:")
    for name, entry in self.get_performance_stats().items():
        if entry['total_calls'] == 0:
            logger.info(f" {name}: not used")
            continue
        logger.info(f" {name}: {entry['success_rate']:.2%} success rate, "
                    f"{entry['total_calls']} calls, {entry['avg_time']:.2f}s avg")
def normalize_paper_title(self, title: str) -> str:
    """
    Normalize paper title for comparison (delegates to Semantic Scholar checker)

    Falls back to the centralized normalization helper in text_utils when no
    Semantic Scholar checker is available.
    """
    if not self.semantic_scholar:
        # Use the centralized normalization function from text_utils
        from refchecker.utils.text_utils import normalize_paper_title as normalize_title
        return normalize_title(title)
    return self.semantic_scholar.normalize_paper_title(title)
def compare_authors(self, cited_authors: List[str], correct_authors: List[Any]) -> Tuple[bool, str]:
    """
    Compare author lists (delegates to shared utility)

    Args:
        cited_authors: Author names as they appear in the citation
        correct_authors: Authoritative author entries from the verified record

    Returns:
        Tuple of (match, message) as produced by
        refchecker.utils.text_utils.compare_authors
    """
    # Function-scope import — presumably to avoid a module-level import
    # cycle with the utils package; TODO confirm.
    from refchecker.utils.text_utils import compare_authors
    return compare_authors(cited_authors, correct_authors)
# Backward compatibility alias: keep the pre-"Enhanced" class name importable
# so existing `from ... import HybridReferenceChecker` call sites keep working.
HybridReferenceChecker = EnhancedHybridReferenceChecker