academic-refchecker 2.0.7 (academic_refchecker-2.0.7-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_refchecker-2.0.7.dist-info/METADATA +738 -0
- academic_refchecker-2.0.7.dist-info/RECORD +64 -0
- academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
- academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
- academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
- academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +64 -0
- backend/concurrency.py +100 -0
- backend/database.py +711 -0
- backend/main.py +1367 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/static/assets/index-2P6L_39v.css +1 -0
- backend/static/assets/index-hk21nqxR.js +25 -0
- backend/static/favicon.svg +6 -0
- backend/static/index.html +15 -0
- backend/static/vite.svg +1 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +3 -0
- refchecker/checkers/__init__.py +17 -0
- refchecker/checkers/crossref.py +541 -0
- refchecker/checkers/enhanced_hybrid_checker.py +563 -0
- refchecker/checkers/github_checker.py +326 -0
- refchecker/checkers/local_semantic_scholar.py +540 -0
- refchecker/checkers/openalex.py +513 -0
- refchecker/checkers/openreview_checker.py +984 -0
- refchecker/checkers/pdf_paper_checker.py +493 -0
- refchecker/checkers/semantic_scholar.py +764 -0
- refchecker/checkers/webpage_checker.py +938 -0
- refchecker/config/__init__.py +1 -0
- refchecker/config/logging.conf +36 -0
- refchecker/config/settings.py +170 -0
- refchecker/core/__init__.py +7 -0
- refchecker/core/db_connection_pool.py +141 -0
- refchecker/core/parallel_processor.py +415 -0
- refchecker/core/refchecker.py +5838 -0
- refchecker/database/__init__.py +6 -0
- refchecker/database/download_semantic_scholar_db.py +1725 -0
- refchecker/llm/__init__.py +0 -0
- refchecker/llm/base.py +376 -0
- refchecker/llm/providers.py +911 -0
- refchecker/scripts/__init__.py +1 -0
- refchecker/scripts/start_vllm_server.py +121 -0
- refchecker/services/__init__.py +8 -0
- refchecker/services/pdf_processor.py +268 -0
- refchecker/utils/__init__.py +27 -0
- refchecker/utils/arxiv_utils.py +462 -0
- refchecker/utils/author_utils.py +179 -0
- refchecker/utils/biblatex_parser.py +584 -0
- refchecker/utils/bibliography_utils.py +332 -0
- refchecker/utils/bibtex_parser.py +411 -0
- refchecker/utils/config_validator.py +262 -0
- refchecker/utils/db_utils.py +210 -0
- refchecker/utils/doi_utils.py +190 -0
- refchecker/utils/error_utils.py +482 -0
- refchecker/utils/mock_objects.py +211 -0
- refchecker/utils/text_utils.py +5057 -0
- refchecker/utils/unicode_utils.py +335 -0
- refchecker/utils/url_utils.py +307 -0
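
The largest addition is refchecker/checkers/semantic_scholar.py (+764), shown in full below. Going by the layout above, a minimal sketch of importing and driving the checker it defines from the installed package (the import path is inferred from the file list; refchecker/checkers/__init__.py may also re-export it):

    # Sketch only: import path inferred from the package layout above
    from refchecker.checkers.semantic_scholar import NonArxivReferenceChecker

    checker = NonArxivReferenceChecker()  # an API key is optional
    verified_data, errors, url = checker.verify_reference({
        'title': 'Attention is All You Need',
        'authors': ['Ashish Vaswani', 'Noam Shazeer'],
        'year': 2017,
    })
    print(url, errors)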
refchecker/checkers/semantic_scholar.py
@@ -0,0 +1,764 @@
#!/usr/bin/env python3
"""
Semantic Scholar API Client for Reference Verification

This module provides functionality to verify non-arXiv references using the Semantic Scholar API.
It can check if a reference's metadata (authors, year, title) matches what's in the Semantic Scholar database.

Usage:
    from semantic_scholar import NonArxivReferenceChecker

    # Initialize the checker
    checker = NonArxivReferenceChecker(api_key="your_api_key")  # API key is optional

    # Verify a reference
    reference = {
        'title': 'Title of the paper',
        'authors': ['Author 1', 'Author 2'],
        'year': 2020,
        'url': 'https://example.com/paper',
        'raw_text': 'Full citation text'
    }

    verified_data, errors, url = checker.verify_reference(reference)
"""

import requests
import time
import logging
import re
from typing import Dict, List, Tuple, Optional, Any, Union
from refchecker.utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search, strip_latex_commands, compare_titles_with_latex_cleaning
from refchecker.utils.error_utils import format_title_mismatch
from refchecker.config.settings import get_config

# Set up logging
logger = logging.getLogger(__name__)

# Get configuration
config = get_config()
SIMILARITY_THRESHOLD = config["text_processing"]["similarity_threshold"]

class NonArxivReferenceChecker:
    """
    A class to verify non-arXiv references using the Semantic Scholar API
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the Semantic Scholar API client

        Args:
            api_key: Optional API key for Semantic Scholar (increases rate limits)
        """
        self.base_url = "https://api.semanticscholar.org/graph/v1"
        self.headers = {
            "Accept": "application/json"
        }

        if api_key:
            self.headers["x-api-key"] = api_key

        # Rate limiting parameters
        self.request_delay = 1.0  # Initial delay between requests (seconds)
        self.max_retries = 5  # Sufficient for individual API calls
        self.backoff_factor = 2  # Exponential backoff factor

        # Track API failures for Enhanced Hybrid Checker
        self._api_failed = False
        self._failure_reason = None

    def search_paper(self, query: str, year: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Search for papers matching the query

        Args:
            query: Search query (title, authors, etc.)
            year: Publication year to filter by

        Returns:
            List of paper data dictionaries
        """
        endpoint = f"{self.base_url}/paper/search"

        # Build query parameters
        params = {
            "query": query,
            "limit": 10,
            "fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal",
            "sort": "relevance"  # Ensure consistent ordering
        }

        # Reduce retries for ArXiv ID searches to avoid unnecessary API calls when mismatch is likely
        max_retries_for_this_query = 2 if "arXiv:" in query else self.max_retries

        # Make the request with retries and backoff
        for attempt in range(max_retries_for_this_query):
            try:
                response = requests.get(endpoint, headers=self.headers, params=params, timeout=30)

                # Check for rate limiting
                if response.status_code == 429:
                    wait_time = self.request_delay * (self.backoff_factor ** attempt)
                    logger.debug(f"Rate limit exceeded. Increasing delay and retrying...")
                    time.sleep(wait_time)
                    continue

                # Check for other errors
                response.raise_for_status()

                # Parse the response
                data = response.json()
                return data.get('data', [])

            except requests.exceptions.RequestException as e:
                wait_time = self.request_delay * (self.backoff_factor ** attempt)
                logger.warning(f"Request failed: {str(e)}. Retrying in {wait_time:.2f} seconds...")
                time.sleep(wait_time)

        # If we get here, all retries failed
        logger.debug(f"Failed to search for paper after {self.max_retries} attempts")
        self._api_failed = True
        self._failure_reason = "rate_limited_or_timeout"
        return []

    def get_paper_by_doi(self, doi: str) -> Optional[Dict[str, Any]]:
        """
        Get paper data by DOI

        Args:
            doi: DOI of the paper

        Returns:
            Paper data dictionary or None if not found
        """
        endpoint = f"{self.base_url}/paper/DOI:{doi}"

        params = {
            "fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal"
        }

        # Make the request with retries and backoff
        for attempt in range(self.max_retries):
            try:
                response = requests.get(endpoint, headers=self.headers, params=params, timeout=30)

                # Check for rate limiting
                if response.status_code == 429:
                    wait_time = self.request_delay * (self.backoff_factor ** attempt)
                    logger.debug(f"Rate limit exceeded. Increasing delay and retrying...")
                    time.sleep(wait_time)
                    continue

                # If not found, return None
                if response.status_code == 404:
                    logger.debug(f"Paper with DOI {doi} not found")
                    return None

                # Check for other errors
                response.raise_for_status()

                # Parse the response
                return response.json()

            except requests.exceptions.RequestException as e:
                wait_time = self.request_delay * (self.backoff_factor ** attempt)
                logger.warning(f"Request failed: {str(e)}. Retrying in {wait_time:.2f} seconds...")
                time.sleep(wait_time)

        # If we get here, all retries failed
        logger.error(f"Failed to get paper by DOI after {self.max_retries} attempts")
        self._api_failed = True
        self._failure_reason = "rate_limited_or_timeout"
        return None

    def extract_doi_from_url(self, url: str) -> Optional[str]:
        """
        Extract DOI from a URL

        Args:
            url: URL that might contain a DOI

        Returns:
            Extracted DOI or None if not found
        """
        if not url:
            return None

        # Check if it's a DOI URL
        if 'doi.org' in url:
            # Extract the DOI part after doi.org/
            match = re.search(r'doi\.org/([^/\s]+)', url)
            if match:
                return match.group(1)

        return None

    def normalize_author_name(self, name: str) -> str:
        """
        Normalize author name for comparison

        Args:
            name: Author name

        Returns:
            Normalized name
        """
        # Remove reference numbers (e.g., "[1]")
        name = re.sub(r'^\[\d+\]', '', name)

        # Use common normalization function
        return normalize_text(name)

    def compare_authors(self, cited_authors: List[str], correct_authors: List[Dict[str, str]]) -> Tuple[bool, str]:
        """
        Compare author lists to check if they match (delegates to shared utility)

        Args:
            cited_authors: List of author names as cited
            correct_authors: List of author data from Semantic Scholar

        Returns:
            Tuple of (match_result, error_message)
        """
        return compare_authors(cited_authors, correct_authors)

    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
        """
        Verify a non-arXiv reference using Semantic Scholar

        Args:
            reference: Reference data dictionary

        Returns:
            Tuple of (verified_data, errors, url)
            - verified_data: Paper data from Semantic Scholar or None if not found
            - errors: List of error dictionaries
            - url: URL of the paper if found, None otherwise
        """
        # Reset API failure tracking for this verification attempt
        self._api_failed = False
        self._failure_reason = None

        paper_data = None
        errors = []

        # Extract reference data
        title = reference.get('title', '')
        authors = reference.get('authors', [])
        year = reference.get('year', 0)
        url = reference.get('url', '')
        raw_text = reference.get('raw_text', '')

        # First, check if we have a Semantic Scholar URL (API format)
        if url and 'api.semanticscholar.org/CorpusID:' in url:
            # Extract CorpusID from API URL
            corpus_match = re.search(r'CorpusID:(\d+)', url)
            if corpus_match:
                corpus_id = corpus_match.group(1)
                # Try to get the paper directly by CorpusID
                endpoint = f"{self.base_url}/paper/CorpusId:{corpus_id}"
                params = {"fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal"}

                for attempt in range(self.max_retries):
                    try:
                        response = requests.get(endpoint, headers=self.headers, params=params, timeout=30)

                        if response.status_code == 429:
                            wait_time = self.request_delay * (self.backoff_factor ** attempt)
                            logger.debug(f"Rate limit exceeded. Retrying in {wait_time}s...")
                            time.sleep(wait_time)
                            continue

                        if response.status_code == 200:
                            paper_data = response.json()
                            logger.debug(f"Found paper by Semantic Scholar CorpusID: {corpus_id}")
                            break
                        elif response.status_code == 404:
                            logger.debug(f"Paper not found for CorpusID: {corpus_id}")
                            break
                        else:
                            logger.warning(f"Unexpected status code {response.status_code} for CorpusID: {corpus_id}")
                            break

                    except requests.RequestException as e:
                        logger.warning(f"Request failed for CorpusID {corpus_id}: {e}")
                        if attempt == self.max_retries - 1:
                            break
                        else:
                            time.sleep(self.request_delay * (self.backoff_factor ** attempt))

        # Initialize DOI variable for later use
        doi = None
        if 'doi' in reference and reference['doi']:
            doi = reference['doi']
        elif url:
            doi = self.extract_doi_from_url(url)

        # If we don't have paper data yet, try DOI
        if not paper_data and doi:
            # Try to get the paper by DOI
            paper_data = self.get_paper_by_doi(doi)

            if paper_data:
                logger.debug(f"Found paper by DOI: {doi}")
            else:
                logger.debug(f"Could not find paper with DOI: {doi}")

        # If we couldn't get the paper by DOI, try searching by title
        found_title = ''
        if not paper_data and title:
            # Clean up the title for search using centralized utility function
            cleaned_title = clean_title_for_search(title)

            # Search for the paper using cleaned query
            search_results = self.search_paper(cleaned_title, year)

            if search_results:
                best_match, best_score = find_best_match(search_results, cleaned_title, year, authors)

                # Consider it a match if similarity is above threshold
                if best_match and best_score >= SIMILARITY_THRESHOLD:
                    paper_data = best_match
                    found_title = best_match['title']
                    logger.debug(f"Found paper by title with similarity {best_score:.2f}: {cleaned_title}")
                else:
                    logger.debug(f"No good match found for title: {cleaned_title}")
            else:
                logger.debug(f"No papers found for title: {cleaned_title}")

        # Track if we found an ArXiv ID mismatch (wrong paper via ArXiv ID)
        arxiv_id_mismatch_detected = False

        # If we still couldn't find the paper, try searching by ArXiv ID if available
        if not paper_data and url and 'arxiv.org/abs/' in url:
            # Extract ArXiv ID from URL
            arxiv_match = re.search(r'arxiv\.org/abs/([^\s/?#]+)', url)
            if arxiv_match:
                arxiv_id = arxiv_match.group(1)
                logger.debug(f"Trying to find paper by ArXiv ID: {arxiv_id}")

                # Search using ArXiv ID
                search_results = self.search_paper(f"arXiv:{arxiv_id}")

                if search_results:
                    # For ArXiv searches, check if the found paper matches the cited title
                    for result in search_results:
                        external_ids = result.get('externalIds', {})
                        if external_ids and external_ids.get('ArXiv') == arxiv_id:
                            # Found the paper by ArXiv ID, but check if title matches cited title
                            result_title = result.get('title', '').strip()
                            cited_title = title.strip()

                            if cited_title and result_title:
                                title_similarity = compare_titles_with_latex_cleaning(cited_title, result_title)
                                logger.debug(f"Semantic Scholar ArXiv search title similarity: {title_similarity:.3f}")
                                logger.debug(f"Cited title: '{cited_title}'")
                                logger.debug(f"Found title: '{result_title}'")

                                if title_similarity >= SIMILARITY_THRESHOLD:
                                    paper_data = result
                                    found_title = result['title']
                                    logger.debug(f"Found matching paper by ArXiv ID: {arxiv_id}")
                                else:
                                    logger.debug(f"ArXiv ID points to different paper (similarity: {title_similarity:.3f})")
                                    arxiv_id_mismatch_detected = True
                            else:
                                # If no title to compare, accept the paper (fallback)
                                paper_data = result
                                found_title = result['title']
                                logger.debug(f"Found paper by ArXiv ID (no title comparison): {arxiv_id}")
                            break

                # If still not found after ArXiv ID search, try ArXiv API directly
                if not paper_data:
                    logger.debug(f"Paper not found in Semantic Scholar by ArXiv ID, trying ArXiv API directly for: {arxiv_id}")
                    arxiv_paper = self._get_paper_from_arxiv_api(arxiv_id)
                    if arxiv_paper:
                        # Verify that the ArXiv paper matches the cited reference title
                        arxiv_title = arxiv_paper.get('title', '').strip()
                        cited_title = title.strip()

                        logger.debug(f"DEBUG: ArXiv paper found, comparing titles...")
                        logger.debug(f"DEBUG: cited_title='{cited_title}', arxiv_title='{arxiv_title}'")

                        if cited_title and arxiv_title:
                            title_similarity = compare_titles_with_latex_cleaning(cited_title, arxiv_title)
                            logger.debug(f"ArXiv API title similarity: {title_similarity:.3f}")
                            logger.debug(f"Cited title: '{cited_title}'")
                            logger.debug(f"ArXiv title: '{arxiv_title}'")

                            # Only accept the ArXiv paper if the titles match sufficiently
                            if title_similarity >= SIMILARITY_THRESHOLD:
                                paper_data = arxiv_paper
                                found_title = arxiv_paper['title']
                                logger.debug(f"Found matching paper in ArXiv API: {arxiv_id}")
                            else:
                                logger.debug(f"ArXiv paper title doesn't match cited title (similarity: {title_similarity:.3f})")
                                arxiv_id_mismatch_detected = True
                                logger.debug(f"DEBUG: Set arxiv_id_mismatch_detected = {arxiv_id_mismatch_detected}")
                        else:
                            # If we don't have a title to compare, don't use the ArXiv paper
                            logger.debug(f"Cannot verify ArXiv paper without title comparison")
                            logger.debug(f"DEBUG: No title comparison possible, cited_title='{cited_title}', arxiv_title='{arxiv_title}'")
                    else:
                        logger.debug(f"Paper not found in ArXiv API: {arxiv_id}")

        # Check for ArXiv ID mismatch before doing raw text search
        if not paper_data and url and 'arxiv.org/abs/' in url:
            # Extract ArXiv ID to check if it would cause a mismatch
            arxiv_match = re.search(r'arxiv\.org/abs/([^\s/?#]+)', url)
            if arxiv_match:
                check_arxiv_id = arxiv_match.group(1)
                # Quick check if ArXiv ID would point to wrong paper
                try:
                    arxiv_paper_check = self._get_paper_from_arxiv_api(check_arxiv_id)
                    if arxiv_paper_check:
                        arxiv_title_check = arxiv_paper_check.get('title', '').strip()
                        cited_title_check = title.strip()
                        if cited_title_check and arxiv_title_check:
                            title_similarity_check = compare_titles_with_latex_cleaning(cited_title_check, arxiv_title_check)
                            if title_similarity_check < SIMILARITY_THRESHOLD:
                                logger.debug(f"Detected ArXiv ID mismatch before raw text search - skipping unnecessary searches")
                                arxiv_id_mismatch_detected = True
                except Exception as e:
                    logger.debug(f"Error checking ArXiv ID mismatch: {e}")

        # If we still couldn't find the paper, try searching by the raw text
        # BUT skip this if we detected an ArXiv ID mismatch (no point in more searches)
        if not paper_data and raw_text and not arxiv_id_mismatch_detected:
            logger.debug(f"Proceeding with raw text search (arxiv_id_mismatch_detected={arxiv_id_mismatch_detected})")
        elif not paper_data and raw_text and arxiv_id_mismatch_detected:
            logger.debug(f"Skipping raw text search due to ArXiv ID mismatch detected")

        if not paper_data and raw_text and not arxiv_id_mismatch_detected:
            # Extract and normalize a reasonable search query from the raw text
            search_query = raw_text.replace('\n', ' ').strip()
            normalized_raw_query = normalize_text(search_query).lower().strip()

            # Search for the paper using normalized query
            search_results = self.search_paper(normalized_raw_query)

            if search_results:
                # Take the first result as a best guess
                best_match, best_score = find_best_match(search_results, clean_title_for_search(title), year, authors)

                # Consider it a match if similarity is above threshold
                if best_match and best_score >= SIMILARITY_THRESHOLD:
                    paper_data = best_match
                    found_title = best_match['title']
                    logger.debug(f"Found paper by raw text search")
                else:
                    logger.debug(f"No good match found for raw text search: {search_query}")
            else:
                logger.debug(f"No papers found for raw text search")

        # If we couldn't find the paper, check if API failed or genuinely not found
        if not paper_data:
            logger.debug(f"Could not find matching paper for reference: {title}")
            logger.debug(f"Tried: DOI search, title search, ArXiv ID search, ArXiv API fallback, raw text search")

            # If API failed during search, return error indicating retryable failure
            if self._api_failed:
                return None, [{"error_type": "api_failure", "error_details": f"Semantic Scholar API failed: {self._failure_reason}"}], None
            else:
                # Paper genuinely not found in database
                return None, [], None

        # Check title using similarity function to handle formatting differences
        title_similarity = compare_titles_with_latex_cleaning(title, found_title) if found_title else 0.0
        if found_title and title_similarity < SIMILARITY_THRESHOLD:
            # Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
            clean_cited_title = strip_latex_commands(title)
            errors.append({
                'error_type': 'title',
                'error_details': format_title_mismatch(clean_cited_title, found_title),
                'ref_title_correct': paper_data.get('title', '')
            })

        # Verify authors
        if authors:
            authors_match, author_error = self.compare_authors(authors, paper_data.get('authors', []))

            if not authors_match:
                # Check if we have an exact ArXiv ID match - if so, be more lenient with author mismatches
                # since they might be due to incomplete data in Semantic Scholar
                arxiv_id_match = False
                if url and 'arxiv.org/abs/' in url:
                    arxiv_match = re.search(r'arxiv\.org/abs/([^\s/?#]+)', url)
                    if arxiv_match:
                        cited_arxiv_id = arxiv_match.group(1)
                        external_ids = paper_data.get('externalIds', {})
                        found_arxiv_id = external_ids.get('ArXiv')
                        arxiv_id_match = (cited_arxiv_id == found_arxiv_id)

                # If ArXiv IDs match exactly, treat author mismatch as warning (likely incomplete data)
                if arxiv_id_match:
                    errors.append({
                        'warning_type': 'author',
                        'warning_details': f"{author_error}",
                        'ref_authors_correct': ', '.join([author.get('name', '') for author in paper_data.get('authors', [])])
                    })
                else:
                    # No ArXiv ID match, treat as error
                    errors.append({
                        'error_type': 'author',
                        'error_details': author_error,
                        'ref_authors_correct': ', '.join([author.get('name', '') for author in paper_data.get('authors', [])])
                    })

        # Verify year using flexible validation
        paper_year = paper_data.get('year')
        # Check if we have an exact ArXiv ID match for additional context
        arxiv_id_match = False
        if url and 'arxiv.org/abs/' in url:
            arxiv_match = re.search(r'arxiv\.org/abs/([^\s/?#]+)', url)
            if arxiv_match:
                cited_arxiv_id = arxiv_match.group(1)
                external_ids = paper_data.get('externalIds', {})
                found_arxiv_id = external_ids.get('ArXiv')
                arxiv_id_match = (cited_arxiv_id == found_arxiv_id)

        from refchecker.utils.error_utils import validate_year
        year_warning = validate_year(
            cited_year=year,
            paper_year=paper_year,
            use_flexible_validation=True,
            context={'arxiv_match': arxiv_id_match}
        )
        if year_warning:
            errors.append(year_warning)

        # Verify venue
        cited_venue = reference.get('journal', '') or reference.get('venue', '')

        # Extract venue from paper_data - check multiple fields since Semantic Scholar
        # returns venue info in different fields depending on publication type
        paper_venue = None

        # First try the simple 'venue' field (string)
        if paper_data.get('venue'):
            paper_venue = paper_data.get('venue')

        # If no venue, try publicationVenue object
        if not paper_venue and paper_data.get('publicationVenue'):
            pub_venue = paper_data.get('publicationVenue')
            if isinstance(pub_venue, dict):
                paper_venue = pub_venue.get('name', '')
            elif isinstance(pub_venue, str):
                paper_venue = pub_venue

        # If still no venue, try journal object
        if not paper_venue and paper_data.get('journal'):
            journal = paper_data.get('journal')
            if isinstance(journal, dict):
                paper_venue = journal.get('name', '')
            elif isinstance(journal, str):
                paper_venue = journal

        # Ensure paper_venue is a string
        if paper_venue and not isinstance(paper_venue, str):
            paper_venue = str(paper_venue)

        # Check venue mismatches
        if cited_venue and paper_venue:
            # Use the utility function to check if venues are substantially different
            if are_venues_substantially_different(cited_venue, paper_venue):
                from refchecker.utils.error_utils import create_venue_warning
                errors.append(create_venue_warning(cited_venue, paper_venue))
        elif not cited_venue and paper_venue:
            # Reference has no venue but paper has one - error for missing venue
            errors.append({
                'error_type': 'venue',
                'error_details': f"Venue missing: should include '{paper_venue}'",
                'ref_venue_correct': paper_venue
            })

        # Always check for missing arXiv URLs when paper has arXiv ID
        external_ids = paper_data.get('externalIds', {})
        arxiv_id = external_ids.get('ArXiv') if external_ids else None

        if arxiv_id:
            # For arXiv papers, check if reference includes the arXiv URL
            arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"

            # Check if the reference already includes this ArXiv URL or equivalent DOI
            reference_url = reference.get('url', '')

            # Check for direct arXiv URL match
            has_arxiv_url = arxiv_url in reference_url

            # Also check for arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
            arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
            has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()

            if not (has_arxiv_url or has_arxiv_doi):
                errors.append({
                    'info_type': 'url',
                    'info_details': f"Reference could include arXiv URL: {arxiv_url}",
                    'ref_url_correct': arxiv_url
                })

        # Verify DOI
        paper_doi = None
        external_ids = paper_data.get('externalIds', {})
        if external_ids and 'DOI' in external_ids:
            paper_doi = external_ids['DOI']

        # Compare DOIs using the proper comparison function
        from refchecker.utils.doi_utils import compare_dois, validate_doi_resolves
        if doi and paper_doi and not compare_dois(doi, paper_doi):
            from refchecker.utils.error_utils import format_doi_mismatch
            # If cited DOI resolves, it's likely a valid alternate DOI (e.g., arXiv vs conference)
            # Treat as warning instead of error
            if validate_doi_resolves(doi):
                errors.append({
                    'warning_type': 'doi',
                    'warning_details': format_doi_mismatch(doi, paper_doi),
                    'ref_doi_correct': paper_doi
                })
            else:
                errors.append({
                    'error_type': 'doi',
                    'error_details': format_doi_mismatch(doi, paper_doi),
                    'ref_doi_correct': paper_doi
                })

        # Extract URL from paper data - prioritize arXiv URLs when available
        paper_url = None

        logger.debug(f"Semantic Scholar - Extracting URL from paper data: {list(paper_data.keys())}")

        # Return the Semantic Scholar URL that was actually used for verification
        # First priority: Semantic Scholar URL using paperId (SHA hash, works in web URLs)
        if paper_data.get('paperId'):
            paper_url = f"https://www.semanticscholar.org/paper/{paper_data['paperId']}"
            logger.debug(f"Using Semantic Scholar URL for verification: {paper_url}")

        # Second priority: DOI URL (if this was verified through DOI)
        elif external_ids.get('DOI'):
            from refchecker.utils.doi_utils import construct_doi_url
            paper_url = construct_doi_url(external_ids['DOI'])
            logger.debug(f"Using DOI URL for verification: {paper_url}")

        # Third priority: open access PDF
        elif paper_data.get('openAccessPdf') and paper_data['openAccessPdf'].get('url'):
            paper_url = paper_data['openAccessPdf']['url']
            logger.debug(f"Using open access PDF URL: {paper_url}")

        # Fourth priority: general URL field
        elif paper_data.get('url'):
            paper_url = paper_data['url']
            logger.debug(f"Using general paper URL: {paper_url}")

        # Last resort: arXiv URL (only if no other verification source was available)
        elif external_ids.get('ArXiv'):
            arxiv_id = external_ids['ArXiv']
            paper_url = f"https://arxiv.org/abs/{arxiv_id}"
            logger.debug(f"Using arXiv URL as fallback: {paper_url}")

        if not paper_url:
            logger.debug(f"No URL found in paper data - available fields: {list(paper_data.keys())}")
            logger.debug(f"Paper data sample: {str(paper_data)[:200]}...")

        return paper_data, errors, paper_url

    def _get_paper_from_arxiv_api(self, arxiv_id: str) -> Optional[Dict[str, Any]]:
        """
        Get paper metadata directly from ArXiv API for very recent papers not yet in Semantic Scholar.

        Args:
            arxiv_id: ArXiv ID (e.g., "2507.08846")

        Returns:
            Paper data dictionary in Semantic Scholar format, or None if not found
        """
        try:
            import xml.etree.ElementTree as ET

            arxiv_url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
            logger.debug(f"Querying ArXiv API: {arxiv_url}")

            response = requests.get(arxiv_url, timeout=30)
            response.raise_for_status()

            # Parse XML response
            root = ET.fromstring(response.text)

            # Check if any entries were found
            entries = root.findall('{http://www.w3.org/2005/Atom}entry')
            if not entries:
                logger.debug(f"No entries found for ArXiv ID: {arxiv_id}")
                return None

            entry = entries[0]  # Take the first entry

            # Extract title
            title_elem = entry.find('{http://www.w3.org/2005/Atom}title')
            title = title_elem.text.strip() if title_elem is not None else ""

            # Extract authors
            authors = []
            for author_elem in entry.findall('{http://www.w3.org/2005/Atom}author'):
                name_elem = author_elem.find('{http://www.w3.org/2005/Atom}name')
                if name_elem is not None:
                    authors.append({"name": name_elem.text.strip()})

            # Extract published date
            published_elem = entry.find('{http://www.w3.org/2005/Atom}published')
            year = None
            if published_elem is not None:
                published_date = published_elem.text
                try:
                    year = int(published_date[:4])
                except (ValueError, IndexError):
                    pass

            # Create Semantic Scholar-compatible data structure
            paper_data = {
                'title': title,
                'authors': authors,
                'year': year,
                'externalIds': {'ArXiv': arxiv_id},
                'url': f"https://arxiv.org/abs/{arxiv_id}",
                'venue': 'arXiv',
                'isOpenAccess': True,
                'openAccessPdf': {'url': f"https://arxiv.org/pdf/{arxiv_id}.pdf"}
            }

            logger.debug(f"Successfully retrieved ArXiv paper: {title}")
            return paper_data

        except Exception as e:
            logger.debug(f"Failed to get paper from ArXiv API: {str(e)}")
            return None

if __name__ == "__main__":
    # Example usage
    checker = NonArxivReferenceChecker()

    # Example reference
    reference = {
        'title': 'Attention is All You Need',
        'authors': ['Ashish Vaswani', 'Noam Shazeer'],
        'year': 2017,
        'url': 'https://example.com/paper',
        'raw_text': 'Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems, 30.'
    }

    # Verify the reference
    verified_data, errors, paper_url = checker.verify_reference(reference)

    if verified_data:
        print(f"Found paper: {verified_data.get('title')}")

        if errors:
            print("Errors found:")
            for error in errors:
                # Entries may be error, warning, or info items depending on the check
                issue_type = error.get('error_type') or error.get('warning_type') or error.get('info_type')
                details = error.get('error_details') or error.get('warning_details') or error.get('info_details')
                print(f" - {issue_type}: {details}")
        else:
            print("No errors found")
    else:
        print("Could not find matching paper")
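
For reference, the Semantic Scholar Graph API request that search_paper wraps can be reproduced on its own. A minimal sketch using the same endpoint, query parameters, and optional x-api-key header as the code above (the query string and key value are placeholders):

    import requests

    params = {
        "query": "Attention is All You Need",
        "limit": 10,
        "fields": "title,authors,year,externalIds,url,venue",
        "sort": "relevance",  # passed through as in search_paper above
    }
    headers = {"Accept": "application/json"}
    # headers["x-api-key"] = "YOUR_API_KEY"  # optional; raises rate limits
    response = requests.get(
        "https://api.semanticscholar.org/graph/v1/paper/search",
        headers=headers,
        params=params,
        timeout=30,
    )
    response.raise_for_status()
    for paper in response.json().get("data", []):
        print(paper.get("year"), paper.get("title"))

On top of this raw call, the checker adds exponential backoff on HTTP 429 (request_delay * backoff_factor ** attempt) and sets _api_failed so that the Enhanced Hybrid Checker can treat rate limiting or timeouts as a retryable failure rather than as a missing paper.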