academic_refchecker-2.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
refchecker/checkers/semantic_scholar.py
@@ -0,0 +1,764 @@
+ #!/usr/bin/env python3
+ """
+ Semantic Scholar API Client for Reference Verification
+
+ This module provides functionality to verify non-arXiv references using the Semantic Scholar API.
+ It checks whether a reference's metadata (authors, year, title) matches what is in the Semantic Scholar database.
+
+ Usage:
+     from semantic_scholar import NonArxivReferenceChecker
+
+     # Initialize the checker
+     checker = NonArxivReferenceChecker(api_key="your_api_key")  # API key is optional
+
+     # Verify a reference
+     reference = {
+         'title': 'Title of the paper',
+         'authors': ['Author 1', 'Author 2'],
+         'year': 2020,
+         'url': 'https://example.com/paper',
+         'raw_text': 'Full citation text'
+     }
+
+     verified_data, errors, url = checker.verify_reference(reference)
+ """
+
+ import requests
+ import time
+ import logging
+ import re
+ from typing import Dict, List, Tuple, Optional, Any, Union
+ from refchecker.utils.text_utils import (
+     normalize_text,
+     clean_title_basic,
+     find_best_match,
+     is_name_match,
+     are_venues_substantially_different,
+     calculate_title_similarity,
+     compare_authors,
+     clean_title_for_search,
+     strip_latex_commands,
+     compare_titles_with_latex_cleaning,
+ )
+ from refchecker.utils.error_utils import format_title_mismatch
+ from refchecker.config.settings import get_config
+
+ # Set up logging
+ logger = logging.getLogger(__name__)
+
+ # Get configuration
+ config = get_config()
+ SIMILARITY_THRESHOLD = config["text_processing"]["similarity_threshold"]
+
+ class NonArxivReferenceChecker:
+     """
+     A class to verify non-arXiv references using the Semantic Scholar API
+     """
+
+     def __init__(self, api_key: Optional[str] = None):
+         """
+         Initialize the Semantic Scholar API client
+
+         Args:
+             api_key: Optional API key for Semantic Scholar (increases rate limits)
+         """
+         self.base_url = "https://api.semanticscholar.org/graph/v1"
+         self.headers = {
+             "Accept": "application/json"
+         }
+
+         if api_key:
+             self.headers["x-api-key"] = api_key
+
+         # Rate limiting parameters
+         self.request_delay = 1.0  # Initial delay between requests (seconds)
+         self.max_retries = 5  # Sufficient for individual API calls
+         self.backoff_factor = 2  # Exponential backoff factor
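+         # With these defaults the retry waits grow geometrically:
+         # request_delay * backoff_factor ** attempt = 1s, 2s, 4s, 8s, 16s.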
+
+         # Track API failures for the Enhanced Hybrid Checker
+         self._api_failed = False
+         self._failure_reason = None
+
+     def search_paper(self, query: str, year: Optional[int] = None) -> List[Dict[str, Any]]:
+         """
+         Search for papers matching the query
+
+         Args:
+             query: Search query (title, authors, etc.)
+             year: Publication year from the citation (not currently applied as an API filter)
+
+         Returns:
+             List of paper data dictionaries
+         """
+         endpoint = f"{self.base_url}/paper/search"
+
+         # Build query parameters
+         params = {
+             "query": query,
+             "limit": 10,
+             "fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal",
+             "sort": "relevance"  # Ensure consistent ordering
+         }
+
+         # Reduce retries for ArXiv ID searches to avoid unnecessary API calls when a mismatch is likely
+         max_retries_for_this_query = 2 if "arXiv:" in query else self.max_retries
+
+         # Make the request with retries and backoff
+         for attempt in range(max_retries_for_this_query):
+             try:
+                 response = requests.get(endpoint, headers=self.headers, params=params, timeout=30)
+
+                 # Check for rate limiting
+                 if response.status_code == 429:
+                     wait_time = self.request_delay * (self.backoff_factor ** attempt)
+                     logger.debug("Rate limit exceeded. Increasing delay and retrying...")
+                     time.sleep(wait_time)
+                     continue
+
+                 # Check for other errors
+                 response.raise_for_status()
+
+                 # Parse the response
+                 data = response.json()
+                 return data.get('data', [])
+
+             except requests.exceptions.RequestException as e:
+                 wait_time = self.request_delay * (self.backoff_factor ** attempt)
+                 logger.warning(f"Request failed: {str(e)}. Retrying in {wait_time:.2f} seconds...")
+                 time.sleep(wait_time)
+
+         # If we get here, all retries failed
+         logger.debug(f"Failed to search for paper after {max_retries_for_this_query} attempts")
+         self._api_failed = True
+         self._failure_reason = "rate_limited_or_timeout"
+         return []
+
+     def get_paper_by_doi(self, doi: str) -> Optional[Dict[str, Any]]:
+         """
+         Get paper data by DOI
+
+         Args:
+             doi: DOI of the paper
+
+         Returns:
+             Paper data dictionary or None if not found
+         """
+         endpoint = f"{self.base_url}/paper/DOI:{doi}"
+
+         params = {
+             "fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal"
+         }
+
+         # Make the request with retries and backoff
+         for attempt in range(self.max_retries):
+             try:
+                 response = requests.get(endpoint, headers=self.headers, params=params, timeout=30)
+
+                 # Check for rate limiting
+                 if response.status_code == 429:
+                     wait_time = self.request_delay * (self.backoff_factor ** attempt)
+                     logger.debug("Rate limit exceeded. Increasing delay and retrying...")
+                     time.sleep(wait_time)
+                     continue
+
+                 # If not found, return None
+                 if response.status_code == 404:
+                     logger.debug(f"Paper with DOI {doi} not found")
+                     return None
+
+                 # Check for other errors
+                 response.raise_for_status()
+
+                 # Parse the response
+                 return response.json()
+
+             except requests.exceptions.RequestException as e:
+                 wait_time = self.request_delay * (self.backoff_factor ** attempt)
+                 logger.warning(f"Request failed: {str(e)}. Retrying in {wait_time:.2f} seconds...")
+                 time.sleep(wait_time)
+
+         # If we get here, all retries failed
+         logger.error(f"Failed to get paper by DOI after {self.max_retries} attempts")
+         self._api_failed = True
+         self._failure_reason = "rate_limited_or_timeout"
+         return None
+
+     def extract_doi_from_url(self, url: str) -> Optional[str]:
+         """
+         Extract DOI from a URL
+
+         Args:
+             url: URL that might contain a DOI
+
+         Returns:
+             Extracted DOI or None if not found
+         """
+         if not url:
+             return None
+
+         # Check if it's a DOI URL
+         if 'doi.org' in url:
+             # Extract the DOI part after doi.org/; DOIs contain "/" in their
+             # suffix, so capture everything up to whitespace, "?" or "#"
+             match = re.search(r'doi\.org/([^\s?#]+)', url)
+             if match:
+                 return match.group(1)
+
+         return None
+
+     def normalize_author_name(self, name: str) -> str:
+         """
+         Normalize author name for comparison
+
+         Args:
+             name: Author name
+
+         Returns:
+             Normalized name
+         """
+         # Remove reference numbers (e.g., "[1]")
+         name = re.sub(r'^\[\d+\]', '', name)
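+         # e.g. "[12] Jane Smith" -> " Jane Smith" before the shared
+         # normalize_text() cleanup is applied below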
+
+         # Use common normalization function
+         return normalize_text(name)
+
+     def compare_authors(self, cited_authors: List[str], correct_authors: List[Dict[str, str]]) -> Tuple[bool, str]:
+         """
+         Compare author lists to check if they match (delegates to shared utility)
+
+         Args:
+             cited_authors: List of author names as cited
+             correct_authors: List of author data from Semantic Scholar
+
+         Returns:
+             Tuple of (match_result, error_message)
+         """
+         return compare_authors(cited_authors, correct_authors)
+
+     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+         """
+         Verify a non-arXiv reference using Semantic Scholar
+
+         Args:
+             reference: Reference data dictionary
+
+         Returns:
+             Tuple of (verified_data, errors, url)
+             - verified_data: Paper data from Semantic Scholar or None if not found
+             - errors: List of error dictionaries
+             - url: URL of the paper if found, None otherwise
+         """
+         # Reset API failure tracking for this verification attempt
+         self._api_failed = False
+         self._failure_reason = None
+
+         paper_data = None
+         errors = []
+
+         # Extract reference data
+         title = reference.get('title', '')
+         authors = reference.get('authors', [])
+         year = reference.get('year', 0)
+         url = reference.get('url', '')
+         raw_text = reference.get('raw_text', '')
+
+         # First, check if we have a Semantic Scholar URL (API format)
+         if url and 'api.semanticscholar.org/CorpusID:' in url:
+             # Extract CorpusID from API URL
+             corpus_match = re.search(r'CorpusID:(\d+)', url)
+             if corpus_match:
+                 corpus_id = corpus_match.group(1)
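+                 # e.g. "https://api.semanticscholar.org/CorpusID:123456" -> "123456"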
+                 # Try to get the paper directly by CorpusID
+                 endpoint = f"{self.base_url}/paper/CorpusId:{corpus_id}"
+                 params = {"fields": "title,authors,year,externalIds,url,abstract,openAccessPdf,isOpenAccess,venue,publicationVenue,journal"}
+
+                 for attempt in range(self.max_retries):
+                     try:
+                         response = requests.get(endpoint, headers=self.headers, params=params, timeout=30)
+
+                         if response.status_code == 429:
+                             wait_time = self.request_delay * (self.backoff_factor ** attempt)
+                             logger.debug(f"Rate limit exceeded. Retrying in {wait_time}s...")
+                             time.sleep(wait_time)
+                             continue
+
+                         if response.status_code == 200:
+                             paper_data = response.json()
+                             logger.debug(f"Found paper by Semantic Scholar CorpusID: {corpus_id}")
+                             break
+                         elif response.status_code == 404:
+                             logger.debug(f"Paper not found for CorpusID: {corpus_id}")
+                             break
+                         else:
+                             logger.warning(f"Unexpected status code {response.status_code} for CorpusID: {corpus_id}")
+                             break
+
+                     except requests.RequestException as e:
+                         logger.warning(f"Request failed for CorpusID {corpus_id}: {e}")
+                         if attempt == self.max_retries - 1:
+                             break
+                         else:
+                             time.sleep(self.request_delay * (self.backoff_factor ** attempt))
+
+         # Initialize DOI variable for later use
+         doi = None
+         if 'doi' in reference and reference['doi']:
+             doi = reference['doi']
+         elif url:
+             doi = self.extract_doi_from_url(url)
+
+         # If we don't have paper data yet, try DOI
+         if not paper_data and doi:
+             # Try to get the paper by DOI
+             paper_data = self.get_paper_by_doi(doi)
+
+             if paper_data:
+                 logger.debug(f"Found paper by DOI: {doi}")
+             else:
+                 logger.debug(f"Could not find paper with DOI: {doi}")
+
+         # If we couldn't get the paper by DOI, try searching by title
+         found_title = ''
+         # Clean the title once up front; the raw-text fallback below also needs it
+         cleaned_title = clean_title_for_search(title) if title else ''
+         if not paper_data and title:
+             # Search for the paper using the cleaned query
+             search_results = self.search_paper(cleaned_title, year)
+
+             if search_results:
+                 best_match, best_score = find_best_match(search_results, cleaned_title, year, authors)
+
+                 # Consider it a match if similarity is above threshold
+                 if best_match and best_score >= SIMILARITY_THRESHOLD:
+                     paper_data = best_match
+                     found_title = best_match['title']
+                     logger.debug(f"Found paper by title with similarity {best_score:.2f}: {cleaned_title}")
+                 else:
+                     logger.debug(f"No good match found for title: {cleaned_title}")
+             else:
+                 logger.debug(f"No papers found for title: {cleaned_title}")
+
+         # Track if we found an ArXiv ID mismatch (wrong paper via ArXiv ID)
+         arxiv_id_mismatch_detected = False
+
+         # If we still couldn't find the paper, try searching by ArXiv ID if available
+         if not paper_data and url and 'arxiv.org/abs/' in url:
+             # Extract ArXiv ID from URL
+             arxiv_match = re.search(r'arxiv\.org/abs/([^\s/?#]+)', url)
+             if arxiv_match:
+                 arxiv_id = arxiv_match.group(1)
+                 logger.debug(f"Trying to find paper by ArXiv ID: {arxiv_id}")
+
+                 # Search using ArXiv ID
+                 search_results = self.search_paper(f"arXiv:{arxiv_id}")
+
+                 if search_results:
+                     # For ArXiv searches, check if the found paper matches the cited title
+                     for result in search_results:
+                         external_ids = result.get('externalIds', {})
+                         if external_ids and external_ids.get('ArXiv') == arxiv_id:
+                             # Found the paper by ArXiv ID, but check if its title matches the cited title
+                             result_title = result.get('title', '').strip()
+                             cited_title = title.strip()
+
+                             if cited_title and result_title:
+                                 title_similarity = compare_titles_with_latex_cleaning(cited_title, result_title)
+                                 logger.debug(f"Semantic Scholar ArXiv search title similarity: {title_similarity:.3f}")
+                                 logger.debug(f"Cited title: '{cited_title}'")
+                                 logger.debug(f"Found title: '{result_title}'")
+
+                                 if title_similarity >= SIMILARITY_THRESHOLD:
+                                     paper_data = result
+                                     found_title = result['title']
+                                     logger.debug(f"Found matching paper by ArXiv ID: {arxiv_id}")
+                                 else:
+                                     logger.debug(f"ArXiv ID points to different paper (similarity: {title_similarity:.3f})")
+                                     arxiv_id_mismatch_detected = True
+                             else:
+                                 # If no title to compare, accept the paper (fallback)
+                                 paper_data = result
+                                 found_title = result['title']
+                                 logger.debug(f"Found paper by ArXiv ID (no title comparison): {arxiv_id}")
+                             break
+
+                 # If still not found after the ArXiv ID search, try the ArXiv API directly
+                 if not paper_data:
+                     logger.debug(f"Paper not found in Semantic Scholar by ArXiv ID, trying ArXiv API directly for: {arxiv_id}")
+                     arxiv_paper = self._get_paper_from_arxiv_api(arxiv_id)
+                     if arxiv_paper:
+                         # Verify that the ArXiv paper matches the cited reference title
+                         arxiv_title = arxiv_paper.get('title', '').strip()
+                         cited_title = title.strip()
+
+                         if cited_title and arxiv_title:
+                             title_similarity = compare_titles_with_latex_cleaning(cited_title, arxiv_title)
+                             logger.debug(f"ArXiv API title similarity: {title_similarity:.3f}")
+                             logger.debug(f"Cited title: '{cited_title}'")
+                             logger.debug(f"ArXiv title: '{arxiv_title}'")
+
+                             # Only accept the ArXiv paper if the titles match sufficiently
+                             if title_similarity >= SIMILARITY_THRESHOLD:
+                                 paper_data = arxiv_paper
+                                 found_title = arxiv_paper['title']
+                                 logger.debug(f"Found matching paper in ArXiv API: {arxiv_id}")
+                             else:
+                                 logger.debug(f"ArXiv paper title doesn't match cited title (similarity: {title_similarity:.3f})")
+                                 arxiv_id_mismatch_detected = True
+                         else:
+                             # If we don't have a title to compare, don't use the ArXiv paper
+                             logger.debug(f"Cannot verify ArXiv paper without title comparison (cited_title='{cited_title}', arxiv_title='{arxiv_title}')")
+                     else:
+                         logger.debug(f"Paper not found in ArXiv API: {arxiv_id}")
+
+         # Check for an ArXiv ID mismatch before doing the raw text search
+         if not paper_data and not arxiv_id_mismatch_detected and url and 'arxiv.org/abs/' in url:
+             # Extract the ArXiv ID to check if it would cause a mismatch
+             arxiv_match = re.search(r'arxiv\.org/abs/([^\s/?#]+)', url)
+             if arxiv_match:
+                 check_arxiv_id = arxiv_match.group(1)
+                 # Quick check if the ArXiv ID would point to the wrong paper
+                 try:
+                     arxiv_paper_check = self._get_paper_from_arxiv_api(check_arxiv_id)
+                     if arxiv_paper_check:
+                         arxiv_title_check = arxiv_paper_check.get('title', '').strip()
+                         cited_title_check = title.strip()
+                         if cited_title_check and arxiv_title_check:
+                             title_similarity_check = compare_titles_with_latex_cleaning(cited_title_check, arxiv_title_check)
+                             if title_similarity_check < SIMILARITY_THRESHOLD:
+                                 logger.debug("Detected ArXiv ID mismatch before raw text search - skipping unnecessary searches")
+                                 arxiv_id_mismatch_detected = True
+                 except Exception as e:
+                     logger.debug(f"Error checking ArXiv ID mismatch: {e}")
+
+         # If we still couldn't find the paper, try searching by the raw text,
+         # but skip this if we detected an ArXiv ID mismatch (no point in more searches)
+         if not paper_data and raw_text:
+             if arxiv_id_mismatch_detected:
+                 logger.debug("Skipping raw text search due to detected ArXiv ID mismatch")
+             else:
+                 logger.debug("Proceeding with raw text search")
+                 # Extract and normalize a reasonable search query from the raw text
+                 search_query = raw_text.replace('\n', ' ').strip()
+                 normalized_raw_query = normalize_text(search_query).lower().strip()
+
+                 # Search for the paper using the normalized query
+                 search_results = self.search_paper(normalized_raw_query)
+
+                 if search_results:
+                     # Score the results against the cleaned title, year, and authors
+                     best_match, best_score = find_best_match(search_results, cleaned_title, year, authors)
+
+                     # Consider it a match if similarity is above threshold
+                     if best_match and best_score >= SIMILARITY_THRESHOLD:
+                         paper_data = best_match
+                         found_title = best_match['title']
+                         logger.debug("Found paper by raw text search")
+                     else:
+                         logger.debug(f"No good match found for raw text search: {search_query}")
+                 else:
+                     logger.debug("No papers found for raw text search")
+
+         # If we couldn't find the paper, check whether the API failed or the paper is genuinely not found
+         if not paper_data:
+             logger.debug(f"Could not find matching paper for reference: {title}")
+             logger.debug("Tried: DOI search, title search, ArXiv ID search, ArXiv API fallback, raw text search")
+
+             # If the API failed during the search, return an error indicating a retryable failure
+             if self._api_failed:
+                 return None, [{"error_type": "api_failure", "error_details": f"Semantic Scholar API failed: {self._failure_reason}"}], None
+             else:
+                 # Paper genuinely not found in the database
+                 return None, [], None
+
+         # Check title using similarity function to handle formatting differences
+         title_similarity = compare_titles_with_latex_cleaning(title, found_title) if found_title else 0.0
+         if found_title and title_similarity < SIMILARITY_THRESHOLD:
+             # Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
+             clean_cited_title = strip_latex_commands(title)
+             errors.append({
+                 'error_type': 'title',
+                 'error_details': format_title_mismatch(clean_cited_title, found_title),
+                 'ref_title_correct': paper_data.get('title', '')
+             })
+
+         # Check once whether the cited URL's ArXiv ID exactly matches the found paper;
+         # this is reused by both the author and year checks below
+         arxiv_id_match = False
+         if url and 'arxiv.org/abs/' in url:
+             arxiv_match = re.search(r'arxiv\.org/abs/([^\s/?#]+)', url)
+             if arxiv_match:
+                 cited_arxiv_id = arxiv_match.group(1)
+                 external_ids = paper_data.get('externalIds', {})
+                 found_arxiv_id = external_ids.get('ArXiv')
+                 arxiv_id_match = (cited_arxiv_id == found_arxiv_id)
+
+         # Verify authors
+         if authors:
+             authors_match, author_error = self.compare_authors(authors, paper_data.get('authors', []))
+
+             if not authors_match:
+                 # If the ArXiv IDs match exactly, be more lenient with author mismatches,
+                 # since they might be due to incomplete data in Semantic Scholar
+                 if arxiv_id_match:
+                     errors.append({
+                         'warning_type': 'author',
+                         'warning_details': author_error,
+                         'ref_authors_correct': ', '.join([author.get('name', '') for author in paper_data.get('authors', [])])
+                     })
+                 else:
+                     # No ArXiv ID match, treat as an error
+                     errors.append({
+                         'error_type': 'author',
+                         'error_details': author_error,
+                         'ref_authors_correct': ', '.join([author.get('name', '') for author in paper_data.get('authors', [])])
+                     })
+
+         # Verify year using flexible validation
+         paper_year = paper_data.get('year')
+         from refchecker.utils.error_utils import validate_year
+         year_warning = validate_year(
+             cited_year=year,
+             paper_year=paper_year,
+             use_flexible_validation=True,
+             context={'arxiv_match': arxiv_id_match}
+         )
+         if year_warning:
+             errors.append(year_warning)
+
+         # Verify venue
+         cited_venue = reference.get('journal', '') or reference.get('venue', '')
+
+         # Extract venue from paper_data - check multiple fields, since Semantic Scholar
+         # returns venue info in different fields depending on publication type
+         paper_venue = None
+
+         # First try the simple 'venue' field (string)
+         if paper_data.get('venue'):
+             paper_venue = paper_data.get('venue')
+
+         # If no venue, try the publicationVenue object
+         if not paper_venue and paper_data.get('publicationVenue'):
+             pub_venue = paper_data.get('publicationVenue')
+             if isinstance(pub_venue, dict):
+                 paper_venue = pub_venue.get('name', '')
+             elif isinstance(pub_venue, str):
+                 paper_venue = pub_venue
+
+         # If still no venue, try the journal object
+         if not paper_venue and paper_data.get('journal'):
+             journal = paper_data.get('journal')
+             if isinstance(journal, dict):
+                 paper_venue = journal.get('name', '')
+             elif isinstance(journal, str):
+                 paper_venue = journal
+
+         # Ensure paper_venue is a string
+         if paper_venue and not isinstance(paper_venue, str):
+             paper_venue = str(paper_venue)
+
+         # Check venue mismatches
+         if cited_venue and paper_venue:
+             # Use the utility function to check if the venues are substantially different
+             if are_venues_substantially_different(cited_venue, paper_venue):
+                 from refchecker.utils.error_utils import create_venue_warning
+                 errors.append(create_venue_warning(cited_venue, paper_venue))
+         elif not cited_venue and paper_venue:
+             # Reference has no venue but the paper has one - error for missing venue
+             errors.append({
+                 'error_type': 'venue',
+                 'error_details': f"Venue missing: should include '{paper_venue}'",
+                 'ref_venue_correct': paper_venue
+             })
+
+         # Always check for missing arXiv URLs when the paper has an arXiv ID
+         external_ids = paper_data.get('externalIds', {})
+         arxiv_id = external_ids.get('ArXiv') if external_ids else None
+
+         if arxiv_id:
+             # For arXiv papers, check if the reference includes the arXiv URL
+             arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
+
+             # Check if the reference already includes this ArXiv URL or an equivalent DOI
+             reference_url = reference.get('url', '')
+
+             # Check for a direct arXiv URL match
+             has_arxiv_url = arxiv_url in reference_url
+
+             # Also check for an arXiv DOI URL (e.g., https://doi.org/10.48550/arxiv.2505.11595)
+             arxiv_doi_url = f"https://doi.org/10.48550/arxiv.{arxiv_id}"
+             has_arxiv_doi = arxiv_doi_url.lower() in reference_url.lower()
+
+             if not (has_arxiv_url or has_arxiv_doi):
+                 errors.append({
+                     'info_type': 'url',
+                     'info_details': f"Reference could include arXiv URL: {arxiv_url}",
+                     'ref_url_correct': arxiv_url
+                 })
+
+         # Verify DOI
+         paper_doi = None
+         external_ids = paper_data.get('externalIds', {})
+         if external_ids and 'DOI' in external_ids:
+             paper_doi = external_ids['DOI']
+
+         # Compare DOIs using the proper comparison function
+         from refchecker.utils.doi_utils import compare_dois, validate_doi_resolves
+         if doi and paper_doi and not compare_dois(doi, paper_doi):
+             from refchecker.utils.error_utils import format_doi_mismatch
+             # If the cited DOI resolves, it is likely a valid alternate DOI (e.g., arXiv vs conference);
+             # treat it as a warning instead of an error
+             if validate_doi_resolves(doi):
+                 errors.append({
+                     'warning_type': 'doi',
+                     'warning_details': format_doi_mismatch(doi, paper_doi),
+                     'ref_doi_correct': paper_doi
+                 })
+             else:
+                 errors.append({
+                     'error_type': 'doi',
+                     'error_details': format_doi_mismatch(doi, paper_doi),
+                     'ref_doi_correct': paper_doi
+                 })
+
+         # Extract URL from paper data - prioritize the source actually used for verification
+         paper_url = None
+
+         logger.debug(f"Semantic Scholar - Extracting URL from paper data: {list(paper_data.keys())}")
+
+         # Return the Semantic Scholar URL that was actually used for verification.
+         # First priority: Semantic Scholar URL using paperId (SHA hash, works in web URLs)
+         if paper_data.get('paperId'):
+             paper_url = f"https://www.semanticscholar.org/paper/{paper_data['paperId']}"
+             logger.debug(f"Using Semantic Scholar URL for verification: {paper_url}")
+
+         # Second priority: DOI URL (if this was verified through DOI)
+         elif external_ids and external_ids.get('DOI'):
+             from refchecker.utils.doi_utils import construct_doi_url
+             paper_url = construct_doi_url(external_ids['DOI'])
+             logger.debug(f"Using DOI URL for verification: {paper_url}")
+
+         # Third priority: open access PDF
+         elif paper_data.get('openAccessPdf') and paper_data['openAccessPdf'].get('url'):
+             paper_url = paper_data['openAccessPdf']['url']
+             logger.debug(f"Using open access PDF URL: {paper_url}")
+
+         # Fourth priority: general URL field
+         elif paper_data.get('url'):
+             paper_url = paper_data['url']
+             logger.debug(f"Using general paper URL: {paper_url}")
+
+         # Last resort: arXiv URL (only if no other verification source was available)
+         elif external_ids and external_ids.get('ArXiv'):
+             arxiv_id = external_ids['ArXiv']
+             paper_url = f"https://arxiv.org/abs/{arxiv_id}"
+             logger.debug(f"Using arXiv URL as fallback: {paper_url}")
+
+         if not paper_url:
+             logger.debug(f"No URL found in paper data - available fields: {list(paper_data.keys())}")
+             logger.debug(f"Paper data sample: {str(paper_data)[:200]}...")
+
+         return paper_data, errors, paper_url
+
+     def _get_paper_from_arxiv_api(self, arxiv_id: str) -> Optional[Dict[str, Any]]:
+         """
+         Get paper metadata directly from the ArXiv API for very recent papers not yet in Semantic Scholar.
+
+         Args:
+             arxiv_id: ArXiv ID (e.g., "2507.08846")
+
+         Returns:
+             Paper data dictionary in Semantic Scholar format, or None if not found
+         """
+         try:
+             import xml.etree.ElementTree as ET
+
+             arxiv_url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
+             logger.debug(f"Querying ArXiv API: {arxiv_url}")
+
+             response = requests.get(arxiv_url, timeout=30)
+             response.raise_for_status()
+
+             # Parse XML response
+             root = ET.fromstring(response.text)
+
+             # Check if any entries were found
+             entries = root.findall('{http://www.w3.org/2005/Atom}entry')
+             if not entries:
+                 logger.debug(f"No entries found for ArXiv ID: {arxiv_id}")
+                 return None
+
+             entry = entries[0]  # Take the first entry
+
+             # Extract title
+             title_elem = entry.find('{http://www.w3.org/2005/Atom}title')
+             title = title_elem.text.strip() if title_elem is not None else ""
+
+             # Extract authors
+             authors = []
+             for author_elem in entry.findall('{http://www.w3.org/2005/Atom}author'):
+                 name_elem = author_elem.find('{http://www.w3.org/2005/Atom}name')
+                 if name_elem is not None:
+                     authors.append({"name": name_elem.text.strip()})
+
+             # Extract published date
+             published_elem = entry.find('{http://www.w3.org/2005/Atom}published')
+             year = None
+             if published_elem is not None:
+                 published_date = published_elem.text
+                 try:
+                     year = int(published_date[:4])
+                 except (ValueError, IndexError, TypeError):
+                     pass
+
+             # Create a Semantic Scholar-compatible data structure
+             paper_data = {
+                 'title': title,
+                 'authors': authors,
+                 'year': year,
+                 'externalIds': {'ArXiv': arxiv_id},
+                 'url': f"https://arxiv.org/abs/{arxiv_id}",
+                 'venue': 'arXiv',
+                 'isOpenAccess': True,
+                 'openAccessPdf': {'url': f"https://arxiv.org/pdf/{arxiv_id}.pdf"}
+             }
+
+             logger.debug(f"Successfully retrieved ArXiv paper: {title}")
+             return paper_data
+
+         except Exception as e:
+             logger.debug(f"Failed to get paper from ArXiv API: {str(e)}")
+             return None
+
+ if __name__ == "__main__":
+     # Example usage
+     checker = NonArxivReferenceChecker()
+
+     # Example reference
+     reference = {
+         'title': 'Attention is All You Need',
+         'authors': ['Ashish Vaswani', 'Noam Shazeer'],
+         'year': 2017,
+         'url': 'https://example.com/paper',
+         'raw_text': 'Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems, 30.'
+     }
+
+     # Verify the reference
+     verified_data, errors, paper_url = checker.verify_reference(reference)
+
+     if verified_data:
+         print(f"Found paper: {verified_data.get('title')}")
+
+         if errors:
+             print("Issues found:")
+             for issue in errors:
+                 # Entries may be errors, warnings, or info items
+                 issue_type = issue.get('error_type') or issue.get('warning_type') or issue.get('info_type')
+                 details = issue.get('error_details') or issue.get('warning_details') or issue.get('info_details')
+                 print(f"  - {issue_type}: {details}")
+         else:
+             print("No errors found")
+     else:
+         print("Could not find matching paper")