academic_refchecker-2.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
refchecker/checkers/openalex.py
@@ -0,0 +1,513 @@
+#!/usr/bin/env python3
+"""
+OpenAlex API Client for Reference Verification
+
+This module provides functionality to verify non-arXiv references using the OpenAlex API.
+It can check whether a reference's metadata (authors, year, title) matches what's in the OpenAlex database.
+
+OpenAlex is a comprehensive database of scholarly papers, authors, venues, institutions, and concepts.
+It provides free access to over 240 million research outputs with excellent coverage across all disciplines.
+
+Usage:
+    from refchecker.checkers.openalex import OpenAlexReferenceChecker
+
+    # Initialize the checker
+    checker = OpenAlexReferenceChecker(email="your@email.com")  # Email for polite pool
+
+    # Verify a reference
+    reference = {
+        'title': 'Title of the paper',
+        'authors': ['Author 1', 'Author 2'],
+        'year': 2020,
+        'url': 'https://example.com/paper',
+        'raw_text': 'Full citation text'
+    }
+
+    verified_data, errors, url = checker.verify_reference(reference)
+"""
+
+import requests
+import time
+import logging
+import re
+from typing import Dict, List, Tuple, Optional, Any, Union
+from urllib.parse import quote_plus
+from refchecker.utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, compare_authors, clean_title_for_search
+from refchecker.utils.error_utils import format_year_mismatch, format_doi_mismatch
+from refchecker.config.settings import get_config
+
+# Set up logging
+logger = logging.getLogger(__name__)
+
+# Get configuration
+config = get_config()
+SIMILARITY_THRESHOLD = config["text_processing"]["similarity_threshold"]
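+# Title matches scoring below this threshold are rejected in verify_reference()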
+
+class OpenAlexReferenceChecker:
+    """
+    A class to verify non-arXiv references using the OpenAlex API
+    """
+
+    def __init__(self, email: Optional[str] = None):
+        """
+        Initialize the OpenAlex API client
+
+        Args:
+            email: Optional email for polite pool access (better performance)
+        """
+        self.base_url = "https://api.openalex.org"
+        self.headers = {
+            "Accept": "application/json",
+            "User-Agent": "RefChecker/1.0.0 (https://github.com/markrussinovich/refchecker)"
+        }
+
+        # Add email to headers for polite pool access
+        if email:
+            self.headers["User-Agent"] += f" mailto:{email}"
+
+        # Rate limiting parameters - OpenAlex allows 10 requests per second
+        self.request_delay = 0.1  # 100ms between requests (10 req/sec)
+        self.max_retries = 3
+        self.backoff_factor = 2  # Exponential backoff factor
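+        # With these settings the retry waits (request_delay * backoff_factor**attempt + 1)
+        # work out to roughly 1.1s, 1.2s, and 1.4s across the three attempts.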
+
+    def search_works(self, query: str, year: Optional[int] = None, limit: int = 5) -> List[Dict[str, Any]]:
+        """
+        Search for works matching the query
+
+        Args:
+            query: Search query (title, authors, etc.)
+            year: Publication year to filter by
+            limit: Maximum number of results to return
+
+        Returns:
+            List of work data dictionaries
+        """
+        endpoint = f"{self.base_url}/works"
+
+        # Build query parameters - OpenAlex uses a flexible search syntax
+        params = {
+            "search": query,
+            "per_page": min(limit, 25),  # OpenAlex max per page is 200, but we limit for performance
+            "select": "id,doi,title,display_name,publication_year,authorships,type,open_access,primary_location,locations,referenced_works,ids"
+        }
+
+        # Add year filter if provided
+        if year:
+            params["filter"] = f"publication_year:{year}"
+
+        # Make the request with retries and backoff
+        for attempt in range(self.max_retries):
+            try:
+                response = requests.get(endpoint, headers=self.headers, params=params, timeout=30)
+
+                # Check for rate limiting (OpenAlex returns 429 for rate limits)
+                if response.status_code == 429:
+                    wait_time = self.request_delay * (self.backoff_factor ** attempt) + 1
+                    logger.debug(f"OpenAlex rate limit exceeded. Retrying in {wait_time:.2f} seconds...")
+                    time.sleep(wait_time)
+                    continue
+
+                # Check for other errors
+                response.raise_for_status()
+
+                # Parse the response
+                data = response.json()
+                results = data.get('results', [])
+
+                logger.debug(f"OpenAlex search returned {len(results)} results for query: {query[:50]}...")
+                return results
+
+            except requests.exceptions.RequestException as e:
+                wait_time = self.request_delay * (self.backoff_factor ** attempt) + 1
+                logger.debug(f"OpenAlex request failed: {str(e)}. Retrying in {wait_time:.2f} seconds...")
+                time.sleep(wait_time)
+
+        # If we get here, all retries failed
+        logger.debug(f"Failed to search OpenAlex after {self.max_retries} attempts")
+        return []
+
+    def get_work_by_doi(self, doi: str) -> Optional[Dict[str, Any]]:
+        """
+        Get work data by DOI
+
+        Args:
+            doi: DOI of the work
+
+        Returns:
+            Work data dictionary or None if not found
+        """
+        # Clean DOI - remove any prefixes
+        clean_doi = doi
+        if doi.startswith('doi:'):
+            clean_doi = doi[4:]
+        elif doi.startswith('https://doi.org/'):
+            clean_doi = doi[16:]
+        elif doi.startswith('http://doi.org/'):
+            clean_doi = doi[15:]
+
+        endpoint = f"{self.base_url}/works/doi:{clean_doi}"
+
+        params = {
+            "select": "id,doi,title,display_name,publication_year,authorships,type,open_access,primary_location,locations,referenced_works,ids"
+        }
+
+        # Make the request with retries and backoff
+        for attempt in range(self.max_retries):
+            try:
+                # Add delay to respect rate limits
+                time.sleep(self.request_delay)
+
+                response = requests.get(endpoint, headers=self.headers, params=params, timeout=30)
+
+                # Check for rate limiting
+                if response.status_code == 429:
+                    wait_time = self.request_delay * (self.backoff_factor ** attempt) + 1
+                    logger.debug(f"OpenAlex rate limit exceeded. Retrying in {wait_time:.2f} seconds...")
+                    time.sleep(wait_time)
+                    continue
+
+                # If not found, return None
+                if response.status_code == 404:
+                    logger.debug(f"Work with DOI {doi} not found in OpenAlex")
+                    return None
+
+                # Check for other errors
+                response.raise_for_status()
+
+                # Parse the response
+                work_data = response.json()
+                logger.debug(f"Found work by DOI in OpenAlex: {doi}")
+                return work_data
+
+            except requests.exceptions.RequestException as e:
+                wait_time = self.request_delay * (self.backoff_factor ** attempt) + 1
+                logger.debug(f"OpenAlex request failed: {str(e)}. Retrying in {wait_time:.2f} seconds...")
+                time.sleep(wait_time)
+
+        # If we get here, all retries failed
+        logger.error(f"Failed to get work by DOI from OpenAlex after {self.max_retries} attempts")
+        return None
+
+    def extract_doi_from_url(self, url: str) -> Optional[str]:
+        """
+        Extract DOI from a URL
+
+        Args:
+            url: URL that might contain a DOI
+
+        Returns:
+            Extracted DOI or None if not found
+        """
+        if not url:
+            return None
+
+        # Only extract DOIs from actual DOI URLs, not from other domains
+        # This prevents false positives from URLs like aclanthology.org
+        if 'doi.org' not in url and 'doi:' not in url:
+            return None
+
+        # Check if it's a DOI URL
+        doi_patterns = [
+            r'doi\.org/([^/\s\?#]+(?:/[^/\s\?#]+)*)',  # Full DOI pattern
+            r'doi:([^/\s\?#]+(?:/[^/\s\?#]+)*)',       # doi: prefix
+        ]
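+        # e.g. "https://doi.org/10.1038/nature14539" captures "10.1038/nature14539",
+        # which then passes the "10."-prefix and slash sanity checks below.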
+
+        for pattern in doi_patterns:
+            match = re.search(pattern, url)
+            if match:
+                doi_candidate = match.group(1)
+                # DOIs must start with "10." and have at least one slash
+                if doi_candidate.startswith('10.') and '/' in doi_candidate and len(doi_candidate) > 6:
+                    return doi_candidate
+
+        return None
+
+    def normalize_author_name(self, name: str) -> str:
+        """
+        Normalize author name for comparison
+
+        Args:
+            name: Author name
+
+        Returns:
+            Normalized name
+        """
+        # Remove reference numbers (e.g., "[1]")
+        name = re.sub(r'^\[\d+\]', '', name)
+
+        # Use common normalization function
+        return normalize_text(name)
+
+    def compare_authors(self, cited_authors: List[str], openalex_authors: List[Dict[str, Any]]) -> Tuple[bool, str]:
+        """
+        Compare author lists to check if they match (delegates to shared utility)
+
+        Args:
+            cited_authors: List of author names as cited
+            openalex_authors: List of authorship data from OpenAlex
+
+        Returns:
+            Tuple of (match_result, error_message)
+        """
+        # Extract author names from OpenAlex data for the shared utility
+        author_dicts = []
+        for authorship in openalex_authors:
+            author = authorship.get('author', {})
+            display_name = author.get('display_name', '')
+            if display_name:
+                author_dicts.append({'name': display_name})
+
+        return compare_authors(cited_authors, author_dicts)
+
+    def is_name_match(self, name1: str, name2: str) -> bool:
+        """
+        Check if two author names match, allowing for variations
+
+        Args:
+            name1: First author name (normalized)
+            name2: Second author name (normalized)
+
+        Returns:
+            True if names match, False otherwise
+        """
+        # Exact match
+        if name1 == name2:
+            return True
+
+        # If one is a substring of the other, consider it a match
+        if name1 in name2 or name2 in name1:
+            return True
+
+        # Split into parts (first name, last name, etc.)
+        parts1 = name1.split()
+        parts2 = name2.split()
+
+        if not parts1 or not parts2:
+            return False
+
+        # If either name has only one part, compare directly
+        if len(parts1) == 1 or len(parts2) == 1:
+            return parts1[-1] == parts2[-1]  # Compare last parts (last names)
+
+        # Compare last names (last parts)
+        if parts1[-1] != parts2[-1]:
+            return False
+
+        # Compare first initials
+        if len(parts1[0]) > 0 and len(parts2[0]) > 0 and parts1[0][0] != parts2[0][0]:
+            return False
+
+        return True
+
+    def extract_url_from_work(self, work_data: Dict[str, Any]) -> Optional[str]:
+        """
+        Extract the best URL from OpenAlex work data
+
+        Args:
+            work_data: Work data from OpenAlex
+
+        Returns:
+            Best available URL or None
+        """
+        # Priority order: open access PDF, primary location, other locations, DOI
+
+        # Check for open access PDF
+        open_access = work_data.get('open_access', {})
+        if open_access.get('is_oa') and open_access.get('oa_url'):
+            logger.debug(f"Found open access URL: {open_access['oa_url']}")
+            return open_access['oa_url']
+
+        # Check primary location
+        primary_location = work_data.get('primary_location', {})
+        if primary_location:
+            # Try landing page URL first
+            if primary_location.get('landing_page_url'):
+                logger.debug(f"Found primary location URL: {primary_location['landing_page_url']}")
+                return primary_location['landing_page_url']
+
+            # Try PDF URL
+            if primary_location.get('pdf_url'):
+                logger.debug(f"Found primary location PDF: {primary_location['pdf_url']}")
+                return primary_location['pdf_url']
+
+        # Check other locations for PDFs
+        locations = work_data.get('locations', [])
+        for location in locations:
+            if location.get('pdf_url'):
+                logger.debug(f"Found PDF in location: {location['pdf_url']}")
+                return location['pdf_url']
+            if location.get('landing_page_url'):
+                logger.debug(f"Found landing page in location: {location['landing_page_url']}")
+                return location['landing_page_url']
+
+        # Fall back to DOI URL
+        doi = work_data.get('doi')
+        if doi:
+            from refchecker.utils.doi_utils import construct_doi_url
+            doi_url = construct_doi_url(doi)
+            logger.debug(f"Generated DOI URL: {doi_url}")
+            return doi_url
+
+        # Check ids for other identifiers
+        ids = work_data.get('ids', {})
+        if ids.get('doi'):
+            from refchecker.utils.doi_utils import construct_doi_url
+            doi_url = construct_doi_url(ids['doi'])
+            logger.debug(f"Generated DOI URL from ids: {doi_url}")
+            return doi_url
+
+        logger.debug("No URL found in OpenAlex work data")
+        return None
+
+    def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+        """
+        Verify a non-arXiv reference using OpenAlex
+
+        Args:
+            reference: Reference data dictionary
+
+        Returns:
+            Tuple of (verified_data, errors, url)
+            - verified_data: Work data from OpenAlex or None if not found
+            - errors: List of error dictionaries
+            - url: URL of the work if found, None otherwise
+        """
+        errors = []
+
+        # Extract reference data
+        title = reference.get('title', '') or ''
+        authors = reference.get('authors', [])
+        year = reference.get('year', 0)
+        url = reference.get('url', '')
+        raw_text = reference.get('raw_text', '')
+
+        # If we have a DOI, try to get the work directly
+        doi = None
+        if 'doi' in reference and reference['doi']:
+            doi = reference['doi']
+        elif url:
+            doi = self.extract_doi_from_url(url)
+
+        work_data = None
+
+        if doi:
+            # Try to get the work by DOI
+            work_data = self.get_work_by_doi(doi)
+
+            if work_data:
+                logger.debug(f"Found work by DOI in OpenAlex: {doi}")
+            else:
+                logger.debug(f"Could not find work with DOI in OpenAlex: {doi}")
+
+        # If we couldn't get the work by DOI, try searching by title
+        if not work_data and title:
+            # Clean up the title for search using centralized utility function
+            cleaned_title = clean_title_for_search(title)
+
+            # Search for the work
+            search_results = self.search_works(cleaned_title, year)
+
+            if search_results:
+                best_match, best_score = find_best_match(search_results, cleaned_title, year, authors)
+
+                # Use match if score is good enough
+                if best_match and best_score >= SIMILARITY_THRESHOLD:
+                    work_data = best_match
+                    logger.debug(f"Found work by title in OpenAlex with score {best_score:.2f}: {cleaned_title}")
+                else:
+                    logger.debug(f"No good title match found in OpenAlex (best score: {best_score:.2f})")
+            else:
+                logger.debug(f"No works found for title in OpenAlex: {cleaned_title}")
+
+        # If we still couldn't find the work, return no verification
+        if not work_data:
+            logger.debug("Could not find matching work in OpenAlex")
+            return None, [], None
+
+        # Verify authors
+        if authors:
+            authorships = work_data.get('authorships', [])
+            authors_match, author_error = self.compare_authors(authors, authorships)
+
+            if not authors_match:
+                # Extract correct author names for error reporting
+                correct_author_names = []
+                for authorship in authorships:
+                    author = authorship.get('author', {})
+                    display_name = author.get('display_name', '')
+                    if display_name:
+                        correct_author_names.append(display_name)
+
+                errors.append({
+                    'error_type': 'author',
+                    'error_details': author_error,
+                    'ref_authors_correct': ', '.join(correct_author_names)
+                })
+
+        # Verify year
+        work_year = work_data.get('publication_year')
+        if year and work_year and year != work_year:
+            errors.append({
+                'warning_type': 'year',
+                'warning_details': format_year_mismatch(year, work_year),
+                'ref_year_correct': work_year
+            })
+
+        # Verify DOI
+        work_doi = work_data.get('doi')
+        if not work_doi and work_data.get('ids', {}).get('doi'):
+            work_doi = work_data['ids']['doi']
+
+        if doi and work_doi:
+            # Compare DOIs using the proper comparison function
+            from refchecker.utils.doi_utils import compare_dois, validate_doi_resolves
+            if not compare_dois(doi, work_doi):
+                # If cited DOI resolves, it's likely a valid alternate DOI (e.g., arXiv vs conference)
+                # Treat as warning instead of error
+                if validate_doi_resolves(doi):
+                    errors.append({
+                        'warning_type': 'doi',
+                        'warning_details': format_doi_mismatch(doi, work_doi),
+                        'ref_doi_correct': work_doi
+                    })
+                else:
+                    errors.append({
+                        'error_type': 'doi',
+                        'error_details': format_doi_mismatch(doi, work_doi),
+                        'ref_doi_correct': work_doi
+                    })
+
+        # Extract URL from work data
+        work_url = self.extract_url_from_work(work_data)
+
+        return work_data, errors, work_url
+
+if __name__ == "__main__":
+    # Example usage
+    checker = OpenAlexReferenceChecker(email="test@example.com")
+
+    # Example reference
+    reference = {
+        'title': 'Attention is All You Need',
+        'authors': ['Ashish Vaswani', 'Noam Shazeer'],
+        'year': 2017,
+        'url': 'https://example.com/paper',
+        'raw_text': 'Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems, 30.'
+    }
+
+    # Verify the reference
+    verified_data, errors, url = checker.verify_reference(reference)
+
+    if verified_data:
+        print(f"Found work: {verified_data.get('title') or verified_data.get('display_name')}")
+        print(f"URL: {url}")
+
+        if errors:
+            print("Errors found:")
+            for error in errors:
+                error_type = error.get('error_type') or error.get('warning_type')
+                print(f"  - {error_type}: {error.get('error_details') or error.get('warning_details')}")
+        else:
+            print("No errors found")
+    else:
+        print("Could not find matching work")