academic-refchecker 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
@@ -0,0 +1,540 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Local Semantic Scholar Database Client for Reference Verification
4
+
5
+ This module provides functionality to verify non-arXiv references using a local Semantic Scholar database.
6
+ It can check if a reference's metadata (authors, year, title) matches what's in the local database.
7
+
8
+ Usage:
9
+ from local_semantic_scholar import LocalNonArxivReferenceChecker
10
+
11
+ # Initialize the checker
12
+ checker = LocalNonArxivReferenceChecker(db_path="semantic_scholar_db/semantic_scholar.db")
13
+
14
+ # Verify a reference
15
+ reference = {
16
+ 'title': 'Title of the paper',
17
+ 'authors': ['Author 1', 'Author 2'],
18
+ 'year': 2020,
19
+ 'url': 'https://example.com/paper',
20
+ 'raw_text': 'Full citation text'
21
+ }
22
+
23
+ verified_data, errors, url = checker.verify_reference(reference)
24
+ """
25
+
26
+ import json
27
+ import logging
28
+ import re
29
+ import sqlite3
30
+ import time
31
+ from typing import Dict, List, Tuple, Optional, Any, Union
32
+
33
+ # Import utility functions
34
+ import sys
35
+ import os
36
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
37
+
38
+ from refchecker.utils.doi_utils import extract_doi_from_url, compare_dois, construct_doi_url
39
+ from refchecker.utils.error_utils import create_author_error, create_doi_error
40
+ from refchecker.utils.text_utils import normalize_author_name, normalize_paper_title, is_name_match, compare_authors, calculate_title_similarity
41
+ from refchecker.utils.url_utils import extract_arxiv_id_from_url, get_best_available_url
42
+ from refchecker.utils.db_utils import process_semantic_scholar_result, process_semantic_scholar_results
43
+ from refchecker.config.settings import get_config
44
+
45
+ # Set up logging
46
+ logger = logging.getLogger(__name__)
47
+
48
+ # Get configuration
49
+ config = get_config()
50
+ SIMILARITY_THRESHOLD = config["text_processing"]["similarity_threshold"]
51
+
52
def log_query_debug(query: str, params: list, execution_time: float, result_count: int, strategy: str):
    """Emit diagnostic details for a database query.

    When DEBUG logging is enabled, the full query text, bound parameters,
    timing and row count are logged individually; otherwise a compact
    one-line summary is emitted (note: still via ``logger.debug``, so it
    only appears when DEBUG output is ultimately enabled).

    Args:
        query: SQL text that was executed
        params: bound parameter values
        execution_time: elapsed wall-clock seconds for the query
        result_count: number of rows the query produced
        strategy: human-readable label for the lookup strategy used
    """
    if not logger.isEnabledFor(logging.DEBUG):
        # Compact single-line summary when verbose debug output is off.
        logger.debug(f"DB Query [{strategy}]: {result_count} results in {execution_time:.3f}s")
        return
    logger.debug(f"DB Query Strategy: {strategy}")
    logger.debug(f"DB Query: {query}")
    logger.debug(f"DB Params: {params}")
    logger.debug(f"DB Execution Time: {execution_time:.3f}s")
    logger.debug(f"DB Result Count: {result_count}")
63
+
64
class LocalNonArxivReferenceChecker:
    """Verify non-arXiv references against a local Semantic Scholar SQLite database."""

    def __init__(self, db_path: str = "semantic_scholar_db/semantic_scholar.db"):
        """Open a connection to the local Semantic Scholar database.

        Args:
            db_path: Path to the SQLite database file.
        """
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        # sqlite3.Row lets result rows be converted to dictionaries.
        self.conn.row_factory = sqlite3.Row

    # DOI extraction, title/author-name normalization, author comparison
    # and name matching are all delegated to the shared utility modules.
89
+
90
def get_paper_by_doi(self, doi: str) -> Optional[Dict[str, Any]]:
    """
    Look up a single paper by its DOI in the local database.

    Args:
        doi: DOI of the paper

    Returns:
        Paper data dictionary or None if not found
    """
    cursor = self.conn.cursor()

    # The column-based schema flattens external IDs into dedicated columns.
    query = '''
    SELECT * FROM papers
    WHERE externalIds_DOI = ?
    '''
    params = (doi,)

    started = time.time()
    cursor.execute(query, params)
    row = cursor.fetchone()
    elapsed = time.time() - started

    log_query_debug(query, list(params), elapsed, 1 if row else 0, "DOI lookup")

    if row is None:
        return None

    # Normalize the raw row into the standard paper-data structure.
    return process_semantic_scholar_result(dict(row))
124
+
125
def get_paper_by_arxiv_id(self, arxiv_id: str) -> Optional[Dict[str, Any]]:
    """
    Look up a single paper by its arXiv ID in the local database.

    Args:
        arxiv_id: arXiv ID of the paper

    Returns:
        Paper data dictionary or None if not found
    """
    cursor = self.conn.cursor()

    # The column-based schema flattens external IDs into dedicated columns.
    query = '''
    SELECT * FROM papers
    WHERE externalIds_ArXiv = ?
    '''
    params = (arxiv_id,)

    started = time.time()
    cursor.execute(query, params)
    row = cursor.fetchone()
    elapsed = time.time() - started

    log_query_debug(query, list(params), elapsed, 1 if row else 0, "arXiv ID lookup")

    if row is None:
        return None

    # Normalize the raw row into the standard paper-data structure.
    return process_semantic_scholar_result(dict(row))
159
+
160
def search_papers_by_title(self, title: str, year: Optional[int] = None) -> List[Dict[str, Any]]:
    """
    Search for papers by title in the local database.

    Only an exact lookup on the precomputed ``normalized_paper_title``
    column is performed. The ``year`` argument is accepted for interface
    compatibility but is NOT used to filter results here; callers such as
    find_best_match score the year separately.

    Args:
        title: Paper title
        year: Publication year (optional; currently unused)

    Returns:
        List of processed paper data dictionaries (possibly empty)
    """
    cursor = self.conn.cursor()

    # Strip LIKE wildcards before normalizing for the indexed lookup.
    title_cleaned = title.replace('%', '').strip()
    title_normalized = normalize_paper_title(title_cleaned)

    results = []

    # Strategy 1: exact normalized-title match (fastest and most accurate).
    # Guarded because older database files may lack the column.
    try:
        cursor.execute("PRAGMA table_info(papers)")
        columns = [row[1] for row in cursor.fetchall()]

        if 'normalized_paper_title' in columns and title_normalized:
            query = "SELECT * FROM papers WHERE normalized_paper_title = ?"
            params = [title_normalized]

            start_time = time.time()
            cursor.execute(query, params)
            results.extend([dict(row) for row in cursor.fetchall()])
            execution_time = time.time() - start_time

            log_query_debug(query, params, execution_time, len(results), "normalized title match")

            if results:
                logger.debug(f"Found {len(results)} results using normalized title match")
                return process_semantic_scholar_results(results)
    except Exception as e:
        logger.warning(f"Error in normalized title search: {e}")

    return process_semantic_scholar_results(results)
203
+
204
+ # Result processing now handled by utility function
205
+
206
def search_papers_by_author(self, author_name: str, year: Optional[int] = None) -> List[Dict[str, Any]]:
    """
    Search for papers by author name in the local database.

    A coarse SQL LIKE filter over the serialized authors JSON column
    narrows the candidate set; each candidate's parsed author list is
    then checked for a real normalized-name match to weed out the false
    positives the substring filter produces.

    Args:
        author_name: Author name
        year: Publication year (optional)

    Returns:
        List of paper data dictionaries
    """
    cursor = self.conn.cursor()

    # Strip LIKE wildcards from the user-supplied name, match as substring.
    search_name = f"%{author_name.replace('%', '').lower()}%"

    # Coarse filter: the authors column holds a JSON array serialized as
    # text, so a LIKE match is cheap but approximate.
    query = '''
    SELECT * FROM papers
    WHERE LOWER(authors) LIKE ?
    '''
    params = [search_name]

    # Optional exact-year filter.
    if year:
        query += ' AND year = ?'
        params.append(year)

    start_time = time.time()
    cursor.execute(query, params)
    execution_time = time.time() - start_time

    raw_results = cursor.fetchall()
    log_query_debug(query, params, execution_time, len(raw_results), "author name search")

    # Normalize the query name once; it is invariant across rows/authors.
    search_name_normalized = normalize_author_name(author_name)

    results = []
    for row in raw_results:
        paper_data = dict(row)

        if paper_data.get('authors'):
            try:
                authors_list = json.loads(paper_data['authors'])
            except (ValueError, TypeError) as e:
                # One corrupt row should not abort the whole search.
                logger.warning(f"Skipping paper with unparseable authors JSON: {e}")
                continue
            paper_data['authors'] = authors_list

            # Require an actual normalized-name match (reduces false positives).
            author_match = False
            for author in authors_list:
                author_name_normalized = normalize_author_name(author.get('name', ''))
                if search_name_normalized in author_name_normalized:
                    author_match = True
                    break

            if not author_match:
                continue
        else:
            paper_data['authors'] = []

        # Reconstruct the externalIds mapping from the flattened columns
        # (e.g. externalIds_DOI -> externalIds['DOI']).
        external_ids = {}
        for key, value in paper_data.items():
            if key.startswith('externalIds_') and value:
                external_ids[key.replace('externalIds_', '')] = value
        paper_data['externalIds'] = external_ids

        # Inflate the remaining JSON-encoded fields.
        if paper_data.get('s2FieldsOfStudy'):
            paper_data['s2FieldsOfStudy'] = json.loads(paper_data['s2FieldsOfStudy'])
        if paper_data.get('publicationTypes'):
            paper_data['publicationTypes'] = json.loads(paper_data['publicationTypes'])

        results.append(paper_data)

    return results
286
+
287
def find_best_match(self, title: str, authors: List[str], year: Optional[int] = None) -> Optional[Dict[str, Any]]:
    """
    Find the best matching paper in the local database.

    Candidates from a title search are scored by title similarity, with
    a +0.2 bonus for a matching first author and +0.1 for a matching
    year; the top-scoring candidate is returned only if it clears the
    configured similarity threshold.

    Args:
        title: Paper title
        authors: List of author names
        year: Publication year (optional)

    Returns:
        Best matching paper data dictionary or None if not found
    """
    logger.debug(f"Local DB: Finding best match for title: '{title}', authors: {authors}, year: {year}")

    candidates = self.search_papers_by_title(title, year)
    logger.debug(f"Local DB: Title search returned {len(candidates)} results")

    if candidates:
        scored_results = []

        for candidate in candidates:
            candidate_title = candidate.get('title', '')

            # Base score: fuzzy similarity between cited and stored titles.
            score = calculate_title_similarity(title, candidate_title)

            # Bonus when the cited first author matches the candidate's.
            if authors and candidate.get('authors'):
                cited_first = normalize_author_name(authors[0])
                found_first = normalize_author_name(candidate['authors'][0].get('name', ''))
                if is_name_match(cited_first, found_first):
                    score += 0.2

            # Bonus for an exact publication-year match.
            if year and candidate.get('year') == year:
                score += 0.1

            logger.debug(f"Local DB: Candidate match score {score:.2f} for '{candidate_title}'")
            scored_results.append((score, candidate))

        if scored_results:
            # Highest score wins; ties break on title for stable, deterministic
            # ordering when scores are equal.
            best_score, best_match = min(
                scored_results,
                key=lambda pair: (-pair[0], pair[1].get('title', ''))
            )

            if best_score >= SIMILARITY_THRESHOLD:
                logger.debug(f"Local DB: Found good title match with score {best_score:.2f}")
                return best_match
            logger.debug(f"Local DB: Best title match score {best_score:.2f} below threshold ({SIMILARITY_THRESHOLD})")

    logger.debug("Local DB: No good match found")
    return None
348
+
349
def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
    """
    Verify a non-arXiv reference using the local database.

    Lookup order: arXiv ID (extracted from the URL), then DOI (explicit
    field or extracted from the URL), then fuzzy title/author matching.
    When a paper is found, the cited authors, year and DOI are validated
    against the database record.

    Args:
        reference: Reference data dictionary

    Returns:
        Tuple of (verified_data, errors, url)
        - verified_data: Paper data from the database or None if not found
        - errors: List of error dictionaries
        - url: URL of the paper if found, None otherwise
    """
    errors = []

    # Extract reference data
    title = reference.get('title', '')
    authors = reference.get('authors', [])
    year = reference.get('year', 0)
    url = reference.get('url', '')

    logger.debug(f"Local DB: Verifying reference - Title: '{title}', Authors: {authors}, Year: {year}")

    # Prefer explicit identifiers: DOI field first, then IDs embedded in the URL.
    doi = None
    arxiv_id = None
    if reference.get('doi'):
        doi = reference['doi']
    elif url:
        # arXiv URLs take precedence; otherwise try to pull a DOI out of the URL.
        arxiv_id = extract_arxiv_id_from_url(url)
        if not arxiv_id:
            doi = extract_doi_from_url(url)

    paper_data = None

    # Try arXiv ID first if available.
    if arxiv_id:
        logger.debug(f"Local DB: Searching by arXiv ID: {arxiv_id}")
        paper_data = self.get_paper_by_arxiv_id(arxiv_id)
        if paper_data:
            logger.debug(f"Found paper by arXiv ID: {arxiv_id}")
        else:
            logger.warning(f"Could not find paper with arXiv ID: {arxiv_id}")

    # Try DOI if the paper has not been found yet.
    if not paper_data and doi:
        logger.debug(f"Local DB: Searching by DOI: {doi}")
        paper_data = self.get_paper_by_doi(doi)
        if paper_data:
            logger.debug(f"Found paper by DOI: {doi}")
        else:
            logger.debug(f"Could not find paper with DOI: {doi}")

    # Fall back to fuzzy title/author matching.
    if not paper_data and (title or authors):
        logger.debug(f"Local DB: Searching by title/authors - Title: '{title}', Authors: {authors}, Year: {year}")
        paper_data = self.find_best_match(title, authors, year)
        if paper_data:
            logger.debug(f"Found paper by title/author search")
        else:
            logger.debug(f"Could not find matching paper for reference")

    # Nothing found: we cannot verify, which is not itself an error.
    if not paper_data:
        logger.debug("Local DB: No matching paper found - cannot verify reference")
        return None, [], None

    logger.debug(f"Local DB: Found matching paper - Title: '{paper_data.get('title', '')}', Year: {paper_data.get('year', '')}")

    # Verify authors.
    if authors:
        authors_match, author_error = compare_authors(authors, paper_data.get('authors', []))
        if not authors_match:
            logger.debug(f"Local DB: Author mismatch - {author_error}")
            errors.append(create_author_error(author_error, paper_data.get('authors', [])))

    # Verify year, allowing a configurable tolerance (default: 1 year).
    paper_year = paper_data.get('year')
    year_tolerance = 1
    try:
        # BUG FIX: use the module-level get_config (refchecker.config.settings).
        # The previous code imported from a non-existent 'config.settings'
        # module, so the configured tolerance was silently never read.
        year_tolerance = get_config().get('text_processing', {}).get('year_tolerance', 1)
    except Exception:
        pass  # Fall back to the default tolerance if config is unavailable

    from refchecker.utils.error_utils import validate_year
    year_warning = validate_year(
        cited_year=year,
        paper_year=paper_year,
        year_tolerance=year_tolerance
    )
    if year_warning:
        logger.debug(f"Local DB: Year issue - {year_warning.get('warning_details', '')}")
        errors.append(year_warning)

    # Verify the DOI against the database record, if both sides have one.
    external_ids = paper_data.get('externalIds', {})
    paper_doi = external_ids.get('DOI') if external_ids else None

    if doi and paper_doi and not compare_dois(doi, paper_doi):
        logger.debug(f"Local DB: DOI mismatch - cited: {doi}, actual: {paper_doi}")
        doi_error = create_doi_error(doi, paper_doi)
        if doi_error:  # Only add if there's actually a mismatch after cleaning
            errors.append(doi_error)

    if errors:
        logger.debug(f"Local DB: Found {len(errors)} errors in reference verification")
    else:
        logger.debug("Local DB: Reference verification passed - no errors found")

    # Report the Semantic Scholar URL actually used for verification
    # (this is a Semantic Scholar database checker), falling back to the
    # best available alternative when no paperId is present.
    if paper_data.get('paperId'):
        paper_url = f"https://www.semanticscholar.org/paper/{paper_data['paperId']}"
        logger.debug(f"Using Semantic Scholar URL for verification: {paper_url}")
    else:
        open_access_pdf = paper_data.get('openAccessPdf')
        paper_url = get_best_available_url(external_ids, open_access_pdf, paper_data.get('paperId'))
        if paper_url:
            logger.debug(f"Using fallback URL: {paper_url}")

    return paper_data, errors, paper_url
488
+
489
def close(self):
    """Release the underlying SQLite database connection."""
    self.conn.close()
492
+
493
if __name__ == "__main__":
    # Example usage / smoke test for the checker.
    import sys

    # Set up logging to stdout.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )

    # The database path may be supplied as the first CLI argument.
    if len(sys.argv) > 1:
        db_path = sys.argv[1]
    else:
        db_path = "semantic_scholar_db/semantic_scholar.db"

    checker = LocalNonArxivReferenceChecker(db_path=db_path)
    try:
        # Example reference
        reference = {
            'title': 'Attention is All You Need',
            'authors': ['Ashish Vaswani', 'Noam Shazeer'],
            'year': 2017,
            'url': 'https://example.com/paper',
            'raw_text': 'Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems, 30.'
        }

        # BUG FIX: verify_reference returns a 3-tuple (data, errors, url);
        # the previous 2-way unpack raised ValueError at runtime.
        verified_data, errors, paper_url = checker.verify_reference(reference)

        if verified_data:
            print(f"Found paper: {verified_data.get('title')}")

            if errors:
                print("Errors found:")
                for error in errors:
                    print(f" - {error['error_type']}: {error['error_details']}")
            else:
                print("No errors found")
        else:
            print("Could not find matching paper")
    finally:
        # Always release the database connection, even on failure.
        checker.close()