academic-refchecker 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
@@ -0,0 +1,1725 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Download Semantic Scholar Paper Metadata
4
+
5
+ This script downloads paper metadata from the Semantic Scholar API and stores it in a SQLite database.
6
+ The database can then be used by the local_semantic_scholar.py module to verify references offline.
7
+
8
+ Usage:
9
+ python download_semantic_scholar_db.py [--output-dir DIR] [--batch-size N] [--api-key KEY] [--fields FIELDS]
10
+
11
+ Options:
12
+ --output-dir DIR Directory to store the database (default: semantic_scholar_db)
13
+ --batch-size N Number of papers to download in each batch (default: 100)
14
+ --api-key KEY Semantic Scholar API key (optional, increases rate limits)
15
+ --fields FIELDS Comma-separated list of fields to include (default: id,title,authors,year,externalIds,url,abstract)
16
+ --query QUERY Search query to download papers
17
+ --start-year YEAR Start year for downloading papers by year range
18
+ --end-year YEAR End year for downloading papers by year range
19
+ --field FIELD Field or subject area for downloading papers by field
20
+ --download-dataset Download the official Semantic Scholar dataset files (.gz)
21
+ --process-local-files Process existing .gz files in the output directory into the database
22
+ --force-reprocess Force reprocessing of all files (use with --process-local-files)
23
+
24
+ Behavior:
25
+ - If the database does not exist, a full download is performed.
26
+ - If the database exists, an incremental update is performed automatically.
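+ 
+ Examples (illustrative invocations; values are placeholders, not defaults):
+ python download_semantic_scholar_db.py --download-dataset --api-key YOUR_KEY
+ python download_semantic_scholar_db.py --process-local-files
+ python download_semantic_scholar_db.py --query "graph neural networks" --start-year 2020 --end-year 2023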
27
+ """
28
+
29
+ import argparse
30
+ import json
31
+ import logging
32
+ import os
33
+ import requests
34
+ import sqlite3
35
+ import sys
36
+ import time
37
+ import random
38
+ import concurrent.futures
39
+ import gzip
40
+ import hashlib
41
+ import re
42
+ import urllib.parse
43
+ import dateutil.parser
44
+ from datetime import datetime, timezone, timedelta
45
+ from tqdm import tqdm
46
+ from functools import lru_cache
47
+
48
+ # Set up logging
49
+ logging.basicConfig(
50
+ level=logging.INFO,
51
+ format='%(asctime)s - %(levelname)s - %(message)s',
52
+ handlers=[
53
+ logging.StreamHandler(sys.stdout)
54
+ ]
55
+ )
56
+ logger = logging.getLogger(__name__)
57
+
58
+ class SemanticScholarDownloader:
59
+ """
60
+ Class to download paper metadata from Semantic Scholar and store it in a SQLite database
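+ 
+ Example programmatic use (a sketch; values are placeholders):
+ downloader = SemanticScholarDownloader(output_dir="semantic_scholar_db", api_key="YOUR_KEY")
+ downloader.download_dataset_files()
+ downloader.process_local_files()
+ downloader.close()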
61
+ """
62
+
63
+ def __init__(self, output_dir="semantic_scholar_db", batch_size=100, api_key=None, fields=None):
64
+ """
65
+ Initialize the downloader
66
+
67
+ Args:
68
+ output_dir: Directory to store the database
69
+ batch_size: Number of papers to download in each batch
70
+ api_key: Semantic Scholar API key (optional)
71
+ fields: List of fields to include in the API response
72
+ """
73
+ self.output_dir = output_dir
74
+ self.batch_size = batch_size
75
+ self.api_key = api_key
76
+
77
+ # Default fields to include
78
+ if fields is None:
79
+ self.fields = ["id", "title", "authors", "year", "externalIds", "url", "abstract"]
80
+ else:
81
+ self.fields = fields
82
+
83
+ # Create output directory if it doesn't exist
84
+ os.makedirs(output_dir, exist_ok=True)
85
+
86
+ # Initialize database
87
+ self.db_path = os.path.join(output_dir, "semantic_scholar.db")
88
+ self.conn = self._get_db_connection()
89
+ self.create_tables()
90
+
91
+ # Set up session for API requests
92
+ self.session = requests.Session()
93
+ if self.api_key:
94
+ self.session.headers.update({"x-api-key": self.api_key})
95
+
96
+ def _get_db_connection(self):
97
+ """Get a connection to the SQLite database with optimized settings"""
98
+ conn = sqlite3.connect(self.db_path)
99
+ conn.execute("PRAGMA journal_mode=WAL") # Write-Ahead Logging for better concurrency
100
+ conn.execute("PRAGMA synchronous=NORMAL") # Reduce synchronous writes for better performance
101
+ conn.execute("PRAGMA cache_size=10000") # Increase cache size
102
+ conn.execute("PRAGMA temp_store=MEMORY") # Store temp tables in memory
103
+ return conn
104
+
105
+ def create_tables(self):
106
+ """Create database tables if they don't exist"""
107
+ cursor = self.conn.cursor()
108
+
109
+ # Create papers table with comprehensive schema
110
+ cursor.execute('''
111
+ CREATE TABLE IF NOT EXISTS papers (
112
+ paperId TEXT PRIMARY KEY,
113
+ corpusId INTEGER,
114
+ title TEXT,
115
+ normalized_paper_title TEXT,
116
+ abstract TEXT,
117
+ venue TEXT,
118
+ publicationVenueId TEXT,
119
+ year INTEGER,
120
+ referenceCount INTEGER,
121
+ citationCount INTEGER,
122
+ influentialCitationCount INTEGER,
123
+ isOpenAccess BOOLEAN,
124
+ publicationDate TEXT,
125
+ url TEXT,
126
+
127
+ -- External IDs (flattened)
128
+ externalIds_MAG TEXT,
129
+ externalIds_CorpusId TEXT,
130
+ externalIds_ACL TEXT,
131
+ externalIds_PubMed TEXT,
132
+ externalIds_DOI TEXT,
133
+ externalIds_PubMedCentral TEXT,
134
+ externalIds_DBLP TEXT,
135
+ externalIds_ArXiv TEXT,
136
+
137
+ -- Journal info (flattened)
138
+ journal_name TEXT,
139
+ journal_pages TEXT,
140
+ journal_volume TEXT,
141
+
142
+ -- Lists stored as JSON for complex queries
143
+ authors TEXT, -- JSON array
144
+ s2FieldsOfStudy TEXT, -- JSON array
145
+ publicationTypes TEXT, -- JSON array
146
+
147
+ -- Full JSON for complete data access
148
+ json_data TEXT
149
+ )
150
+ ''')
151
+
152
+ # Create metadata table for tracking incremental updates
153
+ cursor.execute('''
154
+ CREATE TABLE IF NOT EXISTS metadata (
155
+ key TEXT PRIMARY KEY,
156
+ value TEXT,
157
+ updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
158
+ )
159
+ ''')
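+ # Keys stored in this table include 'last_update_time', 'last_release_id', and
+ # per-file entries such as 'file_hash_<name>', 'file_mtime_<name>', 'file_size_<name>',
+ # 'file_records_<name>' and 'file_processed_<name>' (see _track_file_processing below).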
160
+
161
+ # Create indexes for efficient querying
162
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_papers_year ON papers(year)')
163
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_papers_title ON papers(title)')
164
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_papers_normalized_title ON papers(normalized_paper_title)')
165
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_papers_venue ON papers(venue)')
166
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_papers_citationCount ON papers(citationCount)')
167
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_papers_doi ON papers(externalIds_DOI)')
168
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_papers_arxiv ON papers(externalIds_ArXiv)')
169
+
170
+ self.conn.commit()
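+ # Example lookups this schema and these indexes are meant to support (illustrative only):
+ #   SELECT paperId, title, year FROM papers WHERE normalized_paper_title = ?
+ #   SELECT paperId, title FROM papers WHERE externalIds_DOI = ?
+ #   SELECT paperId, title FROM papers WHERE externalIds_ArXiv = ?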
171
+
172
+ def get_metadata(self, key: str, default=None):
173
+ """Get metadata value from the database"""
174
+ cursor = self.conn.cursor()
175
+ cursor.execute("SELECT value FROM metadata WHERE key = ?", (key,))
176
+ result = cursor.fetchone()
177
+ return result[0] if result else default
178
+
179
+ def set_metadata(self, key: str, value: str):
180
+ """Set metadata value in the database"""
181
+ cursor = self.conn.cursor()
182
+ cursor.execute("""
183
+ INSERT OR REPLACE INTO metadata (key, value, updated_at)
184
+ VALUES (?, ?, CURRENT_TIMESTAMP)
185
+ """, (key, value))
186
+ self.conn.commit()
187
+
188
+ def get_last_update_time(self):
189
+ """Get the timestamp of the last successful update"""
190
+ return self.get_metadata('last_update_time')
191
+
192
+ def set_last_update_time(self, timestamp: str):
193
+ """Set the timestamp of the last successful update"""
194
+ self.set_metadata('last_update_time', timestamp)
195
+
196
+ def get_last_release_id(self):
197
+ """Get the last processed release ID"""
198
+ return self.get_metadata('last_release_id')
199
+
200
+ def set_last_release_id(self, release_id: str):
201
+ """Set the last processed release ID"""
202
+ self.set_metadata('last_release_id', release_id)
203
+
204
+ def check_for_updates(self):
205
+ """
206
+ Check if there are new releases or incremental updates available
207
+
208
+ Returns:
209
+ dict: Information about available updates
210
+ """
211
+ try:
212
+ # Get the latest release information
213
+ latest_release = self.get_latest_release_id()
214
+ last_release = self.get_last_release_id()
215
+ last_update_time = self.get_last_update_time()
216
+
217
+ logger.info(f"Latest release: {latest_release}")
218
+ logger.info(f"Last release: {last_release}")
219
+
220
+ # Check if database has records but no update time
221
+ if not last_update_time:
222
+
223
+ # Does the database already contain any records?
224
+ cursor = self.conn.cursor()
225
+ cursor.execute("SELECT EXISTS(SELECT 1 FROM papers LIMIT 1)")
226
+ record_count = cursor.fetchone()[0]
227
+
228
+ if record_count > 0:
229
+ # Database has records but no update time - create a reasonable timestamp
230
+ # Use a timestamp from 1 day ago to check for recent updates
231
+ default_update_time = (datetime.now(timezone.utc) - timedelta(days=1)).isoformat()
232
+ logger.info(f"Creating default update time: {default_update_time}")
233
+ last_update_time = default_update_time
234
+ else:
235
+ logger.info(f"Last update time: {last_update_time}")
236
+
237
+ # Check for incremental updates using release IDs instead of timestamps
238
+ if last_release:
239
+ logger.info("Checking for incremental updates since last release...")
240
+ incremental_updates = self.check_incremental_updates(last_release)
241
+ if incremental_updates:
242
+ return {
243
+ 'has_updates': True,
244
+ 'latest_release': latest_release,
245
+ 'last_release': last_release,
246
+ 'is_new_release': False,
247
+ 'incremental_updates': incremental_updates,
248
+ 'message': f'Incremental updates available from {last_release} to {latest_release}'
249
+ }
250
+ else:
251
+ logger.info("No incremental updates found")
252
+ else:
253
+ logger.info("No last release ID available, skipping incremental check")
254
+
255
+ # Check for new releases
256
+ if not last_release:
257
+ logger.info("No previous release ID found in database")
258
+ return {
259
+ 'has_updates': True,
260
+ 'latest_release': latest_release,
261
+ 'last_release': None,
262
+ 'is_new_release': True,
263
+ 'message': 'No previous release found, performing full download'
264
+ }
265
+
266
+ if latest_release != last_release:
267
+ return {
268
+ 'has_updates': True,
269
+ 'latest_release': latest_release,
270
+ 'last_release': last_release,
271
+ 'is_new_release': True,
272
+ 'message': f'New release available: {last_release} -> {latest_release}'
273
+ }
274
+
275
+ return {
276
+ 'has_updates': False,
277
+ 'latest_release': latest_release,
278
+ 'last_release': last_release,
279
+ 'is_new_release': False,
280
+ 'message': f'Already up to date with release {latest_release}'
281
+ }
282
+
283
+ except Exception as e:
284
+ logger.error(f"Error checking for updates: {e}")
285
+ return {
286
+ 'has_updates': False,
287
+ 'error': str(e),
288
+ 'message': f'Error checking for updates: {e}'
289
+ }
290
+
291
+ def check_incremental_updates(self, start_release_id=None):
292
+ """
293
+ Check for incremental updates between releases using the correct API
294
+
295
+ Args:
296
+ start_release_id: Release ID to start from (if None, uses last processed release)
297
+
298
+ Returns:
299
+ list: List of incremental update diffs available, or None if no updates
300
+ """
301
+ try:
302
+ # Get the start and end release IDs
303
+ if start_release_id is None:
304
+ start_release_id = self.get_last_release_id()
305
+
306
+ if not start_release_id:
307
+ logger.info("No start release ID available, cannot check for incremental updates")
308
+ return None
309
+
310
+ # Get the latest release ID
311
+ end_release_id = self.get_latest_release_id()
312
+
313
+ # If we're already at the latest release, no updates needed
314
+ if start_release_id == end_release_id:
315
+ logger.info(f"Already at latest release {end_release_id}, no incremental updates needed")
316
+ return None
317
+
318
+ logger.info(f"Checking for incremental updates from {start_release_id} to {end_release_id}")
319
+
320
+ # Use the correct incremental diffs API endpoint
321
+ url = f"https://api.semanticscholar.org/datasets/v1/diffs/{start_release_id}/to/{end_release_id}/papers"
322
+ headers = {}
323
+ if self.api_key:
324
+ headers["x-api-key"] = self.api_key
325
+
326
+ logger.info(f"Requesting incremental diffs from: {url}")
327
+ response = self.session.get(url, headers=headers, timeout=30)
328
+
329
+ # Handle different response codes
330
+ if response.status_code == 404:
331
+ logger.info(f"Incremental diffs not available for {start_release_id} to {end_release_id} (404)")
332
+ logger.info("This usually means the release gap is too large for incremental updates")
333
+ return self._check_incremental_alternative_by_release(start_release_id, end_release_id)
334
+ elif response.status_code == 429:
335
+ logger.warning("Rate limited on diffs API. Consider waiting or using a higher tier API key")
336
+ return None
337
+
338
+ response.raise_for_status()
339
+ data = response.json()
340
+
341
+ diffs = data.get("diffs", [])
342
+ if diffs:
343
+ logger.info(f"Found {len(diffs)} incremental diffs from {start_release_id} to {end_release_id}")
344
+ return diffs
345
+ else:
346
+ logger.info("No incremental diffs found from API endpoint")
347
+
348
+ return None
349
+
350
+ except Exception as e:
351
+ logger.info(f"Error checking incremental updates: {e}")
352
+ logger.info("Falling back to alternative incremental check method")
353
+ # Try to get end_release_id if it wasn't set yet
354
+ try:
355
+ if 'end_release_id' not in locals():
356
+ end_release_id = self.get_latest_release_id()
357
+ return self._check_incremental_alternative_by_release(start_release_id or self.get_last_release_id(), end_release_id)
358
+ except Exception as fallback_error:
359
+ logger.debug(f"Error in fallback method: {fallback_error}")
360
+ return None
361
+
362
+ def _check_incremental_alternative_by_release(self, start_release_id, end_release_id):
363
+ """
364
+ Alternative method to check for incremental updates when the diffs API is unavailable
365
+ This tries to compare release IDs and suggest a full dataset download if needed
366
+
367
+ Args:
368
+ start_release_id: Starting release ID
369
+ end_release_id: Target release ID
370
+
371
+ Returns:
372
+ list: List indicating a full dataset update is needed, or None if no updates
373
+ """
374
+ try:
375
+ if not start_release_id or not end_release_id:
376
+ return None
377
+
378
+ if start_release_id == end_release_id:
379
+ return None
380
+
381
+ # Try to find intermediate releases that might have diffs available
382
+ # This could be improved by calling a releases list API if available
383
+ from datetime import datetime, timedelta
384
+
385
+ try:
386
+ start_date = datetime.strptime(start_release_id, "%Y-%m-%d")
387
+ end_date = datetime.strptime(end_release_id, "%Y-%m-%d")
388
+ days_diff = (end_date - start_date).days
389
+
390
+ if days_diff <= 7:
391
+ logger.info(f"Release gap of {days_diff} days should support diffs, but API returned 404")
392
+ logger.info("This might be a temporary API issue or the releases don't exist")
393
+ return None
394
+ elif days_diff <= 30:
395
+ logger.info(f"Release gap of {days_diff} days may be too large for diffs API")
396
+ logger.info("Consider updating release tracking more frequently")
397
+ else:
398
+ logger.info(f"Release gap of {days_diff} days is too large for incremental updates")
399
+
400
+ except ValueError:
401
+ logger.info(f"Cannot parse release dates: {start_release_id}, {end_release_id}")
402
+
403
+ logger.info(f"Recommending full dataset download from {start_release_id} to {end_release_id}")
404
+
405
+ # Return a structure indicating that a full dataset download is needed
406
+ return [{
407
+ "type": "full_dataset_update",
408
+ "start_release": start_release_id,
409
+ "end_release": end_release_id,
410
+ "message": f"Incremental diffs unavailable for gap from {start_release_id} to {end_release_id}, full dataset update recommended"
411
+ }]
412
+
413
+ except Exception as e:
414
+ logger.debug(f"Error in alternative incremental check by release: {e}")
415
+ return None
416
+
417
+ def _check_incremental_alternative(self, since_timestamp):
418
+ """
419
+ Alternative method to check for incremental updates
420
+ This tries different approaches when the direct incremental endpoint isn't available
421
+
422
+ Args:
423
+ since_timestamp: ISO timestamp string of last update
424
+
425
+ Returns:
426
+ list: List of incremental update files available, or None if no updates
427
+ """
428
+ try:
429
+ # Try to get recent papers using the search API with date filtering
430
+ # This is a fallback when incremental endpoints aren't available
431
+ from datetime import datetime, timezone
432
+ import dateutil.parser
433
+
434
+ # Parse the timestamp
435
+ last_update = dateutil.parser.parse(since_timestamp)
436
+ current_time = datetime.now(timezone.utc)
437
+
438
+ # Check if there's been significant time since last update
439
+ time_diff = current_time - last_update
440
+ if time_diff.days < 1: # Less than 1 day, probably no significant updates
441
+ return None
442
+
443
+ logger.info(f"Last update was {time_diff.days} days ago, checking for recent papers")
444
+
445
+ # Use the search API to find recent papers
446
+ # This is not as efficient as true incremental updates but can work as a fallback
447
+ url = "https://api.semanticscholar.org/graph/v1/paper/search"
448
+ headers = {}
449
+ if self.api_key:
450
+ headers["x-api-key"] = self.api_key
451
+
452
+ params = {
453
+ "query": f"year:{current_time.year}",
454
+ "limit": 100,
455
+ "fields": "paperId,title,year,publicationDate"
456
+ }
457
+
458
+ response = self.session.get(url, headers=headers, params=params, timeout=30)
459
+ response.raise_for_status()
460
+ data = response.json()
461
+
462
+ recent_papers = data.get("data", [])
463
+ if recent_papers:
464
+ logger.info(f"Found {len(recent_papers)} recent papers that might need updating")
465
+ # Return a structure that indicates these are recent papers to check
466
+ return [{
467
+ "type": "recent_papers",
468
+ "count": len(recent_papers),
469
+ "papers": recent_papers
470
+ }]
471
+
472
+ return None
473
+
474
+ except Exception as e:
475
+ logger.debug(f"Error in alternative incremental check: {e}")
476
+ return None
477
+
478
+ def download_incremental_updates(self, diffs):
479
+ """
480
+ Download and process incremental diffs according to the Semantic Scholar API format
481
+
482
+ Args:
483
+ diffs: List of diff dictionaries from the incremental API
484
+
485
+ Returns:
486
+ bool: True if successful, False otherwise
487
+ """
488
+ try:
489
+ logger.info("Processing incremental diffs...")
490
+
491
+ total_updated = 0
492
+ total_deleted = 0
493
+
494
+ for diff in diffs:
495
+ if diff.get("type") == "full_dataset_update":
496
+ # Handle full dataset update recommendation by downloading the full dataset
497
+ logger.info(f"Full dataset update recommended: {diff.get('message')}")
498
+ logger.info("Automatically downloading full dataset...")
499
+
500
+ # Download the full dataset
501
+ success = self.download_dataset_files()
502
+ if success:
503
+ logger.info("Full dataset download completed, processing files...")
504
+ # Process the downloaded files
505
+ self.process_local_files(force_reprocess=False, incremental=False)
506
+
507
+ # After processing, check for any remaining incremental updates
508
+ logger.info("Checking for additional incremental updates after full dataset processing...")
509
+ latest_release = self.get_latest_release_id()
510
+ current_release = self.get_last_release_id()
511
+
512
+ if current_release and current_release != latest_release:
513
+ logger.info(f"Checking for incremental updates from {current_release} to {latest_release}")
514
+ additional_updates = self.check_incremental_updates(current_release)
515
+ if additional_updates:
516
+ # Filter out any full_dataset_update recommendations to avoid infinite recursion
517
+ filtered_updates = [u for u in additional_updates if u.get("type") != "full_dataset_update"]
518
+ if filtered_updates:
519
+ logger.info(f"Processing {len(filtered_updates)} additional incremental updates")
520
+ self.download_incremental_updates(filtered_updates)
521
+
522
+ return True
523
+ else:
524
+ logger.error("Failed to download full dataset")
525
+ return False
526
+ elif diff.get("type") == "recent_papers":
527
+ # Handle recent papers update (fallback)
528
+ papers = diff.get("papers", [])
529
+ if papers:
530
+ logger.info(f"Processing {len(papers)} recent papers")
531
+ paper_ids = [p.get("paperId") for p in papers if p.get("paperId")]
532
+ if paper_ids:
533
+ self.download_papers(paper_ids)
534
+ total_updated += len(paper_ids)
535
+ else:
536
+ # Handle proper incremental diff format
537
+ update_files = diff.get("update_files", [])
538
+ delete_files = diff.get("delete_files", [])
539
+
540
+ # Process update files
541
+ for update_url in update_files:
542
+ try:
543
+ logger.info(f"Processing update file: {update_url}")
544
+ records_updated = self._process_incremental_file(update_url, "update")
545
+ total_updated += records_updated
546
+ except Exception as e:
547
+ logger.error(f"Error processing update file {update_url}: {e}")
548
+ continue
549
+
550
+ # Process delete files
551
+ for delete_url in delete_files:
552
+ try:
553
+ logger.info(f"Processing delete file: {delete_url}")
554
+ records_deleted = self._process_incremental_file(delete_url, "delete")
555
+ total_deleted += records_deleted
556
+ except Exception as e:
557
+ logger.error(f"Error processing delete file {delete_url}: {e}")
558
+ continue
559
+
560
+ logger.info(f"Incremental update complete - Updated: {total_updated}, Deleted: {total_deleted}")
561
+
562
+ # Update metadata after successful incremental update
563
+ if total_updated > 0 or total_deleted > 0:
564
+ current_time = datetime.now(timezone.utc).isoformat()
565
+ self.set_last_update_time(current_time)
566
+
567
+ # Update the last release ID to the latest
568
+ try:
569
+ latest_release = self.get_latest_release_id()
570
+ self.set_last_release_id(latest_release)
571
+ logger.info(f"Updated metadata - last update: {current_time}, release: {latest_release}")
572
+ except Exception as e:
573
+ logger.warning(f"Could not update release ID: {e}")
574
+
575
+ return total_updated > 0 or total_deleted > 0
576
+
577
+ except Exception as e:
578
+ logger.error(f"Error processing incremental diffs: {e}")
579
+ return False
580
+
581
+ def get_latest_release_id(self):
582
+ """
583
+ Get the latest release ID from the Semantic Scholar API
584
+
585
+ Returns:
586
+ str: Latest release ID
587
+ """
588
+ try:
589
+ # Use the datasets API to get the latest release
590
+ url = "https://api.semanticscholar.org/datasets/v1/release/latest"
591
+ headers = {}
592
+ if self.api_key:
593
+ headers["x-api-key"] = self.api_key
594
+
595
+ response = self.session.get(url, headers=headers, timeout=30)
596
+ response.raise_for_status()
597
+
598
+ data = response.json()
599
+ release_id = data.get("release_id")
600
+
601
+ if not release_id:
602
+ raise ValueError("No release_id found in API response")
603
+
604
+ return release_id
605
+
606
+ except Exception as e:
607
+ logger.error(f"Error getting latest release ID: {e}")
608
+ raise
609
+
610
+ def normalize_paper_title(self, title: str) -> str:
611
+ """
612
+ Normalize paper title by converting to lowercase and removing whitespace and punctuation
613
+
614
+ Args:
615
+ title: Original paper title
616
+
617
+ Returns:
618
+ Normalized title string
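+ 
+ Example (illustrative): "Attention Is All You Need!" -> "attentionisallyouneed"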
619
+ """
620
+ if not title:
621
+ return ""
622
+
623
+ # Convert to lowercase
624
+ normalized = title.lower()
625
+
626
+ # Remove all non-alphanumeric characters (keeping only letters and numbers)
627
+ import re
628
+ normalized = re.sub(r'[^a-z0-9]', '', normalized)
629
+
630
+ return normalized
631
+
632
+ @lru_cache(maxsize=32)
633
+ def _build_search_query(self, query, year, field):
634
+ """
635
+ Build search query string with caching for repeated parameters
636
+
637
+ Args:
638
+ query: Search query
639
+ year: Year for filtering (single year)
640
+ field: Field or subject area
641
+
642
+ Returns:
643
+ Formatted search query string
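+ 
+ Example (illustrative): ("transformers", 2021, "NeurIPS") -> "transformers year:2021 venue:NeurIPS"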
644
+ """
645
+ search_parts = []
646
+
647
+ if query:
648
+ search_parts.append(query)
649
+
650
+ # Add year (single year only)
651
+ if year:
652
+ search_parts.append(f"year:{year}")
653
+
654
+ # Add field
655
+ if field:
656
+ search_parts.append(f"venue:{field}")
657
+
658
+ # If no search criteria provided, use a default query
659
+ if not search_parts:
660
+ search_parts.append("machine learning")
661
+ logger.warning(f"No search criteria provided, using default query: {search_parts[0]}")
662
+
663
+ return " ".join(search_parts)
664
+
665
+ def search_papers(self, query=None, start_year=None, end_year=None, field=None, limit=1000):
666
+ """
667
+ Search for papers using the Semantic Scholar API
668
+
669
+ Args:
670
+ query: Search query
671
+ start_year: Start year for filtering
672
+ end_year: End year for filtering
673
+ field: Field or subject area
674
+ limit: Maximum number of papers to return
675
+
676
+ Returns:
677
+ List of paper IDs
678
+ """
679
+ logger.info(f"Searching for papers with query: {query}, years: {start_year}-{end_year}, field: {field}")
680
+
681
+ # If a year range is specified, perform separate queries for each year
682
+ paper_ids = []
683
+ if start_year and end_year and start_year != end_year:
684
+ years = range(start_year, end_year + 1)
685
+ elif start_year:
686
+ years = [start_year]
687
+ elif end_year:
688
+ years = [end_year]
689
+ else:
690
+ years = [None]
691
+
692
+ total_limit = limit
693
+ per_year_limit = max(1, limit // len(years)) if years[0] is not None else limit
694
+
695
+ for year in years:
696
+ # Build the search query for this year
697
+ search_query = self._build_search_query(query, year, field)
698
+
699
+ url = "https://api.semanticscholar.org/graph/v1/paper/search"
700
+ params = {
701
+ "query": search_query,
702
+ "limit": min(100, self.batch_size), # API limit is 100 per request
703
+ "fields": "paperId",
704
+ }
705
+
706
+ offset = 0
707
+ year_paper_ids = []
708
+ max_offset = 1000 # API: offset must be < 1000 per query
709
+
710
+ with tqdm(total=per_year_limit, desc=f"Searching papers for year {year}" if year else "Searching papers") as pbar:
711
+ while offset < per_year_limit and offset < max_offset:
712
+ params["offset"] = offset
713
+ # Don't request more than the API allows in one query
714
+ params["limit"] = min(params["limit"], per_year_limit - offset, max_offset - offset)
715
+ try:
716
+ response = self.session.get(url, params=params)
717
+ response.raise_for_status()
718
+ data = response.json()
719
+
720
+ batch_ids = [paper.get("paperId") for paper in data.get("data", []) if paper.get("paperId")]
721
+ year_paper_ids.extend(batch_ids)
722
+ pbar.update(len(batch_ids))
723
+
724
+ if len(batch_ids) < params["limit"]:
725
+ break
726
+
727
+ offset += len(batch_ids)
728
+ time.sleep(3 if self.api_key else 5)
729
+ except requests.exceptions.RequestException as e:
730
+ logger.error(f"Error searching papers: {str(e)}")
731
+ wait_time = min(60, 2 ** (offset // 100))
732
+ logger.info(f"Waiting {wait_time} seconds before retrying...")
733
+ time.sleep(wait_time)
734
+ continue
735
+ paper_ids.extend(year_paper_ids)
736
+ if len(paper_ids) >= total_limit:
737
+ break
738
+
739
+ # Truncate to the requested limit
740
+ paper_ids = paper_ids[:limit]
741
+ logger.info(f"Found {len(paper_ids)} papers")
742
+ return paper_ids
743
+
744
+ def download_paper_batch(self, paper_ids):
745
+ """
746
+ Download metadata for a batch of papers
747
+
748
+ Args:
749
+ paper_ids: List of paper IDs to download
750
+
751
+ Returns:
752
+ List of paper data dictionaries
753
+ """
754
+ if not paper_ids:
755
+ return []
756
+
757
+ # Set up API request
758
+ url = "https://api.semanticscholar.org/graph/v1/paper/batch"
759
+ headers = {"Content-Type": "application/json"}
760
+
761
+ # Prepare request data
762
+ data = {
763
+ "ids": paper_ids,
764
+ "fields": ",".join(self.fields)
765
+ }
766
+
767
+ try:
768
+ response = self.session.post(url, headers=headers, json=data)
769
+ response.raise_for_status()
770
+ return response.json()
771
+ except requests.exceptions.RequestException as e:
772
+ logger.error(f"Error downloading papers: {str(e)}")
773
+ # If we get a rate limit error, wait and retry
774
+ if e.response is not None and e.response.status_code == 429:
775
+ wait_time = int(e.response.headers.get("Retry-After", 60))
776
+ # Add a buffer to the wait time plus some jitter to avoid synchronization
777
+ jitter = random.uniform(5, 15)
778
+ total_wait = wait_time + jitter
779
+ logger.info(f"Rate limited. Waiting {total_wait:.1f} seconds before retrying...")
780
+ time.sleep(total_wait)
781
+ return self.download_paper_batch(paper_ids)
782
+ return []
783
+
784
+ def store_papers_batch(self, papers_data):
785
+ """
786
+ Store multiple papers in a single transaction
787
+
788
+ Args:
789
+ papers_data: List of paper data dictionaries from the API
790
+ """
791
+ if not papers_data:
792
+ return
793
+
794
+ cursor = self.conn.cursor()
795
+
796
+ try:
797
+ # Begin transaction
798
+ self.conn.execute("BEGIN TRANSACTION")
799
+
800
+ batch = []
801
+ for paper_data in papers_data:
802
+ if not paper_data or "paperId" not in paper_data:
803
+ continue
804
+
805
+ # Extract scalar fields
806
+ paper_id = paper_data.get("paperId")
807
+ corpus_id = paper_data.get("corpusId")
808
+ title = paper_data.get("title", "")
809
+ normalized_title = self.normalize_paper_title(title)
810
+ abstract = paper_data.get("abstract", "")
811
+ venue = paper_data.get("venue", "")
812
+ publication_venue_id = paper_data.get("publicationVenueId")
813
+ year = paper_data.get("year")
814
+ reference_count = paper_data.get("referenceCount")
815
+ citation_count = paper_data.get("citationCount")
816
+ influential_citation_count = paper_data.get("influentialCitationCount")
817
+ is_open_access = paper_data.get("isOpenAccess")
818
+ publication_date = paper_data.get("publicationDate")
819
+ url = paper_data.get("url", "")
820
+
821
+ # Extract external IDs
822
+ external_ids = paper_data.get("externalIds", {}) or {}
823
+ external_mag = external_ids.get("MAG")
824
+ external_corpus_id = external_ids.get("CorpusId")
825
+ external_acl = external_ids.get("ACL")
826
+ external_pubmed = external_ids.get("PubMed")
827
+ external_doi = external_ids.get("DOI")
828
+ external_pmc = external_ids.get("PubMedCentral")
829
+ external_dblp = external_ids.get("DBLP")
830
+ external_arxiv = external_ids.get("ArXiv")
831
+
832
+ # Extract journal info
833
+ journal = paper_data.get("journal", {}) or {}
834
+ journal_name = journal.get("name", "")
835
+ journal_pages = journal.get("pages")
836
+ journal_volume = journal.get("volume")
837
+
838
+ # Store complex fields as JSON
839
+ authors_json = json.dumps(paper_data.get("authors", []))
840
+ s2_fields_json = json.dumps(paper_data.get("s2FieldsOfStudy", []))
841
+ pub_types_json = json.dumps(paper_data.get("publicationTypes", []))
842
+
843
+ # Full JSON for complete access
844
+ full_json = json.dumps(paper_data)
845
+
846
+ batch.append((
847
+ paper_id, corpus_id, title, normalized_title, abstract, venue, publication_venue_id,
848
+ year, reference_count, citation_count, influential_citation_count,
849
+ is_open_access, publication_date, url,
850
+ external_mag, external_corpus_id, external_acl, external_pubmed,
851
+ external_doi, external_pmc, external_dblp, external_arxiv,
852
+ journal_name, journal_pages, journal_volume,
853
+ authors_json, s2_fields_json, pub_types_json, full_json
854
+ ))
855
+
856
+ # Insert all papers in batch
857
+ if batch:
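+ # 29 placeholders, matching the 29 columns of the papers table:
+ # 14 scalar fields + 8 flattened external IDs + 3 journal fields + 3 JSON lists + full json_data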
858
+ cursor.executemany("""
859
+ INSERT OR REPLACE INTO papers VALUES (
860
+ ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
861
+ ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
862
+ ?, ?, ?, ?
863
+ )
864
+ """, batch)
865
+
866
+ # Commit transaction
867
+ self.conn.commit()
868
+ except sqlite3.Error as e:
869
+ logger.error(f"Error storing papers batch: {str(e)}")
870
+ self.conn.rollback()
871
+
872
+ def _retry_with_backoff(self, func, *args, max_retries=5, **kwargs):
873
+ """
874
+ Retry a function with exponential backoff
875
+
876
+ Args:
877
+ func: Function to retry
878
+ max_retries: Maximum number of retries
879
+ *args, **kwargs: Arguments to pass to the function
880
+
881
+ Returns:
882
+ The function result
883
+ """
884
+ for attempt in range(max_retries):
885
+ try:
886
+ return func(*args, **kwargs)
887
+ except requests.exceptions.RequestException as e:
888
+ if attempt == max_retries - 1:
889
+ # Last attempt, re-raise the exception
890
+ raise
891
+
892
+ # Calculate backoff time with jitter
893
+ backoff_time = min(60, (2 ** attempt) * (1 + random.random()))
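+ # Approximate schedule: attempt 0 -> 1-2s, 1 -> 2-4s, 2 -> 4-8s, 3 -> 8-16s, 4 -> 16-32s (capped at 60s)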
894
+ logger.warning(f"Request failed: {str(e)}. Retrying in {backoff_time:.1f} seconds (attempt {attempt+1}/{max_retries})...")
895
+ time.sleep(backoff_time)
896
+
897
+ def download_papers(self, paper_ids):
898
+ """
899
+ Download and store papers in batches
900
+
901
+ Args:
902
+ paper_ids: List of paper IDs to download
903
+ """
904
+ if not paper_ids:
905
+ logger.warning("No paper IDs provided")
906
+ return
907
+
908
+ # Remove duplicates while preserving order
909
+ unique_paper_ids = list(dict.fromkeys(paper_ids))
910
+ logger.info(f"Downloading {len(unique_paper_ids)} unique papers in batches of {self.batch_size}")
911
+
912
+ # Check which papers are already in the database
913
+ existing_ids = set()
914
+ try:
915
+ cursor = self.conn.cursor()
916
+ # Process in chunks to avoid sqlite parameter limit
917
+ chunk_size = 500
918
+ for i in range(0, len(unique_paper_ids), chunk_size):
919
+ chunk = unique_paper_ids[i:i+chunk_size]
920
+ placeholders = ','.join(['?'] * len(chunk))
921
+ cursor.execute(f"SELECT paperId FROM papers WHERE paperId IN ({placeholders})", chunk)
922
+ existing_ids.update(row[0] for row in cursor.fetchall())
923
+ except sqlite3.Error as e:
924
+ logger.error(f"Error checking existing papers: {str(e)}")
925
+
926
+ # Filter out papers that are already in the database
927
+ paper_ids_to_download = [pid for pid in unique_paper_ids if pid not in existing_ids]
928
+ logger.info(f"Skipping {len(existing_ids)} already downloaded papers")
929
+ logger.info(f"Downloading {len(paper_ids_to_download)} new papers")
930
+
931
+ if not paper_ids_to_download:
932
+ logger.info("All papers already exist in the database")
933
+ return
934
+
935
+ # Process papers in batches
936
+ for i in tqdm(range(0, len(paper_ids_to_download), self.batch_size), desc="Downloading batches"):
937
+ batch_ids = paper_ids_to_download[i:i+self.batch_size]
938
+
939
+ # Download batch with retry mechanism
940
+ try:
941
+ # Use a smaller batch size for unauthenticated requests
942
+ if not self.api_key and len(batch_ids) > 10:
943
+ sub_batches = [batch_ids[j:j+10] for j in range(0, len(batch_ids), 10)]
944
+ batch_data = []
945
+ for sub_batch in sub_batches:
946
+ sub_data = self._retry_with_backoff(self.download_paper_batch, sub_batch)
947
+ batch_data.extend(sub_data)
948
+ # Add extra delay between sub-batches
949
+ time.sleep(random.uniform(4, 6))
950
+ else:
951
+ batch_data = self._retry_with_backoff(self.download_paper_batch, batch_ids)
952
+
953
+ # Store papers in a single transaction
954
+ self.store_papers_batch(batch_data)
955
+
956
+ # Sleep to avoid rate limits with some randomness to avoid patterns
957
+ sleep_time = random.uniform(4, 7) if not self.api_key else random.uniform(2, 4)
958
+ time.sleep(sleep_time)
959
+
960
+ except Exception as e:
961
+ logger.error(f"Failed to download batch after multiple retries: {str(e)}")
962
+ logger.info("Saving progress and continuing with next batch...")
963
+ # Continue with the next batch instead of failing completely
964
+
965
+ def _is_file_processed(self, file_path):
966
+ """
967
+ Check if a file's contents are already in the database by checking sample records.
968
+ Samples the first valid record in the file - if it already exists in the DB, the file is treated as processed.
969
+ This is much faster than reading the entire file.
970
+
971
+ Args:
972
+ file_path: Full path to the .gz file
973
+
974
+ Returns:
975
+ bool: True if file appears to be already processed
976
+ """
977
+ # If file doesn't exist, it's not processed
978
+ if not os.path.exists(file_path):
979
+ return False
980
+
981
+ try:
982
+ # Get a sample record from the file (the first valid one)
983
+ sample_paper_ids = []
984
+
985
+ with gzip.open(file_path, "rt", encoding="utf-8") as f:
986
+ # Check first few lines for a valid record
987
+ for i in range(5):
988
+ line = f.readline()
989
+ if not line:
990
+ break
991
+ try:
992
+ paper_data = json.loads(line.strip())
993
+ if paper_data:
994
+ paper_id = paper_data.get("paperId")
995
+ corpus_id = paper_data.get("corpusid") or paper_data.get("corpusId")
996
+
997
+ if paper_id:
998
+ sample_paper_ids.append(paper_id)
999
+ break
1000
+ elif corpus_id:
1001
+ sample_paper_ids.append(str(corpus_id))
1002
+ break
1003
+ except json.JSONDecodeError:
1004
+ continue
1005
+
1006
+ # Gzip files cannot be cheaply seeked to the end, so we do not sample the last records;
1007
+ # instead, if the first valid record is already in the database, the whole file is
1008
+ # treated as processed.
1009
+ if sample_paper_ids:
1010
+ # Check if the sample record exists in the database
1011
+ cursor = self.conn.cursor()
1012
+ cursor.execute("SELECT COUNT(*) FROM papers WHERE paperId = ?", (sample_paper_ids[0],))
1013
+ count = cursor.fetchone()[0]
1014
+ if count > 0:
1015
+ logger.debug(f"File {os.path.basename(file_path)} appears to be processed (sample record found in DB)")
1016
+ return True
1017
+
1018
+ logger.debug(f"File {os.path.basename(file_path)} not processed (no sample records found in DB)")
1019
+ return False
1020
+
1021
+ except Exception as e:
1022
+ logger.warning(f"Error checking if {os.path.basename(file_path)} is processed: {e}")
1023
+ return False # If we can't check, assume not processed
1024
+
1025
+ def process_local_files(self, force_reprocess=False, incremental=False):
1026
+ """
1027
+ Process existing .gz files in the output directory into the database
1028
+
1029
+ Args:
1030
+ force_reprocess: If True, reprocess all files even if already processed
1031
+ incremental: If True, only process new or modified files (set automatically if DB exists)
1032
+ """
1033
+ if incremental and not force_reprocess:
1034
+ logger.info("Running in incremental mode - checking for updates...")
1035
+ update_info = self.check_for_updates()
1036
+
1037
+ if not update_info['has_updates']:
1038
+ logger.info(update_info['message'])
1039
+ return
1040
+
1041
+ logger.info(update_info['message'])
1042
+
1043
+ # Handle incremental updates if available, but only if there are no new local files
1044
+ if update_info.get('incremental_updates'):
1045
+ # Check if we have any unprocessed .gz files first
1046
+ gz_files = []
1047
+ for root, dirs, files in os.walk(self.output_dir):
1048
+ for file in files:
1049
+ if file.endswith('.gz'):
1050
+ gz_files.append(os.path.join(root, file))
1051
+
1052
+ unprocessed_files = []
1053
+ for gz_file in gz_files:
1054
+ if not self._is_file_processed(gz_file) or self._should_process_file(gz_file):
1055
+ unprocessed_files.append(gz_file)
1056
+
1057
+ # Check if any incremental updates are actually full dataset updates
1058
+ has_full_dataset_update = any(diff.get("type") == "full_dataset_update" for diff in update_info['incremental_updates'])
1059
+
1060
+ if unprocessed_files and not has_full_dataset_update:
1061
+ logger.info(f"Found {len(unprocessed_files)} unprocessed local files, processing those instead of incremental updates")
1062
+ elif has_full_dataset_update:
1063
+ logger.info("Full dataset update needed - this will download and process the latest dataset")
1064
+ success = self.download_incremental_updates(update_info['incremental_updates'])
1065
+ if success:
1066
+ logger.info("Full dataset update completed successfully")
1067
+ return
1068
+ else:
1069
+ logger.warning("Failed to process full dataset update, falling back to file processing")
1070
+ else:
1071
+ logger.info("Processing incremental updates...")
1072
+ success = self.download_incremental_updates(update_info['incremental_updates'])
1073
+ if success:
1074
+ logger.info("Incremental updates processed successfully")
1075
+ return
1076
+ else:
1077
+ logger.warning("Failed to process incremental updates, falling back to file processing")
1078
+
1079
+ # Find all .gz files in the output directory
1080
+ gz_files = []
1081
+ for root, dirs, files in os.walk(self.output_dir):
1082
+ for file in files:
1083
+ if file.endswith('.gz'):
1084
+ gz_files.append(os.path.join(root, file))
1085
+
1086
+ if not gz_files:
1087
+ logger.warning(f"No .gz files found in {self.output_dir}")
1088
+ return
1089
+
1090
+ logger.info(f"Found {len(gz_files)} .gz files to process")
1091
+
1092
+ # Process files
1093
+ total_records = 0
1094
+ skipped_files = 0
1095
+
1096
+ for i, gz_file in enumerate(gz_files, 1):
1097
+ try:
1098
+ # Check if file should be processed
1099
+ if incremental and not force_reprocess:
1100
+ # First check if file contents are already in database
1101
+ if self._is_file_processed(gz_file):
1102
+ logger.info(f"Skipping [{i}/{len(gz_files)}] {os.path.basename(gz_file)} - already processed")
1103
+ skipped_files += 1
1104
+ continue
1105
+
1106
+ # Then check if file has been modified since last processing
1107
+ if not self._should_process_file(gz_file):
1108
+ logger.info(f"Skipping [{i}/{len(gz_files)}] {os.path.basename(gz_file)} - already processed and not modified")
1109
+ skipped_files += 1
1110
+ continue
1111
+
1112
+ logger.info(f"Processing [{i}/{len(gz_files)}] {os.path.basename(gz_file)}")
1113
+ records_processed = self._process_gz_file(gz_file)
1114
+ total_records += records_processed
1115
+ logger.info(f"Processed [{i}/{len(gz_files)}] {records_processed:,} records from {os.path.basename(gz_file)}")
1116
+
1117
+ except Exception as e:
1118
+ logger.error(f"Error processing [{i}/{len(gz_files)}] {gz_file}: {e}")
1119
+ continue
1120
+
1121
+ # Update metadata after successful processing
1122
+ if incremental and total_records > 0:
1123
+ current_time = datetime.now(timezone.utc).isoformat()
1124
+ self.set_last_update_time(current_time)
1125
+
1126
+ # Get and set the latest release ID
1127
+ try:
1128
+ latest_release = self.get_latest_release_id()
1129
+ self.set_last_release_id(latest_release)
1130
+ logger.info(f"Updated metadata - last update: {current_time}, release: {latest_release}")
1131
+ except Exception as e:
1132
+ logger.warning(f"Could not update release ID: {e}")
1133
+
1134
+ logger.info(f"Total records processed: {total_records:,}")
1135
+ if skipped_files > 0:
1136
+ logger.info(f"Files skipped (already processed): {skipped_files}")
1137
+
1138
+ def _should_process_file(self, file_path):
1139
+ """
1140
+ Check if a file should be processed in incremental mode
1141
+
1142
+ Args:
1143
+ file_path: Path to the file to check
1144
+
1145
+ Returns:
1146
+ bool: True if file should be processed
1147
+ """
1148
+ # Check if we have metadata about this file
1149
+ file_hash = self._get_file_hash(file_path)
1150
+ last_processed_hash = self.get_metadata(f'file_hash_{os.path.basename(file_path)}')
1151
+
1152
+ if last_processed_hash != file_hash:
1153
+ # File has changed, should process
1154
+ return True
1155
+
1156
+ # Check file modification time
1157
+ file_mtime = os.path.getmtime(file_path)
1158
+ last_processed_time = self.get_metadata(f'file_mtime_{os.path.basename(file_path)}')
1159
+
1160
+ if last_processed_time:
1161
+ try:
1162
+ last_time = float(last_processed_time)
1163
+ if file_mtime > last_time:
1164
+ return True
1165
+ except ValueError:
1166
+ pass
1167
+
1168
+ return False
1169
+
1170
+ def _get_file_hash(self, file_path):
1171
+ """Get a hash of the file for change detection"""
1172
+ try:
1173
+ with open(file_path, 'rb') as f:
1174
+ return hashlib.md5(f.read(1024)).hexdigest() # Hash first 1KB for speed
1175
+ except Exception:
1176
+ return None
1177
+
1178
+ def _process_gz_file(self, filename, max_records=None):
1179
+ """
1180
+ Process a single .gz file into the database
1181
+
1182
+ Args:
1183
+ filename: Path to the .gz file
1184
+ max_records: Maximum number of records to process (for testing)
1185
+
1186
+ Returns:
1187
+ int: Number of records processed
1188
+ """
1189
+ file_path = filename
1190
+
1191
+ if not os.path.exists(file_path):
1192
+ logger.error(f"File not found: {file_path}")
1193
+ return 0
1194
+
1195
+ records_processed = 0
1196
+ cursor = self.conn.cursor()
1197
+
1198
+ try:
1199
+ with gzip.open(file_path, 'rt', encoding='utf-8') as f:
1200
+ for line_num, line in enumerate(f, 1):
1201
+ if max_records and records_processed >= max_records:
1202
+ break
1203
+
1204
+ try:
1205
+ paper_data = json.loads(line.strip())
1206
+ self._insert_paper(cursor, paper_data)
1207
+ records_processed += 1
1208
+
1209
+ if records_processed % 10000 == 0:
1210
+ logger.info(f"Processed {records_processed:,} records from {filename}")
1211
+ self.conn.commit()
1212
+
1213
+ except json.JSONDecodeError as e:
1214
+ logger.warning(f"Invalid JSON on line {line_num} in {filename}: {e}")
1215
+ continue
1216
+ except Exception as e:
1217
+ logger.error(f"Error processing line {line_num} in {filename}: {e}")
1218
+ continue
1219
+
1220
+ # Final commit
1221
+ self.conn.commit()
1222
+
1223
+ # Track file processing metadata for incremental updates
1224
+ self._track_file_processing(filename, file_path, records_processed)
1225
+
1226
+ return records_processed
1227
+
1228
+ except Exception as e:
1229
+ logger.error(f"Error processing file {filename}: {e}")
1230
+ return 0
1231
+
1232
+ def _track_file_processing(self, filename, file_path, records_processed):
1233
+ """
1234
+ Track file processing metadata for incremental updates
1235
+
1236
+ Args:
1237
+ filename: Name of the processed file
1238
+ file_path: Full path to the file
1239
+ records_processed: Number of records processed
1240
+ """
1241
+ try:
1242
+ # Get file metadata
1243
+ file_hash = self._get_file_hash(file_path)
1244
+ file_mtime = os.path.getmtime(file_path)
1245
+ file_size = os.path.getsize(file_path)
1246
+
1247
+ # Store metadata
1248
+ self.set_metadata(f'file_hash_{filename}', file_hash or '')
1249
+ self.set_metadata(f'file_mtime_{filename}', str(file_mtime))
1250
+ self.set_metadata(f'file_size_{filename}', str(file_size))
1251
+ self.set_metadata(f'file_records_{filename}', str(records_processed))
1252
+ self.set_metadata(f'file_processed_{filename}', datetime.now(timezone.utc).isoformat())
1253
+
1254
+ except Exception as e:
1255
+ logger.warning(f"Could not track file processing metadata for {filename}: {e}")
1256
+
1257
+ def _insert_paper(self, cursor, paper_data):
1258
+ """
1259
+ Insert a single paper into the database
1260
+
1261
+ Args:
1262
+ cursor: Database cursor
1263
+ paper_data: Paper data dictionary
1264
+ """
1265
+ # Skip empty or invalid records
1266
+ if not paper_data:
1267
+ return
1268
+
1269
+ # Use corpusid as primary key if paperId not available
1270
+ paper_id = paper_data.get("paperId")
1271
+ if not paper_id:
1272
+ corpus_id = paper_data.get("corpusid") or paper_data.get("corpusId")
1273
+ if corpus_id:
1274
+ paper_id = str(corpus_id) # Use corpus ID as paper ID
1275
+ else:
1276
+ return # Skip if no ID available
1277
+
1278
+ # Extract scalar fields (handle both camelCase and lowercase)
1279
+ corpus_id = paper_data.get("corpusId") or paper_data.get("corpusid")
1280
+ title = paper_data.get("title", "")
1281
+ normalized_title = self.normalize_paper_title(title)
1282
+ abstract = paper_data.get("abstract", "")
1283
+ venue = paper_data.get("venue", "")
1284
+ publication_venue_id = paper_data.get("publicationVenueId") or paper_data.get("publicationvenueid")
1285
+ year = paper_data.get("year")
1286
+ reference_count = paper_data.get("referenceCount") or paper_data.get("referencecount")
1287
+ citation_count = paper_data.get("citationCount") or paper_data.get("citationcount")
1288
+ influential_citation_count = paper_data.get("influentialCitationCount") or paper_data.get("influentialcitationcount")
1289
+ is_open_access = paper_data.get("isOpenAccess") or paper_data.get("isopenaccess")
1290
+ publication_date = paper_data.get("publicationDate") or paper_data.get("publicationdate")
1291
+ url = paper_data.get("url", "")
1292
+
1293
+ # Extract external IDs (handle both camelCase and lowercase)
1294
+ external_ids = paper_data.get("externalIds") or paper_data.get("externalids") or {}
1295
+ external_mag = external_ids.get("MAG")
1296
+ external_corpus_id = external_ids.get("CorpusId")
1297
+ external_acl = external_ids.get("ACL")
1298
+ external_pubmed = external_ids.get("PubMed")
1299
+ external_doi = external_ids.get("DOI")
1300
+ external_pmc = external_ids.get("PubMedCentral")
1301
+ external_dblp = external_ids.get("DBLP")
1302
+ external_arxiv = external_ids.get("ArXiv")
1303
+
1304
+ # Extract journal info
1305
+ journal = paper_data.get("journal", {}) or {}
1306
+ journal_name = journal.get("name", "")
1307
+ journal_pages = journal.get("pages")
1308
+ journal_volume = journal.get("volume")
1309
+
1310
+ # Store complex fields as JSON (handle both camelCase and lowercase)
1311
+ authors_json = json.dumps(paper_data.get("authors", []))
1312
+ s2_fields_json = json.dumps(paper_data.get("s2FieldsOfStudy") or paper_data.get("s2fieldsofstudy") or [])
1313
+ pub_types_json = json.dumps(paper_data.get("publicationTypes") or paper_data.get("publicationtypes") or [])
1314
+
1315
+ # Full JSON for complete access
1316
+ full_json = json.dumps(paper_data)
1317
+
1318
+ # Insert or replace the paper
1319
+ cursor.execute("""
1320
+ INSERT OR REPLACE INTO papers VALUES (
1321
+ ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
1322
+ ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
1323
+ ?, ?, ?, ?
1324
+ )
1325
+ """, (
1326
+ paper_id, corpus_id, title, normalized_title, abstract, venue, publication_venue_id,
1327
+ year, reference_count, citation_count, influential_citation_count,
1328
+ is_open_access, publication_date, url,
1329
+ external_mag, external_corpus_id, external_acl, external_pubmed,
1330
+ external_doi, external_pmc, external_dblp, external_arxiv,
1331
+ journal_name, journal_pages, journal_volume,
1332
+ authors_json, s2_fields_json, pub_types_json, full_json
1333
+ ))
1334
+
1335
+ def close(self):
1336
+ """Close the database connection"""
1337
+ if hasattr(self, 'conn') and self.conn:
1338
+ # Optimize database before closing
1339
+ self.conn.execute("PRAGMA optimize")
1340
+ self.conn.close()
1341
+
1342
+ if hasattr(self, 'session') and self.session:
1343
+ self.session.close()
1344
+
1345
+ def download_dataset_files(self):
1346
+ """
1347
+ Download the official Semantic Scholar dataset files
1348
+
1349
+ Returns:
1350
+ bool: True if successful, False otherwise
1351
+ """
1352
+ try:
1353
+ logger.info("Downloading Semantic Scholar dataset files...")
1354
+
1355
+ # Get the latest release ID
1356
+ latest_release = self.get_latest_release_id()
1357
+ logger.info(f"Latest release: {latest_release}")
1358
+
1359
+ # List files for the latest release
1360
+ files = self.list_files(latest_release, dataset="papers")
1361
+ if not files:
1362
+ logger.error("No files found for the latest release")
1363
+ return False
1364
+
1365
+ logger.info(f"Found {len(files)} files to download")
1366
+
1367
+ # Download files
1368
+ downloaded_count = 0
1369
+ for file_meta in files:
1370
+ try:
1371
+ path, updated = self.download_file(file_meta)
1372
+ if updated:
1373
+ downloaded_count += 1
1374
+ logger.info(f"Downloaded: {path}")
1375
+ else:
1376
+ logger.info(f"Skipped (not modified): {path}")
1377
+ except Exception as e:
1378
+ logger.error(f"Error downloading {file_meta.get('path', 'unknown')}: {e}")
1379
+ continue
1380
+
1381
+ logger.info(f"Downloaded {downloaded_count} files out of {len(files)} total files")
1382
+ return downloaded_count > 0
1383
+
1384
+ except Exception as e:
1385
+ logger.error(f"Error downloading dataset files: {e}")
1386
+ return False
1387
+
1388
+ def list_files(self, release_id: str, dataset: str = "papers") -> list[dict]:
1389
+ """
1390
+ List all files for a given release and dataset.
1391
+
1392
+ Args:
1393
+ release_id: Release ID
1394
+ dataset: Dataset name (default: "papers")
1395
+
1396
+ Returns:
1397
+ list: List of file metadata dictionaries
1398
+ """
1399
+ logger.info(f"Requesting file list for release {release_id}, dataset {dataset}...")
1400
+
1401
+ try:
1402
+ url = f"https://api.semanticscholar.org/datasets/v1/release/{release_id}/dataset/{dataset}"
1403
+ headers = {}
1404
+ if self.api_key:
1405
+ headers["x-api-key"] = self.api_key
1406
+
1407
+ response = self.session.get(url, headers=headers, timeout=30)
1408
+ response.raise_for_status()
1409
+
1410
+ data = response.json()
1411
+ files = data.get("files", [])
1412
+
1413
+ # Convert URL-based files to structured format
1414
+ structured_files = []
1415
+ for file_item in files:
1416
+ if isinstance(file_item, str):
1417
+ # File is a URL string - extract filename and create structure
1418
+ import urllib.parse
1419
+ parsed_url = urllib.parse.urlparse(file_item)
1420
+ filename = parsed_url.path.split('/')[-1]
1421
+
1422
+ structured_files.append({
1423
+ 'path': filename,
1424
+ 'url': file_item,
1425
+ 'size': 0 # Size not available from URL format
1426
+ })
1427
+ elif isinstance(file_item, dict):
1428
+ # File is already structured
1429
+ structured_files.append(file_item)
1430
+
1431
+ logger.info(f"Retrieved {len(structured_files)} files from API")
1432
+ return structured_files
1433
+
1434
+ except Exception as e:
1435
+ logger.error(f"Error listing files: {e}")
1436
+ return []
1437
+
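# The two payload shapes list_files() tolerates, and the normalized record it
# returns for each (values invented for illustration):
#
#     "https://example-bucket.s3.amazonaws.com/papers/part-000.jsonl.gz?token=abc"
#         -> {"path": "part-000.jsonl.gz",
#             "url": "https://example-bucket.s3.amazonaws.com/papers/part-000.jsonl.gz?token=abc",
#             "size": 0}    # size is unknown when the API returns bare URLs
#
#     {"path": "part-000.jsonl.gz", "url": "https://...", "size": 123456789}
#         -> passed through unchanged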
1438
+ def download_file(self, file_meta):
1439
+ """
1440
+ Download a single file from the dataset
1441
+
1442
+ Args:
1443
+ file_meta: File metadata dictionary
1444
+
1445
+ Returns:
1446
+ tuple: (file_path, was_updated)
1447
+ """
1448
+ url = file_meta["url"]
1449
+ local_path = os.path.join(self.output_dir, file_meta["path"])
1450
+ os.makedirs(os.path.dirname(local_path), exist_ok=True)
1451
+
1452
+ # Get file size for progress tracking
1453
+ file_size = file_meta.get("size", 0)
1454
+ file_name = file_meta["path"]
1455
+
1456
+ headers = {}
1457
+ # Use conditional request if we have Last-Modified stored
1458
+ if os.path.exists(local_path + ".meta"):
1459
+ with open(local_path + ".meta") as meta_file:
1460
+ headers["If-Modified-Since"] = meta_file.read().strip()
1461
+
1462
+ logger.info(f"Downloading {file_name} ({self._format_size(file_size)})")
1463
+ start_time = time.time()
1464
+
1465
+ resp = self.session.get(url, headers=headers, stream=True, timeout=300)
1466
+ if resp.status_code == 304:
1467
+ logger.info(f"{file_meta['path']} not modified, skipping.")
1468
+ return file_meta["path"], False
1469
+ resp.raise_for_status()
1470
+
1471
+ # Content length from the response headers, if provided (informational only; progress is tracked from the streamed chunks below)
1472
+ content_length = int(resp.headers.get('Content-Length', file_size or 0))
1473
+
1474
+ # Save file with progress tracking
1475
+ downloaded = 0
1476
+ with open(local_path, "wb") as f_out:
1477
+ for chunk in resp.iter_content(8192):
1478
+ f_out.write(chunk)
1479
+ downloaded += len(chunk)
1480
+
1481
+ download_time = time.time() - start_time
1482
+ download_speed = downloaded / download_time if download_time > 0 else 0
1483
+
1484
+ logger.info(f"Downloaded {file_name}: {self._format_size(downloaded)} in {download_time:.1f}s "
1485
+ f"({self._format_size(download_speed)}/s)")
1486
+
1487
+ # Save new Last-Modified
1488
+ last_mod = resp.headers.get("Last-Modified", datetime.now(timezone.utc).strftime("%a, %d %b %Y %H:%M:%S GMT"))
1489
+ with open(local_path + ".meta", "w") as m:
1490
+ m.write(last_mod)
1491
+ return file_meta["path"], True
1492
+
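# The refresh logic above, summarized as the HTTP exchange it performs (headers
# and filenames are illustrative): the stored Last-Modified value is replayed
# as If-Modified-Since, and a 304 response means the local copy is kept.
#
#     GET <file url>
#     If-Modified-Since: Tue, 01 Jul 2025 00:00:00 GMT    (from part-000.jsonl.gz.meta)
#
#     304 Not Modified -> return (path, False); nothing is rewritten
#     200 OK           -> stream the body to disk in 8192-byte chunks, then
#                         rewrite the .meta sidecar from the new Last-Modified
#                         response header (falling back to the current UTC time)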
1493
+ def _format_size(self, size_bytes):
1494
+ """Format file size in human readable format."""
1495
+ if size_bytes == 0:
1496
+ return "0 B"
1497
+
1498
+ size_names = ["B", "KB", "MB", "GB", "TB"]
1499
+ i = 0
1500
+ size = float(size_bytes)
1501
+
1502
+ while size >= 1024.0 and i < len(size_names) - 1:
1503
+ size /= 1024.0
1504
+ i += 1
1505
+
1506
+ return f"{size:.1f} {size_names[i]}"
1507
+
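# Expected behaviour of the helper above, for reference:
#
#     _format_size(0)            -> "0 B"
#     _format_size(1536)         -> "1.5 KB"
#     _format_size(1073741824)   -> "1.0 GB"
#     _format_size(5 * 1024**4)  -> "5.0 TB"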
1508
+ def _process_incremental_file(self, file_url, operation_type):
1509
+ """
1510
+ Process a single incremental diff file (either updates or deletes)
1511
+
1512
+ Args:
1513
+ file_url: URL of the diff file to process
1514
+ operation_type: Either "update" or "delete"
1515
+
1516
+ Returns:
1517
+ int: Number of records processed
1518
+ """
1519
+ try:
1520
+ logger.info(f"Processing {operation_type} file: {file_url}")
1521
+
1522
+ # Download the file content
1523
+ response = self.session.get(file_url, stream=True, timeout=300)
1524
+ response.raise_for_status()
1525
+
1526
+ records_processed = 0
1527
+ cursor = self.conn.cursor()
1528
+
1529
+ # Begin transaction for better performance
1530
+ self.conn.execute("BEGIN TRANSACTION")
1531
+
1532
+ try:
1533
+ # Process the file line by line
1534
+ for line_num, line in enumerate(response.iter_lines(decode_unicode=True), 1):
1535
+ if not line.strip():
1536
+ continue
1537
+
1538
+ try:
1539
+ record = json.loads(line.strip())
1540
+
1541
+ if operation_type == "update":
1542
+ # Insert or update the record
1543
+ self._insert_paper(cursor, record)
1544
+ elif operation_type == "delete":
1545
+ # Delete the record by primary key
1546
+ paper_id = record.get("paperId")
1547
+ if not paper_id:
1548
+ # Fallback to corpusId if paperId not available
1549
+ corpus_id = record.get("corpusid") or record.get("corpusId")
1550
+ if corpus_id:
1551
+ paper_id = str(corpus_id)
1552
+
1553
+ if paper_id:
1554
+ cursor.execute("DELETE FROM papers WHERE paperId = ?", (paper_id,))
1555
+
1556
+ records_processed += 1
1557
+
1558
+ # Commit periodically for large files
1559
+ if records_processed % 10000 == 0:
1560
+ self.conn.commit()
1561
+ self.conn.execute("BEGIN TRANSACTION")
1562
+ logger.info(f"Processed {records_processed:,} {operation_type} records")
1563
+
1564
+ except json.JSONDecodeError as e:
1565
+ logger.warning(f"Invalid JSON on line {line_num} in {operation_type} file: {e}")
1566
+ continue
1567
+ except Exception as e:
1568
+ logger.error(f"Error processing line {line_num} in {operation_type} file: {e}")
1569
+ continue
1570
+
1571
+ # Final commit
1572
+ self.conn.commit()
1573
+ logger.info(f"Completed processing {records_processed:,} {operation_type} records")
1574
+
1575
+ except Exception as e:
1576
+ self.conn.rollback()
1577
+ raise
1578
+
1579
+ return records_processed
1580
+
1581
+ except Exception as e:
1582
+ logger.error(f"Error processing {operation_type} file {file_url}: {e}")
1583
+ return 0
1584
+
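# Shape of the newline-delimited JSON this method consumes (identifiers and
# field values are invented; only the keys matter):
#
#     update file line, the full paper record handed to _insert_paper():
#         {"paperId": "a1b2c3d4e5f6...", "title": "An Example Paper", "year": 2020, ...}
#
#     delete file line, where only an identifier is needed; paperId is
#     preferred, with corpusid / corpusId as the fallback key:
#         {"paperId": "a1b2c3d4e5f6..."}
#         {"corpusid": 123456789}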
1585
+ def main():
1586
+ """Main function"""
1587
+ parser = argparse.ArgumentParser(description="Download and process Semantic Scholar paper metadata")
1588
+ parser.add_argument("--output-dir", type=str, default="semantic_scholar_db",
1589
+ help="Directory to store the database (default: semantic_scholar_db)")
1590
+ parser.add_argument("--batch-size", type=int, default=10,
1591
+ help="Number of papers to download in each batch (default: 10 for unauthenticated requests, can increase with API key)")
1592
+ parser.add_argument("--api-key", type=str,
1593
+ help="Semantic Scholar API key (optional, increases rate limits)")
1594
+ parser.add_argument("--fields", type=str,
1595
+ default="id,title,authors,year,externalIds,url,abstract",
1596
+ help="Comma-separated list of fields to include")
1597
+
1598
+ # Dataset download options
1599
+ parser.add_argument("--download-dataset", action="store_true",
1600
+ help="Download the official Semantic Scholar dataset files (.gz)")
1601
+ parser.add_argument("--process-local-files", action="store_true",
1602
+ help="Process existing .gz files in the output directory into the database")
1603
+ parser.add_argument("--force-reprocess", action="store_true",
1604
+ help="Force reprocessing of all files (use with --process-local-files)")
1605
+
1606
+ # Legacy API-based search options (for backwards compatibility)
1607
+ parser.add_argument("--query", type=str,
1608
+ help="Search query to download papers via API")
1609
+ parser.add_argument("--start-year", type=int,
1610
+ help="Start year for downloading papers by year range via API")
1611
+ parser.add_argument("--end-year", type=int,
1612
+ help="End year for downloading papers by year range via API")
1613
+ parser.add_argument("--field", type=str,
1614
+ help="Field or subject area for downloading papers by field via API")
1615
+ parser.add_argument("--limit", type=int, default=100000,
1616
+ help="Maximum number of papers to download via API (default: 100000)")
1617
+ parser.add_argument("--threads", type=int, default=1,
1618
+ help="Number of threads for parallel processing (default: 1)")
1619
+
1620
+ args = parser.parse_args()
1621
+
1622
+ # Initialize downloader
1623
+ downloader = SemanticScholarDownloader(
1624
+ output_dir=args.output_dir,
1625
+ batch_size=args.batch_size,
1626
+ api_key=args.api_key,
1627
+ fields=args.fields.split(",") if args.fields else None
1628
+ )
1629
+
1630
+ try:
1631
+ # Check if database exists
1632
+ db_exists = os.path.exists(downloader.db_path)
1633
+
1634
+ # Determine what to do based on arguments and database state
1635
+ if args.download_dataset:
1636
+ logger.info("Downloading dataset files only")
1637
+ success = downloader.download_dataset_files()
1638
+ if not success:
1639
+ logger.error("Failed to download dataset files")
1640
+ return 1
1641
+
1642
+ elif args.process_local_files:
1643
+ logger.info("Processing local files mode")
1644
+ downloader.process_local_files(
1645
+ force_reprocess=args.force_reprocess,
1646
+ incremental=db_exists # Only incremental if DB exists
1647
+ )
1648
+
1649
+ elif args.query or args.start_year or args.end_year or args.field:
1650
+ # Legacy API-based search
1651
+ logger.info("Using legacy API-based paper search")
1652
+ paper_ids = downloader.search_papers(
1653
+ query=args.query,
1654
+ start_year=args.start_year,
1655
+ end_year=args.end_year,
1656
+ field=args.field,
1657
+ limit=args.limit
1658
+ )
1659
+
1660
+ # Download papers
1661
+ downloader.download_papers(paper_ids)
1662
+
1663
+ else:
1664
+ # Default behavior: automatic full or incremental based on DB state
1665
+ if not db_exists:
1666
+ logger.info("No database found - performing full download")
1667
+ success = downloader.download_dataset_files()
1668
+ if not success:
1669
+ logger.error("Failed to download dataset files")
1670
+ return 1
1671
+ downloader.process_local_files(incremental=False)
1672
+ else:
1673
+ logger.info("Database exists - checking for new or updated data (incremental update)")
1674
+ # Check if there are any .gz files to process
1675
+ gz_files = []
1676
+ for root, dirs, files in os.walk(args.output_dir):
1677
+ for file in files:
1678
+ if file.endswith('.gz'):
1679
+ gz_files.append(os.path.join(root, file))
1680
+
1681
+ if gz_files:
1682
+ logger.info(f"Found {len(gz_files)} .gz files to process")
1683
+ downloader.process_local_files(
1684
+ force_reprocess=args.force_reprocess,
1685
+ incremental=True
1686
+ )
1687
+ else:
1688
+ logger.info("No .gz files found - downloading latest dataset")
1689
+ success = downloader.download_dataset_files()
1690
+ if not success:
1691
+ logger.error("Failed to download dataset files")
1692
+ return 1
1693
+ downloader.process_local_files(incremental=True)
1694
+
1695
+ logger.info(f"Completed processing in {args.output_dir}")
1696
+
1697
+ # Show database statistics
1698
+ cursor = downloader.conn.cursor()
1699
+ cursor.execute("SELECT COUNT(*) FROM papers")
1700
+ count = cursor.fetchone()[0]
1701
+ logger.info(f"Total papers in database: {count:,}")
1702
+
1703
+ # Show metadata if available
1704
+ if db_exists:
1705
+ last_update = downloader.get_last_update_time()
1706
+ last_release = downloader.get_last_release_id()
1707
+ if last_update:
1708
+ logger.info(f"Last update timestamp: {last_update}")
1709
+ if last_release:
1710
+ logger.info(f"Current release: {last_release}")
1711
+
1712
+ return 0
1713
+
1714
+ except KeyboardInterrupt:
1715
+ logger.info("Download interrupted by user")
1716
+ return 1
1717
+ except Exception as e:
1718
+ logger.error(f"Error during processing: {e}")
1719
+ return 1
1720
+ finally:
1721
+ # Close database connection
1722
+ downloader.close()
1723
+
1724
+ if __name__ == "__main__":
1725
+ raise SystemExit(main())
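# Representative invocations of the CLI defined above, assuming the package is
# installed so the module can be run with -m (any console-script alias is not
# shown in this file):
#
#     # First run: no database yet, so the latest release is downloaded and
#     # processed in full
#     python -m refchecker.database.download_semantic_scholar_db --api-key YOUR_KEY
#
#     # Later runs: with the database present, existing .gz files are processed
#     # incrementally, or a fresh dataset is fetched if none are on disk
#     python -m refchecker.database.download_semantic_scholar_db
#
#     # Rebuild the database from .gz files already in the output directory
#     python -m refchecker.database.download_semantic_scholar_db --process-local-files --force-reprocess
#
#     # Legacy API-based search
#     python -m refchecker.database.download_semantic_scholar_db --query "reference checking" --limit 5000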