academic-refchecker 2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_refchecker-2.0.7.dist-info/METADATA +738 -0
- academic_refchecker-2.0.7.dist-info/RECORD +64 -0
- academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
- academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
- academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
- academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
- backend/__init__.py +21 -0
- backend/__main__.py +11 -0
- backend/cli.py +64 -0
- backend/concurrency.py +100 -0
- backend/database.py +711 -0
- backend/main.py +1367 -0
- backend/models.py +99 -0
- backend/refchecker_wrapper.py +1126 -0
- backend/static/assets/index-2P6L_39v.css +1 -0
- backend/static/assets/index-hk21nqxR.js +25 -0
- backend/static/favicon.svg +6 -0
- backend/static/index.html +15 -0
- backend/static/vite.svg +1 -0
- backend/thumbnail.py +517 -0
- backend/websocket_manager.py +104 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +3 -0
- refchecker/checkers/__init__.py +17 -0
- refchecker/checkers/crossref.py +541 -0
- refchecker/checkers/enhanced_hybrid_checker.py +563 -0
- refchecker/checkers/github_checker.py +326 -0
- refchecker/checkers/local_semantic_scholar.py +540 -0
- refchecker/checkers/openalex.py +513 -0
- refchecker/checkers/openreview_checker.py +984 -0
- refchecker/checkers/pdf_paper_checker.py +493 -0
- refchecker/checkers/semantic_scholar.py +764 -0
- refchecker/checkers/webpage_checker.py +938 -0
- refchecker/config/__init__.py +1 -0
- refchecker/config/logging.conf +36 -0
- refchecker/config/settings.py +170 -0
- refchecker/core/__init__.py +7 -0
- refchecker/core/db_connection_pool.py +141 -0
- refchecker/core/parallel_processor.py +415 -0
- refchecker/core/refchecker.py +5838 -0
- refchecker/database/__init__.py +6 -0
- refchecker/database/download_semantic_scholar_db.py +1725 -0
- refchecker/llm/__init__.py +0 -0
- refchecker/llm/base.py +376 -0
- refchecker/llm/providers.py +911 -0
- refchecker/scripts/__init__.py +1 -0
- refchecker/scripts/start_vllm_server.py +121 -0
- refchecker/services/__init__.py +8 -0
- refchecker/services/pdf_processor.py +268 -0
- refchecker/utils/__init__.py +27 -0
- refchecker/utils/arxiv_utils.py +462 -0
- refchecker/utils/author_utils.py +179 -0
- refchecker/utils/biblatex_parser.py +584 -0
- refchecker/utils/bibliography_utils.py +332 -0
- refchecker/utils/bibtex_parser.py +411 -0
- refchecker/utils/config_validator.py +262 -0
- refchecker/utils/db_utils.py +210 -0
- refchecker/utils/doi_utils.py +190 -0
- refchecker/utils/error_utils.py +482 -0
- refchecker/utils/mock_objects.py +211 -0
- refchecker/utils/text_utils.py +5057 -0
- refchecker/utils/unicode_utils.py +335 -0
- refchecker/utils/url_utils.py +307 -0
refchecker/database/download_semantic_scholar_db.py
@@ -0,0 +1,1725 @@
#!/usr/bin/env python3
"""
Download Semantic Scholar Paper Metadata

This script downloads paper metadata from the Semantic Scholar API and stores it in a SQLite database.
The database can then be used by the local_semantic_scholar.py module to verify references offline.

Usage:
    python download_semantic_scholar_db.py [--output-dir DIR] [--batch-size N] [--api-key KEY] [--fields FIELDS]

Options:
    --output-dir DIR        Directory to store the database (default: semantic_scholar_db)
    --batch-size N          Number of papers to download in each batch (default: 100)
    --api-key KEY           Semantic Scholar API key (optional, increases rate limits)
    --fields FIELDS         Comma-separated list of fields to include (default: id,title,authors,year,externalIds,url,abstract)
    --query QUERY           Search query to download papers
    --start-year YEAR       Start year for downloading papers by year range
    --end-year YEAR         End year for downloading papers by year range
    --field FIELD           Field or subject area for downloading papers by field
    --download-dataset      Download the official Semantic Scholar dataset files (.gz)
    --process-local-files   Process existing .gz files in the output directory into the database
    --force-reprocess       Force reprocessing of all files (use with --process-local-files)

Behavior:
    - If the database does not exist, a full download is performed.
    - If the database exists, an incremental update is performed automatically.
"""

import argparse
import json
import logging
import os
import requests
import sqlite3
import sys
import time
import random
import concurrent.futures
import gzip
import hashlib
import re
import urllib.parse
import dateutil.parser
from datetime import datetime, timezone, timedelta
from tqdm import tqdm
from functools import lru_cache

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

class SemanticScholarDownloader:
    """
    Class to download paper metadata from Semantic Scholar and store it in a SQLite database
    """

    def __init__(self, output_dir="semantic_scholar_db", batch_size=100, api_key=None, fields=None):
        """
        Initialize the downloader

        Args:
            output_dir: Directory to store the database
            batch_size: Number of papers to download in each batch
            api_key: Semantic Scholar API key (optional)
            fields: List of fields to include in the API response
        """
        self.output_dir = output_dir
        self.batch_size = batch_size
        self.api_key = api_key

        # Default fields to include
        if fields is None:
            self.fields = ["id", "title", "authors", "year", "externalIds", "url", "abstract"]
        else:
            self.fields = fields

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Initialize database
        self.db_path = os.path.join(output_dir, "semantic_scholar.db")
        self.conn = self._get_db_connection()
        self.create_tables()

        # Set up session for API requests
        self.session = requests.Session()
        if self.api_key:
            self.session.headers.update({"x-api-key": self.api_key})

    def _get_db_connection(self):
        """Get a connection to the SQLite database with optimized settings"""
        conn = sqlite3.connect(self.db_path)
        conn.execute("PRAGMA journal_mode=WAL")    # Write-Ahead Logging for better concurrency
        conn.execute("PRAGMA synchronous=NORMAL")  # Reduce synchronous writes for better performance
        conn.execute("PRAGMA cache_size=10000")    # Increase cache size
        conn.execute("PRAGMA temp_store=MEMORY")   # Store temp tables in memory
        return conn

    def create_tables(self):
        """Create database tables if they don't exist"""
        cursor = self.conn.cursor()

        # Create papers table with comprehensive schema
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS papers (
                paperId TEXT PRIMARY KEY,
                corpusId INTEGER,
                title TEXT,
                normalized_paper_title TEXT,
                abstract TEXT,
                venue TEXT,
                publicationVenueId TEXT,
                year INTEGER,
                referenceCount INTEGER,
                citationCount INTEGER,
                influentialCitationCount INTEGER,
                isOpenAccess BOOLEAN,
                publicationDate TEXT,
                url TEXT,

                -- External IDs (flattened)
                externalIds_MAG TEXT,
                externalIds_CorpusId TEXT,
                externalIds_ACL TEXT,
                externalIds_PubMed TEXT,
                externalIds_DOI TEXT,
                externalIds_PubMedCentral TEXT,
                externalIds_DBLP TEXT,
                externalIds_ArXiv TEXT,

                -- Journal info (flattened)
                journal_name TEXT,
                journal_pages TEXT,
                journal_volume TEXT,

                -- Lists stored as JSON for complex queries
                authors TEXT,          -- JSON array
                s2FieldsOfStudy TEXT,  -- JSON array
                publicationTypes TEXT, -- JSON array

                -- Full JSON for complete data access
                json_data TEXT
            )
        ''')

        # Create metadata table for tracking incremental updates
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS metadata (
                key TEXT PRIMARY KEY,
                value TEXT,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')

        # Create indexes for efficient querying
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_papers_year ON papers(year)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_papers_title ON papers(title)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_papers_normalized_title ON papers(normalized_paper_title)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_papers_venue ON papers(venue)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_papers_citationCount ON papers(citationCount)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_papers_doi ON papers(externalIds_DOI)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_papers_arxiv ON papers(externalIds_ArXiv)')

        self.conn.commit()
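
    # Illustrative sketch (not part of the shipped class): once populated, the table
    # above can be queried offline by normalized title or DOI; both predicates are
    # covered by idx_papers_normalized_title and idx_papers_doi. The path and values
    # below are hypothetical examples.
    #
    #   conn = sqlite3.connect("semantic_scholar_db/semantic_scholar.db")
    #   row = conn.execute(
    #       "SELECT title, year, externalIds_DOI FROM papers "
    #       "WHERE normalized_paper_title = ? OR externalIds_DOI = ?",
    #       ("attentionisallyouneed", "10.1234/example-doi"),
    #   ).fetchone()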

    def get_metadata(self, key: str, default=None):
        """Get metadata value from the database"""
        cursor = self.conn.cursor()
        cursor.execute("SELECT value FROM metadata WHERE key = ?", (key,))
        result = cursor.fetchone()
        return result[0] if result else default

    def set_metadata(self, key: str, value: str):
        """Set metadata value in the database"""
        cursor = self.conn.cursor()
        cursor.execute("""
            INSERT OR REPLACE INTO metadata (key, value, updated_at)
            VALUES (?, ?, CURRENT_TIMESTAMP)
        """, (key, value))
        self.conn.commit()

    def get_last_update_time(self):
        """Get the timestamp of the last successful update"""
        return self.get_metadata('last_update_time')

    def set_last_update_time(self, timestamp: str):
        """Set the timestamp of the last successful update"""
        self.set_metadata('last_update_time', timestamp)

    def get_last_release_id(self):
        """Get the last processed release ID"""
        return self.get_metadata('last_release_id')

    def set_last_release_id(self, release_id: str):
        """Set the last processed release ID"""
        self.set_metadata('last_release_id', release_id)

    def check_for_updates(self):
        """
        Check if there are new releases or incremental updates available

        Returns:
            dict: Information about available updates
        """
        try:
            # Get the latest release information
            latest_release = self.get_latest_release_id()
            last_release = self.get_last_release_id()
            last_update_time = self.get_last_update_time()

            logger.info(f"Latest release: {latest_release}")
            logger.info(f"Last release: {last_release}")

            # Check if database has records but no update time
            if not last_update_time:

                # Does the database have at least one record?
                cursor = self.conn.cursor()
                cursor.execute("SELECT EXISTS(SELECT 1 FROM papers LIMIT 1)")
                record_count = cursor.fetchone()[0]

                if record_count > 0:
                    # Database has records but no update time - create a reasonable timestamp
                    # Use a timestamp from 1 day ago to check for recent updates
                    default_update_time = (datetime.now(timezone.utc) - timedelta(days=1)).isoformat()
                    logger.info(f"Creating default update time: {default_update_time}")
                    last_update_time = default_update_time
            else:
                logger.info(f"Last update time: {last_update_time}")

            # Check for incremental updates using release IDs instead of timestamps
            if last_release:
                logger.info("Checking for incremental updates since last release...")
                incremental_updates = self.check_incremental_updates(last_release)
                if incremental_updates:
                    return {
                        'has_updates': True,
                        'latest_release': latest_release,
                        'last_release': last_release,
                        'is_new_release': False,
                        'incremental_updates': incremental_updates,
                        'message': f'Incremental updates available from {last_release} to {latest_release}'
                    }
                else:
                    logger.info("No incremental updates found")
            else:
                logger.info("No last release ID available, skipping incremental check")

            # Check for new releases
            if not last_release:
                logger.info("No previous release ID found in database")
                return {
                    'has_updates': True,
                    'latest_release': latest_release,
                    'last_release': None,
                    'is_new_release': True,
                    'message': 'No previous release found, performing full download'
                }

            if latest_release != last_release:
                return {
                    'has_updates': True,
                    'latest_release': latest_release,
                    'last_release': last_release,
                    'is_new_release': True,
                    'message': f'New release available: {last_release} -> {latest_release}'
                }

            return {
                'has_updates': False,
                'latest_release': latest_release,
                'last_release': last_release,
                'is_new_release': False,
                'message': f'Already up to date with release {latest_release}'
            }

        except Exception as e:
            logger.error(f"Error checking for updates: {e}")
            return {
                'has_updates': False,
                'error': str(e),
                'message': f'Error checking for updates: {e}'
            }

    def check_incremental_updates(self, start_release_id=None):
        """
        Check for incremental updates between releases using the correct API

        Args:
            start_release_id: Release ID to start from (if None, uses last processed release)

        Returns:
            list: List of incremental update diffs available, or None if no updates
        """
        try:
            # Get the start and end release IDs
            if start_release_id is None:
                start_release_id = self.get_last_release_id()

            if not start_release_id:
                logger.info("No start release ID available, cannot check for incremental updates")
                return None

            # Get the latest release ID
            end_release_id = self.get_latest_release_id()

            # If we're already at the latest release, no updates needed
            if start_release_id == end_release_id:
                logger.info(f"Already at latest release {end_release_id}, no incremental updates needed")
                return None

            logger.info(f"Checking for incremental updates from {start_release_id} to {end_release_id}")

            # Use the correct incremental diffs API endpoint
            url = f"https://api.semanticscholar.org/datasets/v1/diffs/{start_release_id}/to/{end_release_id}/papers"
            headers = {}
            if self.api_key:
                headers["x-api-key"] = self.api_key

            logger.info(f"Requesting incremental diffs from: {url}")
            response = self.session.get(url, headers=headers, timeout=30)

            # Handle different response codes
            if response.status_code == 404:
                logger.info(f"Incremental diffs not available for {start_release_id} to {end_release_id} (404)")
                logger.info("This usually means the release gap is too large for incremental updates")
                return self._check_incremental_alternative_by_release(start_release_id, end_release_id)
            elif response.status_code == 429:
                logger.warning("Rate limited on diffs API. Consider waiting or using a higher tier API key")
                return None

            response.raise_for_status()
            data = response.json()

            diffs = data.get("diffs", [])
            if diffs:
                logger.info(f"Found {len(diffs)} incremental diffs from {start_release_id} to {end_release_id}")
                return diffs
            else:
                logger.info("No incremental diffs found from API endpoint")

            return None

        except Exception as e:
            logger.info(f"Error checking incremental updates: {e}")
            logger.info("Falling back to alternative incremental check method")
            # Try to get end_release_id if it wasn't set yet
            try:
                if 'end_release_id' not in locals():
                    end_release_id = self.get_latest_release_id()
                return self._check_incremental_alternative_by_release(start_release_id or self.get_last_release_id(), end_release_id)
            except Exception as fallback_error:
                logger.debug(f"Error in fallback method: {fallback_error}")
                return None

    def _check_incremental_alternative_by_release(self, start_release_id, end_release_id):
        """
        Alternative method to check for incremental updates when the diffs API is unavailable
        This tries to compare release IDs and suggest a full dataset download if needed

        Args:
            start_release_id: Starting release ID
            end_release_id: Target release ID

        Returns:
            list: List indicating a full dataset update is needed, or None if no updates
        """
        try:
            if not start_release_id or not end_release_id:
                return None

            if start_release_id == end_release_id:
                return None

            # Try to find intermediate releases that might have diffs available
            # This could be improved by calling a releases list API if available
            from datetime import datetime, timedelta

            try:
                start_date = datetime.strptime(start_release_id, "%Y-%m-%d")
                end_date = datetime.strptime(end_release_id, "%Y-%m-%d")
                days_diff = (end_date - start_date).days

                if days_diff <= 7:
                    logger.info(f"Release gap of {days_diff} days should support diffs, but API returned 404")
                    logger.info("This might be a temporary API issue or the releases don't exist")
                    return None
                elif days_diff <= 30:
                    logger.info(f"Release gap of {days_diff} days may be too large for diffs API")
                    logger.info("Consider updating release tracking more frequently")
                else:
                    logger.info(f"Release gap of {days_diff} days is too large for incremental updates")

            except ValueError:
                logger.info(f"Cannot parse release dates: {start_release_id}, {end_release_id}")

            logger.info(f"Recommending full dataset download from {start_release_id} to {end_release_id}")

            # Return a structure indicating that a full dataset download is needed
            return [{
                "type": "full_dataset_update",
                "start_release": start_release_id,
                "end_release": end_release_id,
                "message": f"Incremental diffs unavailable for gap from {start_release_id} to {end_release_id}, full dataset update recommended"
            }]

        except Exception as e:
            logger.debug(f"Error in alternative incremental check by release: {e}")
            return None

    def _check_incremental_alternative(self, since_timestamp):
        """
        Alternative method to check for incremental updates
        This tries different approaches when the direct incremental endpoint isn't available

        Args:
            since_timestamp: ISO timestamp string of last update

        Returns:
            list: List of incremental update files available, or None if no updates
        """
        try:
            # Try to get recent papers using the search API with date filtering
            # This is a fallback when incremental endpoints aren't available
            from datetime import datetime, timezone
            import dateutil.parser

            # Parse the timestamp
            last_update = dateutil.parser.parse(since_timestamp)
            current_time = datetime.now(timezone.utc)

            # Check if there's been significant time since last update
            time_diff = current_time - last_update
            if time_diff.days < 1:  # Less than 1 day, probably no significant updates
                return None

            logger.info(f"Last update was {time_diff.days} days ago, checking for recent papers")

            # Use the search API to find recent papers
            # This is not as efficient as true incremental updates but can work as a fallback
            url = "https://api.semanticscholar.org/graph/v1/paper/search"
            headers = {}
            if self.api_key:
                headers["x-api-key"] = self.api_key

            params = {
                "query": f"year:{current_time.year}",
                "limit": 100,
                "fields": "paperId,title,year,publicationDate"
            }

            response = self.session.get(url, headers=headers, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()

            recent_papers = data.get("data", [])
            if recent_papers:
                logger.info(f"Found {len(recent_papers)} recent papers that might need updating")
                # Return a structure that indicates these are recent papers to check
                return [{
                    "type": "recent_papers",
                    "count": len(recent_papers),
                    "papers": recent_papers
                }]

            return None

        except Exception as e:
            logger.debug(f"Error in alternative incremental check: {e}")
            return None

    def download_incremental_updates(self, diffs):
        """
        Download and process incremental diffs according to the Semantic Scholar API format

        Args:
            diffs: List of diff dictionaries from the incremental API

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            logger.info("Processing incremental diffs...")

            total_updated = 0
            total_deleted = 0

            for diff in diffs:
                if diff.get("type") == "full_dataset_update":
                    # Handle full dataset update recommendation by downloading the full dataset
                    logger.info(f"Full dataset update recommended: {diff.get('message')}")
                    logger.info("Automatically downloading full dataset...")

                    # Download the full dataset
                    success = self.download_dataset_files()
                    if success:
                        logger.info("Full dataset download completed, processing files...")
                        # Process the downloaded files
                        self.process_local_files(force_reprocess=False, incremental=False)

                        # After processing, check for any remaining incremental updates
                        logger.info("Checking for additional incremental updates after full dataset processing...")
                        latest_release = self.get_latest_release_id()
                        current_release = self.get_last_release_id()

                        if current_release and current_release != latest_release:
                            logger.info(f"Checking for incremental updates from {current_release} to {latest_release}")
                            additional_updates = self.check_incremental_updates(current_release)
                            if additional_updates:
                                # Filter out any full_dataset_update recommendations to avoid infinite recursion
                                filtered_updates = [u for u in additional_updates if u.get("type") != "full_dataset_update"]
                                if filtered_updates:
                                    logger.info(f"Processing {len(filtered_updates)} additional incremental updates")
                                    self.download_incremental_updates(filtered_updates)

                        return True
                    else:
                        logger.error("Failed to download full dataset")
                        return False
                elif diff.get("type") == "recent_papers":
                    # Handle recent papers update (fallback)
                    papers = diff.get("papers", [])
                    if papers:
                        logger.info(f"Processing {len(papers)} recent papers")
                        paper_ids = [p.get("paperId") for p in papers if p.get("paperId")]
                        if paper_ids:
                            self.download_papers(paper_ids)
                            total_updated += len(paper_ids)
                else:
                    # Handle proper incremental diff format
                    update_files = diff.get("update_files", [])
                    delete_files = diff.get("delete_files", [])

                    # Process update files
                    for update_url in update_files:
                        try:
                            logger.info(f"Processing update file: {update_url}")
                            records_updated = self._process_incremental_file(update_url, "update")
                            total_updated += records_updated
                        except Exception as e:
                            logger.error(f"Error processing update file {update_url}: {e}")
                            continue

                    # Process delete files
                    for delete_url in delete_files:
                        try:
                            logger.info(f"Processing delete file: {delete_url}")
                            records_deleted = self._process_incremental_file(delete_url, "delete")
                            total_deleted += records_deleted
                        except Exception as e:
                            logger.error(f"Error processing delete file {delete_url}: {e}")
                            continue

            logger.info(f"Incremental update complete - Updated: {total_updated}, Deleted: {total_deleted}")

            # Update metadata after successful incremental update
            if total_updated > 0 or total_deleted > 0:
                current_time = datetime.now(timezone.utc).isoformat()
                self.set_last_update_time(current_time)

                # Update the last release ID to the latest
                try:
                    latest_release = self.get_latest_release_id()
                    self.set_last_release_id(latest_release)
                    logger.info(f"Updated metadata - last update: {current_time}, release: {latest_release}")
                except Exception as e:
                    logger.warning(f"Could not update release ID: {e}")

            return total_updated > 0 or total_deleted > 0

        except Exception as e:
            logger.error(f"Error processing incremental diffs: {e}")
            return False

    def get_latest_release_id(self):
        """
        Get the latest release ID from the Semantic Scholar API

        Returns:
            str: Latest release ID
        """
        try:
            # Use the datasets API to get the latest release
            url = "https://api.semanticscholar.org/datasets/v1/release/latest"
            headers = {}
            if self.api_key:
                headers["x-api-key"] = self.api_key

            response = self.session.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            data = response.json()
            release_id = data.get("release_id")

            if not release_id:
                raise ValueError("No release_id found in API response")

            return release_id

        except Exception as e:
            logger.error(f"Error getting latest release ID: {e}")
            raise

    def normalize_paper_title(self, title: str) -> str:
        """
        Normalize paper title by converting to lowercase and removing whitespace and punctuation

        Args:
            title: Original paper title

        Returns:
            Normalized title string
        """
        if not title:
            return ""

        # Convert to lowercase
        normalized = title.lower()

        # Remove all non-alphanumeric characters (keeping only letters and numbers)
        import re
        normalized = re.sub(r'[^a-z0-9]', '', normalized)

        return normalized
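
    # Illustrative example (not part of the shipped class): with the rule above,
    # normalize_paper_title("Attention Is All You Need!") returns
    # "attentionisallyouneed", so lookups ignore capitalization, spacing, and
    # punctuation differences between a cited title and the stored title.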

    @lru_cache(maxsize=32)
    def _build_search_query(self, query, year, field):
        """
        Build search query string with caching for repeated parameters

        Args:
            query: Search query
            year: Year for filtering (single year)
            field: Field or subject area

        Returns:
            Formatted search query string
        """
        search_parts = []

        if query:
            search_parts.append(query)

        # Add year (single year only)
        if year:
            search_parts.append(f"year:{year}")

        # Add field
        if field:
            search_parts.append(f"venue:{field}")

        # If no search criteria provided, use a default query
        if not search_parts:
            search_parts.append("machine learning")
            logger.warning(f"No search criteria provided, using default query: {search_parts[0]}")

        return " ".join(search_parts)

    def search_papers(self, query=None, start_year=None, end_year=None, field=None, limit=1000):
        """
        Search for papers using the Semantic Scholar API

        Args:
            query: Search query
            start_year: Start year for filtering
            end_year: End year for filtering
            field: Field or subject area
            limit: Maximum number of papers to return

        Returns:
            List of paper IDs
        """
        logger.info(f"Searching for papers with query: {query}, years: {start_year}-{end_year}, field: {field}")

        # If a year range is specified, perform separate queries for each year
        paper_ids = []
        if start_year and end_year and start_year != end_year:
            years = range(start_year, end_year + 1)
        elif start_year:
            years = [start_year]
        elif end_year:
            years = [end_year]
        else:
            years = [None]

        total_limit = limit
        per_year_limit = max(1, limit // len(years)) if years[0] is not None else limit

        for year in years:
            # Build the search query for this year
            search_query = self._build_search_query(query, year, field)

            url = "https://api.semanticscholar.org/graph/v1/paper/search"
            params = {
                "query": search_query,
                "limit": min(100, self.batch_size),  # API limit is 100 per request
                "fields": "paperId",
            }

            offset = 0
            year_paper_ids = []
            max_offset = 1000  # API: offset must be < 1000 per query

            with tqdm(total=per_year_limit, desc=f"Searching papers for year {year}" if year else "Searching papers") as pbar:
                while offset < per_year_limit and offset < max_offset:
                    params["offset"] = offset
                    # Don't request more than the API allows in one query
                    params["limit"] = min(params["limit"], per_year_limit - offset, max_offset - offset)
                    try:
                        response = self.session.get(url, params=params)
                        response.raise_for_status()
                        data = response.json()

                        batch_ids = [paper.get("paperId") for paper in data.get("data", []) if paper.get("paperId")]
                        year_paper_ids.extend(batch_ids)
                        pbar.update(len(batch_ids))

                        if len(batch_ids) < params["limit"]:
                            break

                        offset += len(batch_ids)
                        time.sleep(3 if self.api_key else 5)
                    except requests.exceptions.RequestException as e:
                        logger.error(f"Error searching papers: {str(e)}")
                        wait_time = min(60, 2 ** (offset // 100))
                        logger.info(f"Waiting {wait_time} seconds before retrying...")
                        time.sleep(wait_time)
                        continue
            paper_ids.extend(year_paper_ids)
            if len(paper_ids) >= total_limit:
                break

        # Truncate to the requested limit
        paper_ids = paper_ids[:limit]
        logger.info(f"Found {len(paper_ids)} papers")
        return paper_ids

    def download_paper_batch(self, paper_ids):
        """
        Download metadata for a batch of papers

        Args:
            paper_ids: List of paper IDs to download

        Returns:
            List of paper data dictionaries
        """
        if not paper_ids:
            return []

        # Set up API request
        url = "https://api.semanticscholar.org/graph/v1/paper/batch"
        headers = {"Content-Type": "application/json"}

        # Prepare request data
        data = {
            "ids": paper_ids,
            "fields": ",".join(self.fields)
        }

        try:
            response = self.session.post(url, headers=headers, json=data)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            logger.error(f"Error downloading papers: {str(e)}")
            # If we get a rate limit error, wait and retry (guard against requests
            # that failed before any response object was available)
            if e.response is not None and e.response.status_code == 429:
                wait_time = int(e.response.headers.get("Retry-After", 60))
                # Add a buffer to the wait time plus some jitter to avoid synchronization
                jitter = random.uniform(5, 15)
                total_wait = wait_time + jitter
                logger.info(f"Rate limited. Waiting {total_wait:.1f} seconds before retrying...")
                time.sleep(total_wait)
                return self.download_paper_batch(paper_ids)
            return []

    def store_papers_batch(self, papers_data):
        """
        Store multiple papers in a single transaction

        Args:
            papers_data: List of paper data dictionaries from the API
        """
        if not papers_data:
            return

        cursor = self.conn.cursor()

        try:
            # Begin transaction
            self.conn.execute("BEGIN TRANSACTION")

            batch = []
            for paper_data in papers_data:
                if not paper_data or "paperId" not in paper_data:
                    continue

                # Extract scalar fields
                paper_id = paper_data.get("paperId")
                corpus_id = paper_data.get("corpusId")
                title = paper_data.get("title", "")
                normalized_title = self.normalize_paper_title(title)
                abstract = paper_data.get("abstract", "")
                venue = paper_data.get("venue", "")
                publication_venue_id = paper_data.get("publicationVenueId")
                year = paper_data.get("year")
                reference_count = paper_data.get("referenceCount")
                citation_count = paper_data.get("citationCount")
                influential_citation_count = paper_data.get("influentialCitationCount")
                is_open_access = paper_data.get("isOpenAccess")
                publication_date = paper_data.get("publicationDate")
                url = paper_data.get("url", "")

                # Extract external IDs
                external_ids = paper_data.get("externalIds", {}) or {}
                external_mag = external_ids.get("MAG")
                external_corpus_id = external_ids.get("CorpusId")
                external_acl = external_ids.get("ACL")
                external_pubmed = external_ids.get("PubMed")
                external_doi = external_ids.get("DOI")
                external_pmc = external_ids.get("PubMedCentral")
                external_dblp = external_ids.get("DBLP")
                external_arxiv = external_ids.get("ArXiv")

                # Extract journal info
                journal = paper_data.get("journal", {}) or {}
                journal_name = journal.get("name", "")
                journal_pages = journal.get("pages")
                journal_volume = journal.get("volume")

                # Store complex fields as JSON
                authors_json = json.dumps(paper_data.get("authors", []))
                s2_fields_json = json.dumps(paper_data.get("s2FieldsOfStudy", []))
                pub_types_json = json.dumps(paper_data.get("publicationTypes", []))

                # Full JSON for complete access
                full_json = json.dumps(paper_data)

                batch.append((
                    paper_id, corpus_id, title, normalized_title, abstract, venue, publication_venue_id,
                    year, reference_count, citation_count, influential_citation_count,
                    is_open_access, publication_date, url,
                    external_mag, external_corpus_id, external_acl, external_pubmed,
                    external_doi, external_pmc, external_dblp, external_arxiv,
                    journal_name, journal_pages, journal_volume,
                    authors_json, s2_fields_json, pub_types_json, full_json
                ))

            # Insert all papers in batch
            if batch:
                cursor.executemany("""
                    INSERT OR REPLACE INTO papers VALUES (
                        ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
                        ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
                        ?, ?, ?, ?
                    )
                """, batch)

            # Commit transaction
            self.conn.commit()
        except sqlite3.Error as e:
            logger.error(f"Error storing papers batch: {str(e)}")
            self.conn.rollback()

    def _retry_with_backoff(self, func, *args, max_retries=5, **kwargs):
        """
        Retry a function with exponential backoff

        Args:
            func: Function to retry
            max_retries: Maximum number of retries
            *args, **kwargs: Arguments to pass to the function

        Returns:
            The function result
        """
        for attempt in range(max_retries):
            try:
                return func(*args, **kwargs)
            except requests.exceptions.RequestException as e:
                if attempt == max_retries - 1:
                    # Last attempt, re-raise the exception
                    raise

                # Calculate backoff time with jitter
                backoff_time = min(60, (2 ** attempt) * (1 + random.random()))
                logger.warning(f"Request failed: {str(e)}. Retrying in {backoff_time:.1f} seconds (attempt {attempt+1}/{max_retries})...")
                time.sleep(backoff_time)
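
    # Illustrative schedule for the jittered backoff above (values approximate):
    # attempt 0 sleeps 1-2 s, attempt 1 sleeps 2-4 s, attempt 2 sleeps 4-8 s, and
    # so on, capped at 60 s; the final failed attempt re-raises instead of sleeping.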
|
|
896
|
+
|
|
897
|
+
def download_papers(self, paper_ids):
|
|
898
|
+
"""
|
|
899
|
+
Download and store papers in batches
|
|
900
|
+
|
|
901
|
+
Args:
|
|
902
|
+
paper_ids: List of paper IDs to download
|
|
903
|
+
"""
|
|
904
|
+
if not paper_ids:
|
|
905
|
+
logger.warning("No paper IDs provided")
|
|
906
|
+
return
|
|
907
|
+
|
|
908
|
+
# Remove duplicates while preserving order
|
|
909
|
+
unique_paper_ids = list(dict.fromkeys(paper_ids))
|
|
910
|
+
logger.info(f"Downloading {len(unique_paper_ids)} unique papers in batches of {self.batch_size}")
|
|
911
|
+
|
|
912
|
+
# Check which papers are already in the database
|
|
913
|
+
existing_ids = set()
|
|
914
|
+
try:
|
|
915
|
+
cursor = self.conn.cursor()
|
|
916
|
+
# Process in chunks to avoid sqlite parameter limit
|
|
917
|
+
chunk_size = 500
|
|
918
|
+
for i in range(0, len(unique_paper_ids), chunk_size):
|
|
919
|
+
chunk = unique_paper_ids[i:i+chunk_size]
|
|
920
|
+
placeholders = ','.join(['?'] * len(chunk))
|
|
921
|
+
cursor.execute(f"SELECT paperId FROM papers WHERE paperId IN ({placeholders})", chunk)
|
|
922
|
+
existing_ids.update(row[0] for row in cursor.fetchall())
|
|
923
|
+
except sqlite3.Error as e:
|
|
924
|
+
logger.error(f"Error checking existing papers: {str(e)}")
|
|
925
|
+
|
|
926
|
+
# Filter out papers that are already in the database
|
|
927
|
+
paper_ids_to_download = [pid for pid in unique_paper_ids if pid not in existing_ids]
|
|
928
|
+
logger.info(f"Skipping {len(existing_ids)} already downloaded papers")
|
|
929
|
+
logger.info(f"Downloading {len(paper_ids_to_download)} new papers")
|
|
930
|
+
|
|
931
|
+
if not paper_ids_to_download:
|
|
932
|
+
logger.info("All papers already exist in the database")
|
|
933
|
+
return
|
|
934
|
+
|
|
935
|
+
# Process papers in batches
|
|
936
|
+
for i in tqdm(range(0, len(paper_ids_to_download), self.batch_size), desc="Downloading batches"):
|
|
937
|
+
batch_ids = paper_ids_to_download[i:i+self.batch_size]
|
|
938
|
+
|
|
939
|
+
# Download batch with retry mechanism
|
|
940
|
+
try:
|
|
941
|
+
# Use a smaller batch size for unauthenticated requests
|
|
942
|
+
if not self.api_key and len(batch_ids) > 10:
|
|
943
|
+
sub_batches = [batch_ids[j:j+10] for j in range(0, len(batch_ids), 10)]
|
|
944
|
+
batch_data = []
|
|
945
|
+
for sub_batch in sub_batches:
|
|
946
|
+
sub_data = self._retry_with_backoff(self.download_paper_batch, sub_batch)
|
|
947
|
+
batch_data.extend(sub_data)
|
|
948
|
+
# Add extra delay between sub-batches
|
|
949
|
+
time.sleep(random.uniform(4, 6))
|
|
950
|
+
else:
|
|
951
|
+
batch_data = self._retry_with_backoff(self.download_paper_batch, batch_ids)
|
|
952
|
+
|
|
953
|
+
# Store papers in a single transaction
|
|
954
|
+
self.store_papers_batch(batch_data)
|
|
955
|
+
|
|
956
|
+
# Sleep to avoid rate limits with some randomness to avoid patterns
|
|
957
|
+
sleep_time = random.uniform(4, 7) if not self.api_key else random.uniform(2, 4)
|
|
958
|
+
time.sleep(sleep_time)
|
|
959
|
+
|
|
960
|
+
except Exception as e:
|
|
961
|
+
logger.error(f"Failed to download batch after multiple retries: {str(e)}")
|
|
962
|
+
logger.info("Saving progress and continuing with next batch...")
|
|
963
|
+
# Continue with the next batch instead of failing completely
|
|
964
|
+
|
|
965
|
+
def _is_file_processed(self, file_path):
|
|
966
|
+
"""
|
|
967
|
+
Check if a file's contents are already in the database by checking sample records.
|
|
968
|
+
Uses both first and last valid records in the file - if both exist in DB, file is processed.
|
|
969
|
+
This is much faster than reading the entire file.
|
|
970
|
+
|
|
971
|
+
Args:
|
|
972
|
+
file_path: Full path to the .gz file
|
|
973
|
+
|
|
974
|
+
Returns:
|
|
975
|
+
bool: True if file appears to be already processed
|
|
976
|
+
"""
|
|
977
|
+
# If file doesn't exist, it's not processed
|
|
978
|
+
if not os.path.exists(file_path):
|
|
979
|
+
return False
|
|
980
|
+
|
|
981
|
+
try:
|
|
982
|
+
# Get sample records from the file (first and last)
|
|
983
|
+
sample_paper_ids = []
|
|
984
|
+
|
|
985
|
+
with gzip.open(file_path, "rt", encoding="utf-8") as f:
|
|
986
|
+
# Check first few lines for a valid record
|
|
987
|
+
for i in range(5):
|
|
988
|
+
line = f.readline()
|
|
989
|
+
if not line:
|
|
990
|
+
break
|
|
991
|
+
try:
|
|
992
|
+
paper_data = json.loads(line.strip())
|
|
993
|
+
if paper_data:
|
|
994
|
+
paper_id = paper_data.get("paperId")
|
|
995
|
+
corpus_id = paper_data.get("corpusid") or paper_data.get("corpusId")
|
|
996
|
+
|
|
997
|
+
if paper_id:
|
|
998
|
+
sample_paper_ids.append(paper_id)
|
|
999
|
+
break
|
|
1000
|
+
elif corpus_id:
|
|
1001
|
+
sample_paper_ids.append(str(corpus_id))
|
|
1002
|
+
break
|
|
1003
|
+
except json.JSONDecodeError:
|
|
1004
|
+
continue
|
|
1005
|
+
|
|
1006
|
+
# For the last few lines, we need to be more careful with gzip files
|
|
1007
|
+
# Since we can't easily seek to the end, we'll just check if we found a first record
|
|
1008
|
+
# and assume the file is processed if we found at least one valid record
|
|
1009
|
+
if sample_paper_ids:
|
|
1010
|
+
# Check if the sample record exists in the database
|
|
1011
|
+
cursor = self.conn.cursor()
|
|
1012
|
+
cursor.execute("SELECT COUNT(*) FROM papers WHERE paperId = ?", (sample_paper_ids[0],))
|
|
1013
|
+
count = cursor.fetchone()[0]
|
|
1014
|
+
if count > 0:
|
|
1015
|
+
logger.debug(f"File {os.path.basename(file_path)} appears to be processed (sample record found in DB)")
|
|
1016
|
+
return True
|
|
1017
|
+
|
|
1018
|
+
logger.debug(f"File {os.path.basename(file_path)} not processed (no sample records found in DB)")
|
|
1019
|
+
return False
|
|
1020
|
+
|
|
1021
|
+
except Exception as e:
|
|
1022
|
+
logger.warning(f"Error checking if {os.path.basename(file_path)} is processed: {e}")
|
|
1023
|
+
return False # If we can't check, assume not processed
|
|
1024
|
+
|
|
1025
|
+
def process_local_files(self, force_reprocess=False, incremental=False):
|
|
1026
|
+
"""
|
|
1027
|
+
Process existing .gz files in the output directory into the database
|
|
1028
|
+
|
|
1029
|
+
Args:
|
|
1030
|
+
force_reprocess: If True, reprocess all files even if already processed
|
|
1031
|
+
incremental: If True, only process new or modified files (set automatically if DB exists)
|
|
1032
|
+
"""
|
|
1033
|
+
if incremental and not force_reprocess:
|
|
1034
|
+
logger.info("Running in incremental mode - checking for updates...")
|
|
1035
|
+
update_info = self.check_for_updates()
|
|
1036
|
+
|
|
1037
|
+
if not update_info['has_updates']:
|
|
1038
|
+
logger.info(update_info['message'])
|
|
1039
|
+
return
|
|
1040
|
+
|
|
1041
|
+
logger.info(update_info['message'])
|
|
1042
|
+
|
|
1043
|
+
# Handle incremental updates if available, but only if there are no new local files
|
|
1044
|
+
if update_info.get('incremental_updates'):
|
|
1045
|
+
# Check if we have any unprocessed .gz files first
|
|
1046
|
+
gz_files = []
|
|
1047
|
+
for root, dirs, files in os.walk(self.output_dir):
|
|
1048
|
+
for file in files:
|
|
1049
|
+
if file.endswith('.gz'):
|
|
1050
|
+
gz_files.append(os.path.join(root, file))
|
|
1051
|
+
|
|
1052
|
+
unprocessed_files = []
|
|
1053
|
+
for gz_file in gz_files:
|
|
1054
|
+
if not self._is_file_processed(gz_file) or self._should_process_file(gz_file):
|
|
1055
|
+
unprocessed_files.append(gz_file)
|
|
1056
|
+
|
|
1057
|
+
# Check if any incremental updates are actually full dataset updates
|
|
1058
|
+
has_full_dataset_update = any(diff.get("type") == "full_dataset_update" for diff in update_info['incremental_updates'])
|
|
1059
|
+
|
|
1060
|
+
if unprocessed_files and not has_full_dataset_update:
|
|
1061
|
+
logger.info(f"Found {len(unprocessed_files)} unprocessed local files, processing those instead of incremental updates")
|
|
1062
|
+
elif has_full_dataset_update:
|
|
1063
|
+
logger.info("Full dataset update needed - this will download and process the latest dataset")
|
|
1064
|
+
success = self.download_incremental_updates(update_info['incremental_updates'])
|
|
1065
|
+
if success:
|
|
1066
|
+
logger.info("Full dataset update completed successfully")
|
|
1067
|
+
return
|
|
1068
|
+
else:
|
|
1069
|
+
logger.warning("Failed to process full dataset update, falling back to file processing")
|
|
1070
|
+
else:
|
|
1071
|
+
logger.info("Processing incremental updates...")
|
|
1072
|
+
success = self.download_incremental_updates(update_info['incremental_updates'])
|
|
1073
|
+
if success:
|
|
1074
|
+
logger.info("Incremental updates processed successfully")
|
|
1075
|
+
return
|
|
1076
|
+
else:
|
|
1077
|
+
logger.warning("Failed to process incremental updates, falling back to file processing")
|
|
1078
|
+
|
|
1079
|
+
# Find all .gz files in the output directory
|
|
1080
|
+
gz_files = []
|
|
1081
|
+
for root, dirs, files in os.walk(self.output_dir):
|
|
1082
|
+
for file in files:
|
|
1083
|
+
if file.endswith('.gz'):
|
|
1084
|
+
gz_files.append(os.path.join(root, file))
|
|
1085
|
+
|
|
1086
|
+
if not gz_files:
|
|
1087
|
+
logger.warning(f"No .gz files found in {self.output_dir}")
|
|
1088
|
+
return
|
|
1089
|
+
|
|
1090
|
+
logger.info(f"Found {len(gz_files)} .gz files to process")
|
|
1091
|
+
|
|
1092
|
+
# Process files
|
|
1093
|
+
total_records = 0
|
|
1094
|
+
skipped_files = 0
|
|
1095
|
+
|
|
1096
|
+
for i, gz_file in enumerate(gz_files, 1):
|
|
1097
|
+
try:
|
|
1098
|
+
# Check if file should be processed
|
|
1099
|
+
if incremental and not force_reprocess:
|
|
1100
|
+
# First check if file contents are already in database
|
|
1101
|
+
if self._is_file_processed(gz_file):
|
|
1102
|
+
logger.info(f"Skipping [{i}/{len(gz_files)}] {os.path.basename(gz_file)} - already processed")
|
|
1103
|
+
skipped_files += 1
|
|
1104
|
+
continue
|
|
1105
|
+
|
|
1106
|
+
# Then check if file has been modified since last processing
|
|
1107
|
+
if not self._should_process_file(gz_file):
|
|
1108
|
+
logger.info(f"Skipping [{i}/{len(gz_files)}] {os.path.basename(gz_file)} - already processed and not modified")
|
|
1109
|
+
skipped_files += 1
|
|
1110
|
+
continue
|
|
1111
|
+
|
|
1112
|
+
logger.info(f"Processing [{i}/{len(gz_files)}] {os.path.basename(gz_file)}")
|
|
1113
|
+
records_processed = self._process_gz_file(gz_file)
|
|
1114
|
+
total_records += records_processed
|
|
1115
|
+
logger.info(f"Processed [{i}/{len(gz_files)}] {records_processed:,} records from {os.path.basename(gz_file)}")
|
|
1116
|
+
|
|
1117
|
+
except Exception as e:
|
|
1118
|
+
logger.error(f"Error processing [{i}/{len(gz_files)}] {gz_file}: {e}")
|
|
1119
|
+
continue
|
|
1120
|
+
|
|
1121
|
+
# Update metadata after successful processing
|
|
1122
|
+
if incremental and total_records > 0:
|
|
1123
|
+
current_time = datetime.now(timezone.utc).isoformat()
|
|
1124
|
+
self.set_last_update_time(current_time)
|
|
1125
|
+
|
|
1126
|
+
# Get and set the latest release ID
|
|
1127
|
+
try:
|
|
1128
|
+
latest_release = self.get_latest_release_id()
|
|
1129
|
+
self.set_last_release_id(latest_release)
|
|
1130
|
+
logger.info(f"Updated metadata - last update: {current_time}, release: {latest_release}")
|
|
1131
|
+
except Exception as e:
|
|
1132
|
+
logger.warning(f"Could not update release ID: {e}")
|
|
1133
|
+
|
|
1134
|
+
logger.info(f"Total records processed: {total_records:,}")
|
|
1135
|
+
if skipped_files > 0:
|
|
1136
|
+
logger.info(f"Files skipped (already processed): {skipped_files}")
|
|
1137
|
+
|
|
1138
|
+
def _should_process_file(self, file_path):
|
|
1139
|
+
"""
|
|
1140
|
+
Check if a file should be processed in incremental mode
|
|
1141
|
+
|
|
1142
|
+
Args:
|
|
1143
|
+
file_path: Path to the file to check
|
|
1144
|
+
|
|
1145
|
+
Returns:
|
|
1146
|
+
bool: True if file should be processed
|
|
1147
|
+
"""
|
|
1148
|
+
# Check if we have metadata about this file
|
|
1149
|
+
file_hash = self._get_file_hash(file_path)
|
|
1150
|
+
last_processed_hash = self.get_metadata(f'file_hash_{os.path.basename(file_path)}')
|
|
1151
|
+
|
|
1152
|
+
if last_processed_hash != file_hash:
|
|
1153
|
+
# File has changed, should process
|
|
1154
|
+
return True
|
|
1155
|
+
|
|
1156
|
+
# Check file modification time
|
|
1157
|
+
file_mtime = os.path.getmtime(file_path)
|
|
1158
|
+
last_processed_time = self.get_metadata(f'file_mtime_{os.path.basename(file_path)}')
|
|
1159
|
+
|
|
1160
|
+
if last_processed_time:
|
|
1161
|
+
try:
|
|
1162
|
+
last_time = float(last_processed_time)
|
|
1163
|
+
if file_mtime > last_time:
|
|
1164
|
+
return True
|
|
1165
|
+
except ValueError:
|
|
1166
|
+
pass
|
|
1167
|
+
|
|
1168
|
+
return False
|
|
1169
|
+
|
|
1170
|
+
def _get_file_hash(self, file_path):
|
|
1171
|
+
"""Get a hash of the file for change detection"""
|
|
1172
|
+
try:
|
|
1173
|
+
with open(file_path, 'rb') as f:
|
|
1174
|
+
return hashlib.md5(f.read(1024)).hexdigest() # Hash first 1KB for speed
|
|
1175
|
+
except Exception:
|
|
1176
|
+
return None
|
|
1177
|
+
|
|
1178
|
+
def _process_gz_file(self, filename, max_records=None):
|
|
1179
|
+
"""
|
|
1180
|
+
Process a single .gz file into the database
|
|
1181
|
+
|
|
1182
|
+
Args:
|
|
1183
|
+
filename: Path to the .gz file
|
|
1184
|
+
max_records: Maximum number of records to process (for testing)
|
|
1185
|
+
|
|
1186
|
+
Returns:
|
|
1187
|
+
int: Number of records processed
|
|
1188
|
+
"""
|
|
1189
|
+
file_path = filename
|
|
1190
|
+
|
|
1191
|
+
if not os.path.exists(file_path):
|
|
1192
|
+
logger.error(f"File not found: {file_path}")
|
|
1193
|
+
return 0
|
|
1194
|
+
|
|
1195
|
+
records_processed = 0
|
|
1196
|
+
cursor = self.conn.cursor()
|
|
1197
|
+
|
|
1198
|
+
try:
|
|
1199
|
+
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
|
|
1200
|
+
for line_num, line in enumerate(f, 1):
|
|
1201
|
+
if max_records and records_processed >= max_records:
|
|
1202
|
+
break
|
|
1203
|
+
|
|
1204
|
+
try:
|
|
1205
|
+
paper_data = json.loads(line.strip())
|
|
1206
|
+
self._insert_paper(cursor, paper_data)
|
|
1207
|
+
records_processed += 1
|
|
1208
|
+
|
|
1209
|
+
if records_processed % 10000 == 0:
|
|
1210
|
+
logger.info(f"Processed {records_processed:,} records from {filename}")
|
|
1211
|
+
self.conn.commit()
|
|
1212
|
+
|
|
1213
|
+
except json.JSONDecodeError as e:
|
|
1214
|
+
logger.warning(f"Invalid JSON on line {line_num} in {filename}: {e}")
|
|
1215
|
+
continue
|
|
1216
|
+
except Exception as e:
|
|
1217
|
+
logger.error(f"Error processing line {line_num} in {filename}: {e}")
|
|
1218
|
+
continue
|
|
1219
|
+
|
|
1220
|
+
# Final commit
|
|
1221
|
+
self.conn.commit()
|
|
1222
|
+
|
|
1223
|
+
# Track file processing metadata for incremental updates
|
|
1224
|
+
self._track_file_processing(filename, file_path, records_processed)
|
|
1225
|
+
|
|
1226
|
+
return records_processed
|
|
1227
|
+
|
|
1228
|
+
except Exception as e:
|
|
1229
|
+
logger.error(f"Error processing file {filename}: {e}")
|
|
1230
|
+
return 0
|
|
1231
|
+
|
|
1232
|
+
    def _track_file_processing(self, filename, file_path, records_processed):
        """
        Track file processing metadata for incremental updates

        Args:
            filename: Name of the processed file
            file_path: Full path to the file
            records_processed: Number of records processed
        """
        try:
            # Get file metadata
            file_hash = self._get_file_hash(file_path)
            file_mtime = os.path.getmtime(file_path)
            file_size = os.path.getsize(file_path)

            # Store metadata
            self.set_metadata(f'file_hash_{filename}', file_hash or '')
            self.set_metadata(f'file_mtime_{filename}', str(file_mtime))
            self.set_metadata(f'file_size_{filename}', str(file_size))
            self.set_metadata(f'file_records_{filename}', str(records_processed))
            self.set_metadata(f'file_processed_{filename}', datetime.now(timezone.utc).isoformat())

        except Exception as e:
            logger.warning(f"Could not track file processing metadata for {filename}: {e}")

    def _insert_paper(self, cursor, paper_data):
        """
        Insert a single paper into the database

        Args:
            cursor: Database cursor
            paper_data: Paper data dictionary
        """
        # Skip empty or invalid records
        if not paper_data:
            return

        # Use corpusid as primary key if paperId not available
        paper_id = paper_data.get("paperId")
        if not paper_id:
            corpus_id = paper_data.get("corpusid") or paper_data.get("corpusId")
            if corpus_id:
                paper_id = str(corpus_id)  # Use corpus ID as paper ID
            else:
                return  # Skip if no ID available

        # Extract scalar fields (handle both camelCase and lowercase)
        corpus_id = paper_data.get("corpusId") or paper_data.get("corpusid")
        title = paper_data.get("title", "")
        normalized_title = self.normalize_paper_title(title)
        abstract = paper_data.get("abstract", "")
        venue = paper_data.get("venue", "")
        publication_venue_id = paper_data.get("publicationVenueId") or paper_data.get("publicationvenueid")
        year = paper_data.get("year")
        reference_count = paper_data.get("referenceCount") or paper_data.get("referencecount")
        citation_count = paper_data.get("citationCount") or paper_data.get("citationcount")
        influential_citation_count = paper_data.get("influentialCitationCount") or paper_data.get("influentialcitationcount")
        is_open_access = paper_data.get("isOpenAccess") or paper_data.get("isopenaccess")
        publication_date = paper_data.get("publicationDate") or paper_data.get("publicationdate")
        url = paper_data.get("url", "")

        # Extract external IDs (handle both camelCase and lowercase)
        external_ids = paper_data.get("externalIds") or paper_data.get("externalids") or {}
        external_mag = external_ids.get("MAG")
        external_corpus_id = external_ids.get("CorpusId")
        external_acl = external_ids.get("ACL")
        external_pubmed = external_ids.get("PubMed")
        external_doi = external_ids.get("DOI")
        external_pmc = external_ids.get("PubMedCentral")
        external_dblp = external_ids.get("DBLP")
        external_arxiv = external_ids.get("ArXiv")

        # Extract journal info
        journal = paper_data.get("journal", {}) or {}
        journal_name = journal.get("name", "")
        journal_pages = journal.get("pages")
        journal_volume = journal.get("volume")

        # Store complex fields as JSON (handle both camelCase and lowercase)
        authors_json = json.dumps(paper_data.get("authors", []))
        s2_fields_json = json.dumps(paper_data.get("s2FieldsOfStudy") or paper_data.get("s2fieldsofstudy") or [])
        pub_types_json = json.dumps(paper_data.get("publicationTypes") or paper_data.get("publicationtypes") or [])

        # Full JSON for complete access
        full_json = json.dumps(paper_data)

        # Insert or replace the paper
        cursor.execute("""
            INSERT OR REPLACE INTO papers VALUES (
                ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
                ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
                ?, ?, ?, ?
            )
        """, (
            paper_id, corpus_id, title, normalized_title, abstract, venue, publication_venue_id,
            year, reference_count, citation_count, influential_citation_count,
            is_open_access, publication_date, url,
            external_mag, external_corpus_id, external_acl, external_pubmed,
            external_doi, external_pmc, external_dblp, external_arxiv,
            journal_name, journal_pages, journal_volume,
            authors_json, s2_fields_json, pub_types_json, full_json
        ))

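The .get() lookups above imply the record shape _insert_paper can handle. A minimal illustrative record, with invented values and only a subset of the keys the method reads:

example_record = {
    "paperId": "0123456789abcdef",                  # falls back to corpusid when absent
    "corpusid": 123456,
    "title": "An Example Paper",
    "abstract": "A short abstract.",
    "venue": "Example Conference",
    "year": 2020,
    "referenceCount": 12,
    "citationCount": 3,
    "isOpenAccess": True,
    "externalIds": {"DOI": "10.1234/example", "ArXiv": "2001.00001"},
    "journal": {"name": "Example Journal", "volume": "1", "pages": "1-10"},
    "authors": [{"name": "Ada Lovelace"}],
    "s2FieldsOfStudy": [],
    "publicationTypes": [],
}
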
    def close(self):
        """Close the database connection"""
        if hasattr(self, 'conn') and self.conn:
            # Optimize database before closing
            self.conn.execute("PRAGMA optimize")
            self.conn.close()

        if hasattr(self, 'session') and self.session:
            self.session.close()

    def download_dataset_files(self):
        """
        Download the official Semantic Scholar dataset files

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            logger.info("Downloading Semantic Scholar dataset files...")

            # Get the latest release ID
            latest_release = self.get_latest_release_id()
            logger.info(f"Latest release: {latest_release}")

            # List files for the latest release
            files = self.list_files(latest_release, dataset="papers")
            if not files:
                logger.error("No files found for the latest release")
                return False

            logger.info(f"Found {len(files)} files to download")

            # Download files
            downloaded_count = 0
            for file_meta in files:
                try:
                    path, updated = self.download_file(file_meta)
                    if updated:
                        downloaded_count += 1
                        logger.info(f"Downloaded: {path}")
                    else:
                        logger.info(f"Skipped (not modified): {path}")
                except Exception as e:
                    logger.error(f"Error downloading {file_meta.get('path', 'unknown')}: {e}")
                    continue

            logger.info(f"Downloaded {downloaded_count} files out of {len(files)} total files")
            return downloaded_count > 0

        except Exception as e:
            logger.error(f"Error downloading dataset files: {e}")
            return False

    def list_files(self, release_id: str, dataset: str = "papers") -> list[dict]:
        """
        List all files for a given release and dataset.

        Args:
            release_id: Release ID
            dataset: Dataset name (default: "papers")

        Returns:
            list: List of file metadata dictionaries
        """
        logger.info(f"Requesting file list for release {release_id}, dataset {dataset}...")

        try:
            url = f"https://api.semanticscholar.org/datasets/v1/release/{release_id}/dataset/{dataset}"
            headers = {}
            if self.api_key:
                headers["x-api-key"] = self.api_key

            response = self.session.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            data = response.json()
            files = data.get("files", [])

            # Convert URL-based files to structured format
            structured_files = []
            for file_item in files:
                if isinstance(file_item, str):
                    # File is a URL string - extract filename and create structure
                    import urllib.parse
                    parsed_url = urllib.parse.urlparse(file_item)
                    filename = parsed_url.path.split('/')[-1]

                    structured_files.append({
                        'path': filename,
                        'url': file_item,
                        'size': 0  # Size not available from URL format
                    })
                elif isinstance(file_item, dict):
                    # File is already structured
                    structured_files.append(file_item)

            logger.info(f"Retrieved {len(structured_files)} files from API")
            return structured_files

        except Exception as e:
            logger.error(f"Error listing files: {e}")
            return []

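The datasets endpoint queried by list_files can also be exercised on its own. A hedged standalone sketch using the URL pattern and x-api-key header shown above; the release ID is a placeholder, and the method above accepts "files" entries either as URL strings or as already-structured dicts:

import requests

release_id = "latest"  # placeholder; get_latest_release_id() supplies the real value
url = f"https://api.semanticscholar.org/datasets/v1/release/{release_id}/dataset/papers"
headers = {}  # add {"x-api-key": "..."} when an API key is available

resp = requests.get(url, headers=headers, timeout=30)
resp.raise_for_status()
for file_item in resp.json().get("files", []):
    print(file_item)  # typically a download URL string, sometimes a structured dict
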
    def download_file(self, file_meta):
        """
        Download a single file from the dataset

        Args:
            file_meta: File metadata dictionary

        Returns:
            tuple: (file_path, was_updated)
        """
        url = file_meta["url"]
        local_path = os.path.join(self.output_dir, file_meta["path"])
        os.makedirs(os.path.dirname(local_path), exist_ok=True)

        # Get file size for progress tracking
        file_size = file_meta.get("size", 0)
        file_name = file_meta["path"]

        headers = {}
        # Use conditional request if we have Last-Modified stored
        if os.path.exists(local_path + ".meta"):
            last_mod = open(local_path + ".meta").read().strip()
            headers["If-Modified-Since"] = last_mod

        logger.info(f"Downloading {file_name} ({self._format_size(file_size)})")
        start_time = time.time()

        resp = self.session.get(url, headers=headers, stream=True, timeout=300)
        if resp.status_code == 304:
            logger.info(f"{file_meta['path']} not modified, skipping.")
            return file_meta["path"], False
        resp.raise_for_status()

        # Get actual content length from response headers if available
        content_length = int(resp.headers.get('Content-Length', file_size or 0))

        # Save file with progress tracking
        downloaded = 0
        with open(local_path, "wb") as f_out:
            for chunk in resp.iter_content(8192):
                f_out.write(chunk)
                downloaded += len(chunk)

        download_time = time.time() - start_time
        download_speed = downloaded / download_time if download_time > 0 else 0

        logger.info(f"Downloaded {file_name}: {self._format_size(downloaded)} in {download_time:.1f}s "
                    f"({self._format_size(download_speed)}/s)")

        # Save new Last-Modified
        last_mod = resp.headers.get("Last-Modified", datetime.now(timezone.utc).strftime("%a, %d %b %Y %H:%M:%S GMT"))
        with open(local_path + ".meta", "w") as m:
            m.write(last_mod)
        return file_meta["path"], True

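The .meta sidecar written above stores only the Last-Modified header, so a later run can issue a conditional request and skip unchanged files on a 304 response. A standalone sketch of that round trip, assuming an illustrative local path and URL:

import os
import requests

local_path = "semantic_scholar_db/papers-part0.jsonl.gz"   # illustrative
url = "https://example.org/papers-part0.jsonl.gz"          # illustrative

headers = {}
if os.path.exists(local_path + ".meta"):
    with open(local_path + ".meta") as m:
        headers["If-Modified-Since"] = m.read().strip()

resp = requests.get(url, headers=headers, stream=True, timeout=300)
if resp.status_code == 304:
    print("Not modified; keeping the local copy.")
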
    def _format_size(self, size_bytes):
        """Format file size in human readable format."""
        if size_bytes == 0:
            return "0 B"

        size_names = ["B", "KB", "MB", "GB", "TB"]
        i = 0
        size = float(size_bytes)

        while size >= 1024.0 and i < len(size_names) - 1:
            size /= 1024.0
            i += 1

        return f"{size:.1f} {size_names[i]}"

    def _process_incremental_file(self, file_url, operation_type):
        """
        Process a single incremental diff file (either updates or deletes)

        Args:
            file_url: URL of the diff file to process
            operation_type: Either "update" or "delete"

        Returns:
            int: Number of records processed
        """
        try:
            logger.info(f"Processing {operation_type} file: {file_url}")

            # Download the file content
            response = self.session.get(file_url, stream=True, timeout=300)
            response.raise_for_status()

            records_processed = 0
            cursor = self.conn.cursor()

            # Begin transaction for better performance
            self.conn.execute("BEGIN TRANSACTION")

            try:
                # Process the file line by line
                for line_num, line in enumerate(response.iter_lines(decode_unicode=True), 1):
                    if not line.strip():
                        continue

                    try:
                        record = json.loads(line.strip())

                        if operation_type == "update":
                            # Insert or update the record
                            self._insert_paper(cursor, record)
                        elif operation_type == "delete":
                            # Delete the record by primary key
                            paper_id = record.get("paperId")
                            if not paper_id:
                                # Fallback to corpusId if paperId not available
                                corpus_id = record.get("corpusid") or record.get("corpusId")
                                if corpus_id:
                                    paper_id = str(corpus_id)

                            if paper_id:
                                cursor.execute("DELETE FROM papers WHERE paperId = ?", (paper_id,))

                        records_processed += 1

                        # Commit periodically for large files
                        if records_processed % 10000 == 0:
                            self.conn.commit()
                            self.conn.execute("BEGIN TRANSACTION")
                            logger.info(f"Processed {records_processed:,} {operation_type} records")

                    except json.JSONDecodeError as e:
                        logger.warning(f"Invalid JSON on line {line_num} in {operation_type} file: {e}")
                        continue
                    except Exception as e:
                        logger.error(f"Error processing line {line_num} in {operation_type} file: {e}")
                        continue

                # Final commit
                self.conn.commit()
                logger.info(f"Completed processing {records_processed:,} {operation_type} records")

            except Exception as e:
                self.conn.rollback()
                raise e

            return records_processed

        except Exception as e:
            logger.error(f"Error processing {operation_type} file {file_url}: {e}")
            return 0

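The loop above routes each line by operation_type: "update" lines carry full paper records handed to _insert_paper, while "delete" lines only need an identifier, with corpusid used as the fallback key. Minimal illustrative record shapes (values invented):

update_record = {"paperId": "0123456789abcdef", "title": "An Example Paper", "year": 2020}

delete_by_paper_id = {"paperId": "0123456789abcdef"}
delete_by_corpus_id = {"corpusid": 123456}  # deleted as str(123456) when paperId is missing
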
def main():
    """Main function"""
    parser = argparse.ArgumentParser(description="Download and process Semantic Scholar paper metadata")
    parser.add_argument("--output-dir", type=str, default="semantic_scholar_db",
                        help="Directory to store the database (default: semantic_scholar_db)")
    parser.add_argument("--batch-size", type=int, default=10,
                        help="Number of papers to download in each batch (default: 10 for unauthenticated requests, can increase with API key)")
    parser.add_argument("--api-key", type=str,
                        help="Semantic Scholar API key (optional, increases rate limits)")
    parser.add_argument("--fields", type=str,
                        default="id,title,authors,year,externalIds,url,abstract",
                        help="Comma-separated list of fields to include")

    # Dataset download options
    parser.add_argument("--download-dataset", action="store_true",
                        help="Download the official Semantic Scholar dataset files (.gz)")
    parser.add_argument("--process-local-files", action="store_true",
                        help="Process existing .gz files in the output directory into the database")
    parser.add_argument("--force-reprocess", action="store_true",
                        help="Force reprocessing of all files (use with --process-local-files)")

    # Legacy API-based search options (for backwards compatibility)
    parser.add_argument("--query", type=str,
                        help="Search query to download papers via API")
    parser.add_argument("--start-year", type=int,
                        help="Start year for downloading papers by year range via API")
    parser.add_argument("--end-year", type=int,
                        help="End year for downloading papers by year range via API")
    parser.add_argument("--field", type=str,
                        help="Field or subject area for downloading papers by field via API")
    parser.add_argument("--limit", type=int, default=100000,
                        help="Maximum number of papers to download via API (default: 100000)")
    parser.add_argument("--threads", type=int, default=1,
                        help="Number of threads for parallel processing (default: 1)")

    args = parser.parse_args()

    # Initialize downloader
    downloader = SemanticScholarDownloader(
        output_dir=args.output_dir,
        batch_size=args.batch_size,
        api_key=args.api_key,
        fields=args.fields.split(",") if args.fields else None
    )

    try:
        # Check if database exists
        db_exists = os.path.exists(downloader.db_path)

        # Determine what to do based on arguments and database state
        if args.download_dataset:
            logger.info("Downloading dataset files only")
            success = downloader.download_dataset_files()
            if not success:
                logger.error("Failed to download dataset files")
                return 1

        elif args.process_local_files:
            logger.info("Processing local files mode")
            downloader.process_local_files(
                force_reprocess=args.force_reprocess,
                incremental=db_exists  # Only incremental if DB exists
            )

        elif args.query or args.start_year or args.end_year or args.field:
            # Legacy API-based search
            logger.info("Using legacy API-based paper search")
            paper_ids = downloader.search_papers(
                query=args.query,
                start_year=args.start_year,
                end_year=args.end_year,
                field=args.field,
                limit=args.limit
            )

            # Download papers
            downloader.download_papers(paper_ids)

        else:
            # Default behavior: automatic full or incremental based on DB state
            if not db_exists:
                logger.info("No database found - performing full download")
                success = downloader.download_dataset_files()
                if not success:
                    logger.error("Failed to download dataset files")
                    return 1
                downloader.process_local_files(incremental=False)
            else:
                logger.info("Database exists - checking for new or updated data (incremental update)")
                # Check if there are any .gz files to process
                gz_files = []
                for root, dirs, files in os.walk(args.output_dir):
                    for file in files:
                        if file.endswith('.gz'):
                            gz_files.append(os.path.join(root, file))

                if gz_files:
                    logger.info(f"Found {len(gz_files)} .gz files to process")
                    downloader.process_local_files(
                        force_reprocess=args.force_reprocess,
                        incremental=True
                    )
                else:
                    logger.info("No .gz files found - downloading latest dataset")
                    success = downloader.download_dataset_files()
                    if not success:
                        logger.error("Failed to download dataset files")
                        return 1
                    downloader.process_local_files(incremental=True)

        logger.info(f"Completed processing in {args.output_dir}")

        # Show database statistics
        cursor = downloader.conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM papers")
        count = cursor.fetchone()[0]
        logger.info(f"Total papers in database: {count:,}")

        # Show metadata if available
        if db_exists:
            last_update = downloader.get_last_update_time()
            last_release = downloader.get_last_release_id()
            if last_update:
                logger.info(f"Last update timestamp: {last_update}")
            if last_release:
                logger.info(f"Current release: {last_release}")

        return 0

    except KeyboardInterrupt:
        logger.info("Download interrupted by user")
        return 1
    except Exception as e:
        logger.error(f"Error during processing: {e}")
        return 1
    finally:
        # Close database connection
        downloader.close()

if __name__ == "__main__":
    main()
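For completeness, the default path through main() (full download of the latest release, then building the SQLite database) can also be driven programmatically. A hedged sketch, assuming the constructor keywords used in main() and the process_local_files() method defined earlier in this module:

from refchecker.database.download_semantic_scholar_db import SemanticScholarDownloader

downloader = SemanticScholarDownloader(
    output_dir="semantic_scholar_db",
    batch_size=10,
    api_key=None,
    fields=None,
)
try:
    if downloader.download_dataset_files():                 # fetch the latest release's .gz shards
        downloader.process_local_files(incremental=False)   # load them into the papers table
finally:
    downloader.close()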