qlever 0.2.5__py3-none-any.whl → 0.5.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- qlever/Qleverfiles/Qleverfile.dblp +36 -0
- qlever/Qleverfiles/Qleverfile.dblp-plus +33 -0
- qlever/Qleverfiles/Qleverfile.dbpedia +30 -0
- qlever/Qleverfiles/Qleverfile.default +51 -0
- qlever/Qleverfiles/Qleverfile.dnb +40 -0
- qlever/Qleverfiles/Qleverfile.fbeasy +29 -0
- qlever/Qleverfiles/Qleverfile.freebase +28 -0
- qlever/Qleverfiles/Qleverfile.imdb +36 -0
- qlever/Qleverfiles/Qleverfile.ohm-planet +41 -0
- qlever/Qleverfiles/Qleverfile.olympics +31 -0
- qlever/Qleverfiles/Qleverfile.orkg +30 -0
- qlever/Qleverfiles/Qleverfile.osm-country +39 -0
- qlever/Qleverfiles/Qleverfile.osm-planet +39 -0
- qlever/Qleverfiles/Qleverfile.osm-planet-from-pbf +42 -0
- qlever/Qleverfiles/Qleverfile.pubchem +131 -0
- qlever/Qleverfiles/Qleverfile.scientists +29 -0
- qlever/Qleverfiles/Qleverfile.uniprot +74 -0
- qlever/Qleverfiles/Qleverfile.vvz +31 -0
- qlever/Qleverfiles/Qleverfile.wikidata +42 -0
- qlever/Qleverfiles/Qleverfile.wikipathways +40 -0
- qlever/Qleverfiles/Qleverfile.yago-4 +33 -0
- qlever/__init__.py +44 -1380
- qlever/command.py +87 -0
- qlever/commands/__init__.py +0 -0
- qlever/commands/add_text_index.py +115 -0
- qlever/commands/benchmark_queries.py +1019 -0
- qlever/commands/cache_stats.py +125 -0
- qlever/commands/clear_cache.py +88 -0
- qlever/commands/extract_queries.py +120 -0
- qlever/commands/get_data.py +48 -0
- qlever/commands/index.py +333 -0
- qlever/commands/index_stats.py +306 -0
- qlever/commands/log.py +66 -0
- qlever/commands/materialized_view.py +110 -0
- qlever/commands/query.py +142 -0
- qlever/commands/rebuild_index.py +176 -0
- qlever/commands/reset_updates.py +59 -0
- qlever/commands/settings.py +115 -0
- qlever/commands/setup_config.py +97 -0
- qlever/commands/start.py +336 -0
- qlever/commands/status.py +50 -0
- qlever/commands/stop.py +90 -0
- qlever/commands/system_info.py +130 -0
- qlever/commands/ui.py +271 -0
- qlever/commands/update.py +90 -0
- qlever/commands/update_wikidata.py +1204 -0
- qlever/commands/warmup.py +41 -0
- qlever/config.py +223 -0
- qlever/containerize.py +167 -0
- qlever/log.py +55 -0
- qlever/qlever_main.py +79 -0
- qlever/qleverfile.py +530 -0
- qlever/util.py +330 -0
- qlever-0.5.41.dist-info/METADATA +127 -0
- qlever-0.5.41.dist-info/RECORD +59 -0
- {qlever-0.2.5.dist-info → qlever-0.5.41.dist-info}/WHEEL +1 -1
- qlever-0.5.41.dist-info/entry_points.txt +2 -0
- qlever-0.5.41.dist-info/top_level.txt +1 -0
- build/lib/qlever/__init__.py +0 -1383
- build/lib/qlever/__main__.py +0 -4
- qlever/__main__.py +0 -4
- qlever-0.2.5.dist-info/METADATA +0 -277
- qlever-0.2.5.dist-info/RECORD +0 -12
- qlever-0.2.5.dist-info/entry_points.txt +0 -2
- qlever-0.2.5.dist-info/top_level.txt +0 -4
- src/qlever/__init__.py +0 -1383
- src/qlever/__main__.py +0 -4
- {qlever-0.2.5.dist-info → qlever-0.5.41.dist-info/licenses}/LICENSE +0 -0
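The only hunk shown below is for `qlever/commands/update_wikidata.py` (+1204 lines), a new command that tails the Wikimedia RDF mutation stream over SSE and applies the changes to a QLever SPARQL endpoint in batches. As orientation before the full listing, here is a minimal sketch of the stream-consumption pattern the module is built around; the `requests_sse.EventSource` usage, headers, stream URL, and message fields are taken from the diff, while the helper name `peek_stream`, the example date, and the ten-message cap are illustrative only:

import json

import requests_sse  # same SSE client the new command uses


def peek_stream(url, since, topic, limit=10):
    # Illustrative helper (not part of the package): print offset and date
    # of a few mutation events, skipping non-message events and other topics,
    # just as the new command does before assembling a batch.
    source = requests_sse.EventSource(
        url,
        params={"since": since},
        headers={
            "Accept": "text/event-stream",
            "User-Agent": "qlever update-wikidata",
        },
    )
    source.connect()
    try:
        seen = 0
        for event in source:
            if event.type != "message" or not event.data:
                continue
            meta = json.loads(event.data).get("meta", {})
            if meta.get("topic") != topic:
                continue
            print(meta.get("offset"), meta.get("dt"))
            seen += 1
            if seen >= limit:
                break
    finally:
        source.close()


peek_stream(
    "https://stream.wikimedia.org/v2/stream/rdf-streaming-updater.mutation.v2",
    since="2024-01-01T00:00:00Z",  # illustrative date
    topic="eqiad.rdf-streaming-updater.mutation",
)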
qlever/commands/update_wikidata.py
@@ -0,0 +1,1204 @@
from __future__ import annotations

import json
import logging
import os
import re
import signal
import time
from datetime import datetime, timezone
from enum import Enum, auto

import rdflib.term
import requests_sse
from rdflib import Graph
from termcolor import colored
from tqdm.contrib.logging import tqdm_logging_redirect

from qlever.command import QleverCommand
from qlever.log import log
from qlever.util import run_command


# Monkey patch `rdflib.term._castLexicalToPython` to avoid casting of literals
# to Python types. We do not need it (all we want is to convert Turtle to
# N-Triples), and we can speed up parsing by a factor of about 2.
def custom_cast_lexical_to_python(lexical, datatype):
    return None  # Your desired behavior


rdflib.term._castLexicalToPython = custom_cast_lexical_to_python


def retry_with_backoff(operation, operation_name, max_retries, log):
    """
    Retry an operation with exponential backoff, see backoff intervals below
    (in seconds). Returns the result of the operation if successful, or raises
    the last exception.
    """
    backoff_intervals = [5, 10, 30, 60, 300, 900, 1800, 3600]

    for attempt in range(max_retries):
        try:
            return operation()
        except Exception as e:
            if attempt < max_retries - 1:
                # Use the appropriate backoff interval (once we get to the end
                # of the list, keep using the last interval).
                retry_delay = (
                    backoff_intervals[attempt]
                    if attempt < len(backoff_intervals)
                    else backoff_intervals[-1]
                )
                # Show the delay as seconds, minutes, or hours.
                if retry_delay >= 3600:
                    delay_str = f"{retry_delay // 3600}h"
                elif retry_delay >= 60:
                    delay_str = f"{retry_delay // 60}min"
                else:
                    delay_str = f"{retry_delay}s"
                log.warn(
                    f"{operation_name} failed (attempt {attempt + 1}/{max_retries}): {e}. "
                    f"Retrying in {delay_str} ..."
                )
                time.sleep(retry_delay)
            else:
                # If this was the last attempt, re-raise the exception.
                raise


def connect_to_sse_stream(sse_stream_url, since=None, event_id=None):
    """
    Connect to the SSE stream and return the connected EventSource.

    Args:
        sse_stream_url: URL of the SSE stream
        since: ISO date string to start from (mutually exclusive with event_id)
        event_id: Event ID to resume from (mutually exclusive with since)

    Returns:
        The connected EventSource object
    """
    if event_id:
        event_id_json = json.dumps(event_id)
        source = requests_sse.EventSource(
            sse_stream_url,
            headers={
                "Accept": "text/event-stream",
                "User-Agent": "qlever update-wikidata",
                "Last-Event-ID": event_id_json,
            },
        )
    else:
        source = requests_sse.EventSource(
            sse_stream_url,
            params={"since": since} if since else {},
            headers={
                "Accept": "text/event-stream",
                "User-Agent": "qlever update-wikidata",
            },
        )

    source.connect()
    return source


class UpdateWikidataCommand(QleverCommand):
    """
    Class for executing the `update` command.
    """

    def __init__(self):
        # SPARQL query to get the date until which the updates of the
        # SPARQL endpoint are complete.
        self.sparql_updates_complete_until_query = (
            "PREFIX wikibase: <http://wikiba.se/ontology#> "
            "PREFIX schema: <http://schema.org/> "
            "SELECT * WHERE { "
            "{ SELECT (MIN(?date_modified) AS ?updates_complete_until) { "
            "wikibase:Dump schema:dateModified ?date_modified } } "
            "UNION { wikibase:Dump wikibase:updatesCompleteUntil ?updates_complete_until } "
            "} ORDER BY DESC(?updates_complete_until) LIMIT 1"
        )
        # URL of the Wikidata SSE stream.
        self.wikidata_update_stream_url = (
            "https://stream.wikimedia.org/v2/"
            "stream/rdf-streaming-updater.mutation.v2"
        )
        # Remember if Ctrl+C was pressed, so we can handle it gracefully.
        self.ctrl_c_pressed = False
        # Set to `True` when finished.
        self.finished = False

    def description(self) -> str:
        return "Update from given SSE stream"

    def should_have_qleverfile(self) -> bool:
        return True

    def relevant_qleverfile_arguments(self) -> dict[str, list[str]]:
        return {"server": ["host_name", "port", "access_token"]}

    def additional_arguments(self, subparser) -> None:
        subparser.add_argument(
            "sse_stream_url",
            nargs="?",
            type=str,
            default=self.wikidata_update_stream_url,
            help="URL of the SSE stream to update from",
        )
        subparser.add_argument(
            "--batch-size",
            type=int,
            default=100000,
            help="Group this many messages together into one update "
            "(default: one update for each message); NOTE: this simply "
            "concatenates the `rdf_added_data` and `rdf_deleted_data` fields, "
            "which is not 100%% correct; as soon as chaining is supported, "
            "this will be fixed",
        )
        subparser.add_argument(
            "--lag-seconds",
            type=int,
            default=1,
            help="When a message is encountered that is within this many "
            "seconds of the current time, finish the current batch "
            "(and show a warning that this happened)",
        )
        subparser.add_argument(
            "--since",
            type=str,
            help="Consume stream messages since this date "
            "(default: determine automatically from the SPARQL endpoint)",
        )
        subparser.add_argument(
            "--until",
            type=str,
            help="Stop consuming stream messages when reaching this date "
            "(default: continue indefinitely)",
        )
        subparser.add_argument(
            "--offset",
            type=int,
            help="Consume stream messages starting from this offset "
            "(default: not set)",
        )
        subparser.add_argument(
            "--topic",
            type=str,
            choices=[
                "eqiad.rdf-streaming-updater.mutation",
                "codfw.rdf-streaming-updater.mutation",
            ],
            default="eqiad.rdf-streaming-updater.mutation",
            help="The topic to consume from the SSE stream (default: "
            "eqiad.rdf-streaming-updater.mutation)",
        )
        subparser.add_argument(
            "--partition",
            type=int,
            default=0,
            help="The partition to consume from the SSE stream (default: 0)",
        )
        subparser.add_argument(
            "--min-or-max-date",
            choices=["min", "max"],
            default="max",
            help="Use the minimum or maximum date of the batch for the "
            "`updatesCompleteUntil` property (default: maximum)",
        )
        subparser.add_argument(
            "--wait-between-batches",
            type=int,
            default=300,
            help="Wait this many seconds between batches that were "
            "finished due to a message that is within `lag_seconds` of "
            "the current time (default: 300s)",
        )
        subparser.add_argument(
            "--num-messages",
            type=int,
            help="Process exactly this many messages and then exit "
            "(default: no bound on the number of messages)",
        )
        subparser.add_argument(
            "--verbose",
            choices=["no", "yes"],
            default="yes",
            help='Verbose logging, "yes" or "no" (default: "yes")',
        )
        subparser.add_argument(
            "--use-cached-sparql-queries",
            action="store_true",
            help="Use cached SPARQL query files if they exist with matching "
            "offset and target batch size (default: off)",
        )
        subparser.add_argument(
            "--check-offset-before-each-batch",
            choices=["yes", "no"],
            default="yes",
            help="Before each batch, verify that the stream offset matches the "
            "stored offset in the knowledge base (default: yes)",
        )
        subparser.add_argument(
            "--num-retries",
            type=int,
            default=10,
            help="Number of retries for offset verification queries when they fail "
            "(default: 10)",
        )

    # Handle Ctrl+C gracefully by finishing the current batch and then exiting.
    def handle_ctrl_c(self, signal_received, frame):
        if self.ctrl_c_pressed:
            log.warn("\rCtrl+C pressed again, watch your blood pressure")
        else:
            self.ctrl_c_pressed = True

    def execute(self, args) -> bool:
        # cURL command to get the date until which the updates of the
        # SPARQL endpoint are complete.
        sparql_endpoint = f"http://{args.host_name}:{args.port}"
        curl_cmd_updates_complete_until = (
            f"curl -s {sparql_endpoint}"
            f' -H "Accept: text/csv"'
            f' -H "Content-type: application/sparql-query"'
            f' --data "{self.sparql_updates_complete_until_query}"'
        )

        # Construct the command and show it.
        cmd_description = []
        if args.since:
            cmd_description.append(f"SINCE={args.since}")
        else:
            cmd_description.append(
                f"SINCE=$({curl_cmd_updates_complete_until} | sed 1d)"
            )
        if args.until:
            cmd_description.append(f"UNTIL={args.until}")
        cmd_description.append(
            f"Process SSE stream from {args.sse_stream_url} "
            f"in batches of up to {args.batch_size:,} messages "
        )
        self.show("\n".join(cmd_description), only_show=args.show)
        if args.show:
            return True

        # Compute the `since` date if not given.
        if args.since:
            since = args.since
        else:
            try:
                since = run_command(
                    f"{curl_cmd_updates_complete_until} | sed 1d",
                    return_output=True,
                ).strip()
            except Exception as e:
                log.error(
                    f"Error running `{curl_cmd_updates_complete_until}`: {e}"
                )
                return False

        # Special handling of Ctrl+C, see `handle_ctrl_c` above.
        signal.signal(signal.SIGINT, self.handle_ctrl_c)
        log.warn("Press Ctrl+C to finish and exit gracefully")
        log.info("")

        # If --offset is not provided, first try to get the stored offset from
        # the knowledge base. Only fall back to date-based approach if no
        # offset is stored.
        if not args.offset:
            try:
                sparql_query_stored_offset = (
                    "PREFIX wikibase: <http://wikiba.se/ontology#> "
                    "SELECT (MAX(?offset) AS ?maxOffset) WHERE { "
                    "<http://wikiba.se/ontology#Dump> "
                    "wikibase:updateStreamNextOffset ?offset "
                    "}"
                )
                curl_cmd_get_stored_offset = (
                    f"curl -s {sparql_endpoint}"
                    f' -H "Accept: text/csv"'
                    f' -H "Content-type: application/sparql-query"'
                    f' --data "{sparql_query_stored_offset}"'
                )
                result = run_command(
                    f"{curl_cmd_get_stored_offset} | sed 1d",
                    return_output=True,
                ).strip()
                if result and result != '""':
                    args.offset = int(result.strip('"'))
                    log.info(
                        f"Resuming from stored offset in knowledge base: "
                        f"{args.offset}"
                    )
            except Exception as e:
                log.debug(
                    f"Could not retrieve stored offset from knowledge base: {e}. "
                    f"Will determine offset from date instead."
                )

        # If --offset is still not set, determine it by reading a single
        # message from the SSE stream using the `since` date.
        if not args.offset:
            try:
                source = retry_with_backoff(
                    lambda: connect_to_sse_stream(
                        args.sse_stream_url, since=since
                    ),
                    "SSE stream connection",
                    args.num_retries,
                    log,
                )
                offset = None
                for event in source:
                    if event.type == "message" and event.data:
                        event_data = json.loads(event.data)
                        event_topic = event_data.get("meta").get("topic")
                        if event_topic == args.topic:
                            offset = event_data.get("meta").get("offset")
                            log.debug(
                                f"Determined offset from date: {since} -> {offset}"
                            )
                            break
                source.close()
                if offset is None:
                    raise Exception(
                        f"No event with topic {args.topic} found in stream"
                    )
                args.offset = offset
            except Exception as e:
                log.error(f"Error determining offset from stream: {e}")
                return False

        # Initialize all the statistics variables.
        batch_count = 0
        total_num_messages = 0
        total_update_time = 0
        start_time = time.perf_counter()
        wait_before_next_batch = False
        event_id_for_next_batch = (
            [
                {
                    "topic": args.topic,
                    "partition": args.partition,
                    "offset": args.offset,
                }
            ]
            if args.offset
            else None
        )

        # Track whether this is the first batch (to skip offset check)
        first_batch = True

        # Main event loop: Either resume from `event_id_for_next_batch` (if set),
        # or start a new connection to `args.sse_stream_url` (with URL
        # parameter `?since=`).
|
|
399
|
+
# Optionally wait before processing the next batch (make sure that
|
|
400
|
+
# the wait is interruptible by Ctrl+C).
|
|
401
|
+
if wait_before_next_batch:
|
|
402
|
+
log.info(
|
|
403
|
+
f"Waiting {args.wait_between_batches} "
|
|
404
|
+
f"second{'s' if args.wait_between_batches > 1 else ''} "
|
|
405
|
+
f"before processing the next batch"
|
|
406
|
+
)
|
|
407
|
+
log.info("")
|
|
408
|
+
wait_before_next_batch = False
|
|
409
|
+
for _ in range(args.wait_between_batches):
|
|
410
|
+
if self.ctrl_c_pressed:
|
|
411
|
+
break
|
|
412
|
+
time.sleep(1)
|
|
413
|
+
if self.ctrl_c_pressed:
|
|
414
|
+
log.warn(
|
|
415
|
+
"\rCtrl+C pressed while waiting in between batches, "
|
|
416
|
+
"exiting"
|
|
417
|
+
)
|
|
418
|
+
break
|
|
419
|
+
|
|
420
|
+
# Start stream from either `event_id_for_next_batch` or `since`.
|
|
421
|
+
# We'll extract the offset for first_offset_in_batch later.
|
|
422
|
+
if event_id_for_next_batch:
|
|
423
|
+
event_id_json = json.dumps(event_id_for_next_batch)
|
|
424
|
+
if args.verbose == "yes":
|
|
425
|
+
log.info(
|
|
426
|
+
colored(
|
|
427
|
+
f"Consuming stream from event ID: {event_id_json}",
|
|
428
|
+
attrs=["dark"],
|
|
429
|
+
)
|
|
430
|
+
)
|
|
431
|
+
else:
|
|
432
|
+
if args.verbose == "yes":
|
|
433
|
+
log.info(
|
|
434
|
+
colored(
|
|
435
|
+
f"Consuming stream from date: {since}",
|
|
436
|
+
attrs=["dark"],
|
|
437
|
+
)
|
|
438
|
+
)
|
|
439
|
+
|
|
440
|
+
# Connect to the SSE stream with retry logic
|
|
441
|
+
try:
|
|
442
|
+
source = retry_with_backoff(
|
|
443
|
+
lambda: connect_to_sse_stream(
|
|
444
|
+
args.sse_stream_url,
|
|
445
|
+
since=since if not event_id_for_next_batch else None,
|
|
446
|
+
event_id=event_id_for_next_batch,
|
|
447
|
+
),
|
|
448
|
+
"SSE stream connection for batch processing",
|
|
449
|
+
args.num_retries,
|
|
450
|
+
log,
|
|
451
|
+
)
|
|
452
|
+
except Exception as e:
|
|
453
|
+
log.error(
|
|
454
|
+
f"Failed to connect to SSE stream after "
|
|
455
|
+
f"{args.num_retries} retry attempts, last error: {e}"
|
|
456
|
+
)
|
|
457
|
+
break
|
|
458
|
+
|
|
459
|
+
# Next comes the inner loop, which processes exactly one "batch" of
|
|
460
|
+
# messages. The batch is completed (simply using `break`) when either
|
|
461
|
+
# `args.batch_size` messages have been processed, or when one of a
|
|
462
|
+
# variety of conditions occur (Ctrl+C pressed, message within
|
|
463
|
+
# `args.lag_seconds` of current time, delete operation followed by
|
|
464
|
+
# insert of triple with that entity as subject).
|
|
465
|
+
|
|
466
|
+
# Initialize all the batch variables.
|
|
467
|
+
current_batch_size = 0
|
|
468
|
+
# Extract the offset from the event ID to use as the starting offset
|
|
469
|
+
# for this batch. This is set before processing any messages.
|
|
470
|
+
if event_id_for_next_batch:
|
|
471
|
+
first_offset_in_batch = event_id_for_next_batch[0]["offset"]
|
|
472
|
+
event_id_for_next_batch = None
|
|
473
|
+
else:
|
|
474
|
+
# This should not happen since we now always determine the offset
|
|
475
|
+
# before starting, but keep as fallback
|
|
476
|
+
first_offset_in_batch = None
|
|
477
|
+
|
|
478
|
+
# Check that the stream offset matches the stored offset in the KB
|
|
479
|
+
# Skip this check on the first batch (when using --offset to resume)
|
|
480
|
+
if (
|
|
481
|
+
args.check_offset_before_each_batch == "yes"
|
|
482
|
+
and not first_batch
|
|
483
|
+
and first_offset_in_batch is not None
|
|
484
|
+
):
|
|
485
|
+
sparql_query_offset = (
|
|
486
|
+
"PREFIX wikibase: <http://wikiba.se/ontology#> "
|
|
487
|
+
"SELECT (MAX(?offset) AS ?maxOffset) WHERE { "
|
|
488
|
+
"<http://wikiba.se/ontology#Dump> "
|
|
489
|
+
"wikibase:updateStreamNextOffset ?offset "
|
|
490
|
+
"}"
|
|
491
|
+
)
|
|
492
|
+
curl_cmd_check_offset = (
|
|
493
|
+
f"curl -s {sparql_endpoint}"
|
|
494
|
+
f' -H "Accept: text/csv"'
|
|
495
|
+
f' -H "Content-type: application/sparql-query"'
|
|
496
|
+
f' --data "{sparql_query_offset}"'
|
|
497
|
+
)
|
|
498
|
+
# Verify offset with retry logic
|
|
499
|
+
try:
|
|
500
|
+
result = retry_with_backoff(
|
|
501
|
+
lambda: run_command(
|
|
502
|
+
f"{curl_cmd_check_offset} | sed 1d",
|
|
503
|
+
return_output=True,
|
|
504
|
+
).strip(),
|
|
505
|
+
"Offset verification",
|
|
506
|
+
args.num_retries,
|
|
507
|
+
log,
|
|
508
|
+
)
|
|
509
|
+
if not result:
|
|
510
|
+
log.error(
|
|
511
|
+
"Failed to retrieve stored offset from knowledge base: "
|
|
512
|
+
"query returned no results. This might be the first update, "
|
|
513
|
+
"or the offset triple is missing."
|
|
514
|
+
)
|
|
515
|
+
return False
|
|
516
|
+
stored_offset = int(result.strip('"'))
|
|
517
|
+
if stored_offset != first_offset_in_batch:
|
|
518
|
+
log.error(
|
|
519
|
+
f"Offset mismatch: stream offset is {first_offset_in_batch}, "
|
|
520
|
+
f"but stored offset in knowledge base is {stored_offset}. "
|
|
521
|
+
f"This indicates that updates may have been applied "
|
|
522
|
+
f"out of order or some updates are missing."
|
|
523
|
+
)
|
|
524
|
+
return False
|
|
525
|
+
except Exception as e:
|
|
526
|
+
log.error(
|
|
527
|
+
f"Failed to retrieve or verify stored offset from "
|
|
528
|
+
f"SPARQL endpoint after {args.num_retries} retry; "
|
|
529
|
+
f"last error: {e}"
|
|
530
|
+
)
|
|
531
|
+
return False
|
|
532
|
+
|
|
533
|
+
date_list = []
|
|
534
|
+
delete_entity_ids = set()
|
|
535
|
+
delta_to_now_list = []
|
|
536
|
+
batch_assembly_start_time = time.perf_counter()
|
|
537
|
+
insert_triples = set()
|
|
538
|
+
delete_triples = set()
|
|
539
|
+
|
|
540
|
+
# Check if we can use a cached SPARQL query file
|
|
541
|
+
use_cached_file = False
|
|
542
|
+
cached_file_name = None
|
|
543
|
+
cached_meta_file_name = None
|
|
544
|
+
cached_date_range = None
|
|
545
|
+
if (
|
|
546
|
+
args.use_cached_sparql_queries
|
|
547
|
+
and first_offset_in_batch is not None
|
|
548
|
+
):
|
|
549
|
+
cached_file_name = (
|
|
550
|
+
f"update.{first_offset_in_batch}.{args.batch_size}.sparql"
|
|
551
|
+
)
|
|
552
|
+
cached_meta_file_name = (
|
|
553
|
+
f"update.{first_offset_in_batch}.{args.batch_size}.meta"
|
|
554
|
+
)
|
|
555
|
+
if os.path.exists(cached_file_name):
|
|
556
|
+
use_cached_file = True
|
|
557
|
+
# Try to read metadata file for date range
|
|
558
|
+
if os.path.exists(cached_meta_file_name):
|
|
559
|
+
try:
|
|
560
|
+
with open(cached_meta_file_name, "r") as f:
|
|
561
|
+
cached_date_range = f.read().strip()
|
|
562
|
+
except Exception:
|
|
563
|
+
cached_date_range = None
|
|
564
|
+
|
|
565
|
+
if args.verbose == "yes":
|
|
566
|
+
log_msg = f"Using cached SPARQL query file: {cached_file_name}"
|
|
567
|
+
if cached_date_range:
|
|
568
|
+
log_msg += f" [date range: {cached_date_range}]"
|
|
569
|
+
log.info(colored(log_msg, "cyan"))
|
|
570
|
+
|
|
571
|
+
# Process one event at a time (unless using cached file).
|
|
572
|
+
if not use_cached_file:
|
|
573
|
+
with tqdm_logging_redirect(
|
|
574
|
+
loggers=[logging.getLogger("qlever")],
|
|
575
|
+
desc="Batch",
|
|
576
|
+
total=args.batch_size,
|
|
577
|
+
leave=False,
|
|
578
|
+
bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}{postfix}",
|
|
579
|
+
) as pbar:
|
|
580
|
+
                    for event in source:
                        # Skip events that are not of type `message` (should not
                        # happen), have no field `data` (should not happen either), or
                        # where the topic is not in `args.topics` (one topic by itself
                        # should provide all relevant updates).
                        if event.type != "message" or not event.data:
                            continue
                        event_data = json.loads(event.data)
                        topic = event_data.get("meta").get("topic")
                        if topic != args.topic:
                            continue

                        try:
                            # Extract offset, topic, and partition from the message metadata
                            # to construct a precise event ID for resuming.
                            meta = event_data.get("meta")
                            offset = meta.get("offset")
                            topic = meta.get("topic")
                            partition = meta.get("partition")

                            # Get the date (rounded *down* to seconds).
                            date = meta.get("dt")
                            date = re.sub(r"\.\d*Z$", "Z", date)

                            # Get the other relevant fields from the message.
                            entity_id = event_data.get("entity_id")
                            operation = event_data.get("operation")
                            rdf_added_data = event_data.get("rdf_added_data")
                            rdf_deleted_data = event_data.get(
                                "rdf_deleted_data"
                            )
                            rdf_linked_shared_data = event_data.get(
                                "rdf_linked_shared_data"
                            )
                            rdf_unlinked_shared_data = event_data.get(
                                "rdf_unlinked_shared_data"
                            )

                            # Check batch completion conditions BEFORE processing the
                            # data of this message. If any of the conditions is met,
                            # we finish the batch and resume from the LAST PROCESSED
                            # message (not the current one that triggered the break).
                            #
                            # NOTE: We will update event_id_for_next_batch AFTER
                            # successfully processing each message (see below), so that
                            # when we break, it contains the last processed event ID.
                            since = None

                            # Condition 1: Delete followed by insert for same entity.
                            operation_adds_data = (
                                rdf_added_data is not None
                                or rdf_linked_shared_data is not None
                            )
                            if (
                                operation_adds_data
                                and entity_id in delete_entity_ids
                            ):
                                if args.verbose == "yes":
                                    log.warn(
                                        f"Encountered operation that adds data for "
                                        f"an entity ID ({entity_id}) that was deleted "
                                        f"earlier in this batch; finishing batch and "
                                        f"resuming from this message in the next batch"
                                    )
                                break

                            # Condition 2: Batch size or limit on number of
                            # messages reached.
                            if current_batch_size >= args.batch_size or (
                                args.num_messages is not None
                                and total_num_messages >= args.num_messages
                            ):
                                break

                            # Condition 3: Message close to current time.
                            date_obj = datetime.strptime(
                                date, "%Y-%m-%dT%H:%M:%SZ"
                            ).replace(tzinfo=timezone.utc)
                            date_as_epoch_s = date_obj.timestamp()

                            now_as_epoch_s = time.time()
                            delta_to_now_s = now_as_epoch_s - date_as_epoch_s
                            if (
                                delta_to_now_s < args.lag_seconds
                                and current_batch_size > 0
                            ):
                                if args.verbose == "yes":
                                    log.warn(
                                        f"Encountered message with date {date}, which is within "
                                        f"{args.lag_seconds} "
                                        f"second{'s' if args.lag_seconds > 1 else ''} "
                                        f"of the current time, finishing the current batch"
                                    )
                                wait_before_next_batch = (
                                    args.wait_between_batches is not None
                                    and args.wait_between_batches > 0
                                )
                                break

                            # Condition 4: Reached `--until` date and at least one
                            # message was processed.
                            if (
                                args.until
                                and date >= args.until
                                and current_batch_size > 0
                            ):
                                log.warn(
                                    f"Reached --until date {args.until} "
                                    f"(message date: {date}), that's it folks"
                                )
                                self.finished = True
                                break

                            # Delete operations are postponed until the end of the
                            # batch, so remember the entity ID here.
                            if operation == "delete":
                                delete_entity_ids.add(entity_id)

                            # Replace each occurrence of `\\` by `\u005C\u005C`
                            # (which is twice the Unicode for backslash).
                            #
                            # NOTE: Strictly speaking, it would be enough to do
                            # this for two backslashes followed by a `u`, but
                            # doing it for all double backslashes does not
                            # harm. When parsing a SPARQL query, then according
                            # to the standard, first all occurrences of `\uxxxx`
                            # (where `xxxx` are four hex digits) are replaced
                            # by the corresponding Unicode character. That is a
                            # problem when `\\uxxxx` occurs in a literal,
                            # because then it would be replaced by `\` followed
                            # by the Unicode character, which is invalid
                            # SPARQL. The substitution avoids that problem.
                            def node_to_sparql(node: rdflib.term.Node) -> str:
                                return node.n3().replace(
                                    "\\\\", "\\u005C\\u005C"
                                )

                            # Process the to-be-deleted triples.
                            for rdf_to_be_deleted in (
                                rdf_deleted_data,
                                rdf_unlinked_shared_data,
                            ):
                                if rdf_to_be_deleted is not None:
                                    try:
                                        rdf_to_be_deleted_data = (
                                            rdf_to_be_deleted.get("data")
                                        )
                                        graph = Graph()
                                        log.debug(
                                            f"RDF to_be_deleted data: {rdf_to_be_deleted_data}"
                                        )
                                        graph.parse(
                                            data=rdf_to_be_deleted_data,
                                            format="turtle",
                                        )
                                        for s, p, o in graph:
                                            triple = f"{s.n3()} {p.n3()} {node_to_sparql(o)}"
                                            # NOTE: In case there was a previous `insert` of that
                                            # triple, it is safe to remove that `insert`, but not
                                            # the `delete` (in case the triple is contained in the
                                            # original data).
                                            if triple in insert_triples:
                                                insert_triples.remove(triple)
                                            delete_triples.add(triple)
                                    except Exception as e:
                                        log.error(
                                            f"Error reading `rdf_to_be_deleted_data`: {e}"
                                        )
                                        return False

                            # Process the to-be-added triples.
                            for rdf_to_be_added in (
                                rdf_added_data,
                                rdf_linked_shared_data,
                            ):
                                if rdf_to_be_added is not None:
                                    try:
                                        rdf_to_be_added_data = (
                                            rdf_to_be_added.get("data")
                                        )
                                        graph = Graph()
                                        log.debug(
                                            f"RDF to be added data: {rdf_to_be_added_data}"
                                        )
                                        graph.parse(
                                            data=rdf_to_be_added_data,
                                            format="turtle",
                                        )
                                        for s, p, o in graph:
                                            triple = f"{s.n3()} {p.n3()} {node_to_sparql(o)}"
                                            # NOTE: In case there was a previous `delete` of that
                                            # triple, it is safe to remove that `delete`, but not
                                            # the `insert` (in case the triple is not contained in
                                            # the original data).
                                            if triple in delete_triples:
                                                delete_triples.remove(triple)
                                            insert_triples.add(triple)
                                    except Exception as e:
                                        log.error(
                                            f"Error reading `rdf_to_be_added_data`: {e}"
                                        )
                                        return False

                        except Exception as e:
                            log.error(f"Error reading data from message: {e}")
                            log.info(event)
                            continue

                        # Message was successfully processed, update batch tracking
                        current_batch_size += 1
                        total_num_messages += 1
                        pbar_update_frequency = 100
                        if (current_batch_size % pbar_update_frequency) == 0:
                            pbar.set_postfix(
                                {
                                    "Time": date_obj.strftime(
                                        "%Y-%m-%d %H:%M:%S"
                                    )
                                }
                            )
                            pbar.update(pbar_update_frequency)
                        log.debug(
                            f"DATE: {date_as_epoch_s:.0f} [{date}], "
                            f"NOW: {now_as_epoch_s:.0f}, "
                            f"DELTA: {now_as_epoch_s - date_as_epoch_s:.0f}"
                        )
                        date_list.append(date)
                        delta_to_now_list.append(delta_to_now_s)

                        # Update the event ID for the next batch. We increment the
                        # offset by 1 so that the next batch starts with the next
                        # message (not re-processing the current one).
                        event_id_for_next_batch = [
                            {
                                "topic": topic,
                                "partition": partition,
                                "offset": offset + 1,
                            }
                        ]

                        # Ctrl+C finishes the current batch (this should come at the
                        # end of the inner event loop so that always at least one
                        # message is processed).
                        if self.ctrl_c_pressed:
                            log.warn(
                                "\rCtrl+C pressed while processing a batch, "
                                "finishing it and exiting"
                            )
                            break
            else:
                # Using cached file - set batch size and calculate next offset
                current_batch_size = args.batch_size
                total_num_messages += current_batch_size
                event_id_for_next_batch = [
                    {
                        "topic": args.topic,
                        "partition": args.partition,
                        "offset": first_offset_in_batch + current_batch_size,
                    }
                ]

            # Process the current batch of messages (or skip if using cached).
            batch_count += 1
            if not use_cached_file:
                batch_assembly_end_time = time.perf_counter()
                batch_assembly_time_ms = int(
                    1000
                    * (batch_assembly_end_time - batch_assembly_start_time)
                )
                date_list.sort()
                delta_to_now_list.sort()
                min_delta_to_now_s = delta_to_now_list[0]
                if min_delta_to_now_s < 10:
                    min_delta_to_now_s = f"{min_delta_to_now_s:.1f}"
                else:
                    min_delta_to_now_s = f"{int(min_delta_to_now_s):,}"
                log.info(
                    f"Assembled batch #{batch_count}, "
                    f"#messages: {current_batch_size:2,}, "
                    f"date range: {date_list[0]} - {date_list[-1]} "
                    f"[assembly time: {batch_assembly_time_ms:3,}ms, "
                    f"min delta to NOW: {min_delta_to_now_s}s]"
                )

                # Add the min and max date of the batch to `insert_triples`.
                #
                # NOTE: The min date means that we have *all* updates until that
                # date. The max date is the date of the latest update we have seen.
                # However, there may still be earlier updates that we have not seen
                # yet. Wikidata uses `schema:dateModified` for the latter semantics,
                # so we use it here as well. For the other semantics, we invent
                # a new property `wikibase:updatesCompleteUntil`.
                insert_triples.add(
                    f"<http://wikiba.se/ontology#Dump> "
                    f"<http://schema.org/dateModified> "
                    f'"{date_list[-1]}"^^<http://www.w3.org/2001/XMLSchema#dateTime>'
                )
                updates_complete_until = (
                    date_list[-1]
                    if args.min_or_max_date == "max"
                    else date_list[0]
                )
                insert_triples.add(
                    f"<http://wikiba.se/ontology#Dump> "
                    f"<http://wikiba.se/ontology#updatesCompleteUntil> "
                    f'"{updates_complete_until}"'
                    f"^^<http://www.w3.org/2001/XMLSchema#dateTime>"
                )
                insert_triples.add(
                    "<http://wikiba.se/ontology#Dump> "
                    "<http://wikiba.se/ontology#updateStreamNextOffset> "
                    f'"{event_id_for_next_batch[0]["offset"]}"'
                )

                # Construct UPDATE operation.
                delete_block = " . \n ".join(delete_triples)
                insert_block = " . \n ".join(insert_triples)
                delete_insert_operation = (
                    f"DELETE {{\n {delete_block} \n}} "
                    f"INSERT {{\n {insert_block} \n}} "
                    f"WHERE {{ }}\n"
                )

                # If `delete_entity_ids` is non-empty, add a `DELETE WHERE`
                # operation that deletes all triples that are associated with only
                # those entities.
                delete_entity_ids_as_values = " ".join(
                    [f"wd:{qid}" for qid in delete_entity_ids]
                )
                if len(delete_entity_ids) > 0:
                    delete_where_operation = (
                        f"PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n"
                        f"PREFIX wikibase: <http://wikiba.se/ontology#>\n"
                        f"PREFIX wd: <http://www.wikidata.org/entity/>\n"
                        f"DELETE {{\n"
                        f" ?s ?p ?o .\n"
                        f"}} WHERE {{\n"
                        f" {{\n"
                        f" VALUES ?s {{ {delete_entity_ids_as_values} }}\n"
                        f" ?s ?p ?o .\n"
                        f" }} UNION {{\n"
                        f" VALUES ?_1 {{ {delete_entity_ids_as_values} }}\n"
                        f" ?_1 ?_2 ?s .\n"
                        f" ?s ?p ?o .\n"
                        f" ?s rdf:type wikibase:Statement .\n"
                        f" }}\n"
                        f"}}\n"
                    )
                    delete_insert_operation += ";\n" + delete_where_operation

            # Construct curl command. For batch size 1, send the operation via
            # `--data-urlencode`, otherwise write to file and send via `--data-binary`.
            curl_cmd = (
                f"curl -s -X POST"
                f' "{sparql_endpoint}?access-token={args.access_token}"'
                f" -H 'Content-Type: application/sparql-update'"
            )
            if use_cached_file:
                # Use the cached file instead of writing a new one
                update_arg_file_name = cached_file_name
            else:
                # Write the constructed SPARQL update to a file
                update_arg_file_name = f"update.{first_offset_in_batch}.{current_batch_size}.sparql"
                with open(update_arg_file_name, "w") as f:
                    f.write(delete_insert_operation)
                # Write metadata file with date range
                meta_file_name = (
                    f"update.{first_offset_in_batch}.{current_batch_size}.meta"
                )
                with open(meta_file_name, "w") as f:
                    f.write(f"{date_list[0]} - {date_list[-1]}")
            curl_cmd += f" --data-binary @{update_arg_file_name}"
            if args.verbose == "yes":
                log.info(colored(curl_cmd, "blue"))

            # Run it (using `curl` for batch size up to 1000, otherwise
            # `requests`) with retry logic.
            try:
                result = retry_with_backoff(
                    lambda: run_command(curl_cmd, return_output=True),
                    "UPDATE request",
                    args.num_retries,
                    log,
                )
                result_file_name = f"update.{first_offset_in_batch}.{current_batch_size}.result"
                with open(result_file_name, "w") as f:
                    f.write(result)
            except Exception as e:
                log.error(
                    f"Failed to execute UPDATE request after "
                    f"{args.num_retries} retry attempts, last error: "
                    f"{e}"
                )
                return False

            # Results should be a JSON, parse it.
            try:
                result = json.loads(result)
            except Exception as e:
                log.error(
                    f"Error parsing JSON result: {e}. "
                    f"The first 1000 characters are: {result[:1000]}"
                )
                return False

            # Check if the result contains a QLever exception.
            if "exception" in result:
                error_msg = result["exception"]
                log.error(f"QLever exception: {error_msg}")
                log.info("")
                continue

            # Helper function for getting the value of `stats["time"][...]`
            # without the "ms" suffix. If the extraction fails, return 0
            # (and optionally log the failure).
            class FailureMode(Enum):
                LOG_ERROR = auto()
                SILENTLY_RETURN_ZERO = auto()
                THROW_EXCEPTION = auto()

            def get_time_ms(
                stats, *keys: str, failure_mode=FailureMode.LOG_ERROR
            ) -> int:
                try:
                    value = stats["time"]
                    for key in keys:
                        value = value[key]
                    value = int(value)
                except Exception:
                    if failure_mode == FailureMode.THROW_EXCEPTION:
                        raise
                    elif failure_mode == FailureMode.LOG_ERROR:
                        log.error(
                            f"Error extracting time from JSON statistics, "
                            f"keys: {keys}"
                        )
                    value = 0
                return value

            # Check for old JSON format (no `operations` or `time` on top level).
            old_json_message_template = (
                "Result JSON does not contain `{}` field, you are "
                "probably using an old version of QLever"
            )
            for field in ["operations", "time"]:
                if field not in result:
                    raise RuntimeError(old_json_message_template.format(field))

            # Get the per-operation statistics.
            for i, stats in enumerate(result["operations"]):
                try:
                    ins_after = stats["delta-triples"]["after"]["inserted"]
                    del_after = stats["delta-triples"]["after"]["deleted"]
                    ops_after = stats["delta-triples"]["after"]["total"]
                    num_ins = int(
                        stats["delta-triples"]["operation"]["inserted"]
                    )
                    num_del = int(
                        stats["delta-triples"]["operation"]["deleted"]
                    )
                    num_ops = int(stats["delta-triples"]["operation"]["total"])
                    time_op_total = get_time_ms(stats, "total")
                    time_us_per_op = (
                        int(1000 * time_op_total / num_ops)
                        if num_ops > 0
                        else 0
                    )
                    if args.verbose == "yes":
                        log.info(
                            colored(
                                f"TRIPLES: {num_ops:+10,} -> {ops_after:10,}, "
                                f"INS: {num_ins:+10,} -> {ins_after:10,}, "
                                f"DEL: {num_del:+10,} -> {del_after:10,}, "
                                f"TIME: {time_op_total:7,}ms, "
                                f"TIME/TRIPLE: {time_us_per_op:6,}µs",
                                attrs=["bold"],
                            )
                        )

                    time_planning = get_time_ms(stats, "planning")
                    time_compute_ids = get_time_ms(
                        stats,
                        "execution",
                        "computeIds",
                        "total",
                    )
                    time_where = get_time_ms(
                        stats,
                        "execution",
                        "evaluateWhere",
                    )
                    time_metadata = get_time_ms(
                        stats,
                        "updateMetadata",
                    )
                    time_insert = get_time_ms(
                        stats,
                        "execution",
                        "insertTriples",
                        "total",
                        failure_mode=FailureMode.SILENTLY_RETURN_ZERO,
                    )
                    time_delete = get_time_ms(
                        stats,
                        "execution",
                        "deleteTriples",
                        "total",
                        failure_mode=FailureMode.SILENTLY_RETURN_ZERO,
                    )
                    time_unaccounted = time_op_total - (
                        time_planning
                        + time_compute_ids
                        + time_where
                        + time_metadata
                        + time_delete
                        + time_insert
                    )
                    if args.verbose == "yes":
                        log.info(
                            f"METADATA: {100 * time_metadata / time_op_total:2.0f}%, "
                            f"PLANNING: {100 * time_planning / time_op_total:2.0f}%, "
                            f"WHERE: {100 * time_where / time_op_total:2.0f}%, "
                            f"IDS: {100 * time_compute_ids / time_op_total:2.0f}%, "
                            f"DELETE: {100 * time_delete / time_op_total:2.0f}%, "
                            f"INSERT: {100 * time_insert / time_op_total:2.0f}%, "
                            f"UNACCOUNTED: {100 * time_unaccounted / time_op_total:2.0f}%",
                        )

                except Exception as e:
                    log.warn(
                        f"Error extracting statistics: {e}, "
                        f"curl command was: {curl_cmd}"
                    )
                    # Show traceback for debugging.
                    import traceback

                    traceback.print_exc()
                    log.info("")
                    continue

            # Get times for the whole request (not per operation).
            time_parsing = get_time_ms(
                result,
                "parsing",
            )
            time_metadata = get_time_ms(
                result,
                "metadataUpdateForSnapshot",
            )
            time_snapshot = get_time_ms(
                result,
                "snapshotCreation",
            )
            time_writeback = get_time_ms(
                result,
                "diskWriteback",
            )
            time_operations = get_time_ms(
                result,
                "operations",
            )
            time_total = get_time_ms(
                result,
                "total",
            )
            time_unaccounted = time_total - (
                time_parsing
                + time_metadata
                + time_snapshot
                + time_writeback
                + time_operations
            )

            # Update the totals.
            total_update_time += time_total / 1000.0
            total_elapsed_time = time.perf_counter() - start_time

            # Show statistics for the completed batch.
            if args.verbose == "yes":
                log.info(
                    colored(
                        f"TOTAL UPDATE TIME SO FAR: {total_update_time:4.0f}s, "
                        f"TOTAL ELAPSED TIME SO FAR: {total_elapsed_time:4.0f}s, "
                        f"TOTAL TIME FOR THIS UPDATE REQUEST: {time_total:7,}ms, ",
                        attrs=["bold"],
                    )
                )
                log.info(
                    f"PARSING: {100 * time_parsing / time_total:2.0f}%, "
                    f"OPERATIONS: {100 * time_operations / time_total:2.0f}%, "
                    f"METADATA: {100 * time_metadata / time_total:2.0f}%, "
                    f"SNAPSHOT: {100 * time_snapshot / time_total:2.0f}%, "
                    f"WRITEBACK: {100 * time_writeback / time_total:2.0f}%, "
                    f"UNACCOUNTED: {100 * time_unaccounted / time_total:2.0f}%",
                )
                log.info("")

            # Close the source connection (for each batch, we open a new one,
            # either from `event_id_for_next_batch` or from `since`).
            source.close()

            # After the first batch is processed, enable offset checking for
            # subsequent batches.
            first_batch = False

            # If Ctrl+C was pressed, we reached `--until`, or we processed
            # exactly `--num-messages`, finish.
            if (
                self.ctrl_c_pressed
                or self.finished
                or (
                    args.num_messages is not None
                    and total_num_messages >= args.num_messages
                )
            ):
                break

        # Final message after all batches have been processed.
        log.info(
            f"Processed {batch_count} "
            f"{'batches' if batch_count > 1 else 'batch'}, "
            f"terminating update command"
        )
        return True
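For reference, each assembled batch above is applied as a single SPARQL 1.1 update: a DELETE/INSERT with an empty WHERE clause for the collected triples (optionally followed by a DELETE WHERE for fully deleted entities), POSTed with Content-Type `application/sparql-update` and the server's access token. A minimal sketch of that shape, assuming a plain `requests` POST in place of the `curl --data-binary` call the command actually shells out to; the helper names are hypothetical:

import requests  # assumption: the command itself uses curl via run_command


def build_update(delete_triples, insert_triples):
    # Same shape as `delete_insert_operation` in the diff: an empty WHERE
    # clause means the listed triples are deleted/inserted unconditionally.
    delete_block = " . \n ".join(delete_triples)
    insert_block = " . \n ".join(insert_triples)
    return (
        f"DELETE {{\n {delete_block} \n}} "
        f"INSERT {{\n {insert_block} \n}} "
        f"WHERE {{ }}\n"
    )


def post_update(endpoint, access_token, update):
    # Hypothetical helper mirroring the curl command in the diff.
    response = requests.post(
        endpoint,
        params={"access-token": access_token},
        headers={"Content-Type": "application/sparql-update"},
        data=update.encode("utf-8"),
    )
    return response.json()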