qlever 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of qlever might be problematic.
- qlever/Qleverfiles/Qleverfile.ohm-planet +1 -1
- qlever/commands/benchmark_queries.py +1022 -0
- qlever/commands/cache_stats.py +2 -0
- qlever/commands/index.py +3 -0
- qlever/commands/query.py +1 -0
- qlever/commands/settings.py +7 -0
- qlever/commands/ui.py +12 -8
- qlever/commands/update_wikidata.py +551 -0
- qlever/qlever_main.py +1 -2
- qlever/qleverfile.py +14 -0
- qlever/util.py +1 -1
- {qlever-0.5.22.dist-info → qlever-0.5.24.dist-info}/METADATA +4 -2
- {qlever-0.5.22.dist-info → qlever-0.5.24.dist-info}/RECORD +17 -16
- {qlever-0.5.22.dist-info → qlever-0.5.24.dist-info}/WHEEL +1 -1
- qlever/commands/example_queries.py +0 -605
- {qlever-0.5.22.dist-info → qlever-0.5.24.dist-info}/entry_points.txt +0 -0
- {qlever-0.5.22.dist-info → qlever-0.5.24.dist-info}/licenses/LICENSE +0 -0
- {qlever-0.5.22.dist-info → qlever-0.5.24.dist-info}/top_level.txt +0 -0
qlever/commands/cache_stats.py
CHANGED
@@ -56,6 +56,8 @@ class CacheStatsCommand(QleverCommand):
                 shell=True)
             cache_stats_dict = json.loads(cache_stats)
             cache_settings_dict = json.loads(cache_settings)
+            if isinstance(cache_settings_dict, list):
+                cache_settings_dict = cache_settings_dict[0]
         except Exception as e:
             log.error(f"Failed to get cache stats and settings: {e}")
             return False
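The two added lines handle endpoints that return the cache settings wrapped in a one-element JSON array instead of a plain object. A minimal standalone sketch of that normalization pattern (the helper name and payload are made up for illustration):

import json

def unwrap_json_object(raw: str) -> dict:
    # Some responses come as `[{...}]` rather than `{...}`;
    # normalize both shapes to a plain dict.
    parsed = json.loads(raw)
    if isinstance(parsed, list):
        parsed = parsed[0]
    return parsed

plain = unwrap_json_object('{"cache-max-size-single-entry": "1048576"}')
wrapped = unwrap_json_object('[{"cache-max-size-single-entry": "1048576"}]')
assert plain == wrapped == {"cache-max-size-single-entry": "1048576"}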
qlever/commands/index.py
CHANGED
@@ -39,6 +39,7 @@ class IndexCommand(QleverCommand):
             "multi_input_json",
             "parallel_parsing",
             "settings_json",
+            "vocabulary_type",
             "index_binary",
             "only_pso_and_pos_permutations",
             "ulimit",
@@ -184,6 +185,7 @@ class IndexCommand(QleverCommand):
             index_cmd = (
                 f"{args.cat_input_files} | {args.index_binary}"
                 f" -i {args.name} -s {args.name}.settings.json"
+                f" --vocabulary-type {args.vocabulary_type}"
                 f" -F {args.format} -f -"
             )
             if args.parallel_parsing:
@@ -199,6 +201,7 @@ class IndexCommand(QleverCommand):
             index_cmd = (
                 f"{args.index_binary}"
                 f" -i {args.name} -s {args.name}.settings.json"
+                f" --vocabulary-type {args.vocabulary_type}"
                 f" {input_options}"
             )
         else:
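With `vocabulary_type` added to the arguments read from the Qleverfile, both variants of the index command now pass a `--vocabulary-type` flag to the index binary. A rough sketch of the resulting command assembly, with invented argument values (the flag and the f-string layout are taken from the diff, the values are placeholders):

from types import SimpleNamespace

# Placeholder values, only to show the shape of the assembled command.
args = SimpleNamespace(
    cat_input_files="cat data.ttl",
    index_binary="IndexBuilderMain",
    name="my-dataset",
    vocabulary_type="<vocabulary-type>",
    format="ttl",
)
index_cmd = (
    f"{args.cat_input_files} | {args.index_binary}"
    f" -i {args.name} -s {args.name}.settings.json"
    f" --vocabulary-type {args.vocabulary_type}"
    f" -F {args.format} -f -"
)
print(index_cmd)
# cat data.ttl | IndexBuilderMain -i my-dataset -s my-dataset.settings.json --vocabulary-type <vocabulary-type> -F ttl -f -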
qlever/commands/query.py
CHANGED
@@ -72,6 +72,7 @@ class QueryCommand(QleverCommand):
                 "application/sparql-results+json",
                 "application/sparql-results+xml",
                 "application/qlever-results+json",
+                "application/octet-stream",
             ],
             default="text/tab-separated-values",
             help="Accept header for the SPARQL query",
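`application/octet-stream` is now an accepted value for the query command's Accept header. A hedged sketch of sending a query with that media type directly via `requests` (endpoint URL and query are placeholders; adjust to your setup):

import requests

endpoint = "http://localhost:7001"  # placeholder
query = "SELECT * WHERE { ?s ?p ?o } LIMIT 10"

response = requests.post(
    endpoint,
    headers={
        "Accept": "application/octet-stream",
        "Content-Type": "application/sparql-query",
    },
    data=query,
)
print(response.status_code, len(response.content), "bytes")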
qlever/commands/settings.py
CHANGED
@@ -34,6 +34,8 @@ class SettingsCommand(QleverCommand):
         "cache-max-size-single-entry",
         "cache-service-results",
         "default-query-timeout",
+        "division-by-zero-is-undef",
+        "enable-prefilter-on-index-scans",
         "group-by-disable-index-scan-optimizations",
         "group-by-hash-map-enabled",
         "lazy-index-scan-max-size-materialization",
@@ -44,6 +46,9 @@ class SettingsCommand(QleverCommand):
         "request-body-limit",
         "service-max-value-rows",
         "sort-estimate-cancellation-factor",
+        "spatial-join-prefilter-max-size",
+        "spatial-join-max-num-threads",
+        "syntax-test-mode",
         "throw-on-unbound-variables",
         "use-binsearch-transitive-path",
     ]
@@ -97,6 +102,8 @@ class SettingsCommand(QleverCommand):
         try:
             settings_json = run_command(curl_cmd, return_output=True)
             settings_dict = json.loads(settings_json)
+            if isinstance(settings_dict, list):
+                settings_dict = settings_dict[0]
         except Exception as e:
             log.error(f"setting command failed: {e}")
             return False
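Besides the new entries in the list of recognized server settings, the settings command now unwraps a list-shaped JSON response just like `cache_stats`. A small sketch (hard-coded example payload, purely illustrative) that checks which of the newly listed parameters an endpoint actually reports:

import json

NEW_SETTINGS = [
    "division-by-zero-is-undef",
    "enable-prefilter-on-index-scans",
    "spatial-join-prefilter-max-size",
    "spatial-join-max-num-threads",
    "syntax-test-mode",
]

# Example payload, shaped like a (possibly list-wrapped) settings response.
settings_json = '[{"division-by-zero-is-undef": "true", "syntax-test-mode": "false"}]'
settings_dict = json.loads(settings_json)
if isinstance(settings_dict, list):
    settings_dict = settings_dict[0]

for name in NEW_SETTINGS:
    print(f"{name}: {settings_dict.get(name, '<not reported>')}")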
qlever/commands/ui.py
CHANGED
@@ -13,13 +13,16 @@ from qlever.util import is_port_used, run_command
 
 # Return a YAML string for the given dictionary. Format values with
 # newlines using the "|" style.
-def dict_to_yaml(dictionary):
-
-
-
-
-
-
+def dict_to_yaml(dictionary: dict) -> str:
+    """
+    Custom representer for yaml, which uses the "|" style only for
+    multiline strings.
+
+    NOTE: We replace all `\r\n` with `\n` because otherwise the `|` style
+    does not work as expected.
+    """
+
+    class MultiLineDumper(yaml.SafeDumper):
         def represent_scalar(self, tag, value, style=None):
             value = value.replace("\r\n", "\n")
             if isinstance(value, str) and "\n" in value:
@@ -30,6 +33,7 @@ def dict_to_yaml(dictionary):
     return yaml.dump(
         dictionary,
         sort_keys=False,
+        allow_unicode=True,
         Dumper=MultiLineDumper,
     )
 
@@ -108,7 +112,7 @@ class UiCommand(QleverCommand):
 
         # Construct commands and show them.
         pull_latest_image = "/" in args.ui_image and not args.no_pull_latest
-        ui_config_name = args.
+        ui_config_name = args.ui_config
         ui_db_file = args.ui_db_file or f"{args.name}.ui-db.sqlite3"
         ui_db_file_from_image = "qleverui.sqlite3"
         ui_config_file = args.ui_config_file
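The rewritten `dict_to_yaml` gains a type hint, a docstring, a switch to `yaml.SafeDumper`, and `allow_unicode=True` in the `yaml.dump` call. A self-contained sketch of the same idea; the part of `represent_scalar` below the shown hunk is an assumed completion, since the diff truncates it there:

import yaml

def dict_to_yaml(dictionary: dict) -> str:
    # Emit multiline string values with the "|" block style,
    # everything else with the default style.
    class MultiLineDumper(yaml.SafeDumper):
        def represent_scalar(self, tag, value, style=None):
            if isinstance(value, str):
                value = value.replace("\r\n", "\n")
                if "\n" in value:
                    style = "|"
            return super().represent_scalar(tag, value, style=style)

    return yaml.dump(
        dictionary,
        sort_keys=False,
        allow_unicode=True,
        Dumper=MultiLineDumper,
    )

print(dict_to_yaml({"name": "olympics", "query": "SELECT *\nWHERE { ?s ?p ?o }\n"}))
# name: olympics
# query: |
#   SELECT *
#   WHERE { ?s ?p ?o }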
|
@@ -0,0 +1,551 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
import signal
|
|
6
|
+
import time
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
|
|
9
|
+
import rdflib.term
|
|
10
|
+
import requests
|
|
11
|
+
import requests_sse
|
|
12
|
+
from rdflib import Graph
|
|
13
|
+
from termcolor import colored
|
|
14
|
+
|
|
15
|
+
from qlever.command import QleverCommand
|
|
16
|
+
from qlever.log import log
|
|
17
|
+
from qlever.util import run_command
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Monkey patch `rdflib.term._castLexicalToPython` to avoid casting of literals
|
|
21
|
+
# to Python types. We do not need it (all we want it convert Turtle to N-Triples),
|
|
22
|
+
# and we can speed up parsing by a factor of about 2.
|
|
23
|
+
def custom_cast_lexical_to_python(lexical, datatype):
|
|
24
|
+
return None # Your desired behavior
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
rdflib.term._castLexicalToPython = custom_cast_lexical_to_python
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class UpdateWikidataCommand(QleverCommand):
|
|
31
|
+
"""
|
|
32
|
+
Class for executing the `update` command.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
def __init__(self):
|
|
36
|
+
# SPARQL query to get the date until which the updates of the
|
|
37
|
+
# SPARQL endpoint are complete.
|
|
38
|
+
self.sparql_updates_complete_until_query = (
|
|
39
|
+
"PREFIX wikibase: <http://wikiba.se/ontology#> "
|
|
40
|
+
"PREFIX schema: <http://schema.org/> "
|
|
41
|
+
"SELECT * WHERE { "
|
|
42
|
+
"{ SELECT (MIN(?date_modified) AS ?updates_complete_until) { "
|
|
43
|
+
"wikibase:Dump schema:dateModified ?date_modified } } "
|
|
44
|
+
"UNION { wikibase:Dump wikibase:updatesCompleteUntil ?updates_complete_until } "
|
|
45
|
+
"} ORDER BY DESC(?updates_complete_until) LIMIT 1"
|
|
46
|
+
)
|
|
47
|
+
# URL of the Wikidata SSE stream.
|
|
48
|
+
self.wikidata_update_stream_url = (
|
|
49
|
+
"https://stream.wikimedia.org/v2/"
|
|
50
|
+
"stream/rdf-streaming-updater.mutation.v2"
|
|
51
|
+
)
|
|
52
|
+
# Remember if Ctrl+C was pressed, so we can handle it gracefully.
|
|
53
|
+
self.ctrl_c_pressed = False
|
|
54
|
+
|
|
55
|
+
def description(self) -> str:
|
|
56
|
+
return "Update from given SSE stream"
|
|
57
|
+
|
|
58
|
+
def should_have_qleverfile(self) -> bool:
|
|
59
|
+
return True
|
|
60
|
+
|
|
61
|
+
def relevant_qleverfile_arguments(self) -> dict[str : list[str]]:
|
|
62
|
+
return {"server": ["host_name", "port", "access_token"]}
|
|
63
|
+
|
|
64
|
+
def additional_arguments(self, subparser) -> None:
|
|
65
|
+
subparser.add_argument(
|
|
66
|
+
"sse_stream_url",
|
|
67
|
+
nargs="?",
|
|
68
|
+
type=str,
|
|
69
|
+
default=self.wikidata_update_stream_url,
|
|
70
|
+
help="URL of the SSE stream to update from",
|
|
71
|
+
)
|
|
72
|
+
subparser.add_argument(
|
|
73
|
+
"--batch-size",
|
|
74
|
+
type=int,
|
|
75
|
+
default=100000,
|
|
76
|
+
help="Group this many messages together into one update "
|
|
77
|
+
"(default: one update for each message); NOTE: this simply "
|
|
78
|
+
"concatenates the `rdf_added_data` and `rdf_deleted_data` fields, "
|
|
79
|
+
"which is not 100%% correct; as soon as chaining is supported, "
|
|
80
|
+
"this will be fixed",
|
|
81
|
+
)
|
|
82
|
+
subparser.add_argument(
|
|
83
|
+
"--lag-seconds",
|
|
84
|
+
type=int,
|
|
85
|
+
default=1,
|
|
86
|
+
help="When a message is encountered that is within this many "
|
|
87
|
+
"seconds of the current time, finish the current batch "
|
|
88
|
+
"(and show a warning that this happened)",
|
|
89
|
+
)
|
|
90
|
+
subparser.add_argument(
|
|
91
|
+
"--since",
|
|
92
|
+
type=str,
|
|
93
|
+
help="Consume stream messages since this date "
|
|
94
|
+
"(default: determine automatically from the SPARQL endpoint)",
|
|
95
|
+
)
|
|
96
|
+
subparser.add_argument(
|
|
97
|
+
"--topics",
|
|
98
|
+
type=str,
|
|
99
|
+
default="eqiad.rdf-streaming-updater.mutation",
|
|
100
|
+
help="Comma-separated list of topics to consume from the SSE stream"
|
|
101
|
+
" (default: only eqiad.rdf-streaming-updater.mutation)",
|
|
102
|
+
)
|
|
103
|
+
subparser.add_argument(
|
|
104
|
+
"--min-or-max-date",
|
|
105
|
+
choices=["min", "max"],
|
|
106
|
+
default="max",
|
|
107
|
+
help="Use the minimum or maximum date of the batch for the "
|
|
108
|
+
"`updatesCompleteUntil` property (default: maximum)",
|
|
109
|
+
)
|
|
110
|
+
subparser.add_argument(
|
|
111
|
+
"--wait-between-batches",
|
|
112
|
+
type=int,
|
|
113
|
+
default=3600,
|
|
114
|
+
help="Wait this many seconds between batches that were "
|
|
115
|
+
"finished due to a message that is within `lag_seconds` of "
|
|
116
|
+
"the current time (default: 3600s)",
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
# Handle Ctrl+C gracefully by finishing the current batch and then exiting.
|
|
120
|
+
def handle_ctrl_c(self, signal_received, frame):
|
|
121
|
+
if self.ctrl_c_pressed:
|
|
122
|
+
log.warn("\rCtrl+C pressed again, undoing the previous Ctrl+C")
|
|
123
|
+
self.ctrl_c_pressed = False
|
|
124
|
+
else:
|
|
125
|
+
self.ctrl_c_pressed = True
|
|
126
|
+
log.warn(
|
|
127
|
+
"\rCtrl+C pressed, will finish the current batch and then exit"
|
|
128
|
+
" [press Ctrl+C again to continue]"
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
def execute(self, args) -> bool:
|
|
132
|
+
# cURL command to get the date until which the updates of the
|
|
133
|
+
# SPARQL endpoint are complete.
|
|
134
|
+
sparql_endpoint = f"http://{args.host_name}:{args.port}"
|
|
135
|
+
curl_cmd_updates_complete_until = (
|
|
136
|
+
f"curl -s {sparql_endpoint}"
|
|
137
|
+
f' -H "Accept: text/csv"'
|
|
138
|
+
f' -H "Content-type: application/sparql-query"'
|
|
139
|
+
f' --data "{self.sparql_updates_complete_until_query}"'
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
# Construct the command and show it.
|
|
143
|
+
lag_seconds_str = (
|
|
144
|
+
f"{args.lag_seconds} second{'s' if args.lag_seconds > 1 else ''}"
|
|
145
|
+
)
|
|
146
|
+
cmd_description = []
|
|
147
|
+
if args.since:
|
|
148
|
+
cmd_description.append(f"SINCE={args.since}")
|
|
149
|
+
else:
|
|
150
|
+
cmd_description.append(
|
|
151
|
+
f"SINCE=$({curl_cmd_updates_complete_until} | sed 1d)"
|
|
152
|
+
)
|
|
153
|
+
cmd_description.append(
|
|
154
|
+
f"Process SSE stream from {args.sse_stream_url}?since=$SINCE "
|
|
155
|
+
f"in batches of {args.batch_size:,} messages "
|
|
156
|
+
f"(less if a message is encountered that is within "
|
|
157
|
+
f"{lag_seconds_str} of the current time)"
|
|
158
|
+
)
|
|
159
|
+
self.show("\n".join(cmd_description), only_show=args.show)
|
|
160
|
+
if args.show:
|
|
161
|
+
return True
|
|
162
|
+
|
|
163
|
+
# Compute the `since` date if not given.
|
|
164
|
+
if not args.since:
|
|
165
|
+
try:
|
|
166
|
+
args.since = run_command(
|
|
167
|
+
f"{curl_cmd_updates_complete_until} | sed 1d",
|
|
168
|
+
return_output=True,
|
|
169
|
+
).strip()
|
|
170
|
+
except Exception as e:
|
|
171
|
+
log.error(
|
|
172
|
+
f"Error running `{curl_cmd_updates_complete_until}`: {e}"
|
|
173
|
+
)
|
|
174
|
+
return False
|
|
175
|
+
|
|
176
|
+
# Special handling of Ctrl+C, see `handle_ctrl_c` above.
|
|
177
|
+
signal.signal(signal.SIGINT, self.handle_ctrl_c)
|
|
178
|
+
log.warn(
|
|
179
|
+
"Press Ctrl+C to finish the current batch and end gracefully, "
|
|
180
|
+
"press Ctrl+C again to continue with the next batch"
|
|
181
|
+
)
|
|
182
|
+
log.info("")
|
|
183
|
+
log.info(f"SINCE={args.since}")
|
|
184
|
+
log.info("")
|
|
185
|
+
args.sse_stream_url = f"{args.sse_stream_url}?since={args.since}"
|
|
186
|
+
|
|
187
|
+
# Initialize the SSE stream and all the statistics variables.
|
|
188
|
+
source = requests_sse.EventSource(
|
|
189
|
+
args.sse_stream_url,
|
|
190
|
+
headers={
|
|
191
|
+
"Accept": "text/event-stream",
|
|
192
|
+
"User-Agent": "qlever update-wikidata",
|
|
193
|
+
},
|
|
194
|
+
)
|
|
195
|
+
source.connect()
|
|
196
|
+
current_batch_size = 0
|
|
197
|
+
batch_count = 0
|
|
198
|
+
total_num_ops = 0
|
|
199
|
+
total_time_s = 0
|
|
200
|
+
start_time = time.perf_counter()
|
|
201
|
+
topics_to_consider = set(args.topics.split(","))
|
|
202
|
+
wait_before_next_batch = False
|
|
203
|
+
|
|
204
|
+
# Iterating over all messages in the stream.
|
|
205
|
+
for event in source:
|
|
206
|
+
# Beginning of a new batch of messages.
|
|
207
|
+
if current_batch_size == 0:
|
|
208
|
+
date_list = []
|
|
209
|
+
delta_to_now_list = []
|
|
210
|
+
batch_assembly_start_time = time.perf_counter()
|
|
211
|
+
insert_triples = set()
|
|
212
|
+
delete_triples = set()
|
|
213
|
+
if wait_before_next_batch:
|
|
214
|
+
log.info(
|
|
215
|
+
f"Waiting {args.wait_between_batches} "
|
|
216
|
+
f"second{'s' if args.wait_between_batches > 1 else ''} "
|
|
217
|
+
f"before processing the next batch"
|
|
218
|
+
)
|
|
219
|
+
log.info("")
|
|
220
|
+
time.sleep(args.wait_between_batches)
|
|
221
|
+
wait_before_next_batch = False
|
|
222
|
+
|
|
223
|
+
# Check if the `args.batch_size` is reached (note that we come here
|
|
224
|
+
# after a `continue` due to an error).
|
|
225
|
+
if self.ctrl_c_pressed:
|
|
226
|
+
break
|
|
227
|
+
|
|
228
|
+
# Process the message. Skip messages that are not of type `message`
|
|
229
|
+
# (should not happen), have no field `data` (should not happen
|
|
230
|
+
# either), or where the topic is not in `args.topics`.
|
|
231
|
+
if event.type != "message" or not event.data:
|
|
232
|
+
continue
|
|
233
|
+
event_data = json.loads(event.data)
|
|
234
|
+
topic = event_data.get("meta").get("topic")
|
|
235
|
+
if topic not in topics_to_consider:
|
|
236
|
+
continue
|
|
237
|
+
|
|
238
|
+
try:
|
|
239
|
+
# event_id = json.loads(event.last_event_id)
|
|
240
|
+
# date_ms_since_epoch = event_id[0].get("timestamp")
|
|
241
|
+
# date = time.strftime(
|
|
242
|
+
# "%Y-%m-%dT%H:%M:%SZ",
|
|
243
|
+
# time.gmtime(date_ms_since_epoch / 1000.0),
|
|
244
|
+
# )
|
|
245
|
+
date = event_data.get("meta").get("dt")
|
|
246
|
+
# date = event_data.get("dt")
|
|
247
|
+
date = re.sub(r"\.\d*Z$", "Z", date)
|
|
248
|
+
# entity_id = event_data.get("entity_id")
|
|
249
|
+
# operation = event_data.get("operation")
|
|
250
|
+
rdf_added_data = event_data.get("rdf_added_data")
|
|
251
|
+
rdf_deleted_data = event_data.get("rdf_deleted_data")
|
|
252
|
+
|
|
253
|
+
# Process the to-be-deleted triples.
|
|
254
|
+
if rdf_deleted_data is not None:
|
|
255
|
+
try:
|
|
256
|
+
rdf_deleted_data = rdf_deleted_data.get("data")
|
|
257
|
+
graph = Graph()
|
|
258
|
+
log.debug(f"RDF deleted data: {rdf_deleted_data}")
|
|
259
|
+
graph.parse(data=rdf_deleted_data, format="turtle")
|
|
260
|
+
for s, p, o in graph:
|
|
261
|
+
triple = f"{s.n3()} {p.n3()} {o.n3()}"
|
|
262
|
+
# NOTE: In case there was a previous `insert` of that
|
|
263
|
+
# triple, it is safe to remove that `insert`, but not
|
|
264
|
+
# the `delete` (in case the triple is contained in the
|
|
265
|
+
# original data).
|
|
266
|
+
if triple in insert_triples:
|
|
267
|
+
insert_triples.remove(triple)
|
|
268
|
+
delete_triples.add(triple)
|
|
269
|
+
except Exception as e:
|
|
270
|
+
log.error(f"Error reading `rdf_deleted_data`: {e}")
|
|
271
|
+
return False
|
|
272
|
+
|
|
273
|
+
# Process the to-be-added triples.
|
|
274
|
+
if rdf_added_data is not None:
|
|
275
|
+
try:
|
|
276
|
+
rdf_added_data = rdf_added_data.get("data")
|
|
277
|
+
graph = Graph()
|
|
278
|
+
log.debug("RDF added data: {rdf_added_data}")
|
|
279
|
+
graph.parse(data=rdf_added_data, format="turtle")
|
|
280
|
+
for s, p, o in graph:
|
|
281
|
+
triple = f"{s.n3()} {p.n3()} {o.n3()}"
|
|
282
|
+
# NOTE: In case there was a previous `delete` of that
|
|
283
|
+
# triple, it is safe to remove that `delete`, but not
|
|
284
|
+
# the `insert` (in case the triple is not contained in
|
|
285
|
+
# the original data).
|
|
286
|
+
if triple in delete_triples:
|
|
287
|
+
delete_triples.remove(triple)
|
|
288
|
+
insert_triples.add(triple)
|
|
289
|
+
except Exception as e:
|
|
290
|
+
log.error(f"Error reading `rdf_added_data`: {e}")
|
|
291
|
+
return False
|
|
292
|
+
|
|
293
|
+
except Exception as e:
|
|
294
|
+
log.error(f"Error reading data from message: {e}")
|
|
295
|
+
log.info(event)
|
|
296
|
+
continue
|
|
297
|
+
|
|
298
|
+
# Continue assembling until either the batch size is reached, or
|
|
299
|
+
# we encounter a message that is within `args.lag_seconds` of the
|
|
300
|
+
# current time.
|
|
301
|
+
current_batch_size += 1
|
|
302
|
+
date_as_epoch_s = (
|
|
303
|
+
datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ")
|
|
304
|
+
.replace(tzinfo=timezone.utc)
|
|
305
|
+
.timestamp()
|
|
306
|
+
)
|
|
307
|
+
now_as_epoch_s = time.time()
|
|
308
|
+
delta_to_now_s = now_as_epoch_s - date_as_epoch_s
|
|
309
|
+
log.debug(
|
|
310
|
+
f"DATE: {date_as_epoch_s:.0f} [{date}], "
|
|
311
|
+
f"NOW: {now_as_epoch_s:.0f}, "
|
|
312
|
+
f"DELTA: {now_as_epoch_s - date_as_epoch_s:.0f}"
|
|
313
|
+
)
|
|
314
|
+
date_list.append(date)
|
|
315
|
+
delta_to_now_list.append(delta_to_now_s)
|
|
316
|
+
if (
|
|
317
|
+
current_batch_size < args.batch_size
|
|
318
|
+
and not self.ctrl_c_pressed
|
|
319
|
+
):
|
|
320
|
+
if delta_to_now_s < args.lag_seconds:
|
|
321
|
+
log.warn(
|
|
322
|
+
f"Encountered message with date {date}, which is within "
|
|
323
|
+
f"{args.lag_seconds} "
|
|
324
|
+
f"second{'s' if args.lag_seconds > 1 else ''} "
|
|
325
|
+
f"of the current time, finishing the current batch"
|
|
326
|
+
)
|
|
327
|
+
else:
|
|
328
|
+
continue
|
|
329
|
+
|
|
330
|
+
# Process the current batch of messages.
|
|
331
|
+
batch_assembly_end_time = time.perf_counter()
|
|
332
|
+
batch_assembly_time_ms = int(
|
|
333
|
+
1000 * (batch_assembly_end_time - batch_assembly_start_time)
|
|
334
|
+
)
|
|
335
|
+
batch_count += 1
|
|
336
|
+
date_list.sort()
|
|
337
|
+
delta_to_now_list.sort()
|
|
338
|
+
min_delta_to_now_s = delta_to_now_list[0]
|
|
339
|
+
if min_delta_to_now_s < 10:
|
|
340
|
+
min_delta_to_now_s = f"{min_delta_to_now_s:.1f}"
|
|
341
|
+
else:
|
|
342
|
+
min_delta_to_now_s = f"{int(min_delta_to_now_s):,}"
|
|
343
|
+
log.info(
|
|
344
|
+
f"Processing batch #{batch_count} "
|
|
345
|
+
f"with {current_batch_size:,} "
|
|
346
|
+
f"message{'s' if current_batch_size > 1 else ''}, "
|
|
347
|
+
f"date range: {date_list[0]} - {date_list[-1]} "
|
|
348
|
+
f"[assembly time: {batch_assembly_time_ms:,} ms, "
|
|
349
|
+
f"min delta to NOW: {min_delta_to_now_s} s]"
|
|
350
|
+
)
|
|
351
|
+
wait_before_next_batch = (
|
|
352
|
+
args.wait_between_batches is not None
|
|
353
|
+
and current_batch_size < args.batch_size
|
|
354
|
+
)
|
|
355
|
+
current_batch_size = 0
|
|
356
|
+
|
|
357
|
+
# Add the min and max date of the batch to `insert_triples`.
|
|
358
|
+
#
|
|
359
|
+
# NOTE: The min date means that we have *all* updates until that
|
|
360
|
+
# date. The max date is the date of the latest update we have seen.
|
|
361
|
+
# However, there may still be earlier updates that we have not seen
|
|
362
|
+
# yet. Wikidata uses `schema:dateModified` for the latter semantics,
|
|
363
|
+
# so we use it here as well. For the other semantics, we invent
|
|
364
|
+
# a new property `wikibase:updatesCompleteUntil`.
|
|
365
|
+
insert_triples.add(
|
|
366
|
+
f"<http://wikiba.se/ontology#Dump> "
|
|
367
|
+
f"<http://schema.org/dateModified> "
|
|
368
|
+
f'"{date_list[-1]}"^^<http://www.w3.org/2001/XMLSchema#dateTime>'
|
|
369
|
+
)
|
|
370
|
+
updates_complete_until = (
|
|
371
|
+
date_list[-1]
|
|
372
|
+
if args.min_or_max_date == "max"
|
|
373
|
+
else date_list[0]
|
|
374
|
+
)
|
|
375
|
+
insert_triples.add(
|
|
376
|
+
f"<http://wikiba.se/ontology#Dump> "
|
|
377
|
+
f"<http://wikiba.se/ontology#updatesCompleteUntil> "
|
|
378
|
+
f'"{updates_complete_until}"'
|
|
379
|
+
f"^^<http://www.w3.org/2001/XMLSchema#dateTime>"
|
|
380
|
+
)
|
|
381
|
+
|
|
382
|
+
# Construct update operation.
|
|
383
|
+
delete_block = " . \n ".join(delete_triples)
|
|
384
|
+
insert_block = " . \n ".join(insert_triples)
|
|
385
|
+
delete_insert_operation = (
|
|
386
|
+
f"DELETE {{\n {delete_block} .\n}} "
|
|
387
|
+
f"INSERT {{\n {insert_block} .\n}} "
|
|
388
|
+
f"WHERE {{ }}\n"
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
# Construct curl command. For batch size 1, send the operation via
|
|
392
|
+
# `--data-urlencode`, otherwise write to file and send via `--data-binary`.
|
|
393
|
+
curl_cmd = (
|
|
394
|
+
f"curl -s -X POST {sparql_endpoint}"
|
|
395
|
+
f" -H 'Authorization: Bearer {args.access_token}'"
|
|
396
|
+
f" -H 'Content-Type: application/sparql-update'"
|
|
397
|
+
)
|
|
398
|
+
update_arg_file_name = f"update.sparql.{batch_count}"
|
|
399
|
+
with open(update_arg_file_name, "w") as f:
|
|
400
|
+
f.write(delete_insert_operation)
|
|
401
|
+
curl_cmd += f" --data-binary @{update_arg_file_name}"
|
|
402
|
+
log.info(colored(curl_cmd, "blue"))
|
|
403
|
+
|
|
404
|
+
# Run it (using `curl` for batch size up to 1000, otherwise
|
|
405
|
+
# `requests`).
|
|
406
|
+
try:
|
|
407
|
+
headers = {
|
|
408
|
+
"Authorization": f"Bearer {args.access_token}",
|
|
409
|
+
"Content-Type": "application/sparql-update",
|
|
410
|
+
}
|
|
411
|
+
response = requests.post(
|
|
412
|
+
url=sparql_endpoint,
|
|
413
|
+
headers=headers,
|
|
414
|
+
data=delete_insert_operation,
|
|
415
|
+
)
|
|
416
|
+
result = response.text
|
|
417
|
+
with open(f"update.result.{batch_count}", "w") as f:
|
|
418
|
+
f.write(result)
|
|
419
|
+
except Exception as e:
|
|
420
|
+
log.warn(f"Error running `requests.post`: {e}")
|
|
421
|
+
log.info("")
|
|
422
|
+
continue
|
|
423
|
+
|
|
424
|
+
# Results should be a JSON, parse it.
|
|
425
|
+
try:
|
|
426
|
+
result = json.loads(result)
|
|
427
|
+
if isinstance(result, list):
|
|
428
|
+
result = result[0]
|
|
429
|
+
except Exception as e:
|
|
430
|
+
log.error(
|
|
431
|
+
f"Error parsing JSON result: {e}"
|
|
432
|
+
f", the first 1000 characters are:"
|
|
433
|
+
)
|
|
434
|
+
log.info(result[:1000])
|
|
435
|
+
log.info("")
|
|
436
|
+
continue
|
|
437
|
+
|
|
438
|
+
# Check if the result contains a QLever exception.
|
|
439
|
+
if "exception" in result:
|
|
440
|
+
error_msg = result["exception"]
|
|
441
|
+
log.error(f"QLever exception: {error_msg}")
|
|
442
|
+
log.info("")
|
|
443
|
+
continue
|
|
444
|
+
|
|
445
|
+
# Helper function for getting the value of `result["time"][...]`
|
|
446
|
+
# without the "ms" suffix.
|
|
447
|
+
def get_time_ms(*keys: str) -> int:
|
|
448
|
+
value = result["time"]
|
|
449
|
+
for key in keys:
|
|
450
|
+
value = value[key]
|
|
451
|
+
return int(value)
|
|
452
|
+
# return int(re.sub(r"ms$", "", value))
|
|
453
|
+
|
|
454
|
+
# Show statistics of the update operation.
|
|
455
|
+
try:
|
|
456
|
+
ins_after = result["delta-triples"]["after"]["inserted"]
|
|
457
|
+
del_after = result["delta-triples"]["after"]["deleted"]
|
|
458
|
+
ops_after = result["delta-triples"]["after"]["total"]
|
|
459
|
+
num_ins = int(result["delta-triples"]["operation"]["inserted"])
|
|
460
|
+
num_del = int(result["delta-triples"]["operation"]["deleted"])
|
|
461
|
+
num_ops = int(result["delta-triples"]["operation"]["total"])
|
|
462
|
+
time_ms = get_time_ms("total")
|
|
463
|
+
time_us_per_op = int(1000 * time_ms / num_ops)
|
|
464
|
+
log.info(
|
|
465
|
+
colored(
|
|
466
|
+
f"NUM_OPS: {num_ops:+6,} -> {ops_after:6,}, "
|
|
467
|
+
f"INS: {num_ins:+6,} -> {ins_after:6,}, "
|
|
468
|
+
f"DEL: {num_del:+6,} -> {del_after:6,}, "
|
|
469
|
+
f"TIME: {time_ms:7,} ms, "
|
|
470
|
+
f"TIME/OP: {time_us_per_op:,} µs",
|
|
471
|
+
attrs=["bold"],
|
|
472
|
+
)
|
|
473
|
+
)
|
|
474
|
+
|
|
475
|
+
# Also show a detailed breakdown of the total time.
|
|
476
|
+
time_preparation = get_time_ms(
|
|
477
|
+
"execution", "processUpdateImpl", "preparation"
|
|
478
|
+
)
|
|
479
|
+
time_insert = get_time_ms(
|
|
480
|
+
"execution", "processUpdateImpl", "insertTriples", "total"
|
|
481
|
+
)
|
|
482
|
+
time_delete = get_time_ms(
|
|
483
|
+
"execution", "processUpdateImpl", "deleteTriples", "total"
|
|
484
|
+
)
|
|
485
|
+
time_snapshot = get_time_ms("execution", "snapshotCreation")
|
|
486
|
+
time_writeback = get_time_ms("execution", "diskWriteback")
|
|
487
|
+
time_unaccounted = time_ms - (
|
|
488
|
+
time_delete
|
|
489
|
+
+ time_insert
|
|
490
|
+
+ time_preparation
|
|
491
|
+
+ time_snapshot
|
|
492
|
+
+ time_writeback
|
|
493
|
+
)
|
|
494
|
+
log.info(
|
|
495
|
+
f"PREPARATION: {100 * time_preparation / time_ms:2.0f}%, "
|
|
496
|
+
# f"PLANNING: {100 * time_planning / time_ms:2.0f}%, "
|
|
497
|
+
f"INSERT: {100 * time_insert / time_ms:2.0f}%, "
|
|
498
|
+
f"DELETE: {100 * time_delete / time_ms:2.0f}%, "
|
|
499
|
+
f"SNAPSHOT: {100 * time_snapshot / time_ms:2.0f}%, "
|
|
500
|
+
f"WRITEBACK: {100 * time_writeback / time_ms:2.0f}%, "
|
|
501
|
+
f"UNACCOUNTED: {100 * time_unaccounted / time_ms:2.0f}%",
|
|
502
|
+
)
|
|
503
|
+
|
|
504
|
+
# Show the totals so far.
|
|
505
|
+
total_num_ops += num_ops
|
|
506
|
+
total_time_s += time_ms / 1000.0
|
|
507
|
+
elapsed_time_s = time.perf_counter() - start_time
|
|
508
|
+
time_us_per_op = int(1e6 * total_time_s / total_num_ops)
|
|
509
|
+
log.info(
|
|
510
|
+
colored(
|
|
511
|
+
f"TOTAL NUM_OPS SO FAR: {total_num_ops:8,}, "
|
|
512
|
+
f"TOTAL UPDATE TIME SO FAR: {total_time_s:4.0f} s, "
|
|
513
|
+
f"ELAPSED TIME SO FAR: {elapsed_time_s:4.0f} s, "
|
|
514
|
+
f"AVG TIME/OP SO FAR: {time_us_per_op:,} µs",
|
|
515
|
+
attrs=["bold"],
|
|
516
|
+
)
|
|
517
|
+
)
|
|
518
|
+
|
|
519
|
+
except Exception as e:
|
|
520
|
+
log.warn(
|
|
521
|
+
f"Error extracting statistics: {e}, "
|
|
522
|
+
f"curl command was: {curl_cmd}"
|
|
523
|
+
)
|
|
524
|
+
# Show traceback for debugging.
|
|
525
|
+
import traceback
|
|
526
|
+
|
|
527
|
+
traceback.print_exc()
|
|
528
|
+
log.info("")
|
|
529
|
+
continue
|
|
530
|
+
|
|
531
|
+
# Stop after processing the specified number of batches.
|
|
532
|
+
log.info("")
|
|
533
|
+
|
|
534
|
+
# Final statistics after all batches have been processed.
|
|
535
|
+
elapsed_time_s = time.perf_counter() - start_time
|
|
536
|
+
time_us_per_op = int(1e6 * total_time_s / total_num_ops)
|
|
537
|
+
log.info(
|
|
538
|
+
f"Processed {batch_count} "
|
|
539
|
+
f"{'batches' if batch_count > 1 else 'batch'} "
|
|
540
|
+
f"terminating update command"
|
|
541
|
+
)
|
|
542
|
+
log.info(
|
|
543
|
+
colored(
|
|
544
|
+
f"TOTAL NUM_OPS: {total_num_ops:8,}, "
|
|
545
|
+
f"TOTAL TIME: {total_time_s:4.0f} s, "
|
|
546
|
+
f"ELAPSED TIME: {elapsed_time_s:4.0f} s, "
|
|
547
|
+
f"AVG TIME/OP: {time_us_per_op:,} µs",
|
|
548
|
+
attrs=["bold"],
|
|
549
|
+
)
|
|
550
|
+
)
|
|
551
|
+
return True
|
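The new `update-wikidata` command consumes the Wikimedia SSE mutation stream, collects the `rdf_added_data` and `rdf_deleted_data` of each message into two sets of ground N-Triples, and sends each batch as a single SPARQL update with an empty WHERE clause, plus two bookkeeping triples on `wikibase:Dump` (`schema:dateModified` and `wikibase:updatesCompleteUntil`). A toy illustration of the operation that `execute` assembles, with made-up entity and dates:

# Made-up one-message batch, assembled the same way as in `execute` above.
delete_triples = {
    '<http://www.wikidata.org/entity/Q42> <http://schema.org/name> "old label"'
}
insert_triples = {
    '<http://www.wikidata.org/entity/Q42> <http://schema.org/name> "new label"',
    "<http://wikiba.se/ontology#Dump> "
    "<http://wikiba.se/ontology#updatesCompleteUntil> "
    '"2024-01-01T00:00:00Z"^^<http://www.w3.org/2001/XMLSchema#dateTime>',
}

delete_block = " . \n ".join(delete_triples)
insert_block = " . \n ".join(insert_triples)
delete_insert_operation = (
    f"DELETE {{\n {delete_block} .\n}} "
    f"INSERT {{\n {insert_block} .\n}} "
    f"WHERE {{ }}\n"
)
print(delete_insert_operation)

Because the WHERE clause is empty and all triples are ground, the operation deletes and inserts exactly the listed triples once, which is why the command can concatenate many stream messages into one update (with the caveat about chaining noted in the `--batch-size` help text).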
qlever/qlever_main.py
CHANGED