qlever 0.2.5__py3-none-any.whl → 0.5.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. qlever/Qleverfiles/Qleverfile.dblp +36 -0
  2. qlever/Qleverfiles/Qleverfile.dblp-plus +33 -0
  3. qlever/Qleverfiles/Qleverfile.dbpedia +30 -0
  4. qlever/Qleverfiles/Qleverfile.default +51 -0
  5. qlever/Qleverfiles/Qleverfile.dnb +40 -0
  6. qlever/Qleverfiles/Qleverfile.fbeasy +29 -0
  7. qlever/Qleverfiles/Qleverfile.freebase +28 -0
  8. qlever/Qleverfiles/Qleverfile.imdb +36 -0
  9. qlever/Qleverfiles/Qleverfile.ohm-planet +41 -0
  10. qlever/Qleverfiles/Qleverfile.olympics +31 -0
  11. qlever/Qleverfiles/Qleverfile.orkg +30 -0
  12. qlever/Qleverfiles/Qleverfile.osm-country +39 -0
  13. qlever/Qleverfiles/Qleverfile.osm-planet +39 -0
  14. qlever/Qleverfiles/Qleverfile.osm-planet-from-pbf +42 -0
  15. qlever/Qleverfiles/Qleverfile.pubchem +131 -0
  16. qlever/Qleverfiles/Qleverfile.scientists +29 -0
  17. qlever/Qleverfiles/Qleverfile.uniprot +74 -0
  18. qlever/Qleverfiles/Qleverfile.vvz +31 -0
  19. qlever/Qleverfiles/Qleverfile.wikidata +42 -0
  20. qlever/Qleverfiles/Qleverfile.wikipathways +40 -0
  21. qlever/Qleverfiles/Qleverfile.yago-4 +33 -0
  22. qlever/__init__.py +44 -1380
  23. qlever/command.py +87 -0
  24. qlever/commands/__init__.py +0 -0
  25. qlever/commands/add_text_index.py +115 -0
  26. qlever/commands/benchmark_queries.py +1019 -0
  27. qlever/commands/cache_stats.py +125 -0
  28. qlever/commands/clear_cache.py +88 -0
  29. qlever/commands/extract_queries.py +120 -0
  30. qlever/commands/get_data.py +48 -0
  31. qlever/commands/index.py +333 -0
  32. qlever/commands/index_stats.py +306 -0
  33. qlever/commands/log.py +66 -0
  34. qlever/commands/materialized_view.py +110 -0
  35. qlever/commands/query.py +142 -0
  36. qlever/commands/rebuild_index.py +176 -0
  37. qlever/commands/reset_updates.py +59 -0
  38. qlever/commands/settings.py +115 -0
  39. qlever/commands/setup_config.py +97 -0
  40. qlever/commands/start.py +336 -0
  41. qlever/commands/status.py +50 -0
  42. qlever/commands/stop.py +90 -0
  43. qlever/commands/system_info.py +130 -0
  44. qlever/commands/ui.py +271 -0
  45. qlever/commands/update.py +90 -0
  46. qlever/commands/update_wikidata.py +1204 -0
  47. qlever/commands/warmup.py +41 -0
  48. qlever/config.py +223 -0
  49. qlever/containerize.py +167 -0
  50. qlever/log.py +55 -0
  51. qlever/qlever_main.py +79 -0
  52. qlever/qleverfile.py +530 -0
  53. qlever/util.py +330 -0
  54. qlever-0.5.41.dist-info/METADATA +127 -0
  55. qlever-0.5.41.dist-info/RECORD +59 -0
  56. {qlever-0.2.5.dist-info → qlever-0.5.41.dist-info}/WHEEL +1 -1
  57. qlever-0.5.41.dist-info/entry_points.txt +2 -0
  58. qlever-0.5.41.dist-info/top_level.txt +1 -0
  59. build/lib/qlever/__init__.py +0 -1383
  60. build/lib/qlever/__main__.py +0 -4
  61. qlever/__main__.py +0 -4
  62. qlever-0.2.5.dist-info/METADATA +0 -277
  63. qlever-0.2.5.dist-info/RECORD +0 -12
  64. qlever-0.2.5.dist-info/entry_points.txt +0 -2
  65. qlever-0.2.5.dist-info/top_level.txt +0 -4
  66. src/qlever/__init__.py +0 -1383
  67. src/qlever/__main__.py +0 -4
  68. {qlever-0.2.5.dist-info → qlever-0.5.41.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,1204 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import os
6
+ import re
7
+ import signal
8
+ import time
9
+ from datetime import datetime, timezone
10
+ from enum import Enum, auto
11
+
12
+ import rdflib.term
13
+ import requests_sse
14
+ from rdflib import Graph
15
+ from termcolor import colored
16
+ from tqdm.contrib.logging import tqdm_logging_redirect
17
+
18
+ from qlever.command import QleverCommand
19
+ from qlever.log import log
20
+ from qlever.util import run_command
21
+
22
+
23
+ # Monkey patch `rdflib.term._castLexicalToPython` to avoid casting of literals
24
+ # to Python types. We do not need it (all we want is to convert Turtle to N-Triples),
25
+ # and we can speed up parsing by a factor of about 2.
26
+ def custom_cast_lexical_to_python(lexical, datatype):
27
+ return None  # Skip the cast; only the lexical form is needed here.
28
+
29
+
30
+ rdflib.term._castLexicalToPython = custom_cast_lexical_to_python
31
+
32
+
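A minimal standalone sketch of why the cast can be skipped (only `rdflib` is assumed; the data is illustrative): converting Turtle to N-Triples only ever needs the lexical `.n3()` form of each node, never a Python value.

    from rdflib import Graph

    # Illustrative data; any Turtle input works the same way.
    turtle_data = '@prefix ex: <http://example.org/> . ex:s ex:p "42"^^ex:int .'
    graph = Graph()
    graph.parse(data=turtle_data, format="turtle")
    for s, p, o in graph:
        # Only the N-Triples serialization is needed, so casting "42" to a
        # Python int (what `_castLexicalToPython` would do) is wasted work.
        print(f"{s.n3()} {p.n3()} {o.n3()} .")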
33
+ def retry_with_backoff(operation, operation_name, max_retries, log):
34
+ """
35
+ Retry an operation with exponential backoff, see backoff intervals below
36
+ (in seconds). Returns the result of the operation if successful, or raises
37
+ the last exception.
38
+ """
39
+ backoff_intervals = [5, 10, 30, 60, 300, 900, 1800, 3600]
40
+
41
+ for attempt in range(max_retries):
42
+ try:
43
+ return operation()
44
+ except Exception as e:
45
+ if attempt < max_retries - 1:
46
+ # Use the appropriate backoff interval (once we get to the end
47
+ # of the list, keep using the last interval).
48
+ retry_delay = (
49
+ backoff_intervals[attempt]
50
+ if attempt < len(backoff_intervals)
51
+ else backoff_intervals[-1]
52
+ )
53
+ # Show the delay as seconds, minutes, or hours.
54
+ if retry_delay >= 3600:
55
+ delay_str = f"{retry_delay // 3600}h"
56
+ elif retry_delay >= 60:
57
+ delay_str = f"{retry_delay // 60}min"
58
+ else:
59
+ delay_str = f"{retry_delay}s"
60
+ log.warn(
61
+ f"{operation_name} failed (attempt {attempt + 1}/{max_retries}): {e}. "
62
+ f"Retrying in {delay_str} ..."
63
+ )
64
+ time.sleep(retry_delay)
65
+ else:
66
+ # If this was the last attempt, re-raise the exception.
67
+ raise
68
+
69
+
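A usage sketch for `retry_with_backoff`; the operation and logger below are hypothetical and only serve to illustrate the calling convention.

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("example")

    def flaky_operation():
        # Hypothetical operation that may fail, e.g. a network request.
        raise ConnectionError("temporarily unavailable")

    try:
        # Sleeps 5s and 10s between the three attempts, then re-raises.
        retry_with_backoff(flaky_operation, "Flaky operation", 3, logger)
    except ConnectionError as e:
        logger.error("gave up: %s", e)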
70
+ def connect_to_sse_stream(sse_stream_url, since=None, event_id=None):
71
+ """
72
+ Connect to the SSE stream and return the connected EventSource.
73
+
74
+ Args:
75
+ sse_stream_url: URL of the SSE stream
76
+ since: ISO date string to start from (mutually exclusive with event_id)
77
+ event_id: Event ID to resume from (mutually exclusive with since)
78
+
79
+ Returns:
80
+ The connected EventSource object
81
+ """
82
+ if event_id:
83
+ event_id_json = json.dumps(event_id)
84
+ source = requests_sse.EventSource(
85
+ sse_stream_url,
86
+ headers={
87
+ "Accept": "text/event-stream",
88
+ "User-Agent": "qlever update-wikidata",
89
+ "Last-Event-ID": event_id_json,
90
+ },
91
+ )
92
+ else:
93
+ source = requests_sse.EventSource(
94
+ sse_stream_url,
95
+ params={"since": since} if since else {},
96
+ headers={
97
+ "Accept": "text/event-stream",
98
+ "User-Agent": "qlever update-wikidata",
99
+ },
100
+ )
101
+
102
+ source.connect()
103
+ return source
104
+
105
+
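For reference, the `Last-Event-ID` header sent in the `event_id` branch above is a JSON-encoded list of per-partition stream positions; a sketch of its shape (the offset value is made up for illustration):

    import json

    # Same shape as the event IDs constructed further below in `execute`.
    event_id = [
        {
            "topic": "eqiad.rdf-streaming-updater.mutation",
            "partition": 0,
            "offset": 123456789,
        }
    ]
    print(json.dumps(event_id))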
106
+ class UpdateWikidataCommand(QleverCommand):
107
+ """
108
+ Class for executing the `update-wikidata` command.
109
+ """
110
+
111
+ def __init__(self):
112
+ # SPARQL query to get the date until which the updates of the
113
+ # SPARQL endpoint are complete.
114
+ self.sparql_updates_complete_until_query = (
115
+ "PREFIX wikibase: <http://wikiba.se/ontology#> "
116
+ "PREFIX schema: <http://schema.org/> "
117
+ "SELECT * WHERE { "
118
+ "{ SELECT (MIN(?date_modified) AS ?updates_complete_until) { "
119
+ "wikibase:Dump schema:dateModified ?date_modified } } "
120
+ "UNION { wikibase:Dump wikibase:updatesCompleteUntil ?updates_complete_until } "
121
+ "} ORDER BY DESC(?updates_complete_until) LIMIT 1"
122
+ )
123
+ # URL of the Wikidata SSE stream.
124
+ self.wikidata_update_stream_url = (
125
+ "https://stream.wikimedia.org/v2/"
126
+ "stream/rdf-streaming-updater.mutation.v2"
127
+ )
128
+ # Remember if Ctrl+C was pressed, so we can handle it gracefully.
129
+ self.ctrl_c_pressed = False
130
+ # Set to `True` when finished.
131
+ self.finished = False
132
+
133
+ def description(self) -> str:
134
+ return "Update from given SSE stream"
135
+
136
+ def should_have_qleverfile(self) -> bool:
137
+ return True
138
+
139
+ def relevant_qleverfile_arguments(self) -> dict[str, list[str]]:
140
+ return {"server": ["host_name", "port", "access_token"]}
141
+
142
+ def additional_arguments(self, subparser) -> None:
143
+ subparser.add_argument(
144
+ "sse_stream_url",
145
+ nargs="?",
146
+ type=str,
147
+ default=self.wikidata_update_stream_url,
148
+ help="URL of the SSE stream to update from",
149
+ )
150
+ subparser.add_argument(
151
+ "--batch-size",
152
+ type=int,
153
+ default=100000,
154
+ help="Group this many messages together into one update "
155
+ "(default: one update for each message); NOTE: this simply "
156
+ "concatenates the `rdf_added_data` and `rdf_deleted_data` fields, "
157
+ "which is not 100%% correct; as soon as chaining is supported, "
158
+ "this will be fixed",
159
+ )
160
+ subparser.add_argument(
161
+ "--lag-seconds",
162
+ type=int,
163
+ default=1,
164
+ help="When a message is encountered that is within this many "
165
+ "seconds of the current time, finish the current batch "
166
+ "(and show a warning that this happened)",
167
+ )
168
+ subparser.add_argument(
169
+ "--since",
170
+ type=str,
171
+ help="Consume stream messages since this date "
172
+ "(default: determine automatically from the SPARQL endpoint)",
173
+ )
174
+ subparser.add_argument(
175
+ "--until",
176
+ type=str,
177
+ help="Stop consuming stream messages when reaching this date "
178
+ "(default: continue indefinitely)",
179
+ )
180
+ subparser.add_argument(
181
+ "--offset",
182
+ type=int,
183
+ help="Consume stream messages starting from this offset "
184
+ "(default: not set)",
185
+ )
186
+ subparser.add_argument(
187
+ "--topic",
188
+ type=str,
189
+ choices=[
190
+ "eqiad.rdf-streaming-updater.mutation",
191
+ "codfw.rdf-streaming-updater.mutation",
192
+ ],
193
+ default="eqiad.rdf-streaming-updater.mutation",
194
+ help="The topic to consume from the SSE stream (default: "
195
+ "eqiad.rdf-streaming-updater.mutation)",
196
+ )
197
+ subparser.add_argument(
198
+ "--partition",
199
+ type=int,
200
+ default=0,
201
+ help="The partition to consume from the SSE stream (default: 0)",
202
+ )
203
+ subparser.add_argument(
204
+ "--min-or-max-date",
205
+ choices=["min", "max"],
206
+ default="max",
207
+ help="Use the minimum or maximum date of the batch for the "
208
+ "`updatesCompleteUntil` property (default: maximum)",
209
+ )
210
+ subparser.add_argument(
211
+ "--wait-between-batches",
212
+ type=int,
213
+ default=300,
214
+ help="Wait this many seconds between batches that were "
215
+ "finished due to a message that is within `lag_seconds` of "
216
+ "the current time (default: 300s)",
217
+ )
218
+ subparser.add_argument(
219
+ "--num-messages",
220
+ type=int,
221
+ help="Process exactly this many messages and then exit "
222
+ "(default: no bound on the number of messages)",
223
+ )
224
+ subparser.add_argument(
225
+ "--verbose",
226
+ choices=["no", "yes"],
227
+ default="yes",
228
+ help='Verbose logging, "yes" or "no" (default: "yes")',
229
+ )
230
+ subparser.add_argument(
231
+ "--use-cached-sparql-queries",
232
+ action="store_true",
233
+ help="Use cached SPARQL query files if they exist with matching "
234
+ "offset and target batch size (default: off)",
235
+ )
236
+ subparser.add_argument(
237
+ "--check-offset-before-each-batch",
238
+ choices=["yes", "no"],
239
+ default="yes",
240
+ help="Before each batch, verify that the stream offset matches the "
241
+ "stored offset in the knowledge base (default: yes)",
242
+ )
243
+ subparser.add_argument(
244
+ "--num-retries",
245
+ type=int,
246
+ default=10,
247
+ help="Number of retries for offset verification queries when they fail "
248
+ "(default: 10)",
249
+ )
250
+
251
+ # Handle Ctrl+C gracefully by finishing the current batch and then exiting.
252
+ def handle_ctrl_c(self, signal_received, frame):
253
+ if self.ctrl_c_pressed:
254
+ log.warn("\rCtrl+C pressed again, watch your blood pressure")
255
+ else:
256
+ self.ctrl_c_pressed = True
257
+
258
+ def execute(self, args) -> bool:
259
+ # cURL command to get the date until which the updates of the
260
+ # SPARQL endpoint are complete.
261
+ sparql_endpoint = f"http://{args.host_name}:{args.port}"
262
+ curl_cmd_updates_complete_until = (
263
+ f"curl -s {sparql_endpoint}"
264
+ f' -H "Accept: text/csv"'
265
+ f' -H "Content-type: application/sparql-query"'
266
+ f' --data "{self.sparql_updates_complete_until_query}"'
267
+ )
268
+
269
+ # Construct the command and show it.
270
+ cmd_description = []
271
+ if args.since:
272
+ cmd_description.append(f"SINCE={args.since}")
273
+ else:
274
+ cmd_description.append(
275
+ f"SINCE=$({curl_cmd_updates_complete_until} | sed 1d)"
276
+ )
277
+ if args.until:
278
+ cmd_description.append(f"UNTIL={args.until}")
279
+ cmd_description.append(
280
+ f"Process SSE stream from {args.sse_stream_url} "
281
+ f"in batches of up to {args.batch_size:,} messages "
282
+ )
283
+ self.show("\n".join(cmd_description), only_show=args.show)
284
+ if args.show:
285
+ return True
286
+
287
+ # Compute the `since` date if not given.
288
+ if args.since:
289
+ since = args.since
290
+ else:
291
+ try:
292
+ since = run_command(
293
+ f"{curl_cmd_updates_complete_until} | sed 1d",
294
+ return_output=True,
295
+ ).strip()
296
+ except Exception as e:
297
+ log.error(
298
+ f"Error running `{curl_cmd_updates_complete_until}`: {e}"
299
+ )
300
+ return False
301
+
302
+ # Special handling of Ctrl+C, see `handle_ctrl_c` above.
303
+ signal.signal(signal.SIGINT, self.handle_ctrl_c)
304
+ log.warn("Press Ctrl+C to finish and exit gracefully")
305
+ log.info("")
306
+
307
+ # If --offset is not provided, first try to get the stored offset from
308
+ # the knowledge base. Only fall back to date-based approach if no
309
+ # offset is stored.
310
+ if not args.offset:
311
+ try:
312
+ sparql_query_stored_offset = (
313
+ "PREFIX wikibase: <http://wikiba.se/ontology#> "
314
+ "SELECT (MAX(?offset) AS ?maxOffset) WHERE { "
315
+ "<http://wikiba.se/ontology#Dump> "
316
+ "wikibase:updateStreamNextOffset ?offset "
317
+ "}"
318
+ )
319
+ curl_cmd_get_stored_offset = (
320
+ f"curl -s {sparql_endpoint}"
321
+ f' -H "Accept: text/csv"'
322
+ f' -H "Content-type: application/sparql-query"'
323
+ f' --data "{sparql_query_stored_offset}"'
324
+ )
325
+ result = run_command(
326
+ f"{curl_cmd_get_stored_offset} | sed 1d",
327
+ return_output=True,
328
+ ).strip()
329
+ if result and result != '""':
330
+ args.offset = int(result.strip('"'))
331
+ log.info(
332
+ f"Resuming from stored offset in knowledge base: "
333
+ f"{args.offset}"
334
+ )
335
+ except Exception as e:
336
+ log.debug(
337
+ f"Could not retrieve stored offset from knowledge base: {e}. "
338
+ f"Will determine offset from date instead."
339
+ )
340
+
341
+ # If --offset is still not set, determine it by reading a single
342
+ # message from the SSE stream using the `since` date.
343
+ if not args.offset:
344
+ try:
345
+ source = retry_with_backoff(
346
+ lambda: connect_to_sse_stream(
347
+ args.sse_stream_url, since=since
348
+ ),
349
+ "SSE stream connection",
350
+ args.num_retries,
351
+ log,
352
+ )
353
+ offset = None
354
+ for event in source:
355
+ if event.type == "message" and event.data:
356
+ event_data = json.loads(event.data)
357
+ event_topic = event_data.get("meta").get("topic")
358
+ if event_topic == args.topic:
359
+ offset = event_data.get("meta").get("offset")
360
+ log.debug(
361
+ f"Determined offset from date: {since} -> {offset}"
362
+ )
363
+ break
364
+ source.close()
365
+ if offset is None:
366
+ raise Exception(
367
+ f"No event with topic {args.topic} found in stream"
368
+ )
369
+ args.offset = offset
370
+ except Exception as e:
371
+ log.error(f"Error determining offset from stream: {e}")
372
+ return False
373
+
374
+ # Initialize all the statistics variables.
375
+ batch_count = 0
376
+ total_num_messages = 0
377
+ total_update_time = 0
378
+ start_time = time.perf_counter()
379
+ wait_before_next_batch = False
380
+ event_id_for_next_batch = (
381
+ [
382
+ {
383
+ "topic": args.topic,
384
+ "partition": args.partition,
385
+ "offset": args.offset,
386
+ }
387
+ ]
388
+ if args.offset
389
+ else None
390
+ )
391
+
392
+ # Track whether this is the first batch (to skip offset check)
393
+ first_batch = True
394
+
395
+ # Main event loop: Either resume from `event_id_for_next_batch` (if set),
396
+ # or start a new connection to `args.sse_stream_url` (with URL
397
+ # parameter `?since=`).
398
+ while True:
399
+ # Optionally wait before processing the next batch (make sure that
400
+ # the wait is interruptible by Ctrl+C).
401
+ if wait_before_next_batch:
402
+ log.info(
403
+ f"Waiting {args.wait_between_batches} "
404
+ f"second{'s' if args.wait_between_batches > 1 else ''} "
405
+ f"before processing the next batch"
406
+ )
407
+ log.info("")
408
+ wait_before_next_batch = False
409
+ for _ in range(args.wait_between_batches):
410
+ if self.ctrl_c_pressed:
411
+ break
412
+ time.sleep(1)
413
+ if self.ctrl_c_pressed:
414
+ log.warn(
415
+ "\rCtrl+C pressed while waiting in between batches, "
416
+ "exiting"
417
+ )
418
+ break
419
+
420
+ # Start stream from either `event_id_for_next_batch` or `since`.
421
+ # We'll extract the offset for first_offset_in_batch later.
422
+ if event_id_for_next_batch:
423
+ event_id_json = json.dumps(event_id_for_next_batch)
424
+ if args.verbose == "yes":
425
+ log.info(
426
+ colored(
427
+ f"Consuming stream from event ID: {event_id_json}",
428
+ attrs=["dark"],
429
+ )
430
+ )
431
+ else:
432
+ if args.verbose == "yes":
433
+ log.info(
434
+ colored(
435
+ f"Consuming stream from date: {since}",
436
+ attrs=["dark"],
437
+ )
438
+ )
439
+
440
+ # Connect to the SSE stream with retry logic
441
+ try:
442
+ source = retry_with_backoff(
443
+ lambda: connect_to_sse_stream(
444
+ args.sse_stream_url,
445
+ since=since if not event_id_for_next_batch else None,
446
+ event_id=event_id_for_next_batch,
447
+ ),
448
+ "SSE stream connection for batch processing",
449
+ args.num_retries,
450
+ log,
451
+ )
452
+ except Exception as e:
453
+ log.error(
454
+ f"Failed to connect to SSE stream after "
455
+ f"{args.num_retries} retry attempts, last error: {e}"
456
+ )
457
+ break
458
+
459
+ # Next comes the inner loop, which processes exactly one "batch" of
460
+ # messages. The batch is completed (simply using `break`) when either
461
+ # `args.batch_size` messages have been processed, or when one of a
462
+ # variety of conditions occur (Ctrl+C pressed, message within
463
+ # `args.lag_seconds` of current time, delete operation followed by
464
+ # insert of triple with that entity as subject).
465
+
466
+ # Initialize all the batch variables.
467
+ current_batch_size = 0
468
+ # Extract the offset from the event ID to use as the starting offset
469
+ # for this batch. This is set before processing any messages.
470
+ if event_id_for_next_batch:
471
+ first_offset_in_batch = event_id_for_next_batch[0]["offset"]
472
+ event_id_for_next_batch = None
473
+ else:
474
+ # This should not happen since we now always determine the offset
475
+ # before starting, but keep as fallback
476
+ first_offset_in_batch = None
477
+
478
+ # Check that the stream offset matches the stored offset in the KB
479
+ # Skip this check on the first batch (when using --offset to resume)
480
+ if (
481
+ args.check_offset_before_each_batch == "yes"
482
+ and not first_batch
483
+ and first_offset_in_batch is not None
484
+ ):
485
+ sparql_query_offset = (
486
+ "PREFIX wikibase: <http://wikiba.se/ontology#> "
487
+ "SELECT (MAX(?offset) AS ?maxOffset) WHERE { "
488
+ "<http://wikiba.se/ontology#Dump> "
489
+ "wikibase:updateStreamNextOffset ?offset "
490
+ "}"
491
+ )
492
+ curl_cmd_check_offset = (
493
+ f"curl -s {sparql_endpoint}"
494
+ f' -H "Accept: text/csv"'
495
+ f' -H "Content-type: application/sparql-query"'
496
+ f' --data "{sparql_query_offset}"'
497
+ )
498
+ # Verify offset with retry logic
499
+ try:
500
+ result = retry_with_backoff(
501
+ lambda: run_command(
502
+ f"{curl_cmd_check_offset} | sed 1d",
503
+ return_output=True,
504
+ ).strip(),
505
+ "Offset verification",
506
+ args.num_retries,
507
+ log,
508
+ )
509
+ if not result:
510
+ log.error(
511
+ "Failed to retrieve stored offset from knowledge base: "
512
+ "query returned no results. This might be the first update, "
513
+ "or the offset triple is missing."
514
+ )
515
+ return False
516
+ stored_offset = int(result.strip('"'))
517
+ if stored_offset != first_offset_in_batch:
518
+ log.error(
519
+ f"Offset mismatch: stream offset is {first_offset_in_batch}, "
520
+ f"but stored offset in knowledge base is {stored_offset}. "
521
+ f"This indicates that updates may have been applied "
522
+ f"out of order or some updates are missing."
523
+ )
524
+ return False
525
+ except Exception as e:
526
+ log.error(
527
+ f"Failed to retrieve or verify stored offset from "
528
+ f"SPARQL endpoint after {args.num_retries} retry; "
529
+ f"last error: {e}"
530
+ )
531
+ return False
532
+
533
+ date_list = []
534
+ delete_entity_ids = set()
535
+ delta_to_now_list = []
536
+ batch_assembly_start_time = time.perf_counter()
537
+ insert_triples = set()
538
+ delete_triples = set()
539
+
540
+ # Check if we can use a cached SPARQL query file
541
+ use_cached_file = False
542
+ cached_file_name = None
543
+ cached_meta_file_name = None
544
+ cached_date_range = None
545
+ if (
546
+ args.use_cached_sparql_queries
547
+ and first_offset_in_batch is not None
548
+ ):
549
+ cached_file_name = (
550
+ f"update.{first_offset_in_batch}.{args.batch_size}.sparql"
551
+ )
552
+ cached_meta_file_name = (
553
+ f"update.{first_offset_in_batch}.{args.batch_size}.meta"
554
+ )
555
+ if os.path.exists(cached_file_name):
556
+ use_cached_file = True
557
+ # Try to read metadata file for date range
558
+ if os.path.exists(cached_meta_file_name):
559
+ try:
560
+ with open(cached_meta_file_name, "r") as f:
561
+ cached_date_range = f.read().strip()
562
+ except Exception:
563
+ cached_date_range = None
564
+
565
+ if args.verbose == "yes":
566
+ log_msg = f"Using cached SPARQL query file: {cached_file_name}"
567
+ if cached_date_range:
568
+ log_msg += f" [date range: {cached_date_range}]"
569
+ log.info(colored(log_msg, "cyan"))
570
+
571
+ # Process one event at a time (unless using cached file).
572
+ if not use_cached_file:
573
+ with tqdm_logging_redirect(
574
+ loggers=[logging.getLogger("qlever")],
575
+ desc="Batch",
576
+ total=args.batch_size,
577
+ leave=False,
578
+ bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}{postfix}",
579
+ ) as pbar:
580
+ for event in source:
581
+ # Skip events that are not of type `message` (should not
582
+ # happen), have no field `data` (should not happen either), or
583
+ # whose topic does not match `args.topic` (one topic by itself
584
+ # should provide all relevant updates).
585
+ if event.type != "message" or not event.data:
586
+ continue
587
+ event_data = json.loads(event.data)
588
+ topic = event_data.get("meta").get("topic")
589
+ if topic != args.topic:
590
+ continue
591
+
592
+ try:
593
+ # Extract offset, topic, and partition from the message metadata
594
+ # to construct a precise event ID for resuming.
595
+ meta = event_data.get("meta")
596
+ offset = meta.get("offset")
597
+ topic = meta.get("topic")
598
+ partition = meta.get("partition")
599
+
600
+ # Get the date (rounded *down* to seconds).
601
+ date = meta.get("dt")
602
+ date = re.sub(r"\.\d*Z$", "Z", date)
603
+
604
+ # Get the other relevant fields from the message.
605
+ entity_id = event_data.get("entity_id")
606
+ operation = event_data.get("operation")
607
+ rdf_added_data = event_data.get("rdf_added_data")
608
+ rdf_deleted_data = event_data.get(
609
+ "rdf_deleted_data"
610
+ )
611
+ rdf_linked_shared_data = event_data.get(
612
+ "rdf_linked_shared_data"
613
+ )
614
+ rdf_unlinked_shared_data = event_data.get(
615
+ "rdf_unlinked_shared_data"
616
+ )
617
+
618
+ # Check batch completion conditions BEFORE processing the
619
+ # data of this message. If any of the conditions is met,
620
+ # we finish the batch and resume from the LAST PROCESSED
621
+ # message (not the current one that triggered the break).
622
+ #
623
+ # NOTE: We will update event_id_for_next_batch AFTER
624
+ # successfully processing each message (see below), so that
625
+ # when we break, it contains the last processed event ID.
626
+ since = None
627
+
628
+ # Condition 1: Delete followed by insert for same entity.
629
+ operation_adds_data = (
630
+ rdf_added_data is not None
631
+ or rdf_linked_shared_data is not None
632
+ )
633
+ if (
634
+ operation_adds_data
635
+ and entity_id in delete_entity_ids
636
+ ):
637
+ if args.verbose == "yes":
638
+ log.warn(
639
+ f"Encountered operation that adds data for "
640
+ f"an entity ID ({entity_id}) that was deleted "
641
+ f"earlier in this batch; finishing batch and "
642
+ f"resuming from this message in the next batch"
643
+ )
644
+ break
645
+
646
+ # Condition 2: Batch size or limit on number of
647
+ # messages reached.
648
+ if current_batch_size >= args.batch_size or (
649
+ args.num_messages is not None
650
+ and total_num_messages >= args.num_messages
651
+ ):
652
+ break
653
+
654
+ # Condition 3: Message close to current time.
655
+ date_obj = datetime.strptime(
656
+ date, "%Y-%m-%dT%H:%M:%SZ"
657
+ ).replace(tzinfo=timezone.utc)
658
+ date_as_epoch_s = date_obj.timestamp()
659
+
660
+ now_as_epoch_s = time.time()
661
+ delta_to_now_s = now_as_epoch_s - date_as_epoch_s
662
+ if (
663
+ delta_to_now_s < args.lag_seconds
664
+ and current_batch_size > 0
665
+ ):
666
+ if args.verbose == "yes":
667
+ log.warn(
668
+ f"Encountered message with date {date}, which is within "
669
+ f"{args.lag_seconds} "
670
+ f"second{'s' if args.lag_seconds > 1 else ''} "
671
+ f"of the current time, finishing the current batch"
672
+ )
673
+ wait_before_next_batch = (
674
+ args.wait_between_batches is not None
675
+ and args.wait_between_batches > 0
676
+ )
677
+ break
678
+
679
+ # Condition 4: Reached `--until` date and at least one
680
+ # message was processed.
681
+ if (
682
+ args.until
683
+ and date >= args.until
684
+ and current_batch_size > 0
685
+ ):
686
+ log.warn(
687
+ f"Reached --until date {args.until} "
688
+ f"(message date: {date}), that's it folks"
689
+ )
690
+ self.finished = True
691
+ break
692
+
693
+ # Delete operations are postponed until the end of the
694
+ # batch, so remember the entity ID here.
695
+ if operation == "delete":
696
+ delete_entity_ids.add(entity_id)
697
+
698
+ # Replace each occurrence of `\\` by `\u005C\u005C`
699
+ # (which is twice the Unicode for backslash).
700
+ #
701
+ # NOTE: Strictly speaking, it would be enough to do
702
+ # this for two backslashes followed by a `u`, but
703
+ # doing it for all double backslashes does no
704
+ # harm. When parsing a SPARQL query, according
705
+ # to the standard, first all occurrences of `\uxxxx`
706
+ # (where `xxxx` are four hex digits) are replaced
707
+ # by the corresponding Unicode character. That is a
708
+ # problem when `\\uxxxx` occurs in a literal,
709
+ # because then it would be replaced by `\` followed
710
+ # by the Unicode character, which is invalid
711
+ # SPARQL. The substitution avoids that problem.
712
+ def node_to_sparql(node: rdflib.term.Node) -> str:
713
+ return node.n3().replace(
714
+ "\\\\", "\\u005C\\u005C"
715
+ )
716
+
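A worked sketch of the escaping problem described in the comment above (the literal is illustrative; only `rdflib` is assumed):

    import rdflib

    # A literal whose value contains a real backslash followed by "u0041".
    lit = rdflib.Literal("a backslash: \\u0041")
    n3 = lit.n3()
    # The serialization escapes the backslash, so `n3` contains `\\u0041`.
    # A parser that expands \uXXXX codepoint escapes first would turn the
    # inner `\u0041` into `A`, leaving a lone backslash -- invalid SPARQL.
    safe = n3.replace("\\\\", "\\u005C\\u005C")
    # `safe` contains `\u005C\u005Cu0041`, which the codepoint pass expands
    # back into a properly escaped backslash followed by `u0041`.
    print(safe)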
717
+ # Process the to-be-deleted triples.
718
+ for rdf_to_be_deleted in (
719
+ rdf_deleted_data,
720
+ rdf_unlinked_shared_data,
721
+ ):
722
+ if rdf_to_be_deleted is not None:
723
+ try:
724
+ rdf_to_be_deleted_data = (
725
+ rdf_to_be_deleted.get("data")
726
+ )
727
+ graph = Graph()
728
+ log.debug(
729
+ f"RDF to_be_deleted data: {rdf_to_be_deleted_data}"
730
+ )
731
+ graph.parse(
732
+ data=rdf_to_be_deleted_data,
733
+ format="turtle",
734
+ )
735
+ for s, p, o in graph:
736
+ triple = f"{s.n3()} {p.n3()} {node_to_sparql(o)}"
737
+ # NOTE: In case there was a previous `insert` of that
738
+ # triple, it is safe to remove that `insert`, but not
739
+ # the `delete` (in case the triple is contained in the
740
+ # original data).
741
+ if triple in insert_triples:
742
+ insert_triples.remove(triple)
743
+ delete_triples.add(triple)
744
+ except Exception as e:
745
+ log.error(
746
+ f"Error reading `rdf_to_be_deleted_data`: {e}"
747
+ )
748
+ return False
749
+
750
+ # Process the to-be-added triples.
751
+ for rdf_to_be_added in (
752
+ rdf_added_data,
753
+ rdf_linked_shared_data,
754
+ ):
755
+ if rdf_to_be_added is not None:
756
+ try:
757
+ rdf_to_be_added_data = (
758
+ rdf_to_be_added.get("data")
759
+ )
760
+ graph = Graph()
761
+ log.debug(
762
+ "RDF to be added data: {rdf_to_be_added_data}"
763
+ )
764
+ graph.parse(
765
+ data=rdf_to_be_added_data,
766
+ format="turtle",
767
+ )
768
+ for s, p, o in graph:
769
+ triple = f"{s.n3()} {p.n3()} {node_to_sparql(o)}"
770
+ # NOTE: In case there was a previous `delete` of that
771
+ # triple, it is safe to remove that `delete`, but not
772
+ # the `insert` (in case the triple is not contained in
773
+ # the original data).
774
+ if triple in delete_triples:
775
+ delete_triples.remove(triple)
776
+ insert_triples.add(triple)
777
+ except Exception as e:
778
+ log.error(
779
+ f"Error reading `rdf_to_be_added_data`: {e}"
780
+ )
781
+ return False
782
+
783
+ except Exception as e:
784
+ log.error(f"Error reading data from message: {e}")
785
+ log.info(event)
786
+ continue
787
+
788
+ # Message was successfully processed, update batch tracking
789
+ current_batch_size += 1
790
+ total_num_messages += 1
791
+ pbar_update_frequency = 100
792
+ if (current_batch_size % pbar_update_frequency) == 0:
793
+ pbar.set_postfix(
794
+ {
795
+ "Time": date_obj.strftime(
796
+ "%Y-%m-%d %H:%M:%S"
797
+ )
798
+ }
799
+ )
800
+ pbar.update(pbar_update_frequency)
801
+ log.debug(
802
+ f"DATE: {date_as_epoch_s:.0f} [{date}], "
803
+ f"NOW: {now_as_epoch_s:.0f}, "
804
+ f"DELTA: {now_as_epoch_s - date_as_epoch_s:.0f}"
805
+ )
806
+ date_list.append(date)
807
+ delta_to_now_list.append(delta_to_now_s)
808
+
809
+ # Update the event ID for the next batch. We increment the
810
+ # offset by 1 so that the next batch starts with the next
811
+ # message (not re-processing the current one).
812
+ event_id_for_next_batch = [
813
+ {
814
+ "topic": topic,
815
+ "partition": partition,
816
+ "offset": offset + 1,
817
+ }
818
+ ]
819
+
820
+ # Ctrl+C finishes the current batch (this should come at the
821
+ # end of the inner event loop so that always at least one
822
+ # message is processed).
823
+ if self.ctrl_c_pressed:
824
+ log.warn(
825
+ "\rCtrl+C pressed while processing a batch, "
826
+ "finishing it and exiting"
827
+ )
828
+ break
829
+ else:
830
+ # Using cached file - set batch size and calculate next offset
831
+ current_batch_size = args.batch_size
832
+ total_num_messages += current_batch_size
833
+ event_id_for_next_batch = [
834
+ {
835
+ "topic": args.topic,
836
+ "partition": args.partition,
837
+ "offset": first_offset_in_batch + current_batch_size,
838
+ }
839
+ ]
840
+
841
+ # Process the current batch of messages (or skip if using cached).
842
+ batch_count += 1
843
+ if not use_cached_file:
844
+ batch_assembly_end_time = time.perf_counter()
845
+ batch_assembly_time_ms = int(
846
+ 1000
847
+ * (batch_assembly_end_time - batch_assembly_start_time)
848
+ )
849
+ date_list.sort()
850
+ delta_to_now_list.sort()
851
+ min_delta_to_now_s = delta_to_now_list[0]
852
+ if min_delta_to_now_s < 10:
853
+ min_delta_to_now_s = f"{min_delta_to_now_s:.1f}"
854
+ else:
855
+ min_delta_to_now_s = f"{int(min_delta_to_now_s):,}"
856
+ log.info(
857
+ f"Assembled batch #{batch_count}, "
858
+ f"#messages: {current_batch_size:2,}, "
859
+ f"date range: {date_list[0]} - {date_list[-1]} "
860
+ f"[assembly time: {batch_assembly_time_ms:3,}ms, "
861
+ f"min delta to NOW: {min_delta_to_now_s}s]"
862
+ )
863
+
864
+ # Add the min and max date of the batch to `insert_triples`.
865
+ #
866
+ # NOTE: The min date means that we have *all* updates until that
867
+ # date. The max date is the date of the latest update we have seen.
868
+ # However, there may still be earlier updates that we have not seen
869
+ # yet. Wikidata uses `schema:dateModified` for the latter semantics,
870
+ # so we use it here as well. For the other semantics, we invent
871
+ # a new property `wikibase:updatesCompleteUntil`.
872
+ insert_triples.add(
873
+ f"<http://wikiba.se/ontology#Dump> "
874
+ f"<http://schema.org/dateModified> "
875
+ f'"{date_list[-1]}"^^<http://www.w3.org/2001/XMLSchema#dateTime>'
876
+ )
877
+ updates_complete_until = (
878
+ date_list[-1]
879
+ if args.min_or_max_date == "max"
880
+ else date_list[0]
881
+ )
882
+ insert_triples.add(
883
+ f"<http://wikiba.se/ontology#Dump> "
884
+ f"<http://wikiba.se/ontology#updatesCompleteUntil> "
885
+ f'"{updates_complete_until}"'
886
+ f"^^<http://www.w3.org/2001/XMLSchema#dateTime>"
887
+ )
888
+ insert_triples.add(
889
+ "<http://wikiba.se/ontology#Dump> "
890
+ "<http://wikiba.se/ontology#updateStreamNextOffset> "
891
+ f'"{event_id_for_next_batch[0]["offset"]}"'
892
+ )
893
+
894
+ # Construct UPDATE operation.
895
+ delete_block = " . \n ".join(delete_triples)
896
+ insert_block = " . \n ".join(insert_triples)
897
+ delete_insert_operation = (
898
+ f"DELETE {{\n {delete_block} \n}} "
899
+ f"INSERT {{\n {insert_block} \n}} "
900
+ f"WHERE {{ }}\n"
901
+ )
902
+
903
+ # If `delete_entity_ids` is non-empty, add a `DELETE WHERE`
904
+ # operation that deletes all triples that are associated with only
905
+ # those entities.
906
+ delete_entity_ids_as_values = " ".join(
907
+ [f"wd:{qid}" for qid in delete_entity_ids]
908
+ )
909
+ if len(delete_entity_ids) > 0:
910
+ delete_where_operation = (
911
+ f"PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n"
912
+ f"PREFIX wikibase: <http://wikiba.se/ontology#>\n"
913
+ f"PREFIX wd: <http://www.wikidata.org/entity/>\n"
914
+ f"DELETE {{\n"
915
+ f" ?s ?p ?o .\n"
916
+ f"}} WHERE {{\n"
917
+ f" {{\n"
918
+ f" VALUES ?s {{ {delete_entity_ids_as_values} }}\n"
919
+ f" ?s ?p ?o .\n"
920
+ f" }} UNION {{\n"
921
+ f" VALUES ?_1 {{ {delete_entity_ids_as_values} }}\n"
922
+ f" ?_1 ?_2 ?s .\n"
923
+ f" ?s ?p ?o .\n"
924
+ f" ?s rdf:type wikibase:Statement .\n"
925
+ f" }}\n"
926
+ f"}}\n"
927
+ )
928
+ delete_insert_operation += ";\n" + delete_where_operation
929
+
930
+ # Construct curl command. Write the update operation to a file and
931
+ # send it via `--data-binary`.
932
+ curl_cmd = (
933
+ f"curl -s -X POST"
934
+ f' "{sparql_endpoint}?access-token={args.access_token}"'
935
+ f" -H 'Content-Type: application/sparql-update'"
936
+ )
937
+ if use_cached_file:
938
+ # Use the cached file instead of writing a new one
939
+ update_arg_file_name = cached_file_name
940
+ else:
941
+ # Write the constructed SPARQL update to a file
942
+ update_arg_file_name = f"update.{first_offset_in_batch}.{current_batch_size}.sparql"
943
+ with open(update_arg_file_name, "w") as f:
944
+ f.write(delete_insert_operation)
945
+ # Write metadata file with date range
946
+ meta_file_name = (
947
+ f"update.{first_offset_in_batch}.{current_batch_size}.meta"
948
+ )
949
+ with open(meta_file_name, "w") as f:
950
+ f.write(f"{date_list[0]} - {date_list[-1]}")
951
+ curl_cmd += f" --data-binary @{update_arg_file_name}"
952
+ if args.verbose == "yes":
953
+ log.info(colored(curl_cmd, "blue"))
954
+
955
+ # Run the update request via `curl`,
956
+ # with retry logic.
957
+ try:
958
+ result = retry_with_backoff(
959
+ lambda: run_command(curl_cmd, return_output=True),
960
+ "UPDATE request",
961
+ args.num_retries,
962
+ log,
963
+ )
964
+ result_file_name = f"update.{first_offset_in_batch}.{current_batch_size}.result"
965
+ with open(result_file_name, "w") as f:
966
+ f.write(result)
967
+ except Exception as e:
968
+ log.error(
969
+ f"Failed to execute UPDATE request after "
970
+ f"{args.num_retries} retry attempts, last error: "
971
+ f"{e}"
972
+ )
973
+ return False
974
+
975
+ # Results should be a JSON, parse it.
976
+ try:
977
+ result = json.loads(result)
978
+ except Exception as e:
979
+ log.error(
980
+ f"Error parsing JSON result: {e}. "
981
+ f"The first 1000 characters are: {result[:1000]}"
982
+ )
983
+ return False
984
+
985
+ # Check if the result contains a QLever exception.
986
+ if "exception" in result:
987
+ error_msg = result["exception"]
988
+ log.error(f"QLever exception: {error_msg}")
989
+ log.info("")
990
+ continue
991
+
992
+ # Helper function for getting the value of `stats["time"][...]`
993
+ # without the "ms" suffix. If the extraction fails, return 0
994
+ # (and optionally log the failure).
996
+ class FailureMode(Enum):
997
+ LOG_ERROR = auto()
998
+ SILENTLY_RETURN_ZERO = auto()
999
+ THROW_EXCEPTION = auto()
1000
+
1001
+ def get_time_ms(
1002
+ stats, *keys: str, failure_mode=FailureMode.LOG_ERROR
1003
+ ) -> int:
1004
+ try:
1005
+ value = stats["time"]
1006
+ for key in keys:
1007
+ value = value[key]
1008
+ value = int(value)
1009
+ except Exception:
1010
+ if failure_mode == FailureMode.THROW_EXCEPTION:
1011
+ raise
1012
+ elif failure_mode == FailureMode.LOG_ERROR:
1013
+ log.error(
1014
+ f"Error extracting time from JSON statistics, "
1015
+ f"keys: {keys}"
1016
+ )
1017
+ value = 0
1018
+ return value
1019
+
1020
+ # Check for old JSON format (no `operations` or `time` on top level).
1021
+ old_json_message_template = (
1022
+ "Result JSON does not contain `{}` field, you are "
1023
+ "probably using an old version of QLever"
1024
+ )
1025
+ for field in ["operations", "time"]:
1026
+ if field not in result:
1027
+ raise RuntimeError(old_json_message_template.format(field))
1028
+
1029
+ # Get the per-operation statistics.
1030
+ for i, stats in enumerate(result["operations"]):
1031
+ try:
1032
+ ins_after = stats["delta-triples"]["after"]["inserted"]
1033
+ del_after = stats["delta-triples"]["after"]["deleted"]
1034
+ ops_after = stats["delta-triples"]["after"]["total"]
1035
+ num_ins = int(
1036
+ stats["delta-triples"]["operation"]["inserted"]
1037
+ )
1038
+ num_del = int(
1039
+ stats["delta-triples"]["operation"]["deleted"]
1040
+ )
1041
+ num_ops = int(stats["delta-triples"]["operation"]["total"])
1042
+ time_op_total = get_time_ms(stats, "total")
1043
+ time_us_per_op = (
1044
+ int(1000 * time_op_total / num_ops)
1045
+ if num_ops > 0
1046
+ else 0
1047
+ )
1048
+ if args.verbose == "yes":
1049
+ log.info(
1050
+ colored(
1051
+ f"TRIPLES: {num_ops:+10,} -> {ops_after:10,}, "
1052
+ f"INS: {num_ins:+10,} -> {ins_after:10,}, "
1053
+ f"DEL: {num_del:+10,} -> {del_after:10,}, "
1054
+ f"TIME: {time_op_total:7,}ms, "
1055
+ f"TIME/TRIPLE: {time_us_per_op:6,}µs",
1056
+ attrs=["bold"],
1057
+ )
1058
+ )
1059
+
1060
+ time_planning = get_time_ms(stats, "planning")
1061
+ time_compute_ids = get_time_ms(
1062
+ stats,
1063
+ "execution",
1064
+ "computeIds",
1065
+ "total",
1066
+ )
1067
+ time_where = get_time_ms(
1068
+ stats,
1069
+ "execution",
1070
+ "evaluateWhere",
1071
+ )
1072
+ time_metadata = get_time_ms(
1073
+ stats,
1074
+ "updateMetadata",
1075
+ )
1076
+ time_insert = get_time_ms(
1077
+ stats,
1078
+ "execution",
1079
+ "insertTriples",
1080
+ "total",
1081
+ failure_mode=FailureMode.SILENTLY_RETURN_ZERO,
1082
+ )
1083
+ time_delete = get_time_ms(
1084
+ stats,
1085
+ "execution",
1086
+ "deleteTriples",
1087
+ "total",
1088
+ failure_mode=FailureMode.SILENTLY_RETURN_ZERO,
1089
+ )
1090
+ time_unaccounted = time_op_total - (
1091
+ time_planning
1092
+ + time_compute_ids
1093
+ + time_where
1094
+ + time_metadata
1095
+ + time_delete
1096
+ + time_insert
1097
+ )
1098
+ if args.verbose == "yes":
1099
+ log.info(
1100
+ f"METADATA: {100 * time_metadata / time_op_total:2.0f}%, "
1101
+ f"PLANNING: {100 * time_planning / time_op_total:2.0f}%, "
1102
+ f"WHERE: {100 * time_where / time_op_total:2.0f}%, "
1103
+ f"IDS: {100 * time_compute_ids / time_op_total:2.0f}%, "
1104
+ f"DELETE: {100 * time_delete / time_op_total:2.0f}%, "
1105
+ f"INSERT: {100 * time_insert / time_op_total:2.0f}%, "
1106
+ f"UNACCOUNTED: {100 * time_unaccounted / time_op_total:2.0f}%",
1107
+ )
1108
+
1109
+ except Exception as e:
1110
+ log.warn(
1111
+ f"Error extracting statistics: {e}, "
1112
+ f"curl command was: {curl_cmd}"
1113
+ )
1114
+ # Show traceback for debugging.
1115
+ import traceback
1116
+
1117
+ traceback.print_exc()
1118
+ log.info("")
1119
+ continue
1120
+
1121
+ # Get times for the whole request (not per operation).
1122
+ time_parsing = get_time_ms(
1123
+ result,
1124
+ "parsing",
1125
+ )
1126
+ time_metadata = get_time_ms(
1127
+ result,
1128
+ "metadataUpdateForSnapshot",
1129
+ )
1130
+ time_snapshot = get_time_ms(
1131
+ result,
1132
+ "snapshotCreation",
1133
+ )
1134
+ time_writeback = get_time_ms(
1135
+ result,
1136
+ "diskWriteback",
1137
+ )
1138
+ time_operations = get_time_ms(
1139
+ result,
1140
+ "operations",
1141
+ )
1142
+ time_total = get_time_ms(
1143
+ result,
1144
+ "total",
1145
+ )
1146
+ time_unaccounted = time_total - (
1147
+ time_parsing
1148
+ + time_metadata
1149
+ + time_snapshot
1150
+ + time_writeback
1151
+ + time_operations
1152
+ )
1153
+
1154
+ # Update the totals.
1155
+ total_update_time += time_total / 1000.0
1156
+ total_elapsed_time = time.perf_counter() - start_time
1157
+
1158
+ # Show statistics for the completed batch.
1159
+ if args.verbose == "yes":
1160
+ log.info(
1161
+ colored(
1162
+ f"TOTAL UPDATE TIME SO FAR: {total_update_time:4.0f}s, "
1163
+ f"TOTAL ELAPSED TIME SO FAR: {total_elapsed_time:4.0f}s, "
1164
+ f"TOTAL TIME FOR THIS UPDATE REQUEST: {time_total:7,}ms, ",
1165
+ attrs=["bold"],
1166
+ )
1167
+ )
1168
+ log.info(
1169
+ f"PARSING: {100 * time_parsing / time_total:2.0f}%, "
1170
+ f"OPERATIONS: {100 * time_operations / time_total:2.0f}%, "
1171
+ f"METADATA: {100 * time_metadata / time_total:2.0f}%, "
1172
+ f"SNAPSHOT: {100 * time_snapshot / time_total:2.0f}%, "
1173
+ f"WRITEBACK: {100 * time_writeback / time_total:2.0f}%, "
1174
+ f"UNACCOUNTED: {100 * time_unaccounted / time_total:2.0f}%",
1175
+ )
1176
+ log.info("")
1177
+
1178
+ # Close the source connection (for each batch, we open a new one,
1179
+ # either from `event_id_for_next_batch` or from `since`).
1180
+ source.close()
1181
+
1182
+ # After the first batch is processed, enable offset checking for
1183
+ # subsequent batches.
1184
+ first_batch = False
1185
+
1186
+ # If Ctrl+C was pressed, we reached `--until`, or we processed
1187
+ # exactly `--num-messages`, finish.
1188
+ if (
1189
+ self.ctrl_c_pressed
1190
+ or self.finished
1191
+ or (
1192
+ args.num_messages is not None
1193
+ and total_num_messages >= args.num_messages
1194
+ )
1195
+ ):
1196
+ break
1197
+
1198
+ # Final message after all batches have been processed.
1199
+ log.info(
1200
+ f"Processed {batch_count} "
1201
+ f"{'batches' if batch_count > 1 else 'batch'} "
1202
+ f"terminating update command"
1203
+ )
1204
+ return True