qlever 0.2.5__py3-none-any.whl → 0.5.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. qlever/Qleverfiles/Qleverfile.dblp +36 -0
  2. qlever/Qleverfiles/Qleverfile.dblp-plus +33 -0
  3. qlever/Qleverfiles/Qleverfile.dbpedia +30 -0
  4. qlever/Qleverfiles/Qleverfile.default +51 -0
  5. qlever/Qleverfiles/Qleverfile.dnb +40 -0
  6. qlever/Qleverfiles/Qleverfile.fbeasy +29 -0
  7. qlever/Qleverfiles/Qleverfile.freebase +28 -0
  8. qlever/Qleverfiles/Qleverfile.imdb +36 -0
  9. qlever/Qleverfiles/Qleverfile.ohm-planet +41 -0
  10. qlever/Qleverfiles/Qleverfile.olympics +31 -0
  11. qlever/Qleverfiles/Qleverfile.orkg +30 -0
  12. qlever/Qleverfiles/Qleverfile.osm-country +39 -0
  13. qlever/Qleverfiles/Qleverfile.osm-planet +39 -0
  14. qlever/Qleverfiles/Qleverfile.osm-planet-from-pbf +42 -0
  15. qlever/Qleverfiles/Qleverfile.pubchem +131 -0
  16. qlever/Qleverfiles/Qleverfile.scientists +29 -0
  17. qlever/Qleverfiles/Qleverfile.uniprot +74 -0
  18. qlever/Qleverfiles/Qleverfile.vvz +31 -0
  19. qlever/Qleverfiles/Qleverfile.wikidata +42 -0
  20. qlever/Qleverfiles/Qleverfile.wikipathways +40 -0
  21. qlever/Qleverfiles/Qleverfile.yago-4 +33 -0
  22. qlever/__init__.py +44 -1380
  23. qlever/command.py +87 -0
  24. qlever/commands/__init__.py +0 -0
  25. qlever/commands/add_text_index.py +115 -0
  26. qlever/commands/benchmark_queries.py +1019 -0
  27. qlever/commands/cache_stats.py +125 -0
  28. qlever/commands/clear_cache.py +88 -0
  29. qlever/commands/extract_queries.py +120 -0
  30. qlever/commands/get_data.py +48 -0
  31. qlever/commands/index.py +333 -0
  32. qlever/commands/index_stats.py +306 -0
  33. qlever/commands/log.py +66 -0
  34. qlever/commands/materialized_view.py +110 -0
  35. qlever/commands/query.py +142 -0
  36. qlever/commands/rebuild_index.py +176 -0
  37. qlever/commands/reset_updates.py +59 -0
  38. qlever/commands/settings.py +115 -0
  39. qlever/commands/setup_config.py +97 -0
  40. qlever/commands/start.py +336 -0
  41. qlever/commands/status.py +50 -0
  42. qlever/commands/stop.py +90 -0
  43. qlever/commands/system_info.py +130 -0
  44. qlever/commands/ui.py +271 -0
  45. qlever/commands/update.py +90 -0
  46. qlever/commands/update_wikidata.py +1204 -0
  47. qlever/commands/warmup.py +41 -0
  48. qlever/config.py +223 -0
  49. qlever/containerize.py +167 -0
  50. qlever/log.py +55 -0
  51. qlever/qlever_main.py +79 -0
  52. qlever/qleverfile.py +530 -0
  53. qlever/util.py +330 -0
  54. qlever-0.5.41.dist-info/METADATA +127 -0
  55. qlever-0.5.41.dist-info/RECORD +59 -0
  56. {qlever-0.2.5.dist-info → qlever-0.5.41.dist-info}/WHEEL +1 -1
  57. qlever-0.5.41.dist-info/entry_points.txt +2 -0
  58. qlever-0.5.41.dist-info/top_level.txt +1 -0
  59. build/lib/qlever/__init__.py +0 -1383
  60. build/lib/qlever/__main__.py +0 -4
  61. qlever/__main__.py +0 -4
  62. qlever-0.2.5.dist-info/METADATA +0 -277
  63. qlever-0.2.5.dist-info/RECORD +0 -12
  64. qlever-0.2.5.dist-info/entry_points.txt +0 -2
  65. qlever-0.2.5.dist-info/top_level.txt +0 -4
  66. src/qlever/__init__.py +0 -1383
  67. src/qlever/__main__.py +0 -4
  68. {qlever-0.2.5.dist-info → qlever-0.5.41.dist-info/licenses}/LICENSE +0 -0
qlever/commands/benchmark_queries.py (new file, entry 26 above)
@@ -0,0 +1,1019 @@
+ from __future__ import annotations
+
+ import csv
+ import json
+ import re
+ import shlex
+ import subprocess
+ import time
+ import traceback
+ from io import StringIO
+ from pathlib import Path
+ from typing import Any
+
+ import rdflib
+ import yaml
+ from termcolor import colored
+
+ from qlever.command import QleverCommand
+ from qlever.commands.clear_cache import ClearCacheCommand
+ from qlever.commands.ui import dict_to_yaml
+ from qlever.log import log, mute_log
+ from qlever.util import run_command, run_curl_command
+
+
+ class BenchmarkQueriesCommand(QleverCommand):
+     """
+     Class for running a given sequence of benchmark or example queries and
+     showing their processing times and result sizes.
+     """
+
+     def __init__(self):
+         pass
+
+     def description(self) -> str:
+         return (
+             "Run the given benchmark or example queries and show their "
+             "processing times and result sizes"
+         )
+
+     def should_have_qleverfile(self) -> bool:
+         return False
+
+     def relevant_qleverfile_arguments(self) -> dict[str, list[str]]:
+         return {"server": ["host_name", "port"], "ui": ["ui_config"]}
+
+     def additional_arguments(self, subparser) -> None:
+         subparser.add_argument(
+             "--sparql-endpoint", type=str, help="URL of the SPARQL endpoint"
+         )
+         subparser.add_argument(
+             "--sparql-endpoint-preset",
+             choices=[
+                 "https://qlever.dev/api/wikidata",
+                 "https://qlever.dev/api/uniprot",
+                 "https://qlever.dev/api/pubchem",
+                 "https://qlever.dev/api/osm-planet",
+                 "https://wikidata.demo.openlinksw.com/sparql",
+                 "https://sparql.uniprot.org/sparql",
+             ],
+             help="SPARQL endpoint from fixed list (to save typing)",
+         )
+         subparser.add_argument(
+             "--queries-tsv",
+             type=str,
+             default=None,
+             help=(
+                 "Path to a TSV file containing benchmark queries "
+                 "(query_description, full_sparql_query)"
+             ),
+         )
+         subparser.add_argument(
+             "--queries-yml",
+             type=str,
+             default=None,
+             help=(
+                 "Path to a YAML file containing benchmark queries. "
+                 "The YAML file should have a top-level "
+                 "key called 'queries', which is a list of dictionaries. "
+                 "Each dictionary should contain 'query' for the query "
+                 "description and 'sparql' for the full SPARQL query."
+             ),
+         )
+         subparser.add_argument(
+             "--query-ids",
+             type=str,
+             default="1-$",
+             help="Query IDs as comma-separated list of "
+             "ranges (e.g., 1-5,7,12-$)",
+         )
+         subparser.add_argument(
+             "--query-regex",
+             type=str,
+             help="Only consider example queries matching "
+             "this regex (using grep -Pi)",
+         )
+         subparser.add_argument(
+             "--example-queries",
+             action="store_true",
+             default=False,
+             help=(
+                 "Run the example queries for the given --ui-config "
+                 "instead of the benchmark queries from a TSV or YML file"
+             ),
+         )
+         subparser.add_argument(
+             "--download-or-count",
+             choices=["download", "count"],
+             default="download",
+             help="Whether to download the full result "
+             "or just compute the size of the result",
+         )
+         subparser.add_argument(
+             "--limit", type=int, help="Limit on the number of results"
+         )
+         subparser.add_argument(
+             "--remove-offset-and-limit",
+             action="store_true",
+             default=False,
+             help="Remove OFFSET and LIMIT from the query",
+         )
+         subparser.add_argument(
+             "--accept",
+             type=str,
+             choices=[
+                 "text/tab-separated-values",
+                 "text/csv",
+                 "application/sparql-results+json",
+                 "application/qlever-results+json",
+                 "application/octet-stream",
+                 "text/turtle",
+                 "AUTO",
+             ],
+             default="application/sparql-results+json",
+             help="Accept header for the SPARQL query; AUTO means "
+             "`text/turtle` for CONSTRUCT and DESCRIBE queries, "
+             "`application/sparql-results+json` for all others",
+         )
+         subparser.add_argument(
+             "--clear-cache",
+             choices=["yes", "no"],
+             default="no",
+             help="Clear the cache before each query (only works for QLever)",
+         )
+         subparser.add_argument(
+             "--width-query-description",
+             type=int,
+             default=70,
+             help="Width for printing the query description",
+         )
+         subparser.add_argument(
+             "--width-error-message",
+             type=int,
+             default=50,
+             help="Width for printing the error message (0 = no limit)",
+         )
+         subparser.add_argument(
+             "--width-result-size",
+             type=int,
+             default=14,
+             help="Width for printing the result size",
+         )
+         subparser.add_argument(
+             "--add-query-type-to-description",
+             action="store_true",
+             default=False,
+             help="Add the query type (SELECT, ASK, CONSTRUCT, DESCRIBE, "
+             "UNKNOWN) to the description",
+         )
+         subparser.add_argument(
+             "--show-query",
+             choices=["always", "never", "on-error"],
+             default="never",
+             help="Show the queries that will be executed (always, never, on error)",
+         )
+         subparser.add_argument(
+             "--show-prefixes",
+             action="store_true",
+             default=False,
+             help="When showing the query, also show the prefixes",
+         )
+         subparser.add_argument(
+             "--results-dir",
+             type=str,
+             default=".",
+             help=(
+                 "The directory where the YML result file will be saved "
+                 "for the evaluation web app (Default = current working directory)"
+             ),
+         )
+         subparser.add_argument(
+             "--result-file",
+             type=str,
+             default=None,
+             help=(
+                 "Base name used for the result YML file; should be of the "
+                 "form `<dataset>.<engine>`, e.g., `wikidata.qlever`"
+             ),
+         )
+         subparser.add_argument(
+             "--max-results-output-file",
+             type=int,
+             default=5,
+             help=(
+                 "Maximum number of results per query in the output result "
+                 "YML file (Default = 5)"
+             ),
+         )
+
+     def pretty_printed_query(self, query: str, show_prefixes: bool) -> str:
+         remove_prefixes_cmd = (
+             " | sed '/^PREFIX /Id'" if not show_prefixes else ""
+         )
+         pretty_print_query_cmd = (
+             f"echo {shlex.quote(query)}"
+             f" | docker run -i --rm sparqling/sparql-formatter"
+             f"{remove_prefixes_cmd} | grep -v '^$'"
+         )
+         try:
+             query_pretty_printed = run_command(
+                 pretty_print_query_cmd, return_output=True
+             )
+             return query_pretty_printed.rstrip()
+         except Exception as e:
+             log.error(
+                 f"Failed to pretty-print query, returning original query: {e}"
+             )
+             return query.rstrip()
+
+     def sparql_query_type(self, query: str) -> str:
+         match = re.search(
+             r"(SELECT|ASK|CONSTRUCT|DESCRIBE)\s", query, re.IGNORECASE
+         )
+         if match:
+             return match.group(1).upper()
+         else:
+             return "UNKNOWN"
+
+     @staticmethod
+     def filter_queries(
+         queries: list[tuple[str, str]], query_ids: str, query_regex: str
+     ) -> list[tuple[str, str]]:
+         """
+         Given a list of queries (tuples of query description and full SPARQL
+         query), keep only those whose index is selected by query_ids and
+         which match query_regex (if given)
+         """
+         # Get the list of query indices to keep
+         total_queries = len(queries)
+         query_indices = []
+         for part in query_ids.split(","):
+             if "-" in part:
+                 start, end = part.split("-")
+                 if end == "$":
+                     end = total_queries
+                 query_indices.extend(range(int(start) - 1, int(end)))
+             else:
+                 idx = int(part) if part != "$" else total_queries
+                 query_indices.append(idx - 1)
+
+         try:
+             filtered_queries = []
+             pattern = (
+                 re.compile(query_regex, re.IGNORECASE) if query_regex else None
+             )
+             for query_idx in query_indices:
+                 if query_idx >= total_queries:
+                     continue
+
+                 query_desc, sparql = queries[query_idx]
+
+                 # Only include queries that match the query_regex if present
+                 if pattern and not (
+                     pattern.search(query_desc) or pattern.search(sparql)
+                 ):
+                     continue
+
+                 filtered_queries.append((query_desc, sparql))
+             return filtered_queries
+         except Exception as exc:
+             log.error(f"Error filtering queries: {exc}")
+             return []
+
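For illustration, here is how the `--query-ids` range syntax handled by `filter_queries` behaves; a minimal sketch, assuming the package is installed so the module can be imported (the query strings themselves are hypothetical):

from qlever.commands.benchmark_queries import BenchmarkQueriesCommand

queries = [
    (f"query {i}", f"SELECT ?s WHERE {{ ?s ?p ?o }} # {i}") for i in range(1, 6)
]
# "1-2,4-$" keeps queries 1 and 2, then everything from 4 to the end ($).
kept = BenchmarkQueriesCommand.filter_queries(queries, "1-2,4-$", None)
print([desc for desc, _ in kept])  # ['query 1', 'query 2', 'query 4', 'query 5']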
+     @staticmethod
+     def parse_queries_tsv(queries_cmd: str) -> list[tuple[str, str]]:
+         """
+         Execute the given bash command to fetch TSV queries and return a
+         list of queries, i.e., tuples (query_description, full_sparql_query)
+         """
+         try:
+             tsv_queries_str = run_command(queries_cmd, return_output=True)
+             if len(tsv_queries_str) == 0:
+                 log.error("No queries found in the TSV queries file")
+                 return []
+             return [
+                 tuple(line.split("\t"))
+                 for line in tsv_queries_str.strip().splitlines()
+             ]
+         except Exception as exc:
+             log.error(f"Failed to read the TSV queries file: {exc}")
+             return []
+
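The TSV format expected by `--queries-tsv` is one query per line: a description, a tab, and the full SPARQL query on a single line. A minimal sketch of the parsing step above (the query is a hypothetical example):

line = "number of triples\tSELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }"
desc, sparql = tuple(line.split("\t"))  # same split as in parse_queries_tsv
print(desc)    # number of triples
print(sparql)  # SELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }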
+     @staticmethod
+     def parse_queries_yml(queries_file: str) -> list[tuple[str, str]]:
+         """
+         Parse a YML file, validate its structure, and return a list of
+         queries, i.e., tuples (query_description, full_sparql_query)
+         """
+         with open(queries_file, "r", encoding="utf-8") as q_file:
+             try:
+                 data = yaml.safe_load(q_file)  # Load YAML safely
+             except yaml.YAMLError as exc:
+                 log.error(f"Error parsing {queries_file} file: {exc}")
+                 return []
+
+         # Validate the structure
+         if not isinstance(data, dict) or "queries" not in data:
+             log.error(
+                 "Error: YAML file must contain a top-level 'queries' key"
+             )
+             return []
+
+         if not isinstance(data["queries"], list):
+             log.error("Error: 'queries' key in YML file must hold a list.")
+             return []
+
+         for item in data["queries"]:
+             if (
+                 not isinstance(item, dict)
+                 or "query" not in item
+                 or "sparql" not in item
+             ):
+                 log.error(
+                     "Error: Each item in 'queries' must contain "
+                     "'query' and 'sparql' keys."
+                 )
+                 return []
+
+         return [(query["query"], query["sparql"]) for query in data["queries"]]
+
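A minimal queries file that passes the validation above (hypothetical content): the top-level `queries` key holds a list of dictionaries, each with a `query` and a `sparql` key.

import yaml

queries_yml = """
queries:
  - query: number of triples
    sparql: SELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }
  - query: ten triples
    sparql: SELECT * WHERE { ?s ?p ?o } LIMIT 10
"""
data = yaml.safe_load(queries_yml)
assert isinstance(data, dict) and isinstance(data["queries"], list)
print([(item["query"], item["sparql"]) for item in data["queries"]])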
+     def get_result_size(
+         self,
+         count_only: bool,
+         query_type: str,
+         accept_header: str,
+         result_file: str,
+     ) -> tuple[int, dict[str, str] | None]:
+         """
+         Get the result size and an error_msg dict (if the query failed) for
+         the different accept headers
+         """
+
+         def get_json_error_msg(e: Exception) -> dict[str, str]:
+             error_msg = {
+                 "short": "Malformed JSON",
+                 "long": "curl returned with code 200, "
+                 "but the JSON is malformed: " + re.sub(r"\s+", " ", str(e)),
+             }
+             return error_msg
+
+         result_size = 0
+         error_msg = None
+         # CASE 0: The result is empty despite a 200 HTTP code (not a
+         # problem for CONSTRUCT and DESCRIBE queries).
+         if Path(result_file).stat().st_size == 0 and (
+             not query_type == "CONSTRUCT" and not query_type == "DESCRIBE"
+         ):
+             result_size = 0
+             error_msg = {
+                 "short": "Empty result",
+                 "long": "curl returned with code 200, but the result is empty",
+             }
+
+         # CASE 1: Just counting the size of the result (TSV or JSON).
+         elif count_only:
+             if accept_header in ("text/tab-separated-values", "text/csv"):
+                 result_size = run_command(
+                     f"sed 1d {result_file}", return_output=True
+                 )
+             elif accept_header == "application/qlever-results+json":
+                 try:
+                     # jq + sed: extract the leading number from the first
+                     # row of `.res`
+                     result_size = run_command(
+                         f"jq '.res[0]' {result_file}"
+                         " | sed 's/[^0-9]*\\([0-9]*\\).*/\\1/'",
+                         return_output=True,
+                     )
+                 except Exception as e:
+                     error_msg = get_json_error_msg(e)
+             else:
+                 try:
+                     result_size = run_command(
+                         f'jq -r ".results.bindings[0]'
+                         f" | to_entries[0].value.value"
+                         f' | tonumber" {result_file}',
+                         return_output=True,
+                     )
+                 except Exception as e:
+                     error_msg = get_json_error_msg(e)
+
+         # CASE 2: Downloading the full result (TSV, CSV, Turtle, JSON).
+         else:
+             if accept_header in ("text/tab-separated-values", "text/csv"):
+                 result_size = run_command(
+                     f"sed 1d {result_file} | wc -l", return_output=True
+                 )
+             elif accept_header == "text/turtle":
+                 result_size = run_command(
+                     f"sed '1d;/^@prefix/d;/^\\s*$/d' {result_file} | wc -l",
+                     return_output=True,
+                 )
+             elif accept_header == "application/qlever-results+json":
+                 try:
+                     result_size = run_command(
+                         f'jq -r ".resultsize" {result_file}',
+                         return_output=True,
+                     )
+                 except Exception as e:
+                     error_msg = get_json_error_msg(e)
+             else:
+                 try:
+                     result_size = int(
+                         run_command(
+                             f'jq -r ".results.bindings | length"'
+                             f" {result_file}",
+                             return_output=True,
+                         ).rstrip()
+                     )
+                 except Exception as e:
+                     error_msg = get_json_error_msg(e)
+         return int(result_size), error_msg
+
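In the count-only case with `application/sparql-results+json`, the rewritten COUNT query returns a single binding whose value holds the count. A sketch of what the jq filter `.results.bindings[0] | to_entries[0].value.value | tonumber` extracts, mirrored in plain Python (the sample JSON is a hypothetical endpoint response in standard SPARQL JSON results format):

import json

sample = json.loads("""
{
  "head": {"vars": ["qlever_count_"]},
  "results": {"bindings": [
    {"qlever_count_": {"type": "literal", "value": "12345"}}
  ]}
}
""")
first_binding = sample["results"]["bindings"][0]
count = int(next(iter(first_binding.values()))["value"])
print(count)  # 12345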
+     @staticmethod
+     def get_single_int_result(result_file: str) -> int | None:
+         """
+         When downloading the full result of a query with accept header
+         application/sparql-results+json and result_size == 1, get the single
+         integer result value (if any).
+         """
+         single_int_result = None
+         try:
+             single_int_result = int(
+                 run_command(
+                     f'jq -e -r ".results.bindings[0][] | .value"'
+                     f" {result_file}",
+                     return_output=True,
+                 ).rstrip()
+             )
+         except Exception:
+             pass
+         return single_int_result
+
+     def execute(self, args) -> bool:
+         # We can't have both `--remove-offset-and-limit` and `--limit`.
+         if args.remove_offset_and_limit and args.limit:
+             log.error("Cannot have both --remove-offset-and-limit and --limit")
+             return False
+
+         # Extract dataset and sparql_engine name from result file
+         dataset, engine = None, None
+         if args.result_file is not None:
+             result_file_parts = args.result_file.split(".")
+             if len(result_file_parts) != 2:
+                 log.error(
+                     "The argument of --result-file should be of the form "
+                     "`<dataset>.<engine>`, e.g., `wikidata.qlever`"
+                 )
+                 return False
+             results_dir_path = Path(args.results_dir)
+             if results_dir_path.exists():
+                 if not results_dir_path.is_dir():
+                     log.error(
+                         f"{results_dir_path} exists but is not a directory"
+                     )
+                     return False
+             else:
+                 log.info(
+                     f"Creating results directory: {results_dir_path.absolute()}"
+                 )
+                 results_dir_path.mkdir(parents=True, exist_ok=True)
+             dataset, engine = result_file_parts
+
+         # If `args.accept` is `application/sparql-results+json` or
+         # `application/qlever-results+json` or `AUTO`, we need `jq`.
+         if args.accept in (
+             "application/sparql-results+json",
+             "application/qlever-results+json",
+             "AUTO",
+         ):
+             try:
+                 subprocess.run(
+                     "jq --version",
+                     shell=True,
+                     check=True,
+                     stdout=subprocess.DEVNULL,
+                     stderr=subprocess.DEVNULL,
+                 )
+             except Exception as e:
+                 log.error(f"Please install `jq` for {args.accept} ({e})")
+                 return False
+
+         if not any((args.queries_tsv, args.queries_yml, args.example_queries)):
+             log.error(
+                 "No benchmark or example queries to read! Either pass benchmark "
+                 "queries using --queries-tsv or --queries-yml, or pass the "
+                 "argument --example-queries to run example queries for the "
+                 f"given ui_config {args.ui_config}"
+             )
+             return False
+
+         if all((args.queries_tsv, args.queries_yml)):
+             log.error("Cannot have both --queries-tsv and --queries-yml")
+             return False
+
+         if any((args.queries_tsv, args.queries_yml)) and args.example_queries:
+             queries_file_arg = "tsv" if args.queries_tsv else "yml"
+             log.error(
+                 f"Cannot have both --queries-{queries_file_arg} and "
+                 "--example-queries"
+             )
+             return False
+
+         # Handle shortcuts for SPARQL endpoint.
+         if args.sparql_endpoint_preset:
+             args.sparql_endpoint = args.sparql_endpoint_preset
+
+         # Limit only works with full result.
+         if args.limit and args.download_or_count == "count":
+             log.error("Limit only works with full result")
+             return False
+
+         # Clear cache only works for QLever.
+         is_qlever = (
+             not args.sparql_endpoint
+             or args.sparql_endpoint.startswith("https://qlever")
+         )
+         if engine is not None:
+             is_qlever = is_qlever or "qlever" in engine.lower()
+         if args.clear_cache == "yes":
+             if is_qlever:
+                 log.warning(
+                     "Clearing the cache before each query"
+                     " (only works for QLever)"
+                 )
+             else:
+                 log.warning(
+                     "Clearing the cache only works for QLever"
+                     ", option `--clear-cache` is ignored"
+                 )
+                 args.clear_cache = "no"
+
+         # Show what the command will do.
+         example_queries_cmd = (
+             f"curl -sv https://qlever.dev/api/examples/{args.ui_config}"
+         )
+         sparql_endpoint = (
+             args.sparql_endpoint
+             if args.sparql_endpoint
+             else f"{args.host_name}:{args.port}"
+         )
+
+         self.show(
+             f"Obtain queries via: {args.queries_yml or args.queries_tsv or example_queries_cmd}\n"
+             f"SPARQL endpoint: {sparql_endpoint}\n"
+             f"Accept header: {args.accept}\n"
+             f"Download result for each query or just count:"
+             f" {args.download_or_count.upper()}"
+             + (f" with LIMIT {args.limit}" if args.limit else ""),
+             only_show=args.show,
+         )
+         if args.show:
+             return True
+
+         if args.queries_yml:
+             queries = self.parse_queries_yml(args.queries_yml)
+         elif args.queries_tsv:
+             queries = self.parse_queries_tsv(f"cat {args.queries_tsv}")
+         else:
+             queries = self.parse_queries_tsv(example_queries_cmd)
+
+         filtered_queries = self.filter_queries(
+             queries, args.query_ids, args.query_regex
+         )
+
+         if len(filtered_queries) == 0 or not filtered_queries[0]:
+             log.error("No queries to process!")
+             return False
+
+         # We want the width of the query description to be an odd number (in
+         # case we have to truncate it, in which case we want to have a " ... "
+         # in the middle).
+         width_query_description_half = args.width_query_description // 2
+         width_query_description = 2 * width_query_description_half + 1
+
+         # Launch the queries one after the other and for each print: the
+         # description, the result size (number of rows), and the query
+         # processing time (seconds).
+         query_times = []
+         result_sizes = []
+         result_yml_query_records = {"queries": []}
+         num_failed = 0
+         for description, query in filtered_queries:
+             if len(query) == 0:
+                 log.error("Could not parse description and query, line is:")
+                 log.info("")
+                 log.info(f"{description}\t{query}")
+                 return False
+             query_type = self.sparql_query_type(query)
+             if args.add_query_type_to_description or args.accept == "AUTO":
+                 description = f"{description} [{query_type}]"
+
+             # Clear the cache.
+             if args.clear_cache == "yes":
+                 args.server_url = sparql_endpoint
+                 args.complete = False
+                 clear_cache_successful = False
+                 with mute_log():
+                     clear_cache_successful = ClearCacheCommand().execute(args)
+                 if not clear_cache_successful:
+                     log.warning("Failed to clear the cache")
+
+             # Remove OFFSET and LIMIT (after the last closing bracket).
+             if args.remove_offset_and_limit or args.limit:
+                 closing_bracket_idx = query.rfind("}")
+                 regexes = [
+                     re.compile(r"OFFSET\s+\d+\s*", re.IGNORECASE),
+                     re.compile(r"LIMIT\s+\d+\s*", re.IGNORECASE),
+                 ]
+                 for regex in regexes:
+                     match = re.search(regex, query[closing_bracket_idx:])
+                     if match:
+                         query = (
+                             query[: closing_bracket_idx + match.start()]
+                             + query[closing_bracket_idx + match.end() :]
+                         )
+
+             # Limit query.
+             if args.limit:
+                 query += f" LIMIT {args.limit}"
+
+             # Count query (see the worked example after this method).
+             if args.download_or_count == "count":
+                 # First find out if there is a FROM clause.
+                 regex_from_clause = re.compile(
+                     r"\s*FROM\s+<[^>]+>\s*", re.IGNORECASE
+                 )
+                 match_from_clause = re.search(regex_from_clause, query)
+                 from_clause = " "
+                 if match_from_clause:
+                     from_clause = match_from_clause.group(0)
+                     query = (
+                         query[: match_from_clause.start()]
+                         + " "
+                         + query[match_from_clause.end() :]
+                     )
+                 # Now we can add the outer SELECT COUNT(*).
+                 query = (
+                     re.sub(
+                         r"SELECT ",
+                         "SELECT (COUNT(*) AS ?qlever_count_)"
+                         + from_clause
+                         + "WHERE { SELECT ",
+                         query,
+                         count=1,
+                         flags=re.IGNORECASE,
+                     )
+                     + " }"
+                 )
+
+             # A bit of pretty-printing.
+             query = re.sub(r"\s+", " ", query)
+             query = re.sub(r"\s*\.\s*\}", " }", query)
+             if args.show_query == "always":
+                 log.info("")
+                 log.info(
+                     colored(
+                         self.pretty_printed_query(query, args.show_prefixes),
+                         "cyan",
+                     )
+                 )
+
+             # Accept header. For "AUTO", use `text/turtle` for CONSTRUCT and
+             # DESCRIBE queries and `application/sparql-results+json` for all
+             # others.
+             accept_header = args.accept
+             if accept_header == "AUTO":
+                 if query_type == "CONSTRUCT" or query_type == "DESCRIBE":
+                     accept_header = "text/turtle"
+                 else:
+                     accept_header = "application/sparql-results+json"
+
+             # Launch query.
+             curl_cmd = (
+                 f"curl -Ls {sparql_endpoint}"
+                 f' -w "HTTP code: %{{http_code}}\\n"'
+                 f' -H "Accept: {accept_header}"'
+                 f" --data-urlencode query={shlex.quote(query)}"
+             )
+             log.debug(curl_cmd)
+             result_file = (
+                 f"qlever.example_queries.result.{abs(hash(curl_cmd))}.tmp"
+             )
+             start_time = time.time()
+             try:
+                 http_code = run_curl_command(
+                     sparql_endpoint,
+                     headers={"Accept": accept_header},
+                     params={"query": query},
+                     result_file=result_file,
+                 ).strip()
+                 if http_code == "200":
+                     time_seconds = time.time() - start_time
+                     error_msg = None
+                 else:
+                     time_seconds = time.time() - start_time
+                     error_msg = {
+                         "short": f"HTTP code: {http_code}",
+                         "long": re.sub(
+                             r"\s+", " ", Path(result_file).read_text()
+                         ),
+                     }
+             except Exception as e:
+                 time_seconds = time.time() - start_time
+                 if args.log_level == "DEBUG":
+                     traceback.print_exc()
+                 error_msg = {
+                     "short": "Exception",
+                     "long": re.sub(r"\s+", " ", str(e)),
+                 }
+
+             # Get result size (via the command line, in order to avoid loading
+             # a potentially large JSON file into Python, which is slow).
+             if error_msg is None:
+                 result_size, error_msg = self.get_result_size(
+                     args.download_or_count == "count",
+                     query_type,
+                     accept_header,
+                     result_file,
+                 )
+             single_int_result = None
+             if (
+                 result_size == 1
+                 and accept_header == "application/sparql-results+json"
+                 and args.download_or_count == "download"
+             ):
+                 single_int_result = self.get_single_int_result(result_file)
+
+             # Get the result yaml record if an output file needs to be generated
+             if args.result_file is not None:
+                 result_length = None if error_msg is not None else 1
+                 result_length = (
+                     result_size
+                     if args.download_or_count == "download"
+                     and result_length is not None
+                     else result_length
+                 )
+                 query_results = (
+                     error_msg if error_msg is not None else result_file
+                 )
+                 query_record = self.get_result_yml_query_record(
+                     query=description,
+                     sparql=self.pretty_printed_query(
+                         query, args.show_prefixes
+                     ),
+                     client_time=time_seconds,
+                     result=query_results,
+                     result_size=result_length,
+                     max_result_size=args.max_results_output_file,
+                     accept_header=accept_header,
+                 )
+                 result_yml_query_records["queries"].append(query_record)
+
+             # Print description, time, result in tabular form.
+             if len(description) > width_query_description:
+                 description = (
+                     description[: width_query_description_half - 2]
+                     + " ... "
+                     + description[-width_query_description_half + 2 :]
+                 )
+             if error_msg is None:
+                 result_size = int(result_size)
+                 single_int_result = (
+                     f" [single int result: {single_int_result:,}]"
+                     if single_int_result is not None
+                     else ""
+                 )
+                 log.info(
+                     f"{description:<{width_query_description}} "
+                     f"{time_seconds:6.2f} s "
+                     f"{result_size:>{args.width_result_size},}"
+                     f"{single_int_result}"
+                 )
+                 query_times.append(time_seconds)
+                 result_sizes.append(result_size)
+             else:
+                 num_failed += 1
+                 if (
+                     args.width_error_message > 0
+                     and len(error_msg["long"]) > args.width_error_message
+                     and args.log_level != "DEBUG"
+                     and args.show_query != "on-error"
+                 ):
+                     error_msg["long"] = (
+                         error_msg["long"][: args.width_error_message - 3]
+                         + "..."
+                     )
+                 separator_short_long = (
+                     "\n" if args.show_query == "on-error" else " "
+                 )
+                 log.info(
+                     f"{description:<{width_query_description}} "
+                     f"{colored('FAILED ', 'red')}"
+                     f"{colored(error_msg['short'], 'red'):>{args.width_result_size}}"
+                     f"{separator_short_long}"
+                     f"{colored(error_msg['long'], 'red')}"
+                 )
+                 if args.show_query == "on-error":
+                     log.info(
+                         colored(
+                             self.pretty_printed_query(
+                                 query, args.show_prefixes
+                             ),
+                             "cyan",
+                         )
+                     )
+                     log.info("")
+
+             # Remove the result file (unless in debug mode).
+             if args.log_level != "DEBUG":
+                 Path(result_file).unlink(missing_ok=True)
+
+         # Check that each query has a time and a result size, or it failed.
+         assert len(result_sizes) == len(query_times)
+         assert len(query_times) + num_failed == len(filtered_queries)
+
+         if args.result_file:
+             if len(result_yml_query_records["queries"]) != 0:
+                 outfile_name = f"{dataset}.{engine}.results.yaml"
+                 outfile = Path(args.results_dir) / outfile_name
+                 self.write_query_records_to_result_file(
+                     query_data=result_yml_query_records,
+                     out_file=outfile,
+                 )
+             else:
+                 log.error(
+                     f"Nothing to write to output result YML file: {args.result_file}"
+                 )
+
+         # Show statistics.
+         if len(query_times) > 0:
+             n = len(query_times)
+             total_query_time = sum(query_times)
+             average_query_time = total_query_time / n
+             median_query_time = sorted(query_times)[n // 2]
+             total_result_size = sum(result_sizes)
+             average_result_size = round(total_result_size / n)
+             median_result_size = sorted(result_sizes)[n // 2]
+             query_or_queries = "query" if n == 1 else "queries"
+             description = f"TOTAL for {n} {query_or_queries}"
+             log.info("")
+             log.info(
+                 f"{description:<{width_query_description}} "
+                 f"{total_query_time:6.2f} s "
+                 f"{total_result_size:>14,}"
+             )
+             description = f"AVERAGE for {n} {query_or_queries}"
+             log.info(
+                 f"{description:<{width_query_description}} "
+                 f"{average_query_time:6.2f} s "
+                 f"{average_result_size:>14,}"
+             )
+             description = f"MEDIAN for {n} {query_or_queries}"
+             log.info(
+                 f"{description:<{width_query_description}} "
+                 f"{median_query_time:6.2f} s "
+                 f"{median_result_size:>14,}"
+             )
+
+         # Show number of failed queries.
+         if num_failed > 0:
+             log.info("")
+             description = "Number of FAILED queries"
+             num_failed_string = f"{num_failed:>6}"
+             if num_failed == len(filtered_queries):
+                 num_failed_string += " [all]"
+             log.info(
+                 colored(
+                     f"{description:<{width_query_description}} "
+                     f"{num_failed_string:>24}",
+                     "red",
+                 )
+             )
+
+         # Return success (has nothing to do with how many queries failed).
+         return True
+
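For illustration, the COUNT rewrite flagged above at the `# Count query` step transforms a query as follows; a standalone sketch with a hypothetical input query:

import re

query = "SELECT ?s FROM <http://example.org/g> WHERE { ?s ?p ?o }"
# Pull out the FROM clause, as in execute().
match = re.search(r"\s*FROM\s+<[^>]+>\s*", query, re.IGNORECASE)
from_clause = match.group(0) if match else " "
if match:
    query = query[: match.start()] + " " + query[match.end():]
# Wrap the original query in an outer SELECT (COUNT(*) ...).
query = (
    re.sub(
        r"SELECT ",
        "SELECT (COUNT(*) AS ?qlever_count_)" + from_clause + "WHERE { SELECT ",
        query,
        count=1,
        flags=re.IGNORECASE,
    )
    + " }"
)
print(query)
# SELECT (COUNT(*) AS ?qlever_count_) FROM <http://example.org/g> WHERE { SELECT ?s WHERE { ?s ?p ?o } }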
+     def get_result_yml_query_record(
+         self,
+         query: str,
+         sparql: str,
+         client_time: float,
+         result: str | dict[str, str],
+         result_size: int | None,
+         max_result_size: int,
+         accept_header: str,
+     ) -> dict[str, Any]:
+         """
+         Construct a dictionary with query information for the output result
+         yaml file
+         """
+         record = {
+             "query": query,
+             "sparql": sparql,
+             "runtime_info": {},
+         }
+         if result_size is None:
+             results = f"{result['short']}: {result['long']}"
+             headers = []
+         else:
+             record["result_size"] = result_size
+             result_size = (
+                 max_result_size
+                 if result_size > max_result_size
+                 else result_size
+             )
+             headers, results = self.get_query_results(
+                 result, result_size, accept_header
+             )
+             if accept_header == "application/qlever-results+json":
+                 runtime_info_cmd = (
+                     f"jq 'if .runtimeInformation then"
+                     f" .runtimeInformation else"
+                     f' "null" end\' {result}'
+                 )
+                 runtime_info_str = run_command(
+                     runtime_info_cmd, return_output=True
+                 )
+                 if runtime_info_str != "null":
+                     record["runtime_info"] = json.loads(runtime_info_str)
+         record["runtime_info"]["client_time"] = client_time
+         record["headers"] = headers
+         record["results"] = results
+         return record
+
+     def get_query_results(
+         self, result_file: str, result_size: int, accept_header: str
+     ) -> tuple[list[str], list[list[str]]]:
+         """
+         Return headers and query results as a tuple for the various accept
+         headers
+         """
+         if accept_header in ("text/tab-separated-values", "text/csv"):
+             separator = "," if accept_header == "text/csv" else "\t"
+             get_result_cmd = f"sed -n '1,{result_size + 1}p' {result_file}"
+             results_str = run_command(get_result_cmd, return_output=True)
+             reader = csv.reader(StringIO(results_str), delimiter=separator)
+             headers = next(reader)
+             results = [row for row in reader]
+             return headers, results
+
+         elif accept_header == "application/qlever-results+json":
+             get_result_cmd = (
+                 f"jq '{{headers: .selected, results: .res[0:{result_size}]}}' "
+                 f"{result_file}"
+             )
+             results_str = run_command(get_result_cmd, return_output=True)
+             results_json = json.loads(results_str)
+             return results_json["headers"], results_json["results"]
+
+         elif accept_header == "application/sparql-results+json":
+             get_result_cmd = (
+                 f"jq '{{headers: .head.vars, "
+                 f"bindings: .results.bindings[0:{result_size}]}}' "
+                 f"{result_file}"
+             )
+             results_str = run_command(get_result_cmd, return_output=True)
+             results_json = json.loads(results_str)
+             results = []
+             bindings = results_json.get("bindings", [])
+             for binding in bindings:
+                 result = []
+                 if not binding or not isinstance(binding, dict):
+                     results.append([])
+                     continue
+                 for obj in binding.values():
+                     value = '"' + obj["value"] + '"'
+                     if obj["type"] == "uri":
+                         value = "<" + value.strip('"') + ">"
+                     elif "datatype" in obj:
+                         value += "^^<" + obj["datatype"] + ">"
+                     elif "xml:lang" in obj:
+                         value += "@" + obj["xml:lang"]
+                     result.append(value)
+                 results.append(result)
+             return results_json["headers"], results
+
+         else:  # text/turtle
+             graph = rdflib.Graph()
+             graph.parse(result_file, format="turtle")
+             headers = ["?subject", "?predicate", "?object"]
+             results = []
+             for i, (s, p, o) in enumerate(graph):
+                 if i >= result_size:
+                     break
+                 results.append([str(s), str(p), str(o)])
+             return headers, results
+
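The SPARQL-JSON branch above renders each binding as a list of strings in rough Turtle style: URIs in angle brackets, literals quoted, with a datatype or language tag appended. A standalone sketch of that inner loop on a hypothetical binding:

binding = {
    "s": {"type": "uri", "value": "http://example.org/x"},
    "label": {"type": "literal", "value": "x", "xml:lang": "en"},
    "n": {"type": "literal", "value": "42",
          "datatype": "http://www.w3.org/2001/XMLSchema#integer"},
}
row = []
for obj in binding.values():
    value = '"' + obj["value"] + '"'
    if obj["type"] == "uri":
        value = "<" + value.strip('"') + ">"
    elif "datatype" in obj:
        value += "^^<" + obj["datatype"] + ">"
    elif "xml:lang" in obj:
        value += "@" + obj["xml:lang"]
    row.append(value)
print(row)
# ['<http://example.org/x>', '"x"@en', '"42"^^<http://www.w3.org/2001/XMLSchema#integer>']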
+     @staticmethod
+     def write_query_records_to_result_file(
+         query_data: dict[str, list[dict[str, Any]]], out_file: Path
+     ) -> None:
+         """
+         Write the yaml records for all queries to the output yaml file
+         """
+         config_yaml = dict_to_yaml(query_data)
+         with open(out_file, "w") as eval_yaml_file:
+             eval_yaml_file.write(config_yaml)
+         log.info("")
+         log.info(
+             f"Generated result yaml file: {out_file.stem}{out_file.suffix} "
+             f"in the directory {out_file.parent.resolve()}"
+         )