qlever 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -0,0 +1,1022 @@
+from __future__ import annotations
+
+import csv
+import json
+import re
+import shlex
+import subprocess
+import time
+import traceback
+from io import StringIO
+from pathlib import Path
+from typing import Any
+
+import rdflib
+import yaml
+from termcolor import colored
+
+from qlever.command import QleverCommand
+from qlever.commands.clear_cache import ClearCacheCommand
+from qlever.commands.ui import dict_to_yaml
+from qlever.log import log, mute_log
+from qlever.util import run_command, run_curl_command
+
+
+class BenchmarkQueriesCommand(QleverCommand):
+    """
+    Class for running a given sequence of benchmark or example queries and
+    showing their processing times and result sizes.
+    """
+
+    def __init__(self):
+        pass
+
+    def description(self) -> str:
+        return (
+            "Run the given benchmark or example queries and show their "
+            "processing times and result sizes"
+        )
+
+    def should_have_qleverfile(self) -> bool:
+        return False
+
+    def relevant_qleverfile_arguments(self) -> dict[str, list[str]]:
+        return {"server": ["host_name", "port"], "ui": ["ui_config"]}
+
+    def additional_arguments(self, subparser) -> None:
+        subparser.add_argument(
+            "--sparql-endpoint", type=str, help="URL of the SPARQL endpoint"
+        )
+        subparser.add_argument(
+            "--sparql-endpoint-preset",
+            choices=[
+                "https://qlever.dev/api/wikidata",
+                "https://qlever.dev/api/uniprot",
+                "https://qlever.dev/api/pubchem",
+                "https://qlever.dev/api/osm-planet",
+                "https://wikidata.demo.openlinksw.com/sparql",
+                "https://sparql.uniprot.org/sparql",
+            ],
+            help="SPARQL endpoint from fixed list (to save typing)",
+        )
+        subparser.add_argument(
+            "--queries-tsv",
+            type=str,
+            default=None,
+            help=(
+                "Path to a TSV file containing benchmark queries, one per "
+                "line: query_description<TAB>full_sparql_query"
+            ),
+        )
+        subparser.add_argument(
+            "--queries-yml",
+            type=str,
+            default=None,
+            help=(
+                "Path to a YAML file containing benchmark queries. "
+                "The YAML file should have a top-level "
+                "key called 'queries', which is a list of dictionaries. "
+                "Each dictionary should contain 'query' for the query "
+                "description and 'sparql' for the full SPARQL query."
+            ),
+        )
+        subparser.add_argument(
+            "--query-ids",
+            type=str,
+            default="1-$",
+            help="Query IDs as comma-separated list of "
+            "ranges (e.g., 1-5,7,12-$)",
+        )
+        subparser.add_argument(
+            "--query-regex",
+            type=str,
+            help="Only consider example queries matching "
+            "this regex (case-insensitive)",
+        )
+        subparser.add_argument(
+            "--example-queries",
+            action="store_true",
+            default=False,
+            help=(
+                "Run the example queries for the given --ui-config "
+                "instead of the benchmark queries from a TSV or YML file"
+            ),
+        )
+        subparser.add_argument(
+            "--download-or-count",
+            choices=["download", "count"],
+            default="download",
+            help="Whether to download the full result "
+            "or just compute the size of the result",
+        )
+        subparser.add_argument(
+            "--limit", type=int, help="Limit on the number of results"
+        )
+        subparser.add_argument(
+            "--remove-offset-and-limit",
+            action="store_true",
+            default=False,
+            help="Remove OFFSET and LIMIT from the query",
+        )
+        subparser.add_argument(
+            "--accept",
+            type=str,
+            choices=[
+                "text/tab-separated-values",
+                "text/csv",
+                "application/sparql-results+json",
+                "application/qlever-results+json",
+                "application/octet-stream",
+                "text/turtle",
+                "AUTO",
+            ],
+            default="application/sparql-results+json",
+            help="Accept header for the SPARQL query; AUTO means "
+            "`text/turtle` for CONSTRUCT and DESCRIBE queries, "
+            "`application/sparql-results+json` for all others",
+        )
+        subparser.add_argument(
+            "--clear-cache",
+            choices=["yes", "no"],
+            default="no",
+            help="Clear the cache before each query (only works for QLever)",
+        )
+        subparser.add_argument(
+            "--width-query-description",
+            type=int,
+            default=70,
+            help="Width for printing the query description",
+        )
+        subparser.add_argument(
+            "--width-error-message",
+            type=int,
+            default=50,
+            help="Width for printing the error message (0 = no limit)",
+        )
+        subparser.add_argument(
+            "--width-result-size",
+            type=int,
+            default=14,
+            help="Width for printing the result size",
+        )
+        subparser.add_argument(
+            "--add-query-type-to-description",
+            action="store_true",
+            default=False,
+            help="Add the query type (SELECT, ASK, CONSTRUCT, DESCRIBE, "
+            "UNKNOWN) to the description",
+        )
+        subparser.add_argument(
+            "--show-query",
+            choices=["always", "never", "on-error"],
+            default="never",
+            help="Show the queries that will be executed "
+            "(always, never, or only on error)",
+        )
+        subparser.add_argument(
+            "--show-prefixes",
+            action="store_true",
+            default=False,
+            help="When showing the query, also show the prefixes",
+        )
+        subparser.add_argument(
+            "--results-dir",
+            type=str,
+            default=".",
+            help=(
+                "The directory where the YML result file will be saved "
+                "for the evaluation web app (default: current working "
+                "directory)"
+            ),
+        )
+        subparser.add_argument(
+            "--result-file",
+            type=str,
+            default=None,
+            help=(
+                "Base name used for the result YML file; should be of the "
+                "form `<dataset>.<engine>`, e.g., `wikidata.qlever`"
+            ),
+        )
+        subparser.add_argument(
+            "--max-results-output-file",
+            type=int,
+            default=5,
+            help=(
+                "Maximum number of results per query in the output result "
+                "YML file (default: 5)"
+            ),
+        )
+
+    def pretty_printed_query(self, query: str, show_prefixes: bool) -> str:
+        remove_prefixes_cmd = (
+            " | sed '/^PREFIX /Id'" if not show_prefixes else ""
+        )
+        pretty_print_query_cmd = (
+            f"echo {shlex.quote(query)}"
+            f" | docker run -i --rm sparqling/sparql-formatter"
+            f"{remove_prefixes_cmd} | grep -v '^$'"
+        )
+        try:
+            query_pretty_printed = run_command(
+                pretty_print_query_cmd, return_output=True
+            )
+            return query_pretty_printed.rstrip()
+        except Exception as e:
+            log.error(
+                f"Failed to pretty-print query, returning original query: {e}"
+            )
+            return query.rstrip()
+
+    def sparql_query_type(self, query: str) -> str:
+        match = re.search(
+            r"(SELECT|ASK|CONSTRUCT|DESCRIBE)\s", query, re.IGNORECASE
+        )
+        if match:
+            return match.group(1).upper()
+        else:
+            return "UNKNOWN"
+
+    @staticmethod
+    def filter_queries(
+        queries: list[tuple[str, str]], query_ids: str, query_regex: str
+    ) -> list[tuple[str, str]]:
+        """
+        Given a list of queries (tuples of query description and full SPARQL
+        query), keep only those whose 1-based index is selected by
+        `query_ids` and which, if `query_regex` is given, match it in the
+        description or the SPARQL query.
+        """
+        # Get the list of query indices to keep
+        total_queries = len(queries)
+        query_indices = []
+        for part in query_ids.split(","):
+            if "-" in part:
+                start, end = part.split("-")
+                if end == "$":
+                    end = total_queries
+                query_indices.extend(range(int(start) - 1, int(end)))
+            else:
+                idx = int(part) if part != "$" else total_queries
+                query_indices.append(idx - 1)
+
+        try:
+            filtered_queries = []
+            pattern = (
+                re.compile(query_regex, re.IGNORECASE) if query_regex else None
+            )
+            for query_idx in query_indices:
+                if query_idx >= total_queries:
+                    continue
+
+                query_desc, sparql = queries[query_idx]
+
+                # Only include queries that match the query_regex if present
+                if pattern and not (
+                    pattern.search(query_desc) or pattern.search(sparql)
+                ):
+                    continue
+
+                filtered_queries.append((query_desc, sparql))
+            return filtered_queries
+        except Exception as exc:
+            log.error(f"Error filtering queries: {exc}")
+            return []
+
+    @staticmethod
+    def parse_queries_tsv(queries_cmd: str) -> list[tuple[str, str]]:
+        """
+        Execute the given bash command to fetch TSV queries and return a
+        list of queries, i.e., tuples (query_description, full_sparql_query).
+        """
+        try:
+            tsv_queries_str = run_command(queries_cmd, return_output=True)
+            if len(tsv_queries_str) == 0:
+                log.error("No queries found in the TSV queries file")
+                return []
+            # Split on the first tab only, in case the query contains tabs.
+            return [
+                tuple(line.split("\t", 1))
+                for line in tsv_queries_str.strip().splitlines()
+            ]
+        except Exception as exc:
+            log.error(f"Failed to read the TSV queries file: {exc}")
+            return []
+
+    @staticmethod
+    def parse_queries_yml(queries_file: str) -> list[tuple[str, str]]:
+        """
+        Parse a YML file, validate its structure, and return a list of
+        queries, i.e., tuples (query_description, full_sparql_query).
+        """
+        with open(queries_file, "r", encoding="utf-8") as q_file:
+            try:
+                data = yaml.safe_load(q_file)  # Load YAML safely
+            except yaml.YAMLError as exc:
+                log.error(f"Error parsing {queries_file} file: {exc}")
+                return []
+
+        # Validate the structure
+        if not isinstance(data, dict) or "queries" not in data:
+            log.error(
+                "Error: YAML file must contain a top-level 'queries' key"
+            )
+            return []
+
+        if not isinstance(data["queries"], list):
+            log.error("Error: 'queries' key in YML file must hold a list.")
+            return []
+
+        for item in data["queries"]:
+            if (
+                not isinstance(item, dict)
+                or "query" not in item
+                or "sparql" not in item
+            ):
+                log.error(
+                    "Error: Each item in 'queries' must contain "
+                    "'query' and 'sparql' keys."
+                )
+                return []
+
+        return [
+            (query["query"], query["sparql"]) for query in data["queries"]
+        ]
+
+    def get_result_size(
+        self,
+        count_only: bool,
+        query_type: str,
+        accept_header: str,
+        result_file: str,
+    ) -> tuple[int, dict[str, str] | None]:
+        """
+        Get the result size and an error_msg dict (if the query failed),
+        for the various accept headers.
+        """
+
+        def get_json_error_msg(e: Exception) -> dict[str, str]:
+            error_msg = {
+                "short": "Malformed JSON",
+                "long": "curl returned with code 200, "
+                "but the JSON is malformed: " + re.sub(r"\s+", " ", str(e)),
+            }
+            return error_msg
+
+        result_size = 0
+        error_msg = None
+        # CASE 0: The result is empty despite a 200 HTTP code (not a
+        # problem for CONSTRUCT and DESCRIBE queries).
+        if Path(result_file).stat().st_size == 0 and query_type not in (
+            "CONSTRUCT",
+            "DESCRIBE",
+        ):
+            result_size = 0
+            error_msg = {
+                "short": "Empty result",
+                "long": "curl returned with code 200, but the result is empty",
+            }
+
+        # CASE 1: Just counting the size of the result (TSV or JSON).
+        elif count_only:
+            if accept_header in ("text/tab-separated-values", "text/csv"):
+                result_size = run_command(
+                    f"sed 1d {result_file}", return_output=True
+                )
+            elif accept_header == "application/qlever-results+json":
+                try:
+                    # Extract the first number from the first result row.
+                    result_size = run_command(
+                        f"jq '.res[0]' {result_file}"
+                        " | sed 's/[^0-9]*\\([0-9]*\\).*/\\1/'",
+                        return_output=True,
+                    )
+                except Exception as e:
+                    error_msg = get_json_error_msg(e)
+            else:
+                try:
+                    result_size = run_command(
+                        f'jq -r ".results.bindings[0]'
+                        f" | to_entries[0].value.value"
+                        f' | tonumber" {result_file}',
+                        return_output=True,
+                    )
+                except Exception as e:
+                    error_msg = get_json_error_msg(e)
+
+        # CASE 2: Downloading the full result (TSV, CSV, Turtle, JSON).
+        else:
+            if accept_header in ("text/tab-separated-values", "text/csv"):
+                result_size = run_command(
+                    f"sed 1d {result_file} | wc -l", return_output=True
+                )
+            elif accept_header == "text/turtle":
+                result_size = run_command(
+                    f"sed '1d;/^@prefix/d;/^\\s*$/d' {result_file} | wc -l",
+                    return_output=True,
+                )
+            elif accept_header == "application/qlever-results+json":
+                try:
+                    result_size = run_command(
+                        f'jq -r ".resultsize" {result_file}',
+                        return_output=True,
+                    )
+                except Exception as e:
+                    error_msg = get_json_error_msg(e)
+            else:
+                try:
+                    result_size = int(
+                        run_command(
+                            f'jq -r ".results.bindings | length"'
+                            f" {result_file}",
+                            return_output=True,
+                        ).rstrip()
+                    )
+                except Exception as e:
+                    error_msg = get_json_error_msg(e)
+        return int(result_size), error_msg
+
+    @staticmethod
+    def get_single_int_result(result_file: str) -> int | None:
+        """
+        When downloading the full result of a query with accept header
+        application/sparql-results+json and result_size == 1, get the single
+        integer result value (if any).
+        """
+        single_int_result = None
+        try:
+            single_int_result = int(
+                run_command(
+                    f'jq -e -r ".results.bindings[0][] | .value"'
+                    f" {result_file}",
+                    return_output=True,
+                ).rstrip()
+            )
+        except Exception:
+            pass
+        return single_int_result
+
+    def execute(self, args) -> bool:
+        # We can't have both `--remove-offset-and-limit` and `--limit`.
+        if args.remove_offset_and_limit and args.limit:
+            log.error("Cannot have both --remove-offset-and-limit and --limit")
+            return False
+
+        # Extract dataset and sparql_engine name from result file
+        dataset, engine = None, None
+        if args.result_file is not None:
+            result_file_parts = args.result_file.split(".")
+            if len(result_file_parts) != 2:
+                log.error(
+                    "The argument of --result-file should be of the form "
+                    "`<dataset>.<engine>`, e.g., `wikidata.qlever`"
+                )
+                return False
+            results_dir_path = Path(args.results_dir)
+            if results_dir_path.exists():
+                if not results_dir_path.is_dir():
+                    log.error(
+                        f"{results_dir_path} exists but is not a directory"
+                    )
+                    return False
+            else:
+                log.info(
+                    f"Creating results directory: {results_dir_path.absolute()}"
+                )
+                results_dir_path.mkdir(parents=True, exist_ok=True)
+            dataset, engine = result_file_parts
+
+        # If `args.accept` is `application/sparql-results+json` or
+        # `application/qlever-results+json` or `AUTO`, we need `jq`.
+        if args.accept in (
+            "application/sparql-results+json",
+            "application/qlever-results+json",
+            "AUTO",
+        ):
+            try:
+                subprocess.run(
+                    "jq --version",
+                    shell=True,
+                    check=True,
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.DEVNULL,
+                )
+            except Exception as e:
+                log.error(f"Please install `jq` for {args.accept} ({e})")
+                return False
+
+        if not any((args.queries_tsv, args.queries_yml, args.example_queries)):
+            log.error(
+                "No benchmark or example queries to read! Either pass benchmark "
+                "queries using --queries-tsv or --queries-yml, or pass the "
+                "argument --example-queries to run example queries for the "
+                f"given ui_config {args.ui_config}"
+            )
+            return False
+
+        if all((args.queries_tsv, args.queries_yml)):
+            log.error("Cannot have both --queries-tsv and --queries-yml")
+            return False
+
+        if any((args.queries_tsv, args.queries_yml)) and args.example_queries:
+            queries_file_arg = "tsv" if args.queries_tsv else "yml"
+            log.error(
+                f"Cannot have both --queries-{queries_file_arg} and "
+                "--example-queries"
+            )
+            return False
+
+        # Handle shortcuts for SPARQL endpoint.
+        if args.sparql_endpoint_preset:
+            args.sparql_endpoint = args.sparql_endpoint_preset
+
+        # Limit only works with full result.
+        if args.limit and args.download_or_count == "count":
+            log.error("Limit only works with full result")
+            return False
+
+        # Clear cache only works for QLever.
+        is_qlever = (
+            not args.sparql_endpoint
+            or args.sparql_endpoint.startswith("https://qlever")
+        )
+        if engine is not None:
+            is_qlever = is_qlever or "qlever" in engine.lower()
+        if args.clear_cache == "yes":
+            if is_qlever:
+                log.warning(
+                    "Clearing the cache before each query"
+                    " (only works for QLever)"
+                )
+            else:
+                log.warning(
+                    "Clearing the cache only works for QLever"
+                    ", option `--clear-cache` is ignored"
+                )
+                args.clear_cache = "no"
+
+        # Show what the command will do.
+        example_queries_cmd = (
+            "curl -sv https://qlever.cs.uni-freiburg.de/"
+            f"api/examples/{args.ui_config}"
+        )
+        sparql_endpoint = (
+            args.sparql_endpoint
+            if args.sparql_endpoint
+            else f"{args.host_name}:{args.port}"
+        )
+
+        self.show(
+            f"Obtain queries via: "
+            f"{args.queries_yml or args.queries_tsv or example_queries_cmd}\n"
+            f"SPARQL endpoint: {sparql_endpoint}\n"
+            f"Accept header: {args.accept}\n"
+            f"Download result for each query or just count:"
+            f" {args.download_or_count.upper()}"
+            + (f" with LIMIT {args.limit}" if args.limit else ""),
+            only_show=args.show,
+        )
+        if args.show:
+            return True
+
+        if args.queries_yml:
+            queries = self.parse_queries_yml(args.queries_yml)
+        elif args.queries_tsv:
+            queries = self.parse_queries_tsv(f"cat {args.queries_tsv}")
+        else:
+            queries = self.parse_queries_tsv(example_queries_cmd)
+
+        filtered_queries = self.filter_queries(
+            queries, args.query_ids, args.query_regex
+        )
+
+        if len(filtered_queries) == 0 or not filtered_queries[0]:
+            log.error("No queries to process!")
+            return False
+
+        # We want the width of the query description to be an odd number (in
+        # case we have to truncate it, in which case we want to have a " ... "
+        # in the middle).
+        width_query_description_half = args.width_query_description // 2
+        width_query_description = 2 * width_query_description_half + 1
+
+        # Launch the queries one after the other and for each print: the
+        # description, the result size (number of rows), and the query
+        # processing time (seconds).
+        query_times = []
+        result_sizes = []
+        result_yml_query_records = {"queries": []}
+        num_failed = 0
+        for description, query in filtered_queries:
+            if len(query) == 0:
+                log.error("Could not parse description and query, line is:")
+                log.info("")
+                log.info(f"{description}\t{query}")
+                return False
+            query_type = self.sparql_query_type(query)
+            if args.add_query_type_to_description or args.accept == "AUTO":
+                description = f"{description} [{query_type}]"
+
+            # Clear the cache.
+            if args.clear_cache == "yes":
+                args.server_url = sparql_endpoint
+                args.complete = False
+                clear_cache_successful = False
+                with mute_log():
+                    clear_cache_successful = ClearCacheCommand().execute(args)
+                if not clear_cache_successful:
+                    log.warning("Failed to clear the cache")
+
+            # Remove OFFSET and LIMIT (after the last closing bracket).
+            if args.remove_offset_and_limit or args.limit:
+                closing_bracket_idx = query.rfind("}")
+                regexes = [
+                    re.compile(r"OFFSET\s+\d+\s*", re.IGNORECASE),
+                    re.compile(r"LIMIT\s+\d+\s*", re.IGNORECASE),
+                ]
+                for regex in regexes:
+                    match = re.search(regex, query[closing_bracket_idx:])
+                    if match:
+                        query = (
+                            query[: closing_bracket_idx + match.start()]
+                            + query[closing_bracket_idx + match.end() :]
+                        )
+
+            # Limit query.
+            if args.limit:
+                query += f" LIMIT {args.limit}"
+
+            # Count query.
+            if args.download_or_count == "count":
+                # First find out if there is a FROM clause.
+                regex_from_clause = re.compile(
+                    r"\s*FROM\s+<[^>]+>\s*", re.IGNORECASE
+                )
+                match_from_clause = re.search(regex_from_clause, query)
+                from_clause = " "
+                if match_from_clause:
+                    from_clause = match_from_clause.group(0)
+                    query = (
+                        query[: match_from_clause.start()]
+                        + " "
+                        + query[match_from_clause.end() :]
+                    )
+                # Now we can add the outer SELECT COUNT(*).
+                query = (
+                    re.sub(
+                        r"SELECT ",
+                        "SELECT (COUNT(*) AS ?qlever_count_)"
+                        + from_clause
+                        + "WHERE { SELECT ",
+                        query,
+                        count=1,
+                        flags=re.IGNORECASE,
+                    )
+                    + " }"
+                )
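+                # Illustrative example of the rewrite above:
+                #   SELECT ?x WHERE { ?x a <Actor> }
+                # becomes
+                #   SELECT (COUNT(*) AS ?qlever_count_) WHERE {
+                #     SELECT ?x WHERE { ?x a <Actor> } }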
+
+            # A bit of pretty-printing.
+            query = re.sub(r"\s+", " ", query)
+            query = re.sub(r"\s*\.\s*\}", " }", query)
+            if args.show_query == "always":
+                log.info("")
+                log.info(
+                    colored(
+                        self.pretty_printed_query(query, args.show_prefixes),
+                        "cyan",
+                    )
+                )
+
+            # Accept header. For "AUTO", use `text/turtle` for CONSTRUCT
+            # and DESCRIBE queries and `application/sparql-results+json`
+            # for all others.
+            accept_header = args.accept
+            if accept_header == "AUTO":
+                if query_type in ("CONSTRUCT", "DESCRIBE"):
+                    accept_header = "text/turtle"
+                else:
+                    accept_header = "application/sparql-results+json"
+
+            # Launch query.
+            curl_cmd = (
+                f"curl -Ls {sparql_endpoint}"
+                f' -w "HTTP code: %{{http_code}}\\n"'
+                f' -H "Accept: {accept_header}"'
+                f" --data-urlencode query={shlex.quote(query)}"
+            )
+            log.debug(curl_cmd)
+            result_file = (
+                f"qlever.example_queries.result.{abs(hash(curl_cmd))}.tmp"
+            )
+            start_time = time.time()
+            try:
+                http_code = run_curl_command(
+                    sparql_endpoint,
+                    headers={"Accept": accept_header},
+                    params={"query": query},
+                    result_file=result_file,
+                ).strip()
+                if http_code == "200":
+                    time_seconds = time.time() - start_time
+                    error_msg = None
+                else:
+                    time_seconds = time.time() - start_time
+                    error_msg = {
+                        "short": f"HTTP code: {http_code}",
+                        "long": re.sub(
+                            r"\s+", " ", Path(result_file).read_text()
+                        ),
+                    }
+            except Exception as e:
+                time_seconds = time.time() - start_time
+                if args.log_level == "DEBUG":
+                    traceback.print_exc()
+                error_msg = {
+                    "short": "Exception",
+                    "long": re.sub(r"\s+", " ", str(e)),
+                }
+
+            # Get result size (via the command line, in order to avoid loading
+            # a potentially large JSON file into Python, which is slow).
+            if error_msg is None:
+                result_size, error_msg = self.get_result_size(
+                    args.download_or_count == "count",
+                    query_type,
+                    accept_header,
+                    result_file,
+                )
+                single_int_result = None
+                if (
+                    result_size == 1
+                    and accept_header == "application/sparql-results+json"
+                    and args.download_or_count == "download"
+                ):
+                    single_int_result = self.get_single_int_result(result_file)
+
+            # Build the result YAML record if an output file was requested.
+            if args.result_file is not None:
+                # Result size to record: the full size when downloading,
+                # 1 for count queries, None if the query failed.
+                if error_msg is not None:
+                    result_length = None
+                elif args.download_or_count == "download":
+                    result_length = result_size
+                else:
+                    result_length = 1
+                query_results = (
+                    error_msg if error_msg is not None else result_file
+                )
+                query_record = self.get_result_yml_query_record(
+                    query=description,
+                    sparql=self.pretty_printed_query(
+                        query, args.show_prefixes
+                    ),
+                    client_time=time_seconds,
+                    result=query_results,
+                    result_size=result_length,
+                    max_result_size=args.max_results_output_file,
+                    accept_header=accept_header,
+                )
+                result_yml_query_records["queries"].append(query_record)
+
+            # Print description, time, result in tabular form.
+            if len(description) > width_query_description:
+                description = (
+                    description[: width_query_description_half - 2]
+                    + " ... "
+                    + description[-width_query_description_half + 2 :]
+                )
+            if error_msg is None:
+                result_size = int(result_size)
+                single_int_result = (
+                    f" [single int result: {single_int_result:,}]"
+                    if single_int_result is not None
+                    else ""
+                )
+                log.info(
+                    f"{description:<{width_query_description}} "
+                    f"{time_seconds:6.2f} s "
+                    f"{result_size:>{args.width_result_size},}"
+                    f"{single_int_result}"
+                )
+                query_times.append(time_seconds)
+                result_sizes.append(result_size)
+            else:
+                num_failed += 1
+                if (
+                    args.width_error_message > 0
+                    and len(error_msg["long"]) > args.width_error_message
+                    and args.log_level != "DEBUG"
+                    and args.show_query != "on-error"
+                ):
+                    error_msg["long"] = (
+                        error_msg["long"][: args.width_error_message - 3]
+                        + "..."
+                    )
+                separator_short_long = (
+                    "\n" if args.show_query == "on-error" else " "
+                )
+                log.info(
+                    f"{description:<{width_query_description}} "
+                    f"{colored('FAILED ', 'red')}"
+                    f"{colored(error_msg['short'], 'red'):>{args.width_result_size}}"
+                    f"{separator_short_long}"
+                    f"{colored(error_msg['long'], 'red')}"
+                )
+                if args.show_query == "on-error":
+                    log.info(
+                        colored(
+                            self.pretty_printed_query(
+                                query, args.show_prefixes
+                            ),
+                            "cyan",
+                        )
+                    )
+                    log.info("")
+
+            # Remove the result file (unless in debug mode).
+            if args.log_level != "DEBUG":
+                Path(result_file).unlink(missing_ok=True)
+
+        # Check that each query has a time and a result size, or it failed.
+        assert len(result_sizes) == len(query_times)
+        assert len(query_times) + num_failed == len(filtered_queries)
+
+        if args.result_file:
+            if len(result_yml_query_records["queries"]) != 0:
+                outfile_name = f"{dataset}.{engine}.results.yaml"
+                outfile = Path(args.results_dir) / outfile_name
+                self.write_query_records_to_result_file(
+                    query_data=result_yml_query_records,
+                    out_file=outfile,
+                )
+            else:
+                log.error(
+                    f"Nothing to write to output result YML file: "
+                    f"{args.result_file}"
+                )
+
+        # Show statistics.
+        if len(query_times) > 0:
+            n = len(query_times)
+            total_query_time = sum(query_times)
+            average_query_time = total_query_time / n
+            median_query_time = sorted(query_times)[n // 2]
+            total_result_size = sum(result_sizes)
+            average_result_size = round(total_result_size / n)
+            median_result_size = sorted(result_sizes)[n // 2]
+            query_or_queries = "query" if n == 1 else "queries"
+            description = f"TOTAL for {n} {query_or_queries}"
+            log.info("")
+            log.info(
+                f"{description:<{width_query_description}} "
+                f"{total_query_time:6.2f} s "
+                f"{total_result_size:>14,}"
+            )
+            description = f"AVERAGE for {n} {query_or_queries}"
+            log.info(
+                f"{description:<{width_query_description}} "
+                f"{average_query_time:6.2f} s "
+                f"{average_result_size:>14,}"
+            )
+            description = f"MEDIAN for {n} {query_or_queries}"
+            log.info(
+                f"{description:<{width_query_description}} "
+                f"{median_query_time:6.2f} s "
+                f"{median_result_size:>14,}"
+            )
+
+        # Show number of failed queries.
+        if num_failed > 0:
+            log.info("")
+            description = "Number of FAILED queries"
+            num_failed_string = f"{num_failed:>6}"
+            if num_failed == len(filtered_queries):
+                num_failed_string += " [all]"
+            log.info(
+                colored(
+                    f"{description:<{width_query_description}} "
+                    f"{num_failed_string:>24}",
+                    "red",
+                )
+            )
+
+        # Return success (has nothing to do with how many queries failed).
+        return True
+
+    def get_result_yml_query_record(
+        self,
+        query: str,
+        sparql: str,
+        client_time: float,
+        result: str | dict[str, str],
+        result_size: int | None,
+        max_result_size: int,
+        accept_header: str,
+    ) -> dict[str, Any]:
+        """
+        Construct a dictionary with query information for the output result
+        YAML file.
+        """
+        record = {
+            "query": query,
+            "sparql": sparql,
+            "runtime_info": {},
+        }
+        if result_size is None:
+            results = f"{result['short']}: {result['long']}"
+            headers = []
+        else:
+            record["result_size"] = result_size
+            result_size = (
+                max_result_size
+                if result_size > max_result_size
+                else result_size
+            )
+            headers, results = self.get_query_results(
+                result, result_size, accept_header
+            )
+            if accept_header == "application/qlever-results+json":
+                runtime_info_cmd = (
+                    f"jq 'if .runtimeInformation then"
+                    f" .runtimeInformation else"
+                    f' "null" end\' {result}'
+                )
+                runtime_info_str = run_command(
+                    runtime_info_cmd, return_output=True
+                ).strip()
+                # NOTE: jq prints the fallback as the JSON string `"null"`
+                # (with quotes), so check for both spellings.
+                if runtime_info_str not in ("null", '"null"'):
+                    record["runtime_info"] = json.loads(runtime_info_str)
+        record["runtime_info"]["client_time"] = client_time
+        record["headers"] = headers
+        record["results"] = results
+        return record
+
+    def get_query_results(
+        self, result_file: str, result_size: int, accept_header: str
+    ) -> tuple[list[str], list[list[str]]]:
+        """
+        Return headers and query results as a tuple, for the various accept
+        headers.
+        """
+        if accept_header in ("text/tab-separated-values", "text/csv"):
+            separator = "," if accept_header == "text/csv" else "\t"
+            get_result_cmd = f"sed -n '1,{result_size + 1}p' {result_file}"
+            results_str = run_command(get_result_cmd, return_output=True)
+            reader = csv.reader(StringIO(results_str), delimiter=separator)
+            headers = next(reader)
+            results = [row for row in reader]
+            return headers, results
+
+        elif accept_header == "application/qlever-results+json":
+            get_result_cmd = (
+                f"jq '{{headers: .selected, results: .res[0:{result_size}]}}' "
+                f"{result_file}"
+            )
+            results_str = run_command(get_result_cmd, return_output=True)
+            results_json = json.loads(results_str)
+            return results_json["headers"], results_json["results"]
+
+        elif accept_header == "application/sparql-results+json":
+            get_result_cmd = (
+                f"jq '{{headers: .head.vars, "
+                f"bindings: .results.bindings[0:{result_size}]}}' "
+                f"{result_file}"
+            )
+            results_str = run_command(get_result_cmd, return_output=True)
+            results_json = json.loads(results_str)
+            results = []
+            bindings = results_json.get("bindings", [])
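+            # Format each value in RDF literal style (illustrative):
+            #   a uri "http://x" becomes <http://x>, a literal "a" with
+            #   lang "en" becomes "a"@en, a literal "1" with a datatype
+            #   becomes "1"^^<datatype>.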
+            for binding in bindings:
+                result = []
+                if not binding or not isinstance(binding, dict):
+                    results.append([])
+                    continue
+                for obj in binding.values():
+                    value = '"' + obj["value"] + '"'
+                    if obj["type"] == "uri":
+                        value = "<" + value.strip('"') + ">"
+                    elif "datatype" in obj:
+                        value += "^^<" + obj["datatype"] + ">"
+                    elif "xml:lang" in obj:
+                        value += "@" + obj["xml:lang"]
+                    result.append(value)
+                results.append(result)
+            return results_json["headers"], results
+
+        else:  # text/turtle
+            graph = rdflib.Graph()
+            graph.parse(result_file, format="turtle")
+            headers = ["?subject", "?predicate", "?object"]
+            results = []
+            for i, (s, p, o) in enumerate(graph):
+                if i >= result_size:
+                    break
+                results.append([str(s), str(p), str(o)])
+            return headers, results
+
+    @staticmethod
+    def write_query_records_to_result_file(
+        query_data: dict[str, list[dict[str, Any]]], out_file: Path
+    ) -> None:
+        """
+        Write the YAML records for all queries to the output YAML file.
+        """
+        config_yaml = dict_to_yaml(query_data)
+        with open(out_file, "w", encoding="utf-8") as eval_yaml_file:
+            eval_yaml_file.write(config_yaml)
+        log.info("")
+        log.info(
+            f"Generated result yaml file: {out_file.name} "
+            f"in the directory {out_file.parent.resolve()}"
+        )