qlever 0.5.15__py3-none-any.whl → 0.5.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of qlever might be problematic. Click here for more details.

@@ -17,14 +17,15 @@ from qlever.util import run_command, run_curl_command
17
17
 
18
18
  class ExampleQueriesCommand(QleverCommand):
19
19
  """
20
- Class for executing the `warmup` command.
20
+ Class for running a given sequence of example queries and showing
21
+ their processing times and result sizes.
21
22
  """
22
23
 
23
24
  def __init__(self):
24
25
  pass
25
26
 
26
27
  def description(self) -> str:
27
- return "Show how much of the cache is currently being used"
28
+ return "Run the given queries and show their processing times and result sizes"
28
29
 
29
30
  def should_have_qleverfile(self) -> bool:
30
31
  return False
@@ -51,13 +52,15 @@ class ExampleQueriesCommand(QleverCommand):
51
52
  subparser.add_argument(
52
53
  "--get-queries-cmd",
53
54
  type=str,
54
- help="Command to get example queries as TSV " "(description, query)",
55
+ help="Command to get example queries as TSV "
56
+ "(description, query)",
55
57
  )
56
58
  subparser.add_argument(
57
59
  "--query-ids",
58
60
  type=str,
59
61
  default="1-$",
60
- help="Query IDs as comma-separated list of " "ranges (e.g., 1-5,7,12-$)",
62
+ help="Query IDs as comma-separated list of "
63
+ "ranges (e.g., 1-5,7,12-$)",
61
64
  )
62
65
  subparser.add_argument(
63
66
  "--query-regex",
@@ -68,7 +71,7 @@ class ExampleQueriesCommand(QleverCommand):
68
71
  subparser.add_argument(
69
72
  "--download-or-count",
70
73
  choices=["download", "count"],
71
- default="count",
74
+ default="download",
72
75
  help="Whether to download the full result "
73
76
  "or just compute the size of the result",
74
77
  )
@@ -88,10 +91,14 @@ class ExampleQueriesCommand(QleverCommand):
88
91
  "text/tab-separated-values",
89
92
  "text/csv",
90
93
  "application/sparql-results+json",
94
+ "application/qlever-results+json",
91
95
  "text/turtle",
96
+ "AUTO",
92
97
  ],
93
98
  default="application/sparql-results+json",
94
- help="Accept header for the SPARQL query",
99
+ help="Accept header for the SPARQL query; AUTO means "
100
+ "`text/turtle` for CONSTRUCT AND DESCRIBE queries, "
101
+ "`application/sparql-results+json` for all others",
95
102
  )
96
103
  subparser.add_argument(
97
104
  "--clear-cache",
@@ -117,6 +124,13 @@ class ExampleQueriesCommand(QleverCommand):
117
124
  default=14,
118
125
  help="Width for printing the result size",
119
126
  )
127
+ subparser.add_argument(
128
+ "--add-query-type-to-description",
129
+ action="store_true",
130
+ default=False,
131
+ help="Add the query type (SELECT, ASK, CONSTRUCT, DESCRIBE, "
132
+ "UNKNOWN) to the description",
133
+ )
120
134
  subparser.add_argument(
121
135
  "--show-query",
122
136
  choices=["always", "never", "on-error"],
@@ -130,19 +144,35 @@ class ExampleQueriesCommand(QleverCommand):
130
144
  help="When showing the query, also show the prefixes",
131
145
  )
132
146
 
133
def pretty_printed_query(self, query: str, show_prefixes: bool) -> str:
    """
    Return `query` pretty-printed, without trailing whitespace.

    The query is piped through the `sparqling/sparql-formatter` Docker
    image; empty lines are removed from the output. If `show_prefixes` is
    false, lines starting with `PREFIX` (case-insensitive) are removed as
    well. If the pretty-printing command fails, an error (including the
    reason) is logged and the original query is returned, right-stripped.
    """
    remove_prefixes_cmd = (
        " | sed '/^PREFIX /Id'" if not show_prefixes else ""
    )
    pretty_print_query_cmd = (
        f"echo {shlex.quote(query)}"
        f" | docker run -i --rm sparqling/sparql-formatter"
        f"{remove_prefixes_cmd} | grep -v '^$'"
    )
    try:
        query_pretty_printed = run_command(
            pretty_print_query_cmd, return_output=True
        )
        return query_pretty_printed.rstrip()
    except Exception as e:
        # Best effort: pretty-printing is optional, so fall back to the
        # original query, but say why the formatting failed.
        log.error(
            f"Failed to pretty-print query, returning original query: {e}"
        )
        return query.rstrip()
167
+
168
def sparql_query_type(self, query: str) -> str:
    """
    Return the type of the given SPARQL query in upper case: the first
    occurrence (case-insensitive) of one of the keywords `SELECT`, `ASK`,
    `CONSTRUCT`, `DESCRIBE` that is followed by whitespace, or `UNKNOWN`
    if there is no such occurrence.
    """
    keyword = re.search(
        r"(SELECT|ASK|CONSTRUCT|DESCRIBE)\s", query, flags=re.IGNORECASE
    )
    return keyword.group(1).upper() if keyword else "UNKNOWN"
146
176
 
147
177
  def execute(self, args) -> bool:
148
178
  # We can't have both `--remove-offset-and-limit` and `--limit`.
@@ -150,8 +180,13 @@ class ExampleQueriesCommand(QleverCommand):
150
180
  log.error("Cannot have both --remove-offset-and-limit and --limit")
151
181
  return False
152
182
 
153
- # If `args.accept` is `application/sparql-results+json`, we need `jq`.
154
- if args.accept == "application/sparql-results+json":
183
+ # If `args.accept` is `application/sparql-results+json` or
184
+ # `application/qlever-results+json` or `AUTO`, we need `jq`.
185
+ if (
186
+ args.accept == "application/sparql-results+json"
187
+ or args.accept == "application/qlever-results+json"
188
+ or args.accept == "AUTO"
189
+ ):
155
190
  try:
156
191
  subprocess.run(
157
192
  "jq --version",
@@ -174,8 +209,9 @@ class ExampleQueriesCommand(QleverCommand):
174
209
  return False
175
210
 
176
211
  # Clear cache only works for QLever.
177
- is_qlever = not args.sparql_endpoint or args.sparql_endpoint.startswith(
178
- "https://qlever"
212
+ is_qlever = (
213
+ not args.sparql_endpoint
214
+ or args.sparql_endpoint.startswith("https://qlever")
179
215
  )
180
216
  if args.clear_cache == "yes" and not is_qlever:
181
217
  log.warning("Clearing the cache only works for QLever")
@@ -193,7 +229,9 @@ class ExampleQueriesCommand(QleverCommand):
193
229
  if args.query_regex:
194
230
  get_queries_cmd += f" | grep -Pi {shlex.quote(args.query_regex)}"
195
231
  sparql_endpoint = (
196
- args.sparql_endpoint if args.sparql_endpoint else f"localhost:{args.port}"
232
+ args.sparql_endpoint
233
+ if args.sparql_endpoint
234
+ else f"localhost:{args.port}"
197
235
  )
198
236
  self.show(
199
237
  f"Obtain queries via: {get_queries_cmd}\n"
@@ -211,7 +249,9 @@ class ExampleQueriesCommand(QleverCommand):
211
249
 
212
250
  # Get the example queries.
213
251
  try:
214
- example_query_lines = run_command(get_queries_cmd, return_output=True)
252
+ example_query_lines = run_command(
253
+ get_queries_cmd, return_output=True
254
+ )
215
255
  if len(example_query_lines) == 0:
216
256
  log.error("No example queries matching the criteria found")
217
257
  return False
@@ -220,6 +260,12 @@ class ExampleQueriesCommand(QleverCommand):
220
260
  log.error(f"Failed to get example queries: {e}")
221
261
  return False
222
262
 
263
+ # We want the width of the query description to be an odd number (in
264
+ # case we have to truncate it, in which case we want to have a " ... "
265
+ # in the middle).
266
+ width_query_description_half = args.width_query_description // 2
267
+ width_query_description = 2 * width_query_description_half + 1
268
+
223
269
  # Launch the queries one after the other and for each print: the
224
270
  # description, the result size (number of rows), and the query
225
271
  # processing time (seconds).
@@ -227,13 +273,16 @@ class ExampleQueriesCommand(QleverCommand):
227
273
  result_sizes = []
228
274
  num_failed = 0
229
275
  for example_query_line in example_query_lines:
230
- # Parse description and query.
276
+ # Parse description and query, and determine query type.
231
277
  description, query = example_query_line.split("\t")
232
278
  if len(query) == 0:
233
279
  log.error("Could not parse description and query, line is:")
234
280
  log.info("")
235
281
  log.info(example_query_line)
236
282
  return False
283
+ query_type = self.sparql_query_type(query)
284
+ if args.add_query_type_to_description or args.accept == "AUTO":
285
+ description = f"{description} [{query_type}]"
237
286
 
238
287
  # Clear the cache.
239
288
  if args.clear_cache == "yes":
@@ -267,7 +316,9 @@ class ExampleQueriesCommand(QleverCommand):
267
316
  # Count query.
268
317
  if args.download_or_count == "count":
269
318
  # First find out if there is a FROM clause.
270
- regex_from_clause = re.compile(r"\s*FROM\s+<[^>]+>\s*", re.IGNORECASE)
319
+ regex_from_clause = re.compile(
320
+ r"\s*FROM\s+<[^>]+>\s*", re.IGNORECASE
321
+ )
271
322
  match_from_clause = re.search(regex_from_clause, query)
272
323
  from_clause = " "
273
324
  if match_from_clause:
@@ -296,24 +347,39 @@ class ExampleQueriesCommand(QleverCommand):
296
347
  query = re.sub(r"\s*\.\s*\}", " }", query)
297
348
  if args.show_query == "always":
298
349
  log.info("")
299
- self.pretty_print_query(query, args.show_prefixes)
350
+ log.info(
351
+ colored(
352
+ self.pretty_printed_query(query, args.show_prefixes),
353
+ "cyan",
354
+ )
355
+ )
356
+
357
+ # Accept header. For "AUTO", use `text/turtle` for CONSTRUCT and
358
+ # DESCRIBE queries, `application/sparql-results+json` for all others.
359
+ accept_header = args.accept
360
+ if accept_header == "AUTO":
361
+ if query_type == "CONSTRUCT" or query_type == "DESCRIBE":
362
+ accept_header = "text/turtle"
363
+ else:
364
+ accept_header = "application/sparql-results+json"
300
365
 
301
366
  # Launch query.
302
367
  try:
303
368
  curl_cmd = (
304
369
  f"curl -s {sparql_endpoint}"
305
370
  f' -w "HTTP code: %{{http_code}}\\n"'
306
- f' -H "Accept: {args.accept}"'
371
+ f' -H "Accept: {accept_header}"'
307
372
  f" --data-urlencode query={shlex.quote(query)}"
308
373
  )
309
374
  log.debug(curl_cmd)
310
375
  result_file = (
311
- f"qlever.example_queries.result." f"{abs(hash(curl_cmd))}.tmp"
376
+ f"qlever.example_queries.result."
377
+ f"{abs(hash(curl_cmd))}.tmp"
312
378
  )
313
379
  start_time = time.time()
314
380
  http_code = run_curl_command(
315
381
  sparql_endpoint,
316
- headers={"Accept": args.accept},
382
+ headers={"Accept": accept_header},
317
383
  params={"query": query},
318
384
  result_file=result_file,
319
385
  ).strip()
@@ -323,7 +389,9 @@ class ExampleQueriesCommand(QleverCommand):
323
389
  else:
324
390
  error_msg = {
325
391
  "short": f"HTTP code: {http_code}",
326
- "long": re.sub(r"\s+", " ", Path(result_file).read_text()),
392
+ "long": re.sub(
393
+ r"\s+", " ", Path(result_file).read_text()
394
+ ),
327
395
  }
328
396
  except Exception as e:
329
397
  if args.log_level == "DEBUG":
@@ -336,8 +404,12 @@ class ExampleQueriesCommand(QleverCommand):
336
404
  # Get result size (via the command line, in order to avoid loading
337
405
  # a potentially large JSON file into Python, which is slow).
338
406
  if error_msg is None:
339
- # CASE 0: Rhe result is empty despite a 200 HTTP code.
340
- if Path(result_file).stat().st_size == 0:
407
+ # CASE 0: The result is empty despite a 200 HTTP code (not a
408
+ # problem for CONSTRUCT and DESCRIBE queries).
409
+ if Path(result_file).stat().st_size == 0 and (
410
+ not query_type == "CONSTRUCT"
411
+ and not query_type == "DESCRIBE"
412
+ ):
341
413
  result_size = 0
342
414
  error_msg = {
343
415
  "short": "Empty result",
@@ -347,7 +419,7 @@ class ExampleQueriesCommand(QleverCommand):
347
419
 
348
420
  # CASE 1: Just counting the size of the result (TSV or JSON).
349
421
  elif args.download_or_count == "count":
350
- if args.accept == "text/tab-separated-values":
422
+ if accept_header == "text/tab-separated-values":
351
423
  result_size = run_command(
352
424
  f"sed 1d {result_file}", return_output=True
353
425
  )
@@ -370,21 +442,28 @@ class ExampleQueriesCommand(QleverCommand):
370
442
  # CASE 2: Downloading the full result (TSV, CSV, Turtle, JSON).
371
443
  else:
372
444
  if (
373
- args.accept == "text/tab-separated-values"
374
- or args.accept == "text/csv"
445
+ accept_header == "text/tab-separated-values"
446
+ or accept_header == "text/csv"
375
447
  ):
376
448
  result_size = run_command(
377
449
  f"sed 1d {result_file} | wc -l", return_output=True
378
450
  )
379
- elif args.accept == "text/turtle":
451
+ elif accept_header == "text/turtle":
380
452
  result_size = run_command(
381
- f"sed '1d;/^@prefix/d;/^\\s*$/d' " f"{result_file} | wc -l",
453
+ f"sed '1d;/^@prefix/d;/^\\s*$/d' "
454
+ f"{result_file} | wc -l",
455
+ return_output=True,
456
+ )
457
+ elif accept_header == "application/qlever-results+json":
458
+ result_size = run_command(
459
+ f'jq -r ".resultsize" {result_file}',
382
460
  return_output=True,
383
461
  )
384
462
  else:
385
463
  try:
386
464
  result_size = run_command(
387
- f'jq -r ".results.bindings | length"' f" {result_file}",
465
+ f'jq -r ".results.bindings | length"'
466
+ f" {result_file}",
388
467
  return_output=True,
389
468
  )
390
469
  except Exception as e:
@@ -398,13 +477,16 @@ class ExampleQueriesCommand(QleverCommand):
398
477
  Path(result_file).unlink(missing_ok=True)
399
478
 
400
479
  # Print description, time, result in tabular form.
401
- if len(description) > args.width_query_description:
402
- description = description[: args.width_query_description - 3]
403
- description += "..."
480
+ if len(description) > width_query_description:
481
+ description = (
482
+ description[: width_query_description_half - 2]
483
+ + " ... "
484
+ + description[-width_query_description_half + 2 :]
485
+ )
404
486
  if error_msg is None:
405
487
  result_size = int(result_size)
406
488
  log.info(
407
- f"{description:<{args.width_query_description}} "
489
+ f"{description:<{width_query_description}} "
408
490
  f"{time_seconds:6.2f} s "
409
491
  f"{result_size:>{args.width_result_size},}"
410
492
  )
@@ -419,18 +501,28 @@ class ExampleQueriesCommand(QleverCommand):
419
501
  and args.show_query != "on-error"
420
502
  ):
421
503
  error_msg["long"] = (
422
- error_msg["long"][: args.width_error_message - 3] + "..."
504
+ error_msg["long"][: args.width_error_message - 3]
505
+ + "..."
423
506
  )
424
- seperator_short_long = "\n" if args.show_query == "on-error" else " "
507
+ seperator_short_long = (
508
+ "\n" if args.show_query == "on-error" else " "
509
+ )
425
510
  log.info(
426
- f"{description:<{args.width_query_description}} "
511
+ f"{description:<{width_query_description}} "
427
512
  f"{colored('FAILED ', 'red')}"
428
513
  f"{colored(error_msg['short'], 'red'):>{args.width_result_size}}"
429
514
  f"{seperator_short_long}"
430
515
  f"{colored(error_msg['long'], 'red')}"
431
516
  )
432
517
  if args.show_query == "on-error":
433
- self.pretty_print_query(query, args.show_prefixes)
518
+ log.info(
519
+ colored(
520
+ self.pretty_printed_query(
521
+ query, args.show_prefixes
522
+ ),
523
+ "cyan",
524
+ )
525
+ )
434
526
  log.info("")
435
527
 
436
528
  # Check that each query has a time and a result size, or it failed.
@@ -450,19 +542,19 @@ class ExampleQueriesCommand(QleverCommand):
450
542
  description = f"TOTAL for {n} {query_or_queries}"
451
543
  log.info("")
452
544
  log.info(
453
- f"{description:<{args.width_query_description}} "
545
+ f"{description:<{width_query_description}} "
454
546
  f"{total_query_time:6.2f} s "
455
547
  f"{total_result_size:>14,}"
456
548
  )
457
549
  description = f"AVERAGE for {n} {query_or_queries}"
458
550
  log.info(
459
- f"{description:<{args.width_query_description}} "
551
+ f"{description:<{width_query_description}} "
460
552
  f"{average_query_time:6.2f} s "
461
553
  f"{average_result_size:>14,}"
462
554
  )
463
555
  description = f"MEDIAN for {n} {query_or_queries}"
464
556
  log.info(
465
- f"{description:<{args.width_query_description}} "
557
+ f"{description:<{width_query_description}} "
466
558
  f"{median_query_time:6.2f} s "
467
559
  f"{median_result_size:>14,}"
468
560
  )
@@ -476,7 +568,7 @@ class ExampleQueriesCommand(QleverCommand):
476
568
  num_failed_string += " [all]"
477
569
  log.info(
478
570
  colored(
479
- f"{description:<{args.width_query_description}} "
571
+ f"{description:<{width_query_description}} "
480
572
  f"{num_failed:>24}",
481
573
  "red",
482
574
  )
@@ -0,0 +1,113 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ from qlever.command import QleverCommand
6
+ from qlever.log import log
7
+
8
+
9
class ExtractQueriesCommand(QleverCommand):
    """
    Class for executing the `extract-queries` command.

    The command scans a server log for SPARQL queries and writes each of
    them as one TSV line (description, query) to an output file.
    """

    def __init__(self):
        pass

    def description(self) -> str:
        return "Extract all SPARQL queries from the server log"

    def should_have_qleverfile(self) -> bool:
        return True

    def relevant_qleverfile_arguments(self) -> dict[str, list[str]]:
        return {"data": ["name"]}

    def additional_arguments(self, subparser) -> None:
        subparser.add_argument(
            "--description-base",
            type=str,
            default="Log extract",
            help="Base name for the query descriptions"
            " (default: `Log extract`)",
        )
        subparser.add_argument(
            "--log-file",
            type=str,
            help="Name of the log file to extract queries from"
            " (default: `<name>.server-log.txt`)",
        )
        subparser.add_argument(
            "--output-file",
            type=str,
            default="log-queries.txt",
            help="Output file for the extracted queries "
            "(default: `log-queries.txt`)",
        )

    def execute(self, args) -> bool:
        """
        Extract the queries from the log file (`args.log_file` or, if that
        is not given, `<args.name>.server-log.txt`) and write them to
        `args.output_file`. With `args.show`, only show what would be done.
        Returns `True` on completion.
        """
        # Show what the command does.
        if args.log_file is not None:
            log_file_name = args.log_file
        else:
            log_file_name = f"{args.name}.server-log.txt"
        self.show(
            f"Extract SPARQL queries from `{log_file_name}`"
            f" and write them to `{args.output_file}`",
            only_show=args.show,
        )
        if args.show:
            return True

        # Regex for log entries of the form
        # 2025-01-14 04:47:44.950 - INFO:
        log_line_regex = re.compile(
            r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}) - [A-Z]+:"
        )
        # An "Alive check" message contains a tag, which we use as the base
        # name of the query description.
        alive_check_regex = re.compile(r"Alive check with message \"(.*)\"")

        query = None
        query_index = 0
        description_base = args.description_base
        description_base_count = {}
        tsv_line_short_width = 150

        # Read the log file line by line. The `with` makes sure that both
        # files are closed, even if an exception is raised along the way.
        with open(log_file_name, "r") as log_file, open(
            args.output_file, "w"
        ) as queries_file:
            for line in log_file:
                # Update the description base on an "Alive check" message.
                match = alive_check_regex.search(line)
                if match:
                    description_base = match.group(1)
                    continue

                # A new query in the log. Queries are numbered separately
                # for each description base, starting from 1.
                if "Processing the following SPARQL query" in line:
                    query = []
                    query_index = (
                        description_base_count.get(description_base, 0) + 1
                    )
                    description_base_count[description_base] = query_index
                    continue

                # The remaining lines matter only if we have started a query.
                if query is None:
                    continue

                # Extend the query until we meet the next log line. Remove
                # comments (whole comment lines and trailing ` #...`).
                if not log_line_regex.match(line):
                    if not re.match(r"^\s*#", line):
                        line = re.sub(r" #.*", "", line)
                        query.append(line)
                    continue

                # We met the next log line: push the query (with all
                # whitespace collapsed, so that it fits on one TSV line).
                # NOTE: A query that is still being collected when the log
                # file ends is not written.
                query = re.sub(r"\s+", " ", "\n".join(query)).strip()
                description = f"{description_base}, Query #{query_index}"
                tsv_line = f"{description}\t{query}"
                tsv_line_short = (
                    tsv_line
                    if len(tsv_line) < tsv_line_short_width
                    else tsv_line[:tsv_line_short_width] + "..."
                )
                log.info(tsv_line_short)
                print(tsv_line, file=queries_file)
                query = None

        return True
qlever/commands/index.py CHANGED
@@ -2,13 +2,17 @@ from __future__ import annotations
2
2
 
3
3
  import glob
4
4
  import json
5
- import shlex
6
5
  import re
6
+ import shlex
7
7
 
8
8
  from qlever.command import QleverCommand
9
9
  from qlever.containerize import Containerize
10
10
  from qlever.log import log
11
- from qlever.util import get_existing_index_files, get_total_file_size, run_command
11
+ from qlever.util import (
12
+ get_existing_index_files,
13
+ get_total_file_size,
14
+ run_command,
15
+ )
12
16
 
13
17
 
14
18
  class IndexCommand(QleverCommand):
@@ -36,9 +40,11 @@ class IndexCommand(QleverCommand):
36
40
  "settings_json",
37
41
  "index_binary",
38
42
  "only_pso_and_pos_permutations",
43
+ "ulimit",
39
44
  "use_patterns",
40
45
  "text_index",
41
46
  "stxxl_memory",
47
+ "parser_buffer_size",
42
48
  ],
43
49
  "runtime": ["system", "image", "index_container"],
44
50
  }
@@ -48,7 +54,7 @@ class IndexCommand(QleverCommand):
48
54
  "--overwrite-existing",
49
55
  action="store_true",
50
56
  default=False,
51
- help="Overwrite an existing index, think twice before using.",
57
+ help="Overwrite an existing index, think twice before using this",
52
58
  )
53
59
 
54
60
  # Exception for invalid JSON.
@@ -76,7 +82,8 @@ class IndexCommand(QleverCommand):
76
82
  # Check that it is an array of length at least one.
77
83
  if not isinstance(input_specs, list):
78
84
  raise self.InvalidInputJson(
79
- "`MULTI_INPUT_JSON` must be a JSON array", args.multi_input_json
85
+ "`MULTI_INPUT_JSON` must be a JSON array",
86
+ args.multi_input_json,
80
87
  )
81
88
  if len(input_specs) == 0:
82
89
  raise self.InvalidInputJson(
@@ -90,13 +97,15 @@ class IndexCommand(QleverCommand):
90
97
  # Check that `input_spec` is a dictionary.
91
98
  if not isinstance(input_spec, dict):
92
99
  raise self.InvalidInputJson(
93
- f"Element {i} in `MULTI_INPUT_JSON` must be a JSON " "object",
100
+ f"Element {i} in `MULTI_INPUT_JSON` must be a JSON "
101
+ "object",
94
102
  input_spec,
95
103
  )
96
104
  # For each `input_spec`, we must have a command.
97
105
  if "cmd" not in input_spec:
98
106
  raise self.InvalidInputJson(
99
- f"Element {i} in `MULTI_INPUT_JSON` must contain a " "key `cmd`",
107
+ f"Element {i} in `MULTI_INPUT_JSON` must contain a "
108
+ "key `cmd`",
100
109
  input_spec,
101
110
  )
102
111
  # If the command contains a `{}` placeholder, we need a `for-each`
@@ -147,7 +156,7 @@ class IndexCommand(QleverCommand):
147
156
  raise self.InvalidInputJson(
148
157
  f"Element {i} in `MULTI_INPUT_JSON` must only contain "
149
158
  "the keys `format`, `graph`, and `parallel`. Contains "
150
- "extra keys {extra_keys}.",
159
+ f"extra keys {extra_keys}.",
151
160
  input_spec,
152
161
  )
153
162
  # Add the command-line options for this input stream. We use
@@ -204,20 +213,31 @@ class IndexCommand(QleverCommand):
204
213
  index_cmd += " --only-pso-and-pos-permutations --no-patterns"
205
214
  if not args.use_patterns:
206
215
  index_cmd += " --no-patterns"
207
- if args.text_index in ["from_text_records", "from_text_records_and_literals"]:
216
+ if args.text_index in [
217
+ "from_text_records",
218
+ "from_text_records_and_literals",
219
+ ]:
208
220
  index_cmd += (
209
- f" -w {args.name}.wordsfile.tsv" f" -d {args.name}.docsfile.tsv"
221
+ f" -w {args.name}.wordsfile.tsv"
222
+ f" -d {args.name}.docsfile.tsv"
210
223
  )
211
- if args.text_index in ["from_literals", "from_text_records_and_literals"]:
224
+ if args.text_index in [
225
+ "from_literals",
226
+ "from_text_records_and_literals",
227
+ ]:
212
228
  index_cmd += " --text-words-from-literals"
213
229
  if args.stxxl_memory:
214
230
  index_cmd += f" --stxxl-memory {args.stxxl_memory}"
231
+ if args.parser_buffer_size:
232
+ index_cmd += f" --parser-buffer-size {args.parser_buffer_size}"
215
233
  index_cmd += f" | tee {args.name}.index-log.txt"
216
234
 
217
235
  # Set ulimit (such that a large number of open files is allowed): use
218
236
  # `args.ulimit` if given, else 1048576 if the total size is > 10 GB.
219
237
  total_file_size = get_total_file_size(shlex.split(args.input_files))
220
- if total_file_size > 1e10:
238
+ if args.ulimit is not None:
239
+ index_cmd = f"ulimit -Sn {args.ulimit}; {index_cmd}"
240
+ elif total_file_size > 1e10:
221
241
  index_cmd = f"ulimit -Sn 1048576; {index_cmd}"
222
242
 
223
243
  # Run the command in a container (if so desired).
@@ -234,7 +254,8 @@ class IndexCommand(QleverCommand):
234
254
 
235
255
  # Command for writing the settings JSON to a file.
236
256
  settings_json_cmd = (
237
- f"echo {shlex.quote(args.settings_json)} " f"> {args.name}.settings.json"
257
+ f"echo {shlex.quote(args.settings_json)} "
258
+ f"> {args.name}.settings.json"
238
259
  )
239
260
 
240
261
  # Show the command line.
@@ -279,9 +300,15 @@ class IndexCommand(QleverCommand):
279
300
  return False
280
301
 
281
302
  # Remove already existing container.
282
- if args.system in Containerize.supported_systems() and args.overwrite_existing:
303
+ if (
304
+ args.system in Containerize.supported_systems()
305
+ and args.overwrite_existing
306
+ ):
283
307
  if Containerize.is_running(args.system, args.index_container):
284
- log.info("Another index process is running, trying to stop " "it ...")
308
+ log.info(
309
+ "Another index process is running, trying to stop "
310
+ "it ..."
311
+ )
285
312
  log.info("")
286
313
  try:
287
314
  run_command(f"{args.system} rm -f {args.index_container}")