qlever 0.2.5__py3-none-any.whl → 0.5.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. qlever/Qleverfiles/Qleverfile.dblp +36 -0
  2. qlever/Qleverfiles/Qleverfile.dblp-plus +33 -0
  3. qlever/Qleverfiles/Qleverfile.dbpedia +30 -0
  4. qlever/Qleverfiles/Qleverfile.default +51 -0
  5. qlever/Qleverfiles/Qleverfile.dnb +40 -0
  6. qlever/Qleverfiles/Qleverfile.fbeasy +29 -0
  7. qlever/Qleverfiles/Qleverfile.freebase +28 -0
  8. qlever/Qleverfiles/Qleverfile.imdb +36 -0
  9. qlever/Qleverfiles/Qleverfile.ohm-planet +41 -0
  10. qlever/Qleverfiles/Qleverfile.olympics +31 -0
  11. qlever/Qleverfiles/Qleverfile.orkg +30 -0
  12. qlever/Qleverfiles/Qleverfile.osm-country +39 -0
  13. qlever/Qleverfiles/Qleverfile.osm-planet +39 -0
  14. qlever/Qleverfiles/Qleverfile.osm-planet-from-pbf +42 -0
  15. qlever/Qleverfiles/Qleverfile.pubchem +131 -0
  16. qlever/Qleverfiles/Qleverfile.scientists +29 -0
  17. qlever/Qleverfiles/Qleverfile.uniprot +74 -0
  18. qlever/Qleverfiles/Qleverfile.vvz +31 -0
  19. qlever/Qleverfiles/Qleverfile.wikidata +42 -0
  20. qlever/Qleverfiles/Qleverfile.wikipathways +40 -0
  21. qlever/Qleverfiles/Qleverfile.yago-4 +33 -0
  22. qlever/__init__.py +44 -1380
  23. qlever/command.py +87 -0
  24. qlever/commands/__init__.py +0 -0
  25. qlever/commands/add_text_index.py +115 -0
  26. qlever/commands/benchmark_queries.py +1019 -0
  27. qlever/commands/cache_stats.py +125 -0
  28. qlever/commands/clear_cache.py +88 -0
  29. qlever/commands/extract_queries.py +120 -0
  30. qlever/commands/get_data.py +48 -0
  31. qlever/commands/index.py +333 -0
  32. qlever/commands/index_stats.py +306 -0
  33. qlever/commands/log.py +66 -0
  34. qlever/commands/materialized_view.py +110 -0
  35. qlever/commands/query.py +142 -0
  36. qlever/commands/rebuild_index.py +176 -0
  37. qlever/commands/reset_updates.py +59 -0
  38. qlever/commands/settings.py +115 -0
  39. qlever/commands/setup_config.py +97 -0
  40. qlever/commands/start.py +336 -0
  41. qlever/commands/status.py +50 -0
  42. qlever/commands/stop.py +90 -0
  43. qlever/commands/system_info.py +130 -0
  44. qlever/commands/ui.py +271 -0
  45. qlever/commands/update.py +90 -0
  46. qlever/commands/update_wikidata.py +1204 -0
  47. qlever/commands/warmup.py +41 -0
  48. qlever/config.py +223 -0
  49. qlever/containerize.py +167 -0
  50. qlever/log.py +55 -0
  51. qlever/qlever_main.py +79 -0
  52. qlever/qleverfile.py +530 -0
  53. qlever/util.py +330 -0
  54. qlever-0.5.41.dist-info/METADATA +127 -0
  55. qlever-0.5.41.dist-info/RECORD +59 -0
  56. {qlever-0.2.5.dist-info → qlever-0.5.41.dist-info}/WHEEL +1 -1
  57. qlever-0.5.41.dist-info/entry_points.txt +2 -0
  58. qlever-0.5.41.dist-info/top_level.txt +1 -0
  59. build/lib/qlever/__init__.py +0 -1383
  60. build/lib/qlever/__main__.py +0 -4
  61. qlever/__main__.py +0 -4
  62. qlever-0.2.5.dist-info/METADATA +0 -277
  63. qlever-0.2.5.dist-info/RECORD +0 -12
  64. qlever-0.2.5.dist-info/entry_points.txt +0 -2
  65. qlever-0.2.5.dist-info/top_level.txt +0 -4
  66. src/qlever/__init__.py +0 -1383
  67. src/qlever/__main__.py +0 -4
  68. {qlever-0.2.5.dist-info → qlever-0.5.41.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,333 @@
1
+ from __future__ import annotations
2
+
3
+ import glob
4
+ import json
5
+ import re
6
+ import shlex
7
+
8
+ from qlever.command import QleverCommand
9
+ from qlever.containerize import Containerize
10
+ from qlever.log import log
11
+ from qlever.util import (
12
+ binary_exists,
13
+ get_existing_index_files,
14
+ get_total_file_size,
15
+ run_command,
16
+ )
17
+
18
+
19
class IndexCommand(QleverCommand):
    """
    Class for executing the `index` command, which builds a QLever index
    from one or more RDF input streams as configured in the Qleverfile.
    """

    def __init__(self):
        pass

    def description(self) -> str:
        """One-line description shown in the command overview."""
        return "Build the index for a given RDF dataset"

    def should_have_qleverfile(self) -> bool:
        """A Qleverfile is required, since it configures the index build."""
        return True

    def relevant_qleverfile_arguments(self) -> dict[str, list[str]]:
        """Qleverfile options (by section) that this command reads."""
        return {
            "data": ["name", "format"],
            "index": [
                "input_files",
                "cat_input_files",
                "encode_as_id",
                "multi_input_json",
                "parallel_parsing",
                "settings_json",
                "vocabulary_type",
                "index_binary",
                "only_pso_and_pos_permutations",
                "ulimit",
                "use_patterns",
                "add_has_word_triples",
                "text_index",
                "stxxl_memory",
                "parser_buffer_size",
            ],
            "runtime": ["system", "image", "index_container"],
        }

    def additional_arguments(self, subparser) -> None:
        """Add command-line arguments that are specific to `qlever index`."""
        subparser.add_argument(
            "--overwrite-existing",
            action="store_true",
            default=False,
            help="Overwrite an existing index, think twice before using this",
        )

    # Exception for invalid JSON in `MULTI_INPUT_JSON`. Carries a message
    # plus the offending value so `execute` can show both to the user.
    class InvalidInputJson(Exception):
        def __init__(self, error_message, additional_info):
            self.error_message = error_message
            self.additional_info = additional_info
            super().__init__()

    # Helper function to get command line options from JSON.
    def get_input_options_for_json(self, args) -> str:
        """
        Translate `args.multi_input_json` (a JSON or JSONL description of
        one or more input streams) into the corresponding command-line
        options for the index binary.

        Raises `InvalidInputJson` if the JSON is malformed or any input
        specification is invalid.
        """
        # Parse the JSON. If `args.multi_input_json` looks like JSONL, turn
        # it into a JSON array.
        try:
            jsonl_line_regex = re.compile(r"^\s*\{.*\}\s*$")
            jsonl_lines = args.multi_input_json.split("\n")
            if all(re.match(jsonl_line_regex, line) for line in jsonl_lines):
                args.multi_input_json = "[" + ", ".join(jsonl_lines) + "]"
            input_specs = json.loads(args.multi_input_json)
        except Exception as e:
            raise self.InvalidInputJson(
                f"Failed to parse `MULTI_INPUT_JSON` as either JSON or JSONL ({e})",
                args.multi_input_json,
            ) from e
        # Check that it is an array of length at least one.
        if not isinstance(input_specs, list):
            raise self.InvalidInputJson(
                "`MULTI_INPUT_JSON` must be a JSON array",
                args.multi_input_json,
            )
        if len(input_specs) == 0:
            raise self.InvalidInputJson(
                "`MULTI_INPUT_JSON` must contain at least one element",
                args.multi_input_json,
            )
        # For each of the maps, construct the corresponding command-line
        # options to the index binary.
        input_options = []
        for i, input_spec in enumerate(input_specs):
            # Check that `input_spec` is a dictionary.
            if not isinstance(input_spec, dict):
                raise self.InvalidInputJson(
                    f"Element {i} in `MULTI_INPUT_JSON` must be a JSON object",
                    input_spec,
                )
            # For each `input_spec`, we must have a command.
            if "cmd" not in input_spec:
                raise self.InvalidInputJson(
                    f"Element {i} in `MULTI_INPUT_JSON` must contain a "
                    "key `cmd`",
                    input_spec,
                )
            # If the command contains a `{}` placeholder, we need a `for-each`
            # key specifying the pattern for the placeholder values, and vice
            # versa.
            if "{}" in input_spec["cmd"] and "for-each" not in input_spec:
                raise self.InvalidInputJson(
                    f"Element {i} in `MULTI_INPUT_JSON` must contain a "
                    "key `for-each` if the command contains a placeholder "
                    "`{}`",
                    input_spec,
                )
            if "for-each" in input_spec and "{}" not in input_spec["cmd"]:
                # NOTE: these are plain (non-f) string literals, so the
                # braces must NOT be doubled; `{{}}` would be shown verbatim.
                raise self.InvalidInputJson(
                    f"Element {i} in `MULTI_INPUT_JSON` contains a "
                    "key `for-each`, but the command does not contain a "
                    "placeholder `{}`",
                    input_spec,
                )
            # Get all commands. This is just the value of the `cmd` key if no
            # `for-each` key is specified. Otherwise, we have a command for
            # each file matching the pattern.
            if "for-each" not in input_spec:
                input_cmds = [input_spec["cmd"]]
            else:
                try:
                    files = sorted(glob.glob(input_spec["for-each"]))
                except Exception as e:
                    raise self.InvalidInputJson(
                        f"Element {i} in `MULTI_INPUT_JSON` contains an "
                        f"invalid `for-each` pattern: {e}",
                        input_spec,
                    ) from e
                input_cmds = [input_spec["cmd"].format(file) for file in files]
            # The `format`, `graph`, and `parallel` keys are optional.
            input_format = input_spec.get("format", args.format)
            input_graph = input_spec.get("graph", "-")
            input_parallel = input_spec.get("parallel", "false")
            # There must not be any other keys.
            extra_keys = input_spec.keys() - {
                "cmd",
                "format",
                "graph",
                "parallel",
                "for-each",
            }
            if extra_keys:
                raise self.InvalidInputJson(
                    f"Element {i} in `MULTI_INPUT_JSON` must only contain "
                    "the keys `cmd`, `format`, `graph`, `parallel`, and "
                    f"`for-each`. Contains extra keys {extra_keys}.",
                    input_spec,
                )
            # Add the command-line options for this input stream. We use
            # process substitution `<(...)` as a convenient way to handle an
            # input stream just like a file. This is not POSIX compliant, but
            # supported by various shells, including bash and zsh. If
            # `for-each` is specified, add one command for each matching file.
            for input_cmd in input_cmds:
                input_option = f"-f <({input_cmd}) -g {input_graph}"
                input_option += f" -F {input_format}"
                if input_parallel == "true":
                    input_option += " -p true"
                else:
                    input_option += " -p false"
                input_options.append(input_option)
        # Return the concatenated command-line options.
        return " ".join(input_options)

    def execute(self, args) -> bool:
        """
        Build the index: construct the index command line, run sanity
        checks (binary, input files, existing index), write the settings
        file, and finally run the index build. Returns `True` on success.
        """
        # The mandatory part of the command line (specifying the input, the
        # basename of the index, and the settings file). There are two ways
        # to specify the input: via a single stream or via multiple streams.
        if args.cat_input_files and not args.multi_input_json:
            index_cmd = (
                f"{args.cat_input_files} | {args.index_binary}"
                f" -i {args.name} -s {args.name}.settings.json"
                f" --vocabulary-type {args.vocabulary_type}"
                f" -F {args.format} -f -"
            )
            if args.parallel_parsing:
                index_cmd += f" -p {args.parallel_parsing}"
        elif args.multi_input_json and not args.cat_input_files:
            try:
                input_options = self.get_input_options_for_json(args)
            except self.InvalidInputJson as e:
                log.error(e.error_message)
                log.info("")
                log.info(e.additional_info)
                return False
            index_cmd = (
                f"{args.index_binary}"
                f" -i {args.name} -s {args.name}.settings.json"
                f" --vocabulary-type {args.vocabulary_type}"
                f" {input_options}"
            )
        else:
            log.error(
                "Specify exactly one of `CAT_INPUT_FILES` (for a "
                "single input stream) or `MULTI_INPUT_JSON` (for "
                "multiple input streams)"
            )
            log.info("")
            log.info("See `qlever index --help` for more information")
            return False

        # Add remaining options.
        if args.encode_as_id:
            index_cmd += f" --encode-as-id {args.encode_as_id}"
        if args.only_pso_and_pos_permutations:
            index_cmd += " --only-pso-and-pos-permutations"
        if args.use_patterns == "no":
            index_cmd += " --no-patterns"
        if args.add_has_word_triples:
            index_cmd += " --add-has-word-triples"
        if args.text_index in [
            "from_text_records",
            "from_text_records_and_literals",
        ]:
            index_cmd += (
                f" -w {args.name}.wordsfile.tsv -d {args.name}.docsfile.tsv"
            )
        if args.text_index in [
            "from_literals",
            "from_text_records_and_literals",
        ]:
            index_cmd += " --text-words-from-literals"
        if args.stxxl_memory:
            index_cmd += f" --stxxl-memory {args.stxxl_memory}"
        if args.parser_buffer_size:
            index_cmd += f" --parser-buffer-size {args.parser_buffer_size}"
        index_cmd += f" 2>&1 | tee {args.name}.index-log.txt"

        # If the total file size is larger than 10 GB, set ulimit (such that a
        # large number of open files is allowed).
        total_file_size = get_total_file_size(shlex.split(args.input_files))
        if args.ulimit is not None:
            index_cmd = f"ulimit -Sn {args.ulimit} && {index_cmd}"
        elif total_file_size > 1e10:
            index_cmd = f"ulimit -Sn 500000 && {index_cmd}"

        # Run the command in a container (if so desired).
        if args.system in Containerize.supported_systems():
            index_cmd = Containerize().containerize_command(
                index_cmd,
                args.system,
                "run --rm",
                args.image,
                args.index_container,
                volumes=[("$(pwd)", "/index")],
                working_directory="/index",
            )

        # Command for writing the settings JSON to a file. `shlex.quote`
        # protects the JSON from shell interpretation.
        settings_json_cmd = (
            f"echo {shlex.quote(args.settings_json)} "
            f"> {args.name}.settings.json"
        )

        # Show the command line.
        self.show(f"{settings_json_cmd}\n{index_cmd}", only_show=args.show)
        if args.show:
            return True

        # When running natively, check if the binary exists and works.
        if args.system == "native":
            if not binary_exists(args.index_binary, "index-binary"):
                return False

        # Check if all of the input files exist.
        for pattern in shlex.split(args.input_files):
            if len(glob.glob(pattern)) == 0:
                log.error(f'No file matching "{pattern}" found')
                log.info("")
                log.info(
                    "Did you call `qlever get-data`? If you did, check "
                    "GET_DATA_CMD and INPUT_FILES in the QLeverfile"
                )
                return False

        # Check if index files (name.index.*) already exist.
        existing_index_files = get_existing_index_files(args.name)
        if len(existing_index_files) > 0 and not args.overwrite_existing:
            log.error(
                f'Index files for basename "{args.name}" found, if you '
                f"want to overwrite them, use --overwrite-existing"
            )
            log.info("")
            log.info(f"Index files found: {existing_index_files}")
            return False

        # Remove already existing container.
        if (
            args.system in Containerize.supported_systems()
            and args.overwrite_existing
        ):
            if Containerize.is_running(args.system, args.index_container):
                log.info(
                    "Another index process is running, trying to stop it ..."
                )
                log.info("")
                try:
                    run_command(f"{args.system} rm -f {args.index_container}")
                except Exception as e:
                    log.error(f"Removing existing container failed: {e}")
                    return False

        # Write settings.json file.
        try:
            run_command(settings_json_cmd)
        except Exception as e:
            log.error(f"Writing the settings.json file failed: {e}")
            return False

        # Run the index command.
        try:
            run_command(index_cmd, show_output=True)
        except Exception as e:
            log.error(f"Building the index failed: {e}")
            return False

        return True
@@ -0,0 +1,306 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+
7
+ from qlever.command import QleverCommand
8
+ from qlever.log import log
9
+ from qlever.util import get_total_file_size
10
+
11
+
12
class IndexStatsCommand(QleverCommand):
    """
    Class for executing the `index-stats` command.

    Parses the index-build log file(s) for timestamps of key phases (time
    breakdown) and sums the sizes of the index files on disk (space
    breakdown).
    """

    def __init__(self):
        pass

    def description(self) -> str:
        # One-line description shown in the command overview.
        return "Breakdown of the time and space used for the index build"

    def should_have_qleverfile(self) -> bool:
        # Works from the log and index files alone; no Qleverfile needed.
        return False

    def relevant_qleverfile_arguments(self) -> dict[str, list[str]]:
        # Only the dataset basename is needed to locate log and index files.
        return {"data": ["name"]}

    def additional_arguments(self, subparser) -> None:
        # Command-line arguments specific to `qlever index-stats`.
        subparser.add_argument(
            "--only-time",
            action="store_true",
            default=False,
            help="Show only the time used",
        )
        subparser.add_argument(
            "--only-space",
            action="store_true",
            default=False,
            help="Show only the space used",
        )
        subparser.add_argument(
            "--ignore-text-index",
            action="store_true",
            default=False,
            help="Ignore the text index",
        )
        subparser.add_argument(
            "--time-unit",
            choices=["s", "min", "h", "auto"],
            default="auto",
            help="The time unit",
        )
        subparser.add_argument(
            "--size-unit",
            choices=["B", "MB", "GB", "TB", "auto"],
            default="auto",
            help="The size unit",
        )

    def execute_time(self, args, log_file_name) -> bool:
        """
        Part of `execute` that shows the time used.

        Reads `log_file_name` (and, if present, the separate text-index
        log), extracts timestamps for the key build phases, and logs a
        per-phase duration breakdown. Returns `False` on any problem.
        """

        # Read the content of `log_file_name` into a list of lines.
        try:
            with open(log_file_name, "r") as log_file:
                lines = log_file.readlines()
        except Exception as e:
            log.error(f"Problem reading index log file {log_file_name}: {e}")
            return False
        # If there is a separate `add-text-index-log.txt` file, append those
        # lines.
        try:
            text_log_file_name = f"{args.name}.text-index-log.txt"
            if Path(text_log_file_name).exists():
                with open(text_log_file_name, "r") as text_log_file:
                    lines.extend(text_log_file.readlines())
        except Exception as e:
            log.error(
                f"Problem reading text index log file " f"{text_log_file_name}: {e}"
            )
            return False

        # Helper function that finds the next line matching the given `regex`,
        # starting from `current_line`, and extracts the time. Returns a tuple
        # of the time and the regex match object.
        #
        # If `update_current_line` is `False`, then `current_line` will not be
        # updated by this call.
        #
        # Otherwise, and this is the default behavior, `current_line` will be
        # updated to the line after the first match, or one beyond the last
        # line if no match is found.
        current_line = 0

        def find_next_line(regex, update_current_line=True):
            nonlocal lines
            nonlocal current_line
            # Remember the position so we can restore it on a failed search
            # when `update_current_line` is `False`.
            current_line_backup = current_line
            # Find starting from `current_line`.
            while current_line < len(lines):
                line = lines[current_line]
                current_line += 1
                timestamp_regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"
                timestamp_format = "%Y-%m-%d %H:%M:%S"
                regex_match = re.search(regex, line)
                if regex_match:
                    try:
                        # The timestamp is expected at the very start of the
                        # line, hence `re.match` (anchored) here.
                        return datetime.strptime(
                            re.match(timestamp_regex, line).group(), timestamp_format
                        ), regex_match
                    except Exception as e:
                        log.error(
                            f"Could not parse timestamp of form "
                            f'"{timestamp_regex}" from line '
                            f' "{line.rstrip()}" ({e})'
                        )
            # If we get here, we did not find a matching line.
            if not update_current_line:
                current_line = current_line_backup
            return None, None

        # Find the lines matching the key_lines_regex and extract the time
        # information from them.
        overall_begin, _ = find_next_line(r"INFO:\s*Processing")
        merge_begin, _ = find_next_line(r"INFO:\s*Merging partial vocab")
        convert_begin, _ = find_next_line(r"INFO:\s*Converting triples")
        perm_begin_and_info = []
        while True:
            # Find the next line that starts a permutation.
            #
            # NOTE: Should work for the old and new format of the index log
            # file (old format: "Creating a pair" + names of permutations in
            # line "Writing meta data for ..."; new format: name of
            # permutations already in line "Creating permutations ...").
            perm_begin, _ = find_next_line(
                r"INFO:\s*Creating a pair", update_current_line=False
            )
            if perm_begin is None:
                perm_begin, perm_info = find_next_line(
                    r"INFO:\s*Creating permutations ([A-Z]+ and [A-Z]+)",
                    update_current_line=False,
                )
            else:
                _, perm_info = find_next_line(
                    r"INFO:\s*Writing meta data for ([A-Z]+ and [A-Z]+)",
                    update_current_line=False,
                )
            # No further permutation info found -> all permutations seen.
            if perm_info is None:
                break
            perm_begin_and_info.append((perm_begin, perm_info))
        # The conversion phase ends where the first permutation begins.
        convert_end = (
            perm_begin_and_info[0][0] if len(perm_begin_and_info) > 0 else None
        )
        normal_end, _ = find_next_line(r"INFO:\s*Index build completed")
        text_begin, _ = find_next_line(
            r"INFO:\s*Adding text index", update_current_line=False
        )
        text_end, _ = find_next_line(
            r"INFO:\s*Text index build comp", update_current_line=False
        )
        if args.ignore_text_index:
            text_begin = text_end = None

        # Check whether at least the first phase is done.
        if overall_begin is None:
            log.error("Missing line that index build has started")
            return False
        if overall_begin and not merge_begin:
            log.error(
                "According to the log file, the index build "
                "has started, but is still in its first "
                "phase (parsing the input)"
            )
            return False

        # Helper function that shows the duration for a phase (if the start and
        # end timestamps are available).
        def show_duration(heading, start_end_pairs):
            nonlocal time_unit
            num_start_end_pairs = 0
            diff_seconds = 0
            for start, end in start_end_pairs:
                # Only count pairs where both timestamps were found.
                if start and end:
                    diff_seconds += (end - start).total_seconds()
                    num_start_end_pairs += 1
            if num_start_end_pairs > 0:
                if time_unit == "h":
                    diff = diff_seconds / 3600
                elif time_unit == "min":
                    diff = diff_seconds / 60
                else:
                    diff = diff_seconds
                log.info(f"{heading:<21} : {diff:>6.1f} {time_unit}")

        # Get the times of the various phases (hours or minutes, depending on
        # how long the first phase took).
        time_unit = args.time_unit
        if time_unit == "auto":
            time_unit = "h"
            if merge_begin and overall_begin:
                parse_duration = (merge_begin - overall_begin).total_seconds()
                if parse_duration < 200:
                    time_unit = "s"
                elif parse_duration < 3600:
                    time_unit = "min"
        show_duration("Parse input", [(overall_begin, merge_begin)])
        show_duration("Build vocabularies", [(merge_begin, convert_begin)])
        show_duration("Convert to global IDs", [(convert_begin, convert_end)])
        for i in range(len(perm_begin_and_info)):
            perm_begin, perm_info = perm_begin_and_info[i]
            # Each permutation ends where the next one begins; the last one
            # ends at the overall end of the (non-text) index build.
            perm_end = (
                perm_begin_and_info[i + 1][0]
                if i + 1 < len(perm_begin_and_info)
                else normal_end
            )
            perm_info_text = (
                perm_info.group(1).replace(" and ", " & ") if perm_info else f"#{i + 1}"
            )
            show_duration(f"Permutation {perm_info_text}", [(perm_begin, perm_end)])
        show_duration("Text index", [(text_begin, text_end)])
        if text_begin and text_end:
            log.info("")
            show_duration(
                "TOTAL time", [(overall_begin, normal_end), (text_begin, text_end)]
            )
        elif normal_end:
            log.info("")
            show_duration("TOTAL time", [(overall_begin, normal_end)])
        return True

    def execute_space(self, args) -> bool:
        """
        Part of `execute` that shows the space used.

        Sums the sizes of the index, vocabulary, and (optionally) text
        index files and logs a per-group breakdown.
        """

        # Get the sizes for the various groups of index files.
        index_size = get_total_file_size([f"{args.name}.index.*"])
        vocab_size = get_total_file_size([f"{args.name}.vocabulary.*"])
        text_size = get_total_file_size([f"{args.name}.text.*"])
        if args.ignore_text_index:
            text_size = 0
        total_size = index_size + vocab_size + text_size

        # Determining the proper unit for the size.
        size_unit = args.size_unit
        if size_unit == "auto":
            size_unit = "TB"
            if total_size < 1e6:
                size_unit = "B"
            elif total_size < 1e9:
                size_unit = "MB"
            elif total_size < 1e12:
                size_unit = "GB"

        # Helper function for showing the size in a uniform way.
        def show_size(heading, size):
            nonlocal size_unit
            if size_unit == "GB":
                size /= 1e9
            elif size_unit == "MB":
                size /= 1e6
            elif size_unit == "TB":
                size /= 1e12
            # Bytes are shown exactly (with thousands separators); larger
            # units with one decimal place.
            if size_unit == "B":
                log.info(f"{heading:<21} : {size:,} {size_unit}")
            else:
                log.info(f"{heading:<21} : {size:>6.1f} {size_unit}")

        show_size("Files index.*", index_size)
        show_size("Files vocabulary.*", vocab_size)
        if text_size > 0:
            show_size("Files text.*", text_size)
        log.info("")
        show_size("TOTAL size", total_size)
        return True

    def execute(self, args) -> bool:
        """
        Show the time and/or space breakdown, depending on the
        `--only-time` / `--only-space` flags. Returns `True` only if all
        requested parts succeeded.
        """
        return_value = True

        # The "time" part of the command.
        if not args.only_space:
            log_file_name = f"{args.name}.index-log.txt"
            self.show(
                f"Breakdown of the time used for "
                f"building the index, based on the timestamps for key "
                f'lines in "{log_file_name}"',
                only_show=args.show,
            )
            if not args.show:
                return_value &= self.execute_time(args, log_file_name)
            if not args.only_time:
                log.info("")

        # The "space" part of the command.
        if not args.only_time:
            self.show(
                "Breakdown of the space used for building the index",
                only_show=args.show,
            )
            if not args.show:
                return_value &= self.execute_space(args)

        return return_value