qlever 0.2.5__py3-none-any.whl → 0.5.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- qlever/Qleverfiles/Qleverfile.dblp +36 -0
- qlever/Qleverfiles/Qleverfile.dblp-plus +33 -0
- qlever/Qleverfiles/Qleverfile.dbpedia +30 -0
- qlever/Qleverfiles/Qleverfile.default +51 -0
- qlever/Qleverfiles/Qleverfile.dnb +40 -0
- qlever/Qleverfiles/Qleverfile.fbeasy +29 -0
- qlever/Qleverfiles/Qleverfile.freebase +28 -0
- qlever/Qleverfiles/Qleverfile.imdb +36 -0
- qlever/Qleverfiles/Qleverfile.ohm-planet +41 -0
- qlever/Qleverfiles/Qleverfile.olympics +31 -0
- qlever/Qleverfiles/Qleverfile.orkg +30 -0
- qlever/Qleverfiles/Qleverfile.osm-country +39 -0
- qlever/Qleverfiles/Qleverfile.osm-planet +39 -0
- qlever/Qleverfiles/Qleverfile.osm-planet-from-pbf +42 -0
- qlever/Qleverfiles/Qleverfile.pubchem +131 -0
- qlever/Qleverfiles/Qleverfile.scientists +29 -0
- qlever/Qleverfiles/Qleverfile.uniprot +74 -0
- qlever/Qleverfiles/Qleverfile.vvz +31 -0
- qlever/Qleverfiles/Qleverfile.wikidata +42 -0
- qlever/Qleverfiles/Qleverfile.wikipathways +40 -0
- qlever/Qleverfiles/Qleverfile.yago-4 +33 -0
- qlever/__init__.py +44 -1380
- qlever/command.py +87 -0
- qlever/commands/__init__.py +0 -0
- qlever/commands/add_text_index.py +115 -0
- qlever/commands/benchmark_queries.py +1019 -0
- qlever/commands/cache_stats.py +125 -0
- qlever/commands/clear_cache.py +88 -0
- qlever/commands/extract_queries.py +120 -0
- qlever/commands/get_data.py +48 -0
- qlever/commands/index.py +333 -0
- qlever/commands/index_stats.py +306 -0
- qlever/commands/log.py +66 -0
- qlever/commands/materialized_view.py +110 -0
- qlever/commands/query.py +142 -0
- qlever/commands/rebuild_index.py +176 -0
- qlever/commands/reset_updates.py +59 -0
- qlever/commands/settings.py +115 -0
- qlever/commands/setup_config.py +97 -0
- qlever/commands/start.py +336 -0
- qlever/commands/status.py +50 -0
- qlever/commands/stop.py +90 -0
- qlever/commands/system_info.py +130 -0
- qlever/commands/ui.py +271 -0
- qlever/commands/update.py +90 -0
- qlever/commands/update_wikidata.py +1204 -0
- qlever/commands/warmup.py +41 -0
- qlever/config.py +223 -0
- qlever/containerize.py +167 -0
- qlever/log.py +55 -0
- qlever/qlever_main.py +79 -0
- qlever/qleverfile.py +530 -0
- qlever/util.py +330 -0
- qlever-0.5.41.dist-info/METADATA +127 -0
- qlever-0.5.41.dist-info/RECORD +59 -0
- {qlever-0.2.5.dist-info → qlever-0.5.41.dist-info}/WHEEL +1 -1
- qlever-0.5.41.dist-info/entry_points.txt +2 -0
- qlever-0.5.41.dist-info/top_level.txt +1 -0
- build/lib/qlever/__init__.py +0 -1383
- build/lib/qlever/__main__.py +0 -4
- qlever/__main__.py +0 -4
- qlever-0.2.5.dist-info/METADATA +0 -277
- qlever-0.2.5.dist-info/RECORD +0 -12
- qlever-0.2.5.dist-info/entry_points.txt +0 -2
- qlever-0.2.5.dist-info/top_level.txt +0 -4
- src/qlever/__init__.py +0 -1383
- src/qlever/__main__.py +0 -4
- {qlever-0.2.5.dist-info → qlever-0.5.41.dist-info/licenses}/LICENSE +0 -0
qlever/commands/index.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import glob
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
import shlex
|
|
7
|
+
|
|
8
|
+
from qlever.command import QleverCommand
|
|
9
|
+
from qlever.containerize import Containerize
|
|
10
|
+
from qlever.log import log
|
|
11
|
+
from qlever.util import (
|
|
12
|
+
binary_exists,
|
|
13
|
+
get_existing_index_files,
|
|
14
|
+
get_total_file_size,
|
|
15
|
+
run_command,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class IndexCommand(QleverCommand):
    """
    Class for executing the `index` command, which builds a QLever index
    from one or more RDF input streams, either natively or in a container.
    """

    def __init__(self):
        pass

    def description(self) -> str:
        return "Build the index for a given RDF dataset"

    def should_have_qleverfile(self) -> bool:
        return True

    def relevant_qleverfile_arguments(self) -> dict[str, list[str]]:
        # Qleverfile sections and keys this command reads.
        return {
            "data": ["name", "format"],
            "index": [
                "input_files",
                "cat_input_files",
                "encode_as_id",
                "multi_input_json",
                "parallel_parsing",
                "settings_json",
                "vocabulary_type",
                "index_binary",
                "only_pso_and_pos_permutations",
                "ulimit",
                "use_patterns",
                "add_has_word_triples",
                "text_index",
                "stxxl_memory",
                "parser_buffer_size",
            ],
            "runtime": ["system", "image", "index_container"],
        }

    def additional_arguments(self, subparser) -> None:
        subparser.add_argument(
            "--overwrite-existing",
            action="store_true",
            default=False,
            help="Overwrite an existing index, think twice before using this",
        )

    # Exception for invalid JSON in `MULTI_INPUT_JSON`. Carries a short
    # error message plus the offending value for context.
    class InvalidInputJson(Exception):
        def __init__(self, error_message, additional_info):
            self.error_message = error_message
            self.additional_info = additional_info
            super().__init__()

    # Helper function to get command line options from JSON.
    def get_input_options_for_json(self, args) -> str:
        """
        Translate `args.multi_input_json` (a JSON array of input specs, or
        JSONL with one spec per line) into the `-f/-g/-F/-p` command-line
        options of the index binary.

        Raises `self.InvalidInputJson` on any malformed specification.
        """
        # Parse the JSON. If `args.multi_input_json` looks like JSONL, turn
        # it into a JSON array first.
        try:
            jsonl_line_regex = re.compile(r"^\s*\{.*\}\s*$")
            jsonl_lines = args.multi_input_json.split("\n")
            if all(re.match(jsonl_line_regex, line) for line in jsonl_lines):
                args.multi_input_json = "[" + ", ".join(jsonl_lines) + "]"
            input_specs = json.loads(args.multi_input_json)
        except Exception as e:
            raise self.InvalidInputJson(
                f"Failed to parse `MULTI_INPUT_JSON` as either JSON or JSONL ({e})",
                args.multi_input_json,
            ) from e
        # Check that it is an array of length at least one.
        if not isinstance(input_specs, list):
            raise self.InvalidInputJson(
                "`MULTI_INPUT_JSON` must be a JSON array",
                args.multi_input_json,
            )
        if len(input_specs) == 0:
            raise self.InvalidInputJson(
                "`MULTI_INPUT_JSON` must contain at least one element",
                args.multi_input_json,
            )
        # For each of the maps, construct the corresponding command-line
        # options to the index binary.
        input_options = []
        for i, input_spec in enumerate(input_specs):
            # Check that `input_spec` is a dictionary.
            if not isinstance(input_spec, dict):
                raise self.InvalidInputJson(
                    f"Element {i} in `MULTI_INPUT_JSON` must be a JSON object",
                    input_spec,
                )
            # For each `input_spec`, we must have a command.
            if "cmd" not in input_spec:
                raise self.InvalidInputJson(
                    f"Element {i} in `MULTI_INPUT_JSON` must contain a "
                    "key `cmd`",
                    input_spec,
                )
            # If the command contains a `{}` placeholder, we need a `for-each`
            # key specifying the pattern for the placeholder values, and vice
            # versa.
            if "{}" in input_spec["cmd"] and "for-each" not in input_spec:
                raise self.InvalidInputJson(
                    f"Element {i} in `MULTI_INPUT_JSON` must contain a "
                    "key `for-each` if the command contains a placeholder "
                    "`{}`",
                    input_spec,
                )
            if "for-each" in input_spec and "{}" not in input_spec["cmd"]:
                # NOTE: This message ends in a plain (non-f) string, so the
                # placeholder must be written as `{}` here; the previous
                # version wrote `{{}}`, which showed up literally.
                raise self.InvalidInputJson(
                    f"Element {i} in `MULTI_INPUT_JSON` contains a "
                    "key `for-each`, but the command does not contain a "
                    "placeholder `{}`",
                    input_spec,
                )
            # Get all commands. This is just the value of the `cmd` key if no
            # `for-each` key is specified. Otherwise, we have a command for
            # each file matching the pattern.
            if "for-each" not in input_spec:
                input_cmds = [input_spec["cmd"]]
            else:
                try:
                    files = sorted(glob.glob(input_spec["for-each"]))
                except Exception as e:
                    raise self.InvalidInputJson(
                        f"Element {i} in `MULTI_INPUT_JSON` contains an "
                        f"invalid `for-each` pattern: {e}",
                        input_spec,
                    ) from e
                input_cmds = [input_spec["cmd"].format(file) for file in files]
            # The `format`, `graph`, and `parallel` keys are optional.
            input_format = input_spec.get("format", args.format)
            input_graph = input_spec.get("graph", "-")
            input_parallel = input_spec.get("parallel", "false")
            # There must not be any other keys.
            extra_keys = input_spec.keys() - {
                "cmd",
                "format",
                "graph",
                "parallel",
                "for-each",
            }
            if extra_keys:
                # List ALL allowed keys (the previous message omitted
                # `cmd` and `for-each`, which are clearly allowed above).
                raise self.InvalidInputJson(
                    f"Element {i} in `MULTI_INPUT_JSON` must only contain "
                    "the keys `cmd`, `format`, `graph`, `parallel`, and "
                    f"`for-each`. Contains extra keys {extra_keys}.",
                    input_spec,
                )
            # Add the command-line options for this input stream. We use
            # process substitution `<(...)` as a convenient way to handle an
            # input stream just like a file. This is not POSIX compliant, but
            # supported by various shells, including bash and zsh. If
            # `for-each` is specified, add one command for each matching file.
            for input_cmd in input_cmds:
                input_option = f"-f <({input_cmd}) -g {input_graph}"
                input_option += f" -F {input_format}"
                if input_parallel == "true":
                    input_option += " -p true"
                else:
                    input_option += " -p false"
                input_options.append(input_option)
        # Return the concatenated command-line options.
        return " ".join(input_options)

    def execute(self, args) -> bool:
        """
        Build the index: assemble the full index command line, show it,
        run sanity checks (binary, input files, pre-existing index), write
        the settings file, and run the command. Returns True on success.
        """
        # The mandatory part of the command line (specifying the input, the
        # basename of the index, and the settings file). There are two ways
        # to specify the input: via a single stream or via multiple streams.
        if args.cat_input_files and not args.multi_input_json:
            index_cmd = (
                f"{args.cat_input_files} | {args.index_binary}"
                f" -i {args.name} -s {args.name}.settings.json"
                f" --vocabulary-type {args.vocabulary_type}"
                f" -F {args.format} -f -"
            )
            if args.parallel_parsing:
                index_cmd += f" -p {args.parallel_parsing}"
        elif args.multi_input_json and not args.cat_input_files:
            try:
                input_options = self.get_input_options_for_json(args)
            except self.InvalidInputJson as e:
                log.error(e.error_message)
                log.info("")
                log.info(e.additional_info)
                return False
            index_cmd = (
                f"{args.index_binary}"
                f" -i {args.name} -s {args.name}.settings.json"
                f" --vocabulary-type {args.vocabulary_type}"
                f" {input_options}"
            )
        else:
            log.error(
                "Specify exactly one of `CAT_INPUT_FILES` (for a "
                "single input stream) or `MULTI_INPUT_JSON` (for "
                "multiple input streams)"
            )
            log.info("")
            log.info("See `qlever index --help` for more information")
            return False

        # Add remaining options.
        if args.encode_as_id:
            index_cmd += f" --encode-as-id {args.encode_as_id}"
        if args.only_pso_and_pos_permutations:
            index_cmd += " --only-pso-and-pos-permutations"
        if args.use_patterns == "no":
            index_cmd += " --no-patterns"
        if args.add_has_word_triples:
            index_cmd += " --add-has-word-triples"
        if args.text_index in [
            "from_text_records",
            "from_text_records_and_literals",
        ]:
            index_cmd += (
                f" -w {args.name}.wordsfile.tsv -d {args.name}.docsfile.tsv"
            )
        if args.text_index in [
            "from_literals",
            "from_text_records_and_literals",
        ]:
            index_cmd += " --text-words-from-literals"
        if args.stxxl_memory:
            index_cmd += f" --stxxl-memory {args.stxxl_memory}"
        if args.parser_buffer_size:
            index_cmd += f" --parser-buffer-size {args.parser_buffer_size}"
        index_cmd += f" 2>&1 | tee {args.name}.index-log.txt"

        # If the total file size is larger than 10 GB, set ulimit (such that a
        # large number of open files is allowed).
        total_file_size = get_total_file_size(shlex.split(args.input_files))
        if args.ulimit is not None:
            index_cmd = f"ulimit -Sn {args.ulimit} && {index_cmd}"
        elif total_file_size > 1e10:
            index_cmd = f"ulimit -Sn 500000 && {index_cmd}"

        # Run the command in a container (if so desired).
        if args.system in Containerize.supported_systems():
            index_cmd = Containerize().containerize_command(
                index_cmd,
                args.system,
                "run --rm",
                args.image,
                args.index_container,
                volumes=[("$(pwd)", "/index")],
                working_directory="/index",
            )

        # Command for writing the settings JSON to a file. `shlex.quote`
        # protects the JSON from shell interpretation.
        settings_json_cmd = (
            f"echo {shlex.quote(args.settings_json)} "
            f"> {args.name}.settings.json"
        )

        # Show the command line.
        self.show(f"{settings_json_cmd}\n{index_cmd}", only_show=args.show)
        if args.show:
            return True

        # When running natively, check if the binary exists and works.
        if args.system == "native":
            if not binary_exists(args.index_binary, "index-binary"):
                return False

        # Check if all of the input files exist.
        for pattern in shlex.split(args.input_files):
            if len(glob.glob(pattern)) == 0:
                log.error(f'No file matching "{pattern}" found')
                log.info("")
                log.info(
                    "Did you call `qlever get-data`? If you did, check "
                    "GET_DATA_CMD and INPUT_FILES in the QLeverfile"
                )
                return False

        # Check if index files (name.index.*) already exist.
        existing_index_files = get_existing_index_files(args.name)
        if len(existing_index_files) > 0 and not args.overwrite_existing:
            log.error(
                f'Index files for basename "{args.name}" found, if you '
                "want to overwrite them, use --overwrite-existing"
            )
            log.info("")
            log.info(f"Index files found: {existing_index_files}")
            return False

        # Remove already existing container.
        if (
            args.system in Containerize.supported_systems()
            and args.overwrite_existing
        ):
            if Containerize.is_running(args.system, args.index_container):
                log.info(
                    "Another index process is running, trying to stop it ..."
                )
                log.info("")
                try:
                    run_command(f"{args.system} rm -f {args.index_container}")
                except Exception as e:
                    log.error(f"Removing existing container failed: {e}")
                    return False

        # Write settings.json file.
        try:
            run_command(settings_json_cmd)
        except Exception as e:
            log.error(f"Writing the settings.json file failed: {e}")
            return False

        # Run the index command.
        try:
            run_command(index_cmd, show_output=True)
        except Exception as e:
            log.error(f"Building the index failed: {e}")
            return False

        return True
|
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from qlever.command import QleverCommand
|
|
8
|
+
from qlever.log import log
|
|
9
|
+
from qlever.util import get_total_file_size
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class IndexStatsCommand(QleverCommand):
    """
    Class for executing the `index-stats` command, which shows a breakdown
    of the time (parsed from the index-build log file) and disk space
    (measured from the index files on disk) used by an index build.
    """

    def __init__(self):
        pass

    def description(self) -> str:
        return "Breakdown of the time and space used for the index build"

    def should_have_qleverfile(self) -> bool:
        return False

    def relevant_qleverfile_arguments(self) -> dict[str, list[str]]:
        # Only the dataset basename is needed (to locate log and index files).
        return {"data": ["name"]}

    def additional_arguments(self, subparser) -> None:
        subparser.add_argument(
            "--only-time",
            action="store_true",
            default=False,
            help="Show only the time used",
        )
        subparser.add_argument(
            "--only-space",
            action="store_true",
            default=False,
            help="Show only the space used",
        )
        subparser.add_argument(
            "--ignore-text-index",
            action="store_true",
            default=False,
            help="Ignore the text index",
        )
        subparser.add_argument(
            "--time-unit",
            choices=["s", "min", "h", "auto"],
            default="auto",
            help="The time unit",
        )
        subparser.add_argument(
            "--size-unit",
            choices=["B", "MB", "GB", "TB", "auto"],
            default="auto",
            help="The size unit",
        )

    def execute_time(self, args, log_file_name) -> bool:
        """
        Part of `execute` that shows the time used.

        Scans the index log (and, if present, the separate text-index log)
        for the timestamped key lines that delimit each build phase, then
        prints per-phase durations. Returns False on unreadable logs or if
        the log shows the build has not progressed past the first phase.
        """

        # Read the content of `log_file_name` into a list of lines.
        try:
            with open(log_file_name, "r") as log_file:
                lines = log_file.readlines()
        except Exception as e:
            log.error(f"Problem reading index log file {log_file_name}: {e}")
            return False
        # If there is a separate `add-text-index-log.txt` file, append those
        # lines.
        try:
            text_log_file_name = f"{args.name}.text-index-log.txt"
            if Path(text_log_file_name).exists():
                with open(text_log_file_name, "r") as text_log_file:
                    lines.extend(text_log_file.readlines())
        except Exception as e:
            log.error(
                f"Problem reading text index log file " f"{text_log_file_name}: {e}"
            )
            return False

        # Helper function that finds the next line matching the given `regex`,
        # starting from `current_line`, and extracts the time. Returns a tuple
        # of the time and the regex match object.
        #
        # By default, `current_line` is advanced to the line after the first
        # match, or one beyond the last line if no match is found.
        #
        # If `update_current_line` is `False`, then `current_line` is restored
        # to its previous value only when NO match is found; on a match it
        # still advances past the match (this is what lets the permutation
        # loop below make progress and terminate).
        current_line = 0

        def find_next_line(regex, update_current_line=True):
            nonlocal lines
            nonlocal current_line
            current_line_backup = current_line
            # Find starting from `current_line`.
            while current_line < len(lines):
                line = lines[current_line]
                current_line += 1
                timestamp_regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"
                timestamp_format = "%Y-%m-%d %H:%M:%S"
                regex_match = re.search(regex, line)
                if regex_match:
                    try:
                        # The timestamp is expected at the start of the line.
                        return datetime.strptime(
                            re.match(timestamp_regex, line).group(), timestamp_format
                        ), regex_match
                    except Exception as e:
                        log.error(
                            f"Could not parse timestamp of form "
                            f'"{timestamp_regex}" from line '
                            f' "{line.rstrip()}" ({e})'
                        )
            # If we get here, we did not find a matching line.
            if not update_current_line:
                current_line = current_line_backup
            return None, None

        # Find the lines matching the key_lines_regex and extract the time
        # information from them.
        overall_begin, _ = find_next_line(r"INFO:\s*Processing")
        merge_begin, _ = find_next_line(r"INFO:\s*Merging partial vocab")
        convert_begin, _ = find_next_line(r"INFO:\s*Converting triples")
        perm_begin_and_info = []
        while True:
            # Find the next line that starts a permutation.
            #
            # NOTE: Should work for the old and new format of the index log
            # file (old format: "Creating a pair" + names of permutations in
            # line "Writing meta data for ..."; new format: name of
            # permutations already in line "Creating permutations ...").
            perm_begin, _ = find_next_line(
                r"INFO:\s*Creating a pair", update_current_line=False
            )
            if perm_begin is None:
                perm_begin, perm_info = find_next_line(
                    r"INFO:\s*Creating permutations ([A-Z]+ and [A-Z]+)",
                    update_current_line=False,
                )
            else:
                _, perm_info = find_next_line(
                    r"INFO:\s*Writing meta data for ([A-Z]+ and [A-Z]+)",
                    update_current_line=False,
                )
            if perm_info is None:
                break
            perm_begin_and_info.append((perm_begin, perm_info))
        # The conversion phase ends where the first permutation begins.
        convert_end = (
            perm_begin_and_info[0][0] if len(perm_begin_and_info) > 0 else None
        )
        normal_end, _ = find_next_line(r"INFO:\s*Index build completed")
        text_begin, _ = find_next_line(
            r"INFO:\s*Adding text index", update_current_line=False
        )
        text_end, _ = find_next_line(
            r"INFO:\s*Text index build comp", update_current_line=False
        )
        if args.ignore_text_index:
            text_begin = text_end = None

        # Check whether at least the first phase is done.
        if overall_begin is None:
            log.error("Missing line that index build has started")
            return False
        if overall_begin and not merge_begin:
            log.error(
                "According to the log file, the index build "
                "has started, but is still in its first "
                "phase (parsing the input)"
            )
            return False

        # Helper function that shows the duration for a phase (if the start and
        # end timestamps are available). `start_end_pairs` may contain pairs
        # with `None` entries; those are skipped, and nothing is printed if no
        # complete pair remains.
        def show_duration(heading, start_end_pairs):
            nonlocal time_unit
            num_start_end_pairs = 0
            diff_seconds = 0
            for start, end in start_end_pairs:
                if start and end:
                    diff_seconds += (end - start).total_seconds()
                    num_start_end_pairs += 1
            if num_start_end_pairs > 0:
                if time_unit == "h":
                    diff = diff_seconds / 3600
                elif time_unit == "min":
                    diff = diff_seconds / 60
                else:
                    diff = diff_seconds
                log.info(f"{heading:<21} : {diff:>6.1f} {time_unit}")

        # Get the times of the various phases (hours or minutes, depending on
        # how long the first phase took).
        time_unit = args.time_unit
        if time_unit == "auto":
            time_unit = "h"
            if merge_begin and overall_begin:
                parse_duration = (merge_begin - overall_begin).total_seconds()
                if parse_duration < 200:
                    time_unit = "s"
                elif parse_duration < 3600:
                    time_unit = "min"
        show_duration("Parse input", [(overall_begin, merge_begin)])
        show_duration("Build vocabularies", [(merge_begin, convert_begin)])
        show_duration("Convert to global IDs", [(convert_begin, convert_end)])
        for i in range(len(perm_begin_and_info)):
            perm_begin, perm_info = perm_begin_and_info[i]
            # Each permutation ends where the next one begins; the last ends
            # at the overall "Index build completed" line.
            perm_end = (
                perm_begin_and_info[i + 1][0]
                if i + 1 < len(perm_begin_and_info)
                else normal_end
            )
            perm_info_text = (
                perm_info.group(1).replace(" and ", " & ") if perm_info else f"#{i + 1}"
            )
            show_duration(f"Permutation {perm_info_text}", [(perm_begin, perm_end)])
        show_duration("Text index", [(text_begin, text_end)])
        if text_begin and text_end:
            log.info("")
            show_duration(
                "TOTAL time", [(overall_begin, normal_end), (text_begin, text_end)]
            )
        elif normal_end:
            log.info("")
            show_duration("TOTAL time", [(overall_begin, normal_end)])
        return True

    def execute_space(self, args) -> bool:
        """
        Part of `execute` that shows the space used.

        Sums the on-disk sizes of the `index.*`, `vocabulary.*`, and `text.*`
        file groups for the dataset basename and prints them in a uniform
        unit. Always returns True.
        """

        # Get the sizes for the various groups of index files.
        index_size = get_total_file_size([f"{args.name}.index.*"])
        vocab_size = get_total_file_size([f"{args.name}.vocabulary.*"])
        text_size = get_total_file_size([f"{args.name}.text.*"])
        if args.ignore_text_index:
            text_size = 0
        total_size = index_size + vocab_size + text_size

        # Determining the proper unit for the size.
        size_unit = args.size_unit
        if size_unit == "auto":
            size_unit = "TB"
            if total_size < 1e6:
                size_unit = "B"
            elif total_size < 1e9:
                size_unit = "MB"
            elif total_size < 1e12:
                size_unit = "GB"

        # Helper function for showing the size in a uniform way.
        def show_size(heading, size):
            nonlocal size_unit
            if size_unit == "GB":
                size /= 1e9
            elif size_unit == "MB":
                size /= 1e6
            elif size_unit == "TB":
                size /= 1e12
            # Bytes are shown as an exact integer with thousands separators;
            # the other units with one decimal place.
            if size_unit == "B":
                log.info(f"{heading:<21} : {size:,} {size_unit}")
            else:
                log.info(f"{heading:<21} : {size:>6.1f} {size_unit}")

        show_size("Files index.*", index_size)
        show_size("Files vocabulary.*", vocab_size)
        if text_size > 0:
            show_size("Files text.*", text_size)
        log.info("")
        show_size("TOTAL size", total_size)
        return True

    def execute(self, args) -> bool:
        """
        Run the time and/or space breakdown, depending on `--only-time` /
        `--only-space`. Returns True only if every executed part succeeded.
        """
        return_value = True

        # The "time" part of the command.
        if not args.only_space:
            log_file_name = f"{args.name}.index-log.txt"
            self.show(
                f"Breakdown of the time used for "
                f"building the index, based on the timestamps for key "
                f'lines in "{log_file_name}"',
                only_show=args.show,
            )
            if not args.show:
                return_value &= self.execute_time(args, log_file_name)
            if not args.only_time:
                log.info("")

        # The "space" part of the command.
        if not args.only_time:
            self.show(
                "Breakdown of the space used for building the index",
                only_show=args.show,
            )
            if not args.show:
                return_value &= self.execute_space(args)

        return return_value
|