qlever 0.5.11__py3-none-any.whl → 0.5.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of qlever might be problematic. Click here for more details.

@@ -17,7 +17,7 @@ FORMAT = ttl
17
17
 
18
18
  [index]
19
19
  INPUT_FILES = *.gz
20
- MULTI_INPUT_JSON = $$(ls *.gz | awk 'BEGIN { printf "[ " } NR > 1 { printf ", " } { printf "{\"cmd\": \"zcat " $$0 "\"}" } END { printf "]" }')
20
+ MULTI_INPUT_JSON = $$(ls *.gz | xargs -I {} echo '{ "cmd": "zcat {}" }')
21
21
  SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 5000000, "prefixes-external": [""] }
22
22
 
23
23
  [server]
@@ -16,8 +16,7 @@ GET_DATA_URL = https://dumps.wikimedia.org/wikidatawiki/entities
16
16
  GET_DATA_CMD = curl -LRC - -O ${GET_DATA_URL}/latest-all.ttl.bz2 -O ${GET_DATA_URL}/latest-lexemes.ttl.bz2 2>&1 | tee wikidata.download-log.txt && curl -sL ${GET_DATA_URL}/dcatap.rdf | docker run -i --rm -v $$(pwd):/data stain/jena riot --syntax=RDF/XML --output=NT /dev/stdin > dcatap.nt
17
17
  DATE_WIKIDATA = $$(date -r latest-all.ttl.bz2 +%d.%m.%Y || echo "NO_DATE")
18
18
  DATE_WIKIPEDIA = $$(date -r wikipedia-abstracts.nt +%d.%m.%Y || echo "NO_DATE")
19
- DESCRIPTION = Full Wikidata dump from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2, version ${DATE_WIKIDATA}) + English Wikipeda abstracts (version ${DATE_WIKIPEDIA}, available via schema:description)
20
- TEXT_DESCRIPTION = All English and German literals + all sentences from the English Wikipedia (version ${DATE_WIKIPEDIA}), use with FILTER KEYWORDS(...)
19
+ DESCRIPTION = Full Wikidata dump from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2, version ${DATE_WIKIDATA})
21
20
 
22
21
  [index]
23
22
  INPUT_FILES = latest-all.ttl.bz2 latest-lexemes.ttl.bz2 dcatap.nt
@@ -26,7 +25,6 @@ MULTI_INPUT_JSON = [{ "cmd": "lbzcat -n 4 latest-all.ttl.bz2", "format": "ttl",
26
25
  { "cmd": "cat dcatap.nt", "format": "nt", "parallel": "false" }]
27
26
  SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 5000000 }
28
27
  STXXL_MEMORY = 10G
29
- TEXT_INDEX = from_text_records
30
28
 
31
29
  [server]
32
30
  PORT = 7001
qlever/commands/index.py CHANGED
@@ -3,12 +3,12 @@ from __future__ import annotations
3
3
  import glob
4
4
  import json
5
5
  import shlex
6
+ import re
6
7
 
7
8
  from qlever.command import QleverCommand
8
9
  from qlever.containerize import Containerize
9
10
  from qlever.log import log
10
- from qlever.util import (get_existing_index_files, get_total_file_size,
11
- run_command)
11
+ from qlever.util import get_existing_index_files, get_total_file_size, run_command
12
12
 
13
13
 
14
14
  class IndexCommand(QleverCommand):
@@ -20,24 +20,36 @@ class IndexCommand(QleverCommand):
20
20
  pass
21
21
 
22
22
  def description(self) -> str:
23
- return ("Build the index for a given RDF dataset")
23
+ return "Build the index for a given RDF dataset"
24
24
 
25
25
  def should_have_qleverfile(self) -> bool:
26
26
  return True
27
27
 
28
- def relevant_qleverfile_arguments(self) -> dict[str: list[str]]:
29
- return {"data": ["name", "format"],
30
- "index": ["input_files", "cat_input_files", "multi_input_json",
31
- "parallel_parsing", "settings_json", "index_binary",
32
- "only_pso_and_pos_permutations", "use_patterns",
33
- "text_index", "stxxl_memory"],
34
- "runtime": ["system", "image", "index_container"]}
28
+ def relevant_qleverfile_arguments(self) -> dict[str : list[str]]:
29
+ return {
30
+ "data": ["name", "format"],
31
+ "index": [
32
+ "input_files",
33
+ "cat_input_files",
34
+ "multi_input_json",
35
+ "parallel_parsing",
36
+ "settings_json",
37
+ "index_binary",
38
+ "only_pso_and_pos_permutations",
39
+ "use_patterns",
40
+ "text_index",
41
+ "stxxl_memory",
42
+ ],
43
+ "runtime": ["system", "image", "index_container"],
44
+ }
35
45
 
36
46
  def additional_arguments(self, subparser) -> None:
37
47
  subparser.add_argument(
38
- "--overwrite-existing", action="store_true",
39
- default=False,
40
- help="Overwrite an existing index, think twice before using.")
48
+ "--overwrite-existing",
49
+ action="store_true",
50
+ default=False,
51
+ help="Overwrite an existing index, think twice before using.",
52
+ )
41
53
 
42
54
  # Exception for invalid JSON.
43
55
  class InvalidInputJson(Exception):
@@ -48,22 +60,29 @@ class IndexCommand(QleverCommand):
48
60
 
49
61
  # Helper function to get command line options from JSON.
50
62
  def get_input_options_for_json(self, args) -> str:
51
- # Parse the JSON.
63
+ # Parse the JSON. If `args.multi_input_json` looks like JSONL, turn
64
+ # it into a JSON array.
52
65
  try:
66
+ jsonl_line_regex = re.compile(r"^\s*\{.*\}\s*$")
67
+ jsonl_lines = args.multi_input_json.split("\n")
68
+ if all(re.match(jsonl_line_regex, line) for line in jsonl_lines):
69
+ args.multi_input_json = "[" + ", ".join(jsonl_lines) + "]"
53
70
  input_specs = json.loads(args.multi_input_json)
54
71
  except Exception as e:
55
72
  raise self.InvalidInputJson(
56
- f"Failed to parse `MULTI_INPUT_JSON` ({e})",
57
- args.multi_input_json)
73
+ f"Failed to parse `MULTI_INPUT_JSON` as either JSON or JSONL ({e})",
74
+ args.multi_input_json,
75
+ )
58
76
  # Check that it is an array of length at least one.
59
77
  if not isinstance(input_specs, list):
60
78
  raise self.InvalidInputJson(
61
- "`MULTI_INPUT_JSON` must be a JSON array",
62
- args.multi_input_json)
79
+ "`MULTI_INPUT_JSON` must be a JSON array", args.multi_input_json
80
+ )
63
81
  if len(input_specs) == 0:
64
82
  raise self.InvalidInputJson(
65
- "`MULTI_INPUT_JSON` must contain at least one element",
66
- args.multi_input_json)
83
+ "`MULTI_INPUT_JSON` must contain at least one element",
84
+ args.multi_input_json,
85
+ )
67
86
  # For each of the maps, construct the corresponding command-line
68
87
  # options to the index binary.
69
88
  input_options = []
@@ -71,15 +90,15 @@ class IndexCommand(QleverCommand):
71
90
  # Check that `input_spec` is a dictionary.
72
91
  if not isinstance(input_spec, dict):
73
92
  raise self.InvalidInputJson(
74
- f"Element {i} in `MULTI_INPUT_JSON` must be a JSON "
75
- "object",
76
- input_spec)
93
+ f"Element {i} in `MULTI_INPUT_JSON` must be a JSON " "object",
94
+ input_spec,
95
+ )
77
96
  # For each `input_spec`, we must have a command.
78
97
  if "cmd" not in input_spec:
79
98
  raise self.InvalidInputJson(
80
- f"Element {i} in `MULTI_INPUT_JSON` must contain a "
81
- "key `cmd`",
82
- input_spec)
99
+ f"Element {i} in `MULTI_INPUT_JSON` must contain a " "key `cmd`",
100
+ input_spec,
101
+ )
83
102
  input_cmd = input_spec["cmd"]
84
103
  # The `format`, `graph`, and `parallel` keys are optional.
85
104
  input_format = input_spec.get("format", args.format)
@@ -89,17 +108,19 @@ class IndexCommand(QleverCommand):
89
108
  extra_keys = input_spec.keys() - {"cmd", "format", "graph", "parallel"}
90
109
  if extra_keys:
91
110
  raise self.InvalidInputJson(
92
- f"Element {i} in `MULTI_INPUT_JSON` must only contain "
93
- "the keys `format`, `graph`, and `parallel`. Contains "
94
- "extra keys {extra_keys}.",
95
- input_spec)
111
+ f"Element {i} in `MULTI_INPUT_JSON` must only contain "
112
+ "the keys `format`, `graph`, and `parallel`. Contains "
113
+ "extra keys {extra_keys}.",
114
+ input_spec,
115
+ )
96
116
  # Add the command-line options for this input stream. We use
97
117
  # process substitution `<(...)` as a convenient way to handle
98
118
  # an input stream just like a file. This is not POSIX compliant,
99
119
  # but supported by various shells, including bash and zsh.
100
120
  input_options.append(
101
- f"-f <({input_cmd}) -F {input_format} "
102
- f"-g \"{input_graph}\" -p {input_parallel}")
121
+ f"-f <({input_cmd}) -F {input_format} "
122
+ f'-g "{input_graph}" -p {input_parallel}'
123
+ )
103
124
  # Return the concatenated command-line options.
104
125
  return " ".join(input_options)
105
126
 
@@ -108,11 +129,13 @@ class IndexCommand(QleverCommand):
108
129
  # basename of the index, and the settings file). There are two ways
109
130
  # to specify the input: via a single stream or via multiple streams.
110
131
  if args.cat_input_files and not args.multi_input_json:
111
- index_cmd = (f"{args.cat_input_files} | {args.index_binary}"
112
- f" -i {args.name} -s {args.name}.settings.json"
113
- f" -F {args.format} -f -")
132
+ index_cmd = (
133
+ f"{args.cat_input_files} | {args.index_binary}"
134
+ f" -i {args.name} -s {args.name}.settings.json"
135
+ f" -F {args.format} -f -"
136
+ )
114
137
  if args.parallel_parsing:
115
- index_cmd += (f" -p {args.parallel_parsing}")
138
+ index_cmd += f" -p {args.parallel_parsing}"
116
139
  elif args.multi_input_json and not args.cat_input_files:
117
140
  try:
118
141
  input_options = self.get_input_options_for_json(args)
@@ -121,13 +144,17 @@ class IndexCommand(QleverCommand):
121
144
  log.info("")
122
145
  log.info(e.additional_info)
123
146
  return False
124
- index_cmd = (f"{args.index_binary}"
125
- f" -i {args.name} -s {args.name}.settings.json"
126
- f" {input_options}")
147
+ index_cmd = (
148
+ f"{args.index_binary}"
149
+ f" -i {args.name} -s {args.name}.settings.json"
150
+ f" {input_options}"
151
+ )
127
152
  else:
128
- log.error("Specify exactly one of `CAT_INPUT_FILES` (for a "
129
- "single input stream) or `MULTI_INPUT_JSON` (for "
130
- "multiple input streams)")
153
+ log.error(
154
+ "Specify exactly one of `CAT_INPUT_FILES` (for a "
155
+ "single input stream) or `MULTI_INPUT_JSON` (for "
156
+ "multiple input streams)"
157
+ )
131
158
  log.info("")
132
159
  log.info("See `qlever index --help` for more information")
133
160
  return False
@@ -137,12 +164,11 @@ class IndexCommand(QleverCommand):
137
164
  index_cmd += " --only-pso-and-pos-permutations --no-patterns"
138
165
  if not args.use_patterns:
139
166
  index_cmd += " --no-patterns"
140
- if args.text_index in \
141
- ["from_text_records", "from_text_records_and_literals"]:
142
- index_cmd += (f" -w {args.name}.wordsfile.tsv"
143
- f" -d {args.name}.docsfile.tsv")
144
- if args.text_index in \
145
- ["from_literals", "from_text_records_and_literals"]:
167
+ if args.text_index in ["from_text_records", "from_text_records_and_literals"]:
168
+ index_cmd += (
169
+ f" -w {args.name}.wordsfile.tsv" f" -d {args.name}.docsfile.tsv"
170
+ )
171
+ if args.text_index in ["from_literals", "from_text_records_and_literals"]:
146
172
  index_cmd += " --text-words-from-literals"
147
173
  if args.stxxl_memory:
148
174
  index_cmd += f" --stxxl-memory {args.stxxl_memory}"
@@ -150,24 +176,26 @@ class IndexCommand(QleverCommand):
150
176
 
151
177
  # If the total file size is larger than 10 GB, set ulimit (such that a
152
178
  # large number of open files is allowed).
153
- total_file_size = get_total_file_size(
154
- shlex.split(args.input_files))
179
+ total_file_size = get_total_file_size(shlex.split(args.input_files))
155
180
  if total_file_size > 1e10:
156
181
  index_cmd = f"ulimit -Sn 1048576; {index_cmd}"
157
182
 
158
183
  # Run the command in a container (if so desired).
159
184
  if args.system in Containerize.supported_systems():
160
185
  index_cmd = Containerize().containerize_command(
161
- index_cmd,
162
- args.system, "run --rm",
163
- args.image,
164
- args.index_container,
165
- volumes=[("$(pwd)", "/index")],
166
- working_directory="/index")
186
+ index_cmd,
187
+ args.system,
188
+ "run --rm",
189
+ args.image,
190
+ args.index_container,
191
+ volumes=[("$(pwd)", "/index")],
192
+ working_directory="/index",
193
+ )
167
194
 
168
195
  # Command for writing the settings JSON to a file.
169
- settings_json_cmd = (f"echo {shlex.quote(args.settings_json)} "
170
- f"> {args.name}.settings.json")
196
+ settings_json_cmd = (
197
+ f"echo {shlex.quote(args.settings_json)} " f"> {args.name}.settings.json"
198
+ )
171
199
 
172
200
  # Show the command line.
173
201
  self.show(f"{settings_json_cmd}\n{index_cmd}", only_show=args.show)
@@ -179,9 +207,11 @@ class IndexCommand(QleverCommand):
179
207
  try:
180
208
  run_command(f"{args.index_binary} --help")
181
209
  except Exception as e:
182
- log.error(f"Running \"{args.index_binary}\" failed, "
183
- f"set `--index-binary` to a different binary or "
184
- f"set `--system to a container system`")
210
+ log.error(
211
+ f'Running "{args.index_binary}" failed, '
212
+ f"set `--index-binary` to a different binary or "
213
+ f"set `--system to a container system`"
214
+ )
185
215
  log.info("")
186
216
  log.info(f"The error message was: {e}")
187
217
  return False
@@ -189,28 +219,29 @@ class IndexCommand(QleverCommand):
189
219
  # Check if all of the input files exist.
190
220
  for pattern in shlex.split(args.input_files):
191
221
  if len(glob.glob(pattern)) == 0:
192
- log.error(f"No file matching \"{pattern}\" found")
222
+ log.error(f'No file matching "{pattern}" found')
193
223
  log.info("")
194
- log.info("Did you call `qlever get-data`? If you did, check "
195
- "GET_DATA_CMD and INPUT_FILES in the QLeverfile")
224
+ log.info(
225
+ "Did you call `qlever get-data`? If you did, check "
226
+ "GET_DATA_CMD and INPUT_FILES in the QLeverfile"
227
+ )
196
228
  return False
197
229
 
198
230
  # Check if index files (name.index.*) already exist.
199
231
  existing_index_files = get_existing_index_files(args.name)
200
232
  if len(existing_index_files) > 0 and not args.overwrite_existing:
201
233
  log.error(
202
- f"Index files for basename \"{args.name}\" found, if you "
203
- f"want to overwrite them, use --overwrite-existing")
234
+ f'Index files for basename "{args.name}" found, if you '
235
+ f"want to overwrite them, use --overwrite-existing"
236
+ )
204
237
  log.info("")
205
238
  log.info(f"Index files found: {existing_index_files}")
206
239
  return False
207
240
 
208
241
  # Remove already existing container.
209
- if args.system in Containerize.supported_systems() \
210
- and args.overwrite_existing:
242
+ if args.system in Containerize.supported_systems() and args.overwrite_existing:
211
243
  if Containerize.is_running(args.system, args.index_container):
212
- log.info("Another index process is running, trying to stop "
213
- "it ...")
244
+ log.info("Another index process is running, trying to stop " "it ...")
214
245
  log.info("")
215
246
  try:
216
247
  run_command(f"{args.system} rm -f {args.index_container}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: qlever
3
- Version: 0.5.11
3
+ Version: 0.5.12
4
4
  Summary: Script for using the QLever SPARQL engine.
5
5
  Author-email: Hannah Bast <bast@cs.uni-freiburg.de>
6
6
  License: Apache-2.0
@@ -7,7 +7,7 @@ qlever/qlever_main.py,sha256=tA_xqOs_FjvqlDIvKTprwuysfTwzsUjE7at26gRhCVA,2336
7
7
  qlever/qlever_old.py,sha256=X-JxmepFKYeFgSLLp0TRDNqXSxDwIbc8_0Xstiems8c,62026
8
8
  qlever/qleverfile.py,sha256=lygAjI5_wV_e-JoIGIqVTdd4yyvApzZiSlqSsmdlJpU,14529
9
9
  qlever/util.py,sha256=qLxBRyHPT2VTj0xcOCFcP6HV-Lm-g-64QpvOc4V0_a8,8029
10
- qlever/Qleverfiles/Qleverfile.dblp,sha256=wXZweNfYgEx-IGF1vIJMgqYUSWanQlzPj1EzUOGVuXA,1340
10
+ qlever/Qleverfiles/Qleverfile.dblp,sha256=nbkjBb6ZlmRgCsVcsx8U0EwI8csf8G03Rn2E10uCxVk,1269
11
11
  qlever/Qleverfiles/Qleverfile.dblp-plus,sha256=TJHxp8I1P6JKJjbuAllEpB32-huuY1gH0FlenqPVJ5g,1334
12
12
  qlever/Qleverfiles/Qleverfile.dbpedia,sha256=aaNZZayE-zVePGSwPzXemkX__Ns8-kP_E7DNNKZPnqg,1160
13
13
  qlever/Qleverfiles/Qleverfile.default,sha256=Kj-J1Kkv8PWN7wuMdZU6DUUlEuBIcSNysJCE-R63we8,2407
@@ -24,7 +24,7 @@ qlever/Qleverfiles/Qleverfile.pubchem,sha256=YuDzWQmukSvL1opu7cf1KX9407_P21lmecY
24
24
  qlever/Qleverfiles/Qleverfile.scientists,sha256=9eZ2c6P9a3E3VHa3RR7LdOQbF4k3oyyrn56Z3u4LZYs,1164
25
25
  qlever/Qleverfiles/Qleverfile.uniprot,sha256=9kAKseomdUnIt7EAZge39g1MTuaLVaSW9JYLHzIMolM,2338
26
26
  qlever/Qleverfiles/Qleverfile.vvz,sha256=cLzm85erKoFCDllH5eFcSi35MdR6Tahj1MgtvGRxanM,922
27
- qlever/Qleverfiles/Qleverfile.wikidata,sha256=JyWB7haqulineO3aPEsXqx12OpantxfxueeofcvMjpk,2343
27
+ qlever/Qleverfiles/Qleverfile.wikidata,sha256=zVUXF75XJyK1h-J-7EjFemzmkSyoPtng1mNY3U7S78M,2061
28
28
  qlever/Qleverfiles/Qleverfile.wikipathways,sha256=UFEVLrtOBiSQfibBN9xc2wDXrnWcnx5f8PY9khcE6bc,1983
29
29
  qlever/Qleverfiles/Qleverfile.yago-4,sha256=hAS_2ZmC1zxNsKXip7t1F_iqu3CC-6O7v6HZhuFbnWY,1819
30
30
  qlever/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -33,7 +33,7 @@ qlever/commands/cache_stats.py,sha256=6JjueQstAqc8dNfgY8TP2EitFMxdUvCwrcyd7KUEb2
33
33
  qlever/commands/clear_cache.py,sha256=AnE1MOoj1ZexxrRT8FGeBLlv8rtQIVV4DP8VBn5-X-s,2843
34
34
  qlever/commands/example_queries.py,sha256=rtMOQw7cJe0Aia_1O7UyKcxHbz7ln9BSZYWUQI9OFA8,16389
35
35
  qlever/commands/get_data.py,sha256=f9kjZI3TKad6JHSuXWNkeoajmW8h0Sx8ShvjauDCtNo,1412
36
- qlever/commands/index.py,sha256=G4SPxJ1PW8KsLYl4OBV4rOLKSo-O3aR6nTT_0K6zAgU,10376
36
+ qlever/commands/index.py,sha256=y0HlFSTbRQSnXRz0fn3Mar9FqfnOtC2U1nu74N5IwCA,10842
37
37
  qlever/commands/index_stats.py,sha256=_BiUNBhmbYd9RPxrlm4HF0oENO6JmqnRiAkwkyOdN4U,11722
38
38
  qlever/commands/log.py,sha256=8Krt3MsTUDapYqVw1zUu5X15SF8mV97Uj0qKOWK8jXk,1861
39
39
  qlever/commands/query.py,sha256=_IDH-M8gKL_f1i5wzu0X452pZSUD0_qXl6bPXC85wX0,2750
@@ -44,9 +44,9 @@ qlever/commands/stop.py,sha256=TZs4bxKHvujlZAU8BZmFjA5eXSZNAa6EeNzvPpEZsuI,4139
44
44
  qlever/commands/system_info.py,sha256=SShsnEV7QROgdbABeJ6Wk4U_CNjqlYO1J5HrpNCVNMs,4615
45
45
  qlever/commands/ui.py,sha256=rZxIYHZr-PqgQKfvVwl8woDnTGR-sFc-f6cPjcORaOk,3611
46
46
  qlever/commands/warmup.py,sha256=WOZSxeV8U_F6pEEnAb6YybXLQMxZFTRJXs4BPHUhsmc,1030
47
- qlever-0.5.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
48
- qlever-0.5.11.dist-info/METADATA,sha256=Cy1xk-ZJLRR1r5LPwvAFwzuvJZwdTpDW8puG57Un-0Y,4583
49
- qlever-0.5.11.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
50
- qlever-0.5.11.dist-info/entry_points.txt,sha256=U_gbYYi0wwdsn884eb0XoOXfvhACOsxhlO330dZ9bi0,87
51
- qlever-0.5.11.dist-info/top_level.txt,sha256=kd3zsYqiFd0--Czh5XTVkfEq6XR-XgRFW35X0v0GT-c,7
52
- qlever-0.5.11.dist-info/RECORD,,
47
+ qlever-0.5.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
48
+ qlever-0.5.12.dist-info/METADATA,sha256=mm1muAfcKHTS4pN-xS3D0P4P57Z450Acifk2BckhAgc,4583
49
+ qlever-0.5.12.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
50
+ qlever-0.5.12.dist-info/entry_points.txt,sha256=U_gbYYi0wwdsn884eb0XoOXfvhACOsxhlO330dZ9bi0,87
51
+ qlever-0.5.12.dist-info/top_level.txt,sha256=kd3zsYqiFd0--Czh5XTVkfEq6XR-XgRFW35X0v0GT-c,7
52
+ qlever-0.5.12.dist-info/RECORD,,