qlever 0.5.8__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff shows the content changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of qlever might be problematic.

@@ -1,20 +1,24 @@
- # Qleverfile for DBLP, use with https://github.com/ad-freiburg/qlever-control
+ # Qleverfile for DBLP, use with the QLever CLI (`pip install qlever`)
  #
- # qlever get-data  # ~5 GB compressed, 1.3 B triples
- # qlever index     # ~30 min (on an AMD Ryzen 9 5900X)
- # qlever start     # ~1 sec
+ # qlever get-data  # ~1 min, ~5 GB compressed, 1.3 B triples
+ # qlever index     # ~30 min, ~20 GB RAM, ~25 GB index size on disk
+ # qlever start     # ~3 s, adjust MEMORY_FOR_QUERIES as needed
+ #
+ # Measured on an AMD Ryzen 9 5950X with 128 GB RAM and an NVMe SSD (25.10.2024)

  [data]
  NAME = dblp
- GET_DATA_URL = https://sparql.dblp.org/download/dblp_KG_with_associated_data.tar
- GET_DATA_CMD = (curl -LRC - -o dblp+citations.tar ${GET_DATA_URL} && tar -xf dblp+citations.tar) 2>&1 | tee ${NAME}.download-log.txt
+ DATA_TARFILE = dblp_KG_with_associated_data.tar
+ GET_DATA_URL = https://sparql.dblp.org/download/${DATA_TARFILE}
+ GET_DATA_CMD = (curl -LROC - ${GET_DATA_URL} && tar -xf ${DATA_TARFILE}) 2>&1 | tee ${NAME}.download-log.txt && rm -f ${DATA_TARFILE}
  VERSION = $$(date -r dblp.ttl.gz +"%d.%m.%Y %H:%M" || echo "NO_DATE")
  DESCRIPTION = DBLP computer science bibliography + citations from OpenCitations, data from ${GET_DATA_URL} (version ${VERSION})
+ FORMAT = ttl

  [index]
- INPUT_FILES = *.gz
- CAT_INPUT_FILES = zcat ${INPUT_FILES}
- SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "languages-internal": [], "prefixes-external": [""] }
+ INPUT_FILES = *.gz
+ MULTI_INPUT_JSON = $$(ls *.gz | awk 'BEGIN { printf "[ " } NR > 1 { printf ", " } { printf "{\"cmd\": \"zcat " $$0 "\"}" } END { printf "]" }')
+ SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 5000000, "prefixes-external": [""] }

  [server]
  PORT = 7015
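
The notable change in the DBLP Qleverfile is the switch from CAT_INPUT_FILES to MULTI_INPUT_JSON: instead of a single concatenated stream, the indexer now receives a JSON array with one input-command object per file, which it can process separately. The doubled `$$` is the Qleverfile escape for a literal `$`, so the awk one-liner runs when the value is evaluated. A sketch of what it expands to, assuming two hypothetical files citations.ttl.gz and dblp.ttl.gz in the working directory:

    $ ls *.gz | awk 'BEGIN { printf "[ " } NR > 1 { printf ", " } { printf "{\"cmd\": \"zcat " $0 "\"}" } END { printf "]" }'
    [ {"cmd": "zcat citations.ttl.gz"}, {"cmd": "zcat dblp.ttl.gz"}]
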
@@ -9,12 +9,12 @@
  [data]
  NAME = dblp-plus
  GET_DATA_CMD = wget -nc -O dblp.ttl.gz https://dblp.org/rdf/dblp.ttl.gz
- INDEX_DESCRIPTION = Publication data from https://dblp.org, with affiliations from https://www.wikidata.org and citations from https://opencitations.net
+ DESCRIPTION = Publication data from https://dblp.org, with affiliations from https://www.wikidata.org and citations from https://opencitations.net
  TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")

  [index]
  INPUT_FILES = dblp.ttl.gz affiliations.nt affiliations.additions.nt citations.nt
- CAT_INPUT_FILES = zcat -f ${RDF_FILES}
+ CAT_INPUT_FILES = zcat -f ${INPUT_FILES}
  SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [ "<https://w3id.org", "<https://doi.org", "<http://dx.doi.org" ] }
  TEXT_INDEX = from_literals
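
The dblp-plus hunk fixes a stale variable reference (${RDF_FILES} is never defined; ${INPUT_FILES} is). Note that `zcat -f` is what makes the mixed file list work: it decompresses the .gz input and passes the plain .nt files through unchanged. A quick sanity check with hypothetical files:

    $ printf '<s> <p> <o> .\n' > plain.nt
    $ printf '<s> <p> <o> .\n' | gzip > compressed.nt.gz
    $ zcat -f plain.nt compressed.nt.gz    # prints the same triple twice
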
 
@@ -31,7 +31,7 @@ SETTINGS_JSON = { "num-triples-per-batch": 1000000 }
  # URL parameter `access_token`. It should not be easily guessable, unless you
  # don't mind others getting privileged access to your server.
  [server]
- PORT =
+ PORT = 8888
  ACCESS_TOKEN =

  # Use SYSTEM = docker to run QLever inside a docker container; the Docker image
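
This hunk gives the template's [server] section a concrete default port. As the surrounding comments explain, ACCESS_TOKEN guards privileged operations and is passed as the URL parameter `access_token`. A hedged sketch of such a request, assuming a server on the new default port and a cache-clearing command (check the QLever documentation for the exact command names):

    curl "http://localhost:8888/?cmd=clear-cache-complete&access_token=YOUR_TOKEN"
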
@@ -13,13 +13,13 @@ TEXT_DESCRIPTION = Sentences from Wikipedia that mention at least one Freebase

  [index]
  INPUT_FILES = fbeasy.nt
- CAT_INPUT_FILES = cat ${RDF_FILES}
+ CAT_INPUT_FILES = cat ${INPUT_FILES}
  SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 10000000 }

  [server]
- PORT = 7003
- ACCESS_TOKEN = ${data:NAME}_12631403
- MEMORY_FOR_QUERIES = 5G
+ PORT = 7003
+ ACCESS_TOKEN = ${data:NAME}
+ MEMORY_FOR_QUERIES = 5G

  [runtime]
  SYSTEM = docker
@@ -12,12 +12,12 @@ DESCRIPTION = RDF data from ${DATA_URL}, latest (and final) version from 09.08.

  [index]
  INPUT_FILES = freebase-rdf-latest.gz
- CAT_INPUT_FILES = zcat ${RDF_FILES}
+ CAT_INPUT_FILES = zcat ${INPUT_FILES}
  SETTINGS_JSON = { "languages-internal": [ "en" ], "prefixes-external": ["<"], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": false, "num-triples-per-batch": 10000000 }

  [server]
  PORT = 7002
- ACCESS_TOKEN = ${data:NAME}_12631403
+ ACCESS_TOKEN = ${data:NAME}
  MEMORY_FOR_QUERIES = 10G

  [runtime]
@@ -28,7 +28,7 @@ ACCESS_TOKEN = ${data:NAME}
  MEMORY_FOR_QUERIES = 5G

  [runtime]
- SYSTEM = native
+ SYSTEM = docker
  IMAGE = docker.io/adfreiburg/qlever:latest

  [ui]
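
Switching from SYSTEM = native to SYSTEM = docker means the qlever commands run the QLever binaries inside the configured IMAGE rather than expecting them on the local PATH. If you use this configuration, pulling the image ahead of time avoids a pause on the first `qlever index`:

    docker pull docker.io/adfreiburg/qlever:latest
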
@@ -0,0 +1,30 @@
+ # Qleverfile for ORKG, use with the QLever CLI (`pip install qlever`)
+ #
+ # qlever get-data  # Get the dataset
+ # qlever index     # Build index data structures
+ # qlever start     # Start the server
+
+ [data]
+ NAME = orkg
+ GET_DATA_URL = https://orkg.org/api/rdf/dump
+ GET_DATA_CMD = curl -LR -o ${NAME}.ttl ${GET_DATA_URL} 2>&1 | tee ${NAME}.download-log.txt
+ VERSION = $$(date -r ${NAME}.ttl +%d.%m.%Y || echo "NO_DATE")
+ DESCRIPTION = The Open Research Knowledge Graph (ORKG) (data from ${GET_DATA_URL}, version ${VERSION})
+
+ [index]
+ INPUT_FILES = ${data:NAME}.ttl
+ CAT_INPUT_FILES = cat ${INPUT_FILES}
+ SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [""] }
+
+ [server]
+ PORT = 7053
+ ACCESS_TOKEN = ${data:NAME}
+ MEMORY_FOR_QUERIES = 10G
+ CACHE_MAX_SIZE = 5G
+
+ [runtime]
+ SYSTEM = docker
+ IMAGE = docker.io/adfreiburg/qlever:latest
+
+ [ui]
+ UI_CONFIG = orkg
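
The ORKG Qleverfile is new in this release and follows the standard workflow. Assuming the CLI is installed, setting the dataset up from scratch looks roughly like this (the setup-config step fetches the Qleverfile shown above):

    pip install qlever           # install the QLever CLI
    mkdir orkg && cd orkg
    qlever setup-config orkg     # fetch the Qleverfile for ORKG
    qlever get-data              # download orkg.ttl
    qlever index                 # build the index data structures
    qlever start                 # serve SPARQL on port 7053
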
@@ -11,7 +11,7 @@
  NAME = osm-planet
  DATA_URL = https://osm2rdf.cs.uni-freiburg.de/ttl/planet.osm.ttl.bz2
  GET_DATA_CMD = curl --location --fail --continue-at - --remote-time --output ${NAME}.ttl.bz2 ${DATA_URL}
- VERSION = $$(date -r ${NAME}.ttl.bz2 +"%d.%m.%Y")
+ VERSION = $$(date -r ${NAME}.ttl.bz2 +"%d.%m.%Y" || echo "NO_DATE")
  DESCRIPTION = OSM Planet, data from ${DATA_URL} version ${VERSION} (complete OSM data, with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects)

  [index]
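
The osm-planet change adds the same `|| echo "NO_DATE"` fallback used elsewhere: `date -r` prints a file's modification time and fails if the file does not exist, so without the fallback VERSION would be empty before the dump has been downloaded. Illustrated before running `qlever get-data`:

    $ date -r osm-planet.ttl.bz2 +"%d.%m.%Y" || echo "NO_DATE"
    NO_DATE    # date's error message goes to stderr
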
@@ -14,13 +14,13 @@ TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")

  [index]
  INPUT_FILES = vvz.ttl
- CAT_INPUT_FILES = cat ${FILE_NAMES}
- SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 1000000 }
+ CAT_INPUT_FILES = cat ${INPUT_FILES}
+ SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }
  TEXT_INDEX = from_literals

  [server]
  PORT = 7041
- ACCESS_TOKEN = ${data:NAME}_8736426534
+ ACCESS_TOKEN = ${data:NAME}
  MEMORY_FOR_QUERIES = 10G

  [runtime]
@@ -1,33 +1,45 @@
- # Qleverfile for Wikidata, use with qlever script (`pip install qlever`)
+ # Qleverfile for Wikidata, use with the QLever CLI (`pip install qlever`)
  #
- # qlever get-data  # downloads two .bz2 files of total size ~100 GB
- # qlever index     # takes ~4.5 hours and ~20 GB RAM (on an AMD Ryzen 9 5900X)
- # qlever start     # starts the server (takes a few seconds)
+ # qlever get-data  # ~7 hours, ~110 GB (compressed), ~20 billion triples
+ # qlever index     # ~5 hours, ~20 GB RAM, ~500 GB index size on disk
+ # qlever start     # a few seconds, adjust MEMORY_FOR_QUERIES as needed
+ #
+ # Adding a text index takes an additional ~2 hours and ~50 GB of disk space
+ #
+ # Measured on an AMD Ryzen 9 5950X with 128 GB RAM and an NVMe SSD (18.10.2024)

  [DEFAULT]
  NAME = wikidata

  [data]
- GET_DATA_URL = https://dumps.wikimedia.org/wikidatawiki/entities
- GET_DATA_CMD = curl -LRC - --remote-name-all ${GET_DATA_URL}/latest-all.ttl.bz2 ${GET_DATA_URL}/latest-lexemes.ttl.bz2 2>&1
- VERSION = $$(date -r latest-all.ttl.bz2 +%d.%m.%Y || echo "NO_DATE")
- DESCRIPTION = Full Wikidata dump from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2, version ${VERSION})
+ GET_DATA_URL = https://dumps.wikimedia.org/wikidatawiki/entities
+ GET_DATA_CMD = curl -LROC - ${GET_DATA_URL}/latest-all.ttl.bz2 ${GET_DATA_URL}/latest-lexemes.ttl.bz2 2>&1 | tee wikidata.download-log.txt && curl -sL ${GET_DATA_URL}/dcatap.rdf | docker run -i --rm -v $$(pwd):/data stain/jena riot --syntax=RDF/XML --output=NT /dev/stdin > dcatap.nt
+ DATE_WIKIDATA = $$(date -r latest-all.ttl.bz2 +%d.%m.%Y || echo "NO_DATE")
+ DATE_WIKIPEDIA = $$(date -r wikipedia-abstracts.nt +%d.%m.%Y || echo "NO_DATE")
+ DESCRIPTION = Full Wikidata dump from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2, version ${DATE_WIKIDATA}) + English Wikipedia abstracts (version ${DATE_WIKIPEDIA}, available via schema:description)
+ TEXT_DESCRIPTION = All English and German literals + all sentences from the English Wikipedia (version ${DATE_WIKIPEDIA}), use with FILTER KEYWORDS(...)

  [index]
- INPUT_FILES = latest-all.ttl.bz2 latest-lexemes.ttl.bz2
- CAT_INPUT_FILES = lbzcat -n 4 -f ${INPUT_FILES}
- SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 5000000 }
- STXXL_MEMORY = 10G
+ INPUT_FILES = latest-all.ttl.bz2 latest-lexemes.ttl.bz2 wikipedia-abstracts.nt dcatap.nt
+ MULTI_INPUT_JSON = [{ "cmd": "lbzcat -n 4 latest-all.ttl.bz2", "format": "ttl", "parallel": "true" },
+                    { "cmd": "lbzcat -n 1 latest-lexemes.ttl.bz2", "format": "ttl", "parallel": "false" },
+                    { "cmd": "cat wikipedia-abstracts.nt", "format": "nt", "parallel": "false" },
+                    { "cmd": "cat dcatap.nt", "format": "nt", "parallel": "false" }]
+ SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 5000000 }
+ STXXL_MEMORY = 10G
+ TEXT_INDEX = from_text_records

  [server]
- PORT = 7001
- ACCESS_TOKEN = ${data:NAME}
- MEMORY_FOR_QUERIES = 20G
- CACHE_MAX_SIZE = 10G
+ PORT = 7001
+ ACCESS_TOKEN = ${data:NAME}_3fz47hfzrbf64b
+ MEMORY_FOR_QUERIES = 40G
+ CACHE_MAX_SIZE = 30G
+ CACHE_MAX_SIZE_SINGLE_ENTRY = 5G
+ TIMEOUT = 300s

  [runtime]
  SYSTEM = docker
- IMAGE = docker.io/adfreiburg/qlever:latest
+ IMAGE = adfreiburg/qlever

  [ui]
  UI_CONFIG = wikidata
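
Besides the explicit MULTI_INPUT_JSON (one input command per file, with per-input format and parallelism flags), the Wikidata GET_DATA_CMD now also fetches the dataset's DCAT-AP metadata (dcatap.rdf, which is RDF/XML) and converts it to N-Triples with Apache Jena's riot tool, run via the stain/jena Docker image so that no local Jena installation is required. If you do have Jena on the PATH, that conversion step is roughly equivalent to:

    curl -sL https://dumps.wikimedia.org/wikidatawiki/entities/dcatap.rdf \
      | riot --syntax=RDF/XML --output=NT /dev/stdin > dcatap.nt
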
@@ -16,14 +16,14 @@ DESCRIPTION = "Full dump from https://yago-knowledge.org/downloads/yago-4, vers

  [index]
  INPUT_FILES = yago-wd-*.nt.gz
- CAT_INPUT_FILES = zcat ${FILE_NAMES}
+ CAT_INPUT_FILES = zcat ${INPUT_FILES}
  SETTINGS_JSON = { "languages-internal": ["en"], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
  STXXL_MEMORY = 10G

  [server]
- PORT = 9004
- ACCESS_TOKEN = ${DB}_2347348732
- MEMORY_FOR_QUERIES = 30G
+ PORT = 9004
+ ACCESS_TOKEN = ${data:NAME}
+ MEMORY_FOR_QUERIES = 30G

  [runtime]
  SYSTEM = docker