qlever 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


qlever/Qleverfiles/Qleverfile.dblp ADDED
@@ -0,0 +1,34 @@
+# Qleverfile for DBLP, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads .gz file of size ~3 GB (as of 31.07.2022)
+# qlever index # takes ~30 minutes and ~20 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server (takes around 2 minutes)
+#
+# Also builds a text index for fast keyword search in literals. Without that
+# (TEXT_INDEX not set), the index build takes only ~10 minutes.
+
+[data]
+NAME = dblp
+GET_DATA_URL = https://dblp.org/rdf/${index:INPUT_FILES}
+GET_DATA_CMD = curl -LO -C - ${GET_DATA_URL}
+DESCRIPTION = DBLP computer science bibliography, data from ${GET_DATA_URL}
+TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")
+
+[index]
+INPUT_FILES = dblp.ttl.gz
+CAT_INPUT_FILES = zcat ${INPUT_FILES}
+SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }
+TEXT_INDEX = from_literals
+
+[server]
+PORT = 7015
+ACCESS_TOKEN = ${data:NAME}_7643543846
+MEMORY_FOR_QUERIES = 30G
+CACHE_MAX_SIZE = 5G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = dblp
qlever/Qleverfiles/Qleverfile.dblp-plus ADDED
@@ -0,0 +1,33 @@
+# Qleverfile for DBLP Plus, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data downloads .gz file of size ~3 GB (as of 31.07.2022)
+# qlever index takes ~30 minutes and ~20 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start starts the server
+#
+# Also builds a text index for fast keyword search in literals.
+
+[data]
+NAME = dblp-plus
+GET_DATA_CMD = wget -nc -O dblp.ttl.gz https://dblp.org/rdf/dblp.ttl.gz
+INDEX_DESCRIPTION = Publication data from https://dblp.org, with affiliations from https://www.wikidata.org and citations from https://opencitations.net
+TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")
+
+[index]
+INPUT_FILES = dblp.ttl.gz affiliations.nt affiliations.additions.nt citations.nt
+CAT_INPUT_FILES = zcat -f ${INPUT_FILES}
+SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [ "<https://w3id.org", "<https://doi.org", "<http://dx.doi.org" ] }
+TEXT_INDEX = from_literals
+
+[server]
+PORT = 7027
+ACCESS_TOKEN = ${data:NAME}_169238202
+MEMORY_FOR_QUERIES = 20G
+CACHE_MAX_SIZE = 10G
+CACHE_MAX_SIZE_SINGLE_ENTRY = 2G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = dblp-plus
qlever/Qleverfiles/Qleverfile.default ADDED
@@ -0,0 +1,47 @@
+# Automatically created by the "qlever" script
+#
+# Modify as you see fit. Beware that some of the values below are executed as
+# commands by the script.
+#
+# If you have never seen a Qleverfile before, we recommend that you look at the
+# pre-filled Qleverfiles on http://qlever.cs.uni-freiburg.de/qlever-control/Qleverfiles
+# first to get some inspiration. Or execute `qlever setup-config
+# <config name>` with a config name of your choice.
+
+# As a minimum, each dataset needs a name. If you want `qlever get-data` to do
+# something meaningful, you need to define GET_DATA_CMD. If you want to use the
+# QLever UI, you should define DESCRIPTION (and if you have a text index,
+# also TEXT_DESCRIPTION).
+[data]
+NAME =
+# GET_DATA_CMD =
+# DESCRIPTION =
+# TEXT_DESCRIPTION =
+
+# CAT_INPUT_FILES produces the data that is piped into QLever's index builder.
+# Use SETTINGS_JSON for more advanced configuration settings (see the other
+# Qleverfiles for examples).
+[index]
+# INPUT_FILES =
+# CAT_INPUT_FILES = cat ${INPUT_FILES}
+# SETTINGS_JSON = {}
+
+# As a minimum, you need to specify the PORT, where QLever will listen for
+# SPARQL queries. If you want to send privileged commands to the server, you
+# need to specify an ACCESS_TOKEN (modify the random number below).
+[server]
+PORT = 7001
+# ACCESS_TOKEN = ${data:NAME}_1234567890
+
+# With SYSTEM = docker, the qlever script will download the docker image for
+# you and run QLever inside docker containers. With SYSTEM = native, you need
+# the QLever binaries in the PATH of your shell.
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+# The UI_CONFIG must be one of the slugs from http://qlever.cs.uni-freiburg.de
+# (see the dropdown menu on the top right, the slug is the last part of the URL).
+# In particular, this determines the example queries.
+[ui]
+UI_CONFIG = default
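
The `${...}` references used throughout these Qleverfiles (`${INPUT_FILES}` within a section, `${data:NAME}` across sections) follow the syntax of Python's configparser with ExtendedInterpolation, which is presumably how the qlever script reads them. A minimal, self-contained sketch of how such references resolve; the snippet is illustrative and not code from the package:

from configparser import ConfigParser, ExtendedInterpolation

# A stripped-down Qleverfile: ${INPUT_FILES} refers to a key in the same
# section, ${data:NAME} to the key NAME in the [data] section.
QLEVERFILE = """
[data]
NAME = olympics

[index]
INPUT_FILES = olympics.nt
CAT_INPUT_FILES = cat ${INPUT_FILES}

[server]
ACCESS_TOKEN = ${data:NAME}_1234567890
"""

config = ConfigParser(interpolation=ExtendedInterpolation())
config.read_string(QLEVERFILE)
print(config["index"]["CAT_INPUT_FILES"])  # cat olympics.nt
print(config["server"]["ACCESS_TOKEN"])    # olympics_1234567890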
qlever/Qleverfiles/Qleverfile.dnb ADDED
@@ -0,0 +1,37 @@
+# Qleverfile for DNB, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # takes ~ 10 min to download .nt.gz files of total size ~ 8 GB
+# qlever index # takes ~ 20 min and ~ 5 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server
+#
+# NOTE: https://data.dnb.de/opendata/ is rather confusing because of the many
+# files. This Qleverfile downloads all the datasets named "Gesamtabzug", except
+# bib_lds.nt.gz, which contains incorrectly formatted IRIs. The file
+# dnb-all_ldsprov.nt.gz contains invalid floating point literals; to ignore
+# them, compile QLever with TurtleParserBase::invalidLiteralsAreSkipped_ = true
+
+[data]
+NAME = dnb
+BASE_URL = https://data.dnb.de/opendata
+GET_DATA_CMD = curl -L -C - --remote-name-all ${BASE_URL}/authorities-gnd_lds.nt.gz ${BASE_URL}/dnb-all_lds.nt.gz ${BASE_URL}/dnb-all_ldsprov.nt.gz ${BASE_URL}/zdb_lds.nt.gz
+DESCRIPTION = DNB data from ${BASE_URL} (authorities-gnd_lds, dnb-all_lds, dnb-all_ldsprov, zdb_lds)
+TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?var, "...")
+
+[index]
+INPUT_FILES = *.nt.gz
+CAT_INPUT_FILES = zcat ${INPUT_FILES}
+SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 1000000 }
+TEXT_INDEX = from_literals
+
+[server]
+PORT = 7035
+ACCESS_TOKEN = ${data:NAME}_284732743
+MEMORY_FOR_QUERIES = 10G
+CACHE_MAX_SIZE = 2G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = dnb
qlever/Qleverfiles/Qleverfile.fbeasy ADDED
@@ -0,0 +1,29 @@
+# Qleverfile for Fbeasy, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads .nt file of size ~3 GB (as of 31.07.2022)
+# qlever index # takes ~10 minutes and ~10 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server
+
+[data]
+NAME = fbeasy
+DATA_URL = https://freebase-easy.cs.uni-freiburg.de
+GET_DATA_CMD = wget -nc ${DATA_URL}/dump/fbeasy.nt
+DESCRIPTION = RDF data from ${DATA_URL}, latest version from 18.07.2019
+TEXT_DESCRIPTION = Sentences from Wikipedia that mention at least one Freebase entity
+
+[index]
+INPUT_FILES = fbeasy.nt
+CAT_INPUT_FILES = cat ${INPUT_FILES}
+SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 10000000 }
+
+[server]
+PORT = 7003
+ACCESS_TOKEN = ${data:NAME}_12631403
+MEMORY_FOR_QUERIES = 5G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = fbeasy
qlever/Qleverfiles/Qleverfile.freebase ADDED
@@ -0,0 +1,28 @@
+# Qleverfile for Freebase, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads .gz file of size ~3 GB (as of 31.07.2022)
+# qlever index # takes ~4 hours and ~20 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server
+
+[data]
+NAME = freebase
+DATA_URL = http://commondatastorage.googleapis.com/freebase-public/rdf/freebase-rdf-latest.gz
+GET_DATA_CMD = wget -nc ${DATA_URL}
+DESCRIPTION = RDF data from ${DATA_URL}, latest (and final) version from 09.08.2015
+
+[index]
+INPUT_FILES = freebase-rdf-latest.gz
+CAT_INPUT_FILES = zcat ${INPUT_FILES}
+SETTINGS_JSON = { "languages-internal": [ "en" ], "prefixes-external": ["<"], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": false, "num-triples-per-batch": 10000000 }
+
+[server]
+PORT = 7002
+ACCESS_TOKEN = ${data:NAME}_12631403
+MEMORY_FOR_QUERIES = 10G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = freebase
qlever/Qleverfiles/Qleverfile.imdb ADDED
@@ -0,0 +1,35 @@
+# Qleverfile for IMDB, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads "basics" and "ratings" of size ~1 GB
+# qlever index # takes ~5 minutes and ~5 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server (takes a few seconds)
+#
+# Supports fast keyword search in literals (TEXT_INDEX = from_literals).
+
+[data]
+NAME = imdb
+IMDB_DATA_URL = https://datasets.imdbws.com
+GET_PREFIXES = echo "@prefix imdb: <https://www.imdb.com/> .\n"
+GET_IMDB_BASICS = FILE=title.basics.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ gsub("\"", "\\\"", $$3); printf "imdb:%s imdb:id \"%s\" ; imdb:type \"%s\" ; imdb:title \"%s\" .\n", $$1, $$1, $$2, $$3 }'; rm -f $${FILE}
+GET_IMDB_RATINGS = FILE=title.ratings.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ printf "imdb:%s imdb:averageRating %s ; imdb:numVotes %s .\n", $$1, $$2, $$3 }'; rm -f $${FILE}
+GET_DATA_CMD = (${GET_PREFIXES}; ${GET_IMDB_BASICS}; ${GET_IMDB_RATINGS}) > ${NAME}.ttl
+DESCRIPTION = RDF data derived from ${IMDB_DATA_URL}
+TEXT_DESCRIPTION = All literals, search with FILTER CONTAINS(?var, "...")
+
+[index]
+INPUT_FILES = ${data:NAME}.ttl
+CAT_INPUT_FILES = cat ${INPUT_FILES}
+SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }
+TEXT_INDEX = from_literals
+
+[server]
+PORT = 7029
+ACCESS_TOKEN = ${data:NAME}_1234567890
+MEMORY_FOR_QUERIES = 5G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = imdb
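
The GET_IMDB_BASICS one-liner above streams the gzipped TSV dump, drops the header line, escapes double quotes in the title column, and emits one group of Turtle triples per row. A rough Python equivalent of that per-row transformation, for readers unfamiliar with awk (illustrative only; the Qleverfile itself uses the awk pipeline):

import csv
import gzip

# Equivalent of: zcat title.basics.tsv.gz | sed 1d | awk -F'\t' '...'
with gzip.open("title.basics.tsv.gz", "rt", newline="") as f:
    rows = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
    next(rows)  # sed 1d: skip the TSV header line
    for row in rows:
        tconst, title_type = row[0], row[1]
        title = row[2].replace('"', '\\"')  # gsub("\"", "\\\"", $3)
        print(f'imdb:{tconst} imdb:id "{tconst}" ; '
              f'imdb:type "{title_type}" ; imdb:title "{title}" .')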
qlever/Qleverfiles/Qleverfile.olympics ADDED
@@ -0,0 +1,31 @@
+# Qleverfile for Olympics, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads .zip file of size 13 MB, uncompressed to 323 MB
+# qlever index # takes ~10 seconds and ~1 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server (instant)
+
+[data]
+NAME = olympics
+BASE_URL = https://github.com/wallscope/olympics-rdf
+GET_DATA_CMD = curl -sLo olympics.zip -C - ${BASE_URL}/raw/master/data/olympics-nt-nodup.zip && unzip -q -o olympics.zip && rm olympics.zip
+DESCRIPTION = 120 Years of Olympics, data from ${BASE_URL}
+TEXT_DESCRIPTION = All literals, search with FILTER CONTAINS(?var, "...")
+
+[index]
+INPUT_FILES = olympics.nt
+CAT_INPUT_FILES = cat ${INPUT_FILES}
+SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 100000 }
+
+[server]
+PORT = 7019
+ACCESS_TOKEN = ${data:NAME}_7643543846
+MEMORY_FOR_QUERIES = 5G
+CACHE_MAX_SIZE = 2G
+TIMEOUT = 30s
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = olympics
qlever/Qleverfiles/Qleverfile.osm-country ADDED
@@ -0,0 +1,42 @@
+# Qleverfile for OSM of some country, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads .pbf file from Geofabrik and builds .ttl.bz2 using osm2rdf
+# qlever index # for example Germany takes ~30 minutes and ~10 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server
+#
+# Make sure that osm2rdf is in your path. Set CONTINENT and COUNTRY such that
+# the link under GET_DATA_CMD exists (the names are usually the canonical
+# names). The time for osm2rdf is around the same as that for "qlever index".
+
+# Indexer settings
+CONTINENT = europe
+COUNTRY = switzerland
+DB = osm-${COUNTRY}
+PBF = ${DB}.pbf
+RDF_FILES = "${DB}.ttl.bz2"
+CAT_FILES = "bzcat ${RDF_FILES}"
+WITH_TEXT = false
+STXXL_MEMORY = 10G
+SETTINGS_JSON = '{ "prefixes-external": [ "\"LINESTRING(", "\"MULTIPOLYGON(", "\"POLYGON(" ], "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }'
+GET_DATA_CMD = "wget -nc -O ${PBF} https://download.geofabrik.de/${CONTINENT}/${COUNTRY}-latest.osm.pbf; rm -f ${DB}.*.bz2; ( time /local/data/osm2rdf/build/apps/osm2rdf ${PBF} -o ${DB}.ttl --cache . --write-geometric-relation-statistics ) 2>&1 | tee ${DB}.osm2rdf-log.txt; rm -f spatial-*"
+DESCRIPTION = "OSM ${COUNTRY^}, dump from $(ls -l --time-style=+%d.%m.%Y ${PBF} 2> /dev/null | cut -d' ' -f6) with ogc:contains"
+
+# Server settings
+HOSTNAME = $(hostname -f)
+SERVER_PORT = 7025
+ACCESS_TOKEN = ${DB}_%RANDOM%
+MEMORY_FOR_QUERIES = 20G
+CACHE_MAX_SIZE = 10G
+CACHE_MAX_SIZE_SINGLE_ENTRY = 5G
+CACHE_MAX_NUM_ENTRIES = 100
+
+# QLever binaries
+QLEVER_BIN_DIR = %QLEVER_BIN_DIR%
+USE_DOCKER = true
+QLEVER_DOCKER_IMAGE = adfreiburg/qlever
+QLEVER_DOCKER_CONTAINER = qlever.${DB}
+
+# QLever UI
+QLEVERUI_PORT = 7000
+QLEVERUI_DIR = qlever-ui
+QLEVERUI_CONFIG = osm
qlever/Qleverfiles/Qleverfile.osm-planet ADDED
@@ -0,0 +1,36 @@
+# Qleverfile for OSM Planet, use with the qlever script (pip install qlever)
+#
+# qlever get-data # takes ~50 mins to download .ttl.bz2 file of ~ 300 GB
+# qlever index # takes ~12 hours and ~20 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # takes a few seconds
+#
+# For the OSM data of a single country, do `qlever setup-config osm-country`
+# and edit the Qleverfile to specify the country.
+
+[data]
+NAME = osm-planet
+DATA_URL = https://osm2rdf.cs.uni-freiburg.de/ttl/planet.osm.ttl.bz2
+GET_DATA_CMD = curl --location --fail --continue-at - --remote-time --output ${NAME}.ttl.bz2 ${DATA_URL}
+VERSION = $$(date -r ${NAME}.ttl.bz2 +"%d.%m.%Y")
+DESCRIPTION = OSM Planet, data from ${DATA_URL} version ${VERSION} (complete OSM data, with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects)
+
+[index]
+INPUT_FILES = ${data:NAME}.ttl.bz2
+CAT_INPUT_FILES = lbzcat -f -n 2 ${INPUT_FILES}
+STXXL_MEMORY = 20G
+SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
+
+[server]
+PORT = 7007
+ACCESS_TOKEN = ${data:NAME}
+MEMORY_FOR_QUERIES = 90G
+CACHE_MAX_SIZE = 40G
+CACHE_MAX_SIZE_SINGLE_ENTRY = 30G
+TIMEOUT = 300s
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = osm-planet
qlever/Qleverfiles/Qleverfile.pubchem ADDED
@@ -0,0 +1,66 @@
+# Qleverfile for PubChem, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads .gz files of total size 114 GB; see NOTES 2, 3, 4
+# qlever index # takes ~5 hours and ~20 GB RAM on an AMD Ryzen 9 5900X
+# qlever start # starts the server (a few seconds)
+#
+# IMPORTANT NOTES:
+#
+# NOTE 1: The SPARQL endpoint at https://qlever.cs.uni-freiburg.de/pubchem also
+# contains data from the following ontologies, which are very useful for
+# resolving names of IRIs like `sio:SIO_000008` or `obo:IAO_0000412`, but which
+# are not part of the PubChem RDF data. For the corresponding URLs, see
+# https://github.com/ad-freiburg/qlever/issues/711#issuecomment-1200479401 .
+#
+# bao bfo biopax-level3 chebi cheminf cito dublin_core_terms fabio go iao ncit
+# obi pr ro sio skos so uo
+#
+# NOTE 2: The robots.txt file from https://ftp.ncbi.nlm.nih.gov currently
+# disallows downloading the PubChem RDF data using `wget --recursive` as in the
+# GET_DATA_CMD below. As a workaround, you can write a simple Python script
+# (using `BeautifulSoup` and `urllib.parse`) to scrape the URLs from the HTML
+# pages and download the files individually (a sketch follows this Qleverfile).
+# This was done for the latest version of https://qlever.cs.uni-freiburg.de/pubchem .
+#
+# NOTE 3: Many of the TTL files have generic prefix definitions in the middle
+# of the file, like @prefix ns23: <http://identifiers.org/biocyc/ARACYC:> .
+# See https://github.com/ad-freiburg/qlever/issues/711#issuecomment-1197113953
+# This is allowed by the standard, but VERY unusual. For use with QLever,
+# convert the TTL files to NT before indexing, see GET_DATA_CMD below.
+#
+# NOTE 4: Many of the files (TTL as well as NT) contain invalid IRIs because
+# spaces and braces are not properly escaped. Here is a simple awk-based script
+# to percent-encode spaces and braces in all IRIs in the NT files:
+#
+# for NTGZ in nt.${DATE}/*.nt.gz; do echo "zcat $NTGZ | sed 's/> />\t/1; s/> />\t/1; s/ \.\$/\t./' | awk 'BEGIN{FS=OFS=\"\t\"} {for (i = 1; i <= 3; i++) if (\$i ~ /^<.*>\$/) { gsub(/ /, \"%20\", \$i); gsub(/\[/, \"%5B\", \$i); gsub(/\]/, \"%5D\", \$i); gsub(/{/, \"%7B\", \$i); gsub(/}/, \"%7D\", \$i); } print }' | sed 's/\t/ /g' | gzip -c > nt.${DATE}.FIXED/$(basename $NTGZ)"; done > fix-nt.commands.txt
+# cat fix-nt.commands.txt | parallel
+
+
+[DEFAULT]
+NAME = pubchem
+DATE = 2024-02-03
+
+[data]
+GET_DATA_URL = ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF
+MAKE_GET_DATA_CMD = curl -s ${GET_DATA_URL}/void.ttl | grep -oP '${GET_DATA_URL}/.*?\.ttl\.gz' | grep -v "nbr[23]d" | while read URL; do echo "echo \"Processing $$URL ...\"; curl --silent --remote-time --output ttl.${DATE}/$$(basename $$URL) $$URL && docker run --rm -v $$(pwd)/ttl.${DATE}:/data stain/jena turtle --output=NT /data/$$(basename $$URL) | sed 's/> />\t/1; s/> />\t/1; s/ \.\$$/\t./' | awk 'BEGIN{FS=OFS=\"\t\"} {for (i = 1; i <= 3; i++) if (\$$i ~ /^<.*>\$$/) { gsub(/ /, \"%20\", \$$i); gsub(/\[/, \"%5B\", \$$i); gsub(/\]/, \"%5D\", \$$i); gsub(/{/, \"%7B\", \$$i); gsub(/}/, \"%7D\", \$$i); } print }' | sed 's/\t/ /g' | gzip -c > nt.${DATE}/$$(basename -s .ttl.gz $$URL).nt.gz"; done > pubchem.get-data-cmds.txt
+GET_DATA_CMD = mkdir -p ttl.${DATE} && mkdir -p nt.${DATE} && ${MAKE_GET_DATA_CMD} && cat pubchem.get-data-cmds.txt | parallel --line-buffer
+DESCRIPTION = PubChem RDF from ${GET_DATA_URL}, version ${DATE} (all folders except nbr2d and nbr3d)
+
+[index]
+INPUT_FILES = pubchem.additional-ontologies.nt.gz nt.${DATE}/*.nt.gz
+CAT_INPUT_FILES = zcat ${INPUT_FILES}
+SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }
+STXXL_MEMORY = 10G
+
+[server]
+PORT = 7023
+ACCESS_TOKEN = ${NAME}_310129823
+MEMORY_FOR_QUERIES = 20G
+TIMEOUT = 120s
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = pubchem
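
NOTE 2 above suggests scraping the download URLs instead of `wget --recursive`. A hedged sketch of such a script, assuming the PubChem RDF directories are plain HTML listings with anchor tags (the BASE_URL subfolder and page layout are assumptions, not taken from the package):

import urllib.request
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Collect all .ttl.gz links from one directory listing and download them.
# BASE_URL is a placeholder; point it at the directory you want to mirror.
BASE_URL = "https://ftp.ncbi.nlm.nih.gov/pubchem/RDF/compound/general/"

html = urllib.request.urlopen(BASE_URL).read()
soup = BeautifulSoup(html, "html.parser")
urls = [urljoin(BASE_URL, a["href"])
        for a in soup.find_all("a", href=True)
        if a["href"].endswith(".ttl.gz")]
for url in urls:
    filename = url.rsplit("/", 1)[-1]
    print("Downloading", url)
    urllib.request.urlretrieve(url, filename)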
qlever/Qleverfiles/Qleverfile.scientists ADDED
@@ -0,0 +1,39 @@
+# Qleverfile for Scientists, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # download .zip file of size 79 MB, uncompressed to 318 MB
+# qlever index # takes ~20 seconds and ~1 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server (instant)
+#
+# Also builds a text index for keyword search on the literals AND keyword search
+# in Wikipedia sentences linked to the RDF data; see TEXT_DESCRIPTION below.
+
+# Indexer settings
+DB = scientists
+RDF_FILES = "${DB}.nt"
+CAT_FILES = "cat ${RDF_FILES}"
+WITH_TEXT_INDEX = from_text_records_and_literals
+STXXL_MEMORY = 1G
+SETTINGS_JSON = '{ "ascii-prefixes-only": true, "num-triples-per-batch": 100000 }'
+GET_DATA_CMD = "wget https://github.com/ad-freiburg/qlever/raw/master/e2e/scientist-collection.zip && unzip -j scientist-collection.zip && rm -f scientist-collection.zip"
+INDEX_DESCRIPTION = "Scientist collection from QLever's end-to-end test, see https://github.com/ad-freiburg/qlever/tree/master/e2e"
+TEXT_DESCRIPTION = "Literals (use FILTER CONTAINS) and Wikipedia articles (use ql:contains-entity and ql:contains-word)"
+
+# Server settings
+HOSTNAME = $(hostname -f)
+SERVER_PORT = 7020
+ACCESS_TOKEN = ${DB}_%RANDOM%
+MEMORY_FOR_QUERIES = 5G
+CACHE_MAX_SIZE = 2G
+CACHE_MAX_SIZE_SINGLE_ENTRY = 1G
+CACHE_MAX_NUM_ENTRIES = 100
+
+# QLever binaries
+QLEVER_BIN_DIR = %QLEVER_BIN_DIR%
+USE_DOCKER = true
+QLEVER_DOCKER_IMAGE = adfreiburg/qlever
+QLEVER_DOCKER_CONTAINER = qlever.scientists
+
+# QLever UI
+QLEVERUI_PORT = 7000
+QLEVERUI_DIR = qlever-ui
+QLEVERUI_CONFIG = default
qlever/Qleverfiles/Qleverfile.uniprot ADDED
@@ -0,0 +1,41 @@
+# Qleverfile for UniProt, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # download RDFXML and convert to NT (around 1 TB each)
+# qlever index # takes ~ 1.5 days and ~40 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server (takes a few seconds)
+#
+# Install packages: sudo apt install -y libxml2-utils parallel xz-utils pv
+# Install manually: Apache Jena binaries (https://dlcdn.apache.org/jena/binaries)
+#
+# Set DATE to the date of the latest release
+#
+# IMPORTANT: Build on SSD, disk space required: ~ 10 TB. For running the server,
+# the uniprot.index.???.meta files can be on HDD.
+
+[data]
+NAME = uniprot
+DATE = 2024-01-24
+DOWNLOAD_URL = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf
+GET_RDFXML_CMD = mkdir -p rdf.${DATE} && curl -s ${DOWNLOAD_URL}/RELEASE.meta4 | sed "s/<metalink.*/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" - | while read URL; do wget --no-verbose -P rdf.${DATE} $$URL 2>&1 | tee -a uniprot.download-log; done
+RDFXML2NT_CMD = mkdir -p nt.${DATE} && for RDFXML in rdf.${DATE}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=nt 2> /dev/null | xz -c > nt.${DATE}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/nt.xz/') && echo 'DONE converting $$RDFXML'"; done | parallel
+GET_DATA_CMD = rdfxml --help && date > ${NAME}.get-data.begin-date && ${GET_RDFXML_CMD} && ${RDFXML2NT_CMD} && date > ${NAME}.get-data.end-date
+DESCRIPTION = Complete UniProt data from ${DOWNLOAD_URL}, version ${DATE}
+
+[index]
+INPUT_FILES = nt.${data:DATE}/*.nt.xz
+CAT_INPUT_FILES = parallel --tmpdir . -j 4 'xzcat -f {}' ::: nt.${data:DATE}/*.nt.xz | pv -q -B 5G
+SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 }
+STXXL_MEMORY = 60G
+
+[server]
+PORT = 7018
+ACCESS_TOKEN = ${data:NAME}_1369924040
+MEMORY_FOR_QUERIES = 20G
+CACHE_MAX_SIZE = 10G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = uniprot
qlever/Qleverfiles/Qleverfile.vvz ADDED
@@ -0,0 +1,31 @@
+# Qleverfile for VVZ, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # this requires a separate internal tool
+# qlever index # builds the index (takes a few seconds)
+# qlever start # starts the server (takes a few seconds)
+#
+# Also builds a text index for fast keyword search in literals.
+
+[data]
+NAME = vvz
+GET_DATA_CMD = echo "This requires a separate tool"
+DESCRIPTION = VVZ Uni Freiburg, selected faculties
+TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")
+
+[index]
+INPUT_FILES = vvz.ttl
+CAT_INPUT_FILES = cat ${INPUT_FILES}
+SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 1000000 }
+TEXT_INDEX = from_literals
+
+[server]
+PORT = 7041
+ACCESS_TOKEN = ${data:NAME}_8736426534
+MEMORY_FOR_QUERIES = 10G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = vvz
qlever/Qleverfiles/Qleverfile.wikidata ADDED
@@ -0,0 +1,30 @@
+# Qleverfile for Wikidata, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data downloads two .bz2 files of total size ~100 GB
+# qlever index takes ~7 hours and ~40 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start starts the server (takes around 30 seconds)
+
+[data]
+NAME = wikidata
+GET_DATA_URL = https://dumps.wikimedia.org/wikidatawiki/entities
+GET_DATA_CMD = curl -LO -C - ${GET_DATA_URL}/latest-all.ttl.bz2 ${GET_DATA_URL}/latest-lexemes.ttl.bz2
+DESCRIPTION = "Full Wikidata dump from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2)"
+
+[index]
+INPUT_FILES = latest-lexemes.ttl.bz2 latest-all.ttl.bz2
+CAT_INPUT_FILES = bzcat ${INPUT_FILES}
+SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
+STXXL_MEMORY = 10G
+
+[server]
+PORT = 7001
+ACCESS_TOKEN = ${data:NAME}_372483264
+MEMORY_FOR_QUERIES = 20G
+CACHE_MAX_SIZE = 10G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = wikidata
1
+ # Qleverfile for WikiPathways, use with https://github.com/ad-freiburg/qlever-control
2
+ #
3
+ # qlever get-data # downloads .gz file of size ~100 MB (as of 24.02.2024)
4
+ # qlever index # takes ~20 seconds and little RAM (on an AMD Ryzen 9 5900X)
5
+ # qlever start # starts the server (takes around 2 minutes)
6
+ #
7
+ # Limitations: does not include the ontologies (WP, GPML, ChEBI, PW, CLO, ...) yet
8
+
9
+ [data]
10
+ NAME = wikipathways
11
+ RELEASE = 20231210
12
+ GET_DATA_URL = https://data.wikipathways.org/${RELEASE}/rdf
13
+ GET_DATA_CMD = wget -O wikipathways-rdf-void.ttl ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-void.ttl && \
14
+ wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-wp.zip && \
15
+ unzip -qq -c wikipathways-${RELEASE}-rdf-wp.zip -x wp/wpOntology.ttl > wikipathways-rdf-wp.ttl && \
16
+ wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-gpml.zip &&
17
+ unzip -qq -c wikipathways-${RELEASE}-rdf-gpml.zip -x gpml/gpmlOntology.ttl > wikipathways-rdf-gpml.ttl && \
18
+ wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-authors.zip && \
19
+ unzip -qq -c wikipathways-${RELEASE}-rdf-authors.zip > wikipathways-rdf-authors.ttl && \
20
+ cat wikipathways-rdf-*.ttl | grep ^@prefix | tr -s ' ' | sort -u > ${NAME}.prefix-definitions
21
+ DESCRIPTION = WikiPathways RDF, from ${GET_DATA_URL}
22
+ TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")
23
+
24
+ [index]
25
+ INPUT_FILES = ${data:NAME}.prefix-definitions wikipathways-rdf-wp.ttl wikipathways-rdf-gpml.ttl wikipathways-rdf-void.ttl wikipathways-rdf-authors.ttl
26
+ CAT_INPUT_FILES = cat ${FILE_NAMES}
27
+ SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [""] }
28
+ TEXT_INDEX = from_literals
29
+
30
+ [server]
31
+ PORT = 7040
32
+ ACCESS_TOKEN = ${data:NAME}_7643543846
33
+ MEMORY_FOR_QUERIES = 5G
34
+
35
+ [runtime]
36
+ SYSTEM = docker
37
+ IMAGE = docker.io/adfreiburg/qlever:latest
38
+
39
+ [ui]
40
+ UI_CONFIG = wikipathways
qlever/Qleverfiles/Qleverfile.yago-4 ADDED
@@ -0,0 +1,33 @@
+# Qleverfile for YAGO 4, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads 8 .nt.gz files of total size ~60 GB (as of 12.03.2020)
+# qlever index # takes ~4 hours and ~10 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server
+
+# NOTE concerning GET_DATA_CMD: The triples from wd-annotated-facts are
+# contained in wd-facts. The "full types" are the YAGO types, the "simple
+# types" are the schema.org types. They don't interfere with each other because
+# they have distinct prefixes.
+
+[data]
+NAME = yago-4
+GET_DATA_CMD = curl --location --continue-at - --remote-name-all https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-class.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-facts.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-full-types.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-labels.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-sameAs.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-schema.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-shapes.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-simple-types.nt.gz
+DESCRIPTION = "Full dump from https://yago-knowledge.org/downloads/yago-4, version 12.03.2020"
+
+[index]
+INPUT_FILES = yago-wd-*.nt.gz
+CAT_INPUT_FILES = zcat ${INPUT_FILES}
+SETTINGS_JSON = { "languages-internal": ["en"], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
+STXXL_MEMORY = 10G
+
+[server]
+PORT = 9004
+ACCESS_TOKEN = ${data:NAME}_2347348732
+MEMORY_FOR_QUERIES = 30G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = yago-4
qlever/commands/example_queries.py CHANGED
@@ -4,13 +4,15 @@ import re
 import shlex
 import subprocess
 import time
+import traceback
+from pathlib import Path
 
 from termcolor import colored
 
 from qlever.command import QleverCommand
 from qlever.commands.clear_cache import ClearCacheCommand
 from qlever.log import log, mute_log
-from qlever.util import run_command
+from qlever.util import run_command, run_curl_command
 
 
 class ExampleQueriesCommand(QleverCommand):
@@ -57,12 +59,27 @@ class ExampleQueriesCommand(QleverCommand):
                                "or just compute the size of the result")
         subparser.add_argument("--limit", type=int,
                                help="Limit on the number of results")
+        subparser.add_argument("--accept", type=str,
+                               choices=["text/tab-separated-values",
+                                        "application/sparql-results+json"],
+                               default="text/tab-separated-values",
+                               help="Accept header for the SPARQL query")
         subparser.add_argument("--clear-cache",
                                choices=["yes", "no"],
                                default="yes",
                                help="Clear the cache before each query")
 
     def execute(self, args) -> bool:
+        # If `args.accept` is `application/sparql-results+json`, we need `jq`.
+        if args.accept == "application/sparql-results+json":
+            try:
+                subprocess.run("jq --version", shell=True, check=True,
+                               stdout=subprocess.DEVNULL,
+                               stderr=subprocess.DEVNULL)
+            except Exception as e:
+                log.error(f"Please install `jq` for {args.accept} ({e})")
+                return False
+
         # Handle shortcuts for SPARQL endpoint.
         if args.sparql_endpoint_preset in self.presets:
             args.sparql_endpoint = self.presets[args.sparql_endpoint_preset]
@@ -92,6 +109,7 @@ class ExampleQueriesCommand(QleverCommand):
                            else f"localhost:{args.port}")
         self.show(f"Obtain queries via: {get_queries_cmd}\n"
                   f"SPARQL endpoint: {sparql_endpoint}\n"
+                  f"Accept header: {args.accept}\n"
                   f"Clear cache before each query:"
                   f" {args.clear_cache.upper()}\n"
                   f"Download result for each query or just count:"
@@ -103,7 +121,8 @@ class ExampleQueriesCommand(QleverCommand):
 
         # Get the example queries.
         try:
-            example_query_lines = run_command(get_queries_cmd, return_output=True)
+            example_query_lines = run_command(get_queries_cmd,
+                                              return_output=True)
             if len(example_query_lines) == 0:
                 log.error("No example queries matching the criteria found")
                 return False
@@ -114,9 +133,10 @@ class ExampleQueriesCommand(QleverCommand):
 
         # Launch the queries one after the other and for each print: the
         # description, the result size, and the query processing time.
-        count = 0
        total_time_seconds = 0.0
        total_result_size = 0
+        count_succeeded = 0
+        count_failed = 0
         for example_query_line in example_query_lines:
             # Parse description and query.
             description, query = example_query_line.split("\t")
@@ -155,44 +175,93 @@ class ExampleQueriesCommand(QleverCommand):
                           + f" }} LIMIT {args.limit}"
 
             # Launch query.
-            query_cmd = (f"curl -sv {sparql_endpoint}"
-                         f" -H \"Accept: text/tab-separated-values\""
-                         f" --data-urlencode query={shlex.quote(query)}")
-            if args.download_or_count == "count":
-                query_cmd += " | sed 1d"
-            else:
-                query_cmd += " | sed 1d | wc -l"
             try:
-                log.debug(query_cmd)
+                curl_cmd = (f"curl -s {sparql_endpoint}"
+                            f" -w \"HTTP code: %{{http_code}}\\n\""
+                            f" -H \"Accept: {args.accept}\""
+                            f" --data-urlencode query={shlex.quote(query)}")
+                log.debug(curl_cmd)
+                result_file = (f"qlever.example_queries.result."
+                               f"{abs(hash(curl_cmd))}.tmp")
                 start_time = time.time()
-                result_size = run_command(query_cmd, return_output=True)
-                result_size = int(result_size.strip())
+                http_code = run_curl_command(sparql_endpoint,
                                             headers={"Accept": args.accept},
                                             params={"query": query},
                                             result_file=result_file).strip()
+                if http_code != "200":
+                    raise Exception(f"HTTP code {http_code}"
+                                    f" {Path(result_file).read_text()}")
                 time_seconds = time.time() - start_time
-                time_string = f"{time_seconds:.2f}"
-                result_string = f"{result_size:>14,}"
+                error_msg = None
             except Exception as e:
-                time_seconds = 0.0
-                time_string = "---"
-                result_size = 0
-                result_string = colored(f" FAILED {e}", "red")
+                if args.log_level == "DEBUG":
+                    traceback.print_exc()
+                error_msg = re.sub(r"\s+", " ", str(e))
+
+            # Get result size (via the command line, in order to avoid loading
+            # a potentially large JSON file into Python, which is slow).
+            if error_msg is None:
+                try:
+                    if args.download_or_count == "count":
+                        if args.accept == "text/tab-separated-values":
+                            result_size = run_command(
+                                f"sed 1d {result_file}",
+                                return_output=True)
+                        else:
+                            result_size = run_command(
+                                f"jq -r \".results.bindings[0]"
+                                f" | to_entries[0].value.value"
+                                f" | tonumber\" {result_file}",
+                                return_output=True)
+                    else:
+                        if args.accept == "text/tab-separated-values":
+                            result_size = run_command(
+                                f"sed 1d {result_file} | wc -l",
+                                return_output=True)
+                        else:
+                            result_size = run_command(
+                                f"jq -r \".results.bindings | length\""
+                                f" {result_file}",
+                                return_output=True)
+                    result_size = int(result_size)
+                except Exception as e:
+                    error_msg = str(e)
 
             # Print description, time, result in tabular form.
             if (len(description) > 60):
                 description = description[:57] + "..."
-            log.info(f"{description:<60} {time_string:>6} s "
-                     f"{result_string}")
-            count += 1
-            total_time_seconds += time_seconds
-            total_result_size += result_size
+            if error_msg is None:
+                log.info(f"{description:<60} {time_seconds:6.2f} s "
+                         f"{result_size:14,}")
+                count_succeeded += 1
+                total_time_seconds += time_seconds
+                total_result_size += result_size
+            else:
+                count_failed += 1
+                if (len(error_msg) > 60) and args.log_level != "DEBUG":
+                    error_msg = error_msg[:57] + "..."
+                log.error(f"{description:<60} failed "
+                          f"{colored(error_msg, 'red')}")
 
         # Print total time.
         log.info("")
-        description = (f"TOTAL for {count} "
-                       f"{'query' if count == 1 else 'queries'}")
-        log.info(f"{description:<60} {total_time_seconds:6.2f} s "
-                 f"{total_result_size:>14,}")
-        description = (f"AVERAGE for {count} "
-                       f"{'query' if count == 1 else 'queries'}")
-        log.info(f"{description:<60} {total_time_seconds / count:6.2f} s "
-                 f"{round(total_result_size / count):>14,}")
+        if count_succeeded > 0:
+            query_or_queries = "query" if count_succeeded == 1 else "queries"
+            description = (f"TOTAL for {count_succeeded} {query_or_queries}")
+            log.info(f"{description:<60} "
+                     f"{total_time_seconds:6.2f} s "
+                     f"{total_result_size:>14,}")
+            description = (f"AVERAGE for {count_succeeded} {query_or_queries}")
+            log.info(f"{description:<60} "
+                     f"{total_time_seconds / count_succeeded:6.2f} s "
+                     f"{round(total_result_size / count_succeeded):>14,}")
+        else:
+            if count_failed == 1:
+                log.info(colored("One query failed", "red"))
+            elif count_failed > 1:
+                log.info(colored("All queries failed", "red"))
+
+        # Return success (has nothing to do with how many queries failed).
+        if args.log_level != "DEBUG":
+            Path(result_file).unlink(missing_ok=True)
         return True
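
The two jq filters in the new code operate on the standard SPARQL JSON results layout. The sketch below shows the same two extractions in Python on a made-up result, to clarify what the filters compute (the sample data is invented for illustration):

import json

# A minimal result as returned with Accept: application/sparql-results+json.
result = json.loads("""
{ "head": { "vars": ["count"] },
  "results": { "bindings": [
    { "count": { "type": "literal", "value": "42" } }
  ] } }
""")

# jq -r ".results.bindings | length" -> number of result rows (download mode).
num_rows = len(result["results"]["bindings"])

# jq -r ".results.bindings[0] | to_entries[0].value.value | tonumber"
# -> the single value of a COUNT query (count mode).
first_binding = result["results"]["bindings"][0]
count = int(next(iter(first_binding.values()))["value"])

print(num_rows, count)  # 1 42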
qlever/commands/index_stats.py CHANGED
@@ -71,14 +71,17 @@ class IndexStatsCommand(QleverCommand):
 
         # Helper function that finds the next line matching the given `regex`,
         # starting from `current_line`, and extracts the time. Returns a tuple
-        # of the time and the regex match object. If a match is found,
-        # `current_line` is updated to the line after the match. Otherwise,
-        # `current_line` will be one beyond the last line, unless
-        # `line_is_optional` is true, in which case it will be the same as when
-        # the function was entered.
+        # of the time and the regex match object.
+        #
+        # If `update_current_line` is `False`, then `current_line` will not be
+        # updated by this call.
+        #
+        # Otherwise, and this is the default behavior, `current_line` will be
+        # updated to the line after the first match, or one beyond the last
+        # line if no match is found.
         current_line = 0
 
-        def find_next_line(regex, line_is_optional=False):
+        def find_next_line(regex, update_current_line=True):
             nonlocal lines
             nonlocal current_line
             current_line_backup = current_line
@@ -99,7 +102,7 @@ class IndexStatsCommand(QleverCommand):
                                   f"\"{timestamp_regex}\" from line "
                                   f" \"{line.rstrip()}\" ({e})")
             # If we get here, we did not find a matching line.
-            if line_is_optional:
+            if not update_current_line:
                 current_line = current_line_backup
             return None, None
 
@@ -110,24 +113,34 @@ class IndexStatsCommand(QleverCommand):
         convert_begin, _ = find_next_line(r"INFO:\s*Converting triples")
         perm_begin_and_info = []
         while True:
-            perm_begin, _ = find_next_line(r"INFO:\s*Creating a pair", True)
+            # Find the next line that starts a permutation.
+            #
+            # NOTE: Should work for the old and new format of the index log
+            # file (old format: "Creating a pair" + names of permutations in
+            # line "Writing meta data for ..."; new format: name of
+            # permutations already in line "Creating permutations ...").
+            perm_begin, _ = find_next_line(r"INFO:\s*Creating a pair",
+                                           update_current_line=False)
             if perm_begin is None:
+                perm_begin, perm_info = find_next_line(
+                    r"INFO:\s*Creating permutations ([A-Z]+ and [A-Z]+)",
+                    update_current_line=False)
+            else:
+                _, perm_info = find_next_line(
+                    r"INFO:\s*Writing meta data for ([A-Z]+ and [A-Z]+)",
+                    update_current_line=False)
+            if perm_info is None:
                 break
-            _, perm_info = find_next_line(r"INFO:\s*Writing meta data for"
-                                          r" ([A-Z]+ and [A-Z]+)", True)
-            # if perm_info is None:
-            #     break
             perm_begin_and_info.append((perm_begin, perm_info))
         convert_end = (perm_begin_and_info[0][0] if
                        len(perm_begin_and_info) > 0 else None)
         normal_end, _ = find_next_line(r"INFO:\s*Index build completed")
-        text_begin, _ = find_next_line(r"INFO:\s*Adding text index", True)
-        text_end, _ = find_next_line(r"INFO:\s*Text index build comp", True)
+        text_begin, _ = find_next_line(r"INFO:\s*Adding text index",
+                                       update_current_line=False)
+        text_end, _ = find_next_line(r"INFO:\s*Text index build comp",
+                                     update_current_line=False)
         if args.ignore_text_index:
             text_begin = text_end = None
-        # print("DEBUG:", len(perm_begin_and_info), perm_begin_and_info)
-        # print("DEBUG:", overall_begin)
-        # print("DEBUG:", normal_end)
 
         # Check whether at least the first phase is done.
         if overall_begin is None:
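
The rewritten loop first looks for the old "Creating a pair" marker and, failing that, for the new "Creating permutations ..." marker, so both log formats are handled. A quick standalone check of the two capturing regexes; the log lines below are made up around the prefixes matched in the code above:

import re

OLD_META_LINE = "2024-01-01 12:10:00 INFO: Writing meta data for PSO and POS"
NEW_PERM_LINE = "2024-01-01 12:00:00 INFO: Creating permutations PSO and POS"

meta_regex = re.compile(r"INFO:\s*Writing meta data for ([A-Z]+ and [A-Z]+)")
perm_regex = re.compile(r"INFO:\s*Creating permutations ([A-Z]+ and [A-Z]+)")

print(meta_regex.search(OLD_META_LINE).group(1))  # PSO and POS (old format)
print(perm_regex.search(NEW_PERM_LINE).group(1))  # PSO and POS (new format)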
qlever/config.py CHANGED
@@ -4,6 +4,7 @@ import argparse
 import os
 import traceback
 from pathlib import Path
+from importlib.metadata import version
 
 import argcomplete
 
@@ -180,6 +181,8 @@ class QleverConfig:
         # are defined in the modules in `qlever/commands`. In `__init__.py`
         # an object of each class is created and stored in `command_objects`.
         parser = argparse.ArgumentParser()
+        parser.add_argument("--version", action="version",
+                            version=f"%(prog)s {version('qlever')}")
         add_qleverfile_option(parser)
         subparsers = parser.add_subparsers(dest='command')
         subparsers.required = True
qlever/util.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
-import random
 import re
+import secrets
 import shlex
 import shutil
 import string
@@ -31,7 +31,7 @@ def run_command(cmd: str, return_output: bool = False,
                 show_output: bool = False) -> Optional[str]:
     """
     Run the given command and throw an exception if the exit code is non-zero.
-    If `get_output` is `True`, return what the command wrote to `stdout`.
+    If `return_output` is `True`, return what the command wrote to `stdout`.
 
     NOTE: The `set -o pipefail` ensures that the exit code of the command is
     non-zero if any part of the pipeline fails (not just the last part).
@@ -68,6 +68,45 @@ def run_command(cmd: str, return_output: bool = False,
     return result.stdout
 
 
+def run_curl_command(url: str,
+                     headers: dict[str, str] = {},
+                     params: dict[str, str] = {},
+                     result_file: Optional[str] = None) -> str:
+    """
+    Run `curl` with the given `url`, `headers`, and `params`. If `result_file`
+    is `None`, return the output, otherwise, write the output to the given file
+    and return the HTTP code. If the `curl` command fails, throw an exception.
+
+    """
+    # Construct and run the `curl` command.
+    default_result_file = "/tmp/qlever.curl.result"
+    actual_result_file = result_file if result_file else default_result_file
+    curl_cmd = (f"curl -s -o \"{actual_result_file}\""
+                f" -w \"%{{http_code}}\n\" {url}"
+                + "".join([f" -H \"{key}: {value}\""
+                           for key, value in headers.items()])
+                + "".join([f" --data-urlencode {key}={shlex.quote(value)}"
+                           for key, value in params.items()]))
+    result = subprocess.run(curl_cmd, shell=True, text=True,
+                            stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE)
+    # Case 1: An error occurred, raise an exception.
+    if result.returncode != 0:
+        if len(result.stderr) > 0:
+            raise Exception(result.stderr)
+        else:
+            raise Exception(f"curl command failed with exit code "
+                            f"{result.returncode}, stderr is empty")
+    # Case 2: Return output (read from `default_result_file`).
+    if result_file is None:
+        result_file_path = Path(default_result_file)
+        result = result_file_path.read_text()
+        result_file_path.unlink()
+        return result
+    # Case 3: Return HTTP code.
+    return result.stdout
+
+
 def is_qlever_server_alive(port: str) -> bool:
     """
     Helper function that checks if a QLever server is running on the given
@@ -82,30 +121,6 @@ def is_qlever_server_alive(port: str) -> bool:
     return exit_code == 0
 
 
-def get_curl_cmd_for_sparql_query(
-        query: str, port: int,
-        host: str = "localhost",
-        media_type: str = "application/sparql-results+qlever",
-        verbose: bool = False,
-        pinresult: bool = False,
-        access_token: Optional[str] = None,
-        send: Optional[int] = None) -> str:
-    """
-    Get curl command for given SPARQL query.
-    """
-    curl_cmd = (f"curl -s http::{host}:{port}"
-                f" -H \"Accept: {media_type}\" "
-                f" --data-urlencode query={shlex.quote(query)}")
-    if pinresult and access_token is not None:
-        curl_cmd += " --data-urlencode pinresult=true"
-        curl_cmd += f" --data-urlencode access_token={access_token}"
-    if send is not None:
-        curl_cmd += f" --data-urlencode send={send}"
-    if verbose:
-        curl_cmd += " --verbose"
-    return curl_cmd
-
-
 def get_existing_index_files(basename: str) -> list[str]:
     """
     Helper function that returns a list of all index files for `basename` in
@@ -137,8 +152,9 @@ def show_process_info(psutil_process, cmdline_regex, show_heading=True):
     pinfo = psutil_process.as_dict(
         attrs=['pid', 'username', 'create_time',
                'memory_info', 'cmdline'])
-    cmdline = " ".join(pinfo['cmdline'])
-    if not re.search(cmdline_regex, cmdline):
+    # Note: pinfo['cmdline'] is `None` if the process is a zombie.
+    cmdline = " ".join(pinfo['cmdline'] or [])
+    if len(cmdline) == 0 or not re.search(cmdline_regex, cmdline):
        return False
     pid = pinfo['pid']
     user = pinfo['username'] if pinfo['username'] else ""
@@ -162,6 +178,5 @@ def get_random_string(length: int) -> str:
     Helper function that returns a randomly chosen string of the given
     length. Take the current time as seed.
     """
-    random.seed(datetime.now())
-    return "".join(random.choices(string.ascii_letters + string.digits,
-                                  k=length))
+    characters = string.ascii_letters + string.digits
+    return "".join(secrets.choice(characters) for _ in range(length))
qlever-0.4.0.dist-info/METADATA → qlever-0.4.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: qlever
-Version: 0.4.0
+Version: 0.4.2
 Summary: Script for using the QLever SPARQL engine.
 Author-email: Hannah Bast <bast@cs.uni-freiburg.de>
 License: Apache License
@@ -214,6 +214,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: psutil
 Requires-Dist: termcolor
+Requires-Dist: argcomplete
 
 # QLever
 
qlever-0.4.2.dist-info/RECORD ADDED
@@ -0,0 +1,47 @@
+qlever/__init__.py,sha256=IyfS1OhlVE7-rjtv6FPlL0R56VxcNsS6KS7NJQhTDIM,1367
+qlever/__main__.py,sha256=MqM37bEzQeJEGUXZvuLcilIvnObZiG2eTGIkfKGpdnw,62016
+qlever/command.py,sha256=yOr0Uc8D8-AM7EjwDsVzbc3KNYjPH-FVOZhIHkqO588,2749
+qlever/config.py,sha256=-jjHAL8jdp25v53SqXKP4gWip6Qw9OdlDvFN6X7uk_4,10184
+qlever/containerize.py,sha256=p8g3O3G8a_0XLzSTzl_e5t9dqjbCQ-ippoA8vI2Z9pI,4193
+qlever/log.py,sha256=k9Mq4hxQ_d2k0e-5ZVgcB2XIRhOsGMO9I3rIR7YQyDA,1376
+qlever/qlever_main.py,sha256=k8vIQYK7zqObFNet11iLf--nrLdPooL5amprmlySi4k,2300
+qlever/qleverfile.py,sha256=6Ll81xkzel_s2Ju9ZfBXUGlRfikaAzZM6Do-dTrdo3k,12934
+qlever/util.py,sha256=eepj0SY9JJOUQq5kvtoPnWfoLLV9fbw_sTEWKHet66E,7147
+qlever/Qleverfiles/Qleverfile.dblp,sha256=SFjBD20aOSWod4mEQnxHSDWdInoE_EFp2nyMw7ev7ZA,1167
+qlever/Qleverfiles/Qleverfile.dblp-plus,sha256=Dwd9pK1vPcelKfw6sA-IuyhbZ6yIxOh6_84JgPYnB9Q,1332
+qlever/Qleverfiles/Qleverfile.default,sha256=mljl6I1RCkpIWOqMQwjzPZIsarYQx1R0mIlc583KuqU,1869
+qlever/Qleverfiles/Qleverfile.dnb,sha256=yw4MmLsDPP3P5JWPgJwgPJh66TqwkyUXbQR5lSf5oHc,1511
+qlever/Qleverfiles/Qleverfile.fbeasy,sha256=jeztW4gFpWL_w1nCH5qGHeZyZv2lz_kG6f1G3r3DkJ4,974
+qlever/Qleverfiles/Qleverfile.freebase,sha256=k6PqYrtHTBr0EydObm1Hg9QWyAAM9fXkdcjhReDg0fM,1035
+qlever/Qleverfiles/Qleverfile.imdb,sha256=uL5XlPwX01AmH-j6_Bc-PRm2fuPxGSIu8NaDflY525U,1623
+qlever/Qleverfiles/Qleverfile.olympics,sha256=5w9BOFwEBhdSzPz-0LRxwhv-7Gj6xbF539HOXr3cqD0,1088
+qlever/Qleverfiles/Qleverfile.osm-country,sha256=UnlkckSXJDrknZORlU-Hdj_J82U4kStl1aRctCc5n6M,1953
+qlever/Qleverfiles/Qleverfile.osm-planet,sha256=2RilNix0fplN3GsNNyOu3GzmUss1Pq7586WKOFAQnSs,1400
+qlever/Qleverfiles/Qleverfile.pubchem,sha256=bOhiJKUxzDiAm1UyXFPDQLYTqGc9jM8240fhobYLij0,3898
+qlever/Qleverfiles/Qleverfile.scientists,sha256=oFhzURcRFciA27GZ-ux_hsDe0esBLobWHC6h_Vf2xy8,1735
+qlever/Qleverfiles/Qleverfile.uniprot,sha256=FS8QLHvujbjUYyU2Ma0PRgfCWlulviaGLc_1csxpuic,2201
+qlever/Qleverfiles/Qleverfile.vvz,sha256=ftdMj5dCC9jAlFtNt2WR7kP30w0itT_iYtj5HoUVyWU,931
+qlever/Qleverfiles/Qleverfile.wikidata,sha256=fhWSChZTH3c2y14kgP1P5Duq1SsewTOK3wETf6RRmI8,1172
+qlever/Qleverfiles/Qleverfile.wikipathways,sha256=qWjfT-CVQCgRfN6fXPwBORMbjzXS_xsJ2DoCamQI7Rs,2045
+qlever/Qleverfiles/Qleverfile.yago-4,sha256=GikYPqChCtbAyZOVqszmVUwgQxSePTcgM8xw2b_21e4,1849
+qlever/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+qlever/commands/add_text_index.py,sha256=dkqYtwgOhgnXiei_eyhBWYCtdAiQUEmjWoa3JMlMb4c,3641
+qlever/commands/cache_stats.py,sha256=6JjueQstAqc8dNfgY8TP2EitFMxdUvCwrcyd7KUEb2o,4157
+qlever/commands/clear_cache.py,sha256=AnE1MOoj1ZexxrRT8FGeBLlv8rtQIVV4DP8VBn5-X-s,2843
+qlever/commands/example_queries.py,sha256=2rYTd35t0r7et0i-IBBcCpmVlYZya9kvwSI-gdTpNdE,12326
+qlever/commands/get_data.py,sha256=0fGuRLDB7YofHtpqk0ctq9_de_xeuliSmSZafGXAo1A,1470
+qlever/commands/index.py,sha256=lJhDnweknFZQm1czqPzNyz33EvbjIvOrS4j0wDaJ98o,5663
+qlever/commands/index_stats.py,sha256=_BiUNBhmbYd9RPxrlm4HF0oENO6JmqnRiAkwkyOdN4U,11722
+qlever/commands/log.py,sha256=8Krt3MsTUDapYqVw1zUu5X15SF8mV97Uj0qKOWK8jXk,1861
+qlever/commands/setup_config.py,sha256=mFkEtCPZ6oeVfehjVLrcLttYcPDgtwXHrNIWWzvHOfo,2928
+qlever/commands/start.py,sha256=2rOtk3NmhEs28D5csL_a1BdjSWU9VkcH6AqYT0vdww0,9285
+qlever/commands/status.py,sha256=5S6EdapZEwFKV9cQZtNYcZhMbAXAY-FP6ggjIhfX8ek,1631
+qlever/commands/stop.py,sha256=TZs4bxKHvujlZAU8BZmFjA5eXSZNAa6EeNzvPpEZsuI,4139
+qlever/commands/ui.py,sha256=rV8u017WLbfz0zVT_c9GC4d9v1WWwrTM3kfGONbeCvQ,2499
+qlever/commands/warmup.py,sha256=WOZSxeV8U_F6pEEnAb6YybXLQMxZFTRJXs4BPHUhsmc,1030
+qlever-0.4.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+qlever-0.4.2.dist-info/METADATA,sha256=tyLaWQtRaXbIaQkJ72mCcRpjxlusFHztHdAWedpZ1QE,17076
+qlever-0.4.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+qlever-0.4.2.dist-info/entry_points.txt,sha256=s0iWBHKRUzsJ7B6nVGiyMdOJtiOS84IJMSSxgbNU6LU,85
+qlever-0.4.2.dist-info/top_level.txt,sha256=kd3zsYqiFd0--Czh5XTVkfEq6XR-XgRFW35X0v0GT-c,7
+qlever-0.4.2.dist-info/RECORD,,
qlever-0.4.0.dist-info/RECORD REMOVED
@@ -1,30 +0,0 @@
-qlever/__init__.py,sha256=IyfS1OhlVE7-rjtv6FPlL0R56VxcNsS6KS7NJQhTDIM,1367
-qlever/__main__.py,sha256=MqM37bEzQeJEGUXZvuLcilIvnObZiG2eTGIkfKGpdnw,62016
-qlever/command.py,sha256=yOr0Uc8D8-AM7EjwDsVzbc3KNYjPH-FVOZhIHkqO588,2749
-qlever/config.py,sha256=LOVW8alFCVgZz_GAWm7vnjZVMVE7m3QTecy34lHgjGE,10017
-qlever/containerize.py,sha256=p8g3O3G8a_0XLzSTzl_e5t9dqjbCQ-ippoA8vI2Z9pI,4193
-qlever/log.py,sha256=k9Mq4hxQ_d2k0e-5ZVgcB2XIRhOsGMO9I3rIR7YQyDA,1376
-qlever/qlever_main.py,sha256=k8vIQYK7zqObFNet11iLf--nrLdPooL5amprmlySi4k,2300
-qlever/qleverfile.py,sha256=6Ll81xkzel_s2Ju9ZfBXUGlRfikaAzZM6Do-dTrdo3k,12934
-qlever/util.py,sha256=WM09PMRffUoPpEse4VwK9BzUavFkaB2Bm8KfVWxC3sQ,6161
-qlever/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-qlever/commands/add_text_index.py,sha256=dkqYtwgOhgnXiei_eyhBWYCtdAiQUEmjWoa3JMlMb4c,3641
-qlever/commands/cache_stats.py,sha256=6JjueQstAqc8dNfgY8TP2EitFMxdUvCwrcyd7KUEb2o,4157
-qlever/commands/clear_cache.py,sha256=AnE1MOoj1ZexxrRT8FGeBLlv8rtQIVV4DP8VBn5-X-s,2843
-qlever/commands/example_queries.py,sha256=3jlfHyL7pw1OSTuu3fY-23XaRAPIuEdNGW8QnIY2Va8,8644
-qlever/commands/get_data.py,sha256=0fGuRLDB7YofHtpqk0ctq9_de_xeuliSmSZafGXAo1A,1470
-qlever/commands/index.py,sha256=lJhDnweknFZQm1czqPzNyz33EvbjIvOrS4j0wDaJ98o,5663
-qlever/commands/index_stats.py,sha256=ao7_ySyz8MAjUvCbEp3Kj30PsR5x3MBM3ohgEUWdALM,11083
-qlever/commands/log.py,sha256=8Krt3MsTUDapYqVw1zUu5X15SF8mV97Uj0qKOWK8jXk,1861
-qlever/commands/setup_config.py,sha256=mFkEtCPZ6oeVfehjVLrcLttYcPDgtwXHrNIWWzvHOfo,2928
-qlever/commands/start.py,sha256=2rOtk3NmhEs28D5csL_a1BdjSWU9VkcH6AqYT0vdww0,9285
-qlever/commands/status.py,sha256=5S6EdapZEwFKV9cQZtNYcZhMbAXAY-FP6ggjIhfX8ek,1631
-qlever/commands/stop.py,sha256=TZs4bxKHvujlZAU8BZmFjA5eXSZNAa6EeNzvPpEZsuI,4139
-qlever/commands/ui.py,sha256=rV8u017WLbfz0zVT_c9GC4d9v1WWwrTM3kfGONbeCvQ,2499
-qlever/commands/warmup.py,sha256=WOZSxeV8U_F6pEEnAb6YybXLQMxZFTRJXs4BPHUhsmc,1030
-qlever-0.4.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-qlever-0.4.0.dist-info/METADATA,sha256=DuPh4u9Ukjt3-z31WK0mb_zj2OUV6bHnVLn1ESY7Gc0,17049
-qlever-0.4.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-qlever-0.4.0.dist-info/entry_points.txt,sha256=s0iWBHKRUzsJ7B6nVGiyMdOJtiOS84IJMSSxgbNU6LU,85
-qlever-0.4.0.dist-info/top_level.txt,sha256=kd3zsYqiFd0--Czh5XTVkfEq6XR-XgRFW35X0v0GT-c,7
-qlever-0.4.0.dist-info/RECORD,,