qlever 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


qlever/Qleverfiles/Qleverfile.dblp ADDED
@@ -0,0 +1,34 @@
+# Qleverfile for DBLP, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads .gz file of size ~3 GB (as of 31.07.2022)
+# qlever index # takes ~30 minutes and ~20 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server (takes around 2 minutes)
+#
+# Also builds a text index for fast keyword search in literals. Without that
+# (TEXT_INDEX not set), the index build takes only ~10 minutes.
+
+[data]
+NAME = dblp
+GET_DATA_URL = https://dblp.org/rdf/${index:INPUT_FILES}
+GET_DATA_CMD = curl -LO -C - ${GET_DATA_URL}
+DESCRIPTION = DBLP computer science bibliography, data from ${GET_DATA_URL}
+TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")
+
+[index]
+INPUT_FILES = dblp.ttl.gz
+CAT_INPUT_FILES = zcat ${INPUT_FILES}
+SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }
+TEXT_INDEX = from_literals
+
+[server]
+PORT = 7015
+ACCESS_TOKEN = ${data:NAME}_7643543846
+MEMORY_FOR_QUERIES = 30G
+CACHE_MAX_SIZE = 5G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = dblp
qlever/Qleverfiles/Qleverfile.dblp-plus ADDED
@@ -0,0 +1,33 @@
+# Qleverfile for DBLP Plus, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data downloads .gz file of size ~3 GB (as of 31.07.2022)
+# qlever index takes ~30 minutes and ~20 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start starts the server
+#
+# Also builds a text index for fast keyword search in literals.
+
+[data]
+NAME = dblp-plus
+GET_DATA_CMD = wget -nc -O dblp.ttl.gz https://dblp.org/rdf/dblp.ttl.gz
+INDEX_DESCRIPTION = Publication data from https://dblp.org, with affiliations from https://www.wikidata.org and citations from https://opencitations.net
+TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")
+
+[index]
+INPUT_FILES = dblp.ttl.gz affiliations.nt affiliations.additions.nt citations.nt
+CAT_INPUT_FILES = zcat -f ${INPUT_FILES}
+SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [ "<https://w3id.org", "<https://doi.org", "<http://dx.doi.org" ] }
+TEXT_INDEX = from_literals
+
+[server]
+PORT = 7027
+ACCESS_TOKEN = ${data:NAME}_169238202
+MEMORY_FOR_QUERIES = 20G
+CACHE_MAX_SIZE = 10G
+CACHE_MAX_SIZE_SINGLE_ENTRY = 2G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = dblp-plus
qlever/Qleverfiles/Qleverfile.default ADDED
@@ -0,0 +1,47 @@
+# Automatically created by the "qlever" script
+#
+# Modify as you see fit. Beware that some of the values below are executed as
+# commands by the script.
+#
+# If you have never seen a Qleverfile before, we recommend that you look at the
+# pre-filled Qleverfiles on http://qlever.cs.uni-freiburg.de/qlever-control/Qleverfiles
+# first to get some inspiration. Or execute `qlever setup-config
+# <config name>` with a config name of your choice.
+
+# As a minimum, each dataset needs a name. If you want `qlever get-data` to do
+# something meaningful, you need to define GET_DATA_CMD. If you want to use the
+# QLever UI, you should define DESCRIPTION (and if you have a text index,
+# also TEXT_DESCRIPTION).
+[data]
+NAME =
+# GET_DATA_CMD =
+# DESCRIPTION =
+# TEXT_DESCRIPTION =
+
+# CAT_INPUT_FILES produces the data that is piped into QLever's index builder.
+# Use SETTINGS_JSON for more advanced configuration settings (see the other
+# Qleverfiles for examples).
+[index]
+# INPUT_FILES =
+# CAT_INPUT_FILES = cat ${INPUT_FILES}
+# SETTINGS_JSON = {}
+
+# As a minimum, you need to specify the PORT, where QLever will listen for
+# SPARQL queries. If you want to send privileged commands to the server, you
+# need to specify an ACCESS_TOKEN (modify the random number below).
+[server]
+PORT = 7001
+# ACCESS_TOKEN = ${data:NAME}_1234567890
+
+# With SYSTEM = docker, the qlever script will download the docker image for
+# you and run QLever inside docker containers. With SYSTEM = native, you need
+# the QLever binaries in the PATH of your shell.
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+# The UI_CONFIG must be one of the slugs from http://qlever.cs.uni-freiburg.de
+# (see the dropdown menu on the top right, the slug is the last part of the URL).
+# In particular, this determines the example queries.
+[ui]
+UI_CONFIG = default
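
The `${...}` references used throughout these Qleverfiles (`${INPUT_FILES}` within a section, `${data:NAME}` across sections) follow the syntax of Python's configparser with ExtendedInterpolation, which is presumably how the qlever script reads them. A minimal, self-contained sketch of how such references resolve; the snippet is illustrative and not code from the package:

from configparser import ConfigParser, ExtendedInterpolation

# A stripped-down Qleverfile: ${INPUT_FILES} refers to a key in the same
# section, ${data:NAME} to the key NAME in the [data] section.
QLEVERFILE = """
[data]
NAME = olympics

[index]
INPUT_FILES = olympics.nt
CAT_INPUT_FILES = cat ${INPUT_FILES}

[server]
ACCESS_TOKEN = ${data:NAME}_1234567890
"""

config = ConfigParser(interpolation=ExtendedInterpolation())
config.read_string(QLEVERFILE)
print(config["index"]["CAT_INPUT_FILES"])  # cat olympics.nt
print(config["server"]["ACCESS_TOKEN"])    # olympics_1234567890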
qlever/Qleverfiles/Qleverfile.dnb ADDED
@@ -0,0 +1,37 @@
+# Qleverfile for DNB, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # takes ~ 10 min to download .nt.gz files of total size ~ 8 GB
+# qlever index # takes ~ 20 min and ~ 5 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server
+#
+# NOTE: https://data.dnb.de/opendata/ is rather confusing because of the many
+# files. This Qleverfile downloads all the datasets named "Gesamtabzug", except
+# bib_lds.nt.gz, which contains incorrectly formatted IRIs. The file
+# dnb-all_ldsprov.nt.gz contains invalid floating point literals; to ignore
+# them, compile QLever with TurtleParserBase::invalidLiteralsAreSkipped_ = true
+
+[data]
+NAME = dnb
+BASE_URL = https://data.dnb.de/opendata
+GET_DATA_CMD = curl -L -C - --remote-name-all ${BASE_URL}/authorities-gnd_lds.nt.gz ${BASE_URL}/dnb-all_lds.nt.gz ${BASE_URL}/dnb-all_ldsprov.nt.gz ${BASE_URL}/zdb_lds.nt.gz
+DESCRIPTION = DNB data from ${BASE_URL} (authorities-gnd_lds, dnb-all_lds, dnb-all_ldsprov, zdb_lds)
+TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?var, "...")
+
+[index]
+INPUT_FILES = *.nt.gz
+CAT_INPUT_FILES = zcat ${INPUT_FILES}
+SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 1000000 }
+TEXT_INDEX = from_literals
+
+[server]
+PORT = 7035
+ACCESS_TOKEN = ${data:NAME}_284732743
+MEMORY_FOR_QUERIES = 10G
+CACHE_MAX_SIZE = 2G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = dnb
qlever/Qleverfiles/Qleverfile.fbeasy ADDED
@@ -0,0 +1,29 @@
+# Qleverfile for Fbeasy, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads .nt file of size ~3 GB (as of 31.07.2022)
+# qlever index # takes ~10 minutes and ~10 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server
+
+[data]
+NAME = fbeasy
+DATA_URL = https://freebase-easy.cs.uni-freiburg.de
+GET_DATA_CMD = wget -nc ${DATA_URL}/dump/fbeasy.nt
+DESCRIPTION = RDF data from ${DATA_URL}, latest version from 18.07.2019
+TEXT_DESCRIPTION = Sentences from Wikipedia that mention at least one Freebase entity
+
+[index]
+INPUT_FILES = fbeasy.nt
+CAT_INPUT_FILES = cat ${INPUT_FILES}
+SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 10000000 }
+
+[server]
+PORT = 7003
+ACCESS_TOKEN = ${data:NAME}_12631403
+MEMORY_FOR_QUERIES = 5G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = fbeasy
qlever/Qleverfiles/Qleverfile.freebase ADDED
@@ -0,0 +1,28 @@
+# Qleverfile for Freebase, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads .gz file of size ~3 GB (as of 31.07.2022)
+# qlever index # takes ~4 hours and ~20 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server
+
+[data]
+NAME = freebase
+DATA_URL = http://commondatastorage.googleapis.com/freebase-public/rdf/freebase-rdf-latest.gz
+GET_DATA_CMD = wget -nc ${DATA_URL}
+DESCRIPTION = RDF data from ${DATA_URL}, latest (and final) version from 09.08.2015
+
+[index]
+INPUT_FILES = freebase-rdf-latest.gz
+CAT_INPUT_FILES = zcat ${INPUT_FILES}
+SETTINGS_JSON = { "languages-internal": [ "en" ], "prefixes-external": ["<"], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": false, "num-triples-per-batch": 10000000 }
+
+[server]
+PORT = 7002
+ACCESS_TOKEN = ${data:NAME}_12631403
+MEMORY_FOR_QUERIES = 10G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = freebase
qlever/Qleverfiles/Qleverfile.imdb ADDED
@@ -0,0 +1,35 @@
+# Qleverfile for IMDB, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads "basics" and "ratings" of size ~1 GB
+# qlever index # takes ~5 minutes and ~5 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server (takes a few seconds)
+#
+# Supports fast keyword search in literals (TEXT_INDEX = from_literals).
+
+[data]
+NAME = imdb
+IMDB_DATA_URL = https://datasets.imdbws.com
+GET_PREFIXES = echo "@prefix imdb: <https://www.imdb.com/> .\n"
+GET_IMDB_BASICS = FILE=title.basics.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ gsub("\"", "\\\"", $$3); printf "imdb:%s imdb:id \"%s\" ; imdb:type \"%s\" ; imdb:title \"%s\" .\n", $$1, $$1, $$2, $$3 }'; rm -f $${FILE}
+GET_IMDB_RATINGS = FILE=title.ratings.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ printf "imdb:%s imdb:averageRating %s ; imdb:numVotes %s .\n", $$1, $$2, $$3 }'; rm -f $${FILE}
+GET_DATA_CMD = (${GET_PREFIXES}; ${GET_IMDB_BASICS}; ${GET_IMDB_RATINGS}) > ${NAME}.ttl
+DESCRIPTION = RDF data derived from ${IMDB_DATA_URL}
+TEXT_DESCRIPTION = All literals, search with FILTER CONTAINS(?var, "...")
+
+[index]
+INPUT_FILES = ${data:NAME}.ttl
+CAT_INPUT_FILES = cat ${INPUT_FILES}
+SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }
+TEXT_INDEX = from_literals
+
+[server]
+PORT = 7029
+ACCESS_TOKEN = ${data:NAME}_1234567890
+MEMORY_FOR_QUERIES = 5G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = imdb
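
The GET_IMDB_BASICS one-liner above streams the gzipped TSV dump, drops the header line, escapes double quotes in the title column, and emits one group of Turtle triples per row. A rough Python equivalent of that per-row transformation, for readers unfamiliar with awk (illustrative only; the Qleverfile itself uses the awk pipeline):

import csv
import gzip

# Equivalent of: zcat title.basics.tsv.gz | sed 1d | awk -F'\t' '...'
with gzip.open("title.basics.tsv.gz", "rt", newline="") as f:
    rows = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
    next(rows)  # sed 1d: skip the TSV header line
    for row in rows:
        tconst, title_type = row[0], row[1]
        title = row[2].replace('"', '\\"')  # gsub("\"", "\\\"", $3)
        print(f'imdb:{tconst} imdb:id "{tconst}" ; '
              f'imdb:type "{title_type}" ; imdb:title "{title}" .')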
qlever/Qleverfiles/Qleverfile.olympics ADDED
@@ -0,0 +1,31 @@
+# Qleverfile for Olympics, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads .zip file of size 13 MB, uncompressed to 323 MB
+# qlever index # takes ~10 seconds and ~1 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server (instant)
+
+[data]
+NAME = olympics
+BASE_URL = https://github.com/wallscope/olympics-rdf
+GET_DATA_CMD = curl -sLo olympics.zip -C - ${BASE_URL}/raw/master/data/olympics-nt-nodup.zip && unzip -q -o olympics.zip && rm olympics.zip
+DESCRIPTION = 120 Years of Olympics, data from ${BASE_URL}
+TEXT_DESCRIPTION = All literals, search with FILTER CONTAINS(?var, "...")
+
+[index]
+INPUT_FILES = olympics.nt
+CAT_INPUT_FILES = cat ${INPUT_FILES}
+SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 100000 }
+
+[server]
+PORT = 7019
+ACCESS_TOKEN = ${data:NAME}_7643543846
+MEMORY_FOR_QUERIES = 5G
+CACHE_MAX_SIZE = 2G
+TIMEOUT = 30s
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = olympics
qlever/Qleverfiles/Qleverfile.osm-country ADDED
@@ -0,0 +1,42 @@
+# Qleverfile for OSM of some country, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads .pbf file from Geofabrik and builds .ttl.bz2 using osm2rdf
+# qlever index # for example Germany takes ~30 minutes and ~10 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server
+#
+# Make sure that osm2rdf is in your path. Set CONTINENT and COUNTRY such that
+# the link under GET_DATA_CMD exists (the names are usually the canonical
+# names). The time for osm2rdf is around the same as that for "qlever index".
+
+# Indexer settings
+CONTINENT = europe
+COUNTRY = switzerland
+DB = osm-${COUNTRY}
+PBF = ${DB}.pbf
+RDF_FILES = "${DB}.ttl.bz2"
+CAT_FILES = "bzcat ${RDF_FILES}"
+WITH_TEXT = false
+STXXL_MEMORY = 10G
+SETTINGS_JSON = '{ "prefixes-external": [ "\"LINESTRING(", "\"MULTIPOLYGON(", "\"POLYGON(" ], "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }'
+GET_DATA_CMD = "wget -nc -O ${PBF} https://download.geofabrik.de/${CONTINENT}/${COUNTRY}-latest.osm.pbf; rm -f ${DB}.*.bz2; ( time /local/data/osm2rdf/build/apps/osm2rdf ${PBF} -o ${DB}.ttl --cache . --write-geometric-relation-statistics ) 2>&1 | tee ${DB}.osm2rdf-log.txt; rm -f spatial-*"
+DESCRIPTION = "OSM ${COUNTRY^}, dump from $(ls -l --time-style=+%d.%m.%Y ${PBF} 2> /dev/null | cut -d' ' -f6) with ogc:contains"
+
+# Server settings
+HOSTNAME = $(hostname -f)
+SERVER_PORT = 7025
+ACCESS_TOKEN = ${DB}_%RANDOM%
+MEMORY_FOR_QUERIES = 20G
+CACHE_MAX_SIZE = 10G
+CACHE_MAX_SIZE_SINGLE_ENTRY = 5G
+CACHE_MAX_NUM_ENTRIES = 100
+
+# QLever binaries
+QLEVER_BIN_DIR = %QLEVER_BIN_DIR%
+USE_DOCKER = true
+QLEVER_DOCKER_IMAGE = adfreiburg/qlever
+QLEVER_DOCKER_CONTAINER = qlever.${DB}
+
+# QLever UI
+QLEVERUI_PORT = 7000
+QLEVERUI_DIR = qlever-ui
+QLEVERUI_CONFIG = osm
qlever/Qleverfiles/Qleverfile.osm-planet ADDED
@@ -0,0 +1,36 @@
+# Qleverfile for OSM Planet, use with the qlever script (pip install qlever)
+#
+# qlever get-data # takes ~50 mins to download .ttl.bz2 file of ~ 300 GB
+# qlever index # takes ~12 hours and ~20 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # takes a few seconds
+#
+# For the OSM data of a single country, do `qlever setup-config osm-country`
+# and edit the Qleverfile to specify the country.
+
+[data]
+NAME = osm-planet
+DATA_URL = https://osm2rdf.cs.uni-freiburg.de/ttl/planet.osm.ttl.bz2
+GET_DATA_CMD = curl --location --fail --continue-at - --remote-time --output ${NAME}.ttl.bz2 ${DATA_URL}
+VERSION = $$(date -r ${NAME}.ttl.bz2 +"%d.%m.%Y")
+DESCRIPTION = OSM Planet, data from ${DATA_URL} version ${VERSION} (complete OSM data, with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects)
+
+[index]
+INPUT_FILES = ${data:NAME}.ttl.bz2
+CAT_INPUT_FILES = lbzcat -f -n 2 ${INPUT_FILES}
+STXXL_MEMORY = 20G
+SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
+
+[server]
+PORT = 7007
+ACCESS_TOKEN = ${data:NAME}
+MEMORY_FOR_QUERIES = 90G
+CACHE_MAX_SIZE = 40G
+CACHE_MAX_SIZE_SINGLE_ENTRY = 30G
+TIMEOUT = 300s
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = osm-planet
qlever/Qleverfiles/Qleverfile.pubchem ADDED
@@ -0,0 +1,66 @@
+# Qleverfile for PubChem, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads .gz files of total size 114 GB; see NOTES 2, 3, 4
+# qlever index # takes ~5 hours and ~20 GB RAM on an AMD Ryzen 9 5900X
+# qlever start # starts the server (a few seconds)
+#
+# IMPORTANT NOTES:
+#
+# NOTE 1: The SPARQL endpoint at https://qlever.cs.uni-freiburg.de/pubchem also
+# contains data from the following ontologies, which are very useful for
+# resolving names of IRIs like `sio:SIO_000008` or `obo:IAO_0000412`, but which
+# are not part of the PubChem RDF data. For the corresponding URLs, see
+# https://github.com/ad-freiburg/qlever/issues/711#issuecomment-1200479401 .
+#
+# bao bfo biopax-level3 chebi cheminf cito dublin_core_terms fabio go iao ncit
+# obi pr ro sio skos so uo
+#
+# NOTE 2: The robots.txt file from https://ftp.ncbi.nlm.nih.gov currently
+# disallows downloading the PubChem RDF data using `wget --recursive` as in the
+# GET_DATA_CMD below. As a workaround, you can write a simple Python script
+# (using `BeautifulSoup` and `urllib.parse`) to scrape the URLs from the HTML
+# pages and download the files individually (a sketch follows this Qleverfile).
+# This was done for the latest version of https://qlever.cs.uni-freiburg.de/pubchem .
+#
+# NOTE 3: Many of the TTL files have generic prefix definitions in the middle
+# of the file, like @prefix ns23: <http://identifiers.org/biocyc/ARACYC:> .
+# See https://github.com/ad-freiburg/qlever/issues/711#issuecomment-1197113953
+# This is allowed by the standard, but VERY unusual. For use with QLever,
+# convert the TTL files to NT before indexing, see GET_DATA_CMD below.
+#
+# NOTE 4: Many of the files (TTL as well as NT) contain invalid IRIs because
+# spaces and braces are not properly escaped. Here is a simple awk-based script
+# to percent-encode spaces and braces in all IRIs in the NT files:
+#
+# for NTGZ in nt.${DATE}/*.nt.gz; do echo "zcat $NTGZ | sed 's/> />\t/1; s/> />\t/1; s/ \.\$/\t./' | awk 'BEGIN{FS=OFS=\"\t\"} {for (i = 1; i <= 3; i++) if (\$i ~ /^<.*>\$/) { gsub(/ /, \"%20\", \$i); gsub(/\[/, \"%5B\", \$i); gsub(/\]/, \"%5D\", \$i); gsub(/{/, \"%7B\", \$i); gsub(/}/, \"%7D\", \$i); } print }' | sed 's/\t/ /g' | gzip -c > nt.${DATE}.FIXED/$(basename $NTGZ)"; done > fix-nt.commands.txt
+# cat fix-nt.commands.txt | parallel
+
+
+[DEFAULT]
+NAME = pubchem
+DATE = 2024-02-03
+
+[data]
+GET_DATA_URL = ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF
+MAKE_GET_DATA_CMD = curl -s ${GET_DATA_URL}/void.ttl | grep -oP '${GET_DATA_URL}/.*?\.ttl\.gz' | grep -v "nbr[23]d" | while read URL; do echo "echo \"Processing $$URL ...\"; curl --silent --remote-time --output ttl.${DATE}/$$(basename $$URL) $$URL && docker run --rm -v $$(pwd)/ttl.${DATE}:/data stain/jena turtle --output=NT /data/$$(basename $$URL) | sed 's/> />\t/1; s/> />\t/1; s/ \.\$$/\t./' | awk 'BEGIN{FS=OFS=\"\t\"} {for (i = 1; i <= 3; i++) if (\$$i ~ /^<.*>\$$/) { gsub(/ /, \"%20\", \$$i); gsub(/\[/, \"%5B\", \$$i); gsub(/\]/, \"%5D\", \$$i); gsub(/{/, \"%7B\", \$$i); gsub(/}/, \"%7D\", \$$i); } print }' | sed 's/\t/ /g' | gzip -c > nt.${DATE}/$$(basename -s .ttl.gz $$URL).nt.gz"; done > pubchem.get-data-cmds.txt
+GET_DATA_CMD = mkdir -p ttl.${DATE} && mkdir -p nt.${DATE} && ${MAKE_GET_DATA_CMD} && cat pubchem.get-data-cmds.txt | parallel --line-buffer
+DESCRIPTION = PubChem RDF from ${GET_DATA_URL}, version ${DATE} (all folders except nbr2d and nbr3d)
+
+[index]
+INPUT_FILES = pubchem.additional-ontologies.nt.gz nt.${DATE}/*.nt.gz
+CAT_INPUT_FILES = zcat ${INPUT_FILES}
+SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }
+STXXL_MEMORY = 10G
+
+[server]
+PORT = 7023
+ACCESS_TOKEN = ${NAME}_310129823
+MEMORY_FOR_QUERIES = 20G
+TIMEOUT = 120s
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = pubchem
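
NOTE 2 above suggests scraping the download URLs instead of `wget --recursive`. A hedged sketch of such a script, assuming the PubChem RDF directories are plain HTML listings with anchor tags (the BASE_URL subfolder and page layout are assumptions, not taken from the package):

import urllib.request
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Collect all .ttl.gz links from one directory listing and download them.
# BASE_URL is a placeholder; point it at the directory you want to mirror.
BASE_URL = "https://ftp.ncbi.nlm.nih.gov/pubchem/RDF/compound/general/"

html = urllib.request.urlopen(BASE_URL).read()
soup = BeautifulSoup(html, "html.parser")
urls = [urljoin(BASE_URL, a["href"])
        for a in soup.find_all("a", href=True)
        if a["href"].endswith(".ttl.gz")]
for url in urls:
    filename = url.rsplit("/", 1)[-1]
    print("Downloading", url)
    urllib.request.urlretrieve(url, filename)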
qlever/Qleverfiles/Qleverfile.scientists ADDED
@@ -0,0 +1,39 @@
+# Qleverfile for Scientists, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # download .zip file of size 79 MB, uncompressed to 318 MB
+# qlever index # takes ~20 seconds and ~1 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server (instant)
+#
+# Also builds a text index for keyword search on the literals AND keyword search
+# in Wikipedia sentences linked to the RDF data; see TEXT_DESCRIPTION below.
+
+# Indexer settings
+DB = scientists
+RDF_FILES = "${DB}.nt"
+CAT_FILES = "cat ${RDF_FILES}"
+WITH_TEXT_INDEX = from_text_records_and_literals
+STXXL_MEMORY = 1G
+SETTINGS_JSON = '{ "ascii-prefixes-only": true, "num-triples-per-batch": 100000 }'
+GET_DATA_CMD = "wget https://github.com/ad-freiburg/qlever/raw/master/e2e/scientist-collection.zip && unzip -j scientist-collection.zip && rm -f scientist-collection.zip"
+INDEX_DESCRIPTION = "Scientist collection from QLever's end-to-end test, see https://github.com/ad-freiburg/qlever/tree/master/e2e"
+TEXT_DESCRIPTION = "Literals (use FILTER CONTAINS) and Wikipedia articles (use ql:contains-entity and ql:contains-word)"
+
+# Server settings
+HOSTNAME = $(hostname -f)
+SERVER_PORT = 7020
+ACCESS_TOKEN = ${DB}_%RANDOM%
+MEMORY_FOR_QUERIES = 5G
+CACHE_MAX_SIZE = 2G
+CACHE_MAX_SIZE_SINGLE_ENTRY = 1G
+CACHE_MAX_NUM_ENTRIES = 100
+
+# QLever binaries
+QLEVER_BIN_DIR = %QLEVER_BIN_DIR%
+USE_DOCKER = true
+QLEVER_DOCKER_IMAGE = adfreiburg/qlever
+QLEVER_DOCKER_CONTAINER = qlever.scientists
+
+# QLever UI
+QLEVERUI_PORT = 7000
+QLEVERUI_DIR = qlever-ui
+QLEVERUI_CONFIG = default
qlever/Qleverfiles/Qleverfile.uniprot ADDED
@@ -0,0 +1,41 @@
+# Qleverfile for UniProt, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # download RDFXML and convert to NT (around 1 TB each)
+# qlever index # takes ~ 1.5 days and ~40 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server (takes a few seconds)
+#
+# Install packages: sudo apt install -y libxml2-utils parallel xz-utils pv
+# Install manually: Apache Jena binaries (https://dlcdn.apache.org/jena/binaries)
+#
+# Set DATE to the date of the latest release
+#
+# IMPORTANT: Build on SSD, disk space required: ~ 10 TB. For running the server,
+# the uniprot.index.???.meta files can be on HDD.
+
+[data]
+NAME = uniprot
+DATE = 2024-01-24
+DOWNLOAD_URL = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf
+GET_RDFXML_CMD = mkdir -p rdf.${DATE} && curl -s ${DOWNLOAD_URL}/RELEASE.meta4 | sed "s/<metalink.*/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" - | while read URL; do wget --no-verbose -P rdf.${DATE} $$URL 2>&1 | tee -a uniprot.download-log; done
+RDFXML2NT_CMD = mkdir -p nt.${DATE} && for RDFXML in rdf.${DATE}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=nt 2> /dev/null | xz -c > nt.${DATE}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/nt.xz/') && echo 'DONE converting $$RDFXML'"; done | parallel
+GET_DATA_CMD = rdfxml --help && date > ${NAME}.get-data.begin-date && ${GET_RDFXML_CMD} && ${RDFXML2NT_CMD} && date > ${NAME}.get-data.end-date
+DESCRIPTION = Complete UniProt data from ${DOWNLOAD_URL}, version ${DATE}
+
+[index]
+INPUT_FILES = nt.${data:DATE}/*.nt.xz
+CAT_INPUT_FILES = parallel --tmpdir . -j 4 'xzcat -f {}' ::: nt.${data:DATE}/*.nt.xz | pv -q -B 5G
+SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 }
+STXXL_MEMORY = 60G
+
+[server]
+PORT = 7018
+ACCESS_TOKEN = ${data:NAME}_1369924040
+MEMORY_FOR_QUERIES = 20G
+CACHE_MAX_SIZE = 10G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = uniprot
qlever/Qleverfiles/Qleverfile.vvz ADDED
@@ -0,0 +1,31 @@
+# Qleverfile for VVZ, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # this requires a separate internal tool
+# qlever index # builds the index (takes a few seconds)
+# qlever start # starts the server (takes a few seconds)
+#
+# Also builds a text index for fast keyword search in literals.
+
+[data]
+NAME = vvz
+GET_DATA_CMD = echo "This requires a separate tool"
+DESCRIPTION = VVZ Uni Freiburg, selected faculties
+TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")
+
+[index]
+INPUT_FILES = vvz.ttl
+CAT_INPUT_FILES = cat ${INPUT_FILES}
+SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 1000000 }
+TEXT_INDEX = from_literals
+
+[server]
+PORT = 7041
+ACCESS_TOKEN = ${data:NAME}_8736426534
+MEMORY_FOR_QUERIES = 10G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = vvz
qlever/Qleverfiles/Qleverfile.wikidata ADDED
@@ -0,0 +1,30 @@
+# Qleverfile for Wikidata, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data downloads two .bz2 files of total size ~100 GB
+# qlever index takes ~7 hours and ~40 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start starts the server (takes around 30 seconds)
+
+[data]
+NAME = wikidata
+GET_DATA_URL = https://dumps.wikimedia.org/wikidatawiki/entities
+GET_DATA_CMD = curl -LO -C - ${GET_DATA_URL}/latest-all.ttl.bz2 ${GET_DATA_URL}/latest-lexemes.ttl.bz2
+DESCRIPTION = "Full Wikidata dump from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2)"
+
+[index]
+INPUT_FILES = latest-lexemes.ttl.bz2 latest-all.ttl.bz2
+CAT_INPUT_FILES = bzcat ${INPUT_FILES}
+SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
+STXXL_MEMORY = 10G
+
+[server]
+PORT = 7001
+ACCESS_TOKEN = ${data:NAME}_372483264
+MEMORY_FOR_QUERIES = 20G
+CACHE_MAX_SIZE = 10G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = wikidata
1
+ # Qleverfile for WikiPathways, use with https://github.com/ad-freiburg/qlever-control
2
+ #
3
+ # qlever get-data # downloads .gz file of size ~100 MB (as of 24.02.2024)
4
+ # qlever index # takes ~20 seconds and little RAM (on an AMD Ryzen 9 5900X)
5
+ # qlever start # starts the server (takes around 2 minutes)
6
+ #
7
+ # Limitations: does not include the ontologies (WP, GPML, ChEBI, PW, CLO, ...) yet
8
+
9
+ [data]
10
+ NAME = wikipathways
11
+ RELEASE = 20231210
12
+ GET_DATA_URL = https://data.wikipathways.org/${RELEASE}/rdf
13
+ GET_DATA_CMD = wget -O wikipathways-rdf-void.ttl ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-void.ttl && \
14
+ wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-wp.zip && \
15
+ unzip -qq -c wikipathways-${RELEASE}-rdf-wp.zip -x wp/wpOntology.ttl > wikipathways-rdf-wp.ttl && \
16
+ wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-gpml.zip &&
17
+ unzip -qq -c wikipathways-${RELEASE}-rdf-gpml.zip -x gpml/gpmlOntology.ttl > wikipathways-rdf-gpml.ttl && \
18
+ wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-authors.zip && \
19
+ unzip -qq -c wikipathways-${RELEASE}-rdf-authors.zip > wikipathways-rdf-authors.ttl && \
20
+ cat wikipathways-rdf-*.ttl | grep ^@prefix | tr -s ' ' | sort -u > ${NAME}.prefix-definitions
21
+ DESCRIPTION = WikiPathways RDF, from ${GET_DATA_URL}
22
+ TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")
23
+
24
+ [index]
25
+ INPUT_FILES = ${data:NAME}.prefix-definitions wikipathways-rdf-wp.ttl wikipathways-rdf-gpml.ttl wikipathways-rdf-void.ttl wikipathways-rdf-authors.ttl
26
+ CAT_INPUT_FILES = cat ${FILE_NAMES}
27
+ SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [""] }
28
+ TEXT_INDEX = from_literals
29
+
30
+ [server]
31
+ PORT = 7040
32
+ ACCESS_TOKEN = ${data:NAME}_7643543846
33
+ MEMORY_FOR_QUERIES = 5G
34
+
35
+ [runtime]
36
+ SYSTEM = docker
37
+ IMAGE = docker.io/adfreiburg/qlever:latest
38
+
39
+ [ui]
40
+ UI_CONFIG = wikipathways
qlever/Qleverfiles/Qleverfile.yago-4 ADDED
@@ -0,0 +1,33 @@
+# Qleverfile for YAGO 4, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # downloads 8 .nt.gz files of total size ~60 GB (as of 12.03.2020)
+# qlever index # takes ~4 hours and ~10 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start # starts the server
+
+# NOTE concerning GET_DATA_CMD: The triples from wd-annotated-facts are
+# contained in wd-facts. The "full types" are the YAGO types, the "simple
+# types" are the schema.org types. They don't interfere with each other because
+# they have distinct prefixes.
+
+[data]
+NAME = yago-4
+GET_DATA_CMD = curl --location --continue-at - --remote-name-all https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-class.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-facts.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-full-types.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-labels.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-sameAs.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-schema.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-shapes.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-simple-types.nt.gz
+DESCRIPTION = "Full dump from https://yago-knowledge.org/downloads/yago-4, version 12.03.2020"
+
+[index]
+INPUT_FILES = yago-wd-*.nt.gz
+CAT_INPUT_FILES = zcat ${INPUT_FILES}
+SETTINGS_JSON = { "languages-internal": ["en"], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
+STXXL_MEMORY = 10G
+
+[server]
+PORT = 9004
+ACCESS_TOKEN = ${data:NAME}_2347348732
+MEMORY_FOR_QUERIES = 30G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = yago-4
qlever/commands/example_queries.py CHANGED
@@ -4,13 +4,15 @@ import re
 import shlex
 import subprocess
 import time
+import traceback
+from pathlib import Path
 
 from termcolor import colored
 
 from qlever.command import QleverCommand
 from qlever.commands.clear_cache import ClearCacheCommand
 from qlever.log import log, mute_log
-from qlever.util import run_command
+from qlever.util import run_command, run_curl_command
 
 
 class ExampleQueriesCommand(QleverCommand):
@@ -57,12 +59,27 @@ class ExampleQueriesCommand(QleverCommand):
                                "or just compute the size of the result")
         subparser.add_argument("--limit", type=int,
                                help="Limit on the number of results")
+        subparser.add_argument("--accept", type=str,
+                               choices=["text/tab-separated-values",
+                                        "application/sparql-results+json"],
+                               default="text/tab-separated-values",
+                               help="Accept header for the SPARQL query")
         subparser.add_argument("--clear-cache",
                                choices=["yes", "no"],
                                default="yes",
                                help="Clear the cache before each query")
 
     def execute(self, args) -> bool:
+        # If `args.accept` is `application/sparql-results+json`, we need `jq`.
+        if args.accept == "application/sparql-results+json":
+            try:
+                subprocess.run("jq --version", shell=True, check=True,
+                               stdout=subprocess.DEVNULL,
+                               stderr=subprocess.DEVNULL)
+            except Exception as e:
+                log.error(f"Please install `jq` for {args.accept} ({e})")
+                return False
+
         # Handle shortcuts for SPARQL endpoint.
         if args.sparql_endpoint_preset in self.presets:
             args.sparql_endpoint = self.presets[args.sparql_endpoint_preset]
@@ -92,6 +109,7 @@ class ExampleQueriesCommand(QleverCommand):
                            else f"localhost:{args.port}")
         self.show(f"Obtain queries via: {get_queries_cmd}\n"
                   f"SPARQL endpoint: {sparql_endpoint}\n"
+                  f"Accept header: {args.accept}\n"
                   f"Clear cache before each query:"
                   f" {args.clear_cache.upper()}\n"
                   f"Download result for each query or just count:"
@@ -103,7 +121,8 @@ class ExampleQueriesCommand(QleverCommand):
 
         # Get the example queries.
         try:
-            example_query_lines = run_command(get_queries_cmd, return_output=True)
+            example_query_lines = run_command(get_queries_cmd,
+                                              return_output=True)
             if len(example_query_lines) == 0:
                 log.error("No example queries matching the criteria found")
                 return False
@@ -114,9 +133,10 @@ class ExampleQueriesCommand(QleverCommand):
 
         # Launch the queries one after the other and for each print: the
         # description, the result size, and the query processing time.
-        count = 0
        total_time_seconds = 0.0
        total_result_size = 0
+        count_succeeded = 0
+        count_failed = 0
         for example_query_line in example_query_lines:
             # Parse description and query.
             description, query = example_query_line.split("\t")
@@ -155,44 +175,93 @@ class ExampleQueriesCommand(QleverCommand):
                           + f" }} LIMIT {args.limit}"
 
             # Launch query.
-            query_cmd = (f"curl -sv {sparql_endpoint}"
-                         f" -H \"Accept: text/tab-separated-values\""
-                         f" --data-urlencode query={shlex.quote(query)}")
-            if args.download_or_count == "count":
-                query_cmd += " | sed 1d"
-            else:
-                query_cmd += " | sed 1d | wc -l"
             try:
-                log.debug(query_cmd)
+                curl_cmd = (f"curl -s {sparql_endpoint}"
+                            f" -w \"HTTP code: %{{http_code}}\\n\""
+                            f" -H \"Accept: {args.accept}\""
+                            f" --data-urlencode query={shlex.quote(query)}")
+                log.debug(curl_cmd)
+                result_file = (f"qlever.example_queries.result."
+                               f"{abs(hash(curl_cmd))}.tmp")
                 start_time = time.time()
-                result_size = run_command(query_cmd, return_output=True)
-                result_size = int(result_size.strip())
+                http_code = run_curl_command(sparql_endpoint,
                                             headers={"Accept": args.accept},
                                             params={"query": query},
                                             result_file=result_file).strip()
+                if http_code != "200":
+                    raise Exception(f"HTTP code {http_code}"
+                                    f" {Path(result_file).read_text()}")
                 time_seconds = time.time() - start_time
-                time_string = f"{time_seconds:.2f}"
-                result_string = f"{result_size:>14,}"
+                error_msg = None
             except Exception as e:
-                time_seconds = 0.0
-                time_string = "---"
-                result_size = 0
-                result_string = colored(f" FAILED {e}", "red")
+                if args.log_level == "DEBUG":
+                    traceback.print_exc()
+                error_msg = re.sub(r"\s+", " ", str(e))
+
+            # Get result size (via the command line, in order to avoid loading
+            # a potentially large JSON file into Python, which is slow).
+            if error_msg is None:
+                try:
+                    if args.download_or_count == "count":
+                        if args.accept == "text/tab-separated-values":
+                            result_size = run_command(
+                                f"sed 1d {result_file}",
+                                return_output=True)
+                        else:
+                            result_size = run_command(
+                                f"jq -r \".results.bindings[0]"
+                                f" | to_entries[0].value.value"
+                                f" | tonumber\" {result_file}",
+                                return_output=True)
+                    else:
+                        if args.accept == "text/tab-separated-values":
+                            result_size = run_command(
+                                f"sed 1d {result_file} | wc -l",
+                                return_output=True)
+                        else:
+                            result_size = run_command(
+                                f"jq -r \".results.bindings | length\""
+                                f" {result_file}",
+                                return_output=True)
+                    result_size = int(result_size)
+                except Exception as e:
+                    error_msg = str(e)
 
             # Print description, time, result in tabular form.
             if (len(description) > 60):
                 description = description[:57] + "..."
-            log.info(f"{description:<60} {time_string:>6} s "
-                     f"{result_string}")
-            count += 1
-            total_time_seconds += time_seconds
-            total_result_size += result_size
+            if error_msg is None:
+                log.info(f"{description:<60} {time_seconds:6.2f} s "
+                         f"{result_size:14,}")
+                count_succeeded += 1
+                total_time_seconds += time_seconds
+                total_result_size += result_size
+            else:
+                count_failed += 1
+                if (len(error_msg) > 60) and args.log_level != "DEBUG":
+                    error_msg = error_msg[:57] + "..."
+                log.error(f"{description:<60} failed "
+                          f"{colored(error_msg, 'red')}")
 
         # Print total time.
         log.info("")
-        description = (f"TOTAL for {count} "
-                       f"{'query' if count == 1 else 'queries'}")
-        log.info(f"{description:<60} {total_time_seconds:6.2f} s "
-                 f"{total_result_size:>14,}")
-        description = (f"AVERAGE for {count} "
-                       f"{'query' if count == 1 else 'queries'}")
-        log.info(f"{description:<60} {total_time_seconds / count:6.2f} s "
-                 f"{round(total_result_size / count):>14,}")
+        if count_succeeded > 0:
+            query_or_queries = "query" if count_succeeded == 1 else "queries"
+            description = (f"TOTAL for {count_succeeded} {query_or_queries}")
+            log.info(f"{description:<60} "
+                     f"{total_time_seconds:6.2f} s "
+                     f"{total_result_size:>14,}")
+            description = (f"AVERAGE for {count_succeeded} {query_or_queries}")
+            log.info(f"{description:<60} "
+                     f"{total_time_seconds / count_succeeded:6.2f} s "
+                     f"{round(total_result_size / count_succeeded):>14,}")
+        else:
+            if count_failed == 1:
+                log.info(colored("One query failed", "red"))
+            elif count_failed > 1:
+                log.info(colored("All queries failed", "red"))
+
+        # Return success (has nothing to do with how many queries failed).
+        if args.log_level != "DEBUG":
+            Path(result_file).unlink(missing_ok=True)
         return True
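
The two jq filters in the new code operate on the standard SPARQL JSON results layout. The sketch below shows the same two extractions in Python on a made-up result, to clarify what the filters compute (the sample data is invented for illustration):

import json

# A minimal result as returned with Accept: application/sparql-results+json.
result = json.loads("""
{ "head": { "vars": ["count"] },
  "results": { "bindings": [
    { "count": { "type": "literal", "value": "42" } }
  ] } }
""")

# jq -r ".results.bindings | length" -> number of result rows (download mode).
num_rows = len(result["results"]["bindings"])

# jq -r ".results.bindings[0] | to_entries[0].value.value | tonumber"
# -> the single value of a COUNT query (count mode).
first_binding = result["results"]["bindings"][0]
count = int(next(iter(first_binding.values()))["value"])

print(num_rows, count)  # 1 42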
qlever/commands/index_stats.py CHANGED
@@ -71,14 +71,17 @@ class IndexStatsCommand(QleverCommand):
 
         # Helper function that finds the next line matching the given `regex`,
         # starting from `current_line`, and extracts the time. Returns a tuple
-        # of the time and the regex match object. If a match is found,
-        # `current_line` is updated to the line after the match. Otherwise,
-        # `current_line` will be one beyond the last line, unless
-        # `line_is_optional` is true, in which case it will be the same as when
-        # the function was entered.
+        # of the time and the regex match object.
+        #
+        # If `update_current_line` is `False`, then `current_line` will not be
+        # updated by this call.
+        #
+        # Otherwise, and this is the default behavior, `current_line` will be
+        # updated to the line after the first match, or one beyond the last
+        # line if no match is found.
         current_line = 0
 
-        def find_next_line(regex, line_is_optional=False):
+        def find_next_line(regex, update_current_line=True):
             nonlocal lines
             nonlocal current_line
             current_line_backup = current_line
@@ -99,7 +102,7 @@ class IndexStatsCommand(QleverCommand):
                                   f"\"{timestamp_regex}\" from line "
                                   f" \"{line.rstrip()}\" ({e})")
             # If we get here, we did not find a matching line.
-            if line_is_optional:
+            if not update_current_line:
                 current_line = current_line_backup
             return None, None
 
@@ -110,24 +113,34 @@ class IndexStatsCommand(QleverCommand):
         convert_begin, _ = find_next_line(r"INFO:\s*Converting triples")
         perm_begin_and_info = []
         while True:
-            perm_begin, _ = find_next_line(r"INFO:\s*Creating a pair", True)
+            # Find the next line that starts a permutation.
+            #
+            # NOTE: Should work for the old and new format of the index log
+            # file (old format: "Creating a pair" + names of permutations in
+            # line "Writing meta data for ..."; new format: name of
+            # permutations already in line "Creating permutations ...").
+            perm_begin, _ = find_next_line(r"INFO:\s*Creating a pair",
+                                           update_current_line=False)
             if perm_begin is None:
+                perm_begin, perm_info = find_next_line(
+                    r"INFO:\s*Creating permutations ([A-Z]+ and [A-Z]+)",
+                    update_current_line=False)
+            else:
+                _, perm_info = find_next_line(
+                    r"INFO:\s*Writing meta data for ([A-Z]+ and [A-Z]+)",
+                    update_current_line=False)
+            if perm_info is None:
                 break
-            _, perm_info = find_next_line(r"INFO:\s*Writing meta data for"
-                                          r" ([A-Z]+ and [A-Z]+)", True)
-            # if perm_info is None:
-            #     break
             perm_begin_and_info.append((perm_begin, perm_info))
         convert_end = (perm_begin_and_info[0][0] if
                        len(perm_begin_and_info) > 0 else None)
         normal_end, _ = find_next_line(r"INFO:\s*Index build completed")
-        text_begin, _ = find_next_line(r"INFO:\s*Adding text index", True)
-        text_end, _ = find_next_line(r"INFO:\s*Text index build comp", True)
+        text_begin, _ = find_next_line(r"INFO:\s*Adding text index",
+                                       update_current_line=False)
+        text_end, _ = find_next_line(r"INFO:\s*Text index build comp",
+                                     update_current_line=False)
         if args.ignore_text_index:
             text_begin = text_end = None
-        # print("DEBUG:", len(perm_begin_and_info), perm_begin_and_info)
-        # print("DEBUG:", overall_begin)
-        # print("DEBUG:", normal_end)
 
         # Check whether at least the first phase is done.
         if overall_begin is None:
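
The rewritten loop first looks for the old "Creating a pair" marker and, failing that, for the new "Creating permutations ..." marker, so both log formats are handled. A quick standalone check of the two capturing regexes; the log lines below are made up around the prefixes matched in the code above:

import re

OLD_META_LINE = "2024-01-01 12:10:00 INFO: Writing meta data for PSO and POS"
NEW_PERM_LINE = "2024-01-01 12:00:00 INFO: Creating permutations PSO and POS"

meta_regex = re.compile(r"INFO:\s*Writing meta data for ([A-Z]+ and [A-Z]+)")
perm_regex = re.compile(r"INFO:\s*Creating permutations ([A-Z]+ and [A-Z]+)")

print(meta_regex.search(OLD_META_LINE).group(1))  # PSO and POS (old format)
print(perm_regex.search(NEW_PERM_LINE).group(1))  # PSO and POS (new format)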
qlever/config.py CHANGED
@@ -4,6 +4,7 @@ import argparse
 import os
 import traceback
 from pathlib import Path
+from importlib.metadata import version
 
 import argcomplete
 
@@ -180,6 +181,8 @@ class QleverConfig:
         # are defined in the modules in `qlever/commands`. In `__init__.py`
         # an object of each class is created and stored in `command_objects`.
         parser = argparse.ArgumentParser()
+        parser.add_argument("--version", action="version",
+                            version=f"%(prog)s {version('qlever')}")
         add_qleverfile_option(parser)
         subparsers = parser.add_subparsers(dest='command')
         subparsers.required = True
qlever/util.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
-import random
 import re
+import secrets
 import shlex
 import shutil
 import string
@@ -31,7 +31,7 @@ def run_command(cmd: str, return_output: bool = False,
                 show_output: bool = False) -> Optional[str]:
     """
     Run the given command and throw an exception if the exit code is non-zero.
-    If `get_output` is `True`, return what the command wrote to `stdout`.
+    If `return_output` is `True`, return what the command wrote to `stdout`.
 
     NOTE: The `set -o pipefail` ensures that the exit code of the command is
     non-zero if any part of the pipeline fails (not just the last part).
@@ -68,6 +68,45 @@ def run_command(cmd: str, return_output: bool = False,
     return result.stdout
 
 
+def run_curl_command(url: str,
+                     headers: dict[str, str] = {},
+                     params: dict[str, str] = {},
+                     result_file: Optional[str] = None) -> str:
+    """
+    Run `curl` with the given `url`, `headers`, and `params`. If `result_file`
+    is `None`, return the output, otherwise, write the output to the given file
+    and return the HTTP code. If the `curl` command fails, throw an exception.
+
+    """
+    # Construct and run the `curl` command.
+    default_result_file = "/tmp/qlever.curl.result"
+    actual_result_file = result_file if result_file else default_result_file
+    curl_cmd = (f"curl -s -o \"{actual_result_file}\""
+                f" -w \"%{{http_code}}\n\" {url}"
+                + "".join([f" -H \"{key}: {value}\""
+                           for key, value in headers.items()])
+                + "".join([f" --data-urlencode {key}={shlex.quote(value)}"
+                           for key, value in params.items()]))
+    result = subprocess.run(curl_cmd, shell=True, text=True,
+                            stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE)
+    # Case 1: An error occurred, raise an exception.
+    if result.returncode != 0:
+        if len(result.stderr) > 0:
+            raise Exception(result.stderr)
+        else:
+            raise Exception(f"curl command failed with exit code "
+                            f"{result.returncode}, stderr is empty")
+    # Case 2: Return output (read from `default_result_file`).
+    if result_file is None:
+        result_file_path = Path(default_result_file)
+        result = result_file_path.read_text()
+        result_file_path.unlink()
+        return result
+    # Case 3: Return HTTP code.
+    return result.stdout
+
+
 def is_qlever_server_alive(port: str) -> bool:
     """
     Helper function that checks if a QLever server is running on the given
@@ -82,30 +121,6 @@ def is_qlever_server_alive(port: str) -> bool:
     return exit_code == 0
 
 
-def get_curl_cmd_for_sparql_query(
-        query: str, port: int,
-        host: str = "localhost",
-        media_type: str = "application/sparql-results+qlever",
-        verbose: bool = False,
-        pinresult: bool = False,
-        access_token: Optional[str] = None,
-        send: Optional[int] = None) -> str:
-    """
-    Get curl command for given SPARQL query.
-    """
-    curl_cmd = (f"curl -s http::{host}:{port}"
-                f" -H \"Accept: {media_type}\" "
-                f" --data-urlencode query={shlex.quote(query)}")
-    if pinresult and access_token is not None:
-        curl_cmd += " --data-urlencode pinresult=true"
-        curl_cmd += f" --data-urlencode access_token={access_token}"
-    if send is not None:
-        curl_cmd += f" --data-urlencode send={send}"
-    if verbose:
-        curl_cmd += " --verbose"
-    return curl_cmd
-
-
 def get_existing_index_files(basename: str) -> list[str]:
     """
     Helper function that returns a list of all index files for `basename` in
@@ -137,8 +152,9 @@ def show_process_info(psutil_process, cmdline_regex, show_heading=True):
     pinfo = psutil_process.as_dict(
         attrs=['pid', 'username', 'create_time',
                'memory_info', 'cmdline'])
-    cmdline = " ".join(pinfo['cmdline'])
-    if not re.search(cmdline_regex, cmdline):
+    # Note: pinfo['cmdline'] is `None` if the process is a zombie.
+    cmdline = " ".join(pinfo['cmdline'] or [])
+    if len(cmdline) == 0 or not re.search(cmdline_regex, cmdline):
        return False
     pid = pinfo['pid']
     user = pinfo['username'] if pinfo['username'] else ""
@@ -162,6 +178,5 @@ def get_random_string(length: int) -> str:
     Helper function that returns a randomly chosen string of the given
     length. Take the current time as seed.
     """
-    random.seed(datetime.now())
-    return "".join(random.choices(string.ascii_letters + string.digits,
-                                  k=length))
+    characters = string.ascii_letters + string.digits
+    return "".join(secrets.choice(characters) for _ in range(length))
qlever-0.4.0.dist-info/METADATA → qlever-0.4.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: qlever
-Version: 0.4.0
+Version: 0.4.2
 Summary: Script for using the QLever SPARQL engine.
 Author-email: Hannah Bast <bast@cs.uni-freiburg.de>
 License: Apache License
@@ -214,6 +214,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: psutil
 Requires-Dist: termcolor
+Requires-Dist: argcomplete
 
 # QLever
 
qlever-0.4.2.dist-info/RECORD ADDED
@@ -0,0 +1,47 @@
+qlever/__init__.py,sha256=IyfS1OhlVE7-rjtv6FPlL0R56VxcNsS6KS7NJQhTDIM,1367
+qlever/__main__.py,sha256=MqM37bEzQeJEGUXZvuLcilIvnObZiG2eTGIkfKGpdnw,62016
+qlever/command.py,sha256=yOr0Uc8D8-AM7EjwDsVzbc3KNYjPH-FVOZhIHkqO588,2749
+qlever/config.py,sha256=-jjHAL8jdp25v53SqXKP4gWip6Qw9OdlDvFN6X7uk_4,10184
+qlever/containerize.py,sha256=p8g3O3G8a_0XLzSTzl_e5t9dqjbCQ-ippoA8vI2Z9pI,4193
+qlever/log.py,sha256=k9Mq4hxQ_d2k0e-5ZVgcB2XIRhOsGMO9I3rIR7YQyDA,1376
+qlever/qlever_main.py,sha256=k8vIQYK7zqObFNet11iLf--nrLdPooL5amprmlySi4k,2300
+qlever/qleverfile.py,sha256=6Ll81xkzel_s2Ju9ZfBXUGlRfikaAzZM6Do-dTrdo3k,12934
+qlever/util.py,sha256=eepj0SY9JJOUQq5kvtoPnWfoLLV9fbw_sTEWKHet66E,7147
+qlever/Qleverfiles/Qleverfile.dblp,sha256=SFjBD20aOSWod4mEQnxHSDWdInoE_EFp2nyMw7ev7ZA,1167
+qlever/Qleverfiles/Qleverfile.dblp-plus,sha256=Dwd9pK1vPcelKfw6sA-IuyhbZ6yIxOh6_84JgPYnB9Q,1332
+qlever/Qleverfiles/Qleverfile.default,sha256=mljl6I1RCkpIWOqMQwjzPZIsarYQx1R0mIlc583KuqU,1869
+qlever/Qleverfiles/Qleverfile.dnb,sha256=yw4MmLsDPP3P5JWPgJwgPJh66TqwkyUXbQR5lSf5oHc,1511
+qlever/Qleverfiles/Qleverfile.fbeasy,sha256=jeztW4gFpWL_w1nCH5qGHeZyZv2lz_kG6f1G3r3DkJ4,974
+qlever/Qleverfiles/Qleverfile.freebase,sha256=k6PqYrtHTBr0EydObm1Hg9QWyAAM9fXkdcjhReDg0fM,1035
+qlever/Qleverfiles/Qleverfile.imdb,sha256=uL5XlPwX01AmH-j6_Bc-PRm2fuPxGSIu8NaDflY525U,1623
+qlever/Qleverfiles/Qleverfile.olympics,sha256=5w9BOFwEBhdSzPz-0LRxwhv-7Gj6xbF539HOXr3cqD0,1088
+qlever/Qleverfiles/Qleverfile.osm-country,sha256=UnlkckSXJDrknZORlU-Hdj_J82U4kStl1aRctCc5n6M,1953
+qlever/Qleverfiles/Qleverfile.osm-planet,sha256=2RilNix0fplN3GsNNyOu3GzmUss1Pq7586WKOFAQnSs,1400
+qlever/Qleverfiles/Qleverfile.pubchem,sha256=bOhiJKUxzDiAm1UyXFPDQLYTqGc9jM8240fhobYLij0,3898
+qlever/Qleverfiles/Qleverfile.scientists,sha256=oFhzURcRFciA27GZ-ux_hsDe0esBLobWHC6h_Vf2xy8,1735
+qlever/Qleverfiles/Qleverfile.uniprot,sha256=FS8QLHvujbjUYyU2Ma0PRgfCWlulviaGLc_1csxpuic,2201
+qlever/Qleverfiles/Qleverfile.vvz,sha256=ftdMj5dCC9jAlFtNt2WR7kP30w0itT_iYtj5HoUVyWU,931
+qlever/Qleverfiles/Qleverfile.wikidata,sha256=fhWSChZTH3c2y14kgP1P5Duq1SsewTOK3wETf6RRmI8,1172
+qlever/Qleverfiles/Qleverfile.wikipathways,sha256=qWjfT-CVQCgRfN6fXPwBORMbjzXS_xsJ2DoCamQI7Rs,2045
+qlever/Qleverfiles/Qleverfile.yago-4,sha256=GikYPqChCtbAyZOVqszmVUwgQxSePTcgM8xw2b_21e4,1849
+qlever/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+qlever/commands/add_text_index.py,sha256=dkqYtwgOhgnXiei_eyhBWYCtdAiQUEmjWoa3JMlMb4c,3641
+qlever/commands/cache_stats.py,sha256=6JjueQstAqc8dNfgY8TP2EitFMxdUvCwrcyd7KUEb2o,4157
+qlever/commands/clear_cache.py,sha256=AnE1MOoj1ZexxrRT8FGeBLlv8rtQIVV4DP8VBn5-X-s,2843
+qlever/commands/example_queries.py,sha256=2rYTd35t0r7et0i-IBBcCpmVlYZya9kvwSI-gdTpNdE,12326
+qlever/commands/get_data.py,sha256=0fGuRLDB7YofHtpqk0ctq9_de_xeuliSmSZafGXAo1A,1470
+qlever/commands/index.py,sha256=lJhDnweknFZQm1czqPzNyz33EvbjIvOrS4j0wDaJ98o,5663
+qlever/commands/index_stats.py,sha256=_BiUNBhmbYd9RPxrlm4HF0oENO6JmqnRiAkwkyOdN4U,11722
+qlever/commands/log.py,sha256=8Krt3MsTUDapYqVw1zUu5X15SF8mV97Uj0qKOWK8jXk,1861
+qlever/commands/setup_config.py,sha256=mFkEtCPZ6oeVfehjVLrcLttYcPDgtwXHrNIWWzvHOfo,2928
+qlever/commands/start.py,sha256=2rOtk3NmhEs28D5csL_a1BdjSWU9VkcH6AqYT0vdww0,9285
+qlever/commands/status.py,sha256=5S6EdapZEwFKV9cQZtNYcZhMbAXAY-FP6ggjIhfX8ek,1631
+qlever/commands/stop.py,sha256=TZs4bxKHvujlZAU8BZmFjA5eXSZNAa6EeNzvPpEZsuI,4139
+qlever/commands/ui.py,sha256=rV8u017WLbfz0zVT_c9GC4d9v1WWwrTM3kfGONbeCvQ,2499
+qlever/commands/warmup.py,sha256=WOZSxeV8U_F6pEEnAb6YybXLQMxZFTRJXs4BPHUhsmc,1030
+qlever-0.4.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+qlever-0.4.2.dist-info/METADATA,sha256=tyLaWQtRaXbIaQkJ72mCcRpjxlusFHztHdAWedpZ1QE,17076
+qlever-0.4.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+qlever-0.4.2.dist-info/entry_points.txt,sha256=s0iWBHKRUzsJ7B6nVGiyMdOJtiOS84IJMSSxgbNU6LU,85
+qlever-0.4.2.dist-info/top_level.txt,sha256=kd3zsYqiFd0--Czh5XTVkfEq6XR-XgRFW35X0v0GT-c,7
+qlever-0.4.2.dist-info/RECORD,,
qlever-0.4.0.dist-info/RECORD REMOVED
@@ -1,30 +0,0 @@
-qlever/__init__.py,sha256=IyfS1OhlVE7-rjtv6FPlL0R56VxcNsS6KS7NJQhTDIM,1367
-qlever/__main__.py,sha256=MqM37bEzQeJEGUXZvuLcilIvnObZiG2eTGIkfKGpdnw,62016
-qlever/command.py,sha256=yOr0Uc8D8-AM7EjwDsVzbc3KNYjPH-FVOZhIHkqO588,2749
-qlever/config.py,sha256=LOVW8alFCVgZz_GAWm7vnjZVMVE7m3QTecy34lHgjGE,10017
-qlever/containerize.py,sha256=p8g3O3G8a_0XLzSTzl_e5t9dqjbCQ-ippoA8vI2Z9pI,4193
-qlever/log.py,sha256=k9Mq4hxQ_d2k0e-5ZVgcB2XIRhOsGMO9I3rIR7YQyDA,1376
-qlever/qlever_main.py,sha256=k8vIQYK7zqObFNet11iLf--nrLdPooL5amprmlySi4k,2300
-qlever/qleverfile.py,sha256=6Ll81xkzel_s2Ju9ZfBXUGlRfikaAzZM6Do-dTrdo3k,12934
-qlever/util.py,sha256=WM09PMRffUoPpEse4VwK9BzUavFkaB2Bm8KfVWxC3sQ,6161
-qlever/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-qlever/commands/add_text_index.py,sha256=dkqYtwgOhgnXiei_eyhBWYCtdAiQUEmjWoa3JMlMb4c,3641
-qlever/commands/cache_stats.py,sha256=6JjueQstAqc8dNfgY8TP2EitFMxdUvCwrcyd7KUEb2o,4157
-qlever/commands/clear_cache.py,sha256=AnE1MOoj1ZexxrRT8FGeBLlv8rtQIVV4DP8VBn5-X-s,2843
-qlever/commands/example_queries.py,sha256=3jlfHyL7pw1OSTuu3fY-23XaRAPIuEdNGW8QnIY2Va8,8644
-qlever/commands/get_data.py,sha256=0fGuRLDB7YofHtpqk0ctq9_de_xeuliSmSZafGXAo1A,1470
-qlever/commands/index.py,sha256=lJhDnweknFZQm1czqPzNyz33EvbjIvOrS4j0wDaJ98o,5663
-qlever/commands/index_stats.py,sha256=ao7_ySyz8MAjUvCbEp3Kj30PsR5x3MBM3ohgEUWdALM,11083
-qlever/commands/log.py,sha256=8Krt3MsTUDapYqVw1zUu5X15SF8mV97Uj0qKOWK8jXk,1861
-qlever/commands/setup_config.py,sha256=mFkEtCPZ6oeVfehjVLrcLttYcPDgtwXHrNIWWzvHOfo,2928
-qlever/commands/start.py,sha256=2rOtk3NmhEs28D5csL_a1BdjSWU9VkcH6AqYT0vdww0,9285
-qlever/commands/status.py,sha256=5S6EdapZEwFKV9cQZtNYcZhMbAXAY-FP6ggjIhfX8ek,1631
-qlever/commands/stop.py,sha256=TZs4bxKHvujlZAU8BZmFjA5eXSZNAa6EeNzvPpEZsuI,4139
-qlever/commands/ui.py,sha256=rV8u017WLbfz0zVT_c9GC4d9v1WWwrTM3kfGONbeCvQ,2499
-qlever/commands/warmup.py,sha256=WOZSxeV8U_F6pEEnAb6YybXLQMxZFTRJXs4BPHUhsmc,1030
-qlever-0.4.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-qlever-0.4.0.dist-info/METADATA,sha256=DuPh4u9Ukjt3-z31WK0mb_zj2OUV6bHnVLn1ESY7Gc0,17049
-qlever-0.4.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-qlever-0.4.0.dist-info/entry_points.txt,sha256=s0iWBHKRUzsJ7B6nVGiyMdOJtiOS84IJMSSxgbNU6LU,85
-qlever-0.4.0.dist-info/top_level.txt,sha256=kd3zsYqiFd0--Czh5XTVkfEq6XR-XgRFW35X0v0GT-c,7
-qlever-0.4.0.dist-info/RECORD,,