qlever 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of qlever might be problematic.

@@ -1,7 +1,7 @@
  # Qleverfile for DBLP, use with https://github.com/ad-freiburg/qlever-control
  #
  # qlever get-data # takes ~3 mins (downloads .ttl.gz file of size ~3 GB)
- # qlever index # takes ~3 mins (on an AMD Ryzen 9 5900X)
+ # qlever index # takes ~4 mins (on an AMD Ryzen 9 5900X)
  # qlever start # takes a few seconds

  [data]
@@ -0,0 +1,30 @@
+ # Qleverfile for DBpedia, use with https://github.com/ad-freiburg/qlever-control
+ #
+ # qlever get-data # ~14 GB, ~850 M triples (as of 30.07.2024)
+ # qlever index # ~20 min (on an AMD Ryzen 9 5900X)
+ # qlever start # ~3 sec
+
+ [data]
+ NAME = dbpedia
+ DATABUS_URL = https://databus.dbpedia.org/dbpedia/collections/latest-core
+ GET_DATA_CMD = curl -X POST -H "Accept: text/csv" --data-urlencode "query=$$(curl -s -H "Accept:text/sparql" https://databus.dbpedia.org/dbpedia/collections/latest-core)" https://databus.dbpedia.org/sparql | tail -n+2 | sed 's/\r$$//' | sed 's/"//g' | while read -r file; do wget -P rdf-input $$file; done
+ DESCRIPTION = RDF data from ${DATABUS_URL}
+
+ [index]
+ INPUT_FILES = rdf-input/*
+ CAT_INPUT_FILES = (cat rdf-input/*.nt; lbzcat -n2 rdf-input/*.bzip2 rdf-input/*.bz2)
+ SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 1000000, "prefixes-external": [""] }
+ WITH_TEXT_INDEX = false
+
+ [server]
+ PORT = 7012
+ ACCESS_TOKEN = ${data:NAME}
+ MEMORY_FOR_QUERIES = 10G
+ CACHE_MAX_SIZE = 5G
+
+ [runtime]
+ SYSTEM = docker
+ IMAGE = docker.io/adfreiburg/qlever:latest
+
+ [ui]
+ UI_CONFIG = dbpedia
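
The GET_DATA_CMD above is a single shell pipeline: it fetches the SPARQL query behind the Databus collection, posts it to the Databus SPARQL endpoint, and downloads every file URL from the returned CSV into rdf-input/. For illustration, a rough Python equivalent of that pipeline (a sketch only; it assumes the third-party `requests` library, which the package itself does not use):

    # Sketch of the DBpedia GET_DATA_CMD pipeline in Python (assumes `requests`).
    import os
    import requests

    COLLECTION_URL = "https://databus.dbpedia.org/dbpedia/collections/latest-core"

    # Requesting the collection with "Accept: text/sparql" returns the SPARQL query
    # that lists the download URLs of all files in the collection.
    query = requests.get(COLLECTION_URL, headers={"Accept": "text/sparql"}).text

    # POST that query to the Databus SPARQL endpoint and read the URLs from the CSV
    # (the shell version does the same with curl, tail and sed).
    csv_text = requests.post(
        "https://databus.dbpedia.org/sparql",
        data={"query": query},
        headers={"Accept": "text/csv"},
    ).text
    urls = [line.strip().strip('"') for line in csv_text.splitlines()[1:] if line.strip()]

    # Download each file into rdf-input/, mirroring `wget -P rdf-input`.
    os.makedirs("rdf-input", exist_ok=True)
    for url in urls:
        target = os.path.join("rdf-input", url.rsplit("/", 1)[-1])
        with requests.get(url, stream=True) as response, open(target, "wb") as out:
            for chunk in response.iter_content(chunk_size=1 << 20):
                out.write(chunk)
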
@@ -1,47 +1,51 @@
- # Automatically created by the "qlever" script
+ # Default Qleverfile, use with https://github.com/ad-freiburg/qlever-control
  #
- # Modify as you see fit. Beware that some of the values below are executed as
- # commands by the script.
- #
- # If you have never seen a Qleverfile before, we recommend that you look at the
- # pre-filled Qleverfiles on http://qlever.cs.uni-freiburg.de/qlever-control/
- # Qleverfiles first to get some inspiration. Or execute `qlever setup-config
- # <config name>` with a config name of your choice.
+ # If you have never seen a Qleverfile before, we recommend that you first look
+ # at the example Qleverfiles on http://qlever.cs.uni-freiburg.de/qlever-control/
+ # src/qlever/Qleverfiles . Or execute `qlever setup-config <dataset>` on the
+ # command line to obtain the example Qleverfiles for <dataset>.

  # As a minimum, each dataset needs a name. If you want `qlever get-data` to do
- # something meaningful, you need to define GET_DATA_CMD. If you want to use the
- # QLever UI, you should define DESCRIPTION (and if you have a text index,
- # also TEXT_DESCRIPTION).
+ # something meaningful, you need to define GET_DATA_CMD. Otherwise, you need to
+ # generate (or download or copy from somewhere) the input files yourself. Each
+ # dataset should have a short DESCRIPTION, ideally with a date.
  [data]
- NAME =
- # GET_DATA_CMD =
- # DESCRIPTION =
- # TEXT_DESCRIPTION =
+ NAME =
+ GET_DATA_CMD =
+ DESCRIPTION =

- # CAT_INPUT_FILES produces the data that is piped into QLever's index builder.
- # Use SETTINGS_JSON for more advanced configuration settings (see the other
- # Qleverfiles for examples).
+ # The format for INPUT_FILES should be such that `ls ${INPUT_FILES}` lists all
+ # input files. CAT_INPUT_FILES should write a concatenation of all input files
+ # to stdout. For example, if your input files are gzipped, you can write `zcat
+ # ${INPUT_FILES}`. Regarding SETTINGS_JSON, look at the other Qleverfiles for
+ # examples. Several batches of size `num-triples-per-batch` are kept in RAM at
+ # the same time; increasing this, increases the memory usage but speeds up the
+ # loading process.
  [index]
- # INPUT_FILES =
- # CAT_INPUT_FILES = cat ${INPUT_FILES}
- # SETTINGS_JSON = {}
+ INPUT_FILES = *.ttl
+ CAT_INPUT_FILES = cat ${INPUT_FILES}
+ SETTINGS_JSON = { "num-triples-per-batch": 1000000 }

- # As a minimum, you need to specify the PORT, where QLever will listen for
- # SPARQL queries. If you want to send priviledged commands to the server, you
- # need to specify an ACCESS_TOKEN (modify the random number below).
+ # The server listens on PORT. If you want to send privileged commands to the
+ # server, you need to specify an ACCESS_TOKEN, which you then have to set via a
+ # URL parameter `access_token`. It should not be easily guessable, unless you
+ # don't mind others to get privileged access to your server.
  [server]
- PORT = 7001
- # ACCESS_TOKEN = ${data:NAME}_1234567890
+ PORT =
+ ACCESS_TOKEN =

- # With USE_DOCKER = true, the qlever script will download the docker image for
- # you and run QLever inside docker containers. With USE_DOCKER = false, you need
- # the QLever binaries in the PATH of your sheel.
+ # Use SYSTEM = docker to run QLever inside a docker container; the Docker image
+ # will be downloaded automatically. Use SYSTEM = native to use self-compiled
+ # binaries `IndexBuilderMain` and `ServerMain` (which should be in you PATH).
  [runtime]
- SYSTEM = true
+ SYSTEM = docker
  IMAGE = docker.io/adfreiburg/qlever:latest

+ # UI_PORT specifies the port of the QLever UI web app, when you run `qlever ui`.
  # The UI_CONFIG must be one of the slugs from http://qlever.cs.uni-freiburg.de
  # (see the dropdown menu on the top right, the slug is the last part of the URL).
- # In partiular, this determines the example queries.
+ # It determines the example queries and which SPARQL queries are launched to
+ # obtain suggestions as you type a query.
  [ui]
+ UI_PORT = 8176
  UI_CONFIG = default
@@ -17,14 +17,14 @@
  [data]
  NAME = dnb
  BASE_URL = https://data.dnb.de/opendata
- GET_DATA_CMD = curl -L -C - --remote-name-all ${BASE_URL}/authorities-gnd_lds.nt.gz ${BASE_URL}/dnb-all_lds.nt.gz ${BASE_URL}/dnb-all_ldsprov.nt.gz ${BASE_URL}/zdb_lds.nt.gz 2>&1 | tee ${data:NAME}.getdata-log.txt
+ GET_DATA_CMD = curl -L -C - --remote-name-all --remote-time ${BASE_URL}/authorities-gnd_lds.nt.gz ${BASE_URL}/dnb-all_lds.nt.gz ${BASE_URL}/dnb-all_ldsprov.nt.gz ${BASE_URL}/zdb_lds.nt.gz 2>&1 | tee ${data:NAME}.getdata-log.txt
  VERSION = $$(date -r dnb-all_lds.nt.gz +%d.%m.%Y || echo "NO_DATE")
  DESCRIPTION = DNB data from ${BASE_URL} (authoritities-gnd_lds, dnb_all_lds, dnb-all_ldsprov, zdb_lds), version ${VERSION}

  [index]
  INPUT_FILES = *.nt.gz
- CAT_INPUT_FILES = zcat ${INPUT_FILES}
- SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 1000000 }
+ CAT_INPUT_FILES = zcat ${INPUT_FILES} | sed '/"\$$R0"/d;/"0\.03013\$$D"/d'
+ SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }

  [server]
  PORT = 7035
@@ -9,8 +9,8 @@
  [data]
  NAME = imdb
  IMDB_DATA_URL = https://datasets.imdbws.com
- GET_PREFIXES = echo "@prefix imdb: <https://www.imdb.com/> .\n"
- GET_IMDB_BASICS = FILE=title.basics.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ gsub("\"", "\\\"", $$3); printf "imdb:%s imdb:id \"%s\" ; imdb:type \"%s\" ; imdb:title \"%s\" .\n", $$1, $$1, $$2, $$3 }'; rm -f $${FILE}
+ GET_PREFIXES = echo "@prefix imdb: <https://www.imdb.com/> ."
+ GET_IMDB_BASICS = FILE=title.basics.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ gsub("\\\\", "\\\\", $$3); gsub("\"", "\\\"", $$3); printf "imdb:%s imdb:id \"%s\" ; imdb:type \"%s\" ; imdb:title \"%s\" .\n", $$1, $$1, $$2, $$3 }'; rm -f $${FILE}
  GET_IMDB_RATINGS = FILE=title.ratings.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ printf "imdb:%s imdb:averageRating %s ; imdb:numVotes %s .\n", $$1, $$2, $$3 }'; rm -f $${FILE}
  GET_DATA_CMD = (${GET_PREFIXES}; ${GET_IMDB_BASICS}; ${GET_IMDB_RATINGS}) > ${NAME}.ttl
  DESCRIPTION = RDF data derived from ${IMDB_DATA_URL}
@@ -18,17 +18,17 @@ TEXT_DESCRIPTION = All literals, search with FILTER CONTAINS(?var, "...")

  [index]
  INPUT_FILES = ${data:NAME}.ttl
- CAT_INPUT_FILES = cat ${FILE_NAMES}
+ CAT_INPUT_FILES = cat ${INPUT_FILES}
  SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }
  TEXT_INDEX = from_literals

  [server]
  PORT = 7029
- ACCESS_TOKEN = ${data:NAME}_1234567890
+ ACCESS_TOKEN = ${data:NAME}
  MEMORY_FOR_QUERIES = 5G

  [runtime]
- SYSTEM = docker
+ SYSTEM = native
  IMAGE = docker.io/adfreiburg/qlever:latest

  [ui]
@@ -1,60 +1,49 @@
  # Qleverfile for PubChem, use with https://github.com/ad-freiburg/qlever-control
  #
- # qlever get-data # downloads .gz files of total size 114 GB; see NOTES 2, 3, 4
- # qlever index # takes ~5 hours and ~20 GB RAM on an AMD Ryzen 9 5900X
- # qlever start # starts the server (a few seconds)
+ # Resource requirements (as of 18.08.2024, on an AMD Ryzen 9 5900X):
  #
- # IMPORTANT NOTES:
+ # qlever get-data # ~2 hours, ~150 GB, ~19 billion triples
+ # qlever index # ~7 hours, ~20 GB RAM, ~400 GB disk space
+ # qlever start # a few seconds
  #
- # NOTE 1: The SPARQL endpoint at https://qlever.cs.uni-freiburg.de/pubchem also
- # contains data from the following ontologies, which are very useful for
- # resolving names of IRIs like `sio:SIO_000008` or `obo:IAO_0000412`, but which
- # are not part of the PubChem RDF data. For the corresponding URLs, see
- # https://github.com/ad-freiburg/qlever/issues/711#issuecomment-1200479401 .
+ # NOTE 1: `get-data` does not only download the PubChem RDF data, but also
+ # a number of ontologies. These are very useful to obtain names for IRIs like
+ # `sio:SIO_000008` or `obo:IAO_0000412` (otherwise very hard to understand).
+ # The ontologies BAO and NDF-RT are infrequently updated, for latest versions,
+ # see the download links at https://bioportal.bioontology.org/ontologies/BAO
+ # and https://bioportal.bioontology.org/ontologies/NDF-RT .
  #
- # bao bfo biopax-level3 chebi cheminf cito dublin_core_terms fabio go iao ncit
- # obi pr ro sio skos so uo
- #
- # NOTE 2: The robots.txt file from https://ftp.ncbi.nlm.nih.gov currently
- # disallows downloading the PubChem RDF data using `wget --recursive` as in the
- # GET_DATA_CMD below. As a workaround, you can write a simple Python script
- # (using `BeautifulSoup` and `urllib.parse`) to scrape the URLs from the HTML
- # pages and download the files individually. This was done for the latest
- # version of https://qlever.cs.uni-freiburg.de/pubchem .
- #
- # NOTE 3: Many of the TTL files have generic prefix definitions in the middle
+ # NOTE 2: Many of the TTL files have generic prefix definitions in the middle
  # of the file, like @prefix ns23: <http://identifiers.org/biocyc/ARACYC:> .
  # See https://github.com/ad-freiburg/qlever/issues/711#issuecomment-1197113953
- # This is allowed by the standard, but VERY unusual. For use with QLever,
- # convert the TTL files to NT before indexing, see GET_DATA_CMD below.
- #
- # NOTE 4: Many of the files (TTL as well as NT) contain invalid IRIs because
- # spaces and braces are not properly escaped. Here is a simple awk-based script
- # to percent-encode spaces and braces in all IRIs in the NT files:
+ # This is allowed by the standard, but unusual. For use with QLever, we
+ # therefore convert the TTL files to NT when downloading them.
  #
- # for NTGZ in nt.${DATE}/*.nt.gz; do echo "zcat $NTGZ | sed 's/> />\t/1; s/> />\t/1; s/ \.\$/\t./' | awk 'BEGIN{FS=OFS=\"\t\"} {for (i = 1; i <= 3; i++) if (\$i ~ /^<.*>\$/) { gsub(/ /, \"%20\", \$i); gsub(/\[/, \"%5B\", \$i); gsub(/\]/, \"%5D\", \$i); gsub(/{/, \"%7B\", \$i); gsub(/}/, \"%7D\", \$i); } print }' | sed 's/\t/ /g' | gzip -c > nt.${DATE}.FIXED/$(basename $NTGZ)"; done > fix-nt.commands.txt
- # cat fix-nt.commands.txt | parallel
-
-
- [DEFAULT]
- NAME = pubchem
- DATE = 2024-02-03
+ # NOTE 3: The PubChem data contains several invalid IRIs, in particular,
+ # containing spaces. The previous version of this Qleverfile used a combination
+ # of `sed` and `awk` to fix this. In the meantime, QLever's default is to warn
+ # about such IRIs while indexing, but accept them anyway.

  [data]
- GET_DATA_URL = ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF
- MAKE_GET_DATA_CMD = curl -s ${GET_DATA_URL}/void.ttl | grep -oP '${GET_DATA_URL}/.*?\.ttl\.gz' | grep -v "nbr[23]d" | while read URL; do echo "echo \"Processing $$URL ...\"; curl --silent --remote-time --output ttl.${DATE}/$$(basename $$URL) $$URL && docker run --rm -v $$(pwd)/ttl.${DATE}:/data stain/jena turtle --output=NT /data/$$(basename $$URL) | sed 's/> />\t/1; s/> />\t/1; s/ \.\$$/\t./' | awk 'BEGIN{FS=OFS=\"\t\"} {for (i = 1; i <= 3; i++) if (\$$i ~ /^<.*>\$$/) { gsub(/ /, \"%20\", \$$i); gsub(/\[/, \"%5B\", \$$i); gsub(/\]/, \"%5D\", \$$i); gsub(/{/, \"%7B\", \$$i); gsub(/}/, \"%7D\", \$$i); } print }' | sed 's/\t/ /g' | gzip -c > nt.${DATE}/$$(basename -s .ttl.gz $$URL).nt.gz"; done > pubchem.get-data-cmds.txt
- GET_DATA_CMD = mkdir -p ttl.${DATE} && mkdir -p nt.${DATE} && ${MAKE_GET_DATA_CMD} && cat pubchem.get-data-cmds.txt | parallel --line-buffer
- DESCRIPTION = PubChem RDF from ${GET_DATA_URL}, version ${DATE} (all folders except nbr2d and nbr3d)
+ NAME = pubchem
+ GET_DATA_URL = ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF
+ CHECK_REQUIREMENTS = for CMD in docker parallel; do $$CMD --version >/dev/null 2>&1 || (echo "Requires \"$$CMD\", please install it"; false); done
+ MAKE_GET_DATA_CMD_1 = DIR=DATA.ontologies && mkdir -p $$DIR && cat $$DIR/ontologies.csv | while IFS=',' read -r DESC FILE URL; do ERRFILE=$${FILE%.*}.jena-stderr; echo "echo \"Processing $$URL ($$FILE) ...\" && curl -sLRo $$DIR/$$FILE \"$$URL\" && docker run --rm -v $$(pwd):/data stain/jena riot --output=NT /data/$$DIR/$$FILE 2> $$DIR/$$ERRFILE | gzip -c > $$DIR/$${FILE%.*}.nt.gz && rm -f $$DIR/$$FILE && if [ ! -s $$DIR/$$ERRFILE ]; then rm -f $$DIR/$$ERRFILE; fi || echo \"ERROR processing $$URL ($$FILE)\""; done > pubchem.get-data-cmds.txt
+ MAKE_GET_DATA_CMD_2 = DIR=DATA.pubchem && mkdir -p $$DIR && curl -LRO ${GET_DATA_URL}/void.ttl && grep -oP '${GET_DATA_URL}/.*?\.ttl\.gz' void.ttl | while read URL; do FILE=$$(basename $$URL); echo "echo \"Processing $$URL ...\" && curl -sLRo $$DIR/$$FILE \"$$URL\" && docker run -i --rm -v $$(pwd):/data stain/jena turtle --output=NT /data/$$DIR/$$FILE | gzip -c > $$DIR/$${FILE%%.*}.nt.gz && rm -f $$DIR/$$FILE || echo \"ERROR processing $$URL\""; done >> pubchem.get-data-cmds.txt
+ GET_DATA_CMD = ${CHECK_REQUIREMENTS} && ${MAKE_GET_DATA_CMD_1} && ${MAKE_GET_DATA_CMD_2} && cat pubchem.get-data-cmds.txt | parallel --line-buffer 2>&1 | tee pubchem.get-data-log.txt
+ VERSION = $$(date -r void.ttl +%d.%m.%Y || echo "NO_DATE")
+ DESCRIPTION = PubChem RDF from ${GET_DATA_URL} (version ${VERSION}) + associated ontologies (bao, bfo, biopax-level3, chebi, cheminf, cito, dublin_core_terms, fabio, go, iao, ncit, obi, pr, ro, sio, skos, so, uo)
+ MAKE_ONTOLOGIES_CSV = $$(mkdir -p DATA.ontologies && echo "BAO - BioAssay Ontology,bao.owl,https://data.bioontology.org/ontologies/BAO/submissions/56/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nBFO - Basic Formal Ontology,bfo.owl,http://purl.obolibrary.org/obo/bfo.owl\n BioPAX - biological pathway data,bp.owl,http://www.biopax.org/release/biopax-level3.owl\n CHEMINF - Chemical Information Ontology,cheminf.owl,http://purl.obolibrary.org/obo/cheminf.owl\n ChEBI - Chemical Entities of Biological Interest,chebi.owl,http://purl.obolibrary.org/obo/chebi.owl\n CiTO,cito.nt,http://purl.org/spar/cito.nt\n DCMI Terms,dcterms.nt,https://www.dublincore.org/specifications/dublin-core/dcmi-terms/dublin_core_terms.nt\n FaBiO,fabio.nt,http://purl.org/spar/fabio.nt\n GO - Gene Ontology,go.owl,http://purl.obolibrary.org/obo/go.owl\n IAO - Information Artifact Ontology,iao.owl,http://purl.obolibrary.org/obo/iao.owl\n NCIt,ncit.owl,http://purl.obolibrary.org/obo/ncit.owl\n NDF-RT,ndfrt.owl,https://data.bioontology.org/ontologies/NDF-RT/submissions/1/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\n OBI - Ontology for Biomedical Investigations,obi.owl,http://purl.obolibrary.org/obo/obi.owl\n OWL,owl.ttl,http://www.w3.org/2002/07/owl.ttl\n PDBo,pdbo.owl,http://rdf.wwpdb.org/schema/pdbx-v40.owl\n PR - PRotein Ontology (PRO),pr.owl,http://purl.obolibrary.org/obo/pr.owl\n RDF Schema,rdfs.ttl,https://www.w3.org/2000/01/rdf-schema.ttl\n RDF,rdf.ttl,http://www.w3.org/1999/02/22-rdf-syntax-ns.ttl\n RO - Relation Ontology,ro.owl,http://purl.obolibrary.org/obo/ro.owl\n SIO - Semanticscience Integrated Ontology,sio.owl,http://semanticscience.org/ontology/sio.owl\n SKOS,skos.rdf,http://www.w3.org/TR/skos-reference/skos.rdf\n SO - Sequence types and features ontology,so.owl,http://purl.obolibrary.org/obo/so.owl\n UO - Units of measurement ontology,uo.owl,http://purl.obolibrary.org/obo/uo.owl" > DATA.ontologies/ontologies.csv)

  [index]
- INPUT_FILES = pubchem.additional-ontologies.nt.gz nt.${DATE}/*.nt.gz
+ INPUT_FILES = DATA.ontologies/*.nt.gz DATA.pubchem/*.nt.gz
  CAT_INPUT_FILES = zcat ${INPUT_FILES}
- SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }
+ SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
  STXXL_MEMORY = 10G

  [server]
  PORT = 7023
- ACCESS_TOKEN = ${NAME}_310129823
+ ACCESS_TOKEN = ${data:NAME}
  MEMORY_FOR_QUERIES = 20G
  TIMEOUT = 120s

@@ -1,16 +1,16 @@
  # Qleverfile for WikiPathways, use with https://github.com/ad-freiburg/qlever-control
  #
- # qlever get-data # downloads .gz file of size ~100 MB (as of 24.02.2024)
+ # qlever get-data # takes ~3 seconds, generates TTL of size ~600 MB
  # qlever index # takes ~20 seconds and little RAM (on an AMD Ryzen 9 5900X)
- # qlever start # starts the server (takes around 2 minutes)
+ # qlever start # instant
  #
  # Limitations: does not include the ontologies (WP, GPML, ChEBI, PW, CLO, ...) yet

  [data]
  NAME = wikipathways
- RELEASE = 20231210
+ RELEASE = 20240810
  GET_DATA_URL = https://data.wikipathways.org/${RELEASE}/rdf
- GET_DATA_CMD = wget -O wikipathways-rdf-void.ttl ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-void.ttl && \
+ GET_DATA_CMD = wget -O wikipathways-rdf-void.ttl ${GET_DATA_URL}/wikipathways-rdf-void.ttl && \
  wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-wp.zip && \
  unzip -qq -c wikipathways-${RELEASE}-rdf-wp.zip -x wp/wpOntology.ttl > wikipathways-rdf-wp.ttl && \
  wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-gpml.zip &&
@@ -23,13 +23,13 @@ TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")

  [index]
  INPUT_FILES = ${data:NAME}.prefix-definitions wikipathways-rdf-wp.ttl wikipathways-rdf-gpml.ttl wikipathways-rdf-void.ttl wikipathways-rdf-authors.ttl
- CAT_INPUT_FILES = cat ${FILE_NAMES}
+ CAT_INPUT_FILES = cat ${INPUT_FILES}
  SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [""] }
  TEXT_INDEX = from_literals

  [server]
  PORT = 7040
- ACCESS_TOKEN = ${data:NAME}_7643543846
+ ACCESS_TOKEN = ${data:NAME}
  MEMORY_FOR_QUERIES = 5G

  [runtime]
@@ -59,17 +59,37 @@ class ExampleQueriesCommand(QleverCommand):
  "or just compute the size of the result")
  subparser.add_argument("--limit", type=int,
  help="Limit on the number of results")
+ subparser.add_argument("--remove-offset-and-limit",
+ action="store_true", default=False,
+ help="Remove OFFSET and LIMIT from the query")
  subparser.add_argument("--accept", type=str,
  choices=["text/tab-separated-values",
- "application/sparql-results+json"],
+ "text/csv",
+ "application/sparql-results+json",
+ "text/turtle"],
  default="text/tab-separated-values",
  help="Accept header for the SPARQL query")
  subparser.add_argument("--clear-cache",
  choices=["yes", "no"],
  default="yes",
  help="Clear the cache before each query")
+ subparser.add_argument("--width-query-description", type=int,
+ default=40,
+ help="Width for printing the query description")
+ subparser.add_argument("--width-error-message", type=int,
+ default=80,
+ help="Width for printing the error message "
+ "(0 = no limit)")
+ subparser.add_argument("--width-result-size", type=int,
+ default=14,
+ help="Width for printing the result size")

  def execute(self, args) -> bool:
+ # We can't have both `--remove-offset-and-limit` and `--limit`.
+ if args.remove_offset_and_limit and args.limit:
+ log.error("Cannot have both --remove-offset-and-limit and --limit")
+ return False
+
  # If `args.accept` is `application/sparql-results+json`, we need `jq`.
  if args.accept == "application/sparql-results+json":
  try:
@@ -153,26 +173,41 @@ class ExampleQueriesCommand(QleverCommand):
  with mute_log():
  ClearCacheCommand().execute(args)

- # Count query.
- if args.download_or_count == "count":
- # Find first string matching ?[a-zA-Z0-9_]+ in query.
- match = re.search(r"\?[a-zA-Z0-9_]+", query)
- if not match:
- log.error("Could not find a variable in this query:")
- log.info("")
- log.info(query)
- return False
- first_var = match.group(0)
- query = query.replace(
- "SELECT ",
- f"SELECT (COUNT({first_var}) AS {first_var}_count_) "
- f"WHERE {{ SELECT ", 1) + " }"
+ # Remove OFFSET and LIMIT (after the last closing bracket).
+ if args.remove_offset_and_limit or args.limit:
+ closing_bracket_idx = query.rfind("}")
+ regexes = [re.compile(r"OFFSET\s+\d+\s*", re.IGNORECASE),
+ re.compile(r"LIMIT\s+\d+\s*", re.IGNORECASE)]
+ for regex in regexes:
+ match = re.search(regex, query[closing_bracket_idx:])
+ if match:
+ query = query[:closing_bracket_idx + match.start()] + \
+ query[closing_bracket_idx + match.end():]

  # Limit query.
  if args.limit:
- query = query.replace(
- "SELECT ", "SELECT * WHERE { SELECT ", 1) \
- + f" }} LIMIT {args.limit}"
+ query += f" LIMIT {args.limit}"
+
+ # Count query.
+ if args.download_or_count == "count":
+ # First find out if there is a FROM clause.
+ regex_from_clause = re.compile(r"\s*FROM\s+<[^>]+>\s*",
+ re.IGNORECASE)
+ match_from_clause = re.search(regex_from_clause, query)
+ from_clause = " "
+ if match_from_clause:
+ from_clause = match_from_clause.group(0)
+ query = (query[:match_from_clause.start()] + " " +
+ query[match_from_clause.end():])
+ # Now we can add the outer SELECT COUNT(*).
+ query = re.sub(r"SELECT ",
+ "SELECT (COUNT(*) AS ?qlever_count_)"
+ + from_clause + "WHERE { SELECT ",
+ query, count=1, flags=re.IGNORECASE) + " }"
+
+ # A bit of pretty-printing.
+ query = re.sub(r"\s+", " ", query)
+ query = re.sub(r"\s*\.\s*\}", " }", query)

  # Launch query.
  try:
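
In words, the rewritten logic above first strips any OFFSET/LIMIT that follows the last closing brace, optionally appends a fresh LIMIT, and for count mode hoists a FROM clause (if present) and wraps the query in an outer SELECT (COUNT(*) ...). A self-contained sketch of that transformation (the helper name and the simplified pretty-printing are illustrative, not part of the package):

    # Standalone sketch of the query rewriting added in the hunk above
    # (hypothetical helper, not part of the package).
    import re

    def rewrite_query(query, limit=None, count_only=False, strip_offset_and_limit=False):
        # Strip OFFSET/LIMIT occurring after the last closing brace.
        if strip_offset_and_limit or limit:
            tail = query.rfind("}")
            for pattern in (r"OFFSET\s+\d+\s*", r"LIMIT\s+\d+\s*"):
                match = re.search(pattern, query[tail:], re.IGNORECASE)
                if match:
                    query = query[:tail + match.start()] + query[tail + match.end():]
        # Optionally append a fresh LIMIT.
        if limit:
            query += f" LIMIT {limit}"
        # For counting, hoist a FROM clause (if any) and wrap in an outer SELECT (COUNT(*) ...).
        if count_only:
            from_clause = " "
            from_match = re.search(r"\s*FROM\s+<[^>]+>\s*", query, re.IGNORECASE)
            if from_match:
                from_clause = from_match.group(0)
                query = query[:from_match.start()] + " " + query[from_match.end():]
            query = re.sub(r"SELECT ",
                           "SELECT (COUNT(*) AS ?qlever_count_)" + from_clause + "WHERE { SELECT ",
                           query, count=1, flags=re.IGNORECASE) + " }"
        # Simplified pretty-printing: collapse whitespace.
        return re.sub(r"\s+", " ", query).strip()

    # Example:
    # rewrite_query("SELECT ?s WHERE { ?s ?p ?o } LIMIT 10",
    #               count_only=True, strip_offset_and_limit=True)
    # -> "SELECT (COUNT(*) AS ?qlever_count_) WHERE { SELECT ?s WHERE { ?s ?p ?o } }"
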
@@ -214,10 +249,16 @@ class ExampleQueriesCommand(QleverCommand):
  f" | tonumber\" {result_file}",
  return_output=True)
  else:
- if args.accept == "text/tab-separated-values":
+ if (args.accept == "text/tab-separated-values"
+ or args.accept == "text/csv"):
  result_size = run_command(
  f"sed 1d {result_file} | wc -l",
  return_output=True)
+ elif args.accept == "text/turtle":
+ result_size = run_command(
+ f"sed '1d;/^@prefix/d;/^\\s*$/d' "
+ f"{result_file} | wc -l",
+ return_output=True)
  else:
  result_size = run_command(
  f"jq -r \".results.bindings | length\""
@@ -227,20 +268,30 @@ class ExampleQueriesCommand(QleverCommand):
  except Exception as e:
  error_msg = str(e)

+ # Remove the result file (unless in debug mode).
+ if args.log_level != "DEBUG":
+ Path(result_file).unlink(missing_ok=True)
+
  # Print description, time, result in tabular form.
- if (len(description) > 60):
- description = description[:57] + "..."
+ if len(description) > args.width_query_description:
+ description = description[:args.width_query_description - 3]
+ description += "..."
  if error_msg is None:
- log.info(f"{description:<60} {time_seconds:6.2f} s "
- f"{result_size:14,}")
+ log.info(f"{description:<{args.width_query_description}} "
+ f"{time_seconds:6.2f} s "
+ f"{result_size:>{args.width_result_size},}")
  count_succeeded += 1
  total_time_seconds += time_seconds
  total_result_size += result_size
  else:
  count_failed += 1
- if (len(error_msg) > 60) and args.log_level != "DEBUG":
- error_msg = error_msg[:57] + "..."
- log.error(f"{description:<60} failed "
+ if (args.width_error_message > 0
+ and len(error_msg) > args.width_error_message
+ and args.log_level != "DEBUG"):
+ error_msg = error_msg[:args.width_error_message - 3]
+ error_msg += "..."
+ log.error(f"{description:<{args.width_query_description}} "
+ f"failed "
  f"{colored(error_msg, 'red')}")

  # Print total time.
@@ -248,11 +299,11 @@ class ExampleQueriesCommand(QleverCommand):
  if count_succeeded > 0:
  query_or_queries = "query" if count_succeeded == 1 else "queries"
  description = (f"TOTAL for {count_succeeded} {query_or_queries}")
- log.info(f"{description:<60} "
+ log.info(f"{description:<{args.width_query_description}} "
  f"{total_time_seconds:6.2f} s "
  f"{total_result_size:>14,}")
  description = (f"AVERAGE for {count_succeeded} {query_or_queries}")
- log.info(f"{description:<60} "
+ log.info(f"{description:<{args.width_query_description}} "
  f"{total_time_seconds / count_succeeded:6.2f} s "
  f"{round(total_result_size / count_succeeded):>14,}")
  else:
@@ -262,6 +313,4 @@ class ExampleQueriesCommand(QleverCommand):
  log.info(colored("All queries failed", "red"))

  # Return success (has nothing to do with how many queries failed).
- if args.log_level != "DEBUG":
- Path(result_file).unlink(missing_ok=True)
  return True
qlever/commands/index.py CHANGED
@@ -25,7 +25,7 @@ class IndexCommand(QleverCommand):
  return True

  def relevant_qleverfile_arguments(self) -> dict[str: list[str]]:
- return {"data": ["name"],
+ return {"data": ["name", "format"],
  "index": ["input_files", "cat_input_files", "settings_json",
  "index_binary",
  "only_pso_and_pos_permutations", "use_patterns",
@@ -41,7 +41,7 @@ class IndexCommand(QleverCommand):
  def execute(self, args) -> bool:
  # Construct the command line.
  index_cmd = (f"{args.cat_input_files} | {args.index_binary}"
- f" -F ttl -f -"
+ f" -F {args.format} -"
  f" -i {args.name}"
  f" -s {args.name}.settings.json")
  if args.only_pso_and_pos_permutations:
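
With this change, the serialization format passed to the index builder comes from the new `format` setting in the [data] section (ttl, nt, or nq) instead of the hard-coded `-F ttl -f`. A rough illustration of the assembled command line, with placeholder values (the concrete names and dataset are only examples):

    # Illustration of the command line built above, with placeholder values.
    cat_input_files = "zcat *.nt.gz"
    index_binary = "IndexBuilderMain"
    data_format = "nt"          # from the new [data] FORMAT setting (ttl, nt or nq)
    name = "dnb"

    index_cmd = (f"{cat_input_files} | {index_binary}"
                 f" -F {data_format} -"
                 f" -i {name}"
                 f" -s {name}.settings.json")
    print(index_cmd)
    # zcat *.nt.gz | IndexBuilderMain -F nt - -i dnb -s dnb.settings.json
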
qlever/commands/ui.py CHANGED
@@ -5,6 +5,7 @@ import subprocess
  from qlever.command import QleverCommand
  from qlever.containerize import Containerize
  from qlever.log import log
+ from qlever.util import is_port_used


  class UiCommand(QleverCommand):
@@ -53,6 +54,10 @@
  Containerize.stop_and_remove_container(
  container_system, args.ui_container)

+ # Check if the UI port is already being used.
+ if is_port_used(args.ui_port):
+ log.warning(f"It looks like the specified port for the UI ({args.ui_port}) is already in use. You can set another port in the Qleverfile in the [ui] section with the UI_PORT variable.")
+
  # Try to start the QLever UI.
  try:
  subprocess.run(pull_cmd, shell=True, stdout=subprocess.DEVNULL)
@@ -65,5 +70,5 @@
  # Success.
  log.info(f"The QLever UI should now be up at {ui_url} ..."
  f"You can log in as QLever UI admin with username and "
- f"passwort \"demo\"")
+ f"password \"demo\"")
  return True
qlever/qlever_old.py CHANGED
@@ -985,7 +985,7 @@ class Actions:
  log.info(f"The QLever UI should now be up at "
  f"http://{host_name}:{self.config['ui']['port']}")
  log.info("You can log in as QLever UI admin with username and "
- "passwort \"demo\"")
+ "password \"demo\"")

  @track_action_rank
  def action_cache_stats_and_settings(self, only_show=False):
qlever/qleverfile.py CHANGED
@@ -51,8 +51,12 @@ class Qleverfile:
  help="A concise description of the dataset")
  data_args["text_description"] = arg(
  "--text-description", type=str, default=None,
- help="A concice description of the addtional text data"
+ help="A concise description of the additional text data"
  " if any")
+ data_args["format"] = arg(
+ "--format", type=str, default="ttl",
+ choices=["ttl", "nt", "nq"],
+ help="The format of the data")

  index_args["input_files"] = arg(
  "--input-files", type=str, required=True,
@@ -173,7 +177,7 @@
  help="The name of the container used by `qlever start`")

  ui_args["ui_port"] = arg(
- "--ui_port", type=int, default=7000,
+ "--ui-port", type=int, default=8176,
  help="The port of the Qlever UI when running `qlever ui`")
  ui_args["ui_config"] = arg(
  "--ui-config", type=str, default="default",
qlever/util.py CHANGED
@@ -1,7 +1,9 @@
  from __future__ import annotations

+ import errno
  import re
  import secrets
+ import socket
  import shlex
  import shutil
  import string
@@ -180,3 +182,21 @@ def get_random_string(length: int) -> str:
  """
  characters = string.ascii_letters + string.digits
  return "".join(secrets.choice(characters) for _ in range(length))
+
+
+ def is_port_used(port: int) -> bool:
+ """
+ Try to bind to the port on all interfaces to check if the port is already in use.
+ If the port is already in use, `socket.bind` will raise an `OSError` with errno EADDRINUSE.
+ """
+ try:
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ # Ensure that the port is not blocked after the check.
+ sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ sock.bind(('', port))
+ sock.close()
+ return False
+ except OSError as err:
+ if err.errno != errno.EADDRINUSE:
+ log.warning(f"Failed to determine if port is used: {err}")
+ return True
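
The new helper probes the port by binding a TCP socket on all interfaces; `qlever ui` (see the ui.py hunk above) uses it to warn when the UI port is taken. A minimal usage sketch (the port value is just the new default from the Qleverfile template):

    # Minimal usage sketch for the new helper.
    from qlever.util import is_port_used

    ui_port = 8176  # default UI_PORT in the new Qleverfile template
    if is_port_used(ui_port):
        print(f"Port {ui_port} is already in use, pick another UI_PORT")
    else:
        print(f"Port {ui_port} is free")
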
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: qlever
- Version: 0.5.3
+ Version: 0.5.5
  Summary: Script for using the QLever SPARQL engine.
  Author-email: Hannah Bast <bast@cs.uni-freiburg.de>
  License: Apache-2.0
@@ -77,6 +77,14 @@ There are many more commands and options, see `qlever --help` for general help,
  `qlever <command> --help` for help on a specific command, or just the
  autocompletion.

+ # Use with your own dataset
+
+ To use QLever with your own dataset, you should also write a `Qleverfile`, like
+ in the example above. The easiest way to write a `Qleverfile` is to get one of
+ the existing ones (using `qlever setup-config ...` as explained above) and then
+ change it according to your needs (the variable names should be self-explanatory).
+ Pick one for a dataset that is similar to yours and when in doubt, pick `olympics`.
+

  # For developers
  The (Python) code for the script is in the `*.py` files in `src/qlever`. The