qlever 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of qlever might be problematic.
- qlever/Qleverfiles/Qleverfile.dblp +1 -1
- qlever/Qleverfiles/Qleverfile.dbpedia +30 -0
- qlever/Qleverfiles/Qleverfile.default +35 -31
- qlever/Qleverfiles/Qleverfile.dnb +3 -3
- qlever/Qleverfiles/Qleverfile.imdb +5 -5
- qlever/Qleverfiles/Qleverfile.pubchem +29 -40
- qlever/Qleverfiles/Qleverfile.wikipathways +6 -6
- qlever/commands/example_queries.py +79 -30
- qlever/commands/index.py +2 -2
- qlever/commands/ui.py +6 -1
- qlever/qlever_old.py +1 -1
- qlever/qleverfile.py +6 -2
- qlever/util.py +20 -0
- {qlever-0.5.3.dist-info → qlever-0.5.5.dist-info}/METADATA +9 -1
- {qlever-0.5.3.dist-info → qlever-0.5.5.dist-info}/RECORD +19 -19
- {qlever-0.5.3.dist-info → qlever-0.5.5.dist-info}/WHEEL +1 -1
- qlever/__main__.py +0 -1476
- {qlever-0.5.3.dist-info → qlever-0.5.5.dist-info}/LICENSE +0 -0
- {qlever-0.5.3.dist-info → qlever-0.5.5.dist-info}/entry_points.txt +0 -0
- {qlever-0.5.3.dist-info → qlever-0.5.5.dist-info}/top_level.txt +0 -0
qlever/Qleverfiles/Qleverfile.dblp CHANGED
@@ -1,7 +1,7 @@
 # Qleverfile for DBLP, use with https://github.com/ad-freiburg/qlever-control
 #
 # qlever get-data # takes ~3 mins (downloads .ttl.gz file of size ~3 GB)
-# qlever index # takes ~
+# qlever index # takes ~4 mins (on an AMD Ryzen 9 5900X)
 # qlever start # takes a few seconds
 
 [data]
qlever/Qleverfiles/Qleverfile.dbpedia ADDED
@@ -0,0 +1,30 @@
+# Qleverfile for DBpedia, use with https://github.com/ad-freiburg/qlever-control
+#
+# qlever get-data # ~14 GB, ~850 M triples (as of 30.07.2024)
+# qlever index # ~20 min (on an AMD Ryzen 9 5900X)
+# qlever start # ~3 sec
+
+[data]
+NAME = dbpedia
+DATABUS_URL = https://databus.dbpedia.org/dbpedia/collections/latest-core
+GET_DATA_CMD = curl -X POST -H "Accept: text/csv" --data-urlencode "query=$$(curl -s -H "Accept:text/sparql" https://databus.dbpedia.org/dbpedia/collections/latest-core)" https://databus.dbpedia.org/sparql | tail -n+2 | sed 's/\r$$//' | sed 's/"//g' | while read -r file; do wget -P rdf-input $$file; done
+DESCRIPTION = RDF data from ${DATABUS_URL}
+
+[index]
+INPUT_FILES = rdf-input/*
+CAT_INPUT_FILES = (cat rdf-input/*.nt; lbzcat -n2 rdf-input/*.bzip2 rdf-input/*.bz2)
+SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 1000000, "prefixes-external": [""] }
+WITH_TEXT_INDEX = false
+
+[server]
+PORT = 7012
+ACCESS_TOKEN = ${data:NAME}
+MEMORY_FOR_QUERIES = 10G
+CACHE_MAX_SIZE = 5G
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_CONFIG = dbpedia
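The GET_DATA_CMD above is a dense shell pipeline. As a rough illustration only (a sketch assuming the third-party `requests` library; URLs taken from the Qleverfile above), here is the same flow in Python: the collection URL returns a SPARQL query when asked for `text/sparql`, that query is posted to the Databus endpoint, and each URL in the resulting CSV is downloaded into `rdf-input`.

import csv
import io
import os
import requests

DATABUS_URL = "https://databus.dbpedia.org/dbpedia/collections/latest-core"
SPARQL_ENDPOINT = "https://databus.dbpedia.org/sparql"

# The collection URL returns a SPARQL query when asked for text/sparql.
query = requests.get(DATABUS_URL, headers={"Accept": "text/sparql"}).text

# Posting that query to the endpoint yields a CSV with one file URL per row.
csv_text = requests.post(SPARQL_ENDPOINT, data={"query": query},
                         headers={"Accept": "text/csv"}).text

os.makedirs("rdf-input", exist_ok=True)
for row in csv.reader(io.StringIO(csv_text)):
    if not row or not row[0].startswith("http"):
        continue  # skips the CSV header row and blank lines
    url = row[0].strip()
    target = os.path.join("rdf-input", url.rsplit("/", 1)[-1])
    with open(target, "wb") as out:
        out.write(requests.get(url).content)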
qlever/Qleverfiles/Qleverfile.default CHANGED
@@ -1,47 +1,51 @@
-#
+# Default Qleverfile, use with https://github.com/ad-freiburg/qlever-control
 #
-#
-#
-#
-#
-# pre-filled Qleverfiles on http://qlever.cs.uni-freiburg.de/qlever-control/
-# Qleverfiles first to get some inspiration. Or execute `qlever setup-config
-# <config name>` with a config name of your choice.
+# If you have never seen a Qleverfile before, we recommend that you first look
+# at the example Qleverfiles on http://qlever.cs.uni-freiburg.de/qlever-control/
+# src/qlever/Qleverfiles . Or execute `qlever setup-config <dataset>` on the
+# command line to obtain the example Qleverfiles for <dataset>.
 
 # As a minimum, each dataset needs a name. If you want `qlever get-data` to do
-# something meaningful, you need to define GET_DATA_CMD.
-#
-#
+# something meaningful, you need to define GET_DATA_CMD. Otherwise, you need to
+# generate (or download or copy from somewhere) the input files yourself. Each
+# dataset should have a short DESCRIPTION, ideally with a date.
 [data]
-NAME
-
-
-# TEXT_DESCRIPTION =
+NAME =
+GET_DATA_CMD =
+DESCRIPTION =
 
-#
-#
-#
+# The format for INPUT_FILES should be such that `ls ${INPUT_FILES}` lists all
+# input files. CAT_INPUT_FILES should write a concatenation of all input files
+# to stdout. For example, if your input files are gzipped, you can write `zcat
+# ${INPUT_FILES}`. Regarding SETTINGS_JSON, look at the other Qleverfiles for
+# examples. Several batches of size `num-triples-per-batch` are kept in RAM at
+# the same time; increasing this, increases the memory usage but speeds up the
+# loading process.
 [index]
-
-
-
+INPUT_FILES = *.ttl
+CAT_INPUT_FILES = cat ${INPUT_FILES}
+SETTINGS_JSON = { "num-triples-per-batch": 1000000 }
 
-#
-#
-#
+# The server listens on PORT. If you want to send privileged commands to the
+# server, you need to specify an ACCESS_TOKEN, which you then have to set via a
+# URL parameter `access_token`. It should not be easily guessable, unless you
+# don't mind others to get privileged access to your server.
 [server]
-PORT
-
+PORT =
+ACCESS_TOKEN =
 
-#
-#
-#
+# Use SYSTEM = docker to run QLever inside a docker container; the Docker image
+# will be downloaded automatically. Use SYSTEM = native to use self-compiled
+# binaries `IndexBuilderMain` and `ServerMain` (which should be in you PATH).
 [runtime]
-SYSTEM =
+SYSTEM = docker
 IMAGE = docker.io/adfreiburg/qlever:latest
 
+# UI_PORT specifies the port of the QLever UI web app, when you run `qlever ui`.
 # The UI_CONFIG must be one of the slugs from http://qlever.cs.uni-freiburg.de
 # (see the dropdown menu on the top right, the slug is the last part of the URL).
-#
+# It determines the example queries and which SPARQL queries are launched to
+# obtain suggestions as you type a query.
 [ui]
+UI_PORT = 8176
 UI_CONFIG = default
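The `${INPUT_FILES}` and `${data:NAME}` references that appear throughout these Qleverfiles follow Python configparser syntax with extended interpolation (with `$$` as an escaped literal `$`). A minimal sketch of how such references resolve, assuming `configparser.ExtendedInterpolation` as the `${section:KEY}` syntax suggests:

from configparser import ConfigParser, ExtendedInterpolation

parser = ConfigParser(interpolation=ExtendedInterpolation())
parser.read_string("""
[data]
NAME = olympics

[index]
INPUT_FILES = ${data:NAME}.ttl
CAT_INPUT_FILES = cat ${INPUT_FILES}

[server]
ACCESS_TOKEN = ${data:NAME}
""")

# ${KEY} resolves within the same section, ${section:KEY} across sections.
print(parser["index"]["CAT_INPUT_FILES"])  # cat olympics.ttl
print(parser["server"]["ACCESS_TOKEN"])    # olympics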
qlever/Qleverfiles/Qleverfile.dnb CHANGED
@@ -17,14 +17,14 @@
 [data]
 NAME = dnb
 BASE_URL = https://data.dnb.de/opendata
-GET_DATA_CMD = curl -L -C - --remote-name-all ${BASE_URL}/authorities-gnd_lds.nt.gz ${BASE_URL}/dnb-all_lds.nt.gz ${BASE_URL}/dnb-all_ldsprov.nt.gz ${BASE_URL}/zdb_lds.nt.gz 2>&1 | tee ${data:NAME}.getdata-log.txt
+GET_DATA_CMD = curl -L -C - --remote-name-all --remote-time ${BASE_URL}/authorities-gnd_lds.nt.gz ${BASE_URL}/dnb-all_lds.nt.gz ${BASE_URL}/dnb-all_ldsprov.nt.gz ${BASE_URL}/zdb_lds.nt.gz 2>&1 | tee ${data:NAME}.getdata-log.txt
 VERSION = $$(date -r dnb-all_lds.nt.gz +%d.%m.%Y || echo "NO_DATE")
 DESCRIPTION = DNB data from ${BASE_URL} (authoritities-gnd_lds, dnb_all_lds, dnb-all_ldsprov, zdb_lds), version ${VERSION}
 
 [index]
 INPUT_FILES = *.nt.gz
-CAT_INPUT_FILES = zcat ${INPUT_FILES}
-SETTINGS_JSON = { "ascii-prefixes-only":
+CAT_INPUT_FILES = zcat ${INPUT_FILES} | sed '/"\$$R0"/d;/"0\.03013\$$D"/d'
+SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }
 
 [server]
 PORT = 7035
qlever/Qleverfiles/Qleverfile.imdb CHANGED
@@ -9,8 +9,8 @@
 [data]
 NAME = imdb
 IMDB_DATA_URL = https://datasets.imdbws.com
-GET_PREFIXES = echo "@prefix imdb: <https://www.imdb.com/>
-GET_IMDB_BASICS = FILE=title.basics.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ gsub("\"", "\\\"", $$3); printf "imdb:%s imdb:id \"%s\" ; imdb:type \"%s\" ; imdb:title \"%s\" .\n", $$1, $$1, $$2, $$3 }'; rm -f $${FILE}
+GET_PREFIXES = echo "@prefix imdb: <https://www.imdb.com/> ."
+GET_IMDB_BASICS = FILE=title.basics.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ gsub("\\\\", "\\\\", $$3); gsub("\"", "\\\"", $$3); printf "imdb:%s imdb:id \"%s\" ; imdb:type \"%s\" ; imdb:title \"%s\" .\n", $$1, $$1, $$2, $$3 }'; rm -f $${FILE}
 GET_IMDB_RATINGS = FILE=title.ratings.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ printf "imdb:%s imdb:averageRating %s ; imdb:numVotes %s .\n", $$1, $$2, $$3 }'; rm -f $${FILE}
 GET_DATA_CMD = (${GET_PREFIXES}; ${GET_IMDB_BASICS}; ${GET_IMDB_RATINGS}) > ${NAME}.ttl
 DESCRIPTION = RDF data derived from ${IMDB_DATA_URL}
@@ -18,17 +18,17 @@ TEXT_DESCRIPTION = All literals, search with FILTER CONTAINS(?var, "...")
 
 [index]
 INPUT_FILES = ${data:NAME}.ttl
-CAT_INPUT_FILES = cat ${
+CAT_INPUT_FILES = cat ${INPUT_FILES}
 SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }
 TEXT_INDEX = from_literals
 
 [server]
 PORT = 7029
-ACCESS_TOKEN = ${data:NAME}
+ACCESS_TOKEN = ${data:NAME}
 MEMORY_FOR_QUERIES = 5G
 
 [runtime]
-SYSTEM =
+SYSTEM = native
 IMAGE = docker.io/adfreiburg/qlever:latest
 
 [ui]
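The change to GET_IMDB_BASICS adds a `gsub` for backslashes before the existing one for double quotes, so that titles survive as valid Turtle string literals. A hypothetical Python rendering of that escaping step (the sample row is made up for illustration):

def escape_turtle_literal(s: str) -> str:
    # Backslashes must be escaped before quotes, in this order; otherwise
    # the backslashes introduced by quote-escaping would be doubled again.
    return s.replace("\\", "\\\\").replace('"', '\\"')

# One (made-up) row of title.basics.tsv: tconst, titleType, primaryTitle.
tconst, title_type, title = "tt0000001", "short", 'Carmencita \\ "debut"'
print(f'imdb:{tconst} imdb:id "{tconst}" ; imdb:type "{title_type}" ; '
      f'imdb:title "{escape_turtle_literal(title)}" .')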
qlever/Qleverfiles/Qleverfile.pubchem CHANGED
@@ -1,60 +1,49 @@
 # Qleverfile for PubChem, use with https://github.com/ad-freiburg/qlever-control
 #
-#
-# qlever index # takes ~5 hours and ~20 GB RAM on an AMD Ryzen 9 5900X
-# qlever start # starts the server (a few seconds)
+# Resource requirements (as of 18.08.2024, on an AMD Ryzen 9 5900X):
 #
-#
+# qlever get-data # ~2 hours, ~150 GB, ~19 billion triples
+# qlever index # ~7 hours, ~20 GB RAM, ~400 GB disk space
+# qlever start # a few seconds
 #
-# NOTE 1:
-#
-#
-#
-# https://
+# NOTE 1: `get-data` does not only download the PubChem RDF data, but also
+# a number of ontologies. These are very useful to obtain names for IRIs like
+# `sio:SIO_000008` or `obo:IAO_0000412` (otherwise very hard to understand).
+# The ontologies BAO and NDF-RT are infrequently updated, for latest versions,
+# see the download links at https://bioportal.bioontology.org/ontologies/BAO
+# and https://bioportal.bioontology.org/ontologies/NDF-RT .
 #
-#
-# obi pr ro sio skos so uo
-#
-# NOTE 2: The robots.txt file from https://ftp.ncbi.nlm.nih.gov currently
-# disallows downloading the PubChem RDF data using `wget --recursive` as in the
-# GET_DATA_CMD below. As a workaround, you can write a simple Python script
-# (using `BeautifulSoup` and `urllib.parse`) to scrape the URLs from the HTML
-# pages and download the files individually. This was done for the latest
-# version of https://qlever.cs.uni-freiburg.de/pubchem .
-#
-# NOTE 3: Many of the TTL files have generic prefix definitions in the middle
+# NOTE 2: Many of the TTL files have generic prefix definitions in the middle
 # of the file, like @prefix ns23: <http://identifiers.org/biocyc/ARACYC:> .
 # See https://github.com/ad-freiburg/qlever/issues/711#issuecomment-1197113953
-# This is allowed by the standard, but
-# convert the TTL files to NT
-#
-# NOTE 4: Many of the files (TTL as well as NT) contain invalid IRIs because
-# spaces and braces are not properly escaped. Here is a simple awk-based script
-# to percent-encode spaces and braces in all IRIs in the NT files:
+# This is allowed by the standard, but unusual. For use with QLever, we
+# therefore convert the TTL files to NT when downloading them.
 #
-#
-#
-
-
-[DEFAULT]
-NAME = pubchem
-DATE = 2024-02-03
+# NOTE 3: The PubChem data contains several invalid IRIs, in particular,
+# containing spaces. The previous version of this Qleverfile used a combination
+# of `sed` and `awk` to fix this. In the meantime, QLever's default is to warn
+# about such IRIs while indexing, but accept them anyway.
 
 [data]
-
-
-
-
+NAME = pubchem
+GET_DATA_URL = ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF
+CHECK_REQUIREMENTS = for CMD in docker parallel; do $$CMD --version >/dev/null 2>&1 || (echo "Requires \"$$CMD\", please install it"; false); done
+MAKE_GET_DATA_CMD_1 = DIR=DATA.ontologies && mkdir -p $$DIR && cat $$DIR/ontologies.csv | while IFS=',' read -r DESC FILE URL; do ERRFILE=$${FILE%.*}.jena-stderr; echo "echo \"Processing $$URL ($$FILE) ...\" && curl -sLRo $$DIR/$$FILE \"$$URL\" && docker run --rm -v $$(pwd):/data stain/jena riot --output=NT /data/$$DIR/$$FILE 2> $$DIR/$$ERRFILE | gzip -c > $$DIR/$${FILE%.*}.nt.gz && rm -f $$DIR/$$FILE && if [ ! -s $$DIR/$$ERRFILE ]; then rm -f $$DIR/$$ERRFILE; fi || echo \"ERROR processing $$URL ($$FILE)\""; done > pubchem.get-data-cmds.txt
+MAKE_GET_DATA_CMD_2 = DIR=DATA.pubchem && mkdir -p $$DIR && curl -LRO ${GET_DATA_URL}/void.ttl && grep -oP '${GET_DATA_URL}/.*?\.ttl\.gz' void.ttl | while read URL; do FILE=$$(basename $$URL); echo "echo \"Processing $$URL ...\" && curl -sLRo $$DIR/$$FILE \"$$URL\" && docker run -i --rm -v $$(pwd):/data stain/jena turtle --output=NT /data/$$DIR/$$FILE | gzip -c > $$DIR/$${FILE%%.*}.nt.gz && rm -f $$DIR/$$FILE || echo \"ERROR processing $$URL\""; done >> pubchem.get-data-cmds.txt
+GET_DATA_CMD = ${CHECK_REQUIREMENTS} && ${MAKE_GET_DATA_CMD_1} && ${MAKE_GET_DATA_CMD_2} && cat pubchem.get-data-cmds.txt | parallel --line-buffer 2>&1 | tee pubchem.get-data-log.txt
+VERSION = $$(date -r void.ttl +%d.%m.%Y || echo "NO_DATE")
+DESCRIPTION = PubChem RDF from ${GET_DATA_URL} (version ${VERSION}) + associated ontologies (bao, bfo, biopax-level3, chebi, cheminf, cito, dublin_core_terms, fabio, go, iao, ncit, obi, pr, ro, sio, skos, so, uo)
+MAKE_ONTOLOGIES_CSV = $$(mkdir -p DATA.ontologies && echo "BAO - BioAssay Ontology,bao.owl,https://data.bioontology.org/ontologies/BAO/submissions/56/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nBFO - Basic Formal Ontology,bfo.owl,http://purl.obolibrary.org/obo/bfo.owl\n BioPAX - biological pathway data,bp.owl,http://www.biopax.org/release/biopax-level3.owl\n CHEMINF - Chemical Information Ontology,cheminf.owl,http://purl.obolibrary.org/obo/cheminf.owl\n ChEBI - Chemical Entities of Biological Interest,chebi.owl,http://purl.obolibrary.org/obo/chebi.owl\n CiTO,cito.nt,http://purl.org/spar/cito.nt\n DCMI Terms,dcterms.nt,https://www.dublincore.org/specifications/dublin-core/dcmi-terms/dublin_core_terms.nt\n FaBiO,fabio.nt,http://purl.org/spar/fabio.nt\n GO - Gene Ontology,go.owl,http://purl.obolibrary.org/obo/go.owl\n IAO - Information Artifact Ontology,iao.owl,http://purl.obolibrary.org/obo/iao.owl\n NCIt,ncit.owl,http://purl.obolibrary.org/obo/ncit.owl\n NDF-RT,ndfrt.owl,https://data.bioontology.org/ontologies/NDF-RT/submissions/1/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\n OBI - Ontology for Biomedical Investigations,obi.owl,http://purl.obolibrary.org/obo/obi.owl\n OWL,owl.ttl,http://www.w3.org/2002/07/owl.ttl\n PDBo,pdbo.owl,http://rdf.wwpdb.org/schema/pdbx-v40.owl\n PR - PRotein Ontology (PRO),pr.owl,http://purl.obolibrary.org/obo/pr.owl\n RDF Schema,rdfs.ttl,https://www.w3.org/2000/01/rdf-schema.ttl\n RDF,rdf.ttl,http://www.w3.org/1999/02/22-rdf-syntax-ns.ttl\n RO - Relation Ontology,ro.owl,http://purl.obolibrary.org/obo/ro.owl\n SIO - Semanticscience Integrated Ontology,sio.owl,http://semanticscience.org/ontology/sio.owl\n SKOS,skos.rdf,http://www.w3.org/TR/skos-reference/skos.rdf\n SO - Sequence types and features ontology,so.owl,http://purl.obolibrary.org/obo/so.owl\n UO - Units of measurement ontology,uo.owl,http://purl.obolibrary.org/obo/uo.owl" > DATA.ontologies/ontologies.csv)
 
 [index]
-INPUT_FILES =
+INPUT_FILES = DATA.ontologies/*.nt.gz DATA.pubchem/*.nt.gz
 CAT_INPUT_FILES = zcat ${INPUT_FILES}
-SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch":
+SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
 STXXL_MEMORY = 10G
 
 [server]
 PORT = 7023
-ACCESS_TOKEN = ${NAME}
+ACCESS_TOKEN = ${data:NAME}
 MEMORY_FOR_QUERIES = 20G
 TIMEOUT = 120s
 
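The new GET_DATA_CMD first writes one self-contained shell command per file into pubchem.get-data-cmds.txt and then executes them with GNU parallel. A rough Python equivalent of that second step, using a thread pool instead of parallel (a sketch, not what the package runs):

from concurrent.futures import ThreadPoolExecutor
import subprocess

with open("pubchem.get-data-cmds.txt") as f:
    commands = [line.strip() for line in f if line.strip()]

def run(cmd: str) -> int:
    # Each line is a self-contained download-and-convert shell command.
    return subprocess.run(cmd, shell=True).returncode

with ThreadPoolExecutor(max_workers=8) as pool:
    codes = list(pool.map(run, commands))

print(f"{sum(c != 0 for c in codes)} of {len(codes)} commands failed")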
qlever/Qleverfiles/Qleverfile.wikipathways CHANGED
@@ -1,16 +1,16 @@
 # Qleverfile for WikiPathways, use with https://github.com/ad-freiburg/qlever-control
 #
-# qlever get-data #
+# qlever get-data # takes ~3 seconds, generates TTL of size ~600 MB
 # qlever index # takes ~20 seconds and little RAM (on an AMD Ryzen 9 5900X)
-# qlever start #
+# qlever start # instant
 #
 # Limitations: does not include the ontologies (WP, GPML, ChEBI, PW, CLO, ...) yet
 
 [data]
 NAME = wikipathways
-RELEASE =
+RELEASE = 20240810
 GET_DATA_URL = https://data.wikipathways.org/${RELEASE}/rdf
-GET_DATA_CMD = wget -O wikipathways-rdf-void.ttl ${GET_DATA_URL}/wikipathways
+GET_DATA_CMD = wget -O wikipathways-rdf-void.ttl ${GET_DATA_URL}/wikipathways-rdf-void.ttl && \
 wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-wp.zip && \
 unzip -qq -c wikipathways-${RELEASE}-rdf-wp.zip -x wp/wpOntology.ttl > wikipathways-rdf-wp.ttl && \
 wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-gpml.zip &&
@@ -23,13 +23,13 @@ TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")
 
 [index]
 INPUT_FILES = ${data:NAME}.prefix-definitions wikipathways-rdf-wp.ttl wikipathways-rdf-gpml.ttl wikipathways-rdf-void.ttl wikipathways-rdf-authors.ttl
-CAT_INPUT_FILES = cat ${
+CAT_INPUT_FILES = cat ${INPUT_FILES}
 SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [""] }
 TEXT_INDEX = from_literals
 
 [server]
 PORT = 7040
-ACCESS_TOKEN = ${data:NAME}
+ACCESS_TOKEN = ${data:NAME}
 MEMORY_FOR_QUERIES = 5G
 
 [runtime]
qlever/commands/example_queries.py CHANGED
@@ -59,17 +59,37 @@ class ExampleQueriesCommand(QleverCommand):
                                 "or just compute the size of the result")
         subparser.add_argument("--limit", type=int,
                                help="Limit on the number of results")
+        subparser.add_argument("--remove-offset-and-limit",
+                               action="store_true", default=False,
+                               help="Remove OFFSET and LIMIT from the query")
         subparser.add_argument("--accept", type=str,
                                choices=["text/tab-separated-values",
-                                        "
+                                        "text/csv",
+                                        "application/sparql-results+json",
+                                        "text/turtle"],
                                default="text/tab-separated-values",
                                help="Accept header for the SPARQL query")
         subparser.add_argument("--clear-cache",
                                choices=["yes", "no"],
                                default="yes",
                                help="Clear the cache before each query")
+        subparser.add_argument("--width-query-description", type=int,
+                               default=40,
+                               help="Width for printing the query description")
+        subparser.add_argument("--width-error-message", type=int,
+                               default=80,
+                               help="Width for printing the error message "
+                                    "(0 = no limit)")
+        subparser.add_argument("--width-result-size", type=int,
+                               default=14,
+                               help="Width for printing the result size")
 
     def execute(self, args) -> bool:
+        # We can't have both `--remove-offset-and-limit` and `--limit`.
+        if args.remove_offset_and_limit and args.limit:
+            log.error("Cannot have both --remove-offset-and-limit and --limit")
+            return False
+
         # If `args.accept` is `application/sparql-results+json`, we need `jq`.
         if args.accept == "application/sparql-results+json":
             try:
@@ -153,26 +173,41 @@ class ExampleQueriesCommand(QleverCommand):
             with mute_log():
                 ClearCacheCommand().execute(args)
 
-        #
-        if args.
-
-
-
-
-
-
-
-            query = query.replace(
-                "SELECT ",
-                f"SELECT (COUNT({first_var}) AS {first_var}_count_) "
-                f"WHERE {{ SELECT ", 1) + " }"
+        # Remove OFFSET and LIMIT (after the last closing bracket).
+        if args.remove_offset_and_limit or args.limit:
+            closing_bracket_idx = query.rfind("}")
+            regexes = [re.compile(r"OFFSET\s+\d+\s*", re.IGNORECASE),
+                       re.compile(r"LIMIT\s+\d+\s*", re.IGNORECASE)]
+            for regex in regexes:
+                match = re.search(regex, query[closing_bracket_idx:])
+                if match:
+                    query = query[:closing_bracket_idx + match.start()] + \
+                            query[closing_bracket_idx + match.end():]
 
         # Limit query.
         if args.limit:
-            query
-
-
+            query += f" LIMIT {args.limit}"
+
+        # Count query.
+        if args.download_or_count == "count":
+            # First find out if there is a FROM clause.
+            regex_from_clause = re.compile(r"\s*FROM\s+<[^>]+>\s*",
+                                           re.IGNORECASE)
+            match_from_clause = re.search(regex_from_clause, query)
+            from_clause = " "
+            if match_from_clause:
+                from_clause = match_from_clause.group(0)
+                query = (query[:match_from_clause.start()] + " " +
+                         query[match_from_clause.end():])
+            # Now we can add the outer SELECT COUNT(*).
+            query = re.sub(r"SELECT ",
+                           "SELECT (COUNT(*) AS ?qlever_count_)"
+                           + from_clause + "WHERE { SELECT ",
+                           query, count=1, flags=re.IGNORECASE) + " }"
+
+        # A bit of pretty-printing.
+        query = re.sub(r"\s+", " ", query)
+        query = re.sub(r"\s*\.\s*\}", " }", query)
 
         # Launch query.
         try:
@@ -214,10 +249,16 @@ class ExampleQueriesCommand(QleverCommand):
                     f" | tonumber\" {result_file}",
                     return_output=True)
             else:
-                if args.accept == "text/tab-separated-values"
+                if (args.accept == "text/tab-separated-values"
+                        or args.accept == "text/csv"):
                     result_size = run_command(
                         f"sed 1d {result_file} | wc -l",
                         return_output=True)
+                elif args.accept == "text/turtle":
+                    result_size = run_command(
+                        f"sed '1d;/^@prefix/d;/^\\s*$/d' "
+                        f"{result_file} | wc -l",
+                        return_output=True)
                 else:
                     result_size = run_command(
                         f"jq -r \".results.bindings | length\""
@@ -227,20 +268,30 @@ class ExampleQueriesCommand(QleverCommand):
             except Exception as e:
                 error_msg = str(e)
 
+            # Remove the result file (unless in debug mode).
+            if args.log_level != "DEBUG":
+                Path(result_file).unlink(missing_ok=True)
+
             # Print description, time, result in tabular form.
-            if
-                description = description[:
+            if len(description) > args.width_query_description:
+                description = description[:args.width_query_description - 3]
+                description += "..."
             if error_msg is None:
-                log.info(f"{description:<
-                         f"{
+                log.info(f"{description:<{args.width_query_description}} "
+                         f"{time_seconds:6.2f} s "
+                         f"{result_size:>{args.width_result_size},}")
                 count_succeeded += 1
                 total_time_seconds += time_seconds
                 total_result_size += result_size
             else:
                 count_failed += 1
-                if (
-
-
+                if (args.width_error_message > 0
+                        and len(error_msg) > args.width_error_message
+                        and args.log_level != "DEBUG"):
+                    error_msg = error_msg[:args.width_error_message - 3]
+                    error_msg += "..."
+                log.error(f"{description:<{args.width_query_description}} "
+                          f"failed "
                           f"{colored(error_msg, 'red')}")
 
         # Print total time.
@@ -248,11 +299,11 @@ class ExampleQueriesCommand(QleverCommand):
         if count_succeeded > 0:
             query_or_queries = "query" if count_succeeded == 1 else "queries"
             description = (f"TOTAL for {count_succeeded} {query_or_queries}")
-            log.info(f"{description:<
+            log.info(f"{description:<{args.width_query_description}} "
                      f"{total_time_seconds:6.2f} s "
                      f"{total_result_size:>14,}")
             description = (f"AVERAGE for {count_succeeded} {query_or_queries}")
-            log.info(f"{description:<
+            log.info(f"{description:<{args.width_query_description}} "
                      f"{total_time_seconds / count_succeeded:6.2f} s "
                      f"{round(total_result_size / count_succeeded):>14,}")
         else:
@@ -262,6 +313,4 @@ class ExampleQueriesCommand(QleverCommand):
             log.info(colored("All queries failed", "red"))
 
         # Return success (has nothing to do with how many queries failed).
-        if args.log_level != "DEBUG":
-            Path(result_file).unlink(missing_ok=True)
         return True
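To see what the new OFFSET/LIMIT removal and COUNT rewrite do, here is the same logic applied standalone to a toy query (the query string is made up for illustration):

import re

query = "SELECT ?x FROM <http://example.org/g> WHERE { ?x ?p ?o } LIMIT 100"

# Remove OFFSET and LIMIT (after the last closing bracket).
idx = query.rfind("}")
for pattern in (r"OFFSET\s+\d+\s*", r"LIMIT\s+\d+\s*"):
    match = re.search(pattern, query[idx:], re.IGNORECASE)
    if match:
        query = query[:idx + match.start()] + query[idx + match.end():]

# Pull out the FROM clause, then wrap the query in an outer SELECT COUNT(*).
match = re.search(r"\s*FROM\s+<[^>]+>\s*", query, re.IGNORECASE)
from_clause = " "
if match:
    from_clause = match.group(0)
    query = query[:match.start()] + " " + query[match.end():]
query = re.sub(r"SELECT ",
               "SELECT (COUNT(*) AS ?qlever_count_)" + from_clause
               + "WHERE { SELECT ",
               query, count=1, flags=re.IGNORECASE) + " }"

print(query)
# (modulo whitespace) SELECT (COUNT(*) AS ?qlever_count_)
# FROM <http://example.org/g> WHERE { SELECT ?x WHERE { ?x ?p ?o } }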
qlever/commands/index.py
CHANGED
@@ -25,7 +25,7 @@ class IndexCommand(QleverCommand):
         return True
 
     def relevant_qleverfile_arguments(self) -> dict[str: list[str]]:
-        return {"data": ["name"],
+        return {"data": ["name", "format"],
                 "index": ["input_files", "cat_input_files", "settings_json",
                           "index_binary",
                           "only_pso_and_pos_permutations", "use_patterns",
@@ -41,7 +41,7 @@ class IndexCommand(QleverCommand):
     def execute(self, args) -> bool:
         # Construct the command line.
         index_cmd = (f"{args.cat_input_files} | {args.index_binary}"
-                     f" -F
+                     f" -F {args.format} -"
                      f" -i {args.name}"
                      f" -s {args.name}.settings.json")
         if args.only_pso_and_pos_permutations:
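With the new `format` argument wired in, the constructed index command looks like this for sample values (names taken from the Qleverfiles above, purely illustrative):

cat_input_files = "zcat *.nt.gz"    # index: CAT_INPUT_FILES
index_binary = "IndexBuilderMain"   # index: INDEX_BINARY
fmt, name = "nt", "dnb"             # data: FORMAT, NAME

index_cmd = (f"{cat_input_files} | {index_binary}"
             f" -F {fmt} -"
             f" -i {name}"
             f" -s {name}.settings.json")
print(index_cmd)
# zcat *.nt.gz | IndexBuilderMain -F nt - -i dnb -s dnb.settings.json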
qlever/commands/ui.py
CHANGED
@@ -5,6 +5,7 @@ import subprocess
 from qlever.command import QleverCommand
 from qlever.containerize import Containerize
 from qlever.log import log
+from qlever.util import is_port_used
 
 
 class UiCommand(QleverCommand):
@@ -53,6 +54,10 @@ class UiCommand(QleverCommand):
         Containerize.stop_and_remove_container(
             container_system, args.ui_container)
 
+        # Check if the UI port is already being used.
+        if is_port_used(args.ui_port):
+            log.warning(f"It looks like the specified port for the UI ({args.ui_port}) is already in use. You can set another port in the Qleverfile in the [ui] section with the UI_PORT variable.")
+
         # Try to start the QLever UI.
         try:
             subprocess.run(pull_cmd, shell=True, stdout=subprocess.DEVNULL)
@@ -65,5 +70,5 @@ class UiCommand(QleverCommand):
         # Success.
         log.info(f"The QLever UI should now be up at {ui_url} ..."
                  f"You can log in as QLever UI admin with username and "
-                 f"
+                 f"password \"demo\"")
         return True
qlever/qlever_old.py
CHANGED
@@ -985,7 +985,7 @@ class Actions:
         log.info(f"The QLever UI should now be up at "
                  f"http://{host_name}:{self.config['ui']['port']}")
         log.info("You can log in as QLever UI admin with username and "
-                 "
+                 "password \"demo\"")
 
     @track_action_rank
     def action_cache_stats_and_settings(self, only_show=False):
qlever/qleverfile.py
CHANGED
@@ -51,8 +51,12 @@ class Qleverfile:
             help="A concise description of the dataset")
         data_args["text_description"] = arg(
             "--text-description", type=str, default=None,
-            help="A
+            help="A concise description of the additional text data"
                  " if any")
+        data_args["format"] = arg(
+            "--format", type=str, default="ttl",
+            choices=["ttl", "nt", "nq"],
+            help="The format of the data")
 
         index_args["input_files"] = arg(
             "--input-files", type=str, required=True,
@@ -173,7 +177,7 @@ class Qleverfile:
             help="The name of the container used by `qlever start`")
 
         ui_args["ui_port"] = arg(
-            "--
+            "--ui-port", type=int, default=8176,
             help="The port of the Qlever UI when running `qlever ui`")
         ui_args["ui_config"] = arg(
             "--ui-config", type=str, default="default",
qlever/util.py
CHANGED
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
+import errno
 import re
 import secrets
+import socket
 import shlex
 import shutil
 import string
@@ -180,3 +182,21 @@ def get_random_string(length: int) -> str:
     """
     characters = string.ascii_letters + string.digits
     return "".join(secrets.choice(characters) for _ in range(length))
+
+
+def is_port_used(port: int) -> bool:
+    """
+    Try to bind to the port on all interfaces to check if the port is already in use.
+    If the port is already in use, `socket.bind` will raise an `OSError` with errno EADDRINUSE.
+    """
+    try:
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        # Ensure that the port is not blocked after the check.
+        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        sock.bind(('', port))
+        sock.close()
+        return False
+    except OSError as err:
+        if err.errno != errno.EADDRINUSE:
+            log.warning(f"Failed to determine if port is used: {err}")
+        return True
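A quick way to exercise the new helper (a usage sketch, assuming the qlever package is installed): occupy a free port with a listening socket, then check it.

import socket

from qlever.util import is_port_used

# Occupy an arbitrary free port, then confirm the helper reports it as used.
blocker = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
blocker.bind(("", 0))         # port 0 = let the OS pick a free port
blocker.listen(1)
port = blocker.getsockname()[1]

print(is_port_used(port))     # True: the blocker holds the port
blocker.close()
print(is_port_used(port))     # False: the port is free again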
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: qlever
|
|
3
|
-
Version: 0.5.
|
|
3
|
+
Version: 0.5.5
|
|
4
4
|
Summary: Script for using the QLever SPARQL engine.
|
|
5
5
|
Author-email: Hannah Bast <bast@cs.uni-freiburg.de>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -77,6 +77,14 @@ There are many more commands and options, see `qlever --help` for general help,
|
|
|
77
77
|
`qlever <command> --help` for help on a specific command, or just the
|
|
78
78
|
autocompletion.
|
|
79
79
|
|
|
80
|
+
# Use with your own dataset
|
|
81
|
+
|
|
82
|
+
To use QLever with your own dataset, you should also write a `Qleverfile`, like
|
|
83
|
+
in the example above. The easiest way to write a `Qleverfile` is to get one of
|
|
84
|
+
the existing ones (using `qlever setup-config ...` as explained above) and then
|
|
85
|
+
change it according to your needs (the variable names should be self-explanatory).
|
|
86
|
+
Pick one for a dataset that is similar to yours and when in doubt, pick `olympics`.
|
|
87
|
+
|
|
80
88
|
# For developers
|
|
81
89
|
|
|
82
90
|
The (Python) code for the script is in the `*.py` files in `src/qlever`. The
|