qlever 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of qlever might be problematic.

@@ -1,7 +1,7 @@
  # Qleverfile for DBLP, use with https://github.com/ad-freiburg/qlever-control
  #
  # qlever get-data # takes ~3 mins (downloads .ttl.gz file of size ~3 GB)
- # qlever index # takes ~3 mins (on an AMD Ryzen 9 5900X)
+ # qlever index # takes ~4 mins (on an AMD Ryzen 9 5900X)
  # qlever start # takes a few seconds

  [data]
@@ -0,0 +1,30 @@
+ # Qleverfile for DBpedia, use with https://github.com/ad-freiburg/qlever-control
+ #
+ # qlever get-data # ~14 GB, ~850 M triples (as of 30.07.2024)
+ # qlever index # ~20 min (on an AMD Ryzen 9 5900X)
+ # qlever start # ~3 sec
+
+ [data]
+ NAME = dbpedia
+ DATABUS_URL = https://databus.dbpedia.org/dbpedia/collections/latest-core
+ GET_DATA_CMD = curl -X POST -H "Accept: text/csv" --data-urlencode "query=$$(curl -s -H "Accept:text/sparql" https://databus.dbpedia.org/dbpedia/collections/latest-core)" https://databus.dbpedia.org/sparql | tail -n+2 | sed 's/\r$$//' | sed 's/"//g' | while read -r file; do wget -P rdf-input $$file; done
+ DESCRIPTION = RDF data from ${DATABUS_URL}
+
+ [index]
+ INPUT_FILES = rdf-input/*
+ CAT_INPUT_FILES = (cat rdf-input/*.nt; lbzcat -n2 rdf-input/*.bzip2 rdf-input/*.bz2)
+ SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 1000000, "prefixes-external": [""] }
+ WITH_TEXT_INDEX = false
+
+ [server]
+ PORT = 7012
+ ACCESS_TOKEN = ${data:NAME}
+ MEMORY_FOR_QUERIES = 10G
+ CACHE_MAX_SIZE = 5G
+
+ [runtime]
+ SYSTEM = docker
+ IMAGE = docker.io/adfreiburg/qlever:latest
+
+ [ui]
+ UI_CONFIG = dbpedia
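
The GET_DATA_CMD above is a single shell pipeline: it fetches the SPARQL query behind the Databus collection, posts it to the Databus SPARQL endpoint, and downloads every file URL from the returned CSV into rdf-input/. For illustration, a rough Python equivalent of that pipeline (a sketch only; it assumes the third-party `requests` library, which the package itself does not use):

    # Sketch of the DBpedia GET_DATA_CMD pipeline in Python (assumes `requests`).
    import os
    import requests

    COLLECTION_URL = "https://databus.dbpedia.org/dbpedia/collections/latest-core"

    # Requesting the collection with "Accept: text/sparql" returns the SPARQL query
    # that lists the download URLs of all files in the collection.
    query = requests.get(COLLECTION_URL, headers={"Accept": "text/sparql"}).text

    # POST that query to the Databus SPARQL endpoint and read the URLs from the CSV
    # (the shell version does the same with curl, tail and sed).
    csv_text = requests.post(
        "https://databus.dbpedia.org/sparql",
        data={"query": query},
        headers={"Accept": "text/csv"},
    ).text
    urls = [line.strip().strip('"') for line in csv_text.splitlines()[1:] if line.strip()]

    # Download each file into rdf-input/, mirroring `wget -P rdf-input`.
    os.makedirs("rdf-input", exist_ok=True)
    for url in urls:
        target = os.path.join("rdf-input", url.rsplit("/", 1)[-1])
        with requests.get(url, stream=True) as response, open(target, "wb") as out:
            for chunk in response.iter_content(chunk_size=1 << 20):
                out.write(chunk)
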
@@ -1,47 +1,51 @@
- # Automatically created by the "qlever" script
+ # Default Qleverfile, use with https://github.com/ad-freiburg/qlever-control
  #
- # Modify as you see fit. Beware that some of the values below are executed as
- # commands by the script.
- #
- # If you have never seen a Qleverfile before, we recommend that you look at the
- # pre-filled Qleverfiles on http://qlever.cs.uni-freiburg.de/qlever-control/
- # Qleverfiles first to get some inspiration. Or execute `qlever setup-config
- # <config name>` with a config name of your choice.
+ # If you have never seen a Qleverfile before, we recommend that you first look
+ # at the example Qleverfiles on http://qlever.cs.uni-freiburg.de/qlever-control/
+ # src/qlever/Qleverfiles . Or execute `qlever setup-config <dataset>` on the
+ # command line to obtain the example Qleverfiles for <dataset>.

  # As a minimum, each dataset needs a name. If you want `qlever get-data` to do
- # something meaningful, you need to define GET_DATA_CMD. If you want to use the
- # QLever UI, you should define DESCRIPTION (and if you have a text index,
- # also TEXT_DESCRIPTION).
+ # something meaningful, you need to define GET_DATA_CMD. Otherwise, you need to
+ # generate (or download or copy from somewhere) the input files yourself. Each
+ # dataset should have a short DESCRIPTION, ideally with a date.
  [data]
- NAME =
- # GET_DATA_CMD =
- # DESCRIPTION =
- # TEXT_DESCRIPTION =
+ NAME =
+ GET_DATA_CMD =
+ DESCRIPTION =

- # CAT_INPUT_FILES produces the data that is piped into QLever's index builder.
- # Use SETTINGS_JSON for more advanced configuration settings (see the other
- # Qleverfiles for examples).
+ # The format for INPUT_FILES should be such that `ls ${INPUT_FILES}` lists all
+ # input files. CAT_INPUT_FILES should write a concatenation of all input files
+ # to stdout. For example, if your input files are gzipped, you can write `zcat
+ # ${INPUT_FILES}`. Regarding SETTINGS_JSON, look at the other Qleverfiles for
+ # examples. Several batches of size `num-triples-per-batch` are kept in RAM at
+ # the same time; increasing this, increases the memory usage but speeds up the
+ # loading process.
  [index]
- # INPUT_FILES =
- # CAT_INPUT_FILES = cat ${INPUT_FILES}
- # SETTINGS_JSON = {}
+ INPUT_FILES = *.ttl
+ CAT_INPUT_FILES = cat ${INPUT_FILES}
+ SETTINGS_JSON = { "num-triples-per-batch": 1000000 }

- # As a minimum, you need to specify the PORT, where QLever will listen for
- # SPARQL queries. If you want to send priviledged commands to the server, you
- # need to specify an ACCESS_TOKEN (modify the random number below).
+ # The server listens on PORT. If you want to send privileged commands to the
+ # server, you need to specify an ACCESS_TOKEN, which you then have to set via a
+ # URL parameter `access_token`. It should not be easily guessable, unless you
+ # don't mind others to get privileged access to your server.
  [server]
- PORT = 7001
- # ACCESS_TOKEN = ${data:NAME}_1234567890
+ PORT =
+ ACCESS_TOKEN =

- # With USE_DOCKER = true, the qlever script will download the docker image for
- # you and run QLever inside docker containers. With USE_DOCKER = false, you need
- # the QLever binaries in the PATH of your sheel.
+ # Use SYSTEM = docker to run QLever inside a docker container; the Docker image
+ # will be downloaded automatically. Use SYSTEM = native to use self-compiled
+ # binaries `IndexBuilderMain` and `ServerMain` (which should be in you PATH).
  [runtime]
- SYSTEM = true
+ SYSTEM = docker
  IMAGE = docker.io/adfreiburg/qlever:latest

+ # UI_PORT specifies the port of the QLever UI web app, when you run `qlever ui`.
  # The UI_CONFIG must be one of the slugs from http://qlever.cs.uni-freiburg.de
  # (see the dropdown menu on the top right, the slug is the last part of the URL).
- # In partiular, this determines the example queries.
+ # It determines the example queries and which SPARQL queries are launched to
+ # obtain suggestions as you type a query.
  [ui]
+ UI_PORT = 8176
  UI_CONFIG = default
@@ -17,14 +17,14 @@
  [data]
  NAME = dnb
  BASE_URL = https://data.dnb.de/opendata
- GET_DATA_CMD = curl -L -C - --remote-name-all ${BASE_URL}/authorities-gnd_lds.nt.gz ${BASE_URL}/dnb-all_lds.nt.gz ${BASE_URL}/dnb-all_ldsprov.nt.gz ${BASE_URL}/zdb_lds.nt.gz 2>&1 | tee ${data:NAME}.getdata-log.txt
+ GET_DATA_CMD = curl -L -C - --remote-name-all --remote-time ${BASE_URL}/authorities-gnd_lds.nt.gz ${BASE_URL}/dnb-all_lds.nt.gz ${BASE_URL}/dnb-all_ldsprov.nt.gz ${BASE_URL}/zdb_lds.nt.gz 2>&1 | tee ${data:NAME}.getdata-log.txt
  VERSION = $$(date -r dnb-all_lds.nt.gz +%d.%m.%Y || echo "NO_DATE")
  DESCRIPTION = DNB data from ${BASE_URL} (authoritities-gnd_lds, dnb_all_lds, dnb-all_ldsprov, zdb_lds), version ${VERSION}

  [index]
  INPUT_FILES = *.nt.gz
- CAT_INPUT_FILES = zcat ${INPUT_FILES}
- SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 1000000 }
+ CAT_INPUT_FILES = zcat ${INPUT_FILES} | sed '/"\$$R0"/d;/"0\.03013\$$D"/d'
+ SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }

  [server]
  PORT = 7035
@@ -9,8 +9,8 @@
  [data]
  NAME = imdb
  IMDB_DATA_URL = https://datasets.imdbws.com
- GET_PREFIXES = echo "@prefix imdb: <https://www.imdb.com/> .\n"
- GET_IMDB_BASICS = FILE=title.basics.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ gsub("\"", "\\\"", $$3); printf "imdb:%s imdb:id \"%s\" ; imdb:type \"%s\" ; imdb:title \"%s\" .\n", $$1, $$1, $$2, $$3 }'; rm -f $${FILE}
+ GET_PREFIXES = echo "@prefix imdb: <https://www.imdb.com/> ."
+ GET_IMDB_BASICS = FILE=title.basics.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ gsub("\\\\", "\\\\", $$3); gsub("\"", "\\\"", $$3); printf "imdb:%s imdb:id \"%s\" ; imdb:type \"%s\" ; imdb:title \"%s\" .\n", $$1, $$1, $$2, $$3 }'; rm -f $${FILE}
  GET_IMDB_RATINGS = FILE=title.ratings.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ printf "imdb:%s imdb:averageRating %s ; imdb:numVotes %s .\n", $$1, $$2, $$3 }'; rm -f $${FILE}
  GET_DATA_CMD = (${GET_PREFIXES}; ${GET_IMDB_BASICS}; ${GET_IMDB_RATINGS}) > ${NAME}.ttl
  DESCRIPTION = RDF data derived from ${IMDB_DATA_URL}
@@ -18,17 +18,17 @@ TEXT_DESCRIPTION = All literals, search with FILTER CONTAINS(?var, "...")

  [index]
  INPUT_FILES = ${data:NAME}.ttl
- CAT_INPUT_FILES = cat ${FILE_NAMES}
+ CAT_INPUT_FILES = cat ${INPUT_FILES}
  SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }
  TEXT_INDEX = from_literals

  [server]
  PORT = 7029
- ACCESS_TOKEN = ${data:NAME}_1234567890
+ ACCESS_TOKEN = ${data:NAME}
  MEMORY_FOR_QUERIES = 5G

  [runtime]
- SYSTEM = docker
+ SYSTEM = native
  IMAGE = docker.io/adfreiburg/qlever:latest

  [ui]
@@ -1,60 +1,49 @@
  # Qleverfile for PubChem, use with https://github.com/ad-freiburg/qlever-control
  #
- # qlever get-data # downloads .gz files of total size 114 GB; see NOTES 2, 3, 4
- # qlever index # takes ~5 hours and ~20 GB RAM on an AMD Ryzen 9 5900X
- # qlever start # starts the server (a few seconds)
+ # Resource requirements (as of 18.08.2024, on an AMD Ryzen 9 5900X):
  #
- # IMPORTANT NOTES:
+ # qlever get-data # ~2 hours, ~150 GB, ~19 billion triples
+ # qlever index # ~7 hours, ~20 GB RAM, ~400 GB disk space
+ # qlever start # a few seconds
  #
- # NOTE 1: The SPARQL endpoint at https://qlever.cs.uni-freiburg.de/pubchem also
- # contains data from the following ontologies, which are very useful for
- # resolving names of IRIs like `sio:SIO_000008` or `obo:IAO_0000412`, but which
- # are not part of the PubChem RDF data. For the corresponding URLs, see
- # https://github.com/ad-freiburg/qlever/issues/711#issuecomment-1200479401 .
+ # NOTE 1: `get-data` does not only download the PubChem RDF data, but also
+ # a number of ontologies. These are very useful to obtain names for IRIs like
+ # `sio:SIO_000008` or `obo:IAO_0000412` (otherwise very hard to understand).
+ # The ontologies BAO and NDF-RT are infrequently updated, for latest versions,
+ # see the download links at https://bioportal.bioontology.org/ontologies/BAO
+ # and https://bioportal.bioontology.org/ontologies/NDF-RT .
  #
- # bao bfo biopax-level3 chebi cheminf cito dublin_core_terms fabio go iao ncit
- # obi pr ro sio skos so uo
- #
- # NOTE 2: The robots.txt file from https://ftp.ncbi.nlm.nih.gov currently
- # disallows downloading the PubChem RDF data using `wget --recursive` as in the
- # GET_DATA_CMD below. As a workaround, you can write a simple Python script
- # (using `BeautifulSoup` and `urllib.parse`) to scrape the URLs from the HTML
- # pages and download the files individually. This was done for the latest
- # version of https://qlever.cs.uni-freiburg.de/pubchem .
- #
- # NOTE 3: Many of the TTL files have generic prefix definitions in the middle
+ # NOTE 2: Many of the TTL files have generic prefix definitions in the middle
  # of the file, like @prefix ns23: <http://identifiers.org/biocyc/ARACYC:> .
  # See https://github.com/ad-freiburg/qlever/issues/711#issuecomment-1197113953
- # This is allowed by the standard, but VERY unusual. For use with QLever,
- # convert the TTL files to NT before indexing, see GET_DATA_CMD below.
- #
- # NOTE 4: Many of the files (TTL as well as NT) contain invalid IRIs because
- # spaces and braces are not properly escaped. Here is a simple awk-based script
- # to percent-encode spaces and braces in all IRIs in the NT files:
+ # This is allowed by the standard, but unusual. For use with QLever, we
+ # therefore convert the TTL files to NT when downloading them.
  #
- # for NTGZ in nt.${DATE}/*.nt.gz; do echo "zcat $NTGZ | sed 's/> />\t/1; s/> />\t/1; s/ \.\$/\t./' | awk 'BEGIN{FS=OFS=\"\t\"} {for (i = 1; i <= 3; i++) if (\$i ~ /^<.*>\$/) { gsub(/ /, \"%20\", \$i); gsub(/\[/, \"%5B\", \$i); gsub(/\]/, \"%5D\", \$i); gsub(/{/, \"%7B\", \$i); gsub(/}/, \"%7D\", \$i); } print }' | sed 's/\t/ /g' | gzip -c > nt.${DATE}.FIXED/$(basename $NTGZ)"; done > fix-nt.commands.txt
- # cat fix-nt.commands.txt | parallel
-
-
- [DEFAULT]
- NAME = pubchem
- DATE = 2024-02-03
+ # NOTE 3: The PubChem data contains several invalid IRIs, in particular,
+ # containing spaces. The previous version of this Qleverfile used a combination
+ # of `sed` and `awk` to fix this. In the meantime, QLever's default is to warn
+ # about such IRIs while indexing, but accept them anyway.

  [data]
- GET_DATA_URL = ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF
- MAKE_GET_DATA_CMD = curl -s ${GET_DATA_URL}/void.ttl | grep -oP '${GET_DATA_URL}/.*?\.ttl\.gz' | grep -v "nbr[23]d" | while read URL; do echo "echo \"Processing $$URL ...\"; curl --silent --remote-time --output ttl.${DATE}/$$(basename $$URL) $$URL && docker run --rm -v $$(pwd)/ttl.${DATE}:/data stain/jena turtle --output=NT /data/$$(basename $$URL) | sed 's/> />\t/1; s/> />\t/1; s/ \.\$$/\t./' | awk 'BEGIN{FS=OFS=\"\t\"} {for (i = 1; i <= 3; i++) if (\$$i ~ /^<.*>\$$/) { gsub(/ /, \"%20\", \$$i); gsub(/\[/, \"%5B\", \$$i); gsub(/\]/, \"%5D\", \$$i); gsub(/{/, \"%7B\", \$$i); gsub(/}/, \"%7D\", \$$i); } print }' | sed 's/\t/ /g' | gzip -c > nt.${DATE}/$$(basename -s .ttl.gz $$URL).nt.gz"; done > pubchem.get-data-cmds.txt
- GET_DATA_CMD = mkdir -p ttl.${DATE} && mkdir -p nt.${DATE} && ${MAKE_GET_DATA_CMD} && cat pubchem.get-data-cmds.txt | parallel --line-buffer
- DESCRIPTION = PubChem RDF from ${GET_DATA_URL}, version ${DATE} (all folders except nbr2d and nbr3d)
+ NAME = pubchem
+ GET_DATA_URL = ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF
+ CHECK_REQUIREMENTS = for CMD in docker parallel; do $$CMD --version >/dev/null 2>&1 || (echo "Requires \"$$CMD\", please install it"; false); done
+ MAKE_GET_DATA_CMD_1 = DIR=DATA.ontologies && mkdir -p $$DIR && cat $$DIR/ontologies.csv | while IFS=',' read -r DESC FILE URL; do ERRFILE=$${FILE%.*}.jena-stderr; echo "echo \"Processing $$URL ($$FILE) ...\" && curl -sLRo $$DIR/$$FILE \"$$URL\" && docker run --rm -v $$(pwd):/data stain/jena riot --output=NT /data/$$DIR/$$FILE 2> $$DIR/$$ERRFILE | gzip -c > $$DIR/$${FILE%.*}.nt.gz && rm -f $$DIR/$$FILE && if [ ! -s $$DIR/$$ERRFILE ]; then rm -f $$DIR/$$ERRFILE; fi || echo \"ERROR processing $$URL ($$FILE)\""; done > pubchem.get-data-cmds.txt
+ MAKE_GET_DATA_CMD_2 = DIR=DATA.pubchem && mkdir -p $$DIR && curl -LRO ${GET_DATA_URL}/void.ttl && grep -oP '${GET_DATA_URL}/.*?\.ttl\.gz' void.ttl | while read URL; do FILE=$$(basename $$URL); echo "echo \"Processing $$URL ...\" && curl -sLRo $$DIR/$$FILE \"$$URL\" && docker run -i --rm -v $$(pwd):/data stain/jena turtle --output=NT /data/$$DIR/$$FILE | gzip -c > $$DIR/$${FILE%%.*}.nt.gz && rm -f $$DIR/$$FILE || echo \"ERROR processing $$URL\""; done >> pubchem.get-data-cmds.txt
+ GET_DATA_CMD = ${CHECK_REQUIREMENTS} && ${MAKE_GET_DATA_CMD_1} && ${MAKE_GET_DATA_CMD_2} && cat pubchem.get-data-cmds.txt | parallel --line-buffer 2>&1 | tee pubchem.get-data-log.txt
+ VERSION = $$(date -r void.ttl +%d.%m.%Y || echo "NO_DATE")
+ DESCRIPTION = PubChem RDF from ${GET_DATA_URL} (version ${VERSION}) + associated ontologies (bao, bfo, biopax-level3, chebi, cheminf, cito, dublin_core_terms, fabio, go, iao, ncit, obi, pr, ro, sio, skos, so, uo)
+ MAKE_ONTOLOGIES_CSV = $$(mkdir -p DATA.ontologies && echo "BAO - BioAssay Ontology,bao.owl,https://data.bioontology.org/ontologies/BAO/submissions/56/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nBFO - Basic Formal Ontology,bfo.owl,http://purl.obolibrary.org/obo/bfo.owl\n BioPAX - biological pathway data,bp.owl,http://www.biopax.org/release/biopax-level3.owl\n CHEMINF - Chemical Information Ontology,cheminf.owl,http://purl.obolibrary.org/obo/cheminf.owl\n ChEBI - Chemical Entities of Biological Interest,chebi.owl,http://purl.obolibrary.org/obo/chebi.owl\n CiTO,cito.nt,http://purl.org/spar/cito.nt\n DCMI Terms,dcterms.nt,https://www.dublincore.org/specifications/dublin-core/dcmi-terms/dublin_core_terms.nt\n FaBiO,fabio.nt,http://purl.org/spar/fabio.nt\n GO - Gene Ontology,go.owl,http://purl.obolibrary.org/obo/go.owl\n IAO - Information Artifact Ontology,iao.owl,http://purl.obolibrary.org/obo/iao.owl\n NCIt,ncit.owl,http://purl.obolibrary.org/obo/ncit.owl\n NDF-RT,ndfrt.owl,https://data.bioontology.org/ontologies/NDF-RT/submissions/1/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\n OBI - Ontology for Biomedical Investigations,obi.owl,http://purl.obolibrary.org/obo/obi.owl\n OWL,owl.ttl,http://www.w3.org/2002/07/owl.ttl\n PDBo,pdbo.owl,http://rdf.wwpdb.org/schema/pdbx-v40.owl\n PR - PRotein Ontology (PRO),pr.owl,http://purl.obolibrary.org/obo/pr.owl\n RDF Schema,rdfs.ttl,https://www.w3.org/2000/01/rdf-schema.ttl\n RDF,rdf.ttl,http://www.w3.org/1999/02/22-rdf-syntax-ns.ttl\n RO - Relation Ontology,ro.owl,http://purl.obolibrary.org/obo/ro.owl\n SIO - Semanticscience Integrated Ontology,sio.owl,http://semanticscience.org/ontology/sio.owl\n SKOS,skos.rdf,http://www.w3.org/TR/skos-reference/skos.rdf\n SO - Sequence types and features ontology,so.owl,http://purl.obolibrary.org/obo/so.owl\n UO - Units of measurement ontology,uo.owl,http://purl.obolibrary.org/obo/uo.owl" > DATA.ontologies/ontologies.csv)

  [index]
- INPUT_FILES = pubchem.additional-ontologies.nt.gz nt.${DATE}/*.nt.gz
+ INPUT_FILES = DATA.ontologies/*.nt.gz DATA.pubchem/*.nt.gz
  CAT_INPUT_FILES = zcat ${INPUT_FILES}
- SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }
+ SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
  STXXL_MEMORY = 10G

  [server]
  PORT = 7023
- ACCESS_TOKEN = ${NAME}_310129823
+ ACCESS_TOKEN = ${data:NAME}
  MEMORY_FOR_QUERIES = 20G
  TIMEOUT = 120s

@@ -1,16 +1,16 @@
  # Qleverfile for WikiPathways, use with https://github.com/ad-freiburg/qlever-control
  #
- # qlever get-data # downloads .gz file of size ~100 MB (as of 24.02.2024)
+ # qlever get-data # takes ~3 seconds, generates TTL of size ~600 MB
  # qlever index # takes ~20 seconds and little RAM (on an AMD Ryzen 9 5900X)
- # qlever start # starts the server (takes around 2 minutes)
+ # qlever start # instant
  #
  # Limitations: does not include the ontologies (WP, GPML, ChEBI, PW, CLO, ...) yet

  [data]
  NAME = wikipathways
- RELEASE = 20231210
+ RELEASE = 20240810
  GET_DATA_URL = https://data.wikipathways.org/${RELEASE}/rdf
- GET_DATA_CMD = wget -O wikipathways-rdf-void.ttl ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-void.ttl && \
+ GET_DATA_CMD = wget -O wikipathways-rdf-void.ttl ${GET_DATA_URL}/wikipathways-rdf-void.ttl && \
  wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-wp.zip && \
  unzip -qq -c wikipathways-${RELEASE}-rdf-wp.zip -x wp/wpOntology.ttl > wikipathways-rdf-wp.ttl && \
  wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-gpml.zip &&
@@ -23,13 +23,13 @@ TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")

  [index]
  INPUT_FILES = ${data:NAME}.prefix-definitions wikipathways-rdf-wp.ttl wikipathways-rdf-gpml.ttl wikipathways-rdf-void.ttl wikipathways-rdf-authors.ttl
- CAT_INPUT_FILES = cat ${FILE_NAMES}
+ CAT_INPUT_FILES = cat ${INPUT_FILES}
  SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [""] }
  TEXT_INDEX = from_literals

  [server]
  PORT = 7040
- ACCESS_TOKEN = ${data:NAME}_7643543846
+ ACCESS_TOKEN = ${data:NAME}
  MEMORY_FOR_QUERIES = 5G

  [runtime]
@@ -59,17 +59,37 @@ class ExampleQueriesCommand(QleverCommand):
  "or just compute the size of the result")
  subparser.add_argument("--limit", type=int,
  help="Limit on the number of results")
+ subparser.add_argument("--remove-offset-and-limit",
+ action="store_true", default=False,
+ help="Remove OFFSET and LIMIT from the query")
  subparser.add_argument("--accept", type=str,
  choices=["text/tab-separated-values",
- "application/sparql-results+json"],
+ "text/csv",
+ "application/sparql-results+json",
+ "text/turtle"],
  default="text/tab-separated-values",
  help="Accept header for the SPARQL query")
  subparser.add_argument("--clear-cache",
  choices=["yes", "no"],
  default="yes",
  help="Clear the cache before each query")
+ subparser.add_argument("--width-query-description", type=int,
+ default=40,
+ help="Width for printing the query description")
+ subparser.add_argument("--width-error-message", type=int,
+ default=80,
+ help="Width for printing the error message "
+ "(0 = no limit)")
+ subparser.add_argument("--width-result-size", type=int,
+ default=14,
+ help="Width for printing the result size")

  def execute(self, args) -> bool:
+ # We can't have both `--remove-offset-and-limit` and `--limit`.
+ if args.remove_offset_and_limit and args.limit:
+ log.error("Cannot have both --remove-offset-and-limit and --limit")
+ return False
+
  # If `args.accept` is `application/sparql-results+json`, we need `jq`.
  if args.accept == "application/sparql-results+json":
  try:
@@ -153,26 +173,41 @@ class ExampleQueriesCommand(QleverCommand):
  with mute_log():
  ClearCacheCommand().execute(args)

- # Count query.
- if args.download_or_count == "count":
- # Find first string matching ?[a-zA-Z0-9_]+ in query.
- match = re.search(r"\?[a-zA-Z0-9_]+", query)
- if not match:
- log.error("Could not find a variable in this query:")
- log.info("")
- log.info(query)
- return False
- first_var = match.group(0)
- query = query.replace(
- "SELECT ",
- f"SELECT (COUNT({first_var}) AS {first_var}_count_) "
- f"WHERE {{ SELECT ", 1) + " }"
+ # Remove OFFSET and LIMIT (after the last closing bracket).
+ if args.remove_offset_and_limit or args.limit:
+ closing_bracket_idx = query.rfind("}")
+ regexes = [re.compile(r"OFFSET\s+\d+\s*", re.IGNORECASE),
+ re.compile(r"LIMIT\s+\d+\s*", re.IGNORECASE)]
+ for regex in regexes:
+ match = re.search(regex, query[closing_bracket_idx:])
+ if match:
+ query = query[:closing_bracket_idx + match.start()] + \
+ query[closing_bracket_idx + match.end():]

  # Limit query.
  if args.limit:
- query = query.replace(
- "SELECT ", "SELECT * WHERE { SELECT ", 1) \
- + f" }} LIMIT {args.limit}"
+ query += f" LIMIT {args.limit}"
+
+ # Count query.
+ if args.download_or_count == "count":
+ # First find out if there is a FROM clause.
+ regex_from_clause = re.compile(r"\s*FROM\s+<[^>]+>\s*",
+ re.IGNORECASE)
+ match_from_clause = re.search(regex_from_clause, query)
+ from_clause = " "
+ if match_from_clause:
+ from_clause = match_from_clause.group(0)
+ query = (query[:match_from_clause.start()] + " " +
+ query[match_from_clause.end():])
+ # Now we can add the outer SELECT COUNT(*).
+ query = re.sub(r"SELECT ",
+ "SELECT (COUNT(*) AS ?qlever_count_)"
+ + from_clause + "WHERE { SELECT ",
+ query, count=1, flags=re.IGNORECASE) + " }"
+
+ # A bit of pretty-printing.
+ query = re.sub(r"\s+", " ", query)
+ query = re.sub(r"\s*\.\s*\}", " }", query)

  # Launch query.
  try:
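
In words, the rewritten logic above first strips any OFFSET/LIMIT that follows the last closing brace, optionally appends a fresh LIMIT, and for count mode hoists a FROM clause (if present) and wraps the query in an outer SELECT (COUNT(*) ...). A self-contained sketch of that transformation (the helper name and the simplified pretty-printing are illustrative, not part of the package):

    # Standalone sketch of the query rewriting added in the hunk above
    # (hypothetical helper, not part of the package).
    import re

    def rewrite_query(query, limit=None, count_only=False, strip_offset_and_limit=False):
        # Strip OFFSET/LIMIT occurring after the last closing brace.
        if strip_offset_and_limit or limit:
            tail = query.rfind("}")
            for pattern in (r"OFFSET\s+\d+\s*", r"LIMIT\s+\d+\s*"):
                match = re.search(pattern, query[tail:], re.IGNORECASE)
                if match:
                    query = query[:tail + match.start()] + query[tail + match.end():]
        # Optionally append a fresh LIMIT.
        if limit:
            query += f" LIMIT {limit}"
        # For counting, hoist a FROM clause (if any) and wrap in an outer SELECT (COUNT(*) ...).
        if count_only:
            from_clause = " "
            from_match = re.search(r"\s*FROM\s+<[^>]+>\s*", query, re.IGNORECASE)
            if from_match:
                from_clause = from_match.group(0)
                query = query[:from_match.start()] + " " + query[from_match.end():]
            query = re.sub(r"SELECT ",
                           "SELECT (COUNT(*) AS ?qlever_count_)" + from_clause + "WHERE { SELECT ",
                           query, count=1, flags=re.IGNORECASE) + " }"
        # Simplified pretty-printing: collapse whitespace.
        return re.sub(r"\s+", " ", query).strip()

    # Example:
    # rewrite_query("SELECT ?s WHERE { ?s ?p ?o } LIMIT 10",
    #               count_only=True, strip_offset_and_limit=True)
    # -> "SELECT (COUNT(*) AS ?qlever_count_) WHERE { SELECT ?s WHERE { ?s ?p ?o } }"
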
@@ -214,10 +249,16 @@ class ExampleQueriesCommand(QleverCommand):
  f" | tonumber\" {result_file}",
  return_output=True)
  else:
- if args.accept == "text/tab-separated-values":
+ if (args.accept == "text/tab-separated-values"
+ or args.accept == "text/csv"):
  result_size = run_command(
  f"sed 1d {result_file} | wc -l",
  return_output=True)
+ elif args.accept == "text/turtle":
+ result_size = run_command(
+ f"sed '1d;/^@prefix/d;/^\\s*$/d' "
+ f"{result_file} | wc -l",
+ return_output=True)
  else:
  result_size = run_command(
  f"jq -r \".results.bindings | length\""
@@ -227,20 +268,30 @@ class ExampleQueriesCommand(QleverCommand):
  except Exception as e:
  error_msg = str(e)

+ # Remove the result file (unless in debug mode).
+ if args.log_level != "DEBUG":
+ Path(result_file).unlink(missing_ok=True)
+
  # Print description, time, result in tabular form.
- if (len(description) > 60):
- description = description[:57] + "..."
+ if len(description) > args.width_query_description:
+ description = description[:args.width_query_description - 3]
+ description += "..."
  if error_msg is None:
- log.info(f"{description:<60} {time_seconds:6.2f} s "
- f"{result_size:14,}")
+ log.info(f"{description:<{args.width_query_description}} "
+ f"{time_seconds:6.2f} s "
+ f"{result_size:>{args.width_result_size},}")
  count_succeeded += 1
  total_time_seconds += time_seconds
  total_result_size += result_size
  else:
  count_failed += 1
- if (len(error_msg) > 60) and args.log_level != "DEBUG":
- error_msg = error_msg[:57] + "..."
- log.error(f"{description:<60} failed "
+ if (args.width_error_message > 0
+ and len(error_msg) > args.width_error_message
+ and args.log_level != "DEBUG"):
+ error_msg = error_msg[:args.width_error_message - 3]
+ error_msg += "..."
+ log.error(f"{description:<{args.width_query_description}} "
+ f"failed "
  f"{colored(error_msg, 'red')}")

  # Print total time.
@@ -248,11 +299,11 @@ class ExampleQueriesCommand(QleverCommand):
  if count_succeeded > 0:
  query_or_queries = "query" if count_succeeded == 1 else "queries"
  description = (f"TOTAL for {count_succeeded} {query_or_queries}")
- log.info(f"{description:<60} "
+ log.info(f"{description:<{args.width_query_description}} "
  f"{total_time_seconds:6.2f} s "
  f"{total_result_size:>14,}")
  description = (f"AVERAGE for {count_succeeded} {query_or_queries}")
- log.info(f"{description:<60} "
+ log.info(f"{description:<{args.width_query_description}} "
  f"{total_time_seconds / count_succeeded:6.2f} s "
  f"{round(total_result_size / count_succeeded):>14,}")
  else:
@@ -262,6 +313,4 @@ class ExampleQueriesCommand(QleverCommand):
  log.info(colored("All queries failed", "red"))

  # Return success (has nothing to do with how many queries failed).
- if args.log_level != "DEBUG":
- Path(result_file).unlink(missing_ok=True)
  return True
qlever/commands/index.py CHANGED
@@ -25,7 +25,7 @@ class IndexCommand(QleverCommand):
  return True

  def relevant_qleverfile_arguments(self) -> dict[str: list[str]]:
- return {"data": ["name"],
+ return {"data": ["name", "format"],
  "index": ["input_files", "cat_input_files", "settings_json",
  "index_binary",
  "only_pso_and_pos_permutations", "use_patterns",
@@ -41,7 +41,7 @@ class IndexCommand(QleverCommand):
  def execute(self, args) -> bool:
  # Construct the command line.
  index_cmd = (f"{args.cat_input_files} | {args.index_binary}"
- f" -F ttl -f -"
+ f" -F {args.format} -"
  f" -i {args.name}"
  f" -s {args.name}.settings.json")
  if args.only_pso_and_pos_permutations:
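
With this change, the serialization format passed to the index builder comes from the new `format` setting in the [data] section (ttl, nt, or nq) instead of the hard-coded `-F ttl -f`. A rough illustration of the assembled command line, with placeholder values (the concrete names and dataset are only examples):

    # Illustration of the command line built above, with placeholder values.
    cat_input_files = "zcat *.nt.gz"
    index_binary = "IndexBuilderMain"
    data_format = "nt"          # from the new [data] FORMAT setting (ttl, nt or nq)
    name = "dnb"

    index_cmd = (f"{cat_input_files} | {index_binary}"
                 f" -F {data_format} -"
                 f" -i {name}"
                 f" -s {name}.settings.json")
    print(index_cmd)
    # zcat *.nt.gz | IndexBuilderMain -F nt - -i dnb -s dnb.settings.json
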
qlever/commands/ui.py CHANGED
@@ -5,6 +5,7 @@ import subprocess
  from qlever.command import QleverCommand
  from qlever.containerize import Containerize
  from qlever.log import log
+ from qlever.util import is_port_used


  class UiCommand(QleverCommand):
@@ -53,6 +54,10 @@
  Containerize.stop_and_remove_container(
  container_system, args.ui_container)

+ # Check if the UI port is already being used.
+ if is_port_used(args.ui_port):
+ log.warning(f"It looks like the specified port for the UI ({args.ui_port}) is already in use. You can set another port in the Qleverfile in the [ui] section with the UI_PORT variable.")
+
  # Try to start the QLever UI.
  try:
  subprocess.run(pull_cmd, shell=True, stdout=subprocess.DEVNULL)
@@ -65,5 +70,5 @@
  # Success.
  log.info(f"The QLever UI should now be up at {ui_url} ..."
  f"You can log in as QLever UI admin with username and "
- f"passwort \"demo\"")
+ f"password \"demo\"")
  return True
qlever/qlever_old.py CHANGED
@@ -985,7 +985,7 @@ class Actions:
  log.info(f"The QLever UI should now be up at "
  f"http://{host_name}:{self.config['ui']['port']}")
  log.info("You can log in as QLever UI admin with username and "
- "passwort \"demo\"")
+ "password \"demo\"")

  @track_action_rank
  def action_cache_stats_and_settings(self, only_show=False):
qlever/qleverfile.py CHANGED
@@ -51,8 +51,12 @@ class Qleverfile:
  help="A concise description of the dataset")
  data_args["text_description"] = arg(
  "--text-description", type=str, default=None,
- help="A concice description of the addtional text data"
+ help="A concise description of the additional text data"
  " if any")
+ data_args["format"] = arg(
+ "--format", type=str, default="ttl",
+ choices=["ttl", "nt", "nq"],
+ help="The format of the data")

  index_args["input_files"] = arg(
  "--input-files", type=str, required=True,
@@ -173,7 +177,7 @@
  help="The name of the container used by `qlever start`")

  ui_args["ui_port"] = arg(
- "--ui_port", type=int, default=7000,
+ "--ui-port", type=int, default=8176,
  help="The port of the Qlever UI when running `qlever ui`")
  ui_args["ui_config"] = arg(
  "--ui-config", type=str, default="default",
qlever/util.py CHANGED
@@ -1,7 +1,9 @@
  from __future__ import annotations

+ import errno
  import re
  import secrets
+ import socket
  import shlex
  import shutil
  import string
@@ -180,3 +182,21 @@ def get_random_string(length: int) -> str:
  """
  characters = string.ascii_letters + string.digits
  return "".join(secrets.choice(characters) for _ in range(length))
+
+
+ def is_port_used(port: int) -> bool:
+ """
+ Try to bind to the port on all interfaces to check if the port is already in use.
+ If the port is already in use, `socket.bind` will raise an `OSError` with errno EADDRINUSE.
+ """
+ try:
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ # Ensure that the port is not blocked after the check.
+ sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ sock.bind(('', port))
+ sock.close()
+ return False
+ except OSError as err:
+ if err.errno != errno.EADDRINUSE:
+ log.warning(f"Failed to determine if port is used: {err}")
+ return True
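
The new helper probes the port by binding a TCP socket on all interfaces; `qlever ui` (see the ui.py hunk above) uses it to warn when the UI port is taken. A minimal usage sketch (the port value is just the new default from the Qleverfile template):

    # Minimal usage sketch for the new helper.
    from qlever.util import is_port_used

    ui_port = 8176  # default UI_PORT in the new Qleverfile template
    if is_port_used(ui_port):
        print(f"Port {ui_port} is already in use, pick another UI_PORT")
    else:
        print(f"Port {ui_port} is free")
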
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: qlever
- Version: 0.5.3
+ Version: 0.5.5
  Summary: Script for using the QLever SPARQL engine.
  Author-email: Hannah Bast <bast@cs.uni-freiburg.de>
  License: Apache-2.0
@@ -77,6 +77,14 @@ There are many more commands and options, see `qlever --help` for general help,
  `qlever <command> --help` for help on a specific command, or just the
  autocompletion.

+ # Use with your own dataset
+
+ To use QLever with your own dataset, you should also write a `Qleverfile`, like
+ in the example above. The easiest way to write a `Qleverfile` is to get one of
+ the existing ones (using `qlever setup-config ...` as explained above) and then
+ change it according to your needs (the variable names should be self-explanatory).
+ Pick one for a dataset that is similar to yours and when in doubt, pick `olympics`.
+

  # For developers
  The (Python) code for the script is in the `*.py` files in `src/qlever`. The