qlever 0.2.5__py3-none-any.whl → 0.5.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. qlever/Qleverfiles/Qleverfile.dblp +36 -0
  2. qlever/Qleverfiles/Qleverfile.dblp-plus +33 -0
  3. qlever/Qleverfiles/Qleverfile.dbpedia +30 -0
  4. qlever/Qleverfiles/Qleverfile.default +51 -0
  5. qlever/Qleverfiles/Qleverfile.dnb +40 -0
  6. qlever/Qleverfiles/Qleverfile.fbeasy +29 -0
  7. qlever/Qleverfiles/Qleverfile.freebase +28 -0
  8. qlever/Qleverfiles/Qleverfile.imdb +36 -0
  9. qlever/Qleverfiles/Qleverfile.ohm-planet +41 -0
  10. qlever/Qleverfiles/Qleverfile.olympics +31 -0
  11. qlever/Qleverfiles/Qleverfile.orkg +30 -0
  12. qlever/Qleverfiles/Qleverfile.osm-country +39 -0
  13. qlever/Qleverfiles/Qleverfile.osm-planet +39 -0
  14. qlever/Qleverfiles/Qleverfile.osm-planet-from-pbf +42 -0
  15. qlever/Qleverfiles/Qleverfile.pubchem +131 -0
  16. qlever/Qleverfiles/Qleverfile.scientists +29 -0
  17. qlever/Qleverfiles/Qleverfile.uniprot +74 -0
  18. qlever/Qleverfiles/Qleverfile.vvz +31 -0
  19. qlever/Qleverfiles/Qleverfile.wikidata +42 -0
  20. qlever/Qleverfiles/Qleverfile.wikipathways +40 -0
  21. qlever/Qleverfiles/Qleverfile.yago-4 +33 -0
  22. qlever/__init__.py +44 -1380
  23. qlever/command.py +87 -0
  24. qlever/commands/__init__.py +0 -0
  25. qlever/commands/add_text_index.py +115 -0
  26. qlever/commands/benchmark_queries.py +1019 -0
  27. qlever/commands/cache_stats.py +125 -0
  28. qlever/commands/clear_cache.py +88 -0
  29. qlever/commands/extract_queries.py +120 -0
  30. qlever/commands/get_data.py +48 -0
  31. qlever/commands/index.py +333 -0
  32. qlever/commands/index_stats.py +306 -0
  33. qlever/commands/log.py +66 -0
  34. qlever/commands/materialized_view.py +110 -0
  35. qlever/commands/query.py +142 -0
  36. qlever/commands/rebuild_index.py +176 -0
  37. qlever/commands/reset_updates.py +59 -0
  38. qlever/commands/settings.py +115 -0
  39. qlever/commands/setup_config.py +97 -0
  40. qlever/commands/start.py +336 -0
  41. qlever/commands/status.py +50 -0
  42. qlever/commands/stop.py +90 -0
  43. qlever/commands/system_info.py +130 -0
  44. qlever/commands/ui.py +271 -0
  45. qlever/commands/update.py +90 -0
  46. qlever/commands/update_wikidata.py +1204 -0
  47. qlever/commands/warmup.py +41 -0
  48. qlever/config.py +223 -0
  49. qlever/containerize.py +167 -0
  50. qlever/log.py +55 -0
  51. qlever/qlever_main.py +79 -0
  52. qlever/qleverfile.py +530 -0
  53. qlever/util.py +330 -0
  54. qlever-0.5.41.dist-info/METADATA +127 -0
  55. qlever-0.5.41.dist-info/RECORD +59 -0
  56. {qlever-0.2.5.dist-info → qlever-0.5.41.dist-info}/WHEEL +1 -1
  57. qlever-0.5.41.dist-info/entry_points.txt +2 -0
  58. qlever-0.5.41.dist-info/top_level.txt +1 -0
  59. build/lib/qlever/__init__.py +0 -1383
  60. build/lib/qlever/__main__.py +0 -4
  61. qlever/__main__.py +0 -4
  62. qlever-0.2.5.dist-info/METADATA +0 -277
  63. qlever-0.2.5.dist-info/RECORD +0 -12
  64. qlever-0.2.5.dist-info/entry_points.txt +0 -2
  65. qlever-0.2.5.dist-info/top_level.txt +0 -4
  66. src/qlever/__init__.py +0 -1383
  67. src/qlever/__main__.py +0 -4
  68. {qlever-0.2.5.dist-info → qlever-0.5.41.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,131 @@
1
+ # Qleverfile for PubChem, use with the QLever CLI (`pip install qlever`)
2
+ #
3
+ # qlever get-data # ~2 hours, ~120 GB, ~19 billion triples
4
+ # qlever index # ~6 hours, ~20 GB RAM, ~350 GB disk space (for the index)
5
+ # qlever start # a few seconds
6
+ #
7
+ # Measured on an AMD Ryzen 9 7950X with 128 GB RAM, and NVMe SSD (17.12.2024)
8
+ #
9
+ # NOTE 1: `qlever get-data` not only downloads the PubChem RDF data, but also
10
+ # a number of ontologies. These are very useful to obtain names for IRIs like
11
+ # `sio:SIO_000008` or `obo:IAO_0000412` (otherwise very hard to understand).
12
+ # The ontologies BAO and NDF-RT are occasionally updated; for latest versions,
13
+ # see the download links at https://bioportal.bioontology.org/ontologies/BAO
14
+ # and https://bioportal.bioontology.org/ontologies/NDF-RT .
15
+ #
16
+ # NOTE 2: The `MULTI_INPUT_JSON` concatenates (via `zcat`) selected files into one input
17
+ # stream because there are too many files and the command line triggered by
18
+ # `qlever index` would be too long otherwise.
19
+
20
+ [data]
21
+ NAME = pubchem
22
+ GET_DATA_URL = ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF
23
+ ONTOLOGIES_DIR = RDF.ontologies
24
+ PUBCHEM_DIR = RDF.pubchem
25
+ ONTOLOGIES_CSV = ontologies.csv
26
+ CHECK_REQUIREMENTS = for CMD in docker parallel; do $$CMD --version >/dev/null 2>&1 || (echo "Requires \"$$CMD\", please install it"; false); done
27
+ GET_DATA_CMD_1 = mkdir -p ${ONTOLOGIES_DIR} && cd ${ONTOLOGIES_DIR} && cat ${ONTOLOGIES_CSV} | parallel --colsep "," 'FILE={2} && URL={3} && ERRFILE=$${FILE%.*}.jena-stderr; echo "Processing $$URL ($$FILE) ..." && curl -sLRo $$FILE $$URL && docker run --rm -v $$(pwd):/data stain/jena riot --output=NT /data/$$FILE 2> $$ERRFILE | gzip -c > $${FILE%.*}.nt.gz && rm -f $$FILE; if [ -s $$ERRFILE ]; then grep -q "ERROR *riot" $$ERRFILE && echo "riot ERRORs in $$FILE, check $$ERRFILE"; else rm $$ERRFILE; fi'
28
+ GET_DATA_CMD_2 = mkdir -p ${PUBCHEM_DIR} && wget -r -nv -nH --cut-dirs=2 --no-parent -P ${PUBCHEM_DIR} ${GET_DATA_URL}
29
+ GET_DATA_CMD = ${CHECK_REQUIREMENTS} && ${GET_DATA_CMD_1} 2>&1 | tee pubchem.get-data-log.txt; ${GET_DATA_CMD_2} 2>&1 | tee -a pubchem.get-data-log.txt
30
+ VERSION = $$(date -r void.ttl +%d.%m.%Y || echo "NO_DATE")
31
+ DESCRIPTION = PubChem, RDF TTL from ${GET_DATA_URL} + associated ontologies (bao, bfo, biopax-level3, chebi, cheminf, cito, dublin_core_terms, fabio, go, iao, ncit, obi, pr, ro, sio, skos, so, uo), version ${data:VERSION}
32
+ MAKE_ONTOLOGIES_CSV = $$(mkdir -p ${ONTOLOGIES_DIR} && echo "BAO - BioAssay Ontology,bao.owl,https://data.bioontology.org/ontologies/BAO/submissions/56/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nBFO - Basic Formal Ontology,bfo.owl,http://purl.obolibrary.org/obo/bfo.owl\nBioPAX - biological pathway data,bp.owl,http://www.biopax.org/release/biopax-level3.owl\nCHEMINF - Chemical Information Ontology,cheminf.owl,http://purl.obolibrary.org/obo/cheminf.owl\nChEBI - Chemical Entities of Biological Interest,chebi.owl,http://purl.obolibrary.org/obo/chebi.owl\nCiTO,cito.nt,http://purl.org/spar/cito.nt\nDCMI Terms,dcterms.nt,https://www.dublincore.org/specifications/dublin-core/dcmi-terms/dublin_core_terms.nt\nFaBiO,fabio.nt,http://purl.org/spar/fabio.nt\nGO - Gene Ontology,go.owl,http://purl.obolibrary.org/obo/go.owl\nIAO - Information Artifact Ontology,iao.owl,http://purl.obolibrary.org/obo/iao.owl\nNCIt,ncit.owl,http://purl.obolibrary.org/obo/ncit.owl\nNDF-RT,ndfrt.owl,https://data.bioontology.org/ontologies/NDF-RT/submissions/1/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nOBI - Ontology for Biomedical Investigations,obi.owl,http://purl.obolibrary.org/obo/obi.owl\nOWL,owl.ttl,http://www.w3.org/2002/07/owl.ttl\nPDBo,pdbo.owl,http://rdf.wwpdb.org/schema/pdbx-v40.owl\nPR - PRotein Ontology (PRO),pr.owl,http://purl.obolibrary.org/obo/pr.owl\nRDF Schema,rdfs.ttl,https://www.w3.org/2000/01/rdf-schema.ttl\nRDF,rdf.ttl,http://www.w3.org/1999/02/22-rdf-syntax-ns.ttl\nRO - Relation Ontology,ro.owl,http://purl.obolibrary.org/obo/ro.owl\nSIO - Semanticscience Integrated Ontology,sio.owl,http://semanticscience.org/ontology/sio.owl\nSKOS,skos.rdf,http://www.w3.org/TR/skos-reference/skos.rdf\nSO - Sequence types and features ontology,so.owl,http://purl.obolibrary.org/obo/so.owl\nUO - Units of measurement ontology,uo.owl,http://purl.obolibrary.org/obo/uo.owl" > ${ONTOLOGIES_DIR}/${ONTOLOGIES_CSV})
33
+
34
+ [index]
35
+ INPUT_FILES = ${data:ONTOLOGIES_DIR}/*.nt.gz ${data:PUBCHEM_DIR}/*/*.ttl.gz ${data:PUBCHEM_DIR}/*/*/*.ttl.gz
36
+ BASE_URL = http://rdf.ncbi.nlm.nih.gov/pubchem
37
+ MULTI_INPUT_JSON = [{ "cmd": "zcat ${data:ONTOLOGIES_DIR}/*.nt.gz", "graph": "${BASE_URL}/ruleset"},
38
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/anatomy", "for-each": "${data:PUBCHEM_DIR}/anatomy/*.ttl.gz" },
39
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/author", "for-each": "${data:PUBCHEM_DIR}/author/*.ttl.gz" },
40
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/bioassay", "for-each": "${data:PUBCHEM_DIR}/bioassay/*.ttl.gz" },
41
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/book", "for-each": "${data:PUBCHEM_DIR}/book/*.ttl.gz" },
42
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/cell", "for-each": "${data:PUBCHEM_DIR}/cell/*.ttl.gz" },
43
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*0.ttl.gz", "graph": "${BASE_URL}/compound" },
44
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*1.ttl.gz", "graph": "${BASE_URL}/compound" },
45
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*2.ttl.gz", "graph": "${BASE_URL}/compound" },
46
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*3.ttl.gz", "graph": "${BASE_URL}/compound" },
47
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*4.ttl.gz", "graph": "${BASE_URL}/compound" },
48
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*5.ttl.gz", "graph": "${BASE_URL}/compound" },
49
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*6.ttl.gz", "graph": "${BASE_URL}/compound" },
50
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*7.ttl.gz", "graph": "${BASE_URL}/compound" },
51
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*8.ttl.gz", "graph": "${BASE_URL}/compound" },
52
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*9.ttl.gz", "graph": "${BASE_URL}/compound" },
53
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/compound", "for-each": "${data:PUBCHEM_DIR}/compound/general/*[!0-9].ttl.gz" },
54
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/concept", "for-each": "${data:PUBCHEM_DIR}/concept/*.ttl.gz" },
55
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/conserveddomain", "for-each": "${data:PUBCHEM_DIR}/conserveddomain/*.ttl.gz" },
56
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/cooccurrence", "for-each": "${data:PUBCHEM_DIR}/cooccurrence/*.ttl.gz" },
57
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*0.ttl.gz", "graph": "${BASE_URL}/descriptor" },
58
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*1.ttl.gz", "graph": "${BASE_URL}/descriptor" },
59
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*2.ttl.gz", "graph": "${BASE_URL}/descriptor" },
60
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*3.ttl.gz", "graph": "${BASE_URL}/descriptor" },
61
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*4.ttl.gz", "graph": "${BASE_URL}/descriptor" },
62
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*5.ttl.gz", "graph": "${BASE_URL}/descriptor" },
63
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*6.ttl.gz", "graph": "${BASE_URL}/descriptor" },
64
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*7.ttl.gz", "graph": "${BASE_URL}/descriptor" },
65
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*8.ttl.gz", "graph": "${BASE_URL}/descriptor" },
66
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*9.ttl.gz", "graph": "${BASE_URL}/descriptor" },
67
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/descriptor", "for-each": "${data:PUBCHEM_DIR}/descriptor/compound/*[!0-9].ttl.gz" },
68
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/descriptor", "for-each": "${data:PUBCHEM_DIR}/descriptor/substance/*.ttl.gz" },
69
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/disease", "for-each": "${data:PUBCHEM_DIR}/disease/*.ttl.gz" },
70
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/endpoint", "for-each": "${data:PUBCHEM_DIR}/endpoint/*.ttl.gz" },
71
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/gene", "for-each": "${data:PUBCHEM_DIR}/gene/*.ttl.gz"},
72
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/grant", "for-each": "${data:PUBCHEM_DIR}/grant/*.ttl.gz" },
73
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/inchikey", "for-each": "${data:PUBCHEM_DIR}/inchikey/*.ttl.gz" },
74
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/journal", "for-each": "${data:PUBCHEM_DIR}/journal/*.ttl.gz" },
75
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/measuregroup", "for-each": "${data:PUBCHEM_DIR}/measuregroup/*.ttl.gz" },
76
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/organization", "for-each": "${data:PUBCHEM_DIR}/organization/*.ttl.gz" },
77
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*0.ttl.gz", "graph": "${BASE_URL}/patent" },
78
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*1.ttl.gz", "graph": "${BASE_URL}/patent" },
79
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*2.ttl.gz", "graph": "${BASE_URL}/patent" },
80
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*3.ttl.gz", "graph": "${BASE_URL}/patent" },
81
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*4.ttl.gz", "graph": "${BASE_URL}/patent" },
82
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*5.ttl.gz", "graph": "${BASE_URL}/patent" },
83
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*6.ttl.gz", "graph": "${BASE_URL}/patent" },
84
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*7.ttl.gz", "graph": "${BASE_URL}/patent" },
85
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*8.ttl.gz", "graph": "${BASE_URL}/patent" },
86
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*9.ttl.gz", "graph": "${BASE_URL}/patent" },
87
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/cpc/*.ttl.gz", "graph": "${BASE_URL}/patent" },
88
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/ipc/*.ttl.gz", "graph": "${BASE_URL}/patent" },
89
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/patent", "for-each": "${data:PUBCHEM_DIR}/patent/*[!0-9].ttl.gz" },
90
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/pathway", "for-each": "${data:PUBCHEM_DIR}/pathway/*.ttl.gz" },
91
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/protein", "for-each": "${data:PUBCHEM_DIR}/protein/*.ttl.gz" },
92
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*0.ttl.gz", "graph": "${BASE_URL}/reference" },
93
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*1.ttl.gz", "graph": "${BASE_URL}/reference" },
94
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*2.ttl.gz", "graph": "${BASE_URL}/reference" },
95
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*3.ttl.gz", "graph": "${BASE_URL}/reference" },
96
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*4.ttl.gz", "graph": "${BASE_URL}/reference" },
97
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*5.ttl.gz", "graph": "${BASE_URL}/reference" },
98
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*6.ttl.gz", "graph": "${BASE_URL}/reference" },
99
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*7.ttl.gz", "graph": "${BASE_URL}/reference" },
100
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*8.ttl.gz", "graph": "${BASE_URL}/reference" },
101
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*9.ttl.gz", "graph": "${BASE_URL}/reference" },
102
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/reference", "for-each": "${data:PUBCHEM_DIR}/reference/*[!0-9].ttl.gz" },
103
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/source", "for-each": "${data:PUBCHEM_DIR}/source/*.ttl.gz" },
104
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*0.ttl.gz", "graph": "${BASE_URL}/substance" },
105
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*1.ttl.gz", "graph": "${BASE_URL}/substance" },
106
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*2.ttl.gz", "graph": "${BASE_URL}/substance" },
107
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*3.ttl.gz", "graph": "${BASE_URL}/substance" },
108
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*4.ttl.gz", "graph": "${BASE_URL}/substance" },
109
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*5.ttl.gz", "graph": "${BASE_URL}/substance" },
110
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*6.ttl.gz", "graph": "${BASE_URL}/substance" },
111
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*7.ttl.gz", "graph": "${BASE_URL}/substance" },
112
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*8.ttl.gz", "graph": "${BASE_URL}/substance" },
113
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*9.ttl.gz", "graph": "${BASE_URL}/substance" },
114
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/substance", "for-each": "${data:PUBCHEM_DIR}/substance/*[!0-9].ttl.gz" },
115
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/synonym", "for-each": "${data:PUBCHEM_DIR}/synonym/*.ttl.gz" },
116
+ { "cmd": "zcat {}", "graph": "${BASE_URL}/taxonomy", "for-each": "${data:PUBCHEM_DIR}/taxonomy/*.ttl.gz" }]
117
+ SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "num-triples-per-batch": 10000000 }
118
+ STXXL_MEMORY = 20G
119
+
120
+ [server]
121
+ PORT = 7023
122
+ ACCESS_TOKEN = ${data:NAME}
123
+ MEMORY_FOR_QUERIES = 20G
124
+ TIMEOUT = 600s
125
+
126
+ [runtime]
127
+ SYSTEM = docker
128
+ IMAGE = docker.io/adfreiburg/qlever:latest
129
+
130
+ [ui]
131
+ UI_CONFIG = pubchem
@@ -0,0 +1,29 @@
1
+ # Qleverfile for "scientists", use with qlever script (pip install qlever)
2
+ #
3
+ # qlever get-data # get "scientists" dataset (370M triples, 2.2M text records)
4
+ # qlever index # build index, including text index (takes ~20 seconds)
5
+ # qlever start # start the server (instant)
6
+
7
+ [data]
8
+ NAME = scientists
9
+ GET_DATA_CMD = curl -LRC - -O https://github.com/ad-freiburg/qlever/raw/master/e2e/scientist-collection.zip && unzip -j scientist-collection.zip && rm -f scientist-collection.zip
10
+ DESCRIPTION = Test collection from https://github.com/ad-freiburg/qlever/tree/master/e2e (triples and text about scientists)
11
+ TEXT_DESCRIPTION = Text from all literals and Wikipedia articles on scientists (use ql:contains-entity and ql:contains-word)
12
+
13
+ [index]
14
+ INPUT_FILES = ${data:NAME}.nt
15
+ CAT_INPUT_FILES = cat ${INPUT_FILES}
16
+ SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 100000 }
17
+ TEXT_INDEX = from_text_records_and_literals
18
+
19
+ [server]
20
+ PORT = 7020
21
+ ACCESS_TOKEN = ${data:NAME}
22
+ MEMORY_FOR_QUERIES = 5G
23
+
24
+ [runtime]
25
+ SYSTEM = docker
26
+ IMAGE = docker.io/adfreiburg/qlever:latest
27
+
28
+ [ui]
29
+ UI_CONFIG = scientists
@@ -0,0 +1,74 @@
1
+ # Qleverfile for UniProt, use with https://github.com/ad-freiburg/qlever-control
2
+ #
3
+ # qlever get-data # takes ~ 30 hours and ~ 1.6 TB of disk (for the TTL files)
4
+ # qlever index # takes ~ 40 hours and ~ 60 GB RAM (on an AMD Ryzen 9 9950X)
5
+ # qlever start # starts the server (takes a few seconds)
6
+ #
7
+ # Install packages: sudo apt install -y libxml2-utils raptor2-utils parallel xz-utils wget
8
+ # Install manually: Apache Jena binaries (https://dlcdn.apache.org/jena/binaries)
9
+ #
10
+ # Set DATE to the date of the latest release. Build on SSD (requires ~ 7 TB
11
+ # during build, ~ 3 TB after build).
12
+
13
+ [data]
14
+ NAME = uniprot
15
+ DATE = 2025-06-18
16
+ RDFXML_DIR = rdf.${DATE}
17
+ TTL_DIR = ttl.${DATE}
18
+ UNIPROT_URL = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf
19
+ RHEA_URL = https://ftp.expasy.org/databases/rhea/rdf
20
+ EXAMPLES_URL = https://github.com/sib-swiss/sparql-examples
21
+ GET_EXAMPLES_CMD = mkdir -p ${TTL_DIR} && git clone ${EXAMPLES_URL} && (cd sparql-examples && ./convertToOneTurtle.sh -p uniprot && gzip examples_uniprot.ttl && mv -f examples_uniprot.ttl.gz ../${TTL_DIR} && cd .. && rm -rf sparql-examples)
22
+ GET_RDFXML_CMD = mkdir -p ${RDFXML_DIR} && (echo "${RHEA_URL}/chebi.owl.gz"; echo "${RHEA_URL}/rhea.rdf.gz"; curl -s ${UNIPROT_URL}/RELEASE.meta4 | sed "s/<metalink.*/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" -) | while read URL; do wget --no-verbose -P ${RDFXML_DIR} $$URL 2>&1 | tee -a uniprot.download-log; done
23
+ RDFXML2TTL_CMD = mkdir -p ${TTL_DIR} && for RDFXML in ${RDFXML_DIR}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=ttl -q 2> ${TTL_DIR}/$$(basename $$RDFXML).stderr | gzip -c > ${TTL_DIR}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/ttl.gz/') && echo 'DONE converting $$RDFXML'"; done | parallel
24
+ GET_DATA_CMD = date > ${NAME}.get-data.begin-date && ${GET_EXAMPLES_CMD} && ${GET_RDFXML_CMD} && ${RDFXML2TTL_CMD} && date > ${NAME}.get-data.end-date
25
+ DESCRIPTION = UniProt, RDF XML from ${UNIPROT_URL} + additional data from ${RHEA_URL} and ${EXAMPLES_URL}, version ${DATE}
26
+
27
+ [index]
28
+ INPUT_FILES = ${data:TTL_DIR}/*.ttl.gz
29
+ MULTI_INPUT_JSON = [{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniprot", "for-each": "${data:TTL_DIR}/uniprotkb_reviewed_*.ttl.gz" },
30
+ { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniprot", "for-each": "${data:TTL_DIR}/uniprotkb_unreviewed_*.ttl.gz" },
31
+ { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniparc", "for-each": "${data:TTL_DIR}/uniparc_*.ttl.gz" },
32
+ { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniref", "for-each": "${data:TTL_DIR}/uniref*.ttl.gz" },
33
+ { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/obsolete", "for-each": "${data:TTL_DIR}/uniprotkb_obsolete_*.ttl.gz" },
34
+ { "cmd": "zcat ${data:TTL_DIR}/chebi.ttl.gz", "graph": "http://sparql.uniprot.org/chebi" },
35
+ { "cmd": "zcat ${data:TTL_DIR}/citation_mapping.ttl.gz", "graph": "http://sparql.uniprot.org/citationmapping" },
36
+ { "cmd": "zcat ${data:TTL_DIR}/citations.ttl.gz", "graph": "http://sparql.uniprot.org/citations" },
37
+ { "cmd": "zcat ${data:TTL_DIR}/databases.ttl.gz", "graph": "http://sparql.uniprot.org/databases" },
38
+ { "cmd": "zcat ${data:TTL_DIR}/diseases.ttl.gz", "graph": "http://sparql.uniprot.org/diseases" },
39
+ { "cmd": "zcat ${data:TTL_DIR}/enzyme-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/enzymes" },
40
+ { "cmd": "zcat ${data:TTL_DIR}/enzyme.ttl.gz", "graph": "http://sparql.uniprot.org/enzymes" },
41
+ { "cmd": "zcat ${data:TTL_DIR}/go-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/go" },
42
+ { "cmd": "zcat ${data:TTL_DIR}/go.ttl.gz", "graph": "http://sparql.uniprot.org/go" },
43
+ { "cmd": "zcat ${data:TTL_DIR}/journals.ttl.gz", "graph": "http://sparql.uniprot.org/journal" },
44
+ { "cmd": "zcat ${data:TTL_DIR}/keywords-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/keywords" },
45
+ { "cmd": "zcat ${data:TTL_DIR}/keywords.ttl.gz", "graph": "http://sparql.uniprot.org/keywords" },
46
+ { "cmd": "zcat ${data:TTL_DIR}/locations-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/locations" },
47
+ { "cmd": "zcat ${data:TTL_DIR}/locations.ttl.gz", "graph": "http://sparql.uniprot.org/locations" },
48
+ { "cmd": "zcat ${data:TTL_DIR}/pathways-hierarchy*.ttl.gz", "graph": "http://sparql.uniprot.org/pathways" },
49
+ { "cmd": "zcat ${data:TTL_DIR}/pathways.ttl.gz", "graph": "http://sparql.uniprot.org/pathways" },
50
+ { "cmd": "zcat ${data:TTL_DIR}/proteomes.ttl.gz", "graph": "http://sparql.uniprot.org/proteomes" },
51
+ { "cmd": "zcat ${data:TTL_DIR}/taxonomy-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/taxonomy" },
52
+ { "cmd": "zcat ${data:TTL_DIR}/taxonomy.ttl.gz", "graph": "http://sparql.uniprot.org/taxonomy" },
53
+ { "cmd": "zcat ${data:TTL_DIR}/tissues.ttl.gz", "graph": "http://sparql.uniprot.org/tissues" },
54
+ { "cmd": "zcat ${data:TTL_DIR}/rhea.ttl.gz", "graph": "https://sparql.rhea-db.org/rhea" },
55
+ { "cmd": "zcat ${data:TTL_DIR}/examples_uniprot.ttl.gz", "graph": "http://sparql.uniprot.org/.well-known/sparql-examples" },
56
+ { "cmd": "zcat ${data:TTL_DIR}/core.ttl.gz", "graph": "http://purl.uniprot.org/core" }]
57
+ SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 }
58
+ STXXL_MEMORY = 80G
59
+ ULIMIT = 50000
60
+
61
+ [server]
62
+ PORT = 7018
63
+ ACCESS_TOKEN = ${data:NAME}
64
+ MEMORY_FOR_QUERIES = 20G
65
+ CACHE_MAX_SIZE = 10G
66
+ CACHE_MAX_SIZE_SINGLE_ENTRY = 5G
67
+ TIMEOUT = 300s
68
+
69
+ [runtime]
70
+ SYSTEM = docker
71
+ IMAGE = docker.io/adfreiburg/qlever:latest
72
+
73
+ [ui]
74
+ UI_CONFIG = uniprot
@@ -0,0 +1,31 @@
1
+ # Qleverfile for VVZ, use with https://github.com/ad-freiburg/qlever-control
2
+ #
3
+ # qlever get-data # this requires a separate internal tool
4
+ # qlever index # builds the index (takes a few seconds)
5
+ # qlever start # starts the server (takes a few seconds)
6
+ #
7
+ # Also builds a text index for fast keyword search in literals.
8
+
9
+ [data]
10
+ NAME = vvz
11
+ GET_DATA_CMD = echo "This requires a separate tool"
12
+ DESCRIPTION = VVZ Uni Freiburg, selected faculties
13
+ TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")
14
+
15
+ [index]
16
+ INPUT_FILES = vvz.ttl
17
+ CAT_INPUT_FILES = cat ${INPUT_FILES}
18
+ SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }
19
+ TEXT_INDEX = from_literals
20
+
21
+ [server]
22
+ PORT = 7041
23
+ ACCESS_TOKEN = ${data:NAME}
24
+ MEMORY_FOR_QUERIES = 10G
25
+
26
+ [runtime]
27
+ SYSTEM = docker
28
+ IMAGE = docker.io/adfreiburg/qlever:latest
29
+
30
+ [ui]
31
+ UI_CONFIG = vvz
@@ -0,0 +1,42 @@
1
+ # Qleverfile for Wikidata, use with the QLever CLI (`pip install qlever`)
2
+ #
3
+ # qlever get-data # ~7 hours, ~110 GB (compressed), ~20 billion triples
4
+ # qlever index # ~5 hours, ~20 GB RAM, ~500 GB index size on disk
5
+ # qlever start # a few seconds, adjust MEMORY_FOR_QUERIES as needed
6
+ #
7
+ # Adding a text index takes an additional ~2 hours and ~50 GB of disk space
8
+ #
9
+ # Measured on an AMD Ryzen 9 5950X with 128 GB RAM, and NVMe SSD (18.10.2024)
10
+
11
+ [DEFAULT]
12
+ NAME = wikidata
13
+
14
+ [data]
15
+ GET_DATA_URL = https://dumps.wikimedia.org/wikidatawiki/entities
16
+ GET_DATA_CMD = curl -LRC - -O ${GET_DATA_URL}/latest-all.ttl.bz2 -O ${GET_DATA_URL}/latest-lexemes.ttl.bz2 2>&1 | tee wikidata.download-log.txt && curl -sL ${GET_DATA_URL}/dcatap.rdf | docker run -i --rm -v $$(pwd):/data stain/jena riot --syntax=RDF/XML --output=NT /dev/stdin > dcatap.nt
17
+ DATE_WIKIDATA = $$(date -r latest-all.ttl.bz2 +%d.%m.%Y || echo "NO_DATE")
18
+ DATE_WIKIPEDIA = $$(date -r wikipedia-abstracts.nt +%d.%m.%Y || echo "NO_DATE")
19
+ DESCRIPTION = Complete Wikidata, from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2), version ${DATE_WIKIDATA}
20
+
21
+ [index]
22
+ INPUT_FILES = latest-all.ttl.bz2 latest-lexemes.ttl.bz2 dcatap.nt
23
+ MULTI_INPUT_JSON = [{ "cmd": "lbzcat -n 4 latest-all.ttl.bz2", "format": "ttl", "parallel": "true" },
24
+ { "cmd": "lbzcat -n 1 latest-lexemes.ttl.bz2", "format": "ttl", "parallel": "false" },
25
+ { "cmd": "cat dcatap.nt", "format": "nt", "parallel": "false" }]
26
+ SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 5000000 }
27
+ STXXL_MEMORY = 10G
28
+
29
+ [server]
30
+ PORT = 7001
31
+ ACCESS_TOKEN = ${data:NAME}
32
+ MEMORY_FOR_QUERIES = 20G
33
+ CACHE_MAX_SIZE = 15G
34
+ CACHE_MAX_SIZE_SINGLE_ENTRY = 5G
35
+ TIMEOUT = 600s
36
+
37
+ [runtime]
38
+ SYSTEM = docker
39
+ IMAGE = adfreiburg/qlever
40
+
41
+ [ui]
42
+ UI_CONFIG = wikidata
@@ -0,0 +1,40 @@
1
+ # Qleverfile for WikiPathways, use with https://github.com/ad-freiburg/qlever-control
2
+ #
3
+ # qlever get-data # takes ~3 seconds, generates TTL of size ~600 MB
4
+ # qlever index # takes ~20 seconds and little RAM (on an AMD Ryzen 9 5900X)
5
+ # qlever start # instant
6
+ #
7
+ # Limitations: does not include the ontologies (WP, GPML, ChEBI, PW, CLO, ...) yet
8
+
9
+ [data]
10
+ NAME = wikipathways
11
+ RELEASE = current
12
+ GET_DATA_URL = https://data.wikipathways.org/${RELEASE}/rdf
13
+ GET_DATA_CMD = wget -O wikipathways-rdf-void.ttl ${GET_DATA_URL}/wikipathways-rdf-void.ttl && \
14
+ wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-wp.zip && \
15
+ unzip -qq -c wikipathways-${RELEASE}-rdf-wp.zip -x wp/wpOntology.ttl > wikipathways-rdf-wp.ttl && \
16
+ wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-gpml.zip && \
17
+ unzip -qq -c wikipathways-${RELEASE}-rdf-gpml.zip -x gpml/gpmlOntology.ttl > wikipathways-rdf-gpml.ttl && \
18
+ wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-authors.zip && \
19
+ unzip -qq -c wikipathways-${RELEASE}-rdf-authors.zip > wikipathways-rdf-authors.ttl && \
20
+ cat wikipathways-rdf-*.ttl | grep ^@prefix | tr -s ' ' | sort -u > ${NAME}.prefix-definitions
21
+ DESCRIPTION = WikiPathways RDF, from ${GET_DATA_URL}
22
+ TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")
23
+
24
+ [index]
25
+ INPUT_FILES = ${data:NAME}.prefix-definitions wikipathways-rdf-wp.ttl wikipathways-rdf-gpml.ttl wikipathways-rdf-void.ttl wikipathways-rdf-authors.ttl
26
+ CAT_INPUT_FILES = cat ${INPUT_FILES}
27
+ SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [""] }
28
+ TEXT_INDEX = from_literals
29
+
30
+ [server]
31
+ PORT = 7040
32
+ ACCESS_TOKEN = ${data:NAME}
33
+ MEMORY_FOR_QUERIES = 5G
34
+
35
+ [runtime]
36
+ SYSTEM = docker
37
+ IMAGE = docker.io/adfreiburg/qlever:latest
38
+
39
+ [ui]
40
+ UI_CONFIG = wikipathways
@@ -0,0 +1,33 @@
1
+ # Qleverfile for YAGO 4, use with https://github.com/ad-freiburg/qlever-control
2
+ #
3
+ # qlever get-data # downloads 8 nt.gz files of total size ~60 GB (as of 12.03.2020)
4
+ # qlever index # takes ~4 hours and ~10 GB RAM (on an AMD Ryzen 9 5900X)
5
+ # qlever start # starts the server
6
+
7
+ # NOTE concerning GET_DATA_CMD: The triples from wd-annotated-facts are
8
+ # contained in wd-facts. The "full types" are the YAGO types, the "simple
9
+ # types" are the schema.org types. They don't interfere with each other because
10
+ # they have distinct prefixes.
11
+
12
+ [data]
13
+ NAME = yago-4
14
+ GET_DATA_CMD = curl --location --continue-at - --remote-name-all https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-class.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-facts.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-full-types.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-labels.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-sameAs.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-schema.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-shapes.nt.gz https://yago-knowledge.org/data/yago4/full/2020-02-24/yago-wd-simple-types.nt.gz
15
+ DESCRIPTION = "Full dump from https://yago-knowledge.org/downloads/yago-4, version 12.03.2020"
16
+
17
+ [index]
18
+ INPUT_FILES = yago-wd-*.nt.gz
19
+ CAT_INPUT_FILES = zcat ${INPUT_FILES}
20
+ SETTINGS_JSON = { "languages-internal": ["en"], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
21
+ STXXL_MEMORY = 10G
22
+
23
+ [server]
24
+ PORT = 9004
25
+ ACCESS_TOKEN = ${data:NAME}
26
+ MEMORY_FOR_QUERIES = 30G
27
+
28
+ [runtime]
29
+ SYSTEM = docker
30
+ IMAGE = docker.io/adfreiburg/qlever:latest
31
+
32
+ [ui]
33
+ UI_CONFIG = yago-4