setlr 0.2.19.tar.gz → 1.0.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {setlr-0.2.19 → setlr-1.0.0}/PKG-INFO +20 -3
- {setlr-0.2.19 → setlr-1.0.0}/setlr/__init__.py +54 -28
- setlr-1.0.0/setlr/_version.py +4 -0
- {setlr-0.2.19 → setlr-1.0.0}/setlr.egg-info/PKG-INFO +20 -3
- {setlr-0.2.19 → setlr-1.0.0}/setlr.egg-info/SOURCES.txt +1 -0
- {setlr-0.2.19 → setlr-1.0.0}/setlr.egg-info/entry_points.txt +0 -1
- {setlr-0.2.19 → setlr-1.0.0}/setlr.egg-info/requires.txt +3 -0
- {setlr-0.2.19 → setlr-1.0.0}/setup.py +5 -1
- {setlr-0.2.19 → setlr-1.0.0}/LICENSE +0 -0
- {setlr-0.2.19 → setlr-1.0.0}/README.md +0 -0
- {setlr-0.2.19 → setlr-1.0.0}/setlr/iterparse_filter.py +0 -0
- {setlr-0.2.19 → setlr-1.0.0}/setlr/sqlite-store.py +0 -0
- {setlr-0.2.19 → setlr-1.0.0}/setlr/trig_store.py +0 -0
- {setlr-0.2.19 → setlr-1.0.0}/setlr.egg-info/dependency_links.txt +0 -0
- {setlr-0.2.19 → setlr-1.0.0}/setlr.egg-info/pbr.json +0 -0
- {setlr-0.2.19 → setlr-1.0.0}/setlr.egg-info/top_level.txt +0 -0
- {setlr-0.2.19 → setlr-1.0.0}/setup.cfg +0 -0
{setlr-0.2.19 → setlr-1.0.0}/PKG-INFO

@@ -1,17 +1,34 @@
 Metadata-Version: 2.1
 Name: setlr
-Version: 0.2.19
+Version: 1.0.0
 Summary: setlr is a tool for Semantic Extraction, Transformation, and Loading.
 Home-page: http://packages.python.org/setlr
 Author: Jamie McCusker
 Author-email: mccusj@cs.rpi.edu
 License: Apache License 2.0
 Keywords: rdf semantic etl
-Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Topic :: Utilities
 Classifier: License :: OSI Approved :: Apache Software License
 License-File: LICENSE
+Requires-Dist: future
+Requires-Dist: pip>=9.0.0
+Requires-Dist: cython
+Requires-Dist: numpy
+Requires-Dist: rdflib>=6.0.0
+Requires-Dist: pandas>=0.23.0
+Requires-Dist: requests
+Requires-Dist: toposort
+Requires-Dist: beautifulsoup4
+Requires-Dist: jinja2
+Requires-Dist: lxml
+Requires-Dist: six
+Requires-Dist: xlrd
+Requires-Dist: ijson
+Requires-Dist: click
+Requires-Dist: tqdm
+Requires-Dist: requests-testadapter
+Requires-Dist: python-slugify
+Requires-Dist: pyshacl[js]
 
 SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.
-
{setlr-0.2.19 → setlr-1.0.0}/setlr/__init__.py

@@ -33,8 +33,11 @@ import gzip
 
 import logging
 
+from tqdm import tqdm
+
 import hashlib
 from slugify import slugify
+from pyshacl import validate
 
 from .trig_store import TrigStore
 
@@ -52,6 +55,7 @@ sp = Namespace('http://spinrdf.org/sp#')
 sd = Namespace('http://www.w3.org/ns/sparql-service-description#')
 dc = Namespace('http://purl.org/dc/terms/')
 void = Namespace('http://rdfs.org/ns/void#')
+shacl = Namespace('http://www.w3.org/ns/shacl#')
 api_vocab = Namespace('http://purl.org/linked-data/api/vocab#')
 
 sys.setrecursionlimit(10000)
@@ -87,7 +91,7 @@ datatypeConverters.update({
     XSD.double: float
 })
 
-run_samples =
+run_samples = -1
 
 _rdf_formats_to_guess = [
     'xml',
@@ -110,7 +114,7 @@ def read_csv(location, result):
         args['header'] = [0]
     with get_content(location, result) as fo:
         df = pandas.read_csv(fo, encoding='utf-8', **args)
-    logger.debug("Loaded %s", location)
+    #logger.debug("Loaded %s", location)
     return df
 
 def read_graph(location, result, g = None):
@@ -263,7 +267,7 @@ def read_xml(location, result):
     for xp in result[setl.xpath]:
         f.iter_end(xp.value)
     with get_content(location, result) as fo:
-        for (i, (event, ele)) in enumerate(f.iterparse(fo)):
+        for (i, (event, ele)) in enumerate(tqdm(f.iterparse(fo))):
             yield i, ele
 
 
@@ -274,7 +278,7 @@ def read_json(location, result):
     else:
         selector = ""
     with get_content(location, result) as fo:
-        yield from enumerate(ijson.items(fo, selector))
+        yield from enumerate(tqdm(ijson.items(fo, selector)))
 
 
 extractors = {
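The read_xml and read_json changes above apply the same pattern used throughout this release: wrap the extractor's iterator in tqdm so long extractions report progress. A minimal sketch of that pattern, with a placeholder iterable standing in for the ijson/iterparse streams:

```python
# Minimal sketch of the tqdm-wrapping pattern; `records` is a placeholder
# for any extractor stream (ijson items, XML iterparse events, ...).
from tqdm import tqdm

def stream_records(records):
    # tqdm passes items through unchanged while printing a progress bar;
    # enumerate keeps the index that setlr yields next to each item.
    yield from enumerate(tqdm(records))

for i, item in stream_records(range(1000)):
    pass  # process each (index, item) pair here
```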
@@ -387,7 +391,7 @@ def get_order(setl_graph):
     return toposort_flatten(nodes)
 
 def extract(e, resources):
-    logger.info('
+    logger.info('Extract %s',e.identifier)
     used = e.value(prov.used)
     for result in e.subjects(prov.wasGeneratedBy):
         if used is None:
@@ -395,7 +399,7 @@
         for t in result[RDF.type]:
             # Do we know how to generate this?
             if t.identifier in extractors:
-                logger.info("
+                logger.info("Using %s", used.identifier)
                 resources[result.identifier] = extractors[t.identifier](used.identifier, result)
                 return resources[result.identifier]
 
@@ -606,7 +610,7 @@ def process_row(row, template, rowname, table, resources, transform, variables):
     return flatten_lists(result)
 
 def json_transform(transform, resources):
-    logger.info("
+    logger.info("Transform %s", transform.identifier)
     tables = [u for u in transform[prov.used]]
     variables = {}
     for usage in transform[prov.qualifiedUsage]:
@@ -619,6 +623,20 @@
     generated = list(transform.subjects(prov.wasGeneratedBy))[0]
     logger.info("Generating %s", generated.identifier)
 
+    connected_downstream_graph = '''
+    construct {
+        ?target ?p ?o
+    } where {
+        ?source (<>|!<>)* ?target.
+        ?target ?p ?o.
+    }
+    '''
+    shape_graph = Graph()
+    for shape in transform.objects(dc.conformsTo):
+        if shape[RDF.type:shacl.NodeShape] or shape[RDF.type:shacl.PropertyShape]:
+            logger.info("Validating against SHACL shape %s", shape.identifier)
+            shape_graph += transform.graph.query(connected_downstream_graph,
+                                                 initBindings={"source":shape.identifier})
     if generated.identifier in resources:
         result = resources[generated.identifier]
     else:
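The `(<>|!<>)*` property path in `connected_downstream_graph` matches any predicate (either `<>` or anything that is not `<>`) zero or more times, so the CONSTRUCT copies every triple reachable from the bound `?source` shape into `shape_graph`. A standalone sketch of the same query style with rdflib, using an invented example graph:

```python
# Illustrative only: a CONSTRUCT with an (<>|!<>)* path plus initBindings
# pulls the subgraph reachable from one starting node, as json_transform
# now does for each SHACL shape.
from rdflib import Graph, Literal, Namespace

EX = Namespace("http://example.org/")
g = Graph()
g.add((EX.shape, EX.property, EX.constraint))
g.add((EX.constraint, EX.minCount, Literal(1)))
g.add((EX.unrelated, EX.p, Literal("not reachable from ex:shape")))

CONNECTED = """
construct { ?target ?p ?o }
where {
  ?source (<>|!<>)* ?target .
  ?target ?p ?o .
}
"""

subgraph = Graph()
subgraph += g.query(CONNECTED, initBindings={"source": EX.shape})
print(len(subgraph))  # 2 triples: the ones about ex:shape and ex:constraint
```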
@@ -652,12 +670,12 @@
         if isinstance(table, pandas.DataFrame):
             #if run_samples:
             #    table = table.head()
-            it = table.iterrows()
-            logger.info("Transforming %s rows.", len(table.index))
+            it = tqdm(table.iterrows(), total=table.shape[0])
+            #logger.info("Transforming %s rows.", len(table.index))
         else:
-            logger.info("
+            logger.info("Transform %s", t.identifier)
         for rowname, row in it:
-            if run_samples and rowname >=
+            if run_samples > 0 and rowname >= run_samples:
                 break
             try:
                 root = None
@@ -668,17 +686,28 @@
                 }
                 if context is not None:
                     root['@context'] = context
+
                 #logger.debug(json.dumps(root, indent=4))
                 #before = len(result)
                 #graph = ConjunctiveGraph(identifier=generated.identifier)
                 #graph.parse(data=json.dumps(root),format="json-ld")
                 data = json.dumps(root)
                 #del root
+
+                if len(shape_graph) > 0:
+                    d = ConjunctiveGraph()
+                    d.parse(data=data,format='json-ld')
+                    conforms, report, message = validate(d,
+                                                         shacl_graph=shape_graph,
+                                                         advanced=True,
+                                                         debug=False)
+                    if not conforms:
+                        print(message)
                 result.parse(data=data, format="json-ld")
                 #del data
                 #after = len(result)
-                logger.debug("Row "+str(rowname))#+" added "+str(after-before)+" triples.")
-                sys.stdout.flush()
+                #logger.debug("Row "+str(rowname))#+" added "+str(after-before)+" triples.")
+                #sys.stdout.flush()
             except Exception as e:
                 trace = sys.exc_info()[2]
                 if data is not None:
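pyshacl's validate() returns a (conforms, results_graph, results_text) triple, which the new code unpacks as conforms, report, message and prints when validation fails. A minimal standalone sketch of the same call, with an invented shape and data graph:

```python
# Minimal pyshacl sketch; the shapes and data here are invented examples,
# not setlr's. The validate() call mirrors the one added to json_transform.
from rdflib import Graph
from pyshacl import validate

shapes = Graph().parse(format="turtle", data="""
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix ex: <http://example.org/> .
ex:PersonShape a sh:NodeShape ;
    sh:targetClass ex:Person ;
    sh:property [ sh:path ex:name ; sh:minCount 1 ] .
""")

data = Graph().parse(format="turtle", data="""
@prefix ex: <http://example.org/> .
ex:alice a ex:Person .  # no ex:name, so the shape is violated
""")

conforms, report_graph, report_text = validate(
    data, shacl_graph=shapes, advanced=True, debug=False)
print(conforms)      # False
print(report_text)   # human-readable validation report
```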
@@ -752,7 +781,7 @@ def _load_open(generated):
     return fh
 
 def load(load_resource, resources):
-    logger.info('
+    logger.info('Load %s',load_resource.identifier)
     file_graph = Dataset(default_union=True)
     to_disk = False
     for used in load_resource[prov.used]:
@@ -821,10 +850,16 @@ def _setl(setl_graph):
     return resources
 logger = None
 
-
-
+import click
+@click.command()
+@click.option('--quiet', '-q', is_flag=True, default=False, help="Minimize logging.")
+@click.option('-n', default=-1, help="Only process the first N rows.", type=int)
+#@click.option('--rdf-validation', default=None, help="Save the RDF validation report to this file.")
+#@click.option('--text-validation', default=None, help="Save the text validation report to this file.")
+@click.argument('script', type=click.Path(exists=True))
+def main(script, rdf_validation=None, text_validation=None, quiet=False, n=-1):
     logging_level = logging.DEBUG
-    if
+    if quiet:
         logging_level = logging.WARNING
     logging.basicConfig(level=logging_level)
 
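Together with the existing `setlr=setlr:main` console script (see setup.py below), the click decorators above turn main() into a CLI that takes the SETL script as a positional argument plus `--quiet/-q` and `-n` options. A hedged sketch of driving it through click's test runner; `example.setl.ttl` is a placeholder and must name a real file because of `click.Path(exists=True)`:

```python
# Sketch only: exercises the new click-based entry point programmatically.
# "example.setl.ttl" is a placeholder path, not a file shipped with setlr.
from click.testing import CliRunner
import setlr

runner = CliRunner()
result = runner.invoke(setlr.main, ["--quiet", "-n", "100", "example.setl.ttl"])
print(result.exit_code)
print(result.output)
```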
@@ -832,18 +867,9 @@ def main():
     logger = logging.getLogger(__name__)
 
     global run_samples
-
-    if 'sample' in args:
-        run_samples = True
-        logger.warning("Only processing a few sample rows.")
+    run_samples = n
     setl_graph = ConjunctiveGraph()
-    content = open(
+    content = open(script).read()
     setl_graph.parse(data=content, format="turtle")
 
     graphs = _setl(setl_graph)
-    # print "Finished processing"
-    # return graphs
-
-if __name__ == '__main__':
-    result = main()
-    logger.info("Exiting")
{setlr-0.2.19 → setlr-1.0.0}/setlr.egg-info/PKG-INFO

@@ -1,17 +1,34 @@
 Metadata-Version: 2.1
 Name: setlr
-Version: 0.2.19
+Version: 1.0.0
 Summary: setlr is a tool for Semantic Extraction, Transformation, and Loading.
 Home-page: http://packages.python.org/setlr
 Author: Jamie McCusker
 Author-email: mccusj@cs.rpi.edu
 License: Apache License 2.0
 Keywords: rdf semantic etl
-Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Topic :: Utilities
 Classifier: License :: OSI Approved :: Apache Software License
 License-File: LICENSE
+Requires-Dist: future
+Requires-Dist: pip>=9.0.0
+Requires-Dist: cython
+Requires-Dist: numpy
+Requires-Dist: rdflib>=6.0.0
+Requires-Dist: pandas>=0.23.0
+Requires-Dist: requests
+Requires-Dist: toposort
+Requires-Dist: beautifulsoup4
+Requires-Dist: jinja2
+Requires-Dist: lxml
+Requires-Dist: six
+Requires-Dist: xlrd
+Requires-Dist: ijson
+Requires-Dist: click
+Requires-Dist: tqdm
+Requires-Dist: requests-testadapter
+Requires-Dist: python-slugify
+Requires-Dist: pyshacl[js]
 
 SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.
-
{setlr-0.2.19 → setlr-1.0.0}/setup.py

@@ -1,5 +1,6 @@
 import os
 from setuptools import setup, find_packages
+from setlr._version import __version__
 
 # Utility function to read the README file.
 # Used for the long_description. It's nice, because now 1) we have a top level
@@ -10,7 +11,7 @@ def read(fname):
 
 setup(
     name = "setlr",
-    version =
+    version = __version__,
     author = "Jamie McCusker",
     author_email = "mccusj@cs.rpi.edu",
     description = ("setlr is a tool for Semantic Extraction, Transformation, and Loading."),
@@ -35,8 +36,11 @@ setup(
        'six',
        'xlrd',
        'ijson',
+       'click',
+       'tqdm',
        'requests-testadapter',
        'python-slugify',
+       'pyshacl[js]'
    ],
    entry_points = {
        'console_scripts': ['setlr=setlr:main'],
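setup.py now imports its version from the new setlr/_version.py module, which the file list above records as a four-line addition whose contents are not shown in this diff. Given that the release is 1.0.0, it presumably boils down to something like the following; treat this as an assumption, not the actual file:

```python
# Assumed contents of setlr/_version.py (not shown in the diff); all that
# setup.py needs is the __version__ name it imports.
__version__ = '1.0.0'
```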
All other files in the package are unchanged.