setlr 0.2.18__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {setlr-0.2.18 → setlr-1.0.0}/PKG-INFO +20 -3
- {setlr-0.2.18 → setlr-1.0.0}/setlr/__init__.py +82 -33
- setlr-1.0.0/setlr/_version.py +4 -0
- {setlr-0.2.18 → setlr-1.0.0}/setlr.egg-info/PKG-INFO +20 -3
- {setlr-0.2.18 → setlr-1.0.0}/setlr.egg-info/SOURCES.txt +1 -0
- {setlr-0.2.18 → setlr-1.0.0}/setlr.egg-info/entry_points.txt +0 -1
- {setlr-0.2.18 → setlr-1.0.0}/setlr.egg-info/requires.txt +3 -0
- {setlr-0.2.18 → setlr-1.0.0}/setup.py +5 -1
- {setlr-0.2.18 → setlr-1.0.0}/LICENSE +0 -0
- {setlr-0.2.18 → setlr-1.0.0}/README.md +0 -0
- {setlr-0.2.18 → setlr-1.0.0}/setlr/iterparse_filter.py +0 -0
- {setlr-0.2.18 → setlr-1.0.0}/setlr/sqlite-store.py +0 -0
- {setlr-0.2.18 → setlr-1.0.0}/setlr/trig_store.py +0 -0
- {setlr-0.2.18 → setlr-1.0.0}/setlr.egg-info/dependency_links.txt +0 -0
- {setlr-0.2.18 → setlr-1.0.0}/setlr.egg-info/pbr.json +0 -0
- {setlr-0.2.18 → setlr-1.0.0}/setlr.egg-info/top_level.txt +0 -0
- {setlr-0.2.18 → setlr-1.0.0}/setup.cfg +0 -0
|
@@ -1,17 +1,34 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: setlr
|
|
3
|
-
Version: 0.2.18
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: setlr is a tool for Semantic Extraction, Transformation, and Loading.
|
|
5
5
|
Home-page: http://packages.python.org/setlr
|
|
6
6
|
Author: Jamie McCusker
|
|
7
7
|
Author-email: mccusj@cs.rpi.edu
|
|
8
8
|
License: Apache License 2.0
|
|
9
9
|
Keywords: rdf semantic etl
|
|
10
|
-
Platform: UNKNOWN
|
|
11
10
|
Classifier: Development Status :: 5 - Production/Stable
|
|
12
11
|
Classifier: Topic :: Utilities
|
|
13
12
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
13
|
License-File: LICENSE
|
|
14
|
+
Requires-Dist: future
|
|
15
|
+
Requires-Dist: pip>=9.0.0
|
|
16
|
+
Requires-Dist: cython
|
|
17
|
+
Requires-Dist: numpy
|
|
18
|
+
Requires-Dist: rdflib>=6.0.0
|
|
19
|
+
Requires-Dist: pandas>=0.23.0
|
|
20
|
+
Requires-Dist: requests
|
|
21
|
+
Requires-Dist: toposort
|
|
22
|
+
Requires-Dist: beautifulsoup4
|
|
23
|
+
Requires-Dist: jinja2
|
|
24
|
+
Requires-Dist: lxml
|
|
25
|
+
Requires-Dist: six
|
|
26
|
+
Requires-Dist: xlrd
|
|
27
|
+
Requires-Dist: ijson
|
|
28
|
+
Requires-Dist: click
|
|
29
|
+
Requires-Dist: tqdm
|
|
30
|
+
Requires-Dist: requests-testadapter
|
|
31
|
+
Requires-Dist: python-slugify
|
|
32
|
+
Requires-Dist: pyshacl[js]
|
|
15
33
|
|
|
16
34
|
SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.
|
|
17
|
-
|
|
@@ -33,8 +33,11 @@ import gzip
|
|
|
33
33
|
|
|
34
34
|
import logging
|
|
35
35
|
|
|
36
|
+
from tqdm import tqdm
|
|
37
|
+
|
|
36
38
|
import hashlib
|
|
37
39
|
from slugify import slugify
|
|
40
|
+
from pyshacl import validate
|
|
38
41
|
|
|
39
42
|
from .trig_store import TrigStore
|
|
40
43
|
|
|
@@ -52,6 +55,7 @@ sp = Namespace('http://spinrdf.org/sp#')
|
|
|
52
55
|
sd = Namespace('http://www.w3.org/ns/sparql-service-description#')
|
|
53
56
|
dc = Namespace('http://purl.org/dc/terms/')
|
|
54
57
|
void = Namespace('http://rdfs.org/ns/void#')
|
|
58
|
+
shacl = Namespace('http://www.w3.org/ns/shacl#')
|
|
55
59
|
api_vocab = Namespace('http://purl.org/linked-data/api/vocab#')
|
|
56
60
|
|
|
57
61
|
sys.setrecursionlimit(10000)
|
|
@@ -87,7 +91,7 @@ datatypeConverters.update({
|
|
|
87
91
|
XSD.double: float
|
|
88
92
|
})
|
|
89
93
|
|
|
90
|
-
run_samples =
|
|
94
|
+
run_samples = -1
|
|
91
95
|
|
|
92
96
|
_rdf_formats_to_guess = [
|
|
93
97
|
'xml',
|
|
@@ -110,7 +114,7 @@ def read_csv(location, result):
|
|
|
110
114
|
args['header'] = [0]
|
|
111
115
|
with get_content(location, result) as fo:
|
|
112
116
|
df = pandas.read_csv(fo, encoding='utf-8', **args)
|
|
113
|
-
logger.debug("Loaded %s", location)
|
|
117
|
+
#logger.debug("Loaded %s", location)
|
|
114
118
|
return df
|
|
115
119
|
|
|
116
120
|
def read_graph(location, result, g = None):
|
|
@@ -263,7 +267,7 @@ def read_xml(location, result):
|
|
|
263
267
|
for xp in result[setl.xpath]:
|
|
264
268
|
f.iter_end(xp.value)
|
|
265
269
|
with get_content(location, result) as fo:
|
|
266
|
-
for (i, (event, ele)) in enumerate(f.iterparse(fo)):
|
|
270
|
+
for (i, (event, ele)) in enumerate(tqdm(f.iterparse(fo))):
|
|
267
271
|
yield i, ele
|
|
268
272
|
|
|
269
273
|
|
|
@@ -274,7 +278,7 @@ def read_json(location, result):
|
|
|
274
278
|
else:
|
|
275
279
|
selector = ""
|
|
276
280
|
with get_content(location, result) as fo:
|
|
277
|
-
yield from enumerate(ijson.items(fo, selector))
|
|
281
|
+
yield from enumerate(tqdm(ijson.items(fo, selector)))
|
|
278
282
|
|
|
279
283
|
|
|
280
284
|
extractors = {
|
|
@@ -387,7 +391,7 @@ def get_order(setl_graph):
|
|
|
387
391
|
return toposort_flatten(nodes)
|
|
388
392
|
|
|
389
393
|
def extract(e, resources):
|
|
390
|
-
logger.info('
|
|
394
|
+
logger.info('Extract %s',e.identifier)
|
|
391
395
|
used = e.value(prov.used)
|
|
392
396
|
for result in e.subjects(prov.wasGeneratedBy):
|
|
393
397
|
if used is None:
|
|
@@ -395,7 +399,7 @@ def extract(e, resources):
|
|
|
395
399
|
for t in result[RDF.type]:
|
|
396
400
|
# Do we know how to generate this?
|
|
397
401
|
if t.identifier in extractors:
|
|
398
|
-
logger.info("
|
|
402
|
+
logger.info("Using %s", used.identifier)
|
|
399
403
|
resources[result.identifier] = extractors[t.identifier](used.identifier, result)
|
|
400
404
|
return resources[result.identifier]
|
|
401
405
|
|
|
@@ -416,9 +420,11 @@ def clone(value):
|
|
|
416
420
|
|
|
417
421
|
functions = {}
|
|
418
422
|
def get_function(expr, local_keys):
|
|
419
|
-
|
|
423
|
+
used_local_keys = [k for k in local_keys if k in expr]
|
|
424
|
+
key = tuple([expr]+sorted(used_local_keys))
|
|
420
425
|
if key not in functions:
|
|
421
|
-
script = '''lambda %s: %s'''% (', '.join(sorted(
|
|
426
|
+
script = '''lambda %s,**kwargs: %s'''% (', '.join(sorted(used_local_keys)), expr)
|
|
427
|
+
#print(script)
|
|
422
428
|
fn = eval(script)
|
|
423
429
|
fn.__name__ = expr.encode("ascii", "ignore").decode('utf8')
|
|
424
430
|
functions[key] = fn
|
|
@@ -431,6 +437,23 @@ def get_template(templ):
|
|
|
431
437
|
templates[templ] = t
|
|
432
438
|
return templates[templ]
|
|
433
439
|
|
|
440
|
+
def flatten_lists(o):
|
|
441
|
+
if isinstance(o, list):
|
|
442
|
+
result = []
|
|
443
|
+
for x in o:
|
|
444
|
+
flattened = flatten_lists(x)
|
|
445
|
+
if isinstance(flattened, list):
|
|
446
|
+
result.extend(flattened)
|
|
447
|
+
else:
|
|
448
|
+
result.append(flattened)
|
|
449
|
+
return result
|
|
450
|
+
elif isinstance(o, dict):
|
|
451
|
+
for key in o.keys():
|
|
452
|
+
o[key] = flatten_lists(o[key])
|
|
453
|
+
return o
|
|
454
|
+
else:
|
|
455
|
+
return o
|
|
456
|
+
|
|
434
457
|
def process_row(row, template, rowname, table, resources, transform, variables):
|
|
435
458
|
result = []
|
|
436
459
|
e = {'row':row,
|
|
@@ -534,7 +557,9 @@ def process_row(row, template, rowname, table, resources, transform, variables):
|
|
|
534
557
|
fn = get_function(expression, list(env.keys()))
|
|
535
558
|
v = fn(**env)
|
|
536
559
|
if v is not None:
|
|
537
|
-
if len(variable_list) == 1
|
|
560
|
+
if len(variable_list) == 1 and not (
|
|
561
|
+
isinstance(v, collections.Iterable)
|
|
562
|
+
and not isinstance(v, str)):
|
|
538
563
|
v = [v]
|
|
539
564
|
new_env = dict(env)
|
|
540
565
|
for i, variable in enumerate(variable_list):
|
|
@@ -581,10 +606,11 @@ def process_row(row, template, rowname, table, resources, transform, variables):
|
|
|
581
606
|
parent[key] = this
|
|
582
607
|
else:
|
|
583
608
|
parent.append(this)
|
|
584
|
-
|
|
609
|
+
|
|
610
|
+
return flatten_lists(result)
|
|
585
611
|
|
|
586
612
|
def json_transform(transform, resources):
|
|
587
|
-
logger.info("
|
|
613
|
+
logger.info("Transform %s", transform.identifier)
|
|
588
614
|
tables = [u for u in transform[prov.used]]
|
|
589
615
|
variables = {}
|
|
590
616
|
for usage in transform[prov.qualifiedUsage]:
|
|
@@ -597,6 +623,20 @@ def json_transform(transform, resources):
|
|
|
597
623
|
generated = list(transform.subjects(prov.wasGeneratedBy))[0]
|
|
598
624
|
logger.info("Generating %s", generated.identifier)
|
|
599
625
|
|
|
626
|
+
connected_downstream_graph = '''
|
|
627
|
+
construct {
|
|
628
|
+
?target ?p ?o
|
|
629
|
+
} where {
|
|
630
|
+
?source (<>|!<>)* ?target.
|
|
631
|
+
?target ?p ?o.
|
|
632
|
+
}
|
|
633
|
+
'''
|
|
634
|
+
shape_graph = Graph()
|
|
635
|
+
for shape in transform.objects(dc.conformsTo):
|
|
636
|
+
if shape[RDF.type:shacl.NodeShape] or shape[RDF.type:shacl.PropertyShape]:
|
|
637
|
+
logger.info("Validating against SHACL shape %s", shape.identifier)
|
|
638
|
+
shape_graph += transform.graph.query(connected_downstream_graph,
|
|
639
|
+
initBindings={"source":shape.identifier})
|
|
600
640
|
if generated.identifier in resources:
|
|
601
641
|
result = resources[generated.identifier]
|
|
602
642
|
else:
|
|
@@ -630,12 +670,12 @@ def json_transform(transform, resources):
|
|
|
630
670
|
if isinstance(table, pandas.DataFrame):
|
|
631
671
|
#if run_samples:
|
|
632
672
|
# table = table.head()
|
|
633
|
-
it = table.iterrows()
|
|
634
|
-
logger.info("Transforming %s rows.", len(table.index))
|
|
673
|
+
it = tqdm(table.iterrows(), total=table.shape[0])
|
|
674
|
+
#logger.info("Transforming %s rows.", len(table.index))
|
|
635
675
|
else:
|
|
636
|
-
logger.info("
|
|
676
|
+
logger.info("Transform %s", t.identifier)
|
|
637
677
|
for rowname, row in it:
|
|
638
|
-
if run_samples and rowname >=
|
|
678
|
+
if run_samples > 0 and rowname >= run_samples:
|
|
639
679
|
break
|
|
640
680
|
try:
|
|
641
681
|
root = None
|
|
@@ -646,16 +686,28 @@ def json_transform(transform, resources):
|
|
|
646
686
|
}
|
|
647
687
|
if context is not None:
|
|
648
688
|
root['@context'] = context
|
|
689
|
+
|
|
690
|
+
#logger.debug(json.dumps(root, indent=4))
|
|
649
691
|
#before = len(result)
|
|
650
692
|
#graph = ConjunctiveGraph(identifier=generated.identifier)
|
|
651
693
|
#graph.parse(data=json.dumps(root),format="json-ld")
|
|
652
694
|
data = json.dumps(root)
|
|
653
695
|
#del root
|
|
696
|
+
|
|
697
|
+
if len(shape_graph) > 0:
|
|
698
|
+
d = ConjunctiveGraph()
|
|
699
|
+
d.parse(data=data,format='json-ld')
|
|
700
|
+
conforms, report, message = validate(d,
|
|
701
|
+
shacl_graph=shape_graph,
|
|
702
|
+
advanced=True,
|
|
703
|
+
debug=False)
|
|
704
|
+
if not conforms:
|
|
705
|
+
print(message)
|
|
654
706
|
result.parse(data=data, format="json-ld")
|
|
655
707
|
#del data
|
|
656
708
|
#after = len(result)
|
|
657
|
-
logger.debug("Row "+str(rowname))#+" added "+str(after-before)+" triples.")
|
|
658
|
-
sys.stdout.flush()
|
|
709
|
+
#logger.debug("Row "+str(rowname))#+" added "+str(after-before)+" triples.")
|
|
710
|
+
#sys.stdout.flush()
|
|
659
711
|
except Exception as e:
|
|
660
712
|
trace = sys.exc_info()[2]
|
|
661
713
|
if data is not None:
|
|
@@ -729,7 +781,7 @@ def _load_open(generated):
|
|
|
729
781
|
return fh
|
|
730
782
|
|
|
731
783
|
def load(load_resource, resources):
|
|
732
|
-
logger.info('
|
|
784
|
+
logger.info('Load %s',load_resource.identifier)
|
|
733
785
|
file_graph = Dataset(default_union=True)
|
|
734
786
|
to_disk = False
|
|
735
787
|
for used in load_resource[prov.used]:
|
|
@@ -762,7 +814,7 @@ def load(load_resource, resources):
|
|
|
762
814
|
#print fmt
|
|
763
815
|
with _load_open(generated) as o:
|
|
764
816
|
file_graph.serialize(o, format=fmt)
|
|
765
|
-
|
|
817
|
+
|
|
766
818
|
elif generated[RDF.type:sd.Service]:
|
|
767
819
|
from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore
|
|
768
820
|
endpoint = generated.value(sd.endpoint, default=generated).identifier
|
|
@@ -798,10 +850,16 @@ def _setl(setl_graph):
|
|
|
798
850
|
return resources
|
|
799
851
|
logger = None
|
|
800
852
|
|
|
801
|
-
|
|
802
|
-
|
|
853
|
+
import click
|
|
854
|
+
@click.command()
|
|
855
|
+
@click.option('--quiet', '-q', is_flag=True, default=False, help="Minimize logging.")
|
|
856
|
+
@click.option('-n', default=-1, help="Only process the first N rows.", type=int)
|
|
857
|
+
#@click.option('--rdf-validation', default=None, help="Save the RDF validation report to this file.")
|
|
858
|
+
#@click.option('--text-validation', default=None, help="Save the text validation report to this file.")
|
|
859
|
+
@click.argument('script', type=click.Path(exists=True))
|
|
860
|
+
def main(script, rdf_validation=None, text_validation=None, quiet=False, n=-1):
|
|
803
861
|
logging_level = logging.DEBUG
|
|
804
|
-
if
|
|
862
|
+
if quiet:
|
|
805
863
|
logging_level = logging.WARNING
|
|
806
864
|
logging.basicConfig(level=logging_level)
|
|
807
865
|
|
|
@@ -809,18 +867,9 @@ def main():
|
|
|
809
867
|
logger = logging.getLogger(__name__)
|
|
810
868
|
|
|
811
869
|
global run_samples
|
|
812
|
-
|
|
813
|
-
if 'sample' in args:
|
|
814
|
-
run_samples = True
|
|
815
|
-
logger.warning("Only processing a few sample rows.")
|
|
870
|
+
run_samples = n
|
|
816
871
|
setl_graph = ConjunctiveGraph()
|
|
817
|
-
content = open(
|
|
872
|
+
content = open(script).read()
|
|
818
873
|
setl_graph.parse(data=content, format="turtle")
|
|
819
874
|
|
|
820
875
|
graphs = _setl(setl_graph)
|
|
821
|
-
# print "Finished processing"
|
|
822
|
-
# return graphs
|
|
823
|
-
|
|
824
|
-
if __name__ == '__main__':
|
|
825
|
-
result = main()
|
|
826
|
-
logger.info("Exiting")
|
|
@@ -1,17 +1,34 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: setlr
|
|
3
|
-
Version: 0.2.18
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: setlr is a tool for Semantic Extraction, Transformation, and Loading.
|
|
5
5
|
Home-page: http://packages.python.org/setlr
|
|
6
6
|
Author: Jamie McCusker
|
|
7
7
|
Author-email: mccusj@cs.rpi.edu
|
|
8
8
|
License: Apache License 2.0
|
|
9
9
|
Keywords: rdf semantic etl
|
|
10
|
-
Platform: UNKNOWN
|
|
11
10
|
Classifier: Development Status :: 5 - Production/Stable
|
|
12
11
|
Classifier: Topic :: Utilities
|
|
13
12
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
13
|
License-File: LICENSE
|
|
14
|
+
Requires-Dist: future
|
|
15
|
+
Requires-Dist: pip>=9.0.0
|
|
16
|
+
Requires-Dist: cython
|
|
17
|
+
Requires-Dist: numpy
|
|
18
|
+
Requires-Dist: rdflib>=6.0.0
|
|
19
|
+
Requires-Dist: pandas>=0.23.0
|
|
20
|
+
Requires-Dist: requests
|
|
21
|
+
Requires-Dist: toposort
|
|
22
|
+
Requires-Dist: beautifulsoup4
|
|
23
|
+
Requires-Dist: jinja2
|
|
24
|
+
Requires-Dist: lxml
|
|
25
|
+
Requires-Dist: six
|
|
26
|
+
Requires-Dist: xlrd
|
|
27
|
+
Requires-Dist: ijson
|
|
28
|
+
Requires-Dist: click
|
|
29
|
+
Requires-Dist: tqdm
|
|
30
|
+
Requires-Dist: requests-testadapter
|
|
31
|
+
Requires-Dist: python-slugify
|
|
32
|
+
Requires-Dist: pyshacl[js]
|
|
15
33
|
|
|
16
34
|
SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.
|
|
17
|
-
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from setuptools import setup, find_packages
|
|
3
|
+
from setlr._version import __version__
|
|
3
4
|
|
|
4
5
|
# Utility function to read the README file.
|
|
5
6
|
# Used for the long_description. It's nice, because now 1) we have a top level
|
|
@@ -10,7 +11,7 @@ def read(fname):
|
|
|
10
11
|
|
|
11
12
|
setup(
|
|
12
13
|
name = "setlr",
|
|
13
|
-
version =
|
|
14
|
+
version = __version__,
|
|
14
15
|
author = "Jamie McCusker",
|
|
15
16
|
author_email = "mccusj@cs.rpi.edu",
|
|
16
17
|
description = ("setlr is a tool for Semantic Extraction, Transformation, and Loading."),
|
|
@@ -35,8 +36,11 @@ setup(
|
|
|
35
36
|
'six',
|
|
36
37
|
'xlrd',
|
|
37
38
|
'ijson',
|
|
39
|
+
'click',
|
|
40
|
+
'tqdm',
|
|
38
41
|
'requests-testadapter',
|
|
39
42
|
'python-slugify',
|
|
43
|
+
'pyshacl[js]'
|
|
40
44
|
],
|
|
41
45
|
entry_points = {
|
|
42
46
|
'console_scripts': ['setlr=setlr:main'],
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|