setlr-0.2.18.tar.gz → setlr-1.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,34 @@
 Metadata-Version: 2.1
 Name: setlr
-Version: 0.2.18
+Version: 1.0.0
 Summary: setlr is a tool for Semantic Extraction, Transformation, and Loading.
 Home-page: http://packages.python.org/setlr
 Author: Jamie McCusker
 Author-email: mccusj@cs.rpi.edu
 License: Apache License 2.0
 Keywords: rdf semantic etl
-Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Topic :: Utilities
 Classifier: License :: OSI Approved :: Apache Software License
 License-File: LICENSE
+Requires-Dist: future
+Requires-Dist: pip>=9.0.0
+Requires-Dist: cython
+Requires-Dist: numpy
+Requires-Dist: rdflib>=6.0.0
+Requires-Dist: pandas>=0.23.0
+Requires-Dist: requests
+Requires-Dist: toposort
+Requires-Dist: beautifulsoup4
+Requires-Dist: jinja2
+Requires-Dist: lxml
+Requires-Dist: six
+Requires-Dist: xlrd
+Requires-Dist: ijson
+Requires-Dist: click
+Requires-Dist: tqdm
+Requires-Dist: requests-testadapter
+Requires-Dist: python-slugify
+Requires-Dist: pyshacl[js]
 
 SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.
-
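This hunk (presumably the sdist's PKG-INFO; the identical hunk further down would be its setlr.egg-info mirror) shows that 1.0.0 now declares every dependency as Requires-Dist metadata, where 0.2.18 declared none. A minimal sketch of reading that metadata back, assuming setlr 1.0.0 is installed in the current environment:

# Assumes setlr 1.0.0 is installed; requires() yields the Requires-Dist
# lines from the hunk above, e.g. "rdflib>=6.0.0".
from importlib.metadata import requires, version

print(version("setlr"))  # 1.0.0
for req in requires("setlr"):
    print(req)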
@@ -33,8 +33,11 @@ import gzip
 
 import logging
 
+from tqdm import tqdm
+
 import hashlib
 from slugify import slugify
+from pyshacl import validate
 
 from .trig_store import TrigStore
 
@@ -52,6 +55,7 @@ sp = Namespace('http://spinrdf.org/sp#')
 sd = Namespace('http://www.w3.org/ns/sparql-service-description#')
 dc = Namespace('http://purl.org/dc/terms/')
 void = Namespace('http://rdfs.org/ns/void#')
+shacl = Namespace('http://www.w3.org/ns/shacl#')
 api_vocab = Namespace('http://purl.org/linked-data/api/vocab#')
 
 sys.setrecursionlimit(10000)
@@ -87,7 +91,7 @@ datatypeConverters.update({
     XSD.double: float
 })
 
-run_samples = False
+run_samples = -1
 
 _rdf_formats_to_guess = [
     'xml',
@@ -110,7 +114,7 @@ def read_csv(location, result):
         args['header'] = [0]
     with get_content(location, result) as fo:
         df = pandas.read_csv(fo, encoding='utf-8', **args)
-    logger.debug("Loaded %s", location)
+    #logger.debug("Loaded %s", location)
     return df
 
 def read_graph(location, result, g = None):
@@ -263,7 +267,7 @@ def read_xml(location, result):
     for xp in result[setl.xpath]:
         f.iter_end(xp.value)
     with get_content(location, result) as fo:
-        for (i, (event, ele)) in enumerate(f.iterparse(fo)):
+        for (i, (event, ele)) in enumerate(tqdm(f.iterparse(fo))):
             yield i, ele
 
 
@@ -274,7 +278,7 @@ def read_json(location, result):
     else:
         selector = ""
     with get_content(location, result) as fo:
-        yield from enumerate(ijson.items(fo, selector))
+        yield from enumerate(tqdm(ijson.items(fo, selector)))
 
 
 extractors = {
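Both streaming extractors (XML via iterparse above, JSON via ijson here) now wrap their iterators in tqdm for progress reporting. A standalone sketch of the read_json pattern; example.json is a made-up path:

# "item" selects the elements of a top-level JSON array. tqdm cannot
# know the total for a streaming parser, so it shows a running count
# and rate rather than a percentage bar.
import ijson
from tqdm import tqdm

with open("example.json", "rb") as fo:  # made-up input file
    for i, item in enumerate(tqdm(ijson.items(fo, "item"))):
        print(i, item)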
@@ -387,7 +391,7 @@ def get_order(setl_graph):
     return toposort_flatten(nodes)
 
 def extract(e, resources):
-    logger.info('Extracting %s',e.identifier)
+    logger.info('Extract %s',e.identifier)
     used = e.value(prov.used)
     for result in e.subjects(prov.wasGeneratedBy):
         if used is None:
@@ -395,7 +399,7 @@ def extract(e, resources):
         for t in result[RDF.type]:
             # Do we know how to generate this?
             if t.identifier in extractors:
-                logger.info("Extracted %s", used.identifier)
+                logger.info("Using %s", used.identifier)
                 resources[result.identifier] = extractors[t.identifier](used.identifier, result)
     return resources[result.identifier]
 
 
@@ -416,9 +420,11 @@ def clone(value):
416
420
 
417
421
  functions = {}
418
422
  def get_function(expr, local_keys):
419
- key = tuple([expr]+sorted(local_keys))
423
+ used_local_keys = [k for k in local_keys if k in expr]
424
+ key = tuple([expr]+sorted(used_local_keys))
420
425
  if key not in functions:
421
- script = '''lambda %s: %s'''% (', '.join(sorted(local_keys)), expr)
426
+ script = '''lambda %s,**kwargs: %s'''% (', '.join(sorted(used_local_keys)), expr)
427
+ #print(script)
422
428
  fn = eval(script)
423
429
  fn.__name__ = expr.encode("ascii", "ignore").decode('utf8')
424
430
  functions[key] = fn
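get_function compiles each expression into a Python lambda exactly once. The change narrows the cache key to only the variable names that actually occur (as substrings) in the expression, and the added **kwargs lets the compiled lambda still be called with the full environment via fn(**env) without a TypeError for unused bindings. A standalone sketch of the same memoization:

# Standalone sketch of the memoization above: the cache key and the lambda
# signature use only the names that appear in the expression; **kwargs
# swallows every other binding passed in via fn(**env).
functions = {}

def get_function(expr, local_keys):
    used = [k for k in local_keys if k in expr]  # substring test, as above
    key = tuple([expr] + sorted(used))
    if key not in functions:
        functions[key] = eval("lambda %s, **kwargs: %s"
                              % (", ".join(sorted(used)), expr))
    return functions[key]

fn = get_function("row['name'].upper()", ["row", "resources", "transform"])
print(fn(row={"name": "alice"}, resources=None, transform=None))  # ALICE

One consequence of keying on the expression plus only its used names: call sites whose environments differ in unused variables now share a single compiled function instead of each compiling their own.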
@@ -431,6 +437,23 @@ def get_template(templ):
     templates[templ] = t
     return templates[templ]
 
+def flatten_lists(o):
+    if isinstance(o, list):
+        result = []
+        for x in o:
+            flattened = flatten_lists(x)
+            if isinstance(flattened, list):
+                result.extend(flattened)
+            else:
+                result.append(flattened)
+        return result
+    elif isinstance(o, dict):
+        for key in o.keys():
+            o[key] = flatten_lists(o[key])
+        return o
+    else:
+        return o
+
 def process_row(row, template, rowname, table, resources, transform, variables):
     result = []
     e = {'row':row,
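The new flatten_lists helper, defined at module level in what appears to be setlr/__init__.py, collapses nested lists anywhere in the structure produced by process_row, including inside dict values. A small demonstration, assuming setlr 1.0.0 is installed:

# Nested lists are flattened wherever they appear, including inside dicts;
# non-list values pass through untouched.
from setlr import flatten_lists

print(flatten_lists([1, [2, [3, 4]], {"a": [[5], 6]}]))
# -> [1, 2, 3, 4, {'a': [5, 6]}]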
@@ -534,7 +557,9 @@ def process_row(row, template, rowname, table, resources, transform, variables):
         fn = get_function(expression, list(env.keys()))
         v = fn(**env)
         if v is not None:
-            if len(variable_list) == 1:
+            if len(variable_list) == 1 and not (
+                    isinstance(v, collections.Iterable)
+                    and not isinstance(v, str)):
                 v = [v]
             new_env = dict(env)
             for i, variable in enumerate(variable_list):
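The widened guard stops a single bound variable from swallowing an iterable result: scalars (and strings) are still wrapped in a one-element list, while other iterables are bound element by element. Note that collections.Iterable, as used here, was removed in Python 3.10; the modern spelling is collections.abc.Iterable. The check in isolation (as_bindings is a made-up name):

# Mirrors the guard above, with the Python 3.10+ import path.
from collections.abc import Iterable

def as_bindings(v):
    if not (isinstance(v, Iterable) and not isinstance(v, str)):
        return [v]  # scalars and strings become one-element bindings
    return v        # other iterables are bound element by element

print(as_bindings(7))       # [7]
print(as_bindings("x"))     # ['x']
print(as_bindings([1, 2]))  # [1, 2]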
@@ -581,10 +606,11 @@ def process_row(row, template, rowname, table, resources, transform, variables):
             parent[key] = this
         else:
             parent.append(this)
-    return result
+
+    return flatten_lists(result)
 
 def json_transform(transform, resources):
-    logger.info("Transforming %s", transform.identifier)
+    logger.info("Transform %s", transform.identifier)
     tables = [u for u in transform[prov.used]]
     variables = {}
     for usage in transform[prov.qualifiedUsage]:
@@ -597,6 +623,20 @@ def json_transform(transform, resources):
     generated = list(transform.subjects(prov.wasGeneratedBy))[0]
     logger.info("Generating %s", generated.identifier)
 
+    connected_downstream_graph = '''
+    construct {
+        ?target ?p ?o
+    } where {
+        ?source (<>|!<>)* ?target.
+        ?target ?p ?o.
+    }
+    '''
+    shape_graph = Graph()
+    for shape in transform.objects(dc.conformsTo):
+        if shape[RDF.type:shacl.NodeShape] or shape[RDF.type:shacl.PropertyShape]:
+            logger.info("Validating against SHACL shape %s", shape.identifier)
+            shape_graph += transform.graph.query(connected_downstream_graph,
+                                                 initBindings={"source":shape.identifier})
     if generated.identifier in resources:
         result = resources[generated.identifier]
     else:
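The property path (<>|!<>)* reads as "a predicate that either is <> or is not <>, zero or more times", i.e. any predicate, repeated; the CONSTRUCT therefore copies every triple reachable from the bound ?source. That is how each sh:NodeShape or sh:PropertyShape attached via dc:conformsTo is extracted together with its whole subtree. A standalone sketch against rdflib (the data is made up):

# (<>|!<>)* matches any predicate zero or more times, so the query
# returns the full downstream closure of ?source.
from rdflib import Graph, URIRef

g = Graph()
g.parse(data="""
@prefix ex: <http://example.org/> .
ex:Shape ex:property ex:PropShape .
ex:PropShape ex:path ex:name .
""", format="turtle")

closure = g.query("""
construct { ?target ?p ?o }
where {
    ?source (<>|!<>)* ?target .
    ?target ?p ?o .
}""", initBindings={"source": URIRef("http://example.org/Shape")})
print(len(list(closure)))  # 2: both triples are reachable from ex:Shape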
@@ -630,12 +670,12 @@ def json_transform(transform, resources):
         if isinstance(table, pandas.DataFrame):
             #if run_samples:
             #    table = table.head()
-            it = table.iterrows()
-            logger.info("Transforming %s rows.", len(table.index))
+            it = tqdm(table.iterrows(), total=table.shape[0])
+            #logger.info("Transforming %s rows.", len(table.index))
         else:
-            logger.info("Transforming %s", t.identifier)
+            logger.info("Transform %s", t.identifier)
         for rowname, row in it:
-            if run_samples and rowname >= 100:
+            if run_samples > 0 and rowname >= run_samples:
                 break
             try:
                 root = None
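run_samples changes type here: the old boolean, set by a bare "sample" argument and hard-coded to 100 rows, becomes an integer row limit supplied by the new -n option, with -1 meaning no limit. The contract in isolation:

# -1 (the default) disables sampling; any positive n stops after n rows.
run_samples = 100  # as if invoked with `setlr -n 100 ...`

rowname = 0
for rowname in range(10**6):
    if run_samples > 0 and rowname >= run_samples:
        break
print(rowname)  # 100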
@@ -646,16 +686,28 @@ def json_transform(transform, resources):
                 }
                 if context is not None:
                     root['@context'] = context
+
+                #logger.debug(json.dumps(root, indent=4))
                 #before = len(result)
                 #graph = ConjunctiveGraph(identifier=generated.identifier)
                 #graph.parse(data=json.dumps(root),format="json-ld")
                 data = json.dumps(root)
                 #del root
+
+                if len(shape_graph) > 0:
+                    d = ConjunctiveGraph()
+                    d.parse(data=data,format='json-ld')
+                    conforms, report, message = validate(d,
+                                                         shacl_graph=shape_graph,
+                                                         advanced=True,
+                                                         debug=False)
+                    if not conforms:
+                        print(message)
                 result.parse(data=data, format="json-ld")
                 #del data
                 #after = len(result)
-                logger.debug("Row "+str(rowname))#+" added "+str(after-before)+" triples.")
-                sys.stdout.flush()
+                #logger.debug("Row "+str(rowname))#+" added "+str(after-before)+" triples.")
+                #sys.stdout.flush()
             except Exception as e:
                 trace = sys.exc_info()[2]
                 if data is not None:
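When any SHACL shapes were collected, each row's JSON-LD output is parsed into a scratch graph and validated with pyshacl before being merged into the result; a non-conforming row prints the human-readable report but is still loaded. A standalone sketch of the same call (both graphs are made up):

# pyshacl.validate returns (conforms, report_graph, report_text).
from rdflib import Graph
from pyshacl import validate

data = Graph()
data.parse(data="""
@prefix ex: <http://example.org/> .
ex:bob a ex:Person .
""", format="turtle")

shapes = Graph()
shapes.parse(data="""
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix ex: <http://example.org/> .
ex:PersonShape a sh:NodeShape ;
    sh:targetClass ex:Person ;
    sh:property [ sh:path ex:name ; sh:minCount 1 ] .
""", format="turtle")

conforms, report_graph, report_text = validate(
    data, shacl_graph=shapes, advanced=True, debug=False)
print(conforms)  # False: ex:bob has no ex:name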
@@ -729,7 +781,7 @@ def _load_open(generated):
     return fh
 
 def load(load_resource, resources):
-    logger.info('Loading %s',load_resource.identifier)
+    logger.info('Load %s',load_resource.identifier)
     file_graph = Dataset(default_union=True)
     to_disk = False
     for used in load_resource[prov.used]:
@@ -762,7 +814,7 @@ def load(load_resource, resources):
             #print fmt
             with _load_open(generated) as o:
                 file_graph.serialize(o, format=fmt)
-
+
         elif generated[RDF.type:sd.Service]:
             from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore
             endpoint = generated.value(sd.endpoint, default=generated).identifier
@@ -798,10 +850,16 @@ def _setl(setl_graph):
     return resources
 logger = None
 
-def main():
-    args = sys.argv[1:]
+import click
+@click.command()
+@click.option('--quiet', '-q', is_flag=True, default=False, help="Minimize logging.")
+@click.option('-n', default=-1, help="Only process the first N rows.", type=int)
+#@click.option('--rdf-validation', default=None, help="Save the RDF validation report to this file.")
+#@click.option('--text-validation', default=None, help="Save the text validation report to this file.")
+@click.argument('script', type=click.Path(exists=True))
+def main(script, rdf_validation=None, text_validation=None, quiet=False, n=-1):
     logging_level = logging.DEBUG
-    if '-q' in args or '--quiet' in args:
+    if quiet:
         logging_level = logging.WARNING
     logging.basicConfig(level=logging_level)
 
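The hand-rolled sys.argv parsing becomes a click command (which also supplies --help for free); two future validation-report options are stubbed out as comments. A sketch of exercising the new interface without a shell, using click's test runner; example.setl.ttl is a made-up path:

# click.testing drives the command in-process. Because the script argument
# is declared with click.Path(exists=True), a missing file exits with code 2.
from click.testing import CliRunner
from setlr import main

runner = CliRunner()
result = runner.invoke(main, ["-q", "-n", "100", "example.setl.ttl"])
print(result.exit_code)  # 0 on success; 2 here if example.setl.ttl is absent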
@@ -809,18 +867,9 @@ def main():
     logger = logging.getLogger(__name__)
 
     global run_samples
-    setl_file = args[0]
-    if 'sample' in args:
-        run_samples = True
-        logger.warning("Only processing a few sample rows.")
+    run_samples = n
     setl_graph = ConjunctiveGraph()
-    content = open(setl_file).read()
+    content = open(script).read()
     setl_graph.parse(data=content, format="turtle")
 
     graphs = _setl(setl_graph)
-    # print "Finished processing"
-    # return graphs
-
-if __name__ == '__main__':
-    result = main()
-    logger.info("Exiting")
@@ -0,0 +1,4 @@
+__version__='1.0.0'
+
+if __name__ == '__main__':
+    print(__version__)
@@ -1,17 +1,34 @@
 Metadata-Version: 2.1
 Name: setlr
-Version: 0.2.18
+Version: 1.0.0
 Summary: setlr is a tool for Semantic Extraction, Transformation, and Loading.
 Home-page: http://packages.python.org/setlr
 Author: Jamie McCusker
 Author-email: mccusj@cs.rpi.edu
 License: Apache License 2.0
 Keywords: rdf semantic etl
-Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Topic :: Utilities
 Classifier: License :: OSI Approved :: Apache Software License
 License-File: LICENSE
+Requires-Dist: future
+Requires-Dist: pip>=9.0.0
+Requires-Dist: cython
+Requires-Dist: numpy
+Requires-Dist: rdflib>=6.0.0
+Requires-Dist: pandas>=0.23.0
+Requires-Dist: requests
+Requires-Dist: toposort
+Requires-Dist: beautifulsoup4
+Requires-Dist: jinja2
+Requires-Dist: lxml
+Requires-Dist: six
+Requires-Dist: xlrd
+Requires-Dist: ijson
+Requires-Dist: click
+Requires-Dist: tqdm
+Requires-Dist: requests-testadapter
+Requires-Dist: python-slugify
+Requires-Dist: pyshacl[js]
 
 SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.
-
@@ -3,6 +3,7 @@ README.md
 setup.cfg
 setup.py
 setlr/__init__.py
+setlr/_version.py
 setlr/iterparse_filter.py
 setlr/sqlite-store.py
 setlr/trig_store.py
@@ -1,3 +1,2 @@
 [console_scripts]
 setlr = setlr:main
-
@@ -12,5 +12,8 @@ lxml
 six
 xlrd
 ijson
+click
+tqdm
 requests-testadapter
 python-slugify
+pyshacl[js]
@@ -1,5 +1,6 @@
 import os
 from setuptools import setup, find_packages
+from setlr._version import __version__
 
 # Utility function to read the README file.
 # Used for the long_description. It's nice, because now 1) we have a top level
@@ -10,7 +11,7 @@ def read(fname):
 
 setup(
     name = "setlr",
-    version = "0.2.18",
+    version = __version__,
     author = "Jamie McCusker",
     author_email = "mccusj@cs.rpi.edu",
     description = ("setlr is a tool for Semantic Extraction, Transformation, and Loading."),
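setup.py now reads the version from the new setlr/_version.py instead of hard-coding it, so 1.0.0 is declared in exactly one place. A sketch of the runtime side, assuming setlr 1.0.0 is installed:

# The same string now backs both setup(version=...) and the running package.
from setlr._version import __version__
print(__version__)  # 1.0.0

One caveat of this pattern: importing setlr._version executes setlr/__init__.py first, so building from source needs the runtime dependencies importable; a common alternative is to exec or parse _version.py without importing the package.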
@@ -35,8 +36,11 @@ setup(
         'six',
         'xlrd',
         'ijson',
+        'click',
+        'tqdm',
         'requests-testadapter',
         'python-slugify',
+        'pyshacl[js]'
     ],
     entry_points = {
         'console_scripts': ['setlr=setlr:main'],