setlr 0.2.19__tar.gz → 1.0.0__tar.gz

This diff compares the contents of two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -1,17 +1,34 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: setlr
3
- Version: 0.2.19
3
+ Version: 1.0.0
4
4
  Summary: setlr is a tool for Semantic Extraction, Transformation, and Loading.
5
5
  Home-page: http://packages.python.org/setlr
6
6
  Author: Jamie McCusker
7
7
  Author-email: mccusj@cs.rpi.edu
8
8
  License: Apache License 2.0
9
9
  Keywords: rdf semantic etl
10
- Platform: UNKNOWN
11
10
  Classifier: Development Status :: 5 - Production/Stable
12
11
  Classifier: Topic :: Utilities
13
12
  Classifier: License :: OSI Approved :: Apache Software License
14
13
  License-File: LICENSE
14
+ Requires-Dist: future
15
+ Requires-Dist: pip>=9.0.0
16
+ Requires-Dist: cython
17
+ Requires-Dist: numpy
18
+ Requires-Dist: rdflib>=6.0.0
19
+ Requires-Dist: pandas>=0.23.0
20
+ Requires-Dist: requests
21
+ Requires-Dist: toposort
22
+ Requires-Dist: beautifulsoup4
23
+ Requires-Dist: jinja2
24
+ Requires-Dist: lxml
25
+ Requires-Dist: six
26
+ Requires-Dist: xlrd
27
+ Requires-Dist: ijson
28
+ Requires-Dist: click
29
+ Requires-Dist: tqdm
30
+ Requires-Dist: requests-testadapter
31
+ Requires-Dist: python-slugify
32
+ Requires-Dist: pyshacl[js]
15
33
 
16
34
  SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.
17
-
@@ -33,8 +33,11 @@ import gzip
33
33
 
34
34
  import logging
35
35
 
36
+ from tqdm import tqdm
37
+
36
38
  import hashlib
37
39
  from slugify import slugify
40
+ from pyshacl import validate
38
41
 
39
42
  from .trig_store import TrigStore
40
43
 
@@ -52,6 +55,7 @@ sp = Namespace('http://spinrdf.org/sp#')
52
55
  sd = Namespace('http://www.w3.org/ns/sparql-service-description#')
53
56
  dc = Namespace('http://purl.org/dc/terms/')
54
57
  void = Namespace('http://rdfs.org/ns/void#')
58
+ shacl = Namespace('http://www.w3.org/ns/shacl#')
55
59
  api_vocab = Namespace('http://purl.org/linked-data/api/vocab#')
56
60
 
57
61
  sys.setrecursionlimit(10000)
@@ -87,7 +91,7 @@ datatypeConverters.update({
87
91
  XSD.double: float
88
92
  })
89
93
 
90
- run_samples = False
94
+ run_samples = -1
91
95
 
92
96
  _rdf_formats_to_guess = [
93
97
  'xml',
@@ -110,7 +114,7 @@ def read_csv(location, result):
110
114
  args['header'] = [0]
111
115
  with get_content(location, result) as fo:
112
116
  df = pandas.read_csv(fo, encoding='utf-8', **args)
113
- logger.debug("Loaded %s", location)
117
+ #logger.debug("Loaded %s", location)
114
118
  return df
115
119
 
116
120
  def read_graph(location, result, g = None):
@@ -263,7 +267,7 @@ def read_xml(location, result):
263
267
  for xp in result[setl.xpath]:
264
268
  f.iter_end(xp.value)
265
269
  with get_content(location, result) as fo:
266
- for (i, (event, ele)) in enumerate(f.iterparse(fo)):
270
+ for (i, (event, ele)) in enumerate(tqdm(f.iterparse(fo))):
267
271
  yield i, ele
268
272
 
269
273
 
@@ -274,7 +278,7 @@ def read_json(location, result):
274
278
  else:
275
279
  selector = ""
276
280
  with get_content(location, result) as fo:
277
- yield from enumerate(ijson.items(fo, selector))
281
+ yield from enumerate(tqdm(ijson.items(fo, selector)))
278
282
 
279
283
 
280
284
  extractors = {
@@ -387,7 +391,7 @@ def get_order(setl_graph):
387
391
  return toposort_flatten(nodes)
388
392
 
389
393
  def extract(e, resources):
390
- logger.info('Extracting %s',e.identifier)
394
+ logger.info('Extract %s',e.identifier)
391
395
  used = e.value(prov.used)
392
396
  for result in e.subjects(prov.wasGeneratedBy):
393
397
  if used is None:
@@ -395,7 +399,7 @@ def extract(e, resources):
395
399
  for t in result[RDF.type]:
396
400
  # Do we know how to generate this?
397
401
  if t.identifier in extractors:
398
- logger.info("Extracted %s", used.identifier)
402
+ logger.info("Using %s", used.identifier)
399
403
  resources[result.identifier] = extractors[t.identifier](used.identifier, result)
400
404
  return resources[result.identifier]
401
405
 
@@ -606,7 +610,7 @@ def process_row(row, template, rowname, table, resources, transform, variables):
606
610
  return flatten_lists(result)
607
611
 
608
612
  def json_transform(transform, resources):
609
- logger.info("Transforming %s", transform.identifier)
613
+ logger.info("Transform %s", transform.identifier)
610
614
  tables = [u for u in transform[prov.used]]
611
615
  variables = {}
612
616
  for usage in transform[prov.qualifiedUsage]:
@@ -619,6 +623,20 @@ def json_transform(transform, resources):
619
623
  generated = list(transform.subjects(prov.wasGeneratedBy))[0]
620
624
  logger.info("Generating %s", generated.identifier)
621
625
 
626
+ connected_downstream_graph = '''
627
+ construct {
628
+ ?target ?p ?o
629
+ } where {
630
+ ?source (<>|!<>)* ?target.
631
+ ?target ?p ?o.
632
+ }
633
+ '''
634
+ shape_graph = Graph()
635
+ for shape in transform.objects(dc.conformsTo):
636
+ if shape[RDF.type:shacl.NodeShape] or shape[RDF.type:shacl.PropertyShape]:
637
+ logger.info("Validating against SHACL shape %s", shape.identifier)
638
+ shape_graph += transform.graph.query(connected_downstream_graph,
639
+ initBindings={"source":shape.identifier})
622
640
  if generated.identifier in resources:
623
641
  result = resources[generated.identifier]
624
642
  else:
@@ -652,12 +670,12 @@ def json_transform(transform, resources):
652
670
  if isinstance(table, pandas.DataFrame):
653
671
  #if run_samples:
654
672
  # table = table.head()
655
- it = table.iterrows()
656
- logger.info("Transforming %s rows.", len(table.index))
673
+ it = tqdm(table.iterrows(), total=table.shape[0])
674
+ #logger.info("Transforming %s rows.", len(table.index))
657
675
  else:
658
- logger.info("Transforming %s", t.identifier)
676
+ logger.info("Transform %s", t.identifier)
659
677
  for rowname, row in it:
660
- if run_samples and rowname >= 100:
678
+ if run_samples > 0 and rowname >= run_samples:
661
679
  break
662
680
  try:
663
681
  root = None
@@ -668,17 +686,28 @@ def json_transform(transform, resources):
668
686
  }
669
687
  if context is not None:
670
688
  root['@context'] = context
689
+
671
690
  #logger.debug(json.dumps(root, indent=4))
672
691
  #before = len(result)
673
692
  #graph = ConjunctiveGraph(identifier=generated.identifier)
674
693
  #graph.parse(data=json.dumps(root),format="json-ld")
675
694
  data = json.dumps(root)
676
695
  #del root
696
+
697
+ if len(shape_graph) > 0:
698
+ d = ConjunctiveGraph()
699
+ d.parse(data=data,format='json-ld')
700
+ conforms, report, message = validate(d,
701
+ shacl_graph=shape_graph,
702
+ advanced=True,
703
+ debug=False)
704
+ if not conforms:
705
+ print(message)
677
706
  result.parse(data=data, format="json-ld")
678
707
  #del data
679
708
  #after = len(result)
680
- logger.debug("Row "+str(rowname))#+" added "+str(after-before)+" triples.")
681
- sys.stdout.flush()
709
+ #logger.debug("Row "+str(rowname))#+" added "+str(after-before)+" triples.")
710
+ #sys.stdout.flush()
682
711
  except Exception as e:
683
712
  trace = sys.exc_info()[2]
684
713
  if data is not None:
@@ -752,7 +781,7 @@ def _load_open(generated):
752
781
  return fh
753
782
 
754
783
  def load(load_resource, resources):
755
- logger.info('Loading %s',load_resource.identifier)
784
+ logger.info('Load %s',load_resource.identifier)
756
785
  file_graph = Dataset(default_union=True)
757
786
  to_disk = False
758
787
  for used in load_resource[prov.used]:
@@ -821,10 +850,16 @@ def _setl(setl_graph):
821
850
  return resources
822
851
  logger = None
823
852
 
824
- def main():
825
- args = sys.argv[1:]
853
+ import click
854
+ @click.command()
855
+ @click.option('--quiet', '-q', is_flag=True, default=False, help="Minimize logging.")
856
+ @click.option('-n', default=-1, help="Only process the first N rows.", type=int)
857
+ #@click.option('--rdf-validation', default=None, help="Save the RDF validation report to this file.")
858
+ #@click.option('--text-validation', default=None, help="Save the text validation report to this file.")
859
+ @click.argument('script', type=click.Path(exists=True))
860
+ def main(script, rdf_validation=None, text_validation=None, quiet=False, n=-1):
826
861
  logging_level = logging.DEBUG
827
- if '-q' in args or '--quiet' in args:
862
+ if quiet:
828
863
  logging_level = logging.WARNING
829
864
  logging.basicConfig(level=logging_level)
830
865
 
@@ -832,18 +867,9 @@ def main():
832
867
  logger = logging.getLogger(__name__)
833
868
 
834
869
  global run_samples
835
- setl_file = args[0]
836
- if 'sample' in args:
837
- run_samples = True
838
- logger.warning("Only processing a few sample rows.")
870
+ run_samples = n
839
871
  setl_graph = ConjunctiveGraph()
840
- content = open(setl_file).read()
872
+ content = open(script).read()
841
873
  setl_graph.parse(data=content, format="turtle")
842
874
 
843
875
  graphs = _setl(setl_graph)
844
- # print "Finished processing"
845
- # return graphs
846
-
847
- if __name__ == '__main__':
848
- result = main()
849
- logger.info("Exiting")
@@ -0,0 +1,4 @@
1
+ __version__='1.0.0'
2
+
3
+ if __name__ == '__main__':
4
+ print(__version__)
@@ -1,17 +1,34 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: setlr
3
- Version: 0.2.19
3
+ Version: 1.0.0
4
4
  Summary: setlr is a tool for Semantic Extraction, Transformation, and Loading.
5
5
  Home-page: http://packages.python.org/setlr
6
6
  Author: Jamie McCusker
7
7
  Author-email: mccusj@cs.rpi.edu
8
8
  License: Apache License 2.0
9
9
  Keywords: rdf semantic etl
10
- Platform: UNKNOWN
11
10
  Classifier: Development Status :: 5 - Production/Stable
12
11
  Classifier: Topic :: Utilities
13
12
  Classifier: License :: OSI Approved :: Apache Software License
14
13
  License-File: LICENSE
14
+ Requires-Dist: future
15
+ Requires-Dist: pip>=9.0.0
16
+ Requires-Dist: cython
17
+ Requires-Dist: numpy
18
+ Requires-Dist: rdflib>=6.0.0
19
+ Requires-Dist: pandas>=0.23.0
20
+ Requires-Dist: requests
21
+ Requires-Dist: toposort
22
+ Requires-Dist: beautifulsoup4
23
+ Requires-Dist: jinja2
24
+ Requires-Dist: lxml
25
+ Requires-Dist: six
26
+ Requires-Dist: xlrd
27
+ Requires-Dist: ijson
28
+ Requires-Dist: click
29
+ Requires-Dist: tqdm
30
+ Requires-Dist: requests-testadapter
31
+ Requires-Dist: python-slugify
32
+ Requires-Dist: pyshacl[js]
15
33
 
16
34
  SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.
17
-
@@ -3,6 +3,7 @@ README.md
3
3
  setup.cfg
4
4
  setup.py
5
5
  setlr/__init__.py
6
+ setlr/_version.py
6
7
  setlr/iterparse_filter.py
7
8
  setlr/sqlite-store.py
8
9
  setlr/trig_store.py
@@ -1,3 +1,2 @@
1
1
  [console_scripts]
2
2
  setlr = setlr:main
3
-
@@ -12,5 +12,8 @@ lxml
12
12
  six
13
13
  xlrd
14
14
  ijson
15
+ click
16
+ tqdm
15
17
  requests-testadapter
16
18
  python-slugify
19
+ pyshacl[js]
@@ -1,5 +1,6 @@
1
1
  import os
2
2
  from setuptools import setup, find_packages
3
+ from setlr._version import __version__
3
4
 
4
5
  # Utility function to read the README file.
5
6
  # Used for the long_description. It's nice, because now 1) we have a top level
@@ -10,7 +11,7 @@ def read(fname):
10
11
 
11
12
  setup(
12
13
  name = "setlr",
13
- version = "0.2.19",
14
+ version = __version__,
14
15
  author = "Jamie McCusker",
15
16
  author_email = "mccusj@cs.rpi.edu",
16
17
  description = ("setlr is a tool for Semantic Extraction, Transformation, and Loading."),
@@ -35,8 +36,11 @@ setup(
35
36
  'six',
36
37
  'xlrd',
37
38
  'ijson',
39
+ 'click',
40
+ 'tqdm',
38
41
  'requests-testadapter',
39
42
  'python-slugify',
43
+ 'pyshacl[js]'
40
44
  ],
41
45
  entry_points = {
42
46
  'console_scripts': ['setlr=setlr:main'],
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes