setlr-0.2.18.tar.gz → setlr-1.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,34 @@
 Metadata-Version: 2.1
 Name: setlr
-Version: 0.2.18
+Version: 1.0.0
 Summary: setlr is a tool for Semantic Extraction, Transformation, and Loading.
 Home-page: http://packages.python.org/setlr
 Author: Jamie McCusker
 Author-email: mccusj@cs.rpi.edu
 License: Apache License 2.0
 Keywords: rdf semantic etl
-Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Topic :: Utilities
 Classifier: License :: OSI Approved :: Apache Software License
 License-File: LICENSE
+Requires-Dist: future
+Requires-Dist: pip>=9.0.0
+Requires-Dist: cython
+Requires-Dist: numpy
+Requires-Dist: rdflib>=6.0.0
+Requires-Dist: pandas>=0.23.0
+Requires-Dist: requests
+Requires-Dist: toposort
+Requires-Dist: beautifulsoup4
+Requires-Dist: jinja2
+Requires-Dist: lxml
+Requires-Dist: six
+Requires-Dist: xlrd
+Requires-Dist: ijson
+Requires-Dist: click
+Requires-Dist: tqdm
+Requires-Dist: requests-testadapter
+Requires-Dist: python-slugify
+Requires-Dist: pyshacl[js]
 
 SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.
-
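This hunk (presumably the sdist's PKG-INFO; the identical hunk further down would be its setlr.egg-info mirror) shows that 1.0.0 now declares every dependency as Requires-Dist metadata, where 0.2.18 declared none. A minimal sketch of reading that metadata back, assuming setlr 1.0.0 is installed in the current environment:

# Assumes setlr 1.0.0 is installed; requires() yields the Requires-Dist
# lines from the hunk above, e.g. "rdflib>=6.0.0".
from importlib.metadata import requires, version

print(version("setlr"))  # 1.0.0
for req in requires("setlr"):
    print(req)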
@@ -33,8 +33,11 @@ import gzip
 
 import logging
 
+from tqdm import tqdm
+
 import hashlib
 from slugify import slugify
+from pyshacl import validate
 
 from .trig_store import TrigStore
 
@@ -52,6 +55,7 @@ sp = Namespace('http://spinrdf.org/sp#')
 sd = Namespace('http://www.w3.org/ns/sparql-service-description#')
 dc = Namespace('http://purl.org/dc/terms/')
 void = Namespace('http://rdfs.org/ns/void#')
+shacl = Namespace('http://www.w3.org/ns/shacl#')
 api_vocab = Namespace('http://purl.org/linked-data/api/vocab#')
 
 sys.setrecursionlimit(10000)
@@ -87,7 +91,7 @@ datatypeConverters.update({
     XSD.double: float
 })
 
-run_samples = False
+run_samples = -1
 
 _rdf_formats_to_guess = [
     'xml',
@@ -110,7 +114,7 @@ def read_csv(location, result):
         args['header'] = [0]
     with get_content(location, result) as fo:
         df = pandas.read_csv(fo, encoding='utf-8', **args)
-    logger.debug("Loaded %s", location)
+    #logger.debug("Loaded %s", location)
     return df
 
 def read_graph(location, result, g = None):
@@ -263,7 +267,7 @@ def read_xml(location, result):
     for xp in result[setl.xpath]:
         f.iter_end(xp.value)
     with get_content(location, result) as fo:
-        for (i, (event, ele)) in enumerate(f.iterparse(fo)):
+        for (i, (event, ele)) in enumerate(tqdm(f.iterparse(fo))):
             yield i, ele
 
 
@@ -274,7 +278,7 @@ def read_json(location, result):
     else:
         selector = ""
     with get_content(location, result) as fo:
-        yield from enumerate(ijson.items(fo, selector))
+        yield from enumerate(tqdm(ijson.items(fo, selector)))
 
 
 extractors = {
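Both streaming extractors (XML via iterparse above, JSON via ijson here) now wrap their iterators in tqdm for progress reporting. A standalone sketch of the read_json pattern; example.json is a made-up path:

# "item" selects the elements of a top-level JSON array. tqdm cannot
# know the total for a streaming parser, so it shows a running count
# and rate rather than a percentage bar.
import ijson
from tqdm import tqdm

with open("example.json", "rb") as fo:  # made-up input file
    for i, item in enumerate(tqdm(ijson.items(fo, "item"))):
        print(i, item)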
@@ -387,7 +391,7 @@ def get_order(setl_graph):
     return toposort_flatten(nodes)
 
 def extract(e, resources):
-    logger.info('Extracting %s',e.identifier)
+    logger.info('Extract %s',e.identifier)
     used = e.value(prov.used)
     for result in e.subjects(prov.wasGeneratedBy):
         if used is None:
@@ -395,7 +399,7 @@ def extract(e, resources):
         for t in result[RDF.type]:
             # Do we know how to generate this?
             if t.identifier in extractors:
-                logger.info("Extracted %s", used.identifier)
+                logger.info("Using %s", used.identifier)
                 resources[result.identifier] = extractors[t.identifier](used.identifier, result)
     return resources[result.identifier]
 
 
@@ -416,9 +420,11 @@ def clone(value):
416
420
 
417
421
  functions = {}
418
422
  def get_function(expr, local_keys):
419
- key = tuple([expr]+sorted(local_keys))
423
+ used_local_keys = [k for k in local_keys if k in expr]
424
+ key = tuple([expr]+sorted(used_local_keys))
420
425
  if key not in functions:
421
- script = '''lambda %s: %s'''% (', '.join(sorted(local_keys)), expr)
426
+ script = '''lambda %s,**kwargs: %s'''% (', '.join(sorted(used_local_keys)), expr)
427
+ #print(script)
422
428
  fn = eval(script)
423
429
  fn.__name__ = expr.encode("ascii", "ignore").decode('utf8')
424
430
  functions[key] = fn
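get_function compiles each expression into a Python lambda exactly once. The change narrows the cache key to only the variable names that actually occur (as substrings) in the expression, and the added **kwargs lets the compiled lambda still be called with the full environment via fn(**env) without a TypeError for unused bindings. A standalone sketch of the same memoization:

# Standalone sketch of the memoization above: the cache key and the lambda
# signature use only the names that appear in the expression; **kwargs
# swallows every other binding passed in via fn(**env).
functions = {}

def get_function(expr, local_keys):
    used = [k for k in local_keys if k in expr]  # substring test, as above
    key = tuple([expr] + sorted(used))
    if key not in functions:
        functions[key] = eval("lambda %s, **kwargs: %s"
                              % (", ".join(sorted(used)), expr))
    return functions[key]

fn = get_function("row['name'].upper()", ["row", "resources", "transform"])
print(fn(row={"name": "alice"}, resources=None, transform=None))  # ALICE

One consequence of keying on the expression plus only its used names: call sites whose environments differ in unused variables now share a single compiled function instead of each compiling their own.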
@@ -431,6 +437,23 @@ def get_template(templ):
     templates[templ] = t
     return templates[templ]
 
+def flatten_lists(o):
+    if isinstance(o, list):
+        result = []
+        for x in o:
+            flattened = flatten_lists(x)
+            if isinstance(flattened, list):
+                result.extend(flattened)
+            else:
+                result.append(flattened)
+        return result
+    elif isinstance(o, dict):
+        for key in o.keys():
+            o[key] = flatten_lists(o[key])
+        return o
+    else:
+        return o
+
 def process_row(row, template, rowname, table, resources, transform, variables):
     result = []
     e = {'row':row,
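The new flatten_lists helper, defined at module level in what appears to be setlr/__init__.py, collapses nested lists anywhere in the structure produced by process_row, including inside dict values. A small demonstration, assuming setlr 1.0.0 is installed:

# Nested lists are flattened wherever they appear, including inside dicts;
# non-list values pass through untouched.
from setlr import flatten_lists

print(flatten_lists([1, [2, [3, 4]], {"a": [[5], 6]}]))
# -> [1, 2, 3, 4, {'a': [5, 6]}]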
@@ -534,7 +557,9 @@ def process_row(row, template, rowname, table, resources, transform, variables):
         fn = get_function(expression, list(env.keys()))
         v = fn(**env)
         if v is not None:
-            if len(variable_list) == 1:
+            if len(variable_list) == 1 and not (
+                    isinstance(v, collections.Iterable)
+                    and not isinstance(v, str)):
                 v = [v]
             new_env = dict(env)
             for i, variable in enumerate(variable_list):
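The widened guard stops a single bound variable from swallowing an iterable result: scalars (and strings) are still wrapped in a one-element list, while other iterables are bound element by element. Note that collections.Iterable, as used here, was removed in Python 3.10; the modern spelling is collections.abc.Iterable. The check in isolation (as_bindings is a made-up name):

# Mirrors the guard above, with the Python 3.10+ import path.
from collections.abc import Iterable

def as_bindings(v):
    if not (isinstance(v, Iterable) and not isinstance(v, str)):
        return [v]  # scalars and strings become one-element bindings
    return v        # other iterables are bound element by element

print(as_bindings(7))       # [7]
print(as_bindings("x"))     # ['x']
print(as_bindings([1, 2]))  # [1, 2]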
@@ -581,10 +606,11 @@ def process_row(row, template, rowname, table, resources, transform, variables):
             parent[key] = this
         else:
             parent.append(this)
-    return result
+
+    return flatten_lists(result)
 
 def json_transform(transform, resources):
-    logger.info("Transforming %s", transform.identifier)
+    logger.info("Transform %s", transform.identifier)
     tables = [u for u in transform[prov.used]]
     variables = {}
     for usage in transform[prov.qualifiedUsage]:
@@ -597,6 +623,20 @@ def json_transform(transform, resources):
     generated = list(transform.subjects(prov.wasGeneratedBy))[0]
     logger.info("Generating %s", generated.identifier)
 
+    connected_downstream_graph = '''
+    construct {
+        ?target ?p ?o
+    } where {
+        ?source (<>|!<>)* ?target.
+        ?target ?p ?o.
+    }
+    '''
+    shape_graph = Graph()
+    for shape in transform.objects(dc.conformsTo):
+        if shape[RDF.type:shacl.NodeShape] or shape[RDF.type:shacl.PropertyShape]:
+            logger.info("Validating against SHACL shape %s", shape.identifier)
+            shape_graph += transform.graph.query(connected_downstream_graph,
+                                                 initBindings={"source":shape.identifier})
     if generated.identifier in resources:
         result = resources[generated.identifier]
     else:
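The property path (<>|!<>)* reads as "a predicate that either is <> or is not <>, zero or more times", i.e. any predicate, repeated; the CONSTRUCT therefore copies every triple reachable from the bound ?source. That is how each sh:NodeShape or sh:PropertyShape attached via dc:conformsTo is extracted together with its whole subtree. A standalone sketch against rdflib (the data is made up):

# (<>|!<>)* matches any predicate zero or more times, so the query
# returns the full downstream closure of ?source.
from rdflib import Graph, URIRef

g = Graph()
g.parse(data="""
@prefix ex: <http://example.org/> .
ex:Shape ex:property ex:PropShape .
ex:PropShape ex:path ex:name .
""", format="turtle")

closure = g.query("""
construct { ?target ?p ?o }
where {
    ?source (<>|!<>)* ?target .
    ?target ?p ?o .
}""", initBindings={"source": URIRef("http://example.org/Shape")})
print(len(list(closure)))  # 2: both triples are reachable from ex:Shape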
@@ -630,12 +670,12 @@ def json_transform(transform, resources):
         if isinstance(table, pandas.DataFrame):
             #if run_samples:
             #    table = table.head()
-            it = table.iterrows()
-            logger.info("Transforming %s rows.", len(table.index))
+            it = tqdm(table.iterrows(), total=table.shape[0])
+            #logger.info("Transforming %s rows.", len(table.index))
         else:
-            logger.info("Transforming %s", t.identifier)
+            logger.info("Transform %s", t.identifier)
         for rowname, row in it:
-            if run_samples and rowname >= 100:
+            if run_samples > 0 and rowname >= run_samples:
                 break
             try:
                 root = None
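run_samples changes type here: the old boolean, set by a bare "sample" argument and hard-coded to 100 rows, becomes an integer row limit supplied by the new -n option, with -1 meaning no limit. The contract in isolation:

# -1 (the default) disables sampling; any positive n stops after n rows.
run_samples = 100  # as if invoked with `setlr -n 100 ...`

rowname = 0
for rowname in range(10**6):
    if run_samples > 0 and rowname >= run_samples:
        break
print(rowname)  # 100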
@@ -646,16 +686,28 @@ def json_transform(transform, resources):
                 }
                 if context is not None:
                     root['@context'] = context
+
+                #logger.debug(json.dumps(root, indent=4))
                 #before = len(result)
                 #graph = ConjunctiveGraph(identifier=generated.identifier)
                 #graph.parse(data=json.dumps(root),format="json-ld")
                 data = json.dumps(root)
                 #del root
+
+                if len(shape_graph) > 0:
+                    d = ConjunctiveGraph()
+                    d.parse(data=data,format='json-ld')
+                    conforms, report, message = validate(d,
+                                                         shacl_graph=shape_graph,
+                                                         advanced=True,
+                                                         debug=False)
+                    if not conforms:
+                        print(message)
                 result.parse(data=data, format="json-ld")
                 #del data
                 #after = len(result)
-                logger.debug("Row "+str(rowname))#+" added "+str(after-before)+" triples.")
-                sys.stdout.flush()
+                #logger.debug("Row "+str(rowname))#+" added "+str(after-before)+" triples.")
+                #sys.stdout.flush()
             except Exception as e:
                 trace = sys.exc_info()[2]
                 if data is not None:
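When any SHACL shapes were collected, each row's JSON-LD output is parsed into a scratch graph and validated with pyshacl before being merged into the result; a non-conforming row prints the human-readable report but is still loaded. A standalone sketch of the same call (both graphs are made up):

# pyshacl.validate returns (conforms, report_graph, report_text).
from rdflib import Graph
from pyshacl import validate

data = Graph()
data.parse(data="""
@prefix ex: <http://example.org/> .
ex:bob a ex:Person .
""", format="turtle")

shapes = Graph()
shapes.parse(data="""
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix ex: <http://example.org/> .
ex:PersonShape a sh:NodeShape ;
    sh:targetClass ex:Person ;
    sh:property [ sh:path ex:name ; sh:minCount 1 ] .
""", format="turtle")

conforms, report_graph, report_text = validate(
    data, shacl_graph=shapes, advanced=True, debug=False)
print(conforms)  # False: ex:bob has no ex:name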
@@ -729,7 +781,7 @@ def _load_open(generated):
     return fh
 
 def load(load_resource, resources):
-    logger.info('Loading %s',load_resource.identifier)
+    logger.info('Load %s',load_resource.identifier)
     file_graph = Dataset(default_union=True)
     to_disk = False
     for used in load_resource[prov.used]:
@@ -762,7 +814,7 @@ def load(load_resource, resources):
             #print fmt
             with _load_open(generated) as o:
                 file_graph.serialize(o, format=fmt)
-
+
         elif generated[RDF.type:sd.Service]:
             from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore
             endpoint = generated.value(sd.endpoint, default=generated).identifier
@@ -798,10 +850,16 @@ def _setl(setl_graph):
     return resources
 logger = None
 
-def main():
-    args = sys.argv[1:]
+import click
+@click.command()
+@click.option('--quiet', '-q', is_flag=True, default=False, help="Minimize logging.")
+@click.option('-n', default=-1, help="Only process the first N rows.", type=int)
+#@click.option('--rdf-validation', default=None, help="Save the RDF validation report to this file.")
+#@click.option('--text-validation', default=None, help="Save the text validation report to this file.")
+@click.argument('script', type=click.Path(exists=True))
+def main(script, rdf_validation=None, text_validation=None, quiet=False, n=-1):
     logging_level = logging.DEBUG
-    if '-q' in args or '--quiet' in args:
+    if quiet:
         logging_level = logging.WARNING
     logging.basicConfig(level=logging_level)
 
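The hand-rolled sys.argv parsing becomes a click command (which also supplies --help for free); two future validation-report options are stubbed out as comments. A sketch of exercising the new interface without a shell, using click's test runner; example.setl.ttl is a made-up path:

# click.testing drives the command in-process. Because the script argument
# is declared with click.Path(exists=True), a missing file exits with code 2.
from click.testing import CliRunner
from setlr import main

runner = CliRunner()
result = runner.invoke(main, ["-q", "-n", "100", "example.setl.ttl"])
print(result.exit_code)  # 0 on success; 2 here if example.setl.ttl is absent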
@@ -809,18 +867,9 @@ def main():
     logger = logging.getLogger(__name__)
 
     global run_samples
-    setl_file = args[0]
-    if 'sample' in args:
-        run_samples = True
-        logger.warning("Only processing a few sample rows.")
+    run_samples = n
     setl_graph = ConjunctiveGraph()
-    content = open(setl_file).read()
+    content = open(script).read()
     setl_graph.parse(data=content, format="turtle")
 
     graphs = _setl(setl_graph)
-    # print "Finished processing"
-    # return graphs
-
-if __name__ == '__main__':
-    result = main()
-    logger.info("Exiting")
@@ -0,0 +1,4 @@
+__version__='1.0.0'
+
+if __name__ == '__main__':
+    print(__version__)
@@ -1,17 +1,34 @@
 Metadata-Version: 2.1
 Name: setlr
-Version: 0.2.18
+Version: 1.0.0
 Summary: setlr is a tool for Semantic Extraction, Transformation, and Loading.
 Home-page: http://packages.python.org/setlr
 Author: Jamie McCusker
 Author-email: mccusj@cs.rpi.edu
 License: Apache License 2.0
 Keywords: rdf semantic etl
-Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Topic :: Utilities
 Classifier: License :: OSI Approved :: Apache Software License
 License-File: LICENSE
+Requires-Dist: future
+Requires-Dist: pip>=9.0.0
+Requires-Dist: cython
+Requires-Dist: numpy
+Requires-Dist: rdflib>=6.0.0
+Requires-Dist: pandas>=0.23.0
+Requires-Dist: requests
+Requires-Dist: toposort
+Requires-Dist: beautifulsoup4
+Requires-Dist: jinja2
+Requires-Dist: lxml
+Requires-Dist: six
+Requires-Dist: xlrd
+Requires-Dist: ijson
+Requires-Dist: click
+Requires-Dist: tqdm
+Requires-Dist: requests-testadapter
+Requires-Dist: python-slugify
+Requires-Dist: pyshacl[js]
 
 SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.
-
@@ -3,6 +3,7 @@ README.md
 setup.cfg
 setup.py
 setlr/__init__.py
+setlr/_version.py
 setlr/iterparse_filter.py
 setlr/sqlite-store.py
 setlr/trig_store.py
@@ -1,3 +1,2 @@
 [console_scripts]
 setlr = setlr:main
-
@@ -12,5 +12,8 @@ lxml
 six
 xlrd
 ijson
+click
+tqdm
 requests-testadapter
 python-slugify
+pyshacl[js]
@@ -1,5 +1,6 @@
 import os
 from setuptools import setup, find_packages
+from setlr._version import __version__
 
 # Utility function to read the README file.
 # Used for the long_description. It's nice, because now 1) we have a top level
@@ -10,7 +11,7 @@ def read(fname):
 
 setup(
     name = "setlr",
-    version = "0.2.18",
+    version = __version__,
     author = "Jamie McCusker",
     author_email = "mccusj@cs.rpi.edu",
     description = ("setlr is a tool for Semantic Extraction, Transformation, and Loading."),
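setup.py now reads the version from the new setlr/_version.py instead of hard-coding it, so 1.0.0 is declared in exactly one place. A sketch of the runtime side, assuming setlr 1.0.0 is installed:

# The same string now backs both setup(version=...) and the running package.
from setlr._version import __version__
print(__version__)  # 1.0.0

One caveat of this pattern: importing setlr._version executes setlr/__init__.py first, so building from source needs the runtime dependencies importable; a common alternative is to exec or parse _version.py without importing the package.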
@@ -35,8 +36,11 @@ setup(
         'six',
         'xlrd',
         'ijson',
+        'click',
+        'tqdm',
         'requests-testadapter',
         'python-slugify',
+        'pyshacl[js]'
     ],
     entry_points = {
         'console_scripts': ['setlr=setlr:main'],