setlr 0.2.14__tar.gz → 0.2.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18)
  1. Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/bin/setlr +12 -0
  2. {setlr-0.2.14 → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages}/setlr/__init__.py +70 -32
  3. Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr/__pycache__/__init__.cpython-36.pyc +0 -0
  4. Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr/__pycache__/iterparse_filter.cpython-36.pyc +0 -0
  5. Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr/__pycache__/sqlite-store.cpython-36.pyc +0 -0
  6. Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr/sqlite-store.py +0 -0
  7. {setlr-0.2.14 → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr-0.2.15-py3.6.egg-info}/PKG-INFO +1 -1
  8. {setlr-0.2.14/setlr.egg-info → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr-0.2.15-py3.6.egg-info}/SOURCES.txt +1 -0
  9. setlr-0.2.14/README.md +0 -15
  10. setlr-0.2.14/setlr.egg-info/PKG-INFO +0 -14
  11. setlr-0.2.14/setup.cfg +0 -18
  12. setlr-0.2.14/setup.py +0 -50
  13. {setlr-0.2.14 → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages}/setlr/iterparse_filter.py +0 -0
  14. {setlr-0.2.14/setlr.egg-info → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr-0.2.15-py3.6.egg-info}/dependency_links.txt +0 -0
  15. {setlr-0.2.14/setlr.egg-info → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr-0.2.15-py3.6.egg-info}/entry_points.txt +0 -0
  16. {setlr-0.2.14/setlr.egg-info → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr-0.2.15-py3.6.egg-info}/pbr.json +0 -0
  17. {setlr-0.2.14/setlr.egg-info → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr-0.2.15-py3.6.egg-info}/requires.txt +0 -0
  18. {setlr-0.2.14/setlr.egg-info → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr-0.2.15-py3.6.egg-info}/top_level.txt +0 -0
@@ -0,0 +1,12 @@
1
+ #!/Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/bin/python
2
+ # EASY-INSTALL-ENTRY-SCRIPT: 'setlr==0.2.15','console_scripts','setlr'
3
+ __requires__ = 'setlr==0.2.15'
4
+ import re
5
+ import sys
6
+ from pkg_resources import load_entry_point
7
+
8
+ if __name__ == '__main__':
9
+ sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
10
+ sys.exit(
11
+ load_entry_point('setlr==0.2.15', 'console_scripts', 'setlr')()
12
+ )
@@ -75,7 +75,7 @@ class LocalFileAdapter(requests.adapters.HTTPAdapter):
75
75
  requests_session = requests.session()
76
76
  requests_session.mount('file://', LocalFileAdapter())
77
77
  requests_session.mount('file:///', LocalFileAdapter())
78
-
78
+
79
79
  datatypeConverters = collections.defaultdict(lambda: str)
80
80
  datatypeConverters.update({
81
81
  XSD.string: str,
@@ -105,10 +105,10 @@ def read_csv(location, result):
105
105
  )
106
106
  if result.value(csvw.header):
107
107
  args['header'] = [0]
108
- df = pandas.read_csv(get_content(location, result),encoding='utf-8', **args)
108
+ df = pandas.read_csv(location,encoding='utf-8', **args)
109
109
  logger.debug("Loaded %s", location)
110
110
  return df
111
-
111
+
112
112
  def read_graph(location, result, g = None):
113
113
  if g is None:
114
114
  g = ConjunctiveGraph()
@@ -133,6 +133,8 @@ def read_graph(location, result, g = None):
133
133
  return g
134
134
 
135
135
  class FileLikeFromIter(object):
136
+ _closed = False
137
+
136
138
  def __init__(self, content_iter):
137
139
  self.iter = content_iter
138
140
  self.data = b''
@@ -140,6 +142,27 @@ class FileLikeFromIter(object):
140
142
  def __iter__(self):
141
143
  return self.iter
142
144
 
145
+ def readable(self):
146
+ return True
147
+
148
+ def writable(self):
149
+ return False
150
+
151
+ def seekable(self):
152
+ return False
153
+
154
def closed(self):
    """Report whether this stream has run out of data.

    Returns:
        True once the stream has been marked closed or the underlying
        chunk iterator is exhausted; False while ``self.data`` holds
        buffered bytes or another chunk could be fetched.

    Note that an empty buffer makes this call pull the next chunk from
    ``self.iter`` into ``self.data``, so it is not a pure query.
    """
    if self._closed:
        return True
    if self.data:
        return False
    try:
        self.data = next(self.iter)
    except StopIteration:
        # Record exhaustion in the _closed flag.  Assigning to
        # ``self.closed`` would replace this method with a bool on the
        # instance, making every later ``closed()`` call raise TypeError
        # and leaving the ``_closed`` check above permanently False.
        self._closed = True
        return True
    return False
165
+
143
166
  # Enter and Exit are needed to allow this to work with with
144
167
  def __enter__(self):
145
168
  return self
@@ -162,7 +185,7 @@ class FileLikeFromIter(object):
162
185
 
163
186
  def _open_local_file(location):
164
187
  if location.startswith("file://"):
165
- if os.name == 'nt': # skip the initial
188
+ if os.name == 'nt': # skip the initial
166
189
  return open(location.replace('file:///','').replace('file://',''),'rb')
167
190
  else:
168
191
  return open(location.replace('file://',''),'rb')
@@ -171,7 +194,7 @@ content_handlers = [
171
194
  _open_local_file,
172
195
  lambda location: FileLikeFromIter(requests.get(location,stream=True).iter_content(1024*1024))
173
196
  ]
174
-
197
+
175
198
  def get_content(location, result):
176
199
  response = None
177
200
  for handler in content_handlers:
@@ -180,7 +203,7 @@ def get_content(location, result):
180
203
  break
181
204
  if result[RDF.type:setl.Tempfile]:
182
205
  result = to_tempfile(response)
183
-
206
+
184
207
  for t in result[RDF.type]:
185
208
  # Do we know how to unpack this?
186
209
  if t.identifier in unpackers:
@@ -208,16 +231,20 @@ unpackers = {
208
231
  setl.GZipFile : lambda f: gzip.GzipFile(fileobj=f,mode='r')
209
232
  }
210
233
 
234
+ packers = {
235
+ # setl.Tempfile : lambda x: x,
236
+ setl.GZipFile : lambda f: gzip.GzipFile(fileobj=f,mode='wb')
237
+ }
238
+
211
239
  def read_excel(location, result):
212
240
  args = dict(
213
- sheetname = result.value(setl.sheetname, default=Literal(0)).value,
241
+ sheet_name = result.value(setl.sheetname, default=Literal(0)).value,
214
242
  header = [int(x) for x in result.value(csvw.headerRow, default=Literal('0')).value.split(',')],
215
243
  skiprows = result.value(csvw.skipRows, default=Literal(0)).value
216
244
  )
217
245
  if result.value(csvw.header):
218
246
  args['header'] = [result.value(csvw.header).value]
219
- with get_content(location, result) as f:
220
- df = pandas.read_excel(f,encoding='utf-8', **args)
247
+ df = pandas.read_excel(location,encoding='utf-8', **args)
221
248
  return df
222
249
 
223
250
  def read_xml(location, result):
@@ -234,7 +261,7 @@ def read_xml(location, result):
234
261
  for (i, (event, ele)) in enumerate(f.iterparse(fo)):
235
262
  yield i, ele
236
263
 
237
-
264
+
238
265
  def read_json(location, result):
239
266
  selector = result.value(api_vocab.selector)
240
267
  if selector is not None:
@@ -243,7 +270,7 @@ def read_json(location, result):
243
270
  selector = ""
244
271
  with get_content(location, result) as fo:
245
272
  yield from enumerate(ijson.items(fo, selector))
246
-
273
+
247
274
 
248
275
  extractors = {
249
276
  setl.XPORT : lambda location, result: pandas.read_sas(get_content(location, result), format='xport'),
@@ -257,14 +284,14 @@ extractors = {
257
284
  URIRef("https://www.iana.org/assignments/media-types/text/plain") : lambda location, result: get_content(location, result)
258
285
  }
259
286
 
260
-
287
+
261
288
  try:
262
289
  from bs4 import BeautifulSoup
263
290
  extractors[setl.HTML] = lambda location, result: BeautifulSoup(get_content(location, result).read(), 'html.parser')
264
291
  except Exception as e:
265
292
  pass
266
-
267
-
293
+
294
+
268
295
  def load_csv(csv_resource):
269
296
  column_descriptions = {}
270
297
  for col in csv_resource[csvw.column]:
@@ -333,7 +360,7 @@ def create_python_function(f, resources):
333
360
  local_vars[name.value] = entity
334
361
  exec(script.value, local_vars, global_vars)
335
362
  resources[f.identifier] = global_vars['result']
336
-
363
+
337
364
  def get_order(setl_graph):
338
365
  nodes = collections.defaultdict(set)
339
366
 
@@ -351,7 +378,7 @@ def get_order(setl_graph):
351
378
  for derivation in task[prov.qualifiedDerivation]:
352
379
  derived = derivation.value(prov.entity)
353
380
  nodes[task.identifier].add(derived.identifier)
354
-
381
+
355
382
  return toposort_flatten(nodes)
356
383
 
357
384
  def extract(e, resources):
@@ -398,7 +425,7 @@ def get_template(templ):
398
425
  t = Template(templ)
399
426
  templates[templ] = t
400
427
  return templates[templ]
401
-
428
+
402
429
  def process_row(row, template, rowname, table, resources, transform, variables):
403
430
  result = []
404
431
  e = {'row':row,
@@ -561,7 +588,7 @@ def json_transform(transform, resources):
561
588
  roleID = role.value(dc.identifier)
562
589
  variables[roleID.value] = resources[used.identifier]
563
590
  #print "Using", used.identifier, "as", roleID.value
564
-
591
+
565
592
  generated = list(transform.subjects(prov.wasGeneratedBy))[0]
566
593
  logger.info("Generating %s", generated.identifier)
567
594
 
@@ -605,6 +632,8 @@ def json_transform(transform, resources):
605
632
  if run_samples and rowname >= 100:
606
633
  break
607
634
  try:
635
+ root = None
636
+ data = None
608
637
  root = {
609
638
  "@id": generated.identifier,
610
639
  "@graph": process_row(row, jslt, rowname, table, resources, transform, variables)
@@ -615,20 +644,22 @@ def json_transform(transform, resources):
615
644
  #graph = ConjunctiveGraph(identifier=generated.identifier)
616
645
  #graph.parse(data=json.dumps(root),format="json-ld")
617
646
  data = json.dumps(root)
618
- del root
647
+ #del root
619
648
  result.parse(data=data, format="json-ld")
620
- del data
649
+ #del data
621
650
  after = len(result)
622
651
  logger.debug("Row "+str(rowname)+" added "+str(after-before)+" triples.")
623
652
  sys.stdout.flush()
624
653
  except Exception as e:
625
654
  trace = sys.exc_info()[2]
655
+ if data is not None:
656
+ logger.error("Error parsing tree: %s", data)
626
657
  if isinstance(table, pandas.DataFrame):
627
658
  logger.error("Error on %s %s", rowname, row)
628
659
  else:
629
660
  logger.error("Error on %s", rowname)
630
661
  raise e
631
-
662
+
632
663
  resources[generated.identifier] = result
633
664
 
634
665
  def transform(transform_resource, resources):
@@ -639,12 +670,12 @@ def transform(transform_resource, resources):
639
670
  transform_graph = ConjunctiveGraph(identifier=result.identifier)
640
671
 
641
672
  used = set(transform_resource[prov.used])
642
-
673
+
643
674
  for csv in [u for u in used if u[RDF.type:csvw.Table]]:
644
675
  csv_graph = Graph(store=transform_graph.store, identifier=csv)
645
676
  csv_graph += graphs[csv.identifier]
646
677
 
647
-
678
+
648
679
  for script in [u for u in used if u[RDF.type:setl.PythonScript]]:
649
680
  logger.info("Script: %s", script.identifier)
650
681
  s = script.value(prov.value).value
@@ -663,22 +694,29 @@ def transform(transform_resource, resources):
663
694
  logger.info("Update: %s", update.identifier)
664
695
  query = update.value(prov.value).value
665
696
  transform_graph.update(query)
666
-
697
+
667
698
  for construct in [u for u in used if u[RDF.type:sp.Construct]]:
668
699
  logger.info("Construct: %s", construct.identifier)
669
700
  query = construct.value(prov.value).value
670
701
  g = transform_graph.query(query)
671
702
  transform_graph += g
672
-
703
+
673
704
  for csv in [u for u in used if u[RDF.type:csvw.Table]]:
674
705
  g = Graph(identifier=csv.identifier,store=transform_graph.store)
675
706
  g.remove((None, None, None))
676
707
  transform_graph.store.remove_graph(csv.identifier)
677
-
708
+
678
709
  for result in transform_graph.subjects(prov.wasGeneratedBy):
679
710
  graphs[result.identifier] = transform_graph
680
711
 
681
-
712
def _load_open(generated):
    """Open the local file named by ``generated`` for binary writing.

    The path is ``generated.identifier`` with every ``file://`` occurrence
    removed.  If ``generated`` carries an RDF type that has an entry in
    ``packers``, the raw handle is passed through that packer and the
    packer's result is returned; otherwise the raw handle is returned.

    Raises:
        Whatever ``open``, the type lookup, or the packer raises.  The raw
        handle is closed first when the failure happens after it was opened.
    """
    filename = generated.identifier.replace("file://", '')
    fh = open(filename, 'wb')
    try:
        # ``packed_type`` rather than ``type`` so the builtin is not shadowed.
        for packed_type, pack in packers.items():
            if generated[RDF.type:packed_type]:
                # NOTE(review): the gzip packer wraps fh via ``fileobj=``;
                # GzipFile.close() does not close the wrapped file object,
                # so closing the returned object may leave fh open -- confirm
                # whether callers need to close the raw handle as well.
                return pack(fh)
    except Exception:
        # Do not leak the handle when type matching or packing fails.
        fh.close()
        raise
    return fh
719
+
682
720
  def load(load_resource, resources):
683
721
  logger.info('Loading %s',load_resource.identifier)
684
722
  file_graph = Dataset(default_union=True)
@@ -711,7 +749,7 @@ def load(load_resource, resources):
711
749
  if fmt in formats:
712
750
  fmt = formats[fmt]
713
751
  #print fmt
714
- with open(generated.identifier.replace("file://",''), 'wb') as o:
752
+ with _load_open(generated) as o:
715
753
  o.write(file_graph.serialize(format=fmt))
716
754
  o.close()
717
755
  elif generated[RDF.type:sd.Service]:
@@ -723,8 +761,8 @@ def load(load_resource, resources):
723
761
  endpoint_graph.commit()
724
762
  #if to_disk:
725
763
  # file_graph.close()
726
-
727
-
764
+
765
+
728
766
  actions = {
729
767
  setl.Extract : extract,
730
768
  setl.Transform : json_transform,
@@ -732,7 +770,7 @@ actions = {
732
770
  setl.PythonScript : create_python_function,
733
771
  setl.IsEmpty : isempty
734
772
  }
735
-
773
+
736
774
  def _setl(setl_graph):
737
775
  global logger
738
776
  if logger is None:
@@ -758,7 +796,7 @@ def main():
758
796
 
759
797
  global logger
760
798
  logger = logging.getLogger(__name__)
761
-
799
+
762
800
  global run_samples
763
801
  setl_file = args[0]
764
802
  if 'sample' in args:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 1.1
2
2
  Name: setlr
3
- Version: 0.2.14
3
+ Version: 0.2.15
4
4
  Summary: setlr is a tool for Semantic Extraction, Transformation, and Loading.
5
5
  Home-page: http://packages.python.org/setlr
6
6
  Author: Jim McCusker
@@ -3,6 +3,7 @@ setup.cfg
3
3
  setup.py
4
4
  setlr/__init__.py
5
5
  setlr/iterparse_filter.py
6
+ setlr/sqlite-store.py
6
7
  setlr.egg-info/PKG-INFO
7
8
  setlr.egg-info/SOURCES.txt
8
9
  setlr.egg-info/dependency_links.txt
setlr-0.2.14/README.md DELETED
@@ -1,15 +0,0 @@
1
- # setlr: The Semantic Extract, Transform and Load-er
2
-
3
- setlr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.
4
-
5
- # Installation
6
-
7
- Simply check out the code, optionally create a python virtual environment, and install it using pip:
8
-
9
- ```bash
10
- pip install setlr
11
- ```
12
-
13
- # Learning how to SETL
14
-
15
- To learn how to use setlr please visit [the tutorial wiki page](https://github.com/tetherless-world/setlr/wiki/SETLr-Basics-Tutorial).
@@ -1,14 +0,0 @@
1
- Metadata-Version: 1.1
2
- Name: setlr
3
- Version: 0.2.14
4
- Summary: setlr is a tool for Semantic Extraction, Transformation, and Loading.
5
- Home-page: http://packages.python.org/setlr
6
- Author: Jim McCusker
7
- Author-email: mccusj@cs.rpi.edu
8
- License: Apache License 2.0
9
- Description: SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.
10
- Keywords: rdf semantic etl
11
- Platform: UNKNOWN
12
- Classifier: Development Status :: 5 - Production/Stable
13
- Classifier: Topic :: Utilities
14
- Classifier: License :: OSI Approved :: Apache Software License
setlr-0.2.14/setup.cfg DELETED
@@ -1,18 +0,0 @@
1
- [bdist_wheel]
2
- universal = 1
3
-
4
- [metadata]
5
- description-file = README.md
6
-
7
- [flake8]
8
- exclude = config-template,iterparse_filter,venv
9
- ignore = E115,E116,E121,E122,E126,E127,E128,E201,E202,E203,E226,E225,E228,E231,E241,E251,E261,E265,E301,E302,E303,E305,E501,W291,W293
10
-
11
- [pycodestyle]
12
- exclude = config-template,iterparse_filter,venv
13
- ignore = E115,E116,E121,E122,E126,E127,E128,E201,E202,E203,E211,E221,E226,E225,E228,E231,E241,E251,E261,E265,E266,E301,E302,E303,E305,E501,W291,W293
14
-
15
- [egg_info]
16
- tag_build =
17
- tag_date = 0
18
-
setlr-0.2.14/setup.py DELETED
@@ -1,50 +0,0 @@
1
- import os
2
- from setuptools import setup, find_packages
3
-
4
- # Utility function to read the README file.
5
- # Used for the long_description. It's nice, because now 1) we have a top level
6
- # README file and 2) it's easier to type in the README file than to put a raw
7
- # string in below ...
8
- def read(fname):
9
- return open(os.path.join(os.path.dirname(__file__), fname)).read()
10
-
11
- setup(
12
- name = "setlr",
13
- version = "0.2.14",
14
- author = "Jim McCusker",
15
- author_email = "mccusj@cs.rpi.edu",
16
- description = ("setlr is a tool for Semantic Extraction, Transformation, and Loading."),
17
- license = "Apache License 2.0",
18
- keywords = "rdf semantic etl",
19
- url = "http://packages.python.org/setlr",
20
- packages=['setlr'],
21
- long_description='''SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.''',
22
- include_package_data = True,
23
- install_requires = [
24
- 'future',
25
- 'pip>=9.0.0',
26
- 'cython',
27
- 'numpy',
28
- 'rdflib',
29
- 'rdflib-jsonld',
30
- 'pandas>=0.23.0',
31
- 'requests',
32
- 'toposort',
33
- 'beautifulsoup4',
34
- 'jinja2',
35
- 'lxml',
36
- 'six',
37
- 'xlrd',
38
- 'ijson',
39
- 'requests-testadapter',
40
- 'python-slugify',
41
- ],
42
- entry_points = {
43
- 'console_scripts': ['setlr=setlr:main'],
44
- },
45
- classifiers=[
46
- "Development Status :: 5 - Production/Stable",
47
- "Topic :: Utilities",
48
- "License :: OSI Approved :: Apache Software License",
49
- ],
50
- )