setlr 0.2.13__tar.gz → 0.2.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18)
  1. Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/bin/setlr +12 -0
  2. {setlr-0.2.13 → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages}/setlr/__init__.py +81 -70
  3. Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr/__pycache__/__init__.cpython-36.pyc +0 -0
  4. Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr/__pycache__/iterparse_filter.cpython-36.pyc +0 -0
  5. Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr/__pycache__/sqlite-store.cpython-36.pyc +0 -0
  6. Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr/sqlite-store.py +0 -0
  7. {setlr-0.2.13 → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr-0.2.15-py3.6.egg-info}/PKG-INFO +1 -1
  8. {setlr-0.2.13/setlr.egg-info → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr-0.2.15-py3.6.egg-info}/SOURCES.txt +1 -0
  9. {setlr-0.2.13/setlr.egg-info → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr-0.2.15-py3.6.egg-info}/requires.txt +1 -1
  10. setlr-0.2.13/README.md +0 -15
  11. setlr-0.2.13/setlr.egg-info/PKG-INFO +0 -14
  12. setlr-0.2.13/setup.cfg +0 -10
  13. setlr-0.2.13/setup.py +0 -50
  14. {setlr-0.2.13 → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages}/setlr/iterparse_filter.py +0 -0
  15. {setlr-0.2.13/setlr.egg-info → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr-0.2.15-py3.6.egg-info}/dependency_links.txt +0 -0
  16. {setlr-0.2.13/setlr.egg-info → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr-0.2.15-py3.6.egg-info}/entry_points.txt +0 -0
  17. {setlr-0.2.13/setlr.egg-info → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr-0.2.15-py3.6.egg-info}/pbr.json +0 -0
  18. {setlr-0.2.13/setlr.egg-info → Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/lib/python3.6/site-packages/setlr-0.2.15-py3.6.egg-info}/top_level.txt +0 -0
@@ -0,0 +1,12 @@
1
+ #!/Users/jimmccusker/.pyenv/versions/3.6.9/envs/venv/bin/python
2
+ # EASY-INSTALL-ENTRY-SCRIPT: 'setlr==0.2.15','console_scripts','setlr'
3
+ __requires__ = 'setlr==0.2.15'
4
+ import re
5
+ import sys
6
+ from pkg_resources import load_entry_point
7
+
8
+ if __name__ == '__main__':
9
+ sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
10
+ sys.exit(
11
+ load_entry_point('setlr==0.2.15', 'console_scripts', 'setlr')()
12
+ )
@@ -75,7 +75,7 @@ class LocalFileAdapter(requests.adapters.HTTPAdapter):
75
75
  requests_session = requests.session()
76
76
  requests_session.mount('file://', LocalFileAdapter())
77
77
  requests_session.mount('file:///', LocalFileAdapter())
78
-
78
+
79
79
  datatypeConverters = collections.defaultdict(lambda: str)
80
80
  datatypeConverters.update({
81
81
  XSD.string: str,
@@ -95,55 +95,20 @@ _rdf_formats_to_guess = [
95
95
  'trix'
96
96
  ]
97
97
 
98
- def lru(original_function, maxsize=1000):
99
- mapping = {}
100
-
101
- PREV, NEXT, KEY, VALUE = 0, 1, 2, 3 # link fields
102
- head = [None, None, None, None] # oldest
103
- tail = [head, None, None, None] # newest
104
- head[NEXT] = tail
105
-
106
- def fn(*args, **kw):
107
- key = (args,tuple(kw.items()))
108
- PREV, NEXT = 0, 1
109
- #print "Cache lookup for "+str(key)
110
- link = mapping.get(key, head)
111
- if link is head:
112
- #print "Cache miss for "+str(key)
113
- value = original_function(*args,**kw)
114
- if len(mapping) >= maxsize:
115
- old_prev, old_next, old_key, old_value = head[NEXT]
116
- head[NEXT] = old_next
117
- old_next[PREV] = head
118
- del mapping[old_key]
119
- last = tail[PREV]
120
- link = [last, tail, key, value]
121
- mapping[key] = last[NEXT] = tail[PREV] = link
122
- else:
123
- #print "Cache hit for "+str(key)
124
- link_prev, link_next, key, value = link
125
- link_prev[NEXT] = link_next
126
- link_next[PREV] = link_prev
127
- last = tail[PREV]
128
- last[NEXT] = tail[PREV] = link
129
- link[PREV] = last
130
- link[NEXT] = tail
131
- return value
132
- return fn
133
98
 
134
99
  def read_csv(location, result):
135
100
  args = dict(
136
101
  sep = result.value(csvw.delimiter, default=Literal(",")).value,
137
102
  #header = result.value(csvw.headerRow, default=Literal(0)).value),
138
103
  skiprows = result.value(csvw.skipRows, default=Literal(0)).value,
139
- dtype = object
104
+ # dtype = object # Does not seem to play well with future and python2/3 conversion
140
105
  )
141
106
  if result.value(csvw.header):
142
107
  args['header'] = [0]
143
- df = pandas.read_csv(get_content(location, result),encoding='utf-8', **args)
108
+ df = pandas.read_csv(location,encoding='utf-8', **args)
144
109
  logger.debug("Loaded %s", location)
145
110
  return df
146
-
111
+
147
112
  def read_graph(location, result, g = None):
148
113
  if g is None:
149
114
  g = ConjunctiveGraph()
@@ -168,6 +133,8 @@ def read_graph(location, result, g = None):
168
133
  return g
169
134
 
170
135
  class FileLikeFromIter(object):
136
+ _closed = False
137
+
171
138
  def __init__(self, content_iter):
172
139
  self.iter = content_iter
173
140
  self.data = b''
@@ -175,6 +142,35 @@ class FileLikeFromIter(object):
175
142
  def __iter__(self):
176
143
  return self.iter
177
144
 
145
+ def readable(self):
146
+ return True
147
+
148
+ def writable(self):
149
+ return False
150
+
151
+ def seekable(self):
152
+ return False
153
+
154
+ def closed(self):
155
+ if self._closed:
156
+ return True
157
+ if len(self.data) > 0:
158
+ return False
159
+ try:
160
+ self.data = next(self.iter)
161
+ except StopIteration:
162
+ self.closed = True
163
+ return True
164
+ return False
165
+
166
+ # Enter and Exit are needed to allow this to work with with
167
+ def __enter__(self):
168
+ return self
169
+
170
+ # Could be improved for better error/exception handling
171
+ def __exit__(self, err_type, value, tracebock):
172
+ pass
173
+
178
174
  def read(self, n=None):
179
175
  if n is None:
180
176
  return self.data + b''.join(l for l in self.iter)
@@ -189,7 +185,7 @@ class FileLikeFromIter(object):
189
185
 
190
186
  def _open_local_file(location):
191
187
  if location.startswith("file://"):
192
- if os.name == 'nt': # skip the initial
188
+ if os.name == 'nt': # skip the initial
193
189
  return open(location.replace('file:///','').replace('file://',''),'rb')
194
190
  else:
195
191
  return open(location.replace('file://',''),'rb')
@@ -198,7 +194,7 @@ content_handlers = [
198
194
  _open_local_file,
199
195
  lambda location: FileLikeFromIter(requests.get(location,stream=True).iter_content(1024*1024))
200
196
  ]
201
-
197
+
202
198
  def get_content(location, result):
203
199
  response = None
204
200
  for handler in content_handlers:
@@ -207,7 +203,7 @@ def get_content(location, result):
207
203
  break
208
204
  if result[RDF.type:setl.Tempfile]:
209
205
  result = to_tempfile(response)
210
-
206
+
211
207
  for t in result[RDF.type]:
212
208
  # Do we know how to unpack this?
213
209
  if t.identifier in unpackers:
@@ -235,16 +231,20 @@ unpackers = {
235
231
  setl.GZipFile : lambda f: gzip.GzipFile(fileobj=f,mode='r')
236
232
  }
237
233
 
234
+ packers = {
235
+ # setl.Tempfile : lambda x: x,
236
+ setl.GZipFile : lambda f: gzip.GzipFile(fileobj=f,mode='wb')
237
+ }
238
+
238
239
  def read_excel(location, result):
239
240
  args = dict(
240
- sheetname = result.value(setl.sheetname, default=Literal(0)).value,
241
+ sheet_name = result.value(setl.sheetname, default=Literal(0)).value,
241
242
  header = [int(x) for x in result.value(csvw.headerRow, default=Literal('0')).value.split(',')],
242
243
  skiprows = result.value(csvw.skipRows, default=Literal(0)).value
243
244
  )
244
245
  if result.value(csvw.header):
245
246
  args['header'] = [result.value(csvw.header).value]
246
- with get_content(location, result) as f:
247
- df = pandas.read_excel(f,encoding='utf-8', **args)
247
+ df = pandas.read_excel(location,encoding='utf-8', **args)
248
248
  return df
249
249
 
250
250
  def read_xml(location, result):
@@ -261,7 +261,7 @@ def read_xml(location, result):
261
261
  for (i, (event, ele)) in enumerate(f.iterparse(fo)):
262
262
  yield i, ele
263
263
 
264
-
264
+
265
265
  def read_json(location, result):
266
266
  selector = result.value(api_vocab.selector)
267
267
  if selector is not None:
@@ -269,8 +269,8 @@ def read_json(location, result):
269
269
  else:
270
270
  selector = ""
271
271
  with get_content(location, result) as fo:
272
- return enumerate(ijson.items(fo, selector))
273
-
272
+ yield from enumerate(ijson.items(fo, selector))
273
+
274
274
 
275
275
  extractors = {
276
276
  setl.XPORT : lambda location, result: pandas.read_sas(get_content(location, result), format='xport'),
@@ -284,14 +284,14 @@ extractors = {
284
284
  URIRef("https://www.iana.org/assignments/media-types/text/plain") : lambda location, result: get_content(location, result)
285
285
  }
286
286
 
287
-
287
+
288
288
  try:
289
289
  from bs4 import BeautifulSoup
290
290
  extractors[setl.HTML] = lambda location, result: BeautifulSoup(get_content(location, result).read(), 'html.parser')
291
291
  except Exception as e:
292
292
  pass
293
-
294
-
293
+
294
+
295
295
  def load_csv(csv_resource):
296
296
  column_descriptions = {}
297
297
  for col in csv_resource[csvw.column]:
@@ -360,7 +360,7 @@ def create_python_function(f, resources):
360
360
  local_vars[name.value] = entity
361
361
  exec(script.value, local_vars, global_vars)
362
362
  resources[f.identifier] = global_vars['result']
363
-
363
+
364
364
  def get_order(setl_graph):
365
365
  nodes = collections.defaultdict(set)
366
366
 
@@ -378,7 +378,7 @@ def get_order(setl_graph):
378
378
  for derivation in task[prov.qualifiedDerivation]:
379
379
  derived = derivation.value(prov.entity)
380
380
  nodes[task.identifier].add(derived.identifier)
381
-
381
+
382
382
  return toposort_flatten(nodes)
383
383
 
384
384
  def extract(e, resources):
@@ -415,7 +415,7 @@ def get_function(expr, local_keys):
415
415
  if key not in functions:
416
416
  script = '''lambda %s: %s'''% (', '.join(sorted(local_keys)), expr)
417
417
  fn = eval(script)
418
- fn.__name__ = str(expr)
418
+ fn.__name__ = expr.encode("ascii", "ignore").decode('utf8')
419
419
  functions[key] = fn
420
420
  return functions[key]
421
421
 
@@ -425,7 +425,7 @@ def get_template(templ):
425
425
  t = Template(templ)
426
426
  templates[templ] = t
427
427
  return templates[templ]
428
-
428
+
429
429
  def process_row(row, template, rowname, table, resources, transform, variables):
430
430
  result = []
431
431
  e = {'row':row,
@@ -588,7 +588,7 @@ def json_transform(transform, resources):
588
588
  roleID = role.value(dc.identifier)
589
589
  variables[roleID.value] = resources[used.identifier]
590
590
  #print "Using", used.identifier, "as", roleID.value
591
-
591
+
592
592
  generated = list(transform.subjects(prov.wasGeneratedBy))[0]
593
593
  logger.info("Generating %s", generated.identifier)
594
594
 
@@ -632,6 +632,8 @@ def json_transform(transform, resources):
632
632
  if run_samples and rowname >= 100:
633
633
  break
634
634
  try:
635
+ root = None
636
+ data = None
635
637
  root = {
636
638
  "@id": generated.identifier,
637
639
  "@graph": process_row(row, jslt, rowname, table, resources, transform, variables)
@@ -642,20 +644,22 @@ def json_transform(transform, resources):
642
644
  #graph = ConjunctiveGraph(identifier=generated.identifier)
643
645
  #graph.parse(data=json.dumps(root),format="json-ld")
644
646
  data = json.dumps(root)
645
- del root
647
+ #del root
646
648
  result.parse(data=data, format="json-ld")
647
- del data
649
+ #del data
648
650
  after = len(result)
649
651
  logger.debug("Row "+str(rowname)+" added "+str(after-before)+" triples.")
650
652
  sys.stdout.flush()
651
653
  except Exception as e:
652
654
  trace = sys.exc_info()[2]
655
+ if data is not None:
656
+ logger.error("Error parsing tree: %s", data)
653
657
  if isinstance(table, pandas.DataFrame):
654
658
  logger.error("Error on %s %s", rowname, row)
655
659
  else:
656
660
  logger.error("Error on %s", rowname)
657
661
  raise e
658
-
662
+
659
663
  resources[generated.identifier] = result
660
664
 
661
665
  def transform(transform_resource, resources):
@@ -666,12 +670,12 @@ def transform(transform_resource, resources):
666
670
  transform_graph = ConjunctiveGraph(identifier=result.identifier)
667
671
 
668
672
  used = set(transform_resource[prov.used])
669
-
673
+
670
674
  for csv in [u for u in used if u[RDF.type:csvw.Table]]:
671
675
  csv_graph = Graph(store=transform_graph.store, identifier=csv)
672
676
  csv_graph += graphs[csv.identifier]
673
677
 
674
-
678
+
675
679
  for script in [u for u in used if u[RDF.type:setl.PythonScript]]:
676
680
  logger.info("Script: %s", script.identifier)
677
681
  s = script.value(prov.value).value
@@ -690,22 +694,29 @@ def transform(transform_resource, resources):
690
694
  logger.info("Update: %s", update.identifier)
691
695
  query = update.value(prov.value).value
692
696
  transform_graph.update(query)
693
-
697
+
694
698
  for construct in [u for u in used if u[RDF.type:sp.Construct]]:
695
699
  logger.info("Construct: %s", construct.identifier)
696
700
  query = construct.value(prov.value).value
697
701
  g = transform_graph.query(query)
698
702
  transform_graph += g
699
-
703
+
700
704
  for csv in [u for u in used if u[RDF.type:csvw.Table]]:
701
705
  g = Graph(identifier=csv.identifier,store=transform_graph.store)
702
706
  g.remove((None, None, None))
703
707
  transform_graph.store.remove_graph(csv.identifier)
704
-
708
+
705
709
  for result in transform_graph.subjects(prov.wasGeneratedBy):
706
710
  graphs[result.identifier] = transform_graph
707
711
 
708
-
712
+ def _load_open(generated):
713
+ filename = generated.identifier.replace("file://",'')
714
+ fh = open(filename, 'wb')
715
+ for type, pack in packers.items():
716
+ if generated[RDF.type : type]:
717
+ return pack(fh)
718
+ return fh
719
+
709
720
  def load(load_resource, resources):
710
721
  logger.info('Loading %s',load_resource.identifier)
711
722
  file_graph = Dataset(default_union=True)
@@ -738,7 +749,7 @@ def load(load_resource, resources):
738
749
  if fmt in formats:
739
750
  fmt = formats[fmt]
740
751
  #print fmt
741
- with open(generated.identifier.replace("file://",''), 'wb') as o:
752
+ with _load_open(generated) as o:
742
753
  o.write(file_graph.serialize(format=fmt))
743
754
  o.close()
744
755
  elif generated[RDF.type:sd.Service]:
@@ -750,8 +761,8 @@ def load(load_resource, resources):
750
761
  endpoint_graph.commit()
751
762
  #if to_disk:
752
763
  # file_graph.close()
753
-
754
-
764
+
765
+
755
766
  actions = {
756
767
  setl.Extract : extract,
757
768
  setl.Transform : json_transform,
@@ -759,7 +770,7 @@ actions = {
759
770
  setl.PythonScript : create_python_function,
760
771
  setl.IsEmpty : isempty
761
772
  }
762
-
773
+
763
774
  def _setl(setl_graph):
764
775
  global logger
765
776
  if logger is None:
@@ -785,7 +796,7 @@ def main():
785
796
 
786
797
  global logger
787
798
  logger = logging.getLogger(__name__)
788
-
799
+
789
800
  global run_samples
790
801
  setl_file = args[0]
791
802
  if 'sample' in args:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 1.1
2
2
  Name: setlr
3
- Version: 0.2.13
3
+ Version: 0.2.15
4
4
  Summary: setlr is a tool for Semantic Extraction, Transformation, and Loading.
5
5
  Home-page: http://packages.python.org/setlr
6
6
  Author: Jim McCusker
@@ -3,6 +3,7 @@ setup.cfg
3
3
  setup.py
4
4
  setlr/__init__.py
5
5
  setlr/iterparse_filter.py
6
+ setlr/sqlite-store.py
6
7
  setlr.egg-info/PKG-INFO
7
8
  setlr.egg-info/SOURCES.txt
8
9
  setlr.egg-info/dependency_links.txt
@@ -4,7 +4,7 @@ cython
4
4
  numpy
5
5
  rdflib
6
6
  rdflib-jsonld
7
- pandas==0.22.0
7
+ pandas>=0.23.0
8
8
  requests
9
9
  toposort
10
10
  beautifulsoup4
setlr-0.2.13/README.md DELETED
@@ -1,15 +0,0 @@
1
- # setlr: The Semantic Extract, Transform and Load-er
2
-
3
- setlr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.
4
-
5
- # Installation
6
-
7
- Simply check out the code, optionally create a python virtual environment, and install it using pip:
8
-
9
- ```bash
10
- pip install setlr
11
- ```
12
-
13
- # Learning how to SETL
14
-
15
- To learn how to use setlr please visit [the tutorial wiki page](https://github.com/tetherless-world/setlr/wiki/SETLr-Basics-Tutorial).
@@ -1,14 +0,0 @@
1
- Metadata-Version: 1.1
2
- Name: setlr
3
- Version: 0.2.13
4
- Summary: setlr is a tool for Semantic Extraction, Transformation, and Loading.
5
- Home-page: http://packages.python.org/setlr
6
- Author: Jim McCusker
7
- Author-email: mccusj@cs.rpi.edu
8
- License: Apache License 2.0
9
- Description: SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.
10
- Keywords: rdf semantic etl
11
- Platform: UNKNOWN
12
- Classifier: Development Status :: 5 - Production/Stable
13
- Classifier: Topic :: Utilities
14
- Classifier: License :: OSI Approved :: Apache Software License
setlr-0.2.13/setup.cfg DELETED
@@ -1,10 +0,0 @@
1
- [bdist_wheel]
2
- universal = 1
3
-
4
- [metadata]
5
- description-file = README.md
6
-
7
- [egg_info]
8
- tag_build =
9
- tag_date = 0
10
-
setlr-0.2.13/setup.py DELETED
@@ -1,50 +0,0 @@
1
- import os
2
- from setuptools import setup, find_packages
3
-
4
- # Utility function to read the README file.
5
- # Used for the long_description. It's nice, because now 1) we have a top level
6
- # README file and 2) it's easier to type in the README file than to put a raw
7
- # string in below ...
8
- def read(fname):
9
- return open(os.path.join(os.path.dirname(__file__), fname)).read()
10
-
11
- setup(
12
- name = "setlr",
13
- version = "0.2.13",
14
- author = "Jim McCusker",
15
- author_email = "mccusj@cs.rpi.edu",
16
- description = ("setlr is a tool for Semantic Extraction, Transformation, and Loading."),
17
- license = "Apache License 2.0",
18
- keywords = "rdf semantic etl",
19
- url = "http://packages.python.org/setlr",
20
- packages=['setlr'],
21
- long_description='''SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.''',
22
- include_package_data = True,
23
- install_requires = [
24
- 'future',
25
- 'pip>=9.0.0',
26
- 'cython',
27
- 'numpy',
28
- 'rdflib',
29
- 'rdflib-jsonld',
30
- 'pandas==0.22.0',
31
- 'requests',
32
- 'toposort',
33
- 'beautifulsoup4',
34
- 'jinja2',
35
- 'lxml',
36
- 'six',
37
- 'xlrd',
38
- 'ijson',
39
- 'requests-testadapter',
40
- 'python-slugify',
41
- ],
42
- entry_points = {
43
- 'console_scripts': ['setlr=setlr:main'],
44
- },
45
- classifiers=[
46
- "Development Status :: 5 - Production/Stable",
47
- "Topic :: Utilities",
48
- "License :: OSI Approved :: Apache Software License",
49
- ],
50
- )