setlr 1.0.2__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- setlr/__init__.py +89 -0
- setlr/core.py +1019 -0
- setlr/iterparse_filter.py +669 -0
- setlr/trig_store.py +158 -0
- setlr-1.0.2.dist-info/METADATA +209 -0
- setlr-1.0.2.dist-info/RECORD +10 -0
- setlr-1.0.2.dist-info/WHEEL +5 -0
- setlr-1.0.2.dist-info/entry_points.txt +2 -0
- setlr-1.0.2.dist-info/licenses/LICENSE +201 -0
- setlr-1.0.2.dist-info/top_level.txt +1 -0
setlr/core.py
ADDED
@@ -0,0 +1,1019 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from rdflib.util import guess_format
import rdflib
import csv
import json
import sys
import collections
import requests
import pandas
import re
import os
import click

from jinja2 import Template
from toposort import toposort_flatten
from numpy import isnan
import tempfile
import ijson
from . import iterparse_filter
#import xml.etree.ElementTree as ET
import xml.etree.ElementTree

from itertools import chain

import zipfile
import gzip

import logging

from tqdm import tqdm

import hashlib
from slugify import slugify
from pyshacl import validate

from .trig_store import TrigStore

from requests_testadapter import Resp


def hash(value):
    m = hashlib.sha256()
    m.update(value.encode('utf-8'))
    return m.hexdigest()

csvw = rdflib.Namespace('http://www.w3.org/ns/csvw#')
ov = rdflib.Namespace('http://open.vocab.org/terms/')
setl = rdflib.Namespace('http://purl.org/twc/vocab/setl/')
prov = rdflib.Namespace('http://www.w3.org/ns/prov#')
pv = rdflib.Namespace('http://purl.org/net/provenance/ns#')
sp = rdflib.Namespace('http://spinrdf.org/sp#')
sd = rdflib.Namespace('http://www.w3.org/ns/sparql-service-description#')
dc = rdflib.Namespace('http://purl.org/dc/terms/')
void = rdflib.Namespace('http://rdfs.org/ns/void#')
shacl = rdflib.Namespace('http://www.w3.org/ns/shacl#')
api_vocab = rdflib.Namespace('http://purl.org/linked-data/api/vocab#')

sys.setrecursionlimit(10000)


# Regex pattern for extracting Jinja2 template variables (compiled once for performance)
TEMPLATE_VAR_PATTERN = re.compile(r'\{\{([^}]+)\}\}')

def camelcase(s):
    return slugify(s).title().replace("-","")

class LocalFileAdapter(requests.adapters.HTTPAdapter):
    def build_response_from_file(self, request):
        file_path = request.url[7:]
        with open(file_path, 'rb') as file:
            buff = bytearray(os.path.getsize(file_path))
            file.readinto(buff)
            resp = Resp(buff)
            r = self.build_response(request, resp)
            return r

    def send(self, request, stream=False, timeout=None,
             verify=True, cert=None, proxies=None):
        return self.build_response_from_file(request)

requests_session = requests.session()
requests_session.mount('file://', LocalFileAdapter())
requests_session.mount('file:///', LocalFileAdapter())
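
# Usage sketch: with the adapter mounted, local files resolve through the
# requests API, e.g. requests_session.get('file:///tmp/data.csv').content.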

datatypeConverters = collections.defaultdict(lambda: str)
datatypeConverters.update({
    rdflib.XSD.string: str,
    rdflib.XSD.decimal: float,
    rdflib.XSD.integer: int,
    rdflib.XSD.float: float,
    rdflib.XSD.double: float
})

run_samples = -1

_rdf_formats_to_guess = [
    'xml',
    'json-ld',
    'trig',
    'nquads',
    'trix'
]


def read_csv(location, result):
    args = dict(
        sep = result.value(csvw.delimiter, default=rdflib.Literal(",")).value,
        #header = result.value(csvw.headerRow, default=rdflib.Literal(0)).value),
        skiprows = result.value(csvw.skipRows, default=rdflib.Literal(0)).value,
        dtype=str,
        # dtype = object # Does not seem to play well with future and python2/3 conversion
    )
    if result.value(csvw.header):
        args['header'] = [0]
    with get_content(location, result) as fo:
        df = pandas.read_csv(fo, encoding='utf-8', **args)
    #logger.debug("Loaded %s", location)
    return df

def read_graph(location, result, g = None):
    if g is None:
        g = rdflib.ConjunctiveGraph()
    graph = rdflib.ConjunctiveGraph(store=g.store, identifier=result.identifier)
    if len(graph) == 0:
        data = get_content(location, result).read()
        f = guess_format(location)
        for fmt in [f] + _rdf_formats_to_guess:
            try:
                graph.parse(data=data, format=fmt)
                break
            except Exception:
                pass
        if len(graph) == 0:
            logger.error("Could not parse graph: %s", location)
        if result[rdflib.RDF.type:rdflib.OWL.Ontology]:
            for ontology in graph.subjects(rdflib.RDF.type, rdflib.OWL.Ontology):
                imports = [graph.resource(x) for x in graph.objects(ontology, rdflib.OWL.imports)]
                for i in imports:
                    read_graph(i.identifier, i, g = g)
    return g
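
# Note: when the source is typed owl:Ontology, read_graph recursively follows
# owl:imports, so the returned store holds the full imports closure.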

class FileLikeFromIter(object):
    _closed = False

    def __init__(self, content_iter):
        self.iter = content_iter
        self.data = b''

    def __iter__(self):
        return self.iter

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return False

    def closed(self):
        if self._closed:
            return True
        if len(self.data) > 0:
            return False
        try:
            self.data = next(self.iter)
        except StopIteration:
            self._closed = True
            return True
        return False

    # Enter and Exit are needed to allow this to work with `with`
    def __enter__(self):
        return self

    # Could be improved for better error/exception handling
    def __exit__(self, err_type, value, traceback):
        pass

    def read(self, n=None):
        if n is None:
            return self.data + b''.join(line for line in self.iter)
        else:
            while len(self.data) < n:
                try:
                    self.data = b''.join((self.data, next(self.iter)))
                except StopIteration:
                    break
            result, self.data = self.data[:n], self.data[n:]
            return result
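
# Usage sketch: FileLikeFromIter wraps a byte-chunk iterator (such as a
# streaming HTTP response) in a minimal file-like object, e.g.
#   FileLikeFromIter(iter([b'ab', b'cd'])).read(3) == b'abc'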

def _open_local_file(location):
    if location.startswith("file://"):
        if os.name == 'nt':  # skip the initial
            return open(location.replace('file:///','').replace('file://',''), 'rb')
        else:
            return open(location.replace('file://',''), 'rb')

content_handlers = [
    _open_local_file,
    lambda location: FileLikeFromIter(requests.get(location, stream=True).iter_content(1024*1024))
]

def get_content(location, result):
    response = None
    for handler in content_handlers:
        response = handler(location)
        if response is not None:
            break
    if result[rdflib.RDF.type:setl.Tempfile]:
        # spool streamed content to a temp file before any unpacking
        response = to_tempfile(response)

    for t in result[rdflib.RDF.type]:
        # Do we know how to unpack this?
        if t.identifier in unpackers:
            response = unpackers[t.identifier](response)
    return response

def to_tempfile(f):
    tf = tempfile.TemporaryFile()
    logger.debug("Writing %s to disk.", f)
    for chunk in f:
        if chunk:  # filter out keep-alive new chunks
            tf.write(chunk)
    tf.seek(0)
    logger.debug("Finished writing %s to disk.", f)
    return tf

def unpack_zipfile(f):
    zf = zipfile.ZipFile(f, mode='r')
    files = zf.infolist()
    return zf.open(files[0])

unpackers = {
    # setl.Tempfile : lambda x: x,
    setl.ZipFile : lambda x: unpack_zipfile(to_tempfile(x)),
    setl.GZipFile : lambda f: gzip.GzipFile(fileobj=f, mode='r')
}

packers = {
    # setl.Tempfile : lambda x: x,
    setl.GZipFile : lambda f: gzip.GzipFile(fileobj=f, mode='wb')
}
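
# A source typed setl:ZipFile is spooled to a temp file and only the first
# archive member is opened; gzip sources are wrapped as streams in place.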

def read_excel(location, result):
    args = dict(
        sheet_name = result.value(setl.sheetname, default=rdflib.Literal(0)).value,
        header = [int(x) for x in result.value(csvw.headerRow, default=rdflib.Literal('0')).value.split(',')],
        skiprows = result.value(csvw.skipRows, default=rdflib.Literal(0)).value
    )
    if result.value(csvw.header):
        args['header'] = [result.value(csvw.header).value]
    with get_content(location, result) as fo:
        df = pandas.read_excel(fo, encoding='utf-8', **args)
    return df

def read_xml(location, result):
    validate_dtd = False
    if result[rdflib.RDF.type:setl.DTDValidatedXML]:
        validate_dtd = True
    f = iterparse_filter.IterParseFilter(validate_dtd=validate_dtd)
    if result.value(setl.xpath) is None:
        logger.debug("no xpath to select on from %s", location)
        f.iter_end("/*")
    for xp in result[setl.xpath]:
        f.iter_end(xp.value)
    with get_content(location, result) as fo:
        for (i, (event, ele)) in enumerate(tqdm(f.iterparse(fo))):
            yield i, ele


def read_json(location, result):
    selector = result.value(api_vocab.selector)
    if selector is not None:
        selector = selector.value
    else:
        selector = ""
    with get_content(location, result) as fo:
        yield from enumerate(tqdm(ijson.items(fo, selector)))


extractors = {
    setl.XPORT : lambda location, result: pandas.read_sas(get_content(location, result), format='xport'),
    setl.SAS7BDAT : lambda location, result: pandas.read_sas(get_content(location, result), format='sas7bdat'),
    setl.Excel : read_excel,
    csvw.Table : read_csv,
    rdflib.OWL.Ontology : read_graph,
    void.Dataset : read_graph,
    setl.JSON : read_json,
    setl.XML : read_xml,
    rdflib.URIRef("https://www.iana.org/assignments/media-types/text/plain") : lambda location, result: get_content(location, result)
}


try:
    from bs4 import BeautifulSoup
    extractors[setl.HTML] = lambda location, result: BeautifulSoup(get_content(location, result).read(), 'html.parser')
except Exception:
    pass


def load_csv(csv_resource):
    column_descriptions = {}
    for col in csv_resource[csvw.column]:
        label = col.value(rdflib.RDFS.label).value
        column_descriptions[label] = col
    csv_graph = rdflib.Graph(identifier=csv_resource)
    s = [x for x in csv.reader(open(str(csv_resource.value(csvw.url).identifier).replace("file://","")),
                               delimiter=str(csv_resource.value(csvw.delimiter, default=",").value),
                               quotechar=str(csv_resource.value(csvw.quoteChar, default='"').value))]
    header = None
    properties = []
    propertyMap = {}
    skip_value = csv_resource.value(csvw.null)
    if skip_value is not None:
        skip_value = skip_value.value
    for i, r in enumerate(s):
        if header is None:
            header = r
            for j, h in enumerate(header):
                col_desc = None
                if h in column_descriptions:
                    col_desc = column_descriptions[h]
                col = csv_graph.resource(rdflib.URIRef("urn:col_"+str(h)))
                col.add(rdflib.RDFS.label, rdflib.Literal(h))
                col.add(ov.csvCol, rdflib.Literal(j))
                if col_desc is not None:
                    col.add(rdflib.RDFS.range, col_desc.value(rdflib.RDFS.range, default=rdflib.XSD.string))
                properties.append(col)
                propertyMap[h] = col
            continue
        res = csv_graph.resource(csv_resource.identifier+"_row_"+str(i))
        res.add(rdflib.RDF.type, csvw.Row)
        res.add(csvw.rownum, rdflib.Literal(i))
        for j, value in enumerate(r):
            if skip_value is not None and skip_value == value:
                continue
            #print i, j, value
            prop = properties[j]
            datatype = prop.value(rdflib.RDFS['range'], default=rdflib.XSD.string)
            lit = rdflib.Literal(value, datatype=datatype.identifier)
            #print i, prop.identifier, lit.n3()
            res.add(prop.identifier, lit)
    logger.debug("Table has %s rows, %s columns, and %s triples", len(s), len(header), len(csv_graph))
    return csv_graph
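
# Each header label becomes a urn:col_<label> predicate, and every data row is
# materialized as a csvw:Row resource carrying one typed literal per column.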

formats = {
    None: 'xml',
    "application/rdf+xml": 'xml',
    "text/rdf": 'xml',
    'text/turtle': 'turtle',
    'application/turtle': 'turtle',
    'application/x-turtle': 'turtle',
    'text/plain': 'nt',
    'text/n3': 'n3',
    'application/trig': 'trig',
    'application/json': 'json-ld'
}

def create_python_function(f, resources):
    global_vars = {'this' : f, 'resources': resources}
    local_vars = {}
    script = f.value(prov.value)
    for qd in f[prov.qualifiedDerivation]:
        entity = resources[qd.value(prov.entity).identifier]
        name = qd.value(prov.hadRole).value(dc.identifier)
        local_vars[name.value] = entity
    exec(script.value, local_vars, global_vars)
    resources[f.identifier] = global_vars['result']
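
# Usage sketch: a setl:PythonScript's prov:value runs with `this`, `resources`,
# and each prov:qualifiedDerivation role name in scope, and must bind a name
# `result`, e.g.
#   result = lambda x: x.strip()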

def get_order(setl_graph):
    nodes = collections.defaultdict(set)

    for typ in actions:
        for task in setl_graph.subjects(rdflib.RDF.type, typ):
            task = setl_graph.resource(task)
            for used in task[prov.used]:
                nodes[task.identifier].add(used.identifier)

            for usage in task[prov.qualifiedUsage]:
                used = usage.value(prov.entity)
                nodes[task.identifier].add(used.identifier)
            for generated in task.subjects(prov.wasGeneratedBy):
                nodes[generated.identifier].add(task.identifier)
            for derivation in task[prov.qualifiedDerivation]:
                derived = derivation.value(prov.entity)
                nodes[task.identifier].add(derived.identifier)

    return toposort_flatten(nodes)
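
# Dependency sketch: each task depends on what it prov:used or derived from,
# and each generated entity depends on its task, so toposort_flatten yields
# an order like [source, extract, transform, load].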

def extract(e, resources):
    logger.info('Extract %s', e.identifier)
    used = e.value(prov.used)
    for result in e.subjects(prov.wasGeneratedBy):
        if used is None:
            used = result
        for t in result[rdflib.RDF.type]:
            # Do we know how to generate this?
            if t.identifier in extractors:
                logger.info("Using %s", used.identifier)
                resources[result.identifier] = extractors[t.identifier](used.identifier, result)
        return resources[result.identifier]

def isempty(value):
    try:
        return isnan(value)
    except (TypeError, ValueError):
        return value is None

def clone(value):
    '''This is only a JSON-level cloning of objects. Atomic objects are invariant, and don't need to be cloned.'''
    if isinstance(value, list):
        return [x for x in value]
    elif isinstance(value, dict):
        return dict(value)
    else:
        return value

functions = {}
def get_function(expr, local_keys):
    used_local_keys = [k for k in local_keys if k in expr]
    key = tuple([expr] + sorted(used_local_keys))
    if key not in functions:
        script = '''lambda %s,**kwargs: %s''' % (', '.join(sorted(used_local_keys)), expr)
        #print(script)
        fn = eval(script)
        fn.__name__ = expr.encode("ascii", "ignore").decode('utf8')
        functions[key] = fn
    return functions[key]
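
# Example: get_function("row['name']", ['row', 'table']) compiles and caches
#   lambda row,**kwargs: row['name']
# so template expressions are evaluated against the row environment.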

templates = {}
def get_template(templ):
    if templ not in templates:
        t = Template(templ)
        templates[templ] = t
    return templates[templ]

def flatten_lists(o):
    if isinstance(o, list):
        result = []
        for x in o:
            flattened = flatten_lists(x)
            if isinstance(flattened, list):
                result.extend(flattened)
            else:
                result.append(flattened)
        return result
    elif isinstance(o, dict):
        for key in o.keys():
            o[key] = flatten_lists(o[key])
        return o
    else:
        return o

def process_row(row, template, rowname, table, resources, transform, variables):
    result = []
    e = {
        'row': row,
        'name': rowname,
        'table': table,
        'resources': resources,
        'template': template,
        "transform": transform,
        "setl_graph": transform.graph,
        "isempty": isempty,
        "slugify": slugify,
        "camelcase": camelcase,
        "hash": hash,
        "isinstance": isinstance,
        "str": str,
        "float": float,
        "int": int,
        "chain": lambda x: chain(*x),
        "list": list
    }
    e.update(variables)
    e.update(rdflib.__dict__)
    todo = [[x, result, e] for x in template]

    while len(todo) > 0:
        task, parent, env = todo.pop()
        key = None
        value = task
        this = None
        if isinstance(parent, dict):
            if len(task) != 2:
                logger.debug(task)
            key, value = task
            kt = get_template(key)
            key = kt.render(**env)
        if isinstance(value, dict):
            if '@if' in value:
                try:
                    fn = get_function(value['@if'], list(env.keys()))
                    incl = fn(**env)
                    if incl is None or not incl:
                        continue
                except KeyError:
                    continue
                except AttributeError:
                    continue
                except TypeError:
                    continue
                except Exception as e:
                    logger.error("=" * 80)
                    logger.error("Error evaluating @if conditional: %s", value['@if'])
                    transform_obj = env.get('transform', {})
                    transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown'
                    logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown'))
                    logger.error("Error type: %s", type(e).__name__)
                    logger.error("Error message: %s", str(e))
                    logger.error("Row-specific variables:")
                    for key in ['row', 'name']:
                        if key in env:
                            v = env[key]
                            try:
                                logger.error("  %s: %s", key, str(v)[:200])
                            except Exception:
                                logger.error("  %s: <%s>", key, type(v).__name__)
                    logger.error("=" * 80)
                    raise RuntimeError(f"Error in @if conditional '{value['@if']}': {type(e).__name__}: {str(e)}") from e
            if '@for' in value:
                f = value['@for']
                if isinstance(f, list):
                    f = ' '.join(f)
                variable_list, expression = f.split(" in ", 1)
                variable_list = re.split(r',\s+', variable_list.strip())
                val = value
                if '@do' in value:
                    val = value['@do']
                else:
                    del val['@for']
                try:
                    fn = get_function(expression, list(env.keys()))
                    values = fn(**env)
                    if values is not None:
                        for v in values:
                            if len(variable_list) == 1:
                                v = [v]
                            new_env = dict(env)
                            for i, variable in enumerate(variable_list):
                                new_env[variable] = v[i]
                            child = clone(val)
                            todo.append((child, parent, new_env))
                except KeyError:
                    pass
                except Exception as e:
                    logger.error("=" * 80)
                    logger.error("Error in @for loop: %s", value['@for'])
                    transform_obj = env.get('transform', {})
                    transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown'
                    logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown'))
                    logger.error("Error type: %s", type(e).__name__)
                    logger.error("Error message: %s", str(e))
                    logger.error("Expression: %s", expression)
                    logger.error("Variables to assign: %s", variable_list)
                    logger.error("Available variables: %s", sorted([k for k in env.keys() if not k.startswith('_')]))
                    logger.error("=" * 80)
                    raise RuntimeError(f"Error in @for loop '{value['@for']}': {type(e).__name__}: {str(e)}") from e
                continue
            if '@with' in value:
                f = value['@with']
                if isinstance(f, list):
                    f = ' '.join(f)
                expression, variable_list = f.split(" as ", 1)
                variable_list = re.split(r',\s+', variable_list.strip())
                val = value
                if '@do' in value:
                    val = value['@do']
                else:
                    del val['@with']
                try:
                    fn = get_function(expression, list(env.keys()))
                    v = fn(**env)
                    if v is not None:
                        # collections.abc.Iterable (collections.Iterable was removed in Python 3.10)
                        if (len(variable_list) == 1 and not (isinstance(v, collections.abc.Iterable) and not isinstance(v, str))):
                            v = [v]
                        new_env = dict(env)
                        for i, variable in enumerate(variable_list):
                            new_env[variable] = v[i]
                        child = clone(val)
                        todo.append((child, parent, new_env))
                except KeyError:
                    pass
                except Exception as e:
                    logger.error("=" * 80)
                    logger.error("Error in @with expression: %s", value['@with'])
                    transform_obj = env.get('transform', {})
                    transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown'
                    logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown'))
                    logger.error("Error type: %s", type(e).__name__)
                    logger.error("Error message: %s", str(e))
                    logger.error("Expression: %s", expression)
                    logger.error("Variables to assign: %s", variable_list)
                    logger.error("Available variables: %s", sorted([k for k in env.keys() if not k.startswith('_')]))
                    logger.error("=" * 80)
                    raise RuntimeError(f"Error in @with expression '{value['@with']}': {type(e).__name__}: {str(e)}") from e
                continue
            this = {}
            for child in list(value.items()):
                if child[0] == '@if':
                    continue
                if child[0] == '@for':
                    continue
                todo.append((child, this, env))
        elif isinstance(value, list):
            this = []
            for child in value:
                todo.append((child, this, env))
        elif isinstance(value, str):
            try:
                template = get_template(str(value))
                this = template.render(**env)
            except Exception as e:
                logger.error("=" * 80)
                logger.error("Error rendering Jinja2 template: %s", value[:200] if len(value) > 200 else value)
                transform_obj = env.get('transform', {})
                transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown'
                logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown'))
                logger.error("Error type: %s", type(e).__name__)
                logger.error("Error message: %s", str(e))
                logger.error("Template variables referenced in template:")
                # Try to extract variable references from the template
                matches = TEMPLATE_VAR_PATTERN.findall(value)
                if matches:
                    for match in matches:
                        var_name = match.strip().split('.')[0].split('[')[0].strip()
                        if var_name in env:
                            val = env[var_name]
                            if type(val).__name__ == 'Element':
                                # XML Element
                                try:
                                    val = xml.etree.ElementTree.tostring(val).decode('utf-8', errors='replace')[:200]
                                except Exception:
                                    val = "<XML Element>"
                            else:
                                try:
                                    val = str(val)[:200]
                                except Exception:
                                    val = f"<{type(val).__name__}>"
                            logger.error("  %s = %s", var_name, val)
                        else:
                            logger.error("  %s = <NOT FOUND>", var_name)
                logger.error("=" * 80)
                raise RuntimeError(f"Error rendering template: {type(e).__name__}: {str(e)}") from e
        else:
            this = value

        if key is not None:
            parent[key] = this
        else:
            parent.append(this)

    return flatten_lists(result)
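
# Illustrative (assumed) template fragment handled above:
#   {"@for": "i, v in enumerate(row)",
#    "@do": {"@if": "not isempty(v)",
#            "@id": "urn:row_{{name}}_{{i}}"}}
# '@for' binds variables from a Python expression over the row environment,
# '@if' filters, '@with' binds names via "<expr> as <vars>", and plain strings
# are rendered as Jinja2 templates against the same environment.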

def json_transform(transform, resources):
    logger.info("Transform %s", transform.identifier)
    tables = [u for u in transform[prov.used]]
    variables = {}
    for usage in transform[prov.qualifiedUsage]:
        used = usage.value(prov.entity)
        role = usage.value(prov.hadRole)
        roleID = role.value(dc.identifier)
        variables[roleID.value] = resources[used.identifier]
        #print "Using", used.identifier, "as", roleID.value

    generated = list(transform.subjects(prov.wasGeneratedBy))[0]
    logger.info("Generating %s", generated.identifier)

    connected_downstream_graph = '''
construct {
  ?target ?p ?o
} where {
  ?source (<>|!<>)* ?target.
  ?target ?p ?o.
}
'''
    shape_graph = rdflib.Graph()
    for shape in transform.objects(dc.conformsTo):
        if shape[rdflib.RDF.type:shacl.NodeShape] or shape[rdflib.RDF.type:shacl.PropertyShape]:
            logger.info("Validating against SHACL shape %s", shape.identifier)
            shape_graph += transform.graph.query(connected_downstream_graph,
                                                 initBindings={"source": shape.identifier})
    if generated.identifier in resources:
        result = resources[generated.identifier]
    else:
        result = rdflib.ConjunctiveGraph()
        if generated[rdflib.RDF.type : setl.Persisted]:
            store = TrigStore()
            result = rdflib.ConjunctiveGraph(store=store)
            tempdir = tempfile.mktemp()
            logger.info("Persisting %s to %s", generated.identifier, tempdir)
            result.store.open(tempdir, True)
    s = transform.value(prov.value).value
    try:
        jslt = json.loads(s)
    except json.JSONDecodeError as e:
        logger.error("Error parsing JSON-LD template for transform %s", transform.identifier)
        lineno = getattr(e, 'lineno', 0)
        colno = getattr(e, 'colno', 0)
        msg = getattr(e, 'msg', str(e))
        logger.error("JSON parsing error at line %d, column %d: %s", lineno, colno, msg)
        # Show context around the error (8 lines before, 3 after for better bracket matching)
        lines = s.split("\n")
        start_line = max(0, lineno - 8)
        end_line = min(len(lines), lineno + 3)
        logger.error("Template context:")
        for i in range(start_line, end_line):
            prefix = ">>> " if i == lineno - 1 else "    "
            logger.error("%s%d: %s", prefix, i + 1, lines[i])
        raise ValueError(f"Invalid JSON-LD template in transform {transform.identifier}: {msg} at line {lineno}, column {colno}") from e
    except Exception as e:
        logger.error("Error parsing JSON-LD template for transform %s: %s", transform.identifier, str(e))
        logger.error("Template content:\n%s", s[:500])  # Show first 500 chars
        raise ValueError(f"Invalid JSON-LD template in transform {transform.identifier}: {str(e)}") from e
    context = transform.value(setl.hasContext)
    if context is not None:
        context = json.loads(context.value)
    for t in tables:
        logger.info("Using %s", t.identifier)
        table = resources[t.identifier]
        it = table
        if isinstance(table, pandas.DataFrame):
            #if run_samples:
            #    table = table.head()
            it = tqdm(table.iterrows(), total=table.shape[0])
            #logger.info("Transforming %s rows.", len(table.index))
        else:
            logger.info("Transform %s", t.identifier)
        for rowname, row in it:
            if run_samples > 0 and rowname >= run_samples:
                break
            try:
                root = None
                data = None
                root = {
                    "@id": generated.identifier,
                    "@graph": process_row(row, jslt, rowname, table, resources, transform, variables)
                }
                if context is not None:
                    root['@context'] = context

                #logger.debug(json.dumps(root, indent=4))
                #before = len(result)
                #graph = rdflib.ConjunctiveGraph(identifier=generated.identifier)
                #graph.parse(data=json.dumps(root),format="json-ld")
                data = json.dumps(root)
                #del root

                if len(shape_graph) > 0:
                    d = rdflib.ConjunctiveGraph()
                    d.parse(data=data, format='json-ld')
                    conforms, report, message = validate(d,
                                                         shacl_graph=shape_graph,
                                                         advanced=True,
                                                         debug=False)
                    if not conforms:
                        print(message)
                result.parse(data=data, format="json-ld")
                #del data
                #after = len(result)
                #logger.debug("Row "+str(rowname))#+" added "+str(after-before)+" triples.")
                #sys.stdout.flush()
            except Exception as e:
                logger.error("=" * 80)
                logger.error("Error in transform %s while processing row %s", transform.identifier, rowname)
                if isinstance(table, pandas.DataFrame):
                    # Format row data with better NaN handling
                    row_dict = {}
                    for key, value in dict(row).items():
                        if pandas.isna(value):
                            row_dict[key] = "<empty/missing>"
                        else:
                            row_dict[key] = value
                    logger.error("Row data: %s", row_dict)
                else:
                    logger.error("Row identifier: %s", rowname)

                # Try to provide more specific error information
                error_type = type(e).__name__
                if "JSON-LD" in str(e) or "json" in str(e).lower():
                    logger.error("JSON-LD processing error: %s", str(e))
                    if data is not None:
                        logger.error("Generated JSON-LD (first 1000 chars):\n%s", data[:1000])
                elif hasattr(e, 'lineno'):
                    logger.error("%s at line %d: %s", error_type, e.lineno, str(e))
                else:
                    logger.error("%s: %s", error_type, str(e))

                logger.error("=" * 80)
                raise RuntimeError(f"Failed to transform row {rowname} in transform {transform.identifier}: {error_type}: {str(e)}") from e

    resources[generated.identifier] = result

def transform(transform_resource, resources):
    logger.info('Transforming %s', transform_resource.identifier)

    transform_graph = rdflib.ConjunctiveGraph()
    # transform_graph is empty at this point, so this loop never executes;
    # it appears intended to re-root the graph at the generated resource.
    for result in transform_graph.subjects(prov.wasGeneratedBy):
        transform_graph = rdflib.ConjunctiveGraph(identifier=result.identifier)

    used = set(transform_resource[prov.used])

    for csv_file in [u for u in used if u[rdflib.RDF.type:csvw.Table]]:
        csv_graph = rdflib.Graph(store=transform_graph.store,
                                 identifier=csv_file)
        csv_graph += resources[csv_file.identifier]

    for script in [u for u in used if u[rdflib.RDF.type:setl.PythonScript]]:
        logger.info("Script: %s", script.identifier)
        s = script.value(prov.value).value
        local_vars = dict(graph=transform_graph, setl_graph=transform_resource.graph)
        global_vars = dict()
        exec(s, global_vars, local_vars)

    # NB: this loop re-selects setl:PythonScript, but its body still reads
    # `script` from the loop above, so it re-executes the last script.
    for jsldt in [u for u in used if u[rdflib.RDF.type:setl.PythonScript]]:
        logger.info("Script: %s", script.identifier)
        s = script.value(prov.value).value
        local_vars = dict(graph=transform_graph, setl_graph=transform_resource.graph)
        global_vars = dict()
        exec(s, global_vars, local_vars)

    for update in [u for u in used if u[rdflib.RDF.type:sp.Update]]:
        logger.info("Update: %s", update.identifier)
        query = update.value(prov.value).value
        transform_graph.update(query)

    for construct in [u for u in used if u[rdflib.RDF.type:sp.Construct]]:
        logger.info("Construct: %s", construct.identifier)
        query = construct.value(prov.value).value
        g = transform_graph.query(query)
        transform_graph += g

    for csv_file in [u for u in used if u[rdflib.RDF.type:csvw.Table]]:
        g = rdflib.Graph(identifier=csv_file.identifier, store=transform_graph.store)
        g.remove((None, None, None))
        transform_graph.store.remove_graph(csv_file.identifier)

    for result in transform_graph.subjects(prov.wasGeneratedBy):
        resources[result.identifier] = transform_graph

def _load_open(generated):
    if generated.identifier.startswith("file://"):
        if os.name == 'nt':  # skip the initial
            filename = generated.identifier.replace('file:///','').replace('file://','')
        else:
            filename = generated.identifier.replace('file://','')

    fh = open(filename, 'wb')
    for type, pack in packers.items():
        if generated[rdflib.RDF.type : type]:
            return pack(fh)
    return fh

def load(load_resource, resources):
    logger.info('Load %s', load_resource.identifier)
    file_graph = rdflib.Dataset(default_union=True)
    for used in load_resource[prov.used]:
        if used[rdflib.RDF.type : setl.Persisted]:
            file_graph = rdflib.Dataset(store='Sleepycat', default_union=True)
            tempdir = tempfile.mkdtemp()
            logger.debug("Gathering %s into %s", load_resource.identifier, tempdir)
            file_graph.store.open(tempdir, True)
            break
    if len(list(load_resource[prov.used])) == 1:
        logger.info("Using %s", load_resource.value(prov.used).identifier)
        file_graph = resources[load_resource.value(prov.used).identifier]
    else:
        for used in load_resource[prov.used]:
            logger.info("Using %s", used.identifier)
            used_graph = resources[used.identifier]
            file_graph.namespace_manager = used_graph.namespace_manager
            #print used_graph.serialize(format="trig")
            file_graph.addN(used_graph.quads())

    for generated in load_resource.subjects(prov.wasGeneratedBy):
        # TODO: support LDP-based loading
        if generated[rdflib.RDF.type:pv.File]:
            fmt = generated.value(dc['format'])
            if fmt is not None:
                fmt = fmt.value
                if fmt in formats:
                    fmt = formats[fmt]
            #print fmt
            with _load_open(generated) as o:
                file_graph.serialize(o, format=fmt)

        elif generated[rdflib.RDF.type:sd.Service]:
            from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore
            endpoint = generated.value(sd.endpoint, default=generated).identifier
            store = SPARQLUpdateStore(endpoint, endpoint, autocommit=False)
            endpoint_graph = rdflib.Dataset(store=store, identifier=generated.identifier, default_union=True)
            endpoint_graph.addN(file_graph.quads())
            endpoint_graph.commit()


actions = {
    setl.Extract : extract,
    setl.Transform : json_transform,
    setl.Load : load,
    setl.PythonScript : create_python_function,
    setl.IsEmpty : isempty
}

def _setl(setl_graph):
    """Internal implementation function. Use run_setl() instead.

    This function is deprecated and maintained for backward compatibility.

    Args:
        setl_graph: A ConjunctiveGraph containing the SETL script.

    Returns:
        dict: A dictionary of resources created during the SETL process.
    """
    import warnings
    warnings.warn(
        "_setl() is deprecated and will be removed in a future version. "
        "Use run_setl() instead, which provides the same functionality with better documentation.",
        DeprecationWarning,
        stacklevel=2
    )
    return run_setl(setl_graph)


def run_setl(setl_graph):
    """Execute a SETL (Semantic Extract, Transform, Load) script.

    This is the main entry point for programmatically running SETL scripts.
    It processes a SETL graph containing extraction, transformation, and loading
    instructions for working with RDF data.

    Args:
        setl_graph (ConjunctiveGraph): A ConjunctiveGraph containing the SETL script
            in RDF format. The graph should define resources with types from the
            SETL vocabulary (http://purl.org/twc/vocab/setl/) including:
            - setl:Extract: Extract data from sources
            - setl:Transform: Transform data using JSON-LD templates
            - setl:Load: Load data to destinations

    Returns:
        dict: A dictionary mapping resource URIs to their generated content.
            The dictionary contains:
            - Extracted data (DataFrames, RDF graphs, etc.)
            - Transformed RDF graphs
            - References to action functions

    Example:
        >>> from rdflib import ConjunctiveGraph
        >>> from setlr import run_setl
        >>>
        >>> # Load a SETL script
        >>> setl_graph = ConjunctiveGraph()
        >>> setl_graph.parse("my_script.setl.ttl", format="turtle")
        >>>
        >>> # Execute the script
        >>> resources = run_setl(setl_graph)
        >>>
        >>> # Access generated resources
        >>> output_graph = resources['http://example.com/output']

    Raises:
        RuntimeError: If there are errors during extraction, transformation, or loading.
        ValueError: If the SETL script contains invalid JSON-LD templates or configuration.

    Note:
        This function initializes the module logger if not already set and processes
        all SETL tasks in topological order based on their dependencies.
    """
    global logger
    if logger is None:
        logger = logging.getLogger(__name__)
    resources = {}
    resources.update(actions)

    tasks = [setl_graph.resource(t) for t in get_order(setl_graph)]

    for task in tasks:
        action = [actions[t.identifier] for t in task[rdflib.RDF.type] if t.identifier in actions]
        if len(action) > 0:
            action[0](task, resources)
    return resources


logger = None

@click.command()
@click.option('--quiet', '-q', is_flag=True, default=False, help="Minimize logging.")
@click.option('-n', default=-1, help="Only process the first N rows.", type=int)
#@click.option('--rdf-validation', default=None, help="Save the RDF validation report to this file.")
#@click.option('--text-validation', default=None, help="Save the text validation report to this file.")
@click.argument('script', type=click.Path(exists=True))
def main(script, rdf_validation=None, text_validation=None, quiet=False, n=-1):
    """Command-line interface for running SETL scripts.

    Args:
        script: Path to the SETL script file (Turtle format).
        quiet: If True, minimize logging output.
        n: Only process the first N rows (-1 for all rows).
    """
    logging_level = logging.DEBUG
    if quiet:
        logging_level = logging.WARNING
    logging.basicConfig(level=logging_level)

    global logger
    logger = logging.getLogger(__name__)

    global run_samples
    run_samples = n
    setl_graph = rdflib.ConjunctiveGraph()
    content = open(script).read()
    setl_graph.parse(data=content, format="turtle")

    run_setl(setl_graph)
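
# CLI usage sketch (assuming the console entry point declared in
# entry_points.txt is named `setlr`):
#   setlr --quiet -n 100 my_script.setl.ttl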