setlr 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
setlr/core.py ADDED
@@ -0,0 +1,1019 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ from rdflib.util import guess_format
5
+ import rdflib
6
+ import csv
7
+ import json
8
+ import sys
9
+ import collections.abc  # also binds collections; .abc is needed for the Iterable check in process_row
10
+ import requests
11
+ import pandas
12
+ import re
13
+ import os
14
+ import click
15
+
16
+ from jinja2 import Template
17
+ from toposort import toposort_flatten
18
+ from numpy import isnan
19
+ import tempfile
20
+ import ijson
21
+ from . import iterparse_filter
22
+ #import xml.etree.ElementTree as ET
23
+ import xml.etree.ElementTree
24
+
25
+ from itertools import chain
26
+
27
+ import zipfile
28
+ import gzip
29
+
30
+ import logging
31
+
32
+ from tqdm import tqdm
33
+
34
+ import hashlib
35
+ from slugify import slugify
36
+ from pyshacl import validate
37
+
38
+ from .trig_store import TrigStore
39
+
40
+ from requests_testadapter import Resp
41
+
42
+
43
+ def hash(value):
44
+ m = hashlib.sha256()
45
+ m.update(value.encode('utf-8'))
46
+ return m.hexdigest()
47
+
48
+ csvw = rdflib.Namespace('http://www.w3.org/ns/csvw#')
49
+ ov = rdflib.Namespace('http://open.vocab.org/terms/')
50
+ setl = rdflib.Namespace('http://purl.org/twc/vocab/setl/')
51
+ prov = rdflib.Namespace('http://www.w3.org/ns/prov#')
52
+ pv = rdflib.Namespace('http://purl.org/net/provenance/ns#')
53
+ sp = rdflib.Namespace('http://spinrdf.org/sp#')
54
+ sd = rdflib.Namespace('http://www.w3.org/ns/sparql-service-description#')
55
+ dc = rdflib.Namespace('http://purl.org/dc/terms/')
56
+ void = rdflib.Namespace('http://rdfs.org/ns/void#')
57
+ shacl = rdflib.Namespace('http://www.w3.org/ns/shacl#')
58
+ api_vocab = rdflib.Namespace('http://purl.org/linked-data/api/vocab#')
59
+
60
+ sys.setrecursionlimit(10000)
61
+
62
+
63
+ # Regex pattern for extracting Jinja2 template variables (compiled once for performance)
64
+ TEMPLATE_VAR_PATTERN = re.compile(r'\{\{([^}]+)\}\}')
65
+
66
+ def camelcase(s):
67
+ return slugify(s).title().replace("-","")
68
+
69
+ class LocalFileAdapter(requests.adapters.HTTPAdapter):
70
+ def build_response_from_file(self, request):
71
+ file_path = request.url[7:]
72
+ with open(file_path, 'rb') as file:
73
+ buff = bytearray(os.path.getsize(file_path))
74
+ file.readinto(buff)
75
+ resp = Resp(buff)
76
+ r = self.build_response(request, resp)
77
+ return r
78
+ def send(self, request, stream=False, timeout=None,
79
+ verify=True, cert=None, proxies=None):
80
+ return self.build_response_from_file(request)
81
+
82
+ requests_session = requests.session()
83
+ requests_session.mount('file://', LocalFileAdapter())
84
+ requests_session.mount('file:///', LocalFileAdapter())
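+ # The adapter above lets the shared session fetch file:// URLs through the
+ # same requests interface used for HTTP. A minimal sketch (hypothetical path):
+ #
+ #     resp = requests_session.get('file:///tmp/example.csv')
+ #     resp.content  # raw bytes of the local file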
85
+
86
+ datatypeConverters = collections.defaultdict(lambda: str)
87
+ datatypeConverters.update({
88
+ rdflib.XSD.string: str,
89
+ rdflib.XSD.decimal: float,
90
+ rdflib.XSD.integer: int,
91
+ rdflib.XSD.float: float,
92
+ rdflib.XSD.double: float
93
+ })
94
+
95
+ run_samples = -1
96
+
97
+ _rdf_formats_to_guess = [
98
+ 'xml',
99
+ 'json-ld',
100
+ 'trig',
101
+ 'nquads',
102
+ 'trix'
103
+ ]
104
+
105
+
106
+ def read_csv(location, result):
107
+ args = dict(
108
+ sep = result.value(csvw.delimiter, default=rdflib.Literal(",")).value,
109
+ #header = result.value(csvw.headerRow, default=rdflib.Literal(0)).value),
110
+ skiprows = result.value(csvw.skipRows, default=rdflib.Literal(0)).value,
111
+ dtype=str,
112
+ # dtype = object # Does not seem to play well with future and python2/3 conversion
113
+ )
114
+ if result.value(csvw.header):
115
+ args['header'] = [0]
116
+ with get_content(location, result) as fo:
117
+ df = pandas.read_csv(fo, encoding='utf-8', **args)
118
+ #logger.debug("Loaded %s", location)
119
+ return df
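+ # read_csv maps csvw options on the extract resource to pandas.read_csv
+ # arguments; every cell is read as a string. A sketch of the corresponding
+ # Turtle configuration (illustrative names and values, not from a real script):
+ #
+ #     <urn:example:table> a csvw:Table ;
+ #         csvw:delimiter "," ;
+ #         csvw:skipRows 0 ;
+ #         csvw:url <file:///tmp/data.csv> .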
120
+
121
+ def read_graph(location, result, g = None):
122
+ if g is None:
123
+ g = rdflib.ConjunctiveGraph()
124
+ graph = rdflib.ConjunctiveGraph(store=g.store, identifier=result.identifier)
125
+ if len(graph) == 0:
126
+ data = get_content(location, result).read()
127
+ f = guess_format(location)
128
+ for fmt in [f] + _rdf_formats_to_guess:
129
+ try:
130
+ graph.parse(data=data, format=fmt)
131
+ break
132
+ except Exception:
133
+ pass
134
+ if len(graph) == 0:
135
+ logger.error("Could not parse graph: %s", location)
136
+ if result[rdflib.RDF.type:rdflib.OWL.Ontology]:
137
+ for ontology in graph.subjects(rdflib.RDF.type, rdflib.OWL.Ontology):
138
+ imports = [graph.resource(x) for x in graph.objects(ontology, rdflib.OWL.imports)]
139
+ for i in imports:
140
+ read_graph(i.identifier, i, g = g)
141
+ return g
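+ # read_graph tries the format guessed from the file extension first, then
+ # falls back through common RDF serializations until one parses. For inputs
+ # typed owl:Ontology it also loads the owl:imports closure recursively.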
142
+
143
+ class FileLikeFromIter(object):
144
+ _closed = False
145
+
146
+ def __init__(self, content_iter):
147
+ self.iter = content_iter
148
+ self.data = b''
149
+
150
+ def __iter__(self):
151
+ return self.iter
152
+
153
+ def readable(self):
154
+ return True
155
+
156
+ def writable(self):
157
+ return False
158
+
159
+ def seekable(self):
160
+ return False
161
+
162
+ def closed(self):
163
+ if self._closed:
164
+ return True
165
+ if len(self.data) > 0:
166
+ return False
167
+ try:
168
+ self.data = next(self.iter)
169
+ except StopIteration:
170
+ self._closed = True  # assigning to self.closed would shadow this method
171
+ return True
172
+ return False
173
+
174
+ # Enter and Exit are needed to allow this to work with with
175
+ def __enter__(self):
176
+ return self
177
+
178
+ # Could be improved for better error/exception handling
179
+ def __exit__(self, err_type, value, traceback):
180
+ pass
181
+
182
+ def read(self, n=None):
183
+ if n is None:
184
+ return self.data + b''.join(line for line in self.iter)
185
+ else:
186
+ while len(self.data) < n:
187
+ try:
188
+ self.data = b''.join((self.data, next(self.iter)))
189
+ except StopIteration:
190
+ break
191
+ result, self.data = self.data[:n], self.data[n:]
192
+ return result
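+ # FileLikeFromIter adapts a byte-chunk iterator (e.g. the iter_content of a
+ # streaming requests response) into a minimal readable file object, so HTTP
+ # responses can be streamed into pandas, ijson, or the XML parser.
+ # A minimal usage sketch (hypothetical URL):
+ #
+ #     chunks = requests.get('http://example.com/data', stream=True).iter_content(1024)
+ #     fo = FileLikeFromIter(chunks)
+ #     fo.read(10)  # buffers chunks until 10 bytes are available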
193
+
194
+ def _open_local_file(location):
195
+ if location.startswith("file://"):
196
+ if os.name == 'nt': # skip the initial slash so Windows drive paths resolve
197
+ return open(location.replace('file:///','').replace('file://',''),'rb')
198
+ else:
199
+ return open(location.replace('file://',''),'rb')
200
+
201
+ content_handlers = [
202
+ _open_local_file,
203
+ lambda location: FileLikeFromIter(requests.get(location,stream=True).iter_content(1024*1024))
204
+ ]
205
+
206
+ def get_content(location, result):
207
+ response = None
208
+ for handler in content_handlers:
209
+ response = handler(location)
210
+ if response is not None:
211
+ break
212
+ if result[rdflib.RDF.type:setl.Tempfile]:
213
+ response = to_tempfile(response)  # spool to disk; result must stay the RDF resource for the type checks below
214
+
215
+ for t in result[rdflib.RDF.type]:
216
+ # Do we know how to unpack this?
217
+ if t.identifier in unpackers:
218
+ response = unpackers[t.identifier](response)
219
+ return response
220
+
221
+ def to_tempfile(f):
222
+ tf = tempfile.TemporaryFile()
223
+ logger.debug("Writing %s to disk.", f)
224
+ for chunk in f:
225
+ if chunk: # filter out keep-alive new chunks
226
+ tf.write(chunk)
227
+ tf.seek(0)
228
+ logger.debug("Finished writing %s to disk.", f)
229
+ return tf
230
+
231
+ def unpack_zipfile(f):
232
+ zf = zipfile.ZipFile(f, mode='r')
233
+ files = zf.infolist()
234
+ return zf.open(files[0])
235
+
236
+ unpackers = {
237
+ # setl.Tempfile : lambda x: x,
238
+ setl.ZipFile : lambda x: unpack_zipfile(to_tempfile(x)),
239
+ setl.GZipFile : lambda f: gzip.GzipFile(fileobj=f,mode='r')
240
+ }
241
+
242
+ packers = {
243
+ # setl.Tempfile : lambda x: x,
244
+ setl.GZipFile : lambda f: gzip.GzipFile(fileobj=f,mode='wb')
245
+ }
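+ # Sources typed setl:ZipFile are spooled to a temporary file first (zipfile
+ # needs a seekable object) and the first archive member is opened; setl:GZipFile
+ # inputs are decompressed as a stream. packers is the write-side mirror used by
+ # _load_open when serializing generated files.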
246
+
247
+ def read_excel(location, result):
248
+ args = dict(
249
+ sheet_name = result.value(setl.sheetname, default=rdflib.Literal(0)).value,
250
+ header = [int(x) for x in result.value(csvw.headerRow, default=rdflib.Literal('0')).value.split(',')],
251
+ skiprows = result.value(csvw.skipRows, default=rdflib.Literal(0)).value
252
+ )
253
+ if result.value(csvw.header):
254
+ args['header'] = [result.value(csvw.header).value]
255
+ with get_content(location, result) as fo:
256
+ df = pandas.read_excel(fo, encoding='utf-8', **args)
257
+ return df
258
+
259
+ def read_xml(location, result):
260
+ validate_dtd = False
261
+ if result[rdflib.RDF.type:setl.DTDValidatedXML]:
262
+ validate_dtd = True
263
+ f = iterparse_filter.IterParseFilter(validate_dtd=validate_dtd)
264
+ if result.value(setl.xpath) is None:
265
+ logger.debug("no xpath to select on from %s", location)
266
+ f.iter_end("/*")
267
+ for xp in result[setl.xpath]:
268
+ f.iter_end(xp.value)
269
+ with get_content(location, result) as fo:
270
+ for (i, (event, ele)) in enumerate(tqdm(f.iterparse(fo))):
271
+ yield i, ele
272
+
273
+
274
+ def read_json(location, result):
275
+ selector = result.value(api_vocab.selector)
276
+ if selector is not None:
277
+ selector = selector.value
278
+ else:
279
+ selector = ""
280
+ with get_content(location, result) as fo:
281
+ yield from enumerate(tqdm(ijson.items(fo, selector)))
282
+
283
+
284
+ extractors = {
285
+ setl.XPORT : lambda location, result: pandas.read_sas(get_content(location, result), format='xport'),
286
+ setl.SAS7BDAT : lambda location, result: pandas.read_sas(get_content(location, result), format='sas7bdat'),
287
+ setl.Excel : read_excel,
288
+ csvw.Table : read_csv,
289
+ rdflib.OWL.Ontology : read_graph,
290
+ void.Dataset : read_graph,
291
+ setl.JSON : read_json,
292
+ setl.XML : read_xml,
293
+ rdflib.URIRef("https://www.iana.org/assignments/media-types/text/plain") : lambda location, result: get_content(location, result)
294
+ }
295
+
296
+
297
+ try:
298
+ from bs4 import BeautifulSoup
299
+ extractors[setl.HTML] = lambda location, result: BeautifulSoup(get_content(location, result).read(), 'html.parser')
300
+ except Exception:
301
+ pass
302
+
303
+
304
+ def load_csv(csv_resource):
305
+ column_descriptions = {}
306
+ for col in csv_resource[csvw.column]:
307
+ label = col.value(rdflib.RDFS.label).value
308
+ column_descriptions[label] = col
309
+ csv_graph = rdflib.Graph(identifier=csv_resource)
310
+ s = [x for x in csv.reader(open(str(csv_resource.value(csvw.url).identifier).replace("file://","")),
311
+ delimiter=str(csv_resource.value(csvw.delimiter,default=rdflib.Literal(",")).value),
312
+ quotechar=str(csv_resource.value(csvw.quoteChar,default=rdflib.Literal('"')).value))]
313
+ header = None
314
+ properties = []
315
+ propertyMap = {}
316
+ skip_value = csv_resource.value(csvw.null)
317
+ if skip_value is not None:
318
+ skip_value = skip_value.value
319
+ for i, r in enumerate(s):
320
+ if header is None:
321
+ header = r
322
+ for j, h in enumerate(header):
323
+ col_desc = None
324
+ if h in column_descriptions:
325
+ col_desc = column_descriptions[h]
326
+ col = csv_graph.resource(rdflib.URIRef("urn:col_"+str(h)))
327
+ col.add(rdflib.RDFS.label, rdflib.Literal(h))
328
+ col.add(ov.csvCol, rdflib.Literal(j))
329
+ if col_desc is not None:
330
+ col.add(rdflib.RDFS.range, col_desc.value(rdflib.RDFS.range, default=rdflib.XSD.string))
331
+ properties.append(col)
332
+ propertyMap[h] = col
333
+ continue
334
+ res = csv_graph.resource(csv_resource.identifier+"_row_"+str(i))
335
+ res.add(rdflib.RDF.type, csvw.Row)
336
+ res.add(csvw.rownum, rdflib.Literal(i))
337
+ for j, value in enumerate(r):
338
+ if skip_value is not None and skip_value == value:
339
+ continue
340
+ #print i, j, value
341
+ prop = properties[j]
342
+ datatype = prop.value(rdflib.RDFS['range'], default=rdflib.XSD.string)
343
+ lit = rdflib.Literal(value, datatype=datatype.identifier)
344
+ #print i, prop.identifier, lit.n3()
345
+ res.add(prop.identifier, lit)
346
+ logger.debug("Table has %s rows, %s columns, and %s triples", len(s), len(header), len(csv_graph))
347
+ return csv_graph
348
+
349
+ formats = {
350
+ None:'xml',
351
+ "application/rdf+xml":'xml',
352
+ "text/rdf":'xml',
353
+ 'text/turtle':'turtle',
354
+ 'application/turtle':'turtle',
355
+ 'application/x-turtle':'turtle',
356
+ 'text/plain':'nt',
357
+ 'text/n3':'n3',
358
+ 'application/trig':'trig',
359
+ 'application/json':'json-ld'
360
+ }
361
+
362
+ def create_python_function(f, resources):
363
+ global_vars = {'this' : f, 'resources': resources}
364
+ local_vars = {}
365
+ script = f.value(prov.value)
366
+ for qd in f[prov.qualifiedDerivation]:
367
+ entity = resources[qd.value(prov.entity).identifier]
368
+ name = qd.value(prov.hadRole).value(dc.identifier)
369
+ local_vars[name.value] = entity
370
+ exec(script.value, local_vars, global_vars)
371
+ resources[f.identifier] = global_vars['result']
372
+
373
+ def get_order(setl_graph):
374
+ nodes = collections.defaultdict(set)
375
+
376
+ for typ in actions:
377
+ for task in setl_graph.subjects(rdflib.RDF.type, typ):
378
+ task = setl_graph.resource(task)
379
+ for used in task[prov.used]:
380
+ nodes[task.identifier].add(used.identifier)
381
+
382
+ for usage in task[prov.qualifiedUsage]:
383
+ used = usage.value(prov.entity)
384
+ nodes[task.identifier].add(used.identifier)
385
+ for generated in task.subjects(prov.wasGeneratedBy):
386
+ nodes[generated.identifier].add(task.identifier)
387
+ for derivation in task[prov.qualifiedDerivation]:
388
+ derived = derivation.value(prov.entity)
389
+ nodes[task.identifier].add(derived.identifier)
390
+
391
+ return toposort_flatten(nodes)
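+ # get_order builds a dependency graph from the provenance edges: a task
+ # depends on everything it prov:used, and whatever prov:wasGeneratedBy a task
+ # depends on that task. toposort_flatten then yields an execution order in
+ # which, e.g., an Extract always runs before the Transform that uses its
+ # output, which runs before the Load that persists it.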
392
+
393
+ def extract(e, resources):
394
+ logger.info('Extract %s',e.identifier)
395
+ used = e.value(prov.used)
396
+ for result in e.subjects(prov.wasGeneratedBy):
397
+ if used is None:
398
+ used = result
399
+ for t in result[rdflib.RDF.type]:
400
+ # Do we know how to generate this?
401
+ if t.identifier in extractors:
402
+ logger.info("Using %s", used.identifier)
403
+ resources[result.identifier] = extractors[t.identifier](used.identifier, result)
404
+ return resources[result.identifier]
405
+
406
+ def isempty(value):
407
+ try:
408
+ return isnan(value)
409
+ except (TypeError, ValueError):
410
+ return value is None
411
+
412
+ def clone(value):
413
+ '''This is only a JSON-level cloning of objects. Atomic objects are invariant, and don't need to be cloned.'''
414
+ if isinstance(value, list):
415
+ return [x for x in value]
416
+ elif isinstance(value, dict):
417
+ return dict(value)
418
+ else:
419
+ return value
420
+
421
+ functions = {}
422
+ def get_function(expr, local_keys):
423
+ used_local_keys = [k for k in local_keys if k in expr]
424
+ key = tuple([expr]+sorted(used_local_keys))
425
+ if key not in functions:
426
+ script = '''lambda %s,**kwargs: %s'''% (', '.join(sorted(used_local_keys)), expr)
427
+ #print(script)
428
+ fn = eval(script)
429
+ fn.__name__ = expr.encode("ascii", "ignore").decode('utf8')
430
+ functions[key] = fn
431
+ return functions[key]
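+ # get_function compiles an inline template expression into a cached lambda
+ # whose parameters are the environment keys mentioned in the expression.
+ # For example (hypothetical expression and keys):
+ #
+ #     fn = get_function("int(row['age']) > 18", ['row', 'name'])
+ #     # equivalent to: lambda row, **kwargs: int(row['age']) > 18
+ #     fn(row={'age': '42'}, name=0)  # -> True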
432
+
433
+ templates = {}
434
+ def get_template(templ):
435
+ if templ not in templates:
436
+ t = Template(templ)
437
+ templates[templ] = t
438
+ return templates[templ]
439
+
440
+ def flatten_lists(o):
441
+ if isinstance(o, list):
442
+ result = []
443
+ for x in o:
444
+ flattened = flatten_lists(x)
445
+ if isinstance(flattened, list):
446
+ result.extend(flattened)
447
+ else:
448
+ result.append(flattened)
449
+ return result
450
+ elif isinstance(o, dict):
451
+ for key in o.keys():
452
+ o[key] = flatten_lists(o[key])
453
+ return o
454
+ else:
455
+ return o
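+ # flatten_lists recursively splices nested lists (and list values inside
+ # dicts) into a single level, e.g. [1, [2, [3]], {'a': [[4], 5]}] becomes
+ # [1, 2, 3, {'a': [4, 5]}].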
456
+
457
+ def process_row(row, template, rowname, table, resources, transform, variables):
458
+ result = []
459
+ e = {
460
+ 'row':row,
461
+ 'name': rowname,
462
+ 'table': table,
463
+ 'resources': resources,
464
+ 'template': template,
465
+ "transform": transform,
466
+ "setl_graph": transform.graph,
467
+ "isempty":isempty,
468
+ "slugify" : slugify,
469
+ "camelcase" : camelcase,
470
+ "hash":hash,
471
+ "isinstance":isinstance,
472
+ "str":str,
473
+ "float":float,
474
+ "int":int,
475
+ "chain": lambda x: chain(*x),
476
+ "list":list
477
+ }
478
+ e.update(variables)
479
+ e.update(rdflib.__dict__)
480
+ todo = [[x, result, e] for x in template]
481
+
482
+ while len(todo) > 0:
483
+ task, parent, env = todo.pop()
484
+ key = None
485
+ value = task
486
+ this = None
487
+ if isinstance(parent, dict):
488
+ if len(task) != 2:
489
+ logger.debug(task)
490
+ key, value = task
491
+ kt = get_template(key)
492
+ key = kt.render(**env)
493
+ if isinstance(value, dict):
494
+ if '@if' in value:
495
+ try:
496
+ fn = get_function(value['@if'], list(env.keys()))
497
+ incl = fn(**env)
498
+ if incl is None or not incl:
499
+ continue
500
+ except KeyError:
501
+ continue
502
+ except AttributeError:
503
+ continue
504
+ except TypeError:
505
+ continue
506
+ except Exception as e:
507
+ logger.error("=" * 80)
508
+ logger.error("Error evaluating @if conditional: %s", value['@if'])
509
+ transform_obj = env.get('transform', {})
510
+ transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown'
511
+ logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown'))
512
+ logger.error("Error type: %s", type(e).__name__)
513
+ logger.error("Error message: %s", str(e))
514
+ logger.error("Row-specific variables:")
515
+ for key in ['row', 'name']:
516
+ if key in env:
517
+ v = env[key]
518
+ try:
519
+ logger.error(" %s: %s", key, str(v)[:200])
520
+ except Exception:
521
+ logger.error(" %s: <%s>", key, type(v).__name__)
522
+ logger.error("=" * 80)
523
+ raise RuntimeError(f"Error in @if conditional '{value['@if']}': {type(e).__name__}: {str(e)}") from e
524
+ if '@for' in value:
525
+ f = value['@for']
526
+ if isinstance(f, list):
527
+ f = ' '.join(f)
528
+ variable_list, expression = f.split(" in ", 1)
529
+ variable_list = re.split(r',\s+', variable_list.strip())
530
+ val = value
531
+ if '@do' in value:
532
+ val = value['@do']
533
+ else:
534
+ del val['@for']
535
+ try:
536
+ fn = get_function(expression, list(env.keys()))
537
+ values = fn(**env)
538
+ if values is not None:
539
+ for v in values:
540
+ if len(variable_list) == 1:
541
+ v = [v]
542
+ new_env = dict(env)
543
+ for i, variable in enumerate(variable_list):
544
+ new_env[variable] = v[i]
545
+ child = clone(val)
546
+ todo.append((child, parent, new_env))
547
+ except KeyError:
548
+ pass
549
+ except Exception as e:
550
+ logger.error("=" * 80)
551
+ logger.error("Error in @for loop: %s", value['@for'])
552
+ transform_obj = env.get('transform', {})
553
+ transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown'
554
+ logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown'))
555
+ logger.error("Error type: %s", type(e).__name__)
556
+ logger.error("Error message: %s", str(e))
557
+ logger.error("Expression: %s", expression)
558
+ logger.error("Variables to assign: %s", variable_list)
559
+ logger.error("Available variables: %s", sorted([k for k in env.keys() if not k.startswith('_')]))
560
+ logger.error("=" * 80)
561
+ raise RuntimeError(f"Error in @for loop '{value['@for']}': {type(e).__name__}: {str(e)}") from e
562
+ continue
563
+ if '@with' in value:
564
+ f = value['@with']
565
+ if isinstance(f, list):
566
+ f = ' '.join(f)
567
+ expression, variable_list = f.split(" as ", 1)
568
+ variable_list = re.split(r',\s+', variable_list.strip())
569
+ val = value
570
+ if '@do' in value:
571
+ val = value['@do']
572
+ else:
573
+ del val['@with']
574
+ try:
575
+ fn = get_function(expression, list(env.keys()))
576
+ v = fn(**env)
577
+ if v is not None:
578
+ if (len(variable_list) == 1 and not (isinstance(v, collections.abc.Iterable) and not isinstance(v, str))):
579
+ v = [v]
580
+ new_env = dict(env)
581
+ for i, variable in enumerate(variable_list):
582
+ new_env[variable] = v[i]
583
+ child = clone(val)
584
+ todo.append((child, parent, new_env))
585
+ except KeyError:
586
+ pass
587
+ except Exception as e:
588
+ logger.error("=" * 80)
589
+ logger.error("Error in @with expression: %s", value['@with'])
590
+ transform_obj = env.get('transform', {})
591
+ transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown'
592
+ logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown'))
593
+ logger.error("Error type: %s", type(e).__name__)
594
+ logger.error("Error message: %s", str(e))
595
+ logger.error("Expression: %s", expression)
596
+ logger.error("Variables to assign: %s", variable_list)
597
+ logger.error("Available variables: %s", sorted([k for k in env.keys() if not k.startswith('_')]))
598
+ logger.error("=" * 80)
599
+ raise RuntimeError(f"Error in @with expression '{value['@with']}': {type(e).__name__}: {str(e)}") from e
600
+ continue
601
+ this = {}
602
+ for child in list(value.items()):
603
+ if child[0] == '@if':
604
+ continue
605
+ if child[0] == '@for':
606
+ continue
607
+ todo.append((child, this, env))
608
+ elif isinstance(value, list):
609
+ this = []
610
+ for child in value:
611
+ todo.append((child, this, env))
612
+ elif isinstance(value, str):
613
+ try:
614
+ template = get_template(str(value))
615
+ this = template.render(**env)
616
+ except Exception as e:
617
+ logger.error("=" * 80)
618
+ logger.error("Error rendering Jinja2 template: %s", value[:200] if len(value) > 200 else value)
619
+ transform_obj = env.get('transform', {})
620
+ transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown'
621
+ logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown'))
622
+ logger.error("Error type: %s", type(e).__name__)
623
+ logger.error("Error message: %s", str(e))
624
+ logger.error("Template variables referenced in template:")
625
+ # Try to extract variable references from the template
626
+ matches = TEMPLATE_VAR_PATTERN.findall(value)
627
+ if matches:
628
+ for match in matches:
629
+ var_name = match.strip().split('.')[0].split('[')[0].strip()
630
+ if var_name in env:
631
+ val = env[var_name]
632
+ if type(val).__name__ == 'Element':
633
+ # XML Element
634
+ try:
635
+ val = xml.etree.ElementTree.tostring(val).decode('utf-8', errors='replace')[:200]
636
+ except Exception:
637
+ val = "<XML Element>"
638
+ else:
639
+ try:
640
+ val = str(val)[:200]
641
+ except Exception:
642
+ val = f"<{type(val).__name__}>"
643
+ logger.error(" %s = %s", var_name, val)
644
+ else:
645
+ logger.error(" %s = <NOT FOUND>", var_name)
646
+ logger.error("=" * 80)
647
+ raise RuntimeError(f"Error rendering template: {type(e).__name__}: {str(e)}") from e
648
+ else:
649
+ this = value
650
+
651
+ if key is not None:
652
+ parent[key] = this
653
+ else:
654
+ parent.append(this)
655
+
656
+ return flatten_lists(result)
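+ # process_row interprets a JSON-LD template against a single row. Strings are
+ # rendered as Jinja2 templates against the row environment, and dicts may carry
+ # control directives: @if (emit the node only when the expression is truthy),
+ # @for with an optional @do body (repeat per item), and @with (bind an
+ # expression's result to local names). A minimal template fragment
+ # (illustrative names, not from a real script):
+ #
+ #     {
+ #         "@id": "http://example.com/person/{{row['id']}}",
+ #         "@if": "not isempty(row['id'])",
+ #         "@type": "http://schema.org/Person",
+ #         "http://schema.org/name": "{{row['name']}}"
+ #     }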
657
+
658
+ def json_transform(transform, resources):
659
+ logger.info("Transform %s", transform.identifier)
660
+ tables = [u for u in transform[prov.used]]
661
+ variables = {}
662
+ for usage in transform[prov.qualifiedUsage]:
663
+ used = usage.value(prov.entity)
664
+ role = usage.value(prov.hadRole)
665
+ roleID = role.value(dc.identifier)
666
+ variables[roleID.value] = resources[used.identifier]
667
+ #print "Using", used.identifier, "as", roleID.value
668
+
669
+ generated = list(transform.subjects(prov.wasGeneratedBy))[0]
670
+ logger.info("Generating %s", generated.identifier)
671
+
672
+ connected_downstream_graph = '''
673
+ construct {
674
+ ?target ?p ?o
675
+ } where {
676
+ ?source (<>|!<>)* ?target.
677
+ ?target ?p ?o.
678
+ }
679
+ '''
680
+ shape_graph = rdflib.Graph()
681
+ for shape in transform.objects(dc.conformsTo):
682
+ if shape[rdflib.RDF.type:shacl.NodeShape] or shape[rdflib.RDF.type:shacl.PropertyShape]:
683
+ logger.info("Validating against SHACL shape %s", shape.identifier)
684
+ shape_graph += transform.graph.query(connected_downstream_graph,
685
+ initBindings={"source":shape.identifier})
686
+ if generated.identifier in resources:
687
+ result = resources[generated.identifier]
688
+ else:
689
+ result = rdflib.ConjunctiveGraph()
690
+ if generated[rdflib.RDF.type : setl.Persisted]:
691
+ store = TrigStore()
692
+ result = rdflib.ConjunctiveGraph(store=store)
693
+ if generated[rdflib.RDF.type : setl.Persisted]:
694
+ tempdir = tempfile.mkdtemp()  # mkdtemp avoids the race in the deprecated mktemp
695
+ logger.info("Persisting %s to %s", generated.identifier, tempdir)
696
+ result.store.open(tempdir, True)
697
+ s = transform.value(prov.value).value
698
+ try:
699
+ jslt = json.loads(s)
700
+ except json.JSONDecodeError as e:
701
+ logger.error("Error parsing JSON-LD template for transform %s", transform.identifier)
702
+ lineno = getattr(e, 'lineno', 0)
703
+ colno = getattr(e, 'colno', 0)
704
+ msg = getattr(e, 'msg', str(e))
705
+ logger.error("JSON parsing error at line %d, column %d: %s", lineno, colno, msg)
706
+ # Show context around the error (8 lines before, 3 after for better bracket matching)
707
+ lines = s.split("\n")
708
+ start_line = max(0, lineno - 8)
709
+ end_line = min(len(lines), lineno + 3)
710
+ logger.error("Template context:")
711
+ for i in range(start_line, end_line):
712
+ prefix = ">>> " if i == lineno - 1 else " "
713
+ logger.error("%s%d: %s", prefix, i + 1, lines[i])
714
+ raise ValueError(f"Invalid JSON-LD template in transform {transform.identifier}: {msg} at line {lineno}, column {colno}") from e
715
+ except Exception as e:
716
+ logger.error("Error parsing JSON-LD template for transform %s: %s", transform.identifier, str(e))
717
+ logger.error("Template content:\n%s", s[:500]) # Show first 500 chars
718
+ raise ValueError(f"Invalid JSON-LD template in transform {transform.identifier}: {str(e)}") from e
719
+ context = transform.value(setl.hasContext)
720
+ if context is not None:
721
+ context = json.loads(context.value)
722
+ for t in tables:
723
+ logger.info("Using %s", t.identifier)
724
+ table = resources[t.identifier]
725
+ it = table
726
+ if isinstance(table, pandas.DataFrame):
727
+ #if run_samples:
728
+ # table = table.head()
729
+ it = tqdm(table.iterrows(), total=table.shape[0])
730
+ #logger.info("Transforming %s rows.", len(table.index))
731
+ else:
732
+ logger.info("Transform %s", t.identifier)
733
+ for rowname, row in it:
734
+ if run_samples > 0 and rowname >= run_samples:
735
+ break
736
+ try:
737
+ root = None
738
+ data = None
739
+ root = {
740
+ "@id": generated.identifier,
741
+ "@graph": process_row(row, jslt, rowname, table, resources, transform, variables)
742
+ }
743
+ if context is not None:
744
+ root['@context'] = context
745
+
746
+ #logger.debug(json.dumps(root, indent=4))
747
+ #before = len(result)
748
+ #graph = rdflib.ConjunctiveGraph(identifier=generated.identifier)
749
+ #graph.parse(data=json.dumps(root),format="json-ld")
750
+ data = json.dumps(root)
751
+ #del root
752
+
753
+ if len(shape_graph) > 0:
754
+ d = rdflib.ConjunctiveGraph()
755
+ d.parse(data=data,format='json-ld')
756
+ conforms, report, message = validate(d,
757
+ shacl_graph=shape_graph,
758
+ advanced=True,
759
+ debug=False)
760
+ if not conforms:
761
+ logger.warning(message)
762
+ result.parse(data=data, format="json-ld")
763
+ #del data
764
+ #after = len(result)
765
+ #logger.debug("Row "+str(rowname))#+" added "+str(after-before)+" triples.")
766
+ #sys.stdout.flush()
767
+ except Exception as e:
768
+ logger.error("=" * 80)
769
+ logger.error("Error in transform %s while processing row %s", transform.identifier, rowname)
770
+ if isinstance(table, pandas.DataFrame):
771
+ # Format row data with better NaN handling
772
+ row_dict = {}
773
+ for key, value in dict(row).items():
774
+ if pandas.isna(value):
775
+ row_dict[key] = "<empty/missing>"
776
+ else:
777
+ row_dict[key] = value
778
+ logger.error("Row data: %s", row_dict)
779
+ else:
780
+ logger.error("Row identifier: %s", rowname)
781
+
782
+ # Try to provide more specific error information
783
+ error_type = type(e).__name__
784
+ if "JSON-LD" in str(e) or "json" in str(e).lower():
785
+ logger.error("JSON-LD processing error: %s", str(e))
786
+ if data is not None:
787
+ logger.error("Generated JSON-LD (first 1000 chars):\n%s", data[:1000])
788
+ elif hasattr(e, 'lineno'):
789
+ logger.error("%s at line %d: %s", error_type, e.lineno, str(e))
790
+ else:
791
+ logger.error("%s: %s", error_type, str(e))
792
+
793
+ logger.error("=" * 80)
794
+ raise RuntimeError(f"Failed to transform row {rowname} in transform {transform.identifier}: {error_type}: {str(e)}") from e
795
+
796
+ resources[generated.identifier] = result
797
+
798
+ def transform(transform_resource, resources):
799
+ logger.info('Transforming %s',transform_resource.identifier)
800
+
801
+ transform_graph = rdflib.ConjunctiveGraph()
802
+ for result in transform_resource.subjects(prov.wasGeneratedBy):  # outputs declared in the SETL script, not the empty working graph
803
+ transform_graph = rdflib.ConjunctiveGraph(identifier=result.identifier)
804
+
805
+ used = set(transform_resource[prov.used])
806
+
807
+ for csv_file in [u for u in used if u[rdflib.RDF.type:csvw.Table]]:
808
+ csv_graph = rdflib.Graph(store=transform_graph.store,
809
+ identifier=csv_file)
810
+ csv_graph += resources[csv_file.identifier]
811
+
812
+
813
+ for script in [u for u in used if u[rdflib.RDF.type:setl.PythonScript]]:
814
+ logger.info("Script: %s", script.identifier)
815
+ s = script.value(prov.value).value
816
+ local_vars = dict(graph = transform_graph, setl_graph = transform_resource.graph)
817
+ global_vars = dict()
818
+ exec(s, global_vars, local_vars)
819
+
827
+ for update in [u for u in used if u[rdflib.RDF.type:sp.Update]]:
828
+ logger.info("Update: %s", update.identifier)
829
+ query = update.value(prov.value).value
830
+ transform_graph.update(query)
831
+
832
+ for construct in [u for u in used if u[rdflib.RDF.type:sp.Construct]]:
833
+ logger.info("Construct: %s", construct.identifier)
834
+ query = construct.value(prov.value).value
835
+ g = transform_graph.query(query)
836
+ transform_graph += g
837
+
838
+ for csv_file in [u for u in used if u[rdflib.RDF.type:csvw.Table]]:
839
+ g = rdflib.Graph(identifier=csv_file.identifier,store=transform_graph.store)
840
+ g.remove((None, None, None))
841
+ transform_graph.store.remove_graph(csv_file.identifier)
842
+
843
+ for result in transform_resource.subjects(prov.wasGeneratedBy):
844
+ resources[result.identifier] = transform_graph
845
+
846
+ def _load_open(generated):
847
+ if generated.identifier.startswith("file://"):
848
+ if os.name == 'nt': # skip the initial slash so Windows drive paths resolve
849
+ filename = generated.identifier.replace('file:///','').replace('file://','')
850
+ else:
851
+ filename = generated.identifier.replace('file://','')
852
+
853
+ fh = open(filename, 'wb')
854
+ for type, pack in packers.items():
855
+ if generated[rdflib.RDF.type : type]:
856
+ return pack(fh)
857
+ return fh
858
+
859
+ def load(load_resource, resources):
860
+ logger.info('Load %s',load_resource.identifier)
861
+ file_graph = rdflib.Dataset(default_union=True)
862
+ for used in load_resource[prov.used]:
863
+ if used[rdflib.RDF.type : setl.Persisted]:
864
+ file_graph = rdflib.Dataset(store='Sleepycat', default_union=True)
865
+ tempdir = tempfile.mkdtemp()
866
+ logger.debug("Gathering %s into %s", load_resource.identifier, tempdir)
867
+ file_graph.store.open(tempdir, True)
868
+ break
869
+ if len(list(load_resource[prov.used])) == 1:
870
+ logger.info("Using %s",load_resource.value(prov.used).identifier)
871
+ file_graph = resources[load_resource.value(prov.used).identifier]
872
+ else:
873
+ for used in load_resource[prov.used]:
874
+ logger.info("Using %s",used.identifier)
875
+ used_graph = resources[used.identifier]
876
+ file_graph.namespace_manager = used_graph.namespace_manager
877
+ #print used_graph.serialize(format="trig")
878
+ file_graph.addN(used_graph.quads())
879
+
880
+ for generated in load_resource.subjects(prov.wasGeneratedBy):
881
+ # TODO: support LDP-based loading
882
+ if generated[rdflib.RDF.type:pv.File]:
883
+ fmt = generated.value(dc['format'])
884
+ if fmt is not None:
885
+ fmt = fmt.value
886
+ if fmt in formats:
887
+ fmt = formats[fmt]
888
+ #print fmt
889
+ with _load_open(generated) as o:
890
+ file_graph.serialize(o, format=fmt)
891
+
892
+ elif generated[rdflib.RDF.type:sd.Service]:
893
+ from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore
894
+ endpoint = generated.value(sd.endpoint, default=generated).identifier
895
+ store = SPARQLUpdateStore(endpoint, endpoint, autocommit=False)
896
+ endpoint_graph = rdflib.Dataset(store=store, identifier=generated.identifier, default_union=True)
897
+ endpoint_graph.addN(file_graph.quads())
898
+ endpoint_graph.commit()
899
+
900
+
901
+ actions = {
902
+ setl.Extract : extract,
903
+ setl.Transform : json_transform,
904
+ setl.Load : load,
905
+ setl.PythonScript : create_python_function,
906
+ setl.IsEmpty : isempty
907
+ }
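+ # actions maps SETL vocabulary types to their handlers; run_setl resolves each
+ # task's rdf:type against this table and dispatches the tasks in topologically
+ # sorted order.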
908
+
909
+ def _setl(setl_graph):
910
+ """Internal implementation function. Use run_setl() instead.
911
+
912
+ This function is deprecated and maintained for backward compatibility.
913
+
914
+ Args:
915
+ setl_graph: A ConjunctiveGraph containing the SETL script.
916
+
917
+ Returns:
918
+ dict: A dictionary of resources created during the SETL process.
919
+ """
920
+ import warnings
921
+ warnings.warn(
922
+ "_setl() is deprecated and will be removed in a future version. "
923
+ "Use run_setl() instead, which provides the same functionality with better documentation.",
924
+ DeprecationWarning,
925
+ stacklevel=2
926
+ )
927
+ return run_setl(setl_graph)
928
+
929
+
930
+ def run_setl(setl_graph):
931
+ """Execute a SETL (Semantic Extract, Transform, Load) script.
932
+
933
+ This is the main entry point for programmatically running SETL scripts.
934
+ It processes a SETL graph containing extraction, transformation, and loading
935
+ instructions for working with RDF data.
936
+
937
+ Args:
938
+ setl_graph (ConjunctiveGraph): A ConjunctiveGraph containing the SETL script
939
+ in RDF format. The graph should define resources with types from the
940
+ SETL vocabulary (http://purl.org/twc/vocab/setl/) including:
941
+ - setl:Extract: Extract data from sources
942
+ - setl:Transform: Transform data using JSON-LD templates
943
+ - setl:Load: Load data to destinations
944
+
945
+ Returns:
946
+ dict: A dictionary mapping resource URIs to their generated content.
947
+ The dictionary contains:
948
+ - Extracted data (DataFrames, RDF graphs, etc.)
949
+ - Transformed RDF graphs
950
+ - References to action functions
951
+
952
+ Example:
953
+ >>> from rdflib import ConjunctiveGraph
954
+ >>> from setlr import run_setl
955
+ >>>
956
+ >>> # Load a SETL script
957
+ >>> setl_graph = ConjunctiveGraph()
958
+ >>> setl_graph.parse("my_script.setl.ttl", format="turtle")
959
+ >>>
960
+ >>> # Execute the script
961
+ >>> resources = run_setl(setl_graph)
962
+ >>>
963
+ >>> # Access generated resources
964
+ >>> output_graph = resources['http://example.com/output']
965
+
966
+ Raises:
967
+ RuntimeError: If there are errors during extraction, transformation, or loading.
968
+ ValueError: If the SETL script contains invalid JSON-LD templates or configuration.
969
+
970
+ Note:
971
+ This function initializes the module logger if not already set and processes
972
+ all SETL tasks in topological order based on their dependencies.
973
+ """
974
+ global logger
975
+ if logger is None:
976
+ logger = logging.getLogger(__name__)
977
+ resources = {}
978
+ resources.update(actions)
979
+
980
+ tasks = [setl_graph.resource(t) for t in get_order(setl_graph)]
981
+
982
+ for task in tasks:
983
+ action = [actions[t.identifier] for t in task[rdflib.RDF.type] if t.identifier in actions]
984
+ if len(action) > 0:
985
+ action[0](task, resources)
986
+ return resources
987
+
988
+
989
+ logger = None
990
+
991
+ @click.command()
992
+ @click.option('--quiet', '-q', is_flag=True, default=False, help="Minimize logging.")
993
+ @click.option('-n', default=-1, help="Only process the first N rows.", type=int)
994
+ #@click.option('--rdf-validation', default=None, help="Save the RDF validation report to this file.")
995
+ #@click.option('--text-validation', default=None, help="Save the text validation report to this file.")
996
+ @click.argument('script', type=click.Path(exists=True))
997
+ def main(script, rdf_validation=None, text_validation=None, quiet=False, n=-1):
998
+ """Command-line interface for running SETL scripts.
999
+
1000
+ Args:
1001
+ script: Path to the SETL script file (Turtle format).
1002
+ quiet: If True, minimize logging output.
1003
+ n: Only process the first N rows (-1 for all rows).
1004
+ """
1005
+ logging_level = logging.DEBUG
1006
+ if quiet:
1007
+ logging_level = logging.WARNING
1008
+ logging.basicConfig(level=logging_level)
1009
+
1010
+ global logger
1011
+ logger = logging.getLogger(__name__)
1012
+
1013
+ global run_samples
1014
+ run_samples = n
1015
+ setl_graph = rdflib.ConjunctiveGraph()
1016
+ with open(script) as f:
1017
+ setl_graph.parse(data=f.read(), format="turtle")
1018
+
1019
+ run_setl(setl_graph)