mustrd 0.1.8__py3-none-any.whl → 0.2.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mustrd/spec_component.py CHANGED
@@ -1,682 +1,617 @@
1
- """
2
- MIT License
3
-
4
- Copyright (c) 2023 Semantic Partners Ltd
5
-
6
- Permission is hereby granted, free of charge, to any person obtaining a copy
7
- of this software and associated documentation files (the "Software"), to deal
8
- in the Software without restriction, including without limitation the rights
9
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
- copies of the Software, and to permit persons to whom the Software is
11
- furnished to do so, subject to the following conditions:
12
-
13
- The above copyright notice and this permission notice shall be included in all
14
- copies or substantial portions of the Software.
15
-
16
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
- SOFTWARE.
23
- """
24
-
25
- import os
26
- from dataclasses import dataclass, field
27
- from itertools import groupby
28
- from pathlib import Path
29
- from typing import Tuple, List, Type
30
-
31
- import pandas
32
- import requests
33
- from rdflib import RDF, Graph, URIRef, Variable, Literal, XSD, util
34
- from rdflib.exceptions import ParserError
35
- from rdflib.term import Node
36
- import logging
37
-
38
- from . import logger_setup
39
- from .mustrdAnzo import get_queries_for_layer, get_queries_from_templated_step, get_spec_component_from_graphmart, get_query_from_querybuilder, get_query_from_step
40
- from .namespace import MUST, TRIPLESTORE
41
- from multimethods import MultiMethod, Default
42
-
43
- log = logger_setup.setup_logger(__name__)
44
-
45
-
46
- @dataclass
47
- class SpecComponent:
48
- pass
49
-
50
-
51
- @dataclass
52
- class GivenSpec(SpecComponent):
53
- value: Graph = None
54
-
55
-
56
- @dataclass
57
- class WhenSpec(SpecComponent):
58
- value: str = None
59
- queryType: URIRef = None
60
- bindings: dict = None
61
-
62
- @dataclass
63
- class AnzoWhenSpec(WhenSpec):
64
- paramQuery: str = None
65
- queryTemplate: str = None
66
-
67
- @dataclass
68
- class ThenSpec(SpecComponent):
69
- value: Graph = Graph()
70
- ordered: bool = False
71
-
72
-
73
- @dataclass
74
- class TableThenSpec(ThenSpec):
75
- value: pandas.DataFrame = field(default_factory=pandas.DataFrame)
76
-
77
-
78
- @dataclass
79
- class SpecComponentDetails:
80
- subject: URIRef
81
- predicate: URIRef
82
- spec_graph: Graph
83
- mustrd_triple_store: dict
84
- spec_component_node: Node
85
- data_source_type: Node
86
- run_config: dict
87
-
88
- def get_path(path_type: str, run_config: dict) -> Path:
89
- try:
90
- if str(run_config[path_type]).startswith("/"):
91
- return run_config[path_type]
92
- else:
93
- return Path(os.path.join(run_config['spec_path'], run_config[path_type]))
94
- except(KeyError):
95
- if str(run_config['data_path']).startswith("/"):
96
- return run_config['data_path']
97
- else:
98
- return Path(os.path.join(run_config['spec_path'], run_config['data_path']))
99
-
100
-
101
- def parse_spec_component(subject: URIRef,
102
- predicate: URIRef,
103
- spec_graph: Graph,
104
- run_config: dict,
105
- mustrd_triple_store: dict) -> GivenSpec | WhenSpec | ThenSpec | TableThenSpec:
106
- # print(f"parse_spec_component {subject=} {predicate=} ")
107
- spec_component_nodes = get_spec_component_nodes(subject, predicate, spec_graph)
108
- # all_data_source_types = []
109
- spec_components = []
110
- for spec_component_node in spec_component_nodes:
111
- data_source_types = get_data_source_types(subject, predicate, spec_graph, spec_component_node)
112
- for data_source_type in data_source_types:
113
- spec_component_details = SpecComponentDetails(
114
- subject=subject,
115
- predicate=predicate,
116
- spec_graph=spec_graph,
117
- mustrd_triple_store=mustrd_triple_store,
118
- spec_component_node=spec_component_node,
119
- data_source_type=data_source_type,
120
- run_config=run_config)
121
- spec_component = get_spec_component(spec_component_details)
122
- if type(spec_component) == list:
123
- spec_components += spec_component
124
- else:
125
- spec_components += [spec_component]
126
-
127
- # all_data_source_types.extend(data_source_types)
128
- # return all_data_source_types
129
- # merge multiple graphs into one, give error if spec config is a TableThen
130
- # print(f"calling multimethod with {spec_components}")
131
- return combine_specs(spec_components)
132
-
133
-
134
- def get_spec_component_type(spec_components: List[SpecComponent]) -> Type[SpecComponent]:
135
- # Get the type of the first object in the list
136
- spec_type = type(spec_components[0])
137
- # Loop through the remaining objects in the list and check their types
138
- for spec_component in spec_components[1:]:
139
- if type(spec_component) != spec_type:
140
- # If an object has a different type, raise an error
141
- raise ValueError("All spec components must be of the same type")
142
-
143
- # If all objects have the same type, return the type
144
- return spec_type
145
-
146
-
147
- def combine_specs_dispatch(spec_components: List[SpecComponent]) -> Type[SpecComponent]:
148
- spec_type = get_spec_component_type(spec_components)
149
- return spec_type
150
-
151
-
152
- combine_specs = MultiMethod("combine_specs", combine_specs_dispatch)
153
-
154
-
155
- @combine_specs.method(GivenSpec)
156
- def _combine_given_specs(spec_components: List[GivenSpec]) -> GivenSpec:
157
- if len(spec_components) == 1:
158
- return spec_components[0]
159
- else:
160
- graph = Graph()
161
- for spec_component in spec_components:
162
- graph += spec_component.value
163
- given_spec = GivenSpec()
164
- given_spec.value = graph
165
- return given_spec
166
-
167
-
168
- @combine_specs.method(WhenSpec)
169
- def _combine_when_specs(spec_components: List[WhenSpec]) -> WhenSpec:
170
- return spec_components
171
-
172
-
173
- @combine_specs.method(ThenSpec)
174
- def _combine_then_specs(spec_components: List[ThenSpec]) -> ThenSpec:
175
- if len(spec_components) == 1:
176
- return spec_components[0]
177
- else:
178
- graph = Graph()
179
- for spec_component in spec_components:
180
- graph += spec_component.value
181
- then_spec = ThenSpec()
182
- then_spec.value = graph
183
- return then_spec
184
-
185
-
186
- @combine_specs.method(TableThenSpec)
187
- def _combine_table_then_specs(spec_components: List[TableThenSpec]) -> TableThenSpec:
188
- if len(spec_components) != 1:
189
- raise ValueError(f"Parsing of multiple components of MUST.then for tables not implemented")
190
- return spec_components[0]
191
-
192
-
193
- @combine_specs.method(Default)
194
- def _combine_specs_default(spec_components: List[SpecComponent]):
195
- raise ValueError(f"Parsing of multiple components of this type not implemented {spec_components}")
196
-
197
- def get_data_source_types(subject: URIRef, predicate: URIRef, spec_graph: Graph, source_node: Node) -> List[Node]:
198
- data_source_types = []
199
- for data_source_type in spec_graph.objects(subject=source_node, predicate=RDF.type):
200
- data_source_types.append(data_source_type)
201
- # data_source_type = spec_graph.value(subject=source_node, predicate=RDF.type)
202
- if len(data_source_types) == 0:
203
- raise ValueError(f"Node has no rdf type {subject} {predicate}")
204
- return data_source_types
205
-
206
- # https://github.com/Semantic-partners/mustrd/issues/99
207
- def get_spec_component_dispatch(spec_component_details: SpecComponentDetails) -> Tuple[Node, URIRef]:
208
- return spec_component_details.data_source_type, spec_component_details.predicate
209
-
210
-
211
- get_spec_component = MultiMethod("get_spec_component", get_spec_component_dispatch)
212
-
213
-
214
- @get_spec_component.method((MUST.InheritedDataset, MUST.given))
215
- def _get_spec_component_inheritedstate_given(spec_component_details: SpecComponentDetails) -> GivenSpec:
216
- spec_component = init_spec_component(spec_component_details.predicate)
217
- return spec_component
218
-
219
-
220
- @get_spec_component.method((MUST.FolderDataset, MUST.given))
221
- def _get_spec_component_folderdatasource_given(spec_component_details: SpecComponentDetails) -> GivenSpec:
222
- spec_component = init_spec_component(spec_component_details.predicate)
223
-
224
- file_name = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
225
- predicate=MUST.fileName)
226
-
227
- path = Path(os.path.join(str(get_path('given_path',spec_component_details.run_config)), str(file_name)))
228
- try:
229
- spec_component.value = Graph().parse(data=get_spec_component_from_file(path))
230
- except ParserError as e:
231
- log.error(f"Problem parsing {path}, error of type {type(e)}")
232
- raise ValueError(f"Problem parsing {path}, error of type {type(e)}")
233
- return spec_component
234
-
235
-
236
- @get_spec_component.method((MUST.FolderSparqlSource, MUST.when))
237
- def _get_spec_component_foldersparqlsource_when(spec_component_details: SpecComponentDetails) -> GivenSpec:
238
- spec_component = init_spec_component(spec_component_details.predicate)
239
-
240
- file_name = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
241
- predicate=MUST.fileName)
242
-
243
- path = Path(os.path.join(str(get_path('when_path',spec_component_details.run_config)), str(file_name)))
244
- spec_component.value = get_spec_component_from_file(path)
245
- spec_component.queryType = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
246
- predicate=MUST.queryType)
247
- return spec_component
248
-
249
-
250
- @get_spec_component.method((MUST.FolderDataset, MUST.then))
251
- def _get_spec_component_folderdatasource_then(spec_component_details: SpecComponentDetails) -> ThenSpec:
252
- spec_component = init_spec_component(spec_component_details.predicate)
253
-
254
- file_name = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
255
- predicate=MUST.fileName)
256
- path = Path(os.path.join(str(get_path('then_path',spec_component_details.run_config)), str(file_name)))
257
-
258
- return load_dataset_from_file(path, spec_component)
259
-
260
- @get_spec_component.method((MUST.FileDataset, MUST.given))
261
- @get_spec_component.method((MUST.FileDataset, MUST.then))
262
- def _get_spec_component_filedatasource(spec_component_details: SpecComponentDetails) -> GivenSpec:
263
- spec_component = init_spec_component(spec_component_details.predicate)
264
- return load_spec_component(spec_component_details, spec_component)
265
-
266
- def load_spec_component(spec_component_details, spec_component):
267
- where_did_i_load_this_spec_from = spec_component_details.spec_graph.value(subject=spec_component_details.subject,
268
- predicate=MUST.specSourceFile)
269
- if (where_did_i_load_this_spec_from == None):
270
- log.error(f"{where_did_i_load_this_spec_from=} was None for test_spec={spec_component_details.subject}, we didn't set the test specifications specSourceFile when loading, spec_graph={spec_component_details.spec_graph}")
271
- file_path = Path(str(spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
272
- predicate=MUST.file)))
273
-
274
- test_spec_file_path = os.path.dirname(where_did_i_load_this_spec_from)
275
-
276
- # first we try local relative to the test_spec_file_path, then we try relative to the path under test
277
- # we intentionally don't try for absolute files, but you should feel free to argue that we should do.
278
- paths = [
279
- Path(test_spec_file_path, file_path),
280
- Path(os.path.join(spec_component_details.run_config['spec_path'], file_path))
281
- ]
282
-
283
- for path in paths:
284
- if (os.path.exists(path)):
285
- return load_dataset_from_file(path, spec_component)
286
-
287
- raise FileNotFoundError(f"Could not find file {file_path=} in any of the {paths=}")
288
-
289
-
290
- def load_dataset_from_file(path: Path, spec_component: ThenSpec) -> ThenSpec:
291
- if path.is_dir():
292
- raise ValueError(f"Path {path} is a directory, expected a file")
293
-
294
- # https://github.com/Semantic-partners/mustrd/issues/94
295
- if path.suffix in {".csv", ".xlsx", ".xls"}:
296
- df = pandas.read_csv(path) if path.suffix == ".csv" else pandas.read_excel(path)
297
- then_spec = TableThenSpec()
298
- then_spec.value = df
299
- return then_spec
300
- else:
301
- try:
302
- file_format = util.guess_format(str(path))
303
- except AttributeError:
304
- raise ValueError(f"Unsupported file format: {path.suffix}")
305
-
306
- if file_format is not None:
307
- g = Graph()
308
- try:
309
- g.parse(data=get_spec_component_from_file(path), format=file_format)
310
- except ParserError as e:
311
- log.error(f"Problem parsing {path}, error of type {type(e)}")
312
- raise ValueError(f"Problem parsing {path}, error of type {type(e)}")
313
- spec_component.value = g
314
- return spec_component
315
-
316
-
317
-
318
- @get_spec_component.method((MUST.FileSparqlSource, MUST.when))
319
- def _get_spec_component_filedatasource_when(spec_component_details: SpecComponentDetails) -> SpecComponent:
320
- spec_component = init_spec_component(spec_component_details.predicate)
321
-
322
- file_path = Path(str(spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
323
- predicate=MUST.file)))
324
- if str(file_path).startswith("/"): # absolute path
325
- path = file_path
326
- else: #relative path
327
- path = Path(os.path.join(spec_component_details.run_config['spec_path'], file_path))
328
- spec_component.value = get_spec_component_from_file(path)
329
-
330
- spec_component.queryType = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
331
- predicate=MUST.queryType)
332
-
333
- return spec_component
334
-
335
-
336
- # @get_spec_component.method((MUST.FileDataset, MUST.then))
337
- # def _get_spec_component_filedatasource_then(spec_component_details: SpecComponentDetails) -> SpecComponent:
338
- # spec_component = init_spec_component(spec_component_details.predicate)
339
-
340
- # file_path = Path(str(spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
341
- # predicate=MUST.file)))
342
- # if str(file_path).startswith("/"): # absolute path
343
- # path = file_path
344
- # else: #relative path
345
- # path = Path(os.path.join(spec_component_details.run_config['spec_path'], file_path))
346
- # return get_then_from_file(path, spec_component)
347
-
348
-
349
- @get_spec_component.method((MUST.TextSparqlSource, MUST.when))
350
- def _get_spec_component_TextSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
351
- spec_component = init_spec_component(spec_component_details.predicate)
352
-
353
- # Get specComponent directly from config file (in text string)
354
- spec_component.value = str(
355
- spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
356
- predicate=MUST.queryText))
357
-
358
- spec_component.bindings = get_when_bindings(spec_component_details.subject, spec_component_details.spec_graph)
359
- spec_component.queryType = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
360
- predicate=MUST.queryType)
361
- return spec_component
362
-
363
-
364
- # https://github.com/Semantic-partners/mustrd/issues/98
365
- @get_spec_component.method((MUST.HttpDataset, MUST.given))
366
- @get_spec_component.method((MUST.HttpDataset, MUST.when))
367
- @get_spec_component.method((MUST.HttpDataset, MUST.then))
368
- def _get_spec_component_HttpDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
369
- spec_component = init_spec_component(spec_component_details.predicate)
370
-
371
- # Get specComponent with http GET protocol
372
- spec_component.value = requests.get(str(
373
- spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
374
- predicate=MUST.dataSourceUrl)).content)
375
- spec_component.queryType = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
376
- predicate=MUST.queryType)
377
- return spec_component
378
-
379
-
380
- @get_spec_component.method((MUST.TableDataset, MUST.then))
381
- def _get_spec_component_TableDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
382
- table_then = TableThenSpec()
383
- # get specComponent from ttl table
384
- table_then.value = get_spec_from_table(spec_component_details.subject, spec_component_details.predicate,
385
- spec_component_details.spec_graph)
386
- table_then.ordered = is_then_select_ordered(spec_component_details.subject, spec_component_details.predicate,
387
- spec_component_details.spec_graph)
388
- return table_then
389
-
390
-
391
- @get_spec_component.method((MUST.EmptyTable, MUST.then))
392
- def _get_spec_component_EmptyTable(spec_component_details: SpecComponentDetails) -> SpecComponent:
393
- spec_component = TableThenSpec()
394
- return spec_component
395
-
396
-
397
- @get_spec_component.method((MUST.EmptyGraph, MUST.then))
398
- def _get_spec_component_EmptyGraph(spec_component_details: SpecComponentDetails) -> SpecComponent:
399
- spec_component = init_spec_component(spec_component_details.predicate)
400
-
401
- return spec_component
402
-
403
-
404
- @get_spec_component.method((MUST.StatementsDataset, MUST.given))
405
- @get_spec_component.method((MUST.StatementsDataset, MUST.then))
406
- def _get_spec_component_StatementsDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
407
- spec_component = init_spec_component(spec_component_details.predicate)
408
-
409
- spec_component.value = Graph().parse(
410
- data=get_spec_from_statements(spec_component_details.subject, spec_component_details.predicate,
411
- spec_component_details.spec_graph))
412
- return spec_component
413
-
414
-
415
- @get_spec_component.method((MUST.AnzoGraphmartDataset, MUST.given))
416
- @get_spec_component.method((MUST.AnzoGraphmartDataset, MUST.then))
417
- def _get_spec_component_AnzoGraphmartDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
418
- spec_component = init_spec_component(spec_component_details.predicate)
419
-
420
- if spec_component_details.mustrd_triple_store["type"] == TRIPLESTORE.Anzo:
421
- # Get GIVEN or THEN from anzo graphmart
422
- graphmart = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
423
- predicate=MUST.graphmart)
424
- layer = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
425
- predicate=MUST.layer)
426
- spec_component.value = get_spec_component_from_graphmart(
427
- triple_store=spec_component_details.mustrd_triple_store,
428
- graphmart=graphmart,
429
- layer=layer)
430
- else:
431
- raise ValueError(f"You must define {TRIPLESTORE.Anzo} to use {MUST.AnzoGraphmartDataset}")
432
-
433
- return spec_component
434
-
435
-
436
- @get_spec_component.method((MUST.AnzoQueryBuilderSparqlSource, MUST.when))
437
- def _get_spec_component_AnzoQueryBuilderSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
438
- spec_component = init_spec_component(spec_component_details.predicate)
439
-
440
- # Get WHEN specComponent from query builder
441
- if spec_component_details.mustrd_triple_store["type"] == TRIPLESTORE.Anzo:
442
- query_folder = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
443
- predicate=MUST.queryFolder)
444
- query_name = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
445
- predicate=MUST.queryName)
446
- spec_component.value = get_query_from_querybuilder(triple_store=spec_component_details.mustrd_triple_store,
447
- folder_name=query_folder,
448
- query_name=query_name)
449
- # If anzo specific function is called but no anzo defined
450
- else:
451
- raise ValueError(f"You must define {TRIPLESTORE.Anzo} to use {MUST.AnzoQueryBuilderSparqlSource}")
452
-
453
- spec_component.queryType = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
454
- predicate=MUST.queryType)
455
- return spec_component
456
-
457
-
458
- @get_spec_component.method((MUST.AnzoGraphmartStepSparqlSource, MUST.when))
459
- def _get_spec_component_AnzoGraphmartStepSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
460
- spec_component = init_spec_component(spec_component_details.predicate)
461
-
462
- # Get WHEN specComponent from query builder
463
- if spec_component_details.mustrd_triple_store["type"] == TRIPLESTORE.Anzo:
464
- query_step_uri = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
465
- predicate=MUST.anzoQueryStep)
466
- spec_component.value = get_query_from_step(triple_store=spec_component_details.mustrd_triple_store,
467
- query_step_uri=query_step_uri)
468
- # If anzo specific function is called but no anzo defined
469
- else:
470
- raise ValueError(f"You must define {TRIPLESTORE.Anzo} to use {MUST.AnzoGraphmartStepSparqlSource}")
471
-
472
- spec_component.queryType = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
473
- predicate=MUST.queryType)
474
- return spec_component
475
-
476
- @get_spec_component.method((MUST.AnzoGraphmartQueryDrivenTemplatedStepSparqlSource, MUST.when))
477
- def _get_spec_component_AnzoGraphmartQueryDrivenTemplatedStepSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
478
- spec_component = init_spec_component(spec_component_details.predicate, spec_component_details.mustrd_triple_store["type"] )
479
-
480
- # Get WHEN specComponent from query builder
481
- if spec_component_details.mustrd_triple_store["type"] == TRIPLESTORE.Anzo:
482
- query_step_uri = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
483
- predicate=MUST.anzoQueryStep)
484
- queries = get_queries_from_templated_step(triple_store=spec_component_details.mustrd_triple_store,
485
- query_step_uri=query_step_uri)
486
- spec_component.paramQuery= queries["param_query"]
487
- spec_component.queryTemplate = queries["query_template"]
488
- # If anzo specific function is called but no anzo defined
489
- else:
490
- raise ValueError(f"You must define {TRIPLESTORE.Anzo} to use {MUST.AnzoGraphmartQueryDrivenTemplatedStepSparqlSource}")
491
-
492
- spec_component.queryType = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
493
- predicate=MUST.queryType)
494
- return spec_component
495
-
496
- @get_spec_component.method((MUST.AnzoGraphmartLayerSparqlSource, MUST.when))
497
- def _get_spec_component_AnzoGraphmartLayerSparqlSource(spec_component_details: SpecComponentDetails) -> list:
498
- spec_components = []
499
- # Get the ordered WHEN specComponents which is the transform and query driven template queries for the Layer
500
- if spec_component_details.mustrd_triple_store["type"] == TRIPLESTORE.Anzo:
501
- graphmart_layer_uri = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
502
- predicate=MUST.anzoGraphmartLayer)
503
- queries = get_queries_for_layer(triple_store=spec_component_details.mustrd_triple_store,
504
- graphmart_layer_uri=graphmart_layer_uri)
505
- # If anzo specific function is called but no anzo defined
506
- else:
507
- raise ValueError(f"This test specification is specific to Anzo and can only be run against that platform.")
508
- for query in queries:
509
- spec_component = init_spec_component(spec_component_details.predicate, spec_component_details.mustrd_triple_store["type"])
510
- spec_component.value = query.get("query")
511
- spec_component.paramQuery = query.get("param_query")
512
- spec_component.queryTemplate = query.get("query_template")
513
- if spec_component.value:
514
- spec_component.queryType = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
515
- predicate=MUST.queryType)
516
- else:
517
- spec_component.queryType = MUST.AnzoQueryDrivenUpdateSparql
518
- spec_components += [spec_component]
519
- return spec_components
520
-
521
- @get_spec_component.method(Default)
522
- def _get_spec_component_default(spec_component_details: SpecComponentDetails) -> SpecComponent:
523
- raise ValueError(
524
- f"Invalid combination of data source type ({spec_component_details.data_source_type}) and "
525
- f"spec component ({spec_component_details.predicate})")
526
-
527
-
528
- def init_spec_component(predicate: URIRef, triple_store_type: URIRef = None ) -> GivenSpec | WhenSpec | ThenSpec | TableThenSpec:
529
- if predicate == MUST.given:
530
- spec_component = GivenSpec()
531
- elif predicate == MUST.when:
532
- if triple_store_type == TRIPLESTORE.Anzo:
533
- spec_component = AnzoWhenSpec()
534
- else:
535
- spec_component = WhenSpec()
536
- elif predicate == MUST.then:
537
- spec_component = ThenSpec()
538
- else:
539
- spec_component = SpecComponent()
540
- return spec_component
541
-
542
-
543
- def get_spec_component_nodes(subject: URIRef, predicate: URIRef, spec_graph: Graph) -> List[Node]:
544
- spec_component_nodes = []
545
- for spec_component_node in spec_graph.objects(subject=subject, predicate=predicate):
546
- spec_component_nodes.append(spec_component_node)
547
- # It shouldn't even be possible to get this far as an empty node indicates an invalid RDF file
548
- if spec_component_nodes is None:
549
- raise ValueError(f"specComponent Node empty for {subject} {predicate}")
550
- return spec_component_nodes
551
-
552
-
553
- def get_spec_component_from_file(path: Path) -> str:
554
- # project_root = get_project_root()
555
- # file_path = Path(os.path.join(project_root, path))
556
-
557
- if path.is_dir():
558
- raise ValueError(f"Path {path} is a directory, expected a file")
559
-
560
- try:
561
- content = path.read_text()
562
- except FileNotFoundError:
563
- raise
564
- return str(content)
565
-
566
-
567
- def get_spec_from_statements(subject: URIRef,
568
- predicate: URIRef,
569
- spec_graph: Graph) -> Graph:
570
- statements_query = f"""
571
- prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
572
-
573
- CONSTRUCT {{ ?s ?p ?o }}
574
- {{
575
- <{subject}> <{predicate}> [
576
- a <{MUST.StatementsDataset}> ;
577
- <{MUST.hasStatement}> [
578
- a rdf:Statement ;
579
- rdf:subject ?s ;
580
- rdf:predicate ?p ;
581
- rdf:object ?o ;
582
- ] ;
583
- ]
584
-
585
- }}
586
- """
587
- results = spec_graph.query(statements_query).graph
588
- return results.serialize(format="ttl")
589
-
590
-
591
- def get_spec_from_table(subject: URIRef,
592
- predicate: URIRef,
593
- spec_graph: Graph) -> pandas.DataFrame:
594
- # query the spec to get the expected result to convert to dataframe for comparison
595
- then_query = f"""
596
- prefix sh: <http://www.w3.org/ns/shacl#>
597
- SELECT ?row ?variable ?binding ?order
598
- WHERE {{
599
- <{subject}> <{predicate}> [
600
- a <{MUST.TableDataset}> ;
601
- <{MUST.hasRow}> ?row ].
602
- ?row <{MUST.hasBinding}> [
603
- <{MUST.variable}> ?variable ;
604
- <{MUST.boundValue}> ?binding ; ] .
605
- OPTIONAL {{ ?row sh:order ?order . }}
606
- .}}
607
- ORDER BY ?order"""
608
-
609
- expected_results = spec_graph.query(then_query)
610
- # get the unique row ids form the result to form the index of the results dataframe
611
- index = {str(row.row) for row in expected_results}
612
- # get the unique variables to form the columns of the results dataframe
613
- columns = set()
614
- for row in expected_results:
615
- columns.add(row.variable.value)
616
- columns.add(row.variable.value + "_datatype")
617
- # add an additional column for the sort order (if any) of the results
618
- columns.add("order")
619
- # create an empty dataframe to populate with the results
620
- df = pandas.DataFrame(index=list(index), columns=list(columns))
621
- # fill the dataframe with the results data
622
- for row in expected_results:
623
- df.loc[str(row.row), row.variable.value] = str(row.binding)
624
- df.loc[str(row.row), "order"] = row.order
625
- if type(row.binding) == Literal:
626
- literal_type = str(XSD.string)
627
- if hasattr(row.binding, "datatype") and row.binding.datatype:
628
- literal_type = str(row.binding.datatype)
629
- df.loc[str(row.row), row.variable.value + "_datatype"] = literal_type
630
- else:
631
- df.loc[str(row.row), row.variable.value + "_datatype"] = str(XSD.anyURI)
632
- # use the sort order sort the results
633
- df.sort_values(by="order", inplace=True)
634
- # drop the order column and replace the rowid index with a numeric one and replace empty values with spaces
635
- df.drop(columns="order", inplace=True)
636
- df.reset_index(drop=True, inplace=True)
637
- df.fillna('', inplace=True)
638
- return df
639
-
640
-
641
- def get_when_bindings(subject: URIRef,
642
- spec_graph: Graph) -> dict:
643
- when_bindings_query = f"""SELECT ?variable ?binding {{ <{subject}> <{MUST.when}> [ a <{MUST.TextSparqlSource}> ; <{MUST.hasBinding}> [ <{MUST.variable}> ?variable ; <{MUST.boundValue}> ?binding ; ] ; ] ;}}"""
644
- when_bindings = spec_graph.query(when_bindings_query)
645
-
646
- if len(when_bindings.bindings) == 0:
647
- return {}
648
- else:
649
- bindings = {}
650
- for binding in when_bindings:
651
- bindings[Variable(binding.variable.value)] = binding.binding
652
- return bindings
653
-
654
-
655
- def is_then_select_ordered(subject: URIRef, predicate: URIRef, spec_graph: Graph) -> bool:
656
- ask_select_ordered = f"""
657
- ASK {{
658
- {{SELECT (count(?binding) as ?totalBindings) {{
659
- <{subject}> <{predicate}> [
660
- a <{MUST.TableDataset}> ;
661
- <{MUST.hasRow}> [ <{MUST.hasBinding}> [
662
- <{MUST.variable}> ?variable ;
663
- <{MUST.boundValue}> ?binding ;
664
- ] ;
665
- ]
666
- ]
667
- }} }}
668
- {{SELECT (count(?binding) as ?orderedBindings) {{
669
- <{subject}> <{predicate}> [
670
- a <{MUST.TableDataset}> ;
671
- <{MUST.hasRow}> [ sh:order ?order ;
672
- <{MUST.hasBinding}> [
673
- <{MUST.variable}> ?variable ;
674
- <{MUST.boundValue}> ?binding ;
675
- ] ;
676
- ]
677
- ]
678
- }} }}
679
- FILTER(?totalBindings = ?orderedBindings)
680
- }}"""
681
- is_ordered = spec_graph.query(ask_select_ordered)
682
- return is_ordered.askAnswer
1
+ """
2
+ MIT License
3
+
4
+ Copyright (c) 2023 Semantic Partners Ltd
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+ """
24
+
25
+ import os
26
+ from dataclasses import dataclass, field
27
+ from itertools import groupby
28
+ from pathlib import Path
29
+ from typing import Tuple, List, Type
30
+
31
+ import pandas
32
+ import requests
33
+ from rdflib import RDF, Graph, URIRef, Variable, Literal, XSD, util
34
+ from rdflib.exceptions import ParserError
35
+ from rdflib.term import Node
36
+ import logging
37
+
38
+ import logger_setup
39
+ from mustrdAnzo import get_spec_component_from_graphmart, get_query_from_querybuilder, get_query_from_step
40
+ from namespace import MUST
41
+ from utils import get_project_root
42
+ from multimethods import MultiMethod, Default
43
+
44
+ log = logger_setup.setup_logger(__name__)
45
+
46
+
47
@dataclass
class SpecComponent:
    """Common base type for the parsed parts of a spec.

    It declares no fields of its own; the subclasses add the payload.
    """
50
+
51
+
52
@dataclass
class GivenSpec(SpecComponent):
    """Parsed ``given`` part of a spec."""

    # Graph with the input data; stays None until a loader assigns it.
    value: Graph = None
55
+
56
+
57
@dataclass
class WhenSpec(SpecComponent):
    """Parsed ``when`` part of a spec."""

    # Query text; None until a loader assigns it.
    value: str = None
    # Value of the MUST.queryType property of the component node.
    queryType: URIRef = None
    # Variable -> bound value mapping; None unless a loader fills it in.
    bindings: dict = None
62
+
63
+
64
@dataclass
class ThenSpec(SpecComponent):
    """Parsed ``then`` part of a spec whose expected result is a graph."""

    # A factory gives every instance its own empty Graph. A plain ``Graph()``
    # default is evaluated once at class creation and would be shared (and
    # mutated) by all instances that never assign ``value``.
    value: Graph = field(default_factory=Graph)
    # Whether the order of the expected rows matters; False by default.
    ordered: bool = False
68
+
69
+
70
@dataclass
class TableThenSpec(ThenSpec):
    """``then`` part whose expected result is a table instead of a graph."""

    # Each instance starts with its own empty DataFrame.
    value: pandas.DataFrame = field(default_factory=pandas.DataFrame)
73
+
74
+
75
@dataclass
class SpecComponentDetails:
    """Everything a ``get_spec_component`` handler needs to load one component."""

    # Spec the component belongs to.
    subject: URIRef
    # Predicate linking the spec to the component (compared against
    # MUST.given / MUST.when / MUST.then elsewhere in this module).
    predicate: URIRef
    # Graph the spec was read from.
    spec_graph: Graph
    # Triple store configuration; handlers read its "type" entry.
    mustrd_triple_store: dict
    # Node describing the component inside ``spec_graph``.
    spec_component_node: Node
    # One rdf:type of ``spec_component_node``; used as dispatch key.
    data_source_type: Node
    # Folder against which file names of folder/file sources are resolved.
    folder_location: Path
84
+
85
+
86
def parse_spec_component(subject: URIRef,
                         predicate: URIRef,
                         spec_graph: Graph,
                         folder_location: Path,
                         mustrd_triple_store: dict) -> GivenSpec | WhenSpec | ThenSpec | TableThenSpec:
    """Load and combine every component attached to ``subject`` by ``predicate``.

    Each object node of ``(subject, predicate)`` is loaded once per rdf:type it
    carries, through the ``get_spec_component`` multimethod; the loaded parts
    are then merged by ``combine_specs``.

    Raises:
        ValueError: if a MUST.FolderDataset is used while ``folder_location``
            is None (other errors come from the helpers that are called).
    """
    spec_components = []
    for spec_component_node in get_spec_component_nodes(subject, predicate, spec_graph):
        data_source_types = get_data_source_types(subject, predicate, spec_graph, spec_component_node)
        for data_source_type in data_source_types:
            if data_source_type == MUST.FolderDataset and folder_location is None:
                raise ValueError(
                    f"Cannot load data for {predicate}. "
                    f"{MUST.FolderDataset} needs to be used with parameter for folder path.")
            # Use the module logger (like the rest of this file) rather than
            # the root ``logging`` module.
            log.info(f"{folder_location=}")
            spec_component_details = SpecComponentDetails(
                subject=subject,
                predicate=predicate,
                spec_graph=spec_graph,
                mustrd_triple_store=mustrd_triple_store,
                spec_component_node=spec_component_node,
                data_source_type=data_source_type,
                folder_location=folder_location)
            spec_components.append(get_spec_component(spec_component_details))
    # Merge multiple graphs into one; combine_specs raises for When/TableThen
    # when more than one component was found.
    return combine_specs(spec_components)
115
+
116
+
117
def get_spec_component_type(spec_components: List[SpecComponent]) -> Type[SpecComponent]:
    """Return the exact class shared by all ``spec_components``.

    The class of the first element is the reference; the comparison is on the
    exact type, so a subclass instance counts as a different type.

    Raises:
        ValueError: if any later element has another type.
    """
    expected_type = type(spec_components[0])
    if any(type(component) != expected_type for component in spec_components[1:]):
        raise ValueError("All spec components must be of the same type")
    return expected_type
129
+
130
+
131
def combine_specs_dispatch(spec_components: List[SpecComponent]) -> Type[SpecComponent]:
    """Dispatch key of ``combine_specs``: the class shared by the components."""
    return get_spec_component_type(spec_components)


# Multimethod merging the components loaded for one predicate; the handler is
# selected by the class returned from combine_specs_dispatch.
combine_specs = MultiMethod("combine_specs", combine_specs_dispatch)
137
+
138
+
139
@combine_specs.method(GivenSpec)
def _combine_given_specs(spec_components: List[GivenSpec]) -> GivenSpec:
    """Merge several ``given`` components into one.

    A single component is returned untouched; otherwise the graphs of all
    components are added into a new graph wrapped in a new GivenSpec.
    """
    if len(spec_components) == 1:
        return spec_components[0]

    merged = Graph()
    for component in spec_components:
        merged += component.value
    return GivenSpec(value=merged)
150
+
151
+
152
@combine_specs.method(WhenSpec)
def _combine_when_specs(spec_components: List[WhenSpec]) -> WhenSpec:
    """Return the only ``when`` component.

    Raises:
        ValueError: if the list does not hold exactly one component, because
            combining several is not implemented.
    """
    if len(spec_components) == 1:
        return spec_components[0]
    raise ValueError(f"Parsing of multiple components of {MUST.when} not implemented")
158
+
159
+
160
@combine_specs.method(ThenSpec)
def _combine_then_specs(spec_components: List[ThenSpec]) -> ThenSpec:
    """Merge several graph-based ``then`` components into one.

    A single component is returned untouched; otherwise the graphs of all
    components are added into a new graph wrapped in a new ThenSpec (whose
    ``ordered`` flag keeps its default).
    """
    if len(spec_components) == 1:
        return spec_components[0]

    merged = Graph()
    for component in spec_components:
        merged += component.value
    return ThenSpec(value=merged)
171
+
172
+
173
@combine_specs.method(TableThenSpec)
def _combine_table_then_specs(spec_components: List[TableThenSpec]) -> TableThenSpec:
    """Return the only table-based ``then`` component.

    Raises:
        ValueError: if the list does not hold exactly one component, because
            combining several tables is not implemented.
    """
    if len(spec_components) == 1:
        return spec_components[0]
    raise ValueError("Parsing of multiple components of MUST.then for tables not implemented")
178
+
179
+
180
@combine_specs.method(Default)
def _combine_specs_default(spec_components: List[SpecComponent]):
    """Fallback for component types that have no dedicated combiner.

    Raises:
        ValueError: always.
    """
    message = "Parsing of multiple components of this type not implemented"
    raise ValueError(message)
183
+
184
+
185
# https://github.com/Semantic-partners/mustrd/issues/99
def get_spec_component_dispatch(spec_component_details: SpecComponentDetails) -> Tuple[Node, URIRef]:
    """Dispatch key of ``get_spec_component``: (data source type, predicate)."""
    details = spec_component_details
    return details.data_source_type, details.predicate
188
+
189
+
190
def get_data_source_types(subject: URIRef, predicate: URIRef, spec_graph: Graph, source_node: Node) -> List[Node]:
    """Return every rdf:type declared on ``source_node``.

    ``subject`` and ``predicate`` are only used in the error message.

    Raises:
        ValueError: if the node has no rdf:type at all.
    """
    data_source_types = list(spec_graph.objects(subject=source_node, predicate=RDF.type))
    if not data_source_types:
        raise ValueError(f"Node has no rdf type {subject} {predicate}")
    return data_source_types


# Multimethod loading one spec component; handlers are registered below per
# (data source type, predicate) pair.
get_spec_component = MultiMethod("get_spec_component", get_spec_component_dispatch)
201
+
202
+
203
@get_spec_component.method((MUST.InheritedDataset, MUST.given))
def _get_spec_component_inheritedstate_given(spec_component_details: SpecComponentDetails) -> GivenSpec:
    """Return a freshly initialised component; nothing is loaded into it."""
    return init_spec_component(spec_component_details.predicate)
207
+
208
+
209
@get_spec_component.method((MUST.FolderDataset, MUST.given))
def _get_spec_component_folderdatasource_given(spec_component_details: SpecComponentDetails) -> GivenSpec:
    """Load the ``given`` graph from MUST.fileName inside the configured folder.

    Raises:
        ValueError: if the file content cannot be parsed (ParserError).
    """
    details = spec_component_details
    given_spec = init_spec_component(details.predicate)

    file_name = details.spec_graph.value(subject=details.spec_component_node,
                                         predicate=MUST.fileName)
    path = Path(os.path.join(str(details.folder_location), str(file_name)))

    try:
        given_spec.value = Graph().parse(data=get_spec_component_from_file(path))
    except ParserError as e:
        message = f"Problem parsing {path}, error of type {type(e)}"
        log.error(message)
        raise ValueError(message)
    return given_spec
223
+
224
+
225
@get_spec_component.method((MUST.FolderSparqlSource, MUST.when))
def _get_spec_component_foldersparqlsource_when(spec_component_details: SpecComponentDetails) -> WhenSpec:
    """Load the ``when`` query text from MUST.fileName inside the configured folder.

    The handler is registered for MUST.when, so the component built by
    ``init_spec_component`` is a WhenSpec (the annotation used to say
    GivenSpec). The query type is copied from the component node.
    """
    spec_component = init_spec_component(spec_component_details.predicate)

    file_name = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
                                                        predicate=MUST.fileName)

    path = Path(os.path.join(str(spec_component_details.folder_location), str(file_name)))
    spec_component.value = get_spec_component_from_file(path)

    get_query_type(spec_component_details.predicate, spec_component_details.spec_graph, spec_component,
                   spec_component_details.spec_component_node)
    return spec_component
238
+
239
+
240
@get_spec_component.method((MUST.FolderDataset, MUST.then))
def _get_spec_component_folderdatasource_then(spec_component_details: SpecComponentDetails) -> ThenSpec:
    """Load the ``then`` expectation from MUST.fileName inside the configured folder."""
    details = spec_component_details
    then_spec = init_spec_component(details.predicate)

    file_name = details.spec_graph.value(subject=details.spec_component_node,
                                         predicate=MUST.fileName)
    target = Path(os.path.join(str(details.folder_location), str(file_name)))
    return get_then_from_file(target, then_spec)
249
+
250
+
251
def get_then_from_file(path: Path, spec_component: ThenSpec) -> ThenSpec:
    """Load an expected result from ``path``.

    ``.csv``, ``.xlsx`` and ``.xls`` files are read with pandas into a new
    TableThenSpec. Any other file is parsed as RDF in the format guessed from
    its name and stored in ``spec_component``, which is returned.

    Raises:
        ValueError: if ``path`` is a directory, if no RDF format can be
            derived from the file name, or if parsing fails (ParserError).
    """
    if path.is_dir():
        raise ValueError(f"Path {path} is a directory, expected a file")

    # https://github.com/Semantic-partners/mustrd/issues/94
    if path.suffix in {".csv", ".xlsx", ".xls"}:
        df = pandas.read_csv(path) if path.suffix == ".csv" else pandas.read_excel(path)
        then_spec = TableThenSpec()
        then_spec.value = df
        return then_spec

    try:
        file_format = util.guess_format(str(path))
    except AttributeError:
        raise ValueError(f"Unsupported file format: {path.suffix}")

    # guess_format signals an unknown extension with None; fail loudly instead
    # of falling through without having loaded anything.
    if file_format is None:
        raise ValueError(f"Unsupported file format: {path.suffix}")

    g = Graph()
    try:
        g.parse(data=get_spec_component_from_file(path), format=file_format)
    except ParserError as e:
        log.error(f"Problem parsing {path}, error of type {type(e)}")
        raise ValueError(f"Problem parsing {path}, error of type {type(e)}")
    spec_component.value = g
    return spec_component
276
+
277
+
278
@get_spec_component.method((MUST.FileDataset, MUST.given))
def _get_spec_component_filedatasource_given(spec_component_details: SpecComponentDetails) -> GivenSpec:
    """Load the ``given`` graph from the MUST.file path, resolved against the folder location.

    Raises:
        ValueError: if the file content cannot be parsed (ParserError).
    """
    details = spec_component_details
    given_spec = init_spec_component(details.predicate)

    relative_path = Path(str(details.spec_graph.value(subject=details.spec_component_node,
                                                      predicate=MUST.file)))
    path = Path(os.path.join(details.folder_location, relative_path))

    try:
        given_spec.value = Graph().parse(data=get_spec_component_from_file(path))
    except ParserError as e:
        message = f"Problem parsing {path}, error of type {type(e)}"
        log.error(message)
        raise ValueError(message)
    return given_spec
292
+
293
+
294
@get_spec_component.method((MUST.FileSparqlSource, MUST.when))
def _get_spec_component_filedatasource_when(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the ``when`` query text from the MUST.file path, resolved against the project root.

    The query type is copied from the component node afterwards.
    """
    details = spec_component_details
    when_spec = init_spec_component(details.predicate)

    relative_path = Path(str(details.spec_graph.value(subject=details.spec_component_node,
                                                      predicate=MUST.file)))
    path = Path(os.path.join(get_project_root(), relative_path))
    when_spec.value = get_spec_component_from_file(path)

    get_query_type(details.predicate, details.spec_graph, when_spec, details.spec_component_node)
    return when_spec
307
+
308
+
309
@get_spec_component.method((MUST.FileDataset, MUST.then))
def _get_spec_component_filedatasource_then(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the ``then`` expectation from the MUST.file path, resolved against the folder location."""
    details = spec_component_details
    then_spec = init_spec_component(details.predicate)

    relative_path = Path(str(details.spec_graph.value(subject=details.spec_component_node,
                                                      predicate=MUST.file)))
    target = Path(os.path.join(details.folder_location, relative_path))
    return get_then_from_file(target, then_spec)
319
+
320
+
321
@get_spec_component.method((MUST.TextSparqlSource, MUST.when))
def _get_spec_component_TextSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Build the ``when`` component from query text embedded in the spec itself.

    The text comes from MUST.queryText on the component node; variable
    bindings and the query type are read from the spec graph as well.
    """
    details = spec_component_details
    when_spec = init_spec_component(details.predicate)

    query_text = details.spec_graph.value(subject=details.spec_component_node,
                                          predicate=MUST.queryText)
    when_spec.value = str(query_text)

    when_spec.bindings = get_when_bindings(details.subject, details.spec_graph)
    get_query_type(details.predicate, details.spec_graph, when_spec, details.spec_component_node)

    return when_spec
335
+
336
+
337
# https://github.com/Semantic-partners/mustrd/issues/98
@get_spec_component.method((MUST.HttpDataset, MUST.given))
@get_spec_component.method((MUST.HttpDataset, MUST.when))
@get_spec_component.method((MUST.HttpDataset, MUST.then))
def _get_spec_component_HttpDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Fetch the component over HTTP GET from the MUST.dataSourceUrl of its node.

    The raw response body (``Response.content``) is stored as the component
    value; the query type is then copied for ``when`` components.
    """
    spec_component = init_spec_component(spec_component_details.predicate)

    url = str(spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
                                                      predicate=MUST.dataSourceUrl))
    # ``.content`` must be read from the response; it used to be applied to
    # the URL string, which raised AttributeError before any request was made.
    spec_component.value = requests.get(url).content

    get_query_type(spec_component_details.predicate, spec_component_details.spec_graph, spec_component,
                   spec_component_details.spec_component_node)
    return spec_component
351
+
352
+
353
@get_spec_component.method((MUST.TableDataset, MUST.then))
def _get_spec_component_TableDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Build the expected table from the rows described inline in the spec graph."""
    details = spec_component_details
    expected_table = TableThenSpec()
    # Table content first, then whether its rows are ordered.
    expected_table.value = get_spec_from_table(details.subject, details.predicate, details.spec_graph)
    expected_table.ordered = is_then_select_ordered(details.subject, details.predicate, details.spec_graph)
    return expected_table
362
+
363
+
364
@get_spec_component.method((MUST.EmptyTable, MUST.then))
def _get_spec_component_EmptyTable(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Return a TableThenSpec left at its defaults."""
    return TableThenSpec()
368
+
369
+
370
@get_spec_component.method((MUST.EmptyGraph, MUST.then))
def _get_spec_component_EmptyGraph(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Return a freshly initialised component; nothing is loaded into it."""
    return init_spec_component(spec_component_details.predicate)
375
+
376
+
377
@get_spec_component.method((MUST.StatementsDataset, MUST.given))
@get_spec_component.method((MUST.StatementsDataset, MUST.then))
def _get_spec_component_StatementsDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Build the component graph from reified statements embedded in the spec."""
    details = spec_component_details
    component = init_spec_component(details.predicate)

    serialized = get_spec_from_statements(details.subject, details.predicate, details.spec_graph)
    component.value = Graph().parse(data=serialized)
    return component
386
+
387
+
388
@get_spec_component.method((MUST.AnzoGraphmartDataset, MUST.given))
@get_spec_component.method((MUST.AnzoGraphmartDataset, MUST.then))
def _get_spec_component_AnzoGraphmartDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load a ``given`` or ``then`` component from an Anzo graphmart layer.

    Raises:
        ValueError: if the configured triple store is not of type MUST.Anzo.
    """
    details = spec_component_details
    component = init_spec_component(details.predicate)

    if details.mustrd_triple_store["type"] != MUST.Anzo:
        raise ValueError(f"You must define {MUST.AnzoConfig} to use {MUST.AnzoGraphmartDataset}")

    graphmart = details.spec_graph.value(subject=details.spec_component_node,
                                         predicate=MUST.graphmart)
    layer = details.spec_graph.value(subject=details.spec_component_node,
                                     predicate=MUST.layer)
    component.value = get_spec_component_from_graphmart(
        triple_store=details.mustrd_triple_store,
        graphmart=graphmart,
        layer=layer)

    return component
407
+
408
+
409
@get_spec_component.method((MUST.AnzoQueryBuilderSparqlSource, MUST.when))
def _get_spec_component_AnzoQueryBuilderSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the ``when`` query from the Anzo query builder.

    The query is identified by MUST.queryFolder and MUST.queryName on the
    component node; the query type is copied from the node afterwards.

    Raises:
        ValueError: if the configured triple store is not of type MUST.Anzo.
    """
    details = spec_component_details
    when_spec = init_spec_component(details.predicate)

    # An Anzo-specific source needs an Anzo triple store.
    if details.mustrd_triple_store["type"] != MUST.Anzo:
        raise ValueError(f"You must define {MUST.AnzoConfig} to use {MUST.AnzoQueryBuilderSparqlSource}")

    query_folder = details.spec_graph.value(subject=details.spec_component_node,
                                            predicate=MUST.queryFolder)
    query_name = details.spec_graph.value(subject=details.spec_component_node,
                                          predicate=MUST.queryName)
    when_spec.value = get_query_from_querybuilder(triple_store=details.mustrd_triple_store,
                                                  folder_name=query_folder,
                                                  query_name=query_name)

    get_query_type(details.predicate, details.spec_graph, when_spec, details.spec_component_node)
    return when_spec
429
+
430
@get_spec_component.method((MUST.AnzoGraphmartStepSparqlSource, MUST.when))
def _get_spec_component_AnzoGraphmartStepSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the ``when`` query via ``get_query_from_step``.

    The step is identified by MUST.queryStepUri on the component node; the
    query type is copied from the node afterwards.

    Raises:
        ValueError: if the configured triple store is not of type MUST.Anzo.
    """
    details = spec_component_details
    when_spec = init_spec_component(details.predicate)

    # An Anzo-specific source needs an Anzo triple store.
    if details.mustrd_triple_store["type"] != MUST.Anzo:
        raise ValueError(f"You must define {MUST.AnzoConfig} to use {MUST.AnzoGraphmartStepSparqlSource}")

    query_step_uri = details.spec_graph.value(subject=details.spec_component_node,
                                              predicate=MUST.queryStepUri)
    when_spec.value = get_query_from_step(triple_store=details.mustrd_triple_store,
                                          query_step_uri=query_step_uri)

    get_query_type(details.predicate, details.spec_graph, when_spec, details.spec_component_node)
    return when_spec
447
+
448
@get_spec_component.method(Default)
def _get_spec_component_default(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Fallback for (data source type, predicate) pairs without a handler.

    Raises:
        ValueError: always, naming the unsupported combination.
    """
    details = spec_component_details
    raise ValueError(
        f"Invalid combination of data source type ({details.data_source_type}) and "
        f"spec component ({details.predicate})")
453
+
454
+
455
def init_spec_component(predicate: URIRef) -> GivenSpec | WhenSpec | ThenSpec | TableThenSpec:
    """Create an empty component of the class matching ``predicate``.

    MUST.given, MUST.when and MUST.then map to GivenSpec, WhenSpec and
    ThenSpec; any other predicate yields a bare SpecComponent.
    """
    if predicate == MUST.given:
        return GivenSpec()
    if predicate == MUST.when:
        return WhenSpec()
    if predicate == MUST.then:
        return ThenSpec()
    return SpecComponent()
466
+
467
+
468
def get_query_type(predicate: URIRef, spec_graph: Graph, spec_component: SpecComponent, spec_component_node: Node):
    """Copy MUST.queryType of the component node onto ``spec_component``.

    Only done for the ``when`` predicate; for any other predicate the
    component is left untouched. Nothing is returned.
    """
    if predicate != URIRef('https://mustrd.com/model/when'):
        return
    spec_component.queryType = spec_graph.value(subject=spec_component_node, predicate=MUST.queryType)
471
+
472
+
473
def get_spec_component_nodes(subject: URIRef, predicate: URIRef, spec_graph: Graph) -> List[Node]:
    """Return every object node of ``(subject, predicate)`` in ``spec_graph``.

    Raises:
        ValueError: if the spec has no node for this predicate.
    """
    spec_component_nodes = list(spec_graph.objects(subject=subject, predicate=predicate))
    # The list is never None, so the guard has to test for emptiness; the
    # former ``is None`` check could not fire and let an empty list through.
    if not spec_component_nodes:
        raise ValueError(f"specComponent Node empty for {subject} {predicate}")
    return spec_component_nodes
481
+
482
+
483
def get_spec_component_from_file(path: Path) -> str:
    """Return the text content of the file at ``path``.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so that non-ASCII content reads the same on every
    platform.

    Raises:
        ValueError: if ``path`` is a directory.
        FileNotFoundError: if ``path`` does not exist.
    """
    if path.is_dir():
        raise ValueError(f"Path {path} is a directory, expected a file")

    return path.read_text(encoding="utf-8")
495
+
496
+
497
def get_spec_from_statements(subject: URIRef,
                             predicate: URIRef,
                             spec_graph: Graph) -> Graph:
    """Rebuild the triples described by the reified statements of a spec component.

    A CONSTRUCT query turns each rdf:Statement attached through
    MUST.hasStatement into a plain triple. Note that what is returned is the
    result of ``serialize(format="ttl")`` on the constructed graph, not the
    graph object itself.
    """
    statements_query = f"""
    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

    CONSTRUCT {{ ?s ?p ?o }}
    {{
            <{subject}> <{predicate}> [
                a <{MUST.StatementsDataset}> ;
                <{MUST.hasStatement}> [
                    a rdf:Statement ;
                    rdf:subject ?s ;
                    rdf:predicate ?p ;
                    rdf:object ?o ;
                ] ;
            ]

    }}
    """
    constructed = spec_graph.query(statements_query).graph
    return constructed.serialize(format="ttl")
519
+
520
+
521
def get_spec_from_table(subject: URIRef,
                        predicate: URIRef,
                        spec_graph: Graph) -> pandas.DataFrame:
    """Build the expected result table described inline in the spec graph.

    Every binding of the TableDataset becomes a cell. For each variable two
    columns are produced: ``<variable>`` with the value as a string and
    ``<variable>_datatype`` with the literal datatype (xsd:string when the
    literal has none) or xsd:anyURI for non-literal values. Results are
    ordered by ``sh:order`` where rows declare one; missing cells are filled
    with empty strings.

    Returns:
        The expected table; an empty DataFrame when the query finds no binding.
    """
    then_query = f"""
        SELECT ?then ?order ?variable ?binding
        WHERE {{ {{
             <{subject}> <{predicate}> [
                    a <{MUST.TableDataset}> ;
                    <{MUST.hasRow}> [
                        <{MUST.hasBinding}> [
                            <{MUST.variable}> ?variable ;
                            <{MUST.boundValue}> ?binding ; ] ;
                                ] ; ].}}
        OPTIONAL {{ <{subject}> <{predicate}> [
                    a <{MUST.TableDataset}> ;
                    <{MUST.hasRow}> [ sh:order ?order ;
                        <{MUST.hasBinding}> [
                            <{MUST.variable}> ?variable ;
                            <{MUST.boundValue}> ?binding ; ] ;
                                ] ; ].}}
        }} ORDER BY ASC(?order)"""

    expected_results = spec_graph.query(then_query)

    # Column name -> list of cell values, in first-seen column order. The
    # former two-pass loop grouped by ?then, which the query never binds, so a
    # single pass over the rows produces the same columns and cells.
    data_dict = {}
    for row in expected_results:
        column = row.variable.value
        values = data_dict.setdefault(column, [])
        datatypes = data_dict.setdefault(column + "_datatype", [])

        values.append(str(row.binding))
        if isinstance(row.binding, Literal):
            # Literals without an explicit datatype are treated as xsd:string.
            datatypes.append(str(row.binding.datatype) if row.binding.datatype else str(XSD.string))
        else:
            datatypes.append(str(XSD.anyURI))

    # pandas.concat refuses an empty list; no binding means an empty table.
    if not data_dict:
        return pandas.DataFrame()

    # Convert to Series first so columns of different length can be combined.
    series_list = [pandas.Series(value, name=key) for key, value in data_dict.items()]

    df = pandas.concat(series_list, axis=1)
    df.fillna('', inplace=True)

    return df
574
+
575
+
576
def get_when_bindings(subject: URIRef,
                      spec_graph: Graph) -> dict:
    """Collect the variable bindings declared on a TextSparqlSource ``when`` component.

    Returns:
        A dict mapping ``Variable(<variable name>)`` to the bound value; empty
        when the spec declares no binding.
    """
    when_bindings_query = f"""SELECT ?variable ?binding {{ <{subject}> <{MUST.when}> [ a <{MUST.TextSparqlSource}> ; <{MUST.hasBinding}> [ <{MUST.variable}> ?variable ; <{MUST.boundValue}> ?binding ; ] ; ] ;}}"""
    rows = spec_graph.query(when_bindings_query)

    # Later rows overwrite earlier ones for the same variable.
    return {Variable(row.variable.value): row.binding for row in rows}
588
+
589
+
590
def is_then_select_ordered(subject: URIRef, predicate: URIRef, spec_graph: Graph) -> bool:
    """Tell whether every binding of the expected table sits in an ordered row.

    The ASK query counts the bindings reachable through any row of the
    TableDataset and the bindings reachable through rows that carry
    ``sh:order``; the answer is true when both counts are equal.
    """
    ask_select_ordered = f"""
    ASK {{
    {{SELECT (count(?binding) as ?totalBindings) {{
        <{subject}> <{predicate}> [
                    a <{MUST.TableDataset}> ;
                    <{MUST.hasRow}> [ <{MUST.hasBinding}> [
                                        <{MUST.variable}> ?variable ;
                                        <{MUST.boundValue}> ?binding ;
                            ] ;
                ]
            ]
    }} }}
    {{SELECT (count(?binding) as ?orderedBindings) {{
        <{subject}> <{predicate}> [
                    a <{MUST.TableDataset}> ;
                    <{MUST.hasRow}> [ sh:order ?order ;
                                <{MUST.hasBinding}> [
                                        <{MUST.variable}> ?variable ;
                                        <{MUST.boundValue}> ?binding ;
                            ] ;
                ]
            ]
    }} }}
    FILTER(?totalBindings = ?orderedBindings)
    }}"""
    return spec_graph.query(ask_select_ordered).askAnswer