mustrd 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
src/spec_component.py ADDED
@@ -0,0 +1,683 @@
1
+ """
2
+ MIT License
3
+
4
+ Copyright (c) 2023 Semantic Partners Ltd
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+ """
24
+
25
+ import os
26
+ from dataclasses import dataclass, field
27
+ from itertools import groupby
28
+ from pathlib import Path
29
+ from typing import Tuple, List, Type
30
+
31
+ import pandas
32
+ import requests
33
+ from rdflib import RDF, Graph, URIRef, Variable, Literal, XSD, util
34
+ from rdflib.exceptions import ParserError
35
+ from rdflib.term import Node
36
+ import logging
37
+
38
+ import logger_setup
39
+ from mustrdAnzo import get_queries_for_layer, get_queries_from_templated_step, get_spec_component_from_graphmart, get_query_from_querybuilder, get_query_from_step
40
+ from namespace import MUST
41
+ from utils import get_project_root
42
+ from multimethods import MultiMethod, Default
43
+
44
# Module-level logger, created through the project's logger_setup helper.
log = logger_setup.setup_logger(__name__)
45
+
46
+
47
@dataclass
class SpecComponent:
    """Common base class for the given / when / then parts of a specification."""
50
+
51
+
52
@dataclass
class GivenSpec(SpecComponent):
    """The 'given' part of a specification."""
    # Graph holding the given data; stays None until a loader assigns it.
    value: Graph | None = None
55
+
56
+
57
@dataclass
class WhenSpec(SpecComponent):
    """The 'when' part of a specification: the query under test."""
    # Query text, as loaded by one of the MUST.when handlers in this module.
    value: str | None = None
    # Value of MUST.queryType read from the spec graph for this component.
    queryType: URIRef | None = None
    # Variable -> bound value mapping (populated for TextSparqlSource components).
    bindings: dict | None = None
62
+
63
@dataclass
class AnzoWhenSpec(WhenSpec):
    """A 'when' component carrying the extra queries of Anzo templated steps."""
    # "param_query" entry returned by the Anzo step/layer lookups.
    paramQuery: str | None = None
    # "query_template" entry returned by the Anzo step/layer lookups.
    queryTemplate: str | None = None
67
+
68
@dataclass
class ThenSpec(SpecComponent):
    """The 'then' (expected result) part of a specification, held as a graph.

    ``value`` uses a ``default_factory`` so that every instance gets its own
    empty ``Graph``. A plain ``Graph()`` default is evaluated once, when the
    class is defined, and would be shared (and mutated) by all instances.
    """
    # Expected triples; a fresh empty graph for each instance.
    value: Graph = field(default_factory=Graph)
    # Ordering flag; in this module it is only set for table results.
    ordered: bool = False
72
+
73
+
74
@dataclass
class TableThenSpec(ThenSpec):
    """A 'then' component whose expected result is a table, not a graph."""
    # Expected result table; defaults to a new empty DataFrame per instance.
    value: pandas.DataFrame = field(default_factory=pandas.DataFrame)
77
+
78
+
79
@dataclass
class SpecComponentDetails:
    """Everything a get_spec_component handler needs to load one component."""
    # Subject of the specification whose component is being parsed.
    subject: URIRef
    # Which part is being parsed; compared against MUST.given / MUST.when / MUST.then.
    predicate: URIRef
    # Graph containing the specification; queried for the component's properties.
    spec_graph: Graph
    # Triple store configuration; handlers read its "type" entry.
    mustrd_triple_store: dict
    # Object node of (subject, predicate) describing the data source.
    spec_component_node: Node
    # One rdf:type of spec_component_node; first half of the dispatch key.
    data_source_type: Node
    # Run configuration; 'spec_path' and the *_path / 'data_path' entries are read.
    run_config: dict
88
+
89
def get_path(path_type: str, run_config: dict) -> Path:
    """Resolve the directory configured under *path_type* in *run_config*.

    If *path_type* is not configured, the ``'data_path'`` entry is used
    instead. A value starting with ``/`` is treated as absolute; anything else
    is resolved relative to ``run_config['spec_path']``.

    Always returns a ``Path`` (the previous version handed back the raw
    configuration value for absolute paths, contradicting the annotation).

    :param path_type: configuration key, e.g. ``'given_path'``.
    :param run_config: run configuration dictionary.
    :raises KeyError: if neither *path_type* nor ``'data_path'`` is configured,
        or if a relative value is configured without ``'spec_path'``.
    """
    # Choose the entry explicitly instead of wrapping everything in
    # try/except KeyError, which also swallowed a missing 'spec_path'.
    configured = run_config[path_type] if path_type in run_config else run_config['data_path']
    if str(configured).startswith("/"):
        return Path(str(configured))
    return Path(os.path.join(run_config['spec_path'], configured))
100
+
101
+
102
def parse_spec_component(subject: URIRef,
                         predicate: URIRef,
                         spec_graph: Graph,
                         run_config: dict,
                         mustrd_triple_store: dict) -> GivenSpec | WhenSpec | ThenSpec | TableThenSpec:
    """Load and combine every component attached to *subject* via *predicate*.

    Each object node of (subject, predicate) is loaded once per rdf:type it
    declares, through the ``get_spec_component`` multimethod. Handlers may
    return a single component or a list of them; the results are flattened
    and passed to ``combine_specs``.
    """
    collected = []
    for node in get_spec_component_nodes(subject, predicate, spec_graph):
        for source_type in get_data_source_types(subject, predicate, spec_graph, node):
            details = SpecComponentDetails(
                subject=subject,
                predicate=predicate,
                spec_graph=spec_graph,
                mustrd_triple_store=mustrd_triple_store,
                spec_component_node=node,
                data_source_type=source_type,
                run_config=run_config)
            loaded = get_spec_component(details)
            # A handler can yield several components at once (returned as a list).
            collected.extend(loaded if type(loaded) == list else [loaded])

    return combine_specs(collected)
133
+
134
+
135
def get_spec_component_type(spec_components: List[SpecComponent]) -> Type[SpecComponent]:
    """Return the exact type shared by every element of *spec_components*.

    :raises ValueError: if any element's type differs from the first one's.
    """
    expected_type = type(spec_components[0])
    if any(type(other) != expected_type for other in spec_components[1:]):
        raise ValueError("All spec components must be of the same type")
    return expected_type
146
+
147
+
148
def combine_specs_dispatch(spec_components: List[SpecComponent]) -> Type[SpecComponent]:
    """Dispatch key for ``combine_specs``: the common type of the components."""
    return get_spec_component_type(spec_components)
151
+
152
+
153
# Multimethod that merges a list of components; the implementation is chosen
# by the value combine_specs_dispatch returns (the components' common type).
combine_specs = MultiMethod("combine_specs", combine_specs_dispatch)
154
+
155
+
156
@combine_specs.method(GivenSpec)
def _combine_given_specs(spec_components: List[GivenSpec]) -> GivenSpec:
    """Merge several given components into one by adding their graphs together.

    A single component is returned unchanged.
    """
    if len(spec_components) == 1:
        return spec_components[0]

    merged = Graph()
    for component in spec_components:
        merged += component.value
    return GivenSpec(value=merged)
167
+
168
+
169
@combine_specs.method(WhenSpec)
def _combine_when_specs(spec_components: List[WhenSpec]) -> List[WhenSpec]:
    """Return the when components as they are: the list is passed through, not merged."""
    return spec_components
172
+
173
+
174
@combine_specs.method(ThenSpec)
def _combine_then_specs(spec_components: List[ThenSpec]) -> ThenSpec:
    """Merge several graph-based then components by adding their graphs together.

    A single component is returned unchanged. A merged result is a new
    ``ThenSpec`` whose ``ordered`` flag keeps its default.
    """
    if len(spec_components) == 1:
        return spec_components[0]

    merged = Graph()
    for component in spec_components:
        merged += component.value
    return ThenSpec(value=merged)
185
+
186
+
187
@combine_specs.method(TableThenSpec)
def _combine_table_then_specs(spec_components: List[TableThenSpec]) -> TableThenSpec:
    """Return the only table component; combining several tables is unsupported.

    :raises ValueError: if the list does not contain exactly one component.
    """
    if len(spec_components) == 1:
        return spec_components[0]
    raise ValueError("Parsing of multiple components of MUST.then for tables not implemented")
192
+
193
+
194
@combine_specs.method(Default)
def _combine_specs_default(spec_components: List[SpecComponent]):
    """Fallback for component types with no dedicated combiner; always raises ValueError."""
    message = f"Parsing of multiple components of this type not implemented {spec_components}"
    raise ValueError(message)
197
+
198
def get_data_source_types(subject: URIRef, predicate: URIRef, spec_graph: Graph, source_node: Node) -> List[Node]:
    """Return every rdf:type declared for *source_node* in *spec_graph*.

    *subject* and *predicate* are only used in the error message.

    :raises ValueError: if the node declares no rdf:type at all.
    """
    found_types = list(spec_graph.objects(subject=source_node, predicate=RDF.type))
    if not found_types:
        raise ValueError(f"Node has no rdf type {subject} {predicate}")
    return found_types
206
+
207
+ # https://github.com/Semantic-partners/mustrd/issues/99
208
def get_spec_component_dispatch(spec_component_details: SpecComponentDetails) -> Tuple[Node, URIRef]:
    """Dispatch key for ``get_spec_component``: (data source type, predicate)."""
    details = spec_component_details
    return (details.data_source_type, details.predicate)
210
+
211
+
212
# Multimethod that loads one component; the handler is selected by the
# (data source type, predicate) pair returned by get_spec_component_dispatch.
get_spec_component = MultiMethod("get_spec_component", get_spec_component_dispatch)
213
+
214
+
215
@get_spec_component.method((MUST.InheritedDataset, MUST.given))
def _get_spec_component_inheritedstate_given(spec_component_details: SpecComponentDetails) -> GivenSpec:
    """Return a freshly initialised component with no data loaded into it."""
    return init_spec_component(spec_component_details.predicate)
219
+
220
+
221
@get_spec_component.method((MUST.FolderDataset, MUST.given))
def _get_spec_component_folderdatasource_given(spec_component_details: SpecComponentDetails) -> GivenSpec:
    """Load the given graph from MUST.fileName inside the configured given folder.

    The folder comes from ``get_path('given_path', run_config)``.

    :raises ValueError: if the file cannot be parsed; the original
        ``ParserError`` is chained as the cause so the details are not lost.
    """
    spec_component = init_spec_component(spec_component_details.predicate)

    file_name = spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
                                                        predicate=MUST.fileName)

    path = Path(os.path.join(str(get_path('given_path', spec_component_details.run_config)), str(file_name)))
    try:
        spec_component.value = Graph().parse(data=get_spec_component_from_file(path))
    except ParserError as e:
        log.error(f"Problem parsing {path}, error of type {type(e)}")
        raise ValueError(f"Problem parsing {path}, error of type {type(e)}") from e
    return spec_component
235
+
236
+
237
@get_spec_component.method((MUST.FolderSparqlSource, MUST.when))
def _get_spec_component_foldersparqlsource_when(spec_component_details: SpecComponentDetails) -> WhenSpec:
    """Load the query text from MUST.fileName inside the configured when folder.

    The folder comes from ``get_path('when_path', run_config)``. The return
    annotation previously said ``GivenSpec``, but this handler is registered
    for MUST.when and fills in ``value`` and ``queryType`` of a when component.
    """
    details = spec_component_details
    spec_component = init_spec_component(details.predicate)

    file_name = details.spec_graph.value(subject=details.spec_component_node,
                                         predicate=MUST.fileName)

    path = Path(os.path.join(str(get_path('when_path', details.run_config)), str(file_name)))
    spec_component.value = get_spec_component_from_file(path)
    spec_component.queryType = details.spec_graph.value(subject=details.spec_component_node,
                                                        predicate=MUST.queryType)
    return spec_component
249
+
250
+
251
@get_spec_component.method((MUST.FolderDataset, MUST.then))
def _get_spec_component_folderdatasource_then(spec_component_details: SpecComponentDetails) -> ThenSpec:
    """Load the expected result from MUST.fileName inside the configured then folder."""
    details = spec_component_details
    then_spec = init_spec_component(details.predicate)

    file_name = details.spec_graph.value(subject=details.spec_component_node,
                                         predicate=MUST.fileName)
    then_folder = str(get_path('then_path', details.run_config))
    path = Path(os.path.join(then_folder, str(file_name)))

    return load_dataset_from_file(path, then_spec)
260
+
261
@get_spec_component.method((MUST.FileDataset, MUST.given))
@get_spec_component.method((MUST.FileDataset, MUST.then))
def _get_spec_component_filedatasource(spec_component_details: SpecComponentDetails) -> GivenSpec:
    """Load a given or then component from the file named by MUST.file."""
    fresh_component = init_spec_component(spec_component_details.predicate)
    return load_spec_component(spec_component_details, fresh_component)
266
+
267
def load_spec_component(spec_component_details, spec_component):
    """Locate the file named by MUST.file and load it into *spec_component*.

    Candidate locations are tried in this order:

    1. relative to the directory of the spec's MUST.specSourceFile, if set;
    2. relative to ``run_config['spec_path']``.

    Absolute locations are intentionally not tried as such.

    If MUST.specSourceFile is missing, the problem is logged and only the
    second candidate is tried. Previously this case crashed with a
    ``TypeError`` in ``os.path.dirname(None)`` right after logging.

    :raises FileNotFoundError: if no candidate location exists.
    """
    where_did_i_load_this_spec_from = spec_component_details.spec_graph.value(subject=spec_component_details.subject,
                                                                              predicate=MUST.specSourceFile)
    file_path = Path(str(spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
                                                                 predicate=MUST.file)))

    paths = []
    if where_did_i_load_this_spec_from is None:
        log.error(f"{where_did_i_load_this_spec_from=} was None for test_spec={spec_component_details.subject}, we didn't set the test specifications specSourceFile when loading, spec_graph={spec_component_details.spec_graph}")
    else:
        # First choice: next to the file the specification was loaded from.
        test_spec_file_path = os.path.dirname(where_did_i_load_this_spec_from)
        paths.append(Path(test_spec_file_path, file_path))
    # Second choice: relative to the configured spec path.
    paths.append(Path(os.path.join(spec_component_details.run_config['spec_path'], file_path)))

    for path in paths:
        if os.path.exists(path):
            return load_dataset_from_file(path, spec_component)

    raise FileNotFoundError(f"Could not find file {file_path=} in any of the {paths=}")
289
+
290
+
291
def load_dataset_from_file(path: Path, spec_component: ThenSpec) -> ThenSpec:
    """Load *path* as either a table or an RDF graph.

    ``.csv``, ``.xlsx`` and ``.xls`` files are read with pandas and returned
    as a new ``TableThenSpec``; *spec_component* is not used in that case.
    Any other file is parsed as RDF, using the format guessed from the file
    name, and stored in ``spec_component.value``.

    :raises ValueError: if *path* is a directory, if no RDF format can be
        guessed for it, or if parsing fails. Previously an unrecognised
        format made the function silently return ``None``.
    """
    if path.is_dir():
        raise ValueError(f"Path {path} is a directory, expected a file")

    # https://github.com/Semantic-partners/mustrd/issues/94
    if path.suffix in {".csv", ".xlsx", ".xls"}:
        df = pandas.read_csv(path) if path.suffix == ".csv" else pandas.read_excel(path)
        then_spec = TableThenSpec()
        then_spec.value = df
        return then_spec

    try:
        file_format = util.guess_format(str(path))
    except AttributeError as e:
        raise ValueError(f"Unsupported file format: {path.suffix}") from e

    # guess_format signals an unknown extension with None, not an exception.
    if file_format is None:
        raise ValueError(f"Unsupported file format: {path.suffix}")

    g = Graph()
    try:
        g.parse(data=get_spec_component_from_file(path), format=file_format)
    except ParserError as e:
        log.error(f"Problem parsing {path}, error of type {type(e)}")
        raise ValueError(f"Problem parsing {path}, error of type {type(e)}") from e
    spec_component.value = g
    return spec_component
316
+
317
+
318
+
319
@get_spec_component.method((MUST.FileSparqlSource, MUST.when))
def _get_spec_component_filedatasource_when(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the query text from the file named by MUST.file.

    A path starting with ``/`` is used as is; any other path is resolved
    relative to ``run_config['spec_path']``.
    """
    details = spec_component_details
    when_spec = init_spec_component(details.predicate)

    configured_file = details.spec_graph.value(subject=details.spec_component_node,
                                               predicate=MUST.file)
    file_path = Path(str(configured_file))
    is_absolute = str(file_path).startswith("/")
    path = file_path if is_absolute else Path(os.path.join(details.run_config['spec_path'], file_path))

    when_spec.value = get_spec_component_from_file(path)
    when_spec.queryType = details.spec_graph.value(subject=details.spec_component_node,
                                                   predicate=MUST.queryType)
    return when_spec
335
+
336
+
337
+ # @get_spec_component.method((MUST.FileDataset, MUST.then))
338
+ # def _get_spec_component_filedatasource_then(spec_component_details: SpecComponentDetails) -> SpecComponent:
339
+ # spec_component = init_spec_component(spec_component_details.predicate)
340
+
341
+ # file_path = Path(str(spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
342
+ # predicate=MUST.file)))
343
+ # if str(file_path).startswith("/"): # absolute path
344
+ # path = file_path
345
+ # else: #relative path
346
+ # path = Path(os.path.join(spec_component_details.run_config['spec_path'], file_path))
347
+ # return get_then_from_file(path, spec_component)
348
+
349
+
350
@get_spec_component.method((MUST.TextSparqlSource, MUST.when))
def _get_spec_component_TextSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Build a when component from query text embedded in the spec graph.

    The query is read from MUST.queryText; variable bindings declared on the
    specification are attached as well.
    """
    details = spec_component_details
    when_spec = init_spec_component(details.predicate)

    query_text = details.spec_graph.value(subject=details.spec_component_node,
                                          predicate=MUST.queryText)
    when_spec.value = str(query_text)
    when_spec.bindings = get_when_bindings(details.subject, details.spec_graph)
    when_spec.queryType = details.spec_graph.value(subject=details.spec_component_node,
                                                   predicate=MUST.queryType)
    return when_spec
363
+
364
+
365
+ # https://github.com/Semantic-partners/mustrd/issues/98
366
@get_spec_component.method((MUST.HttpDataset, MUST.given))
@get_spec_component.method((MUST.HttpDataset, MUST.when))
@get_spec_component.method((MUST.HttpDataset, MUST.then))
def _get_spec_component_HttpDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Fetch the component's content with an HTTP GET on MUST.dataSourceUrl.

    Fixes a misplaced parenthesis: the original evaluated
    ``str(url).content`` and so raised ``AttributeError`` before any request
    was made. The response body is now read from the ``requests`` response.

    :raises requests.HTTPError: if the server answers with an error status, so
        that an error page is never mistaken for test data.
    """
    details = spec_component_details
    spec_component = init_spec_component(details.predicate)

    data_source_url = str(details.spec_graph.value(subject=details.spec_component_node,
                                                   predicate=MUST.dataSourceUrl))
    # NOTE(review): no timeout is set, so a stalled server blocks the run;
    # consider passing timeout= once an appropriate value is agreed.
    response = requests.get(data_source_url)
    response.raise_for_status()
    # NOTE(review): value is the raw response body (bytes), not a parsed
    # graph or query string - confirm what consumers expect (see issue 98).
    spec_component.value = response.content
    spec_component.queryType = details.spec_graph.value(subject=details.spec_component_node,
                                                        predicate=MUST.queryType)
    return spec_component
379
+
380
+
381
@get_spec_component.method((MUST.TableDataset, MUST.then))
def _get_spec_component_TableDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Build the expected table from rows declared inline in the spec graph."""
    details = spec_component_details
    expected_table = get_spec_from_table(details.subject, details.predicate, details.spec_graph)
    has_order = is_then_select_ordered(details.subject, details.predicate, details.spec_graph)
    return TableThenSpec(value=expected_table, ordered=has_order)
390
+
391
+
392
@get_spec_component.method((MUST.EmptyTable, MUST.then))
def _get_spec_component_EmptyTable(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Return a table component holding its default (empty) DataFrame."""
    return TableThenSpec()
396
+
397
+
398
@get_spec_component.method((MUST.EmptyGraph, MUST.then))
def _get_spec_component_EmptyGraph(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Return a freshly initialised component, leaving its default value untouched."""
    return init_spec_component(spec_component_details.predicate)
403
+
404
+
405
@get_spec_component.method((MUST.StatementsDataset, MUST.given))
@get_spec_component.method((MUST.StatementsDataset, MUST.then))
def _get_spec_component_StatementsDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Build a graph from rdf:Statement resources declared inline in the spec."""
    details = spec_component_details
    component = init_spec_component(details.predicate)

    serialized_statements = get_spec_from_statements(details.subject, details.predicate, details.spec_graph)
    component.value = Graph().parse(data=serialized_statements)
    return component
414
+
415
+
416
@get_spec_component.method((MUST.AnzoGraphmartDataset, MUST.given))
@get_spec_component.method((MUST.AnzoGraphmartDataset, MUST.then))
def _get_spec_component_AnzoGraphmartDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load a given or then component from an Anzo graphmart layer.

    :raises ValueError: if the configured triple store is not of type MUST.Anzo.
    """
    details = spec_component_details
    component = init_spec_component(details.predicate)

    if details.mustrd_triple_store["type"] != MUST.Anzo:
        raise ValueError(f"You must define {MUST.AnzoConfig} to use {MUST.AnzoGraphmartDataset}")

    # The graphmart and layer identify where the data lives in Anzo.
    graphmart = details.spec_graph.value(subject=details.spec_component_node,
                                         predicate=MUST.graphmart)
    layer = details.spec_graph.value(subject=details.spec_component_node,
                                     predicate=MUST.layer)
    component.value = get_spec_component_from_graphmart(
        triple_store=details.mustrd_triple_store,
        graphmart=graphmart,
        layer=layer)

    return component
435
+
436
+
437
@get_spec_component.method((MUST.AnzoQueryBuilderSparqlSource, MUST.when))
def _get_spec_component_AnzoQueryBuilderSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the when query from the Anzo query builder by folder and query name.

    :raises ValueError: if the configured triple store is not of type MUST.Anzo.
    """
    details = spec_component_details
    when_spec = init_spec_component(details.predicate)

    # This source only makes sense when an Anzo triple store is configured.
    if details.mustrd_triple_store["type"] != MUST.Anzo:
        raise ValueError(f"You must define {MUST.AnzoConfig} to use {MUST.AnzoQueryBuilderSparqlSource}")

    query_folder = details.spec_graph.value(subject=details.spec_component_node,
                                            predicate=MUST.queryFolder)
    query_name = details.spec_graph.value(subject=details.spec_component_node,
                                          predicate=MUST.queryName)
    when_spec.value = get_query_from_querybuilder(triple_store=details.mustrd_triple_store,
                                                  folder_name=query_folder,
                                                  query_name=query_name)

    when_spec.queryType = details.spec_graph.value(subject=details.spec_component_node,
                                                   predicate=MUST.queryType)
    return when_spec
457
+
458
+
459
@get_spec_component.method((MUST.AnzoGraphmartStepSparqlSource, MUST.when))
def _get_spec_component_AnzoGraphmartStepSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the when query from the Anzo step named by MUST.anzoQueryStep.

    :raises ValueError: if the configured triple store is not of type MUST.Anzo.
    """
    details = spec_component_details
    when_spec = init_spec_component(details.predicate)

    # This source only makes sense when an Anzo triple store is configured.
    if details.mustrd_triple_store["type"] != MUST.Anzo:
        raise ValueError(f"You must define {MUST.AnzoConfig} to use {MUST.AnzoGraphmartStepSparqlSource}")

    query_step_uri = details.spec_graph.value(subject=details.spec_component_node,
                                              predicate=MUST.anzoQueryStep)
    when_spec.value = get_query_from_step(triple_store=details.mustrd_triple_store,
                                          query_step_uri=query_step_uri)

    when_spec.queryType = details.spec_graph.value(subject=details.spec_component_node,
                                                   predicate=MUST.queryType)
    return when_spec
476
+
477
@get_spec_component.method((MUST.AnzoGraphmartQueryDrivenTemplatedStepSparqlSource, MUST.when))
def _get_spec_component_AnzoGraphmartQueryDrivenTemplatedStepSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the parameter query and query template of an Anzo templated step.

    :raises ValueError: if the configured triple store is not of type MUST.Anzo.
    """
    details = spec_component_details
    store_type = details.mustrd_triple_store["type"]
    when_spec = init_spec_component(details.predicate, store_type)

    # This source only makes sense when an Anzo triple store is configured.
    if store_type != MUST.Anzo:
        raise ValueError(f"You must define {MUST.AnzoConfig} to use {MUST.AnzoGraphmartQueryDrivenTemplatedStepSparqlSource}")

    query_step_uri = details.spec_graph.value(subject=details.spec_component_node,
                                              predicate=MUST.anzoQueryStep)
    queries = get_queries_from_templated_step(triple_store=details.mustrd_triple_store,
                                              query_step_uri=query_step_uri)
    when_spec.paramQuery = queries["param_query"]
    when_spec.queryTemplate = queries["query_template"]

    when_spec.queryType = details.spec_graph.value(subject=details.spec_component_node,
                                                   predicate=MUST.queryType)
    return when_spec
496
+
497
@get_spec_component.method((MUST.AnzoGraphmartLayerSparqlSource, MUST.when))
def _get_spec_component_AnzoGraphmartLayerSparqlSource(spec_component_details: SpecComponentDetails) -> list:
    """Return one when component per query of an Anzo graphmart layer, in order.

    Entries that carry a plain "query" take the queryType declared in the
    spec; entries without one are marked as MUST.AnzoQueryDrivenUpdateSparql.

    :raises ValueError: if the configured triple store is not of type MUST.Anzo.
    """
    details = spec_component_details
    if details.mustrd_triple_store["type"] != MUST.Anzo:
        raise ValueError("This test specification is specific to Anzo and can only be run against that platform.")

    graphmart_layer_uri = details.spec_graph.value(subject=details.spec_component_node,
                                                   predicate=MUST.anzoGraphmartLayer)
    layer_queries = get_queries_for_layer(triple_store=details.mustrd_triple_store,
                                          graphmart_layer_uri=graphmart_layer_uri)

    when_specs = []
    for layer_query in layer_queries:
        when_spec = init_spec_component(details.predicate, details.mustrd_triple_store["type"])
        when_spec.value = layer_query.get("query")
        when_spec.paramQuery = layer_query.get("param_query")
        when_spec.queryTemplate = layer_query.get("query_template")
        if when_spec.value:
            when_spec.queryType = details.spec_graph.value(subject=details.spec_component_node,
                                                           predicate=MUST.queryType)
        else:
            when_spec.queryType = MUST.AnzoQueryDrivenUpdateSparql
        when_specs.append(when_spec)
    return when_specs
521
+
522
@get_spec_component.method(Default)
def _get_spec_component_default(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Fallback for unsupported (data source type, predicate) pairs; always raises ValueError."""
    details = spec_component_details
    message = (f"Invalid combination of data source type ({details.data_source_type}) and "
               f"spec component ({details.predicate})")
    raise ValueError(message)
527
+
528
+
529
def init_spec_component(predicate: URIRef, triple_store_type: URIRef = None ) -> GivenSpec | WhenSpec | ThenSpec | TableThenSpec:
    """Create an empty component of the class matching *predicate*.

    MUST.given gives a ``GivenSpec`` and MUST.then a ``ThenSpec``. MUST.when
    gives an ``AnzoWhenSpec`` when *triple_store_type* is MUST.Anzo and a
    plain ``WhenSpec`` otherwise. Any other predicate gives a bare
    ``SpecComponent``.
    """
    if predicate == MUST.given:
        return GivenSpec()
    if predicate == MUST.when:
        return AnzoWhenSpec() if triple_store_type == MUST.Anzo else WhenSpec()
    if predicate == MUST.then:
        return ThenSpec()
    return SpecComponent()
542
+
543
+
544
def get_spec_component_nodes(subject: URIRef, predicate: URIRef, spec_graph: Graph) -> List[Node]:
    """Return every object node of (subject, predicate) in *spec_graph*.

    :raises ValueError: if there is no such node. The original guard tested
        ``is None`` on a list, which can never be true, so an empty result
        slipped through and failed later with an ``IndexError``.
    """
    spec_component_nodes = list(spec_graph.objects(subject=subject, predicate=predicate))
    # It shouldn't even be possible to get this far as an empty node indicates an invalid RDF file
    if not spec_component_nodes:
        raise ValueError(f"specComponent Node empty for {subject} {predicate}")
    return spec_component_nodes
552
+
553
+
554
def get_spec_component_from_file(path: Path) -> str:
    """Return the text content of the file at *path*.

    :raises ValueError: if *path* is a directory.
    :raises FileNotFoundError: if *path* does not exist.
    """
    if path.is_dir():
        raise ValueError(f"Path {path} is a directory, expected a file")

    # A missing file propagates as FileNotFoundError from read_text().
    return str(path.read_text())
566
+
567
+
568
def get_spec_from_statements(subject: URIRef,
                             predicate: URIRef,
                             spec_graph: Graph) -> str:
    """Return the inline statements of a StatementsDataset, serialized as Turtle.

    A CONSTRUCT query rebuilds one triple for each rdf:Statement attached to
    the dataset through MUST.hasStatement; the resulting graph is serialized
    with ``format="ttl"``.

    The return annotation previously said ``Graph``, but the function returns
    the serialization, which the caller parses back into a graph.
    """
    statements_query = f"""
    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

    CONSTRUCT {{ ?s ?p ?o }}
    {{
            <{subject}> <{predicate}> [
                a <{MUST.StatementsDataset}> ;
                <{MUST.hasStatement}> [
                    a rdf:Statement ;
                    rdf:subject ?s ;
                    rdf:predicate ?p ;
                    rdf:object ?o ;
                ] ;
            ]

    }}
    """
    results = spec_graph.query(statements_query).graph
    return results.serialize(format="ttl")
590
+
591
+
592
def get_spec_from_table(subject: URIRef,
                        predicate: URIRef,
                        spec_graph: Graph) -> pandas.DataFrame:
    """Build the expected-results DataFrame from an inline TableDataset.

    Each MUST.hasRow node becomes one row. Each binding of that row fills a
    column named after the variable, plus a "<variable>_datatype" column
    holding the literal's datatype (xsd:string when the literal has none,
    xsd:anyURI for non-literal values). Rows are sorted by their optional
    sh:order value, the helper "order" column is dropped, the index is
    reset to a numeric one and missing cells are replaced by ''.
    """
    # query the spec to get the expected result to convert to dataframe for comparison
    then_query = f"""
    prefix sh:        <http://www.w3.org/ns/shacl#>
    SELECT ?row ?variable ?binding ?order
    WHERE {{
         <{subject}> <{predicate}> [
                a <{MUST.TableDataset}> ;
                <{MUST.hasRow}> ?row ].
                ?row <{MUST.hasBinding}> [
                        <{MUST.variable}> ?variable ;
                        <{MUST.boundValue}> ?binding ; ] .
                OPTIONAL {{ ?row sh:order ?order . }}
                .}}
    ORDER BY ?order"""

    expected_results = spec_graph.query(then_query)
    # get the unique row ids form the result to form the index of the results dataframe
    index = {str(row.row) for row in expected_results}
    # get the unique variables to form the columns of the results dataframe
    # (built as a set, so the resulting column order is not fixed)
    columns = set()
    for row in expected_results:
        columns.add(row.variable.value)
        columns.add(row.variable.value + "_datatype")
    # add an additional column for the sort order (if any) of the results
    columns.add("order")
    # create an empty dataframe to populate with the results
    df = pandas.DataFrame(index=list(index), columns=list(columns))
    # fill the dataframe with the results data
    for row in expected_results:
        df.loc[str(row.row), row.variable.value] = str(row.binding)
        df.loc[str(row.row), "order"] = row.order
        if type(row.binding) == Literal:
            # literals without an explicit datatype are recorded as xsd:string
            literal_type = str(XSD.string)
            if hasattr(row.binding, "datatype") and row.binding.datatype:
                literal_type = str(row.binding.datatype)
            df.loc[str(row.row), row.variable.value + "_datatype"] = literal_type
        else:
            # anything that is not a Literal is recorded as xsd:anyURI
            df.loc[str(row.row), row.variable.value + "_datatype"] = str(XSD.anyURI)
    # use the sort order sort the results
    df.sort_values(by="order", inplace=True)
    # drop the order column and replace the rowid index with a numeric one and replace empty values with spaces
    df.drop(columns="order", inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.fillna('', inplace=True)
    return df
640
+
641
+
642
def get_when_bindings(subject: URIRef,
                      spec_graph: Graph) -> dict:
    """Return the variable bindings declared on a TextSparqlSource when component.

    The result maps ``Variable(<variable name>)`` to the bound value and is
    empty when the specification declares no bindings.
    """
    when_bindings_query = f"""SELECT ?variable ?binding {{ <{subject}> <{MUST.when}> [ a <{MUST.TextSparqlSource}> ; <{MUST.hasBinding}> [ <{MUST.variable}> ?variable ; <{MUST.boundValue}> ?binding ; ] ; ] ;}}"""
    when_bindings = spec_graph.query(when_bindings_query)

    if not len(when_bindings.bindings):
        return {}
    return {Variable(found.variable.value): found.binding for found in when_bindings}
654
+
655
+
656
def is_then_select_ordered(subject: URIRef, predicate: URIRef, spec_graph: Graph) -> bool:
    """Tell whether every row of an inline TableDataset carries an sh:order.

    The ASK query counts all bindings of the table and the bindings that sit
    on rows having an sh:order value; the table counts as ordered when the
    two counts are equal.

    The query now declares the ``sh:`` prefix itself, as get_spec_from_table
    does. Without the declaration it only parsed when *spec_graph* happened
    to have that prefix bound.
    """
    ask_select_ordered = f"""
    prefix sh: <http://www.w3.org/ns/shacl#>
    ASK {{
        {{SELECT (count(?binding) as ?totalBindings) {{
            <{subject}> <{predicate}> [
                a <{MUST.TableDataset}> ;
                <{MUST.hasRow}> [ <{MUST.hasBinding}> [
                                    <{MUST.variable}> ?variable ;
                                    <{MUST.boundValue}> ?binding ;
                                ] ;
                            ]
            ]
        }} }}
        {{SELECT (count(?binding) as ?orderedBindings) {{
            <{subject}> <{predicate}> [
                a <{MUST.TableDataset}> ;
                <{MUST.hasRow}> [ sh:order ?order ;
                            <{MUST.hasBinding}> [
                                    <{MUST.variable}> ?variable ;
                                    <{MUST.boundValue}> ?binding ;
                                ] ;
                            ]
            ]
        }} }}
        FILTER(?totalBindings = ?orderedBindings)
    }}"""
    is_ordered = spec_graph.query(ask_select_ordered)
    return is_ordered.askAnswer