mustrd 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mustrd/spec_component.py CHANGED
@@ -1,690 +1,690 @@
1
- """
2
- MIT License
3
-
4
- Copyright (c) 2023 Semantic Partners Ltd
5
-
6
- Permission is hereby granted, free of charge, to any person obtaining a copy
7
- of this software and associated documentation files (the "Software"), to deal
8
- in the Software without restriction, including without limitation the rights
9
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
- copies of the Software, and to permit persons to whom the Software is
11
- furnished to do so, subject to the following conditions:
12
-
13
- The above copyright notice and this permission notice shall be included in all
14
- copies or substantial portions of the Software.
15
-
16
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
- SOFTWARE.
23
- """
24
-
25
- import os
26
- from dataclasses import dataclass, field
27
- from pathlib import Path
28
- from typing import Tuple, List, Type
29
-
30
- import pandas
31
- import requests
32
- from rdflib import RDF, Graph, URIRef, Variable, Literal, XSD, util
33
- from rdflib.exceptions import ParserError
34
- from rdflib.term import Node
35
- import logging
36
-
37
- from . import logger_setup
38
- from .mustrdAnzo import get_queries_for_layer, get_queries_from_templated_step, get_spec_component_from_graphmart, get_query_from_querybuilder, get_query_from_step
39
- from .namespace import MUST, TRIPLESTORE
40
- from multimethods import MultiMethod, Default
41
- from .utils import get_mustrd_root
42
-
43
- log = logger_setup.setup_logger(__name__)
44
-
45
-
46
@dataclass
class SpecComponent:
    """Common base class for the given / when / then parts of a test spec."""
49
-
50
-
51
@dataclass
class GivenSpec(SpecComponent):
    """Component produced for the MUST.given clause of a specification."""

    # Graph of input triples; stays None until a loader assigns one.
    value: Graph = None
54
-
55
-
56
@dataclass
class WhenSpec(SpecComponent):
    """Component produced for the MUST.when clause of a specification."""

    # Query text obtained by the loader (file, inline text, remote source...).
    value: str = None
    # Object of MUST.queryType on the component node in the spec graph.
    queryType: URIRef = None
    # Variable -> bound value mapping; only the inline-text loader fills it in.
    bindings: dict = None
61
-
62
@dataclass
class AnzoWhenSpec(WhenSpec):
    """WhenSpec variant created when the triple store type is TRIPLESTORE.Anzo."""

    # "param_query" entry returned by the Anzo templated-step / layer helpers.
    paramQuery: str = None
    # "query_template" entry returned by the same helpers.
    queryTemplate: str = None
66
-
67
@dataclass
class ThenSpec(SpecComponent):
    """Component produced for the MUST.then clause when the expectation is a graph."""

    # Expected graph. A factory is used so every instance gets its own empty
    # Graph; a plain `Graph()` default would be one object shared by all
    # instances, so mutating one expectation would leak into the others.
    value: Graph = field(default_factory=Graph)
    # Whether the expected result order matters; table loaders set this.
    ordered: bool = False
71
-
72
-
73
@dataclass
class TableThenSpec(ThenSpec):
    """ThenSpec whose expectation is tabular rather than a graph."""

    # Expected rows; each instance starts with its own empty DataFrame.
    value: pandas.DataFrame = field(default_factory=pandas.DataFrame)
76
-
77
-
78
@dataclass
class SpecComponentDetails:
    """Bundle of inputs handed to the get_spec_component loaders.

    parse_spec_component builds one instance for every combination of
    component node and rdf:type found on that node.
    """

    # Test specification the component belongs to.
    subject: URIRef
    # Clause being parsed (loaders are registered for MUST.given / when / then).
    predicate: URIRef
    # Graph holding the specification triples.
    spec_graph: Graph
    # Triple store configuration; loaders read its "type" entry.
    mustrd_triple_store: dict
    # Object of (subject, predicate) describing the data source.
    spec_component_node: Node
    # One rdf:type of spec_component_node; used as part of the dispatch key.
    data_source_type: Node
    # Run configuration; may hold path prefixes such as 'given_path'.
    run_config: dict
    # Candidate root directories, in priority order, for resolving files.
    root_paths: list
88
-
89
def get_path(path_type: str, file_name, spec_component_details: SpecComponentDetails) -> Path:
    """Resolve a file name from the spec to an existing absolute path.

    Args:
        path_type: Key looked up in the run configuration (for example
            'given_path'); when present, its value is prepended to file_name.
        file_name: File name read from the spec graph.
        spec_component_details: Supplies the run configuration and root paths.

    Returns:
        The first existing candidate found by get_file_absolute_path.
    """
    run_config = spec_component_details.run_config
    # run_config may be None or empty (get_components_roots guards for the same
    # case); there is then no prefix to apply instead of a TypeError on `in`.
    if run_config and path_type in run_config:
        relative_path = os.path.join(run_config[path_type], file_name)
    else:
        relative_path = file_name
    return get_file_absolute_path(spec_component_details, relative_path)
95
-
96
-
97
def parse_spec_component(subject: URIRef,
                         predicate: URIRef,
                         spec_graph: Graph,
                         run_config: dict,
                         mustrd_triple_store: dict) -> GivenSpec | WhenSpec | ThenSpec | TableThenSpec:
    """Load every component attached to (subject, predicate) and combine them.

    Each object node of the triple is loaded once per rdf:type it carries.
    A loader may return a single component or a list of components; all of
    them are gathered into one flat list and passed to combine_specs.
    """
    collected = []
    for node in get_spec_component_nodes(subject, predicate, spec_graph):
        for source_type in get_data_source_types(subject, predicate, spec_graph, node):
            details = SpecComponentDetails(
                subject=subject,
                predicate=predicate,
                spec_graph=spec_graph,
                mustrd_triple_store=mustrd_triple_store,
                spec_component_node=node,
                data_source_type=source_type,
                run_config=run_config,
                root_paths=get_components_roots(spec_graph, subject, run_config))
            loaded = get_spec_component(details)
            # Some loaders return several components at once.
            if type(loaded) is list:
                collected.extend(loaded)
            else:
                collected.append(loaded)
    return combine_specs(collected)
129
-
130
# Here we retrieve all the possible root paths for a specification component.
# This defines the order of priority between root paths, which is:
# 1) Path where the spec is located
# 2) spec_path defined in mustrd test configuration files or cmd line argument
# 3) data_path defined in mustrd test configuration files or cmd line argument
# 4) Mustrd source folder: in case of default resources packaged with mustrd
#    source (will be in venv when mustrd is called as a library)
# We intentionally don't try absolute file paths, but feel free to argue that we should.
def get_components_roots(spec_graph: Graph, subject: URIRef, run_config: dict) -> list:
    """Return the candidate root directories for a spec, highest priority first.

    Args:
        spec_graph: Graph searched for the spec's MUST.specSourceFile value.
        subject: The test specification.
        run_config: Optional configuration; 'spec_path' and 'data_path' are
            used when present.

    Returns:
        A list of roots; the mustrd root is always the last entry.
    """
    where_did_i_load_this_spec_from = spec_graph.value(subject=subject,
                                                       predicate=MUST.specSourceFile)
    roots = []
    if where_did_i_load_this_spec_from is None:
        # Not fatal: the remaining roots may still resolve the file.
        log.error(f"{where_did_i_load_this_spec_from=} was None for test_spec={subject}, we didn't set the test specifications specSourceFile when loading, spec_graph={spec_graph}")
    else:
        roots.append(Path(os.path.dirname(where_did_i_load_this_spec_from)))
    if run_config and 'spec_path' in run_config:
        roots.append(Path(run_config['spec_path']))
    if run_config and 'data_path' in run_config:
        # Wrapped in Path so this entry has the same type as the spec roots above.
        roots.append(Path(run_config['data_path']))
    roots.append(get_mustrd_root())

    return roots
152
-
153
-
154
# From the list of potential component roots, return the first path that exists
def get_file_absolute_path(spec_component_details: SpecComponentDetails, relative_file_path: str):
    """Join relative_file_path onto each root path and return the first hit.

    Raises:
        ValueError: relative_file_path is empty or None.
        FileNotFoundError: none of the candidate paths exists.
    """
    if not relative_file_path:
        raise ValueError("Cannot get absolute path of None")
    absolute_file_paths = [
        Path(os.path.join(root, relative_file_path))
        for root in spec_component_details.root_paths
    ]
    # Roots are ordered by priority, so the first existing candidate wins.
    first_existing = next((candidate for candidate in absolute_file_paths
                           if os.path.exists(candidate)), None)
    if first_existing is None:
        raise FileNotFoundError(f"Could not find file {relative_file_path=} in any of the {absolute_file_paths=}")
    return first_existing
163
-
164
def get_spec_component_type(spec_components: List[SpecComponent]) -> Type[SpecComponent]:
    """Return the exact class shared by every component in the list.

    Raises:
        ValueError: at least one component has a different class than the first.
    """
    expected = type(spec_components[0])
    # Exact class comparison: a subclass instance counts as a different type.
    if any(type(other) != expected for other in spec_components[1:]):
        raise ValueError("All spec components must be of the same type")
    return expected
175
-
176
-
177
def combine_specs_dispatch(spec_components: List[SpecComponent]) -> Type[SpecComponent]:
    """Dispatch key for combine_specs: the common class of the components."""
    return get_spec_component_type(spec_components)


# Multimethod that merges the components parsed for one clause; one
# implementation is registered per component class.
combine_specs = MultiMethod("combine_specs", combine_specs_dispatch)
183
-
184
-
185
@combine_specs.method(GivenSpec)
def _combine_given_specs(spec_components: List[GivenSpec]) -> GivenSpec:
    """Return the single given component, or a new one holding all graphs merged."""
    if len(spec_components) == 1:
        return spec_components[0]
    merged = Graph()
    for component in spec_components:
        merged += component.value
    return GivenSpec(value=merged)
196
-
197
-
198
@combine_specs.method(WhenSpec)
def _combine_when_specs(spec_components: List[WhenSpec]) -> WhenSpec:
    """Return the when components untouched.

    Note: the list itself is returned (no merging), despite the annotation.
    """
    return spec_components
201
-
202
-
203
@combine_specs.method(ThenSpec)
def _combine_then_specs(spec_components: List[ThenSpec]) -> ThenSpec:
    """Return the single then component, or a new one holding all graphs merged."""
    if len(spec_components) == 1:
        return spec_components[0]
    merged = Graph()
    for component in spec_components:
        merged += component.value
    return ThenSpec(value=merged)
214
-
215
-
216
@combine_specs.method(TableThenSpec)
def _combine_table_then_specs(spec_components: List[TableThenSpec]) -> TableThenSpec:
    """Return the only table expectation; several tables cannot be combined.

    Raises:
        ValueError: the list does not contain exactly one component.
    """
    if len(spec_components) == 1:
        return spec_components[0]
    raise ValueError("Parsing of multiple components of MUST.then for tables not implemented")
221
-
222
-
223
@combine_specs.method(Default)
def _combine_specs_default(spec_components: List[SpecComponent]):
    """Fallback for component classes without a registered combiner; always raises."""
    message = f"Parsing of multiple components of this type not implemented {spec_components}"
    raise ValueError(message)
226
-
227
def get_data_source_types(subject: URIRef, predicate: URIRef, spec_graph: Graph, source_node: Node) -> List[Node]:
    """Return every rdf:type declared on source_node.

    subject and predicate are only used to build the error message.

    Raises:
        ValueError: source_node has no rdf:type in spec_graph.
    """
    found_types = list(spec_graph.objects(subject=source_node, predicate=RDF.type))
    if not found_types:
        raise ValueError(f"Node has no rdf type {subject} {predicate}")
    return found_types
235
-
236
# https://github.com/Semantic-partners/mustrd/issues/99
def get_spec_component_dispatch(spec_component_details: SpecComponentDetails) -> Tuple[Node, URIRef]:
    """Dispatch key for get_spec_component: (data source type, clause predicate)."""
    details = spec_component_details
    return details.data_source_type, details.predicate


# Multimethod that loads one spec component; implementations are registered
# per (data source type, predicate) pair.
get_spec_component = MultiMethod("get_spec_component", get_spec_component_dispatch)
242
-
243
-
244
@get_spec_component.method((MUST.InheritedDataset, MUST.given))
def _get_spec_component_inheritedstate_given(spec_component_details: SpecComponentDetails) -> GivenSpec:
    """Return a freshly initialised component; nothing is loaded into it."""
    return init_spec_component(spec_component_details.predicate)
248
-
249
-
250
@get_spec_component.method((MUST.FolderDataset, MUST.given))
def _get_spec_component_folderdatasource_given(spec_component_details: SpecComponentDetails) -> GivenSpec:
    """Load the given graph from the MUST.fileName file under 'given_path'.

    Raises:
        ValueError: the file content could not be parsed (ParserError).
    """
    details = spec_component_details
    given = init_spec_component(details.predicate)
    file_name = details.spec_graph.value(subject=details.spec_component_node,
                                         predicate=MUST.fileName)
    path = get_path('given_path', file_name, details)
    try:
        given.value = Graph().parse(data=get_spec_component_from_file(path))
    except ParserError as e:
        message = f"Problem parsing {path}, error of type {type(e)}"
        log.error(message)
        raise ValueError(message)
    return given
264
-
265
-
266
@get_spec_component.method((MUST.FolderSparqlSource, MUST.when))
def _get_spec_component_foldersparqlsource_when(spec_component_details: SpecComponentDetails) -> GivenSpec:
    """Load the query text from the MUST.fileName file under 'when_path'.

    The component's queryType is read from MUST.queryType on the same node.
    """
    details = spec_component_details
    node = details.spec_component_node
    when = init_spec_component(details.predicate)
    file_name = details.spec_graph.value(subject=node, predicate=MUST.fileName)
    when.value = get_spec_component_from_file(get_path('when_path', file_name, details))
    when.queryType = details.spec_graph.value(subject=node, predicate=MUST.queryType)
    return when
278
-
279
-
280
@get_spec_component.method((MUST.FolderDataset, MUST.then))
def _get_spec_component_folderdatasource_then(spec_component_details: SpecComponentDetails) -> ThenSpec:
    """Load the expected dataset from the MUST.fileName file under 'then_path'."""
    details = spec_component_details
    then = init_spec_component(details.predicate)
    file_name = details.spec_graph.value(subject=details.spec_component_node,
                                         predicate=MUST.fileName)
    return load_dataset_from_file(get_path('then_path', file_name, details), then)
289
-
290
@get_spec_component.method((MUST.FileDataset, MUST.given))
@get_spec_component.method((MUST.FileDataset, MUST.then))
def _get_spec_component_filedatasource(spec_component_details: SpecComponentDetails) -> GivenSpec:
    """Load a given or then dataset from the file named by MUST.file."""
    fresh_component = init_spec_component(spec_component_details.predicate)
    return load_spec_component(spec_component_details, fresh_component)
295
-
296
def load_spec_component(spec_component_details, spec_component):
    """Resolve the node's MUST.file value against the root paths and load it."""
    declared_file = spec_component_details.spec_graph.value(
        subject=spec_component_details.spec_component_node,
        predicate=MUST.file)
    file_path = Path(str(declared_file))
    absolute_path = get_file_absolute_path(spec_component_details, file_path)
    return load_dataset_from_file(absolute_path, spec_component)
300
-
301
-
302
def load_dataset_from_file(path: Path, spec_component: ThenSpec) -> ThenSpec:
    """Load a dataset file as a table or as an RDF graph.

    Args:
        path: File to load; the suffix decides how it is read.
        spec_component: Component that receives the graph for RDF files.

    Returns:
        A new TableThenSpec for .csv/.xlsx/.xls files, otherwise
        spec_component with its value set to the parsed graph.

    Raises:
        ValueError: path is a directory, the format cannot be determined from
            the file name, or the RDF content cannot be parsed.
    """
    if path.is_dir():
        raise ValueError(f"Path {path} is a directory, expected a file")

    # https://github.com/Semantic-partners/mustrd/issues/94
    if path.suffix in {".csv", ".xlsx", ".xls"}:
        df = pandas.read_csv(path) if path.suffix == ".csv" else pandas.read_excel(path)
        then_spec = TableThenSpec()
        then_spec.value = df
        return then_spec

    try:
        file_format = util.guess_format(str(path))
    except AttributeError:
        raise ValueError(f"Unsupported file format: {path.suffix}")
    # guess_format signals an unknown extension by returning None; report it
    # instead of falling through without returning a component.
    if file_format is None:
        raise ValueError(f"Unsupported file format: {path.suffix}")

    g = Graph()
    try:
        g.parse(data=get_spec_component_from_file(path), format=file_format)
    except ParserError as e:
        log.error(f"Problem parsing {path}, error of type {type(e)}")
        raise ValueError(f"Problem parsing {path}, error of type {type(e)}")
    spec_component.value = g
    return spec_component
327
-
328
-
329
-
330
@get_spec_component.method((MUST.FileSparqlSource, MUST.when))
def _get_spec_component_filedatasource_when(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the query text from the file named by MUST.file.

    The component's queryType is read from MUST.queryType on the same node.
    """
    details = spec_component_details
    node = details.spec_component_node
    when = init_spec_component(details.predicate)
    file_path = Path(str(details.spec_graph.value(subject=node, predicate=MUST.file)))
    when.value = get_spec_component_from_file(get_file_absolute_path(details, file_path))
    when.queryType = details.spec_graph.value(subject=node, predicate=MUST.queryType)
    return when
342
-
343
-
344
- # @get_spec_component.method((MUST.FileDataset, MUST.then))
345
- # def _get_spec_component_filedatasource_then(spec_component_details: SpecComponentDetails) -> SpecComponent:
346
- # spec_component = init_spec_component(spec_component_details.predicate)
347
-
348
- # file_path = Path(str(spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
349
- # predicate=MUST.file)))
350
- # if str(file_path).startswith("/"): # absolute path
351
- # path = file_path
352
- # else: #relative path
353
- # path = Path(os.path.join(spec_component_details.run_config['spec_path'], file_path))
354
- # return get_then_from_file(path, spec_component)
355
-
356
-
357
@get_spec_component.method((MUST.TextSparqlSource, MUST.when))
def _get_spec_component_TextSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Build a when component from query text embedded in the spec itself.

    The text comes from MUST.queryText, the bindings from get_when_bindings
    and the query type from MUST.queryType.
    """
    details = spec_component_details
    node = details.spec_component_node
    when = init_spec_component(details.predicate)
    inline_text = details.spec_graph.value(subject=node, predicate=MUST.queryText)
    when.value = str(inline_text)
    when.bindings = get_when_bindings(details.subject, details.spec_graph)
    when.queryType = details.spec_graph.value(subject=node, predicate=MUST.queryType)
    return when
370
-
371
-
372
# https://github.com/Semantic-partners/mustrd/issues/98
@get_spec_component.method((MUST.HttpDataset, MUST.given))
@get_spec_component.method((MUST.HttpDataset, MUST.when))
@get_spec_component.method((MUST.HttpDataset, MUST.then))
def _get_spec_component_HttpDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Fetch the component's content with an HTTP GET on MUST.dataSourceUrl.

    The component's value is the raw response body (bytes); its queryType is
    read from MUST.queryType on the same node.
    """
    details = spec_component_details
    node = details.spec_component_node
    spec_component = init_spec_component(details.predicate)

    url = str(details.spec_graph.value(subject=node, predicate=MUST.dataSourceUrl))
    # `.content` belongs to the response. It used to be read from the URL
    # string, which raised AttributeError before any request was made.
    spec_component.value = requests.get(url).content
    spec_component.queryType = details.spec_graph.value(subject=node,
                                                        predicate=MUST.queryType)
    return spec_component
386
-
387
-
388
@get_spec_component.method((MUST.TableDataset, MUST.then))
def _get_spec_component_TableDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Build a table expectation from rows declared directly in the spec graph."""
    details = spec_component_details
    lookup_args = (details.subject, details.predicate, details.spec_graph)
    expected = TableThenSpec()
    expected.value = get_spec_from_table(*lookup_args)
    expected.ordered = is_then_select_ordered(*lookup_args)
    return expected
397
-
398
-
399
@get_spec_component.method((MUST.EmptyTable, MUST.then))
def _get_spec_component_EmptyTable(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Return a table expectation left at its defaults (an empty DataFrame)."""
    return TableThenSpec()
403
-
404
-
405
@get_spec_component.method((MUST.EmptyGraph, MUST.then))
def _get_spec_component_EmptyGraph(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Return a freshly initialised component; nothing is loaded into it."""
    return init_spec_component(spec_component_details.predicate)
410
-
411
-
412
@get_spec_component.method((MUST.StatementsDataset, MUST.given))
@get_spec_component.method((MUST.StatementsDataset, MUST.then))
def _get_spec_component_StatementsDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Build the component's graph from the statements declared in the spec."""
    details = spec_component_details
    component = init_spec_component(details.predicate)
    serialized = get_spec_from_statements(details.subject, details.predicate,
                                          details.spec_graph)
    component.value = Graph().parse(data=serialized)
    return component
421
-
422
-
423
@get_spec_component.method((MUST.AnzoGraphmartDataset, MUST.given))
@get_spec_component.method((MUST.AnzoGraphmartDataset, MUST.then))
def _get_spec_component_AnzoGraphmartDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load a given or then dataset from an Anzo graphmart (and layer).

    Raises:
        ValueError: the configured triple store is not Anzo.
    """
    details = spec_component_details
    component = init_spec_component(details.predicate)

    if details.mustrd_triple_store["type"] != TRIPLESTORE.Anzo:
        raise ValueError(f"You must define {TRIPLESTORE.Anzo} to use {MUST.AnzoGraphmartDataset}")

    node = details.spec_component_node
    graphmart = details.spec_graph.value(subject=node, predicate=MUST.graphmart)
    layer = details.spec_graph.value(subject=node, predicate=MUST.layer)
    component.value = get_spec_component_from_graphmart(
        triple_store=details.mustrd_triple_store,
        graphmart=graphmart,
        layer=layer)
    return component
442
-
443
-
444
@get_spec_component.method((MUST.AnzoQueryBuilderSparqlSource, MUST.when))
def _get_spec_component_AnzoQueryBuilderSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the when query from the Anzo query builder by folder and name.

    Raises:
        ValueError: the configured triple store is not Anzo.
    """
    details = spec_component_details
    node = details.spec_component_node
    when = init_spec_component(details.predicate)

    # Anzo-specific source requested without an Anzo triple store.
    if details.mustrd_triple_store["type"] != TRIPLESTORE.Anzo:
        raise ValueError(f"You must define {TRIPLESTORE.Anzo} to use {MUST.AnzoQueryBuilderSparqlSource}")

    when.value = get_query_from_querybuilder(
        triple_store=details.mustrd_triple_store,
        folder_name=details.spec_graph.value(subject=node, predicate=MUST.queryFolder),
        query_name=details.spec_graph.value(subject=node, predicate=MUST.queryName))
    when.queryType = details.spec_graph.value(subject=node, predicate=MUST.queryType)
    return when
464
-
465
-
466
@get_spec_component.method((MUST.AnzoGraphmartStepSparqlSource, MUST.when))
def _get_spec_component_AnzoGraphmartStepSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the when query from an Anzo graphmart query step.

    Raises:
        ValueError: the configured triple store is not Anzo.
    """
    details = spec_component_details
    node = details.spec_component_node
    when = init_spec_component(details.predicate)

    # Anzo-specific source requested without an Anzo triple store.
    if details.mustrd_triple_store["type"] != TRIPLESTORE.Anzo:
        raise ValueError(f"You must define {TRIPLESTORE.Anzo} to use {MUST.AnzoGraphmartStepSparqlSource}")

    when.value = get_query_from_step(
        triple_store=details.mustrd_triple_store,
        query_step_uri=details.spec_graph.value(subject=node, predicate=MUST.anzoQueryStep))
    when.queryType = details.spec_graph.value(subject=node, predicate=MUST.queryType)
    return when
483
-
484
@get_spec_component.method((MUST.AnzoGraphmartQueryDrivenTemplatedStepSparqlSource, MUST.when))
def _get_spec_component_AnzoGraphmartQueryDrivenTemplatedStepSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the parameter query and query template of an Anzo templated step.

    Raises:
        ValueError: the configured triple store is not Anzo.
    """
    details = spec_component_details
    node = details.spec_component_node
    triple_store = details.mustrd_triple_store
    when = init_spec_component(details.predicate, triple_store["type"])

    # Anzo-specific source requested without an Anzo triple store.
    if triple_store["type"] != TRIPLESTORE.Anzo:
        raise ValueError(f"You must define {TRIPLESTORE.Anzo} to use {MUST.AnzoGraphmartQueryDrivenTemplatedStepSparqlSource}")

    queries = get_queries_from_templated_step(
        triple_store=triple_store,
        query_step_uri=details.spec_graph.value(subject=node, predicate=MUST.anzoQueryStep))
    when.paramQuery = queries["param_query"]
    when.queryTemplate = queries["query_template"]
    when.queryType = details.spec_graph.value(subject=node, predicate=MUST.queryType)
    return when
503
-
504
@get_spec_component.method((MUST.AnzoGraphmartLayerSparqlSource, MUST.when))
def _get_spec_component_AnzoGraphmartLayerSparqlSource(spec_component_details: SpecComponentDetails) -> list:
    """Build one when component per query returned for an Anzo graphmart layer.

    Entries with a "query" get the MUST.queryType declared on the node; the
    others are typed MUST.AnzoQueryDrivenUpdateSparql.

    Raises:
        ValueError: the configured triple store is not Anzo.
    """
    details = spec_component_details
    node = details.spec_component_node
    triple_store = details.mustrd_triple_store

    # Anzo-specific source requested without an Anzo triple store.
    if triple_store["type"] != TRIPLESTORE.Anzo:
        raise ValueError("This test specification is specific to Anzo and can only be run against that platform.")

    queries = get_queries_for_layer(
        triple_store=triple_store,
        graphmart_layer_uri=details.spec_graph.value(subject=node,
                                                     predicate=MUST.anzoGraphmartLayer))
    components = []
    for query in queries:
        when = init_spec_component(details.predicate, triple_store["type"])
        when.value = query.get("query")
        when.paramQuery = query.get("param_query")
        when.queryTemplate = query.get("query_template")
        when.queryType = (
            details.spec_graph.value(subject=node, predicate=MUST.queryType)
            if when.value
            else MUST.AnzoQueryDrivenUpdateSparql
        )
        components.append(when)
    return components
528
-
529
@get_spec_component.method(Default)
def _get_spec_component_default(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Fallback for (data source type, predicate) pairs with no loader; always raises."""
    details = spec_component_details
    message = (f"Invalid combination of data source type ({details.data_source_type}) and "
               f"spec component ({details.predicate})")
    raise ValueError(message)
534
-
535
-
536
def init_spec_component(predicate: URIRef, triple_store_type: URIRef = None ) -> GivenSpec | WhenSpec | ThenSpec | TableThenSpec:
    """Create an empty component matching the clause predicate.

    MUST.when yields an AnzoWhenSpec when triple_store_type is
    TRIPLESTORE.Anzo and a plain WhenSpec otherwise; an unrecognised
    predicate yields a bare SpecComponent.
    """
    if predicate == MUST.given:
        return GivenSpec()
    if predicate == MUST.when:
        return AnzoWhenSpec() if triple_store_type == TRIPLESTORE.Anzo else WhenSpec()
    if predicate == MUST.then:
        return ThenSpec()
    return SpecComponent()
549
-
550
-
551
def get_spec_component_nodes(subject: URIRef, predicate: URIRef, spec_graph: Graph) -> List[Node]:
    """Return every object of (subject, predicate) in the spec graph.

    Raises:
        ValueError: the spec has no node for this predicate.
    """
    spec_component_nodes = list(spec_graph.objects(subject=subject, predicate=predicate))
    # It shouldn't even be possible to get this far, as a missing node indicates
    # an invalid RDF file. The list is never None, so test for emptiness.
    if not spec_component_nodes:
        raise ValueError(f"specComponent Node empty for {subject} {predicate}")
    return spec_component_nodes
559
-
560
-
561
def get_spec_component_from_file(path: Path) -> str:
    """Return the text content of the file at path.

    The file is decoded as UTF-8 rather than with the platform default
    encoding, so the result does not depend on the machine's locale.

    Raises:
        ValueError: path is a directory.
        FileNotFoundError: path does not exist.
    """
    if path.is_dir():
        raise ValueError(f"Path {path} is a directory, expected a file")
    return path.read_text(encoding="utf-8")
573
-
574
-
575
def get_spec_from_statements(subject: URIRef,
                             predicate: URIRef,
                             spec_graph: Graph) -> Graph:
    """Rebuild the triples described by the reified statements of a component.

    A CONSTRUCT query collects rdf:subject / rdf:predicate / rdf:object of
    every rdf:Statement attached through MUST.hasStatement. Note: the result
    is returned serialized as Turtle ("ttl"), despite the annotation.
    """
    statements_query = f"""
    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

    CONSTRUCT {{ ?s ?p ?o }}
    {{
            <{subject}> <{predicate}> [
                a <{MUST.StatementsDataset}> ;
                <{MUST.hasStatement}> [
                    a rdf:Statement ;
                    rdf:subject ?s ;
                    rdf:predicate ?p ;
                    rdf:object ?o ;
                ] ;
            ]

    }}
    """
    constructed = spec_graph.query(statements_query).graph
    return constructed.serialize(format="ttl")
597
-
598
-
599
def get_spec_from_table(subject: URIRef,
                        predicate: URIRef,
                        spec_graph: Graph) -> pandas.DataFrame:
    """Turn the rows declared on a MUST.TableDataset into a DataFrame.

    Each bound variable yields two columns: the value as a string, and
    "<variable>_datatype" holding the literal's datatype (xsd:string when
    none is set) or xsd:anyURI for non-literals. Rows are sorted by their
    optional sh:order, the index is renumbered and missing cells become ''.
    """
    # Query the spec for the expected result to convert into a dataframe.
    then_query = f"""
    prefix sh: <http://www.w3.org/ns/shacl#>
    SELECT ?row ?variable ?binding ?order
    WHERE {{
         <{subject}> <{predicate}> [
                a <{MUST.TableDataset}> ;
                <{MUST.hasRow}> ?row ].
                    ?row <{MUST.hasBinding}> [
                        <{MUST.variable}> ?variable ;
                        <{MUST.boundValue}> ?binding ; ] .
                    OPTIONAL {{ ?row sh:order ?order . }}
                    .}}
     ORDER BY ?order"""

    expected_results = spec_graph.query(then_query)

    # Unique row ids become the (temporary) index of the dataframe.
    index = {str(result.row) for result in expected_results}
    # Unique variables, plus their datatype companions, become the columns.
    columns = set()
    for result in expected_results:
        variable_name = result.variable.value
        columns.update((variable_name, variable_name + "_datatype"))
    # Extra column carrying the sort order (if any) of the results.
    columns.add("order")

    df = pandas.DataFrame(index=list(index), columns=list(columns))
    for result in expected_results:
        row_id = str(result.row)
        variable_name = result.variable.value
        bound = result.binding
        df.loc[row_id, variable_name] = str(bound)
        df.loc[row_id, "order"] = result.order
        if type(bound) is Literal:
            if hasattr(bound, "datatype") and bound.datatype:
                datatype = str(bound.datatype)
            else:
                datatype = str(XSD.string)
        else:
            datatype = str(XSD.anyURI)
        df.loc[row_id, variable_name + "_datatype"] = datatype

    # Sort on the order column, then discard it together with the row-id index.
    df.sort_values(by="order", inplace=True)
    df.drop(columns="order", inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.fillna('', inplace=True)
    return df
647
-
648
-
649
def get_when_bindings(subject: URIRef,
                      spec_graph: Graph) -> dict:
    """Collect the variable bindings declared on a MUST.TextSparqlSource.

    Returns:
        A dict mapping Variable(name) to the bound value; empty when the
        spec declares no bindings.
    """
    when_bindings_query = f"""SELECT ?variable ?binding {{ <{subject}> <{MUST.when}> [ a <{MUST.TextSparqlSource}> ; <{MUST.hasBinding}> [ <{MUST.variable}> ?variable ; <{MUST.boundValue}> ?binding ; ] ; ] ;}}"""
    when_bindings = spec_graph.query(when_bindings_query)

    if len(when_bindings.bindings) == 0:
        return {}
    return {Variable(found.variable.value): found.binding for found in when_bindings}
661
-
662
-
663
def is_then_select_ordered(subject: URIRef, predicate: URIRef, spec_graph: Graph) -> bool:
    """Tell whether the expected table declares an order for all of its rows.

    The ASK query counts all bindings of the MUST.TableDataset and the
    bindings that sit on a row carrying sh:order; the answer is true when
    both counts are equal.
    """
    # The sh prefix is declared explicitly (as in get_spec_from_table) so the
    # query does not rely on the graph happening to have it bound already.
    ask_select_ordered = f"""
    prefix sh: <http://www.w3.org/ns/shacl#>
    ASK {{
    {{SELECT (count(?binding) as ?totalBindings) {{
        <{subject}> <{predicate}> [
                a <{MUST.TableDataset}> ;
                <{MUST.hasRow}> [ <{MUST.hasBinding}> [
                                    <{MUST.variable}> ?variable ;
                                    <{MUST.boundValue}> ?binding ;
                            ] ;
              ]
            ]
    }} }}
    {{SELECT (count(?binding) as ?orderedBindings) {{
        <{subject}> <{predicate}> [
                a <{MUST.TableDataset}> ;
                <{MUST.hasRow}> [ sh:order ?order ;
                            <{MUST.hasBinding}> [
                                    <{MUST.variable}> ?variable ;
                                    <{MUST.boundValue}> ?binding ;
                            ] ;
              ]
            ]
    }} }}
    FILTER(?totalBindings = ?orderedBindings)
    }}"""
    is_ordered = spec_graph.query(ask_select_ordered)
    return is_ordered.askAnswer
1
+ """
2
+ MIT License
3
+
4
+ Copyright (c) 2023 Semantic Partners Ltd
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+ """
24
+
25
+ import os
26
+ from dataclasses import dataclass, field
27
+ from pathlib import Path
28
+ from typing import Tuple, List, Type
29
+
30
+ import pandas
31
+ import requests
32
+ from rdflib import RDF, Graph, URIRef, Variable, Literal, XSD, util
33
+ from rdflib.exceptions import ParserError
34
+ from rdflib.term import Node
35
+ import logging
36
+
37
+ from . import logger_setup
38
+ from .mustrdAnzo import get_queries_for_layer, get_queries_from_templated_step, get_spec_component_from_graphmart, get_query_from_querybuilder, get_query_from_step
39
+ from .namespace import MUST, TRIPLESTORE
40
+ from multimethods import MultiMethod, Default
41
+ from .utils import get_mustrd_root
42
+
43
# Module-level logger, obtained from the package's logger_setup helper.
log = logger_setup.setup_logger(__name__)
44
+
45
+
46
@dataclass
class SpecComponent:
    """Base type shared by the given / when / then parts of a specification."""
    pass
49
+
50
+
51
@dataclass
class GivenSpec(SpecComponent):
    """The "given" part of a specification."""
    # Input data as an RDF graph; stays None until a loader assigns it.
    value: Graph = None
54
+
55
+
56
@dataclass
class WhenSpec(SpecComponent):
    """The "when" part of a specification: the query under test."""
    # Query text (read from the spec graph, a file, or a triple store helper).
    value: str = None
    # Value of MUST.queryType read from the spec graph.
    queryType: URIRef = None
    # Variable -> bound value mapping, filled for TextSparqlSource components.
    bindings: dict = None
61
+
62
@dataclass
class AnzoWhenSpec(WhenSpec):
    """A "when" component created when the triple store type is Anzo."""
    # "param_query" entry returned by the Anzo templated-step / layer helpers.
    paramQuery: str = None
    # "query_template" entry returned by the same helpers.
    queryTemplate: str = None
66
+
67
@dataclass
class ThenSpec(SpecComponent):
    """The "then" part of a specification: the expected result as an RDF graph.

    Every instance gets its own empty ``Graph`` by default. A plain
    ``Graph()`` default is evaluated once, when the class is defined, and
    would be shared by all instances, so adding triples to one expectation
    would silently change every other one.
    """
    value: Graph = field(default_factory=Graph)
    # Set from is_then_select_ordered() for table expectations; False otherwise.
    ordered: bool = False
71
+
72
+
73
@dataclass
class TableThenSpec(ThenSpec):
    """A "then" component whose expected result is tabular."""
    # Expected rows as a DataFrame; a fresh empty frame per instance by default.
    value: pandas.DataFrame = field(default_factory=pandas.DataFrame)
76
+
77
+
78
@dataclass
class SpecComponentDetails:
    """Everything a get_spec_component handler needs to build one component."""
    # Specification being parsed.
    subject: URIRef
    # Which part is being parsed (compared against MUST.given / when / then).
    predicate: URIRef
    # Graph holding the specification.
    spec_graph: Graph
    # Triple store configuration; handlers read its "type" entry.
    mustrd_triple_store: dict
    # Object of (subject, predicate): the node describing the data source.
    spec_component_node: Node
    # One rdf:type of spec_component_node; used as part of the dispatch key.
    data_source_type: Node
    # Run configuration; may hold folder overrides such as 'given_path'.
    run_config: dict
    # Candidate root folders, in priority order, for resolving relative files.
    root_paths: list
88
+
89
def get_path(path_type: str, file_name, spec_component_details: SpecComponentDetails) -> Path:
    """Resolve *file_name* to an existing file.

    When the run configuration has an entry for *path_type* (for example
    ``'given_path'``), the file name is joined onto that folder first;
    otherwise it is used as is. The result is then looked up under the
    component's root paths.
    """
    run_config = spec_component_details.run_config
    relative_path = (
        os.path.join(run_config[path_type], file_name)
        if path_type in run_config
        else file_name
    )
    return get_file_absolute_path(spec_component_details, relative_path)
95
+
96
+
97
def parse_spec_component(subject: URIRef,
                         predicate: URIRef,
                         spec_graph: Graph,
                         run_config: dict,
                         mustrd_triple_store: dict) -> GivenSpec | WhenSpec | ThenSpec | TableThenSpec:
    """Parse one part (given / when / then) of the specification *subject*.

    Each node attached to *subject* through *predicate* is parsed once per
    rdf:type it declares. Handlers may return a single component or a list of
    them; everything is gathered into one list and handed to ``combine_specs``.
    """
    collected = []
    for component_node in get_spec_component_nodes(subject, predicate, spec_graph):
        for source_type in get_data_source_types(subject, predicate, spec_graph, component_node):
            details = SpecComponentDetails(
                subject=subject,
                predicate=predicate,
                spec_graph=spec_graph,
                mustrd_triple_store=mustrd_triple_store,
                spec_component_node=component_node,
                data_source_type=source_type,
                run_config=run_config,
                root_paths=get_components_roots(spec_graph, subject, run_config))
            parsed = get_spec_component(details)
            # A handler can yield several components at once (returned as a list).
            if type(parsed) == list:
                collected.extend(parsed)
            else:
                collected.append(parsed)

    # Merge multiple graphs into one; table expectations cannot be merged.
    return combine_specs(collected)
129
+
130
# Collect every candidate root folder for a specification component.
# The order of the returned list is the order of priority:
# 1) the folder the spec file itself was loaded from
# 2) spec_path from the mustrd test configuration or command line
# 3) data_path from the mustrd test configuration or command line
# 4) the mustrd source folder, for default resources packaged with mustrd
#    (inside the venv when mustrd is used as a library)
# Absolute file paths are intentionally not tried; feel free to argue that
# they should be.
def get_components_roots(spec_graph: Graph, subject: URIRef, run_config: dict):
    """Return the candidate root folders for *subject*, highest priority first.

    The spec's own folder is taken from its MUST.specSourceFile value; when
    that value is missing an error is logged and the entry is skipped.
    ``spec_path`` and ``data_path`` are added only when present in
    *run_config* (which may be empty or None). Both are wrapped in ``Path``
    so the configured roots have a consistent type.
    """
    where_did_i_load_this_spec_from = spec_graph.value(subject=subject,
                                                       predicate=MUST.specSourceFile)
    roots = []
    if where_did_i_load_this_spec_from is None:
        log.error(f"{where_did_i_load_this_spec_from=} was None for test_spec={subject}, we didn't set the test specifications specSourceFile when loading, spec_graph={spec_graph}")
    else:
        roots.append(Path(os.path.dirname(where_did_i_load_this_spec_from)))
    if run_config and 'spec_path' in run_config:
        roots.append(Path(run_config['spec_path']))
    if run_config and 'data_path' in run_config:
        roots.append(Path(run_config['data_path']))
    roots.append(get_mustrd_root())

    return roots
152
+
153
+
154
# Try the relative path under each candidate root and return the first hit.
def get_file_absolute_path(spec_component_details: SpecComponentDetails, relative_file_path: str):
    """Return the first existing path built from the component's root paths.

    Raises:
        ValueError: if *relative_file_path* is empty or None.
        FileNotFoundError: if the file exists under none of the roots.
    """
    if not relative_file_path:
        raise ValueError("Cannot get absolute path of None")
    # The name is echoed by the f-string below, so it must stay as it is.
    absolute_file_paths = [
        Path(os.path.join(root_path, relative_file_path))
        for root_path in spec_component_details.root_paths
    ]
    for candidate in absolute_file_paths:
        if os.path.exists(candidate):
            return candidate
    raise FileNotFoundError(f"Could not find file {relative_file_path=} in any of the {absolute_file_paths=}")
163
+
164
def get_spec_component_type(spec_components: List[SpecComponent]) -> Type[SpecComponent]:
    """Return the single concrete type shared by all *spec_components*.

    Types are compared exactly (subclasses count as different), because the
    result is used as a dispatch key.

    Raises:
        ValueError: if the list is empty, or if the components do not all
            have the same type.
    """
    if not spec_components:
        # Previously this surfaced as a bare IndexError with no explanation.
        raise ValueError("Cannot determine the type of an empty list of spec components")

    spec_type = type(spec_components[0])
    if any(type(spec_component) != spec_type for spec_component in spec_components[1:]):
        raise ValueError("All spec components must be of the same type")
    return spec_type
175
+
176
+
177
def combine_specs_dispatch(spec_components: List[SpecComponent]) -> Type[SpecComponent]:
    """Dispatch key for ``combine_specs``: the common type of the components."""
    return get_spec_component_type(spec_components)


# Multimethod that merges the parsed components of one spec part; the
# implementation is selected by the components' shared type.
combine_specs = MultiMethod("combine_specs", combine_specs_dispatch)
183
+
184
+
185
@combine_specs.method(GivenSpec)
def _combine_given_specs(spec_components: List[GivenSpec]) -> GivenSpec:
    """Merge several "given" components into one by adding their graphs together.

    A single component is returned unchanged.
    """
    if len(spec_components) == 1:
        return spec_components[0]

    merged = Graph()
    for component in spec_components:
        merged += component.value
    return GivenSpec(value=merged)
196
+
197
+
198
@combine_specs.method(WhenSpec)
def _combine_when_specs(spec_components: List[WhenSpec]) -> List[WhenSpec]:
    """Return the "when" components as they are.

    Unlike the given/then variants nothing is merged: the list itself is
    returned, which is why the return annotation is a list.
    """
    return spec_components
201
+
202
+
203
@combine_specs.method(ThenSpec)
def _combine_then_specs(spec_components: List[ThenSpec]) -> ThenSpec:
    """Merge several graph-based "then" components by adding their graphs together.

    A single component is returned unchanged.
    """
    if len(spec_components) == 1:
        return spec_components[0]

    merged = Graph()
    for component in spec_components:
        merged += component.value
    return ThenSpec(value=merged)
214
+
215
+
216
@combine_specs.method(TableThenSpec)
def _combine_table_then_specs(spec_components: List[TableThenSpec]) -> TableThenSpec:
    """Return the only table expectation; merging several tables is not supported.

    Raises:
        ValueError: if the list does not hold exactly one component.
    """
    if len(spec_components) == 1:
        return spec_components[0]
    raise ValueError("Parsing of multiple components of MUST.then for tables not implemented")
221
+
222
+
223
@combine_specs.method(Default)
def _combine_specs_default(spec_components: List[SpecComponent]):
    """Fallback for component types that have no combining rule: always raises ValueError."""
    message = f"Parsing of multiple components of this type not implemented {spec_components}"
    raise ValueError(message)
226
+
227
def get_data_source_types(subject: URIRef, predicate: URIRef, spec_graph: Graph, source_node: Node) -> List[Node]:
    """Return every rdf:type declared on *source_node*.

    *subject* and *predicate* are only used in the error message.

    Raises:
        ValueError: if the node declares no rdf:type at all.
    """
    data_source_types = list(spec_graph.objects(subject=source_node, predicate=RDF.type))
    if not data_source_types:
        raise ValueError(f"Node has no rdf type {subject} {predicate}")
    return data_source_types
235
+
236
# https://github.com/Semantic-partners/mustrd/issues/99
def get_spec_component_dispatch(spec_component_details: SpecComponentDetails) -> Tuple[Node, URIRef]:
    """Dispatch key for ``get_spec_component``: (data source type, predicate)."""
    details = spec_component_details
    return details.data_source_type, details.predicate


# Multimethod that builds a spec component; the handler is selected by the
# (data source type, predicate) pair of the component being parsed.
get_spec_component = MultiMethod("get_spec_component", get_spec_component_dispatch)
242
+
243
+
244
@get_spec_component.method((MUST.InheritedDataset, MUST.given))
def _get_spec_component_inheritedstate_given(spec_component_details: SpecComponentDetails) -> GivenSpec:
    """Build the component for an InheritedDataset "given": a fresh component with no data loaded."""
    return init_spec_component(spec_component_details.predicate)
248
+
249
+
250
@get_spec_component.method((MUST.FolderDataset, MUST.given))
def _get_spec_component_folderdatasource_given(spec_component_details: SpecComponentDetails) -> GivenSpec:
    """Load a "given" graph from the file named by MUST.fileName.

    The file is looked up through ``get_path`` with the ``'given_path'``
    configuration key and parsed into a new Graph.

    Raises:
        ValueError: if the file content cannot be parsed.
    """
    details = spec_component_details
    spec_component = init_spec_component(details.predicate)

    file_name = details.spec_graph.value(subject=details.spec_component_node,
                                         predicate=MUST.fileName)
    path = get_path('given_path', file_name, details)
    try:
        spec_component.value = Graph().parse(data=get_spec_component_from_file(path))
    except ParserError as e:
        problem = f"Problem parsing {path}, error of type {type(e)}"
        log.error(problem)
        raise ValueError(problem)
    return spec_component
264
+
265
+
266
@get_spec_component.method((MUST.FolderSparqlSource, MUST.when))
def _get_spec_component_foldersparqlsource_when(spec_component_details: SpecComponentDetails) -> GivenSpec:
    """Load the "when" query text from the file named by MUST.fileName.

    The file is looked up through ``get_path`` with the ``'when_path'``
    configuration key; the query type is read from MUST.queryType.
    """
    details = spec_component_details
    node = details.spec_component_node
    spec_component = init_spec_component(details.predicate)

    file_name = details.spec_graph.value(subject=node, predicate=MUST.fileName)
    query_file = get_path('when_path', file_name, details)
    spec_component.value = get_spec_component_from_file(query_file)
    spec_component.queryType = details.spec_graph.value(subject=node, predicate=MUST.queryType)
    return spec_component
278
+
279
+
280
@get_spec_component.method((MUST.FolderDataset, MUST.then))
def _get_spec_component_folderdatasource_then(spec_component_details: SpecComponentDetails) -> ThenSpec:
    """Load the expected result from the file named by MUST.fileName.

    The file is looked up through ``get_path`` with the ``'then_path'``
    configuration key and loaded with ``load_dataset_from_file``.
    """
    details = spec_component_details
    spec_component = init_spec_component(details.predicate)

    file_name = details.spec_graph.value(subject=details.spec_component_node,
                                         predicate=MUST.fileName)
    expected_file = get_path('then_path', file_name, details)
    return load_dataset_from_file(expected_file, spec_component)
289
+
290
@get_spec_component.method((MUST.FileDataset, MUST.given))
@get_spec_component.method((MUST.FileDataset, MUST.then))
def _get_spec_component_filedatasource(spec_component_details: SpecComponentDetails) -> GivenSpec:
    """Load a "given" or "then" dataset from the file referenced by MUST.file."""
    fresh_component = init_spec_component(spec_component_details.predicate)
    return load_spec_component(spec_component_details, fresh_component)
295
+
296
def load_spec_component(spec_component_details, spec_component):
    """Load the dataset referenced by MUST.file into *spec_component*.

    The MUST.file value is resolved against the component's root paths and
    the resulting file is loaded with ``load_dataset_from_file``.
    """
    declared_file = spec_component_details.spec_graph.value(
        subject=spec_component_details.spec_component_node,
        predicate=MUST.file)
    file_path = Path(str(declared_file))
    resolved_file = get_file_absolute_path(spec_component_details, file_path)
    return load_dataset_from_file(resolved_file, spec_component)
300
+
301
+
302
def load_dataset_from_file(path: Path, spec_component: ThenSpec) -> ThenSpec:
    """Load a dataset file as either a table or an RDF graph.

    ``.csv``, ``.xlsx`` and ``.xls`` files are read with pandas and returned
    in a new ``TableThenSpec`` (the passed *spec_component* is not used in
    that case). Any other file is parsed as RDF, in the format guessed from
    its name, and stored in ``spec_component.value``.

    Raises:
        ValueError: if *path* is a directory, if no RDF format can be guessed
            for the file, or if the file cannot be parsed.
    """
    if path.is_dir():
        raise ValueError(f"Path {path} is a directory, expected a file")

    # https://github.com/Semantic-partners/mustrd/issues/94
    if path.suffix in {".csv", ".xlsx", ".xls"}:
        df = pandas.read_csv(path) if path.suffix == ".csv" else pandas.read_excel(path)
        then_spec = TableThenSpec()
        then_spec.value = df
        return then_spec

    try:
        file_format = util.guess_format(str(path))
    except AttributeError:
        raise ValueError(f"Unsupported file format: {path.suffix}")

    if file_format is None:
        # Without this check the function fell through and returned None,
        # which only failed later with an unrelated error message.
        raise ValueError(f"Unsupported file format: {path.suffix}")

    g = Graph()
    try:
        g.parse(data=get_spec_component_from_file(path), format=file_format)
    except ParserError as e:
        log.error(f"Problem parsing {path}, error of type {type(e)}")
        raise ValueError(f"Problem parsing {path}, error of type {type(e)}") from e
    spec_component.value = g
    return spec_component
327
+
328
+
329
+
330
@get_spec_component.method((MUST.FileSparqlSource, MUST.when))
def _get_spec_component_filedatasource_when(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the "when" query text from the file referenced by MUST.file.

    The file is resolved against the component's root paths; the query type
    is read from MUST.queryType.
    """
    details = spec_component_details
    node = details.spec_component_node
    spec_component = init_spec_component(details.predicate)

    file_path = Path(str(details.spec_graph.value(subject=node, predicate=MUST.file)))
    query_file = get_file_absolute_path(details, file_path)
    spec_component.value = get_spec_component_from_file(query_file)

    spec_component.queryType = details.spec_graph.value(subject=node, predicate=MUST.queryType)
    return spec_component
342
+
343
+
344
+ # @get_spec_component.method((MUST.FileDataset, MUST.then))
345
+ # def _get_spec_component_filedatasource_then(spec_component_details: SpecComponentDetails) -> SpecComponent:
346
+ # spec_component = init_spec_component(spec_component_details.predicate)
347
+
348
+ # file_path = Path(str(spec_component_details.spec_graph.value(subject=spec_component_details.spec_component_node,
349
+ # predicate=MUST.file)))
350
+ # if str(file_path).startswith("/"): # absolute path
351
+ # path = file_path
352
+ # else: #relative path
353
+ # path = Path(os.path.join(spec_component_details.run_config['spec_path'], file_path))
354
+ # return get_then_from_file(path, spec_component)
355
+
356
+
357
@get_spec_component.method((MUST.TextSparqlSource, MUST.when))
def _get_spec_component_TextSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Build a "when" component whose query text is written inline in the spec.

    The text comes from MUST.queryText, the bindings from
    ``get_when_bindings`` and the query type from MUST.queryType.
    """
    details = spec_component_details
    node = details.spec_component_node
    spec_component = init_spec_component(details.predicate)

    # The query is stored directly in the specification as a text string.
    query_text = details.spec_graph.value(subject=node, predicate=MUST.queryText)
    spec_component.value = str(query_text)

    spec_component.bindings = get_when_bindings(details.subject, details.spec_graph)
    spec_component.queryType = details.spec_graph.value(subject=node, predicate=MUST.queryType)
    return spec_component
370
+
371
+
372
# https://github.com/Semantic-partners/mustrd/issues/98
@get_spec_component.method((MUST.HttpDataset, MUST.given))
@get_spec_component.method((MUST.HttpDataset, MUST.when))
@get_spec_component.method((MUST.HttpDataset, MUST.then))
def _get_spec_component_HttpDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Fetch a component's content over HTTP from the MUST.dataSourceUrl address.

    The body of the GET response is stored in ``value``; the query type is
    read from MUST.queryType.
    """
    details = spec_component_details
    node = details.spec_component_node
    spec_component = init_spec_component(details.predicate)

    # Fetch the component with an HTTP GET. The original code applied
    # ``.content`` to the URL string instead of the response (misplaced
    # parenthesis), which raised AttributeError on every call.
    data_source_url = str(details.spec_graph.value(subject=node, predicate=MUST.dataSourceUrl))
    spec_component.value = requests.get(data_source_url).content
    spec_component.queryType = details.spec_graph.value(subject=node, predicate=MUST.queryType)
    return spec_component
386
+
387
+
388
@get_spec_component.method((MUST.TableDataset, MUST.then))
def _get_spec_component_TableDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Build a table expectation from the rows described in the spec graph itself."""
    details = spec_component_details
    table_then = TableThenSpec()
    # Read the expected table from the ttl description of its rows.
    table_then.value = get_spec_from_table(details.subject, details.predicate, details.spec_graph)
    table_then.ordered = is_then_select_ordered(details.subject, details.predicate, details.spec_graph)
    return table_then
397
+
398
+
399
@get_spec_component.method((MUST.EmptyTable, MUST.then))
def _get_spec_component_EmptyTable(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Build the expectation for an empty table: a TableThenSpec with its default value."""
    return TableThenSpec()
403
+
404
+
405
@get_spec_component.method((MUST.EmptyGraph, MUST.then))
def _get_spec_component_EmptyGraph(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Build the expectation for an empty graph: a fresh component with its default value."""
    return init_spec_component(spec_component_details.predicate)
410
+
411
+
412
@get_spec_component.method((MUST.StatementsDataset, MUST.given))
@get_spec_component.method((MUST.StatementsDataset, MUST.then))
def _get_spec_component_StatementsDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Build a graph component from the rdf:Statement nodes listed in the spec."""
    details = spec_component_details
    spec_component = init_spec_component(details.predicate)

    serialized_statements = get_spec_from_statements(details.subject, details.predicate,
                                                     details.spec_graph)
    spec_component.value = Graph().parse(data=serialized_statements)
    return spec_component
421
+
422
+
423
@get_spec_component.method((MUST.AnzoGraphmartDataset, MUST.given))
@get_spec_component.method((MUST.AnzoGraphmartDataset, MUST.then))
def _get_spec_component_AnzoGraphmartDataset(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load a "given" or "then" dataset from an Anzo graphmart layer.

    Raises:
        ValueError: if the configured triple store is not Anzo.
    """
    details = spec_component_details
    spec_component = init_spec_component(details.predicate)

    if details.mustrd_triple_store["type"] != TRIPLESTORE.Anzo:
        raise ValueError(f"You must define {TRIPLESTORE.Anzo} to use {MUST.AnzoGraphmartDataset}")

    # Get GIVEN or THEN from the anzo graphmart.
    node = details.spec_component_node
    graphmart = details.spec_graph.value(subject=node, predicate=MUST.graphmart)
    layer = details.spec_graph.value(subject=node, predicate=MUST.layer)
    spec_component.value = get_spec_component_from_graphmart(
        triple_store=details.mustrd_triple_store,
        graphmart=graphmart,
        layer=layer)
    return spec_component
442
+
443
+
444
@get_spec_component.method((MUST.AnzoQueryBuilderSparqlSource, MUST.when))
def _get_spec_component_AnzoQueryBuilderSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the "when" query from the Anzo query builder.

    The query is identified by MUST.queryFolder and MUST.queryName.

    Raises:
        ValueError: if the configured triple store is not Anzo.
    """
    details = spec_component_details
    node = details.spec_component_node
    spec_component = init_spec_component(details.predicate)

    # An Anzo-specific source was requested but no Anzo store is configured.
    if details.mustrd_triple_store["type"] != TRIPLESTORE.Anzo:
        raise ValueError(f"You must define {TRIPLESTORE.Anzo} to use {MUST.AnzoQueryBuilderSparqlSource}")

    # Get the WHEN component from the query builder.
    query_folder = details.spec_graph.value(subject=node, predicate=MUST.queryFolder)
    query_name = details.spec_graph.value(subject=node, predicate=MUST.queryName)
    spec_component.value = get_query_from_querybuilder(triple_store=details.mustrd_triple_store,
                                                       folder_name=query_folder,
                                                       query_name=query_name)

    spec_component.queryType = details.spec_graph.value(subject=node, predicate=MUST.queryType)
    return spec_component
464
+
465
+
466
@get_spec_component.method((MUST.AnzoGraphmartStepSparqlSource, MUST.when))
def _get_spec_component_AnzoGraphmartStepSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the "when" query from an Anzo graphmart query step (MUST.anzoQueryStep).

    Raises:
        ValueError: if the configured triple store is not Anzo.
    """
    details = spec_component_details
    node = details.spec_component_node
    spec_component = init_spec_component(details.predicate)

    # An Anzo-specific source was requested but no Anzo store is configured.
    if details.mustrd_triple_store["type"] != TRIPLESTORE.Anzo:
        raise ValueError(f"You must define {TRIPLESTORE.Anzo} to use {MUST.AnzoGraphmartStepSparqlSource}")

    # Get the WHEN component from the query step.
    query_step_uri = details.spec_graph.value(subject=node, predicate=MUST.anzoQueryStep)
    spec_component.value = get_query_from_step(triple_store=details.mustrd_triple_store,
                                               query_step_uri=query_step_uri)

    spec_component.queryType = details.spec_graph.value(subject=node, predicate=MUST.queryType)
    return spec_component
483
+
484
@get_spec_component.method((MUST.AnzoGraphmartQueryDrivenTemplatedStepSparqlSource, MUST.when))
def _get_spec_component_AnzoGraphmartQueryDrivenTemplatedStepSparqlSource(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Load the parameter query and query template of an Anzo templated step.

    The step is identified by MUST.anzoQueryStep; the helper's
    ``"param_query"`` and ``"query_template"`` entries are stored on the
    component.

    Raises:
        ValueError: if the configured triple store is not Anzo.
    """
    details = spec_component_details
    node = details.spec_component_node
    store_type = details.mustrd_triple_store["type"]
    spec_component = init_spec_component(details.predicate, store_type)

    # An Anzo-specific source was requested but no Anzo store is configured.
    if details.mustrd_triple_store["type"] != TRIPLESTORE.Anzo:
        raise ValueError(f"You must define {TRIPLESTORE.Anzo} to use {MUST.AnzoGraphmartQueryDrivenTemplatedStepSparqlSource}")

    # Get the WHEN component from the templated query step.
    query_step_uri = details.spec_graph.value(subject=node, predicate=MUST.anzoQueryStep)
    queries = get_queries_from_templated_step(triple_store=details.mustrd_triple_store,
                                              query_step_uri=query_step_uri)
    spec_component.paramQuery = queries["param_query"]
    spec_component.queryTemplate = queries["query_template"]

    spec_component.queryType = details.spec_graph.value(subject=node, predicate=MUST.queryType)
    return spec_component
503
+
504
@get_spec_component.method((MUST.AnzoGraphmartLayerSparqlSource, MUST.when))
def _get_spec_component_AnzoGraphmartLayerSparqlSource(spec_component_details: SpecComponentDetails) -> list:
    """Build one "when" component per query of an Anzo graphmart layer.

    The layer is identified by MUST.anzoGraphmartLayer. A query entry that
    has a ``"query"`` value takes its query type from MUST.queryType; an
    entry without one is typed as MUST.AnzoQueryDrivenUpdateSparql.

    Raises:
        ValueError: if the configured triple store is not Anzo.
    """
    details = spec_component_details
    node = details.spec_component_node

    # An Anzo-specific source was requested but no Anzo store is configured.
    if details.mustrd_triple_store["type"] != TRIPLESTORE.Anzo:
        raise ValueError("This test specification is specific to Anzo and can only be run against that platform.")

    # Get the ordered WHEN components: the transform and query driven
    # template queries of the layer.
    graphmart_layer_uri = details.spec_graph.value(subject=node, predicate=MUST.anzoGraphmartLayer)
    queries = get_queries_for_layer(triple_store=details.mustrd_triple_store,
                                    graphmart_layer_uri=graphmart_layer_uri)

    spec_components = []
    for query in queries:
        spec_component = init_spec_component(details.predicate, details.mustrd_triple_store["type"])
        spec_component.value = query.get("query")
        spec_component.paramQuery = query.get("param_query")
        spec_component.queryTemplate = query.get("query_template")
        if spec_component.value:
            spec_component.queryType = details.spec_graph.value(subject=node, predicate=MUST.queryType)
        else:
            spec_component.queryType = MUST.AnzoQueryDrivenUpdateSparql
        spec_components.append(spec_component)
    return spec_components
528
+
529
@get_spec_component.method(Default)
def _get_spec_component_default(spec_component_details: SpecComponentDetails) -> SpecComponent:
    """Fallback for (data source type, predicate) pairs with no handler: always raises ValueError."""
    details = spec_component_details
    message = (
        f"Invalid combination of data source type ({details.data_source_type}) and "
        f"spec component ({details.predicate})"
    )
    raise ValueError(message)
534
+
535
+
536
def init_spec_component(predicate: URIRef, triple_store_type: URIRef = None ) -> GivenSpec | WhenSpec | ThenSpec | TableThenSpec:
    """Create an empty component of the kind matching *predicate*.

    MUST.given gives a GivenSpec, MUST.then a ThenSpec, and MUST.when a
    WhenSpec (an AnzoWhenSpec when *triple_store_type* is Anzo). Any other
    predicate gives a bare SpecComponent.
    """
    if predicate == MUST.given:
        return GivenSpec()
    if predicate == MUST.when:
        return AnzoWhenSpec() if triple_store_type == TRIPLESTORE.Anzo else WhenSpec()
    if predicate == MUST.then:
        return ThenSpec()
    return SpecComponent()
549
+
550
+
551
def get_spec_component_nodes(subject: URIRef, predicate: URIRef, spec_graph: Graph) -> List[Node]:
    """Return every node attached to *subject* through *predicate*.

    Raises:
        ValueError: if the specification has no such node.
    """
    spec_component_nodes = list(spec_graph.objects(subject=subject, predicate=predicate))
    # It shouldn't even be possible to get this far, as a missing node
    # indicates an invalid RDF file. The list is never None, so the check has
    # to test for emptiness to ever fire.
    if not spec_component_nodes:
        raise ValueError(f"specComponent Node empty for {subject} {predicate}")
    return spec_component_nodes
559
+
560
+
561
def get_spec_component_from_file(path: Path) -> str:
    """Return the text content of the file at *path*.

    Raises:
        ValueError: if *path* is a directory.
        FileNotFoundError: if the file does not exist.
    """
    if path.is_dir():
        raise ValueError(f"Path {path} is a directory, expected a file")

    # A missing file simply propagates FileNotFoundError to the caller.
    return path.read_text()
573
+
574
+
575
def get_spec_from_statements(subject: URIRef,
                             predicate: URIRef,
                             spec_graph: Graph) -> Graph:
    """Rebuild the triples described by the rdf:Statement nodes of a StatementsDataset.

    A CONSTRUCT query turns each reified statement back into a plain triple;
    the constructed graph is returned serialised with ``format="ttl"``.
    """
    statements_query = f"""
    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

    CONSTRUCT {{ ?s ?p ?o }}
    {{
    <{subject}> <{predicate}> [
    a <{MUST.StatementsDataset}> ;
    <{MUST.hasStatement}> [
    a rdf:Statement ;
    rdf:subject ?s ;
    rdf:predicate ?p ;
    rdf:object ?o ;
    ] ;
    ]

    }}
    """
    constructed = spec_graph.query(statements_query).graph
    return constructed.serialize(format="ttl")
597
+
598
+
599
def get_spec_from_table(subject: URIRef,
                        predicate: URIRef,
                        spec_graph: Graph) -> pandas.DataFrame:
    """Turn the TableDataset rows described in the spec into a DataFrame.

    Each row node becomes one DataFrame row. Every variable gets two columns:
    the bound value as a string, and ``<variable>_datatype`` holding the
    literal's datatype (xsd:string when it has none) or xsd:anyURI for
    non-literal values. Rows are sorted by their optional ``sh:order``; the
    helper ``order`` column is then dropped, the index is renumbered and
    missing cells are filled with empty strings.
    """
    # Query the spec for the expected result that is compared as a dataframe.
    then_query = f"""
    prefix sh: <http://www.w3.org/ns/shacl#>
    SELECT ?row ?variable ?binding ?order
    WHERE {{
    <{subject}> <{predicate}> [
    a <{MUST.TableDataset}> ;
    <{MUST.hasRow}> ?row ].
    ?row <{MUST.hasBinding}> [
    <{MUST.variable}> ?variable ;
    <{MUST.boundValue}> ?binding ; ] .
    OPTIONAL {{ ?row sh:order ?order . }}
    .}}
    ORDER BY ?order"""

    expected_results = spec_graph.query(then_query)

    # Unique row ids form the (temporary) index of the dataframe.
    index = {str(result.row) for result in expected_results}

    # Unique variables, their datatype companions and the sort helper column.
    columns = set()
    for result in expected_results:
        variable_name = result.variable.value
        columns.add(variable_name)
        columns.add(variable_name + "_datatype")
    columns.add("order")

    df = pandas.DataFrame(index=list(index), columns=list(columns))

    for result in expected_results:
        row_id = str(result.row)
        variable_name = result.variable.value
        df.loc[row_id, variable_name] = str(result.binding)
        df.loc[row_id, "order"] = result.order
        if type(result.binding) == Literal:
            if hasattr(result.binding, "datatype") and result.binding.datatype:
                datatype = str(result.binding.datatype)
            else:
                datatype = str(XSD.string)
        else:
            datatype = str(XSD.anyURI)
        df.loc[row_id, variable_name + "_datatype"] = datatype

    # Sort by the declared order, then discard the helper column and row ids.
    df.sort_values(by="order", inplace=True)
    df.drop(columns="order", inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.fillna('', inplace=True)
    return df
647
+
648
+
649
def get_when_bindings(subject: URIRef,
                      spec_graph: Graph) -> dict:
    """Collect the variable bindings declared on a TextSparqlSource "when" of *subject*.

    Returns a dict mapping ``Variable(<variable name>)`` to the bound value,
    or an empty dict when the specification declares no bindings.
    """
    when_bindings_query = f"""SELECT ?variable ?binding {{ <{subject}> <{MUST.when}> [ a <{MUST.TextSparqlSource}> ; <{MUST.hasBinding}> [ <{MUST.variable}> ?variable ; <{MUST.boundValue}> ?binding ; ] ; ] ;}}"""
    when_bindings = spec_graph.query(when_bindings_query)

    if len(when_bindings.bindings) == 0:
        return {}
    return {Variable(found.variable.value): found.binding for found in when_bindings}
661
+
662
+
663
def is_then_select_ordered(subject: URIRef, predicate: URIRef, spec_graph: Graph) -> bool:
    """Tell whether every row binding of the expected table carries an ``sh:order``.

    The ASK query counts all bindings of the TableDataset and, separately,
    the bindings whose row has an ``sh:order``; the answer is true only when
    both counts are equal.
    """
    # NOTE(review): the query uses the sh: prefix without declaring it, so it
    # presumably relies on the prefix being bound on spec_graph - confirm.
    ask_select_ordered = f"""
    ASK {{
    {{SELECT (count(?binding) as ?totalBindings) {{
    <{subject}> <{predicate}> [
    a <{MUST.TableDataset}> ;
    <{MUST.hasRow}> [ <{MUST.hasBinding}> [
    <{MUST.variable}> ?variable ;
    <{MUST.boundValue}> ?binding ;
    ] ;
    ]
    ]
    }} }}
    {{SELECT (count(?binding) as ?orderedBindings) {{
    <{subject}> <{predicate}> [
    a <{MUST.TableDataset}> ;
    <{MUST.hasRow}> [ sh:order ?order ;
    <{MUST.hasBinding}> [
    <{MUST.variable}> ?variable ;
    <{MUST.boundValue}> ?binding ;
    ] ;
    ]
    ]
    }} }}
    FILTER(?totalBindings = ?orderedBindings)
    }}"""
    return spec_graph.query(ask_select_ordered).askAnswer