mustrd 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mustrd/mustrd.py CHANGED
@@ -1,787 +1,787 @@
1
- """
2
- MIT License
3
-
4
- Copyright (c) 2023 Semantic Partners Ltd
5
-
6
- Permission is hereby granted, free of charge, to any person obtaining a copy
7
- of this software and associated documentation files (the "Software"), to deal
8
- in the Software without restriction, including without limitation the rights
9
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
- copies of the Software, and to permit persons to whom the Software is
11
- furnished to do so, subject to the following conditions:
12
-
13
- The above copyright notice and this permission notice shall be included in all
14
- copies or substantial portions of the Software.
15
-
16
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
- SOFTWARE.
23
- """
24
-
25
- import os
26
- from typing import Tuple, List
27
-
28
- import tomli
29
- from rdflib.plugins.parsers.notation3 import BadSyntax
30
-
31
- from . import logger_setup
32
- from dataclasses import dataclass
33
-
34
- from pyparsing import ParseException
35
- from pathlib import Path
36
- from requests import ConnectionError, ConnectTimeout, HTTPError, RequestException
37
-
38
- from rdflib import Graph, URIRef, RDF, XSD, SH, Literal
39
-
40
- from rdflib.compare import isomorphic, graph_diff
41
- import pandas
42
-
43
- from .namespace import MUST, TRIPLESTORE
44
- import requests
45
- import json
46
- from pandas import DataFrame
47
-
48
- from .spec_component import TableThenSpec, parse_spec_component, WhenSpec, ThenSpec
49
- from .utils import is_json,get_mustrd_root
50
- from colorama import Fore, Style
51
- from tabulate import tabulate
52
- from collections import defaultdict
53
- from pyshacl import validate
54
- import logging
55
- from http.client import HTTPConnection
56
- from .steprunner import upload_given, run_when
57
-
58
# Module-level logger for this file.
log = logger_setup.setup_logger(__name__)

# Silence urllib3's TLS warnings and extend the default cipher list so that
# servers with older TLS configurations can still be reached.
# NOTE(review): disabling certificate warnings globally affects all requests
# made by the process — confirm this is intended.
requests.packages.urllib3.disable_warnings()
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ':HIGH:!DH:!aNULL'

logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
64
-
65
-
66
def debug_requests_on():
    """Enable verbose wire-level logging for the requests/urllib3 stack."""
    HTTPConnection.debuglevel = 1

    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)
    urllib3_log = logging.getLogger("requests.packages.urllib3")
    urllib3_log.setLevel(logging.DEBUG)
    urllib3_log.propagate = True
75
-
76
def debug_requests_off():
    """Disable the verbose HTTP logging enabled by debug_requests_on.

    Resets the root logger's level and clears its handlers, which may have
    side-effects on other logging configuration in the process.
    """
    HTTPConnection.debuglevel = 0

    root = logging.getLogger()
    root.setLevel(logging.WARNING)
    root.handlers = []
    urllib3_log = logging.getLogger("requests.packages.urllib3")
    urllib3_log.setLevel(logging.WARNING)
    urllib3_log.propagate = False

# Start with HTTP debugging switched off.
debug_requests_off()
88
-
89
@dataclass
class Specification:
    """A fully-resolved test spec: given (initial state), when (action), then (expectation)."""
    spec_uri: URIRef
    triple_store: dict
    given: Graph
    when: WhenSpec
    then: ThenSpec
    spec_file_name: str = "default.mustrd.ttl"


@dataclass
class GraphComparison:
    """Three-way diff of an expected graph against an actual graph."""
    in_expected_not_in_actual: Graph
    in_actual_not_in_expected: Graph
    in_both: Graph


@dataclass
class SpecResult:
    """Base class for the outcome of running one spec against one triple store."""
    spec_uri: URIRef
    triple_store: URIRef


@dataclass
class SpecPassed(SpecResult):
    """The spec ran and the result matched the expectation."""
    pass


@dataclass()
class SpecPassedWithWarning(SpecResult):
    """The spec passed, but with a non-fatal warning (e.g. ignored sh:order)."""
    warning: str


@dataclass
class SelectSpecFailure(SpecResult):
    """A SELECT spec failed; carries the table diff and a summary message."""
    table_comparison: pandas.DataFrame
    message: str


@dataclass
class ConstructSpecFailure(SpecResult):
    """A CONSTRUCT spec failed; carries the graph diff."""
    graph_comparison: GraphComparison


@dataclass
class UpdateSpecFailure(SpecResult):
    """An UPDATE spec failed; carries the graph diff."""
    graph_comparison: GraphComparison


@dataclass
class SparqlParseFailure(SpecResult):
    """The when query could not be parsed as SPARQL."""
    exception: ParseException


@dataclass
class SparqlExecutionError(SpecResult):
    """The triple store raised an error while executing the query."""
    exception: Exception


@dataclass
class TripleStoreConnectionError(SpecResult):
    """The triple store could not be reached."""
    exception: ConnectionError


@dataclass
class SpecSkipped(SpecResult):
    """The spec was not run; carries the reason and source file name."""
    message: str
    spec_file_name: str = "default.mustrd.ttl"


@dataclass
class SparqlAction:
    """Base class for a SPARQL action carrying the raw query text."""
    query: str


@dataclass
class SelectSparqlQuery(SparqlAction):
    """A SPARQL SELECT action."""
    pass


@dataclass
class ConstructSparqlQuery(SparqlAction):
    """A SPARQL CONSTRUCT action."""
    pass


@dataclass
class UpdateSparqlQuery(SparqlAction):
    """A SPARQL UPDATE action."""
    pass
177
-
178
-
179
- # https://github.com/Semantic-partners/mustrd/issues/19
180
-
181
def validate_specs(run_config: dict, triple_stores: List, shacl_graph: Graph, ont_graph: Graph, file_name: str = "*")\
        -> Tuple[List, Graph, List]:
    """Parse and SHACL-validate every *.mustrd.ttl spec file under run_config['spec_path'].

    Returns a tuple of (spec URIs to run, combined spec graph, skipped specs).
    When any spec is marked must:focus "true", only the focused URIs are
    returned (together with any of them that turned out to be invalid).
    """
    spec_graph = Graph()   # accumulates the contents of every valid spec file
    subject_uris = set()   # all TestSpec subjects seen so far, for duplicate detection
    focus_uris = set()     # subjects explicitly marked with must:focus "true"
    invalid_specs = []     # SpecSkipped placeholders for specs that failed validation
    ttl_files = list(run_config['spec_path'].glob(f'**/{file_name}.mustrd.ttl'))
    ttl_files.sort()  # deterministic processing order
    log.info(f"Found {len(ttl_files)} {file_name}.mustrd.ttl files in {run_config['spec_path']}")

    for file in ttl_files:
        error_messages = []

        log.info(f"Parse: {file}")
        try:
            file_graph = Graph().parse(file)
        except BadSyntax as e:
            template = "An exception of type {0} occurred when trying to parse a spec file. Arguments:\n{1!r}"
            message = template.format(type(e).__name__, e.args)
            log.error(message)
            error_messages += [f"Could not extract spec from {file} due to exception of type "
                               f"{type(e).__name__} when parsing file"]
            continue  # unparseable file: nothing further to validate
        # run shacl validation
        conforms, results_graph, results_text = validate(file_graph,
                                                         shacl_graph=shacl_graph,
                                                         ont_graph=ont_graph,
                                                         inference='none',
                                                         abort_on_first=False,
                                                         allow_infos=False,
                                                         allow_warnings=False,
                                                         meta_shacl=False,
                                                         advanced=True,
                                                         js=False,
                                                         debug=False)
        if not conforms:
            for msg in results_graph.objects(predicate=SH.resultMessage):
                log.warning(f"{file_graph}")
                log.warning(f"{msg} File: {file.name}")
                error_messages += [f"{msg} File: {file.name}"]

        # collect a list of uris of the tests in focus
        for focus_uri in file_graph.subjects(predicate=MUST.focus, object=Literal("true", datatype=XSD.boolean)):
            if focus_uri in focus_uris:
                # NOTE(review): duplicated focus URIs are renamed rather than rejected
                focus_uri = URIRef(str(focus_uri) + "_DUPLICATE")
            focus_uris.add(focus_uri)

        # make sure there are no duplicate test IRIs in the files
        for subject_uri in file_graph.subjects(RDF.type, MUST.TestSpec):
            if subject_uri in subject_uris:
                log.warning(f"Duplicate subject URI found: {file.name} {subject_uri}. File will not be parsed.")
                error_messages += [f"Duplicate subject URI found in {file.name}."]
                subject_uri = URIRef(str(subject_uri) + "_DUPLICATE")
            # NOTE(review): this check runs per TestSpec subject; a file that
            # fails SHACL validation but contains no must:TestSpec subjects
            # never reaches this loop, so its errors are silently dropped —
            # confirm this is intended.
            if len(error_messages) > 0:
                error_messages.sort()
                error_message = "\n".join(msg for msg in error_messages)
                invalid_specs += [SpecSkipped(subject_uri, triple_store["type"], error_message, file.name) for triple_store in
                                  triple_stores]
            else:
                subject_uris.add(subject_uri)
                this_spec_graph = Graph()
                this_spec_graph.parse(file)
                spec_uris_in_this_file = list(this_spec_graph.subjects(RDF.type, MUST.TestSpec))
                for spec in spec_uris_in_this_file:
                    # record provenance so later reporting can name the source file
                    this_spec_graph.add([spec, MUST.specSourceFile, Literal(file)])
                    this_spec_graph.add([spec, MUST.specFileName, Literal(file.name)])
                spec_graph += this_spec_graph

    # NOTE(review): sourceFiles is computed but never used below
    sourceFiles = list(spec_graph.subject_objects(MUST.specSourceFile))

    valid_spec_uris = list(spec_graph.subjects(RDF.type, MUST.TestSpec))

    if focus_uris:
        # focus mode: run only the focused specs, reporting any that are invalid
        invalid_focus_specs = []
        for spec in invalid_specs:
            if spec.spec_uri in focus_uris:
                invalid_focus_specs += [spec]
                focus_uris.remove(spec.spec_uri)
        log.info(f"Collected {len(focus_uris)} focus test spec(s)")
        return focus_uris, spec_graph, invalid_focus_specs
    else:
        log.info(f"Collected {len(valid_spec_uris)} valid test spec(s)")
        return valid_spec_uris, spec_graph, invalid_specs
269
-
270
-
271
def get_specs(spec_uris: List[URIRef], spec_graph: Graph, triple_stores: List[dict],
              run_config: dict):
    """Build Specification objects for every (spec URI, triple store) pair.

    Returns (specs, skipped_results): the specifications that can be run, and
    SpecSkipped entries for combinations that could not be prepared.
    """
    specs = []
    skipped_results = []
    try:
        for triple_store in triple_stores:
            if "error" in triple_store:
                # the triple store configuration itself failed: skip every spec for it
                log.error(f"{triple_store['error']}. No specs run for this triple store.")
                skipped_results += [SpecSkipped(spec_uri, triple_store['type'], triple_store['error'], get_spec_file(spec_uri, spec_graph)) for spec_uri in
                                    spec_uris]
            else:
                for spec_uri in spec_uris:
                    try:
                        specs += [get_spec(spec_uri, spec_graph, run_config, triple_store)]
                    except (ValueError, FileNotFoundError, ConnectionError) as e:
                        # one bad spec must not abort the whole run
                        skipped_results += [SpecSkipped(spec_uri, triple_store['type'], e, get_spec_file(spec_uri, spec_graph))]

    except (BadSyntax, FileNotFoundError) as e:
        template = "An exception of type {0} occurred when trying to parse the triple store configuration file. " \
                   "Arguments:\n{1!r}"
        message = template.format(type(e).__name__, e.args)
        log.error(message)
        log.error("No specifications will be run.")

    log.info(f"Extracted {len(specs)} specifications that will be run")
    return specs, skipped_results
297
-
298
-
299
def run_specs(specs) -> List[SpecResult]:
    """Run every specification in turn, collecting one SpecResult per spec."""
    # https://github.com/Semantic-partners/mustrd/issues/115
    return [run_spec(specification) for specification in specs]
305
-
306
def get_spec_file(spec_uri: URIRef, spec_graph: Graph):
    """Return the recorded source file name for a spec, or the default name."""
    file_name = spec_graph.value(subject=spec_uri, predicate=MUST.specFileName, default="default.mustrd.ttl")
    return str(file_name)
308
-
309
def get_spec(spec_uri: URIRef, spec_graph: Graph, run_config: dict, mustrd_triple_store: dict = None) -> Specification:
    """Assemble a runnable Specification (given/when/then) for one spec URI.

    Falls back to an in-memory RdfLib triple store when none is supplied.

    :raises ValueError: when a spec component cannot be resolved.
    :raises FileNotFoundError: when a referenced file is missing.
    :raises ConnectionError: when the triple store cannot be reached.
    """
    try:
        if mustrd_triple_store is None:
            mustrd_triple_store = {"type": TRIPLESTORE.RdfLib}
        components = []
        # parse the three spec components in fixed order: given, when, then
        for predicate in MUST.given, MUST.when, MUST.then:
            components.append(parse_spec_component(subject=spec_uri,
                                                   predicate=predicate,
                                                   spec_graph=spec_graph,
                                                   run_config=run_config,
                                                   mustrd_triple_store=mustrd_triple_store))

        spec_file_name = get_spec_file(spec_uri, spec_graph)
        # https://github.com/Semantic-partners/mustrd/issues/92
        return Specification(spec_uri, mustrd_triple_store, components[0].value, components[1], components[2], spec_file_name)

    except (ValueError, FileNotFoundError) as e:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(e).__name__, e.args)
        log.exception(message)
        raise
    except ConnectionError as e:
        log.error(e)
        raise
333
-
334
-
335
def check_result(spec, result):
    """Compare a when-step result against the spec's then expectation.

    Table expectations (SELECT specs) are compared cell-by-cell; graph
    expectations are checked for isomorphism, with a graph diff attached on
    failure.
    """
    if type(spec.then) == TableThenSpec:
        # SELECT specs expect tabular output
        return table_comparison(result, spec)
    else:
        graph_compare = graph_comparison(spec.then.value, result)
        if isomorphic(result, spec.then.value):
            return SpecPassed(spec.spec_uri, spec.triple_store["type"])
        else:
            # distinguish CONSTRUCT from UPDATE failures for reporting
            if spec.when[0].queryType == MUST.ConstructSparql:
                return ConstructSpecFailure(spec.spec_uri, spec.triple_store["type"], graph_compare)
            else:
                return UpdateSpecFailure(spec.spec_uri, spec.triple_store["type"], graph_compare)
347
-
348
-
349
def run_spec(spec: Specification) -> SpecResult:
    """Run one specification against its triple store and return the outcome.

    Uploads the given (initial state) when present, executes every when step,
    and checks the final result against the then expectation. Connection and
    execution problems are converted into failure results rather than raised.
    """
    spec_uri = spec.spec_uri
    triple_store = spec.triple_store
    # close_connection = True
    log.debug(f"run_when {spec_uri=}, {triple_store=}, {spec.given=}, {spec.when=}, {spec.then=}")
    if spec.given:
        given_as_turtle = spec.given.serialize(format="turtle")
        log.debug(f"{given_as_turtle}")
        upload_given(triple_store, spec.given)
    else:
        # no given: the spec relies on state already present in the store,
        # which an in-memory RdfLib store cannot provide
        if triple_store['type'] == TRIPLESTORE.RdfLib:
            return SpecSkipped(spec_uri, triple_store['type'], "Unable to run Inherited State tests on Rdflib")
    try:
        for when in spec.when:
            log.info(f"Running {when.queryType} spec {spec_uri} on {triple_store['type']}")
            try:
                result = run_when(spec_uri, triple_store, when)
            except ParseException as e:
                return SparqlParseFailure(spec_uri, triple_store["type"], e)
            except NotImplementedError as ex:
                return SpecSkipped(spec_uri, triple_store["type"], ex.args[0])
        # NOTE(review): only the result of the LAST when step is checked —
        # confirm earlier steps are meant to be state-mutating only.
        return check_result(spec, result)
    except (ConnectionError, TimeoutError, HTTPError, ConnectTimeout, OSError) as e:
        # close_connection = False
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(e).__name__, e.args)
        log.error(message)
        return TripleStoreConnectionError(spec_uri, triple_store["type"], message)
    except (TypeError, RequestException) as e:
        log.error(f"{type(e)} {e}")
        return SparqlExecutionError(spec_uri, triple_store["type"], e)

# https://github.com/Semantic-partners/mustrd/issues/78
# finally:
#     if type(mustrd_triple_store) == MustrdAnzo and close_connection:
#         mustrd_triple_store.clear_graph()
385
-
386
def get_triple_store_graph(triple_store_graph_path: Path, secrets: str):
    """Load a triple-store config graph, merging secrets from a string or a
    sibling "<stem>_secrets<suffix>" file next to the config."""
    graph = Graph().parse(triple_store_graph_path)
    if secrets:
        return graph.parse(data=secrets)
    secrets_name = triple_store_graph_path.stem + "_secrets" + triple_store_graph_path.suffix
    return graph.parse(triple_store_graph_path.parent / Path(secrets_name))
392
-
393
-
394
def get_triple_stores(triple_store_graph: Graph) -> list[dict]:
    """Turn an RDF triple-store configuration graph into a list of config dicts.

    The graph is validated against the packaged SHACL shapes first; a
    non-conforming graph raises ValueError. Per-store problems (missing
    credentials or parameters) are stored under the dict's "error" key instead
    of being raised, so the remaining stores can still be used.
    """
    triple_stores = []
    shacl_graph = Graph().parse(Path(os.path.join(get_mustrd_root(), "model/triplestoreshapes.ttl")))
    ont_graph = Graph().parse(Path(os.path.join(get_mustrd_root(), "model/triplestoreOntology.ttl")))
    conforms, results_graph, results_text = validate(
        data_graph= triple_store_graph,
        shacl_graph = shacl_graph,
        ont_graph = ont_graph,
        advanced= True,
        inference= 'none'
    )
    if not conforms:
        raise ValueError(f"Triple store configuration not conform to the shapes. SHACL report: {results_text}", results_graph)
    # every subject with an rdf:type is treated as a triple store definition
    for triple_store_config, rdf_type, triple_store_type in triple_store_graph.triples((None, RDF.type, None)):
        triple_store = {}
        triple_store["type"] = triple_store_type
        triple_store["uri"] = triple_store_config
        # Anzo graph via anzo
        if triple_store_type == TRIPLESTORE.Anzo:
            triple_store["url"] = triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.url)
            triple_store["port"] = triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.port)
            try:
                triple_store["username"] = str(triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.username))
                triple_store["password"] = str(triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.password))
            except (FileNotFoundError, ValueError) as e:
                # credential lookup failed: record and keep going
                triple_store["error"] = e
            triple_store["gqe_uri"] = triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.gqeURI)
            triple_store["input_graph"] = triple_store_graph.value(subject=triple_store_config,
                                                                   predicate=TRIPLESTORE.inputGraph)
            triple_store["output_graph"] = triple_store_graph.value(subject=triple_store_config,
                                                                    predicate=TRIPLESTORE.outputGraph)
            try:
                check_triple_store_params(triple_store, ["url", "port", "username", "password", "input_graph"])
            except ValueError as e:
                triple_store["error"] = e
        # GraphDB
        elif triple_store_type == TRIPLESTORE.GraphDb:
            triple_store["url"] = triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.url)
            triple_store["port"] = triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.port)
            try:
                triple_store["username"] = str(triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.username))
                triple_store["password"] = str(triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.password))
            except (FileNotFoundError, ValueError) as e:
                log.error(f"Credential retrieval failed {e}")
                triple_store["error"] = e
            triple_store["repository"] = triple_store_graph.value(subject=triple_store_config,
                                                                  predicate=TRIPLESTORE.repository)
            triple_store["input_graph"] = triple_store_graph.value(subject=triple_store_config,
                                                                   predicate=TRIPLESTORE.inputGraph)

            try:
                check_triple_store_params(triple_store, ["url", "port", "repository"])
            except ValueError as e:
                triple_store["error"] = e
        elif triple_store_type != TRIPLESTORE.RdfLib:
            # anything other than Anzo / GraphDB / RdfLib is unsupported
            triple_store["error"] = f"Triple store not implemented: {triple_store_type}"

        triple_stores.append(triple_store)
    return triple_stores
453
-
454
-
455
def check_triple_store_params(triple_store: dict, required_params: List[str]):
    """Raise ValueError naming every required connection parameter that is unset."""
    absent = [name for name in required_params if triple_store.get(name) is None]
    if not absent:
        return
    raise ValueError(f"Cannot establish connection to {triple_store['type']}. "
                     f"Missing required parameter(s): {', '.join(absent)}.")
460
-
461
-
462
def get_credential_from_file(triple_store_name: URIRef, credential: str, config_path: Literal) -> str:
    """Fetch one credential (e.g. username or password) for a triple store from a TOML file.

    The TOML file is keyed by the triple store's URI string, then by credential name.

    :raises ValueError: when no config path is given or the TOML is invalid.
    :raises FileNotFoundError: when the config file does not exist.
    """
    # NOTE(review): config_path is annotated as rdflib Literal but is used as a
    # plain path string — confirm the intended type.
    log.info(f"get_credential_from_file {triple_store_name}, {credential}, {config_path}")
    if config_path is None:
        raise ValueError(f"Cannot establish connection defined in {triple_store_name}. "
                         f"Missing required parameter: {credential}.")
    path = Path(config_path)
    log.info(f"get_credential_from_file {path}")

    if not os.path.isfile(path):
        log.error(f"couldn't find {path}")
        raise FileNotFoundError(f"Credentials config file not found: {path}")
    try:
        # TOML must be opened in binary mode for tomli
        with open(path, "rb") as f:
            config = tomli.load(f)
    except tomli.TOMLDecodeError as e:
        log.error(f"config error {path} {e}")
        raise ValueError(f"Error reading credentials config file: {e}")
    return config[str(triple_store_name)][credential]
482
-
483
- # Convert sparql json query results as defined in https://www.w3.org/TR/rdf-sparql-json-res/
484
def json_results_to_panda_dataframe(result: str) -> pandas.DataFrame:
    """Convert a SPARQL JSON result string into a flat DataFrame.

    Each bound variable becomes two columns: "<var>" holding the value as a
    string and "<var>_datatype" holding the literal datatype (xsd:string when
    none is given) or xsd:anyURI for non-literal terms. Cells for variables a
    row does not bind are filled with the empty string.
    """
    json_result = json.loads(result)
    # Build one single-row frame per binding and concatenate ONCE at the end;
    # the previous implementation called pandas.concat (and fillna) inside the
    # loop, which is quadratic in the number of result rows.
    row_frames = []
    for binding in json_result["results"]["bindings"]:
        columns = []
        values = []
        for key, value_object in binding.items():
            columns.append(key)
            values.append(str(value_object["value"]))
            columns.append(key + "_datatype")
            if value_object.get("type") == "literal":
                # literals default to xsd:string unless a datatype is present
                values.append(value_object.get("datatype", str(XSD.string)))
            else:
                values.append(str(XSD.anyURI))
        row_frames.append(pandas.DataFrame([values], columns=columns))

    if not row_frames:
        return pandas.DataFrame()
    frames = pandas.concat(objs=row_frames, ignore_index=True)
    # rows missing a variable that other rows bind get empty-string cells
    frames.fillna('', inplace=True)
    if frames.size == 0:
        frames = pandas.DataFrame()
    return frames
509
-
510
-
511
- # https://github.com/Semantic-partners/mustrd/issues/110
512
- # https://github.com/Semantic-partners/mustrd/issues/52
513
# https://github.com/Semantic-partners/mustrd/issues/110
# https://github.com/Semantic-partners/mustrd/issues/52
def table_comparison(result: str, spec: Specification) -> SpecResult:
    """Compare a SELECT result (SPARQL JSON string) against the spec's expected table.

    Covers four scenarios: unexpected result, mismatching result, correctly
    empty result, and missing result. Returns SpecPassed /
    SpecPassedWithWarning / SelectSpecFailure / SparqlParseFailure /
    SpecSkipped accordingly.
    """
    warning = None
    # crude detection of an ORDER BY clause in the when query text
    order_list = ["order by ?", "order by desc", "order by asc"]
    ordered_result = any(pattern in spec.when[0].value.lower() for pattern in order_list)
    then = spec.then.value
    try:
        if is_json(result):
            df = json_results_to_panda_dataframe(result)
            columns = list(df.columns)
        else:
            # non-JSON results cannot be compared as tables
            raise ParseException
        sorted_columns = sorted(columns)
        sorted_then_cols = sorted(list(then))
        if not df.empty:

            if not ordered_result:
                # no ORDER BY: normalise row order before comparing
                # (columns[::2] selects the value columns, skipping *_datatype)
                df.sort_values(by=columns[::2], inplace=True)
                df.reset_index(inplace=True, drop=True)
                if spec.then.ordered:
                    warning = f"sh:order in {spec.spec_uri} is ignored, no ORDER BY in query"
                    log.warning(warning)

            # Scenario 1: expected no result but got a result
            if then.empty:
                message = f"Expected 0 row(s) and 0 column(s), got {df.shape[0]} row(s) and {round(df.shape[1] / 2)} column(s)"
                empty_then = create_empty_dataframe_with_columns(df)
                df_diff = empty_then.compare(df, result_names=("expected", "actual"))

            else:
                # Scenario 2: expected a result and got a result
                message = f"Expected {then.shape[0]} row(s) and {round(then.shape[1] / 2)} column(s), " \
                          f"got {df.shape[0]} row(s) and {round(df.shape[1] / 2)} column(s)"
                if ordered_result is True and not spec.then.ordered:
                    # an ordered query needs an ordered expectation
                    message += ". Actual result is ordered, must:then must contain sh:order on every row."
                    return SelectSpecFailure(spec.spec_uri, spec.triple_store["type"], None, message)
                else:
                    if len(columns) == len(then.columns):
                        if sorted_columns == sorted_then_cols:
                            # same columns: align order, then compare directly
                            then = then[columns]
                            if not ordered_result:
                                # NOTE(review): sort on a column-selected frame may
                                # emit a SettingWithCopyWarning — confirm.
                                then.sort_values(by=columns[::2], inplace=True)
                                then.reset_index(drop=True, inplace=True)
                            if df.shape == then.shape and (df.columns == then.columns).all():
                                df_diff = then.compare(df, result_names=("expected", "actual"))
                            else:
                                df_diff = construct_df_diff(df, then)
                        else:
                            # same column count, different names
                            then = then[sorted_then_cols]
                            df = df[sorted_columns]
                            df_diff = construct_df_diff(df, then)
                    else:
                        # different column counts
                        then = then[sorted_then_cols]
                        df = df[sorted_columns]
                        df_diff = construct_df_diff(df, then)
        else:

            if then.empty:
                # Scenario 3: expected no result, got no result
                message = f"Expected 0 row(s) and 0 column(s), got 0 row(s) and 0 column(s)"
                df = pandas.DataFrame()
            else:
                # Scenario 4: expected a result, but got an empty result
                message = f"Expected {then.shape[0]} row(s) and {round(then.shape[1] / 2)} column(s), got 0 row(s) and 0 column(s)"
                then = then[sorted_then_cols]
                df = create_empty_dataframe_with_columns(then)
            df_diff = then.compare(df, result_names=("expected", "actual"))
            print(df_diff.to_markdown())

        if df_diff.empty:
            if warning:
                return SpecPassedWithWarning(spec.spec_uri, spec.triple_store["type"], warning)
            else:
                return SpecPassed(spec.spec_uri, spec.triple_store["type"])
        else:
            log.error(message)
            return SelectSpecFailure(spec.spec_uri, spec.triple_store["type"], df_diff, message)

    except ParseException as e:
        return SparqlParseFailure(spec.spec_uri, spec.triple_store["type"], e)
    except NotImplementedError as ex:
        return SpecSkipped(spec.spec_uri, spec.triple_store["type"], ex)
611
-
612
-
613
def graph_comparison(expected_graph: Graph, actual_graph: Graph) -> GraphComparison:
    """Diff two graphs into (expected-only, actual-only, shared) triple sets."""
    in_both, in_expected, in_actual = graph_diff(expected_graph, actual_graph)
    only_expected = in_expected - in_actual
    only_actual = in_actual - in_expected
    return GraphComparison(only_expected, only_actual, in_both)
621
-
622
-
623
def get_then_update(spec_uri: URIRef, spec_graph: Graph) -> Graph:
    """Extract the expected (then) triples of an update spec as a graph.

    Un-reifies the must:hasStatement rdf:Statement blocks of the spec's
    StatementsDataset back into plain (?s ?p ?o) triples via CONSTRUCT.
    """
    then_query = f"""
    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

    CONSTRUCT {{ ?s ?p ?o }}
    {{
        <{spec_uri}> <{MUST.then}>
            a <{MUST.StatementsDataset}> ;
            <{MUST.hasStatement}> [
                a rdf:Statement ;
                rdf:subject ?s ;
                rdf:predicate ?p ;
                rdf:object ?o ;
            ] ; ]
    }}
    """
    # NOTE(review): the query text above (stray "]" after the bracket block,
    # predicate path after must:then) looks malformed — confirm against a
    # working update spec before relying on this helper.
    expected_results = spec_graph.query(then_query).graph

    return expected_results
642
-
643
-
644
def calculate_row_difference(df1: pandas.DataFrame,
                             df2: pandas.DataFrame) -> pandas.DataFrame:
    """Return the rows of *df1* that do not appear anywhere in *df2*."""
    merged = df1.merge(df2.drop_duplicates(), how='left', indicator=True)
    left_only = merged[merged['_merge'] == 'left_only']
    return left_only.drop('_merge', axis=1)
650
-
651
-
652
def construct_df_diff(df: pandas.DataFrame,
                      then: pandas.DataFrame) -> pandas.DataFrame:
    """Build a readable diff between the actual (*df*) and expected (*then*) tables.

    Handles mismatched rows, mismatched columns, and both at once by padding
    the narrower frame with empty-string columns before comparing.
    """
    actual_rows = calculate_row_difference(df, then)
    expected_rows = calculate_row_difference(then, df)
    actual_columns = df.columns.difference(then.columns)
    expected_columns = then.columns.difference(df.columns)

    df_diff = pandas.DataFrame()
    modified_df = df
    modified_then = then

    if actual_columns.size > 0:
        # pad the expected frame with the columns only the actual result has
        modified_then = modified_then.reindex(modified_then.columns.to_list() + actual_columns.to_list(), axis=1)
        modified_then[actual_columns.to_list()] = modified_then[actual_columns.to_list()].fillna('')

    if expected_columns.size > 0:
        # pad the actual frame with the columns only the expectation has
        modified_df = modified_df.reindex(modified_df.columns.to_list() + expected_columns.to_list(), axis=1)
        modified_df[expected_columns.to_list()] = modified_df[expected_columns.to_list()].fillna('')

    # align column order before any comparison
    modified_df = modified_df.reindex(modified_then.columns, axis=1)

    if df.shape[0] != then.shape[0] and df.shape[1] != then.shape[1]:
        # both row count and column count differ: recompute rows on padded frames
        actual_rows = calculate_row_difference(modified_df, modified_then)
        expected_rows = calculate_row_difference(modified_then, modified_df)
        df_diff = generate_row_diff(actual_rows, expected_rows)
    elif actual_rows.shape[0] > 0 or expected_rows.shape[0] > 0:
        # only rows differ
        df_diff = generate_row_diff(actual_rows, expected_rows)
    elif actual_columns.size > 0 or expected_columns.size > 0:
        # only columns differ: full cell-by-cell comparison on padded frames
        df_diff = modified_then.compare(modified_df, result_names=("expected", "actual"), keep_shape=True,
                                        keep_equal=True)
    df_diff.fillna("", inplace=True)
    return df_diff
685
-
686
-
687
def generate_row_diff(actual_rows: pandas.DataFrame, expected_rows: pandas.DataFrame) -> pandas.DataFrame:
    """Combine surplus and missing rows into one expected/actual comparison frame."""
    surplus_diff = pandas.DataFrame()
    missing_diff = pandas.DataFrame()

    if actual_rows.shape[0] > 0:
        # surplus rows: compare an all-empty frame against the actual rows
        blank = create_empty_dataframe_with_columns(actual_rows)
        surplus_diff = blank.compare(actual_rows, result_names=("expected", "actual"))

    if expected_rows.shape[0] > 0:
        # missing rows: compare the expected rows against an all-empty frame
        blank = create_empty_dataframe_with_columns(expected_rows)
        missing_diff = expected_rows.compare(blank, result_names=("expected", "actual"))

    return pandas.concat([surplus_diff, missing_diff], ignore_index=True)
701
-
702
-
703
def create_empty_dataframe_with_columns(df: pandas.DataFrame) -> pandas.DataFrame:
    """Return a frame with *df*'s shape, columns and index where every cell is ''."""
    blank = pandas.DataFrame().reindex_like(df)
    return blank.fillna("")
707
-
708
-
709
def review_results(results: List[SpecResult], verbose: bool) -> None:
    """Print a colourised overview table of all spec results.

    When *verbose* is true and there were failures, warnings or skips, also
    print the per-spec details (graph diffs, table diffs, exceptions, messages).
    """
    print("===== Result Overview =====")
    # Init dictionaries
    status_dict = defaultdict(lambda: defaultdict(int))     # spec_uri -> triple_store -> result type
    status_counts = defaultdict(lambda: defaultdict(int))   # triple_store -> result type -> count
    # result types not listed here are rendered red (failures)
    colours = {SpecPassed: Fore.GREEN, SpecPassedWithWarning: Fore.YELLOW, SpecSkipped: Fore.YELLOW}
    # Populate dictionaries from results
    for result in results:
        status_counts[result.triple_store][type(result)] += 1
        status_dict[result.spec_uri][result.triple_store] = type(result)

    # Get the list of statuses and list of unique triple stores
    statuses = list(status for inner_dict in status_dict.values() for status in inner_dict.values())
    triple_stores = list(set(status for inner_dict in status_dict.values() for status in inner_dict.keys()))

    # Convert dictionaries to list for tabulate
    table_rows = [[spec_uri] + [
        f"{colours.get(status_dict[spec_uri][triple_store], Fore.RED)}{status_dict[spec_uri][triple_store].__name__}{Style.RESET_ALL}"
        for triple_store in triple_stores] for spec_uri in set(status_dict.keys())]

    status_rows = [[f"{colours.get(status, Fore.RED)}{status.__name__}{Style.RESET_ALL}"] +
                   [f"{colours.get(status, Fore.RED)}{status_counts[triple_store][status]}{Style.RESET_ALL}"
                    for triple_store in triple_stores] for status in set(statuses)]

    # Display tables with tabulate
    print(tabulate(table_rows, headers=['Spec Uris / triple stores'] + triple_stores, tablefmt="pretty"))
    print(tabulate(status_rows, headers=['Status / triple stores'] + triple_stores, tablefmt="pretty"))

    pass_count = statuses.count(SpecPassed)
    warning_count = statuses.count(SpecPassedWithWarning)
    skipped_count = statuses.count(SpecSkipped)
    # everything that is not passed/warned/skipped counts as a failure
    fail_count = len(
        list(filter(lambda status: status not in [SpecPassed, SpecPassedWithWarning, SpecSkipped], statuses)))

    # overall colour: red beats yellow beats green
    if fail_count:
        overview_colour = Fore.RED
    elif warning_count or skipped_count:
        overview_colour = Fore.YELLOW
    else:
        overview_colour = Fore.GREEN

    logger_setup.flush()
    print(f"{overview_colour}===== {fail_count} failures, {skipped_count} skipped, {Fore.GREEN}{pass_count} passed, "
          f"{overview_colour}{warning_count} passed with warnings =====")

    if verbose and (fail_count or warning_count or skipped_count):
        for res in results:
            if type(res) == UpdateSpecFailure:
                print(f"{Fore.RED}Failed {res.spec_uri} {res.triple_store}")
                print(f"{Fore.BLUE} In Expected Not In Actual:")
                print(res.graph_comparison.in_expected_not_in_actual.serialize(format="ttl"))
                print()
                print(f"{Fore.RED} in_actual_not_in_expected")
                print(res.graph_comparison.in_actual_not_in_expected.serialize(format="ttl"))
                print(f"{Fore.GREEN} in_both")
                print(res.graph_comparison.in_both.serialize(format="ttl"))

            if type(res) == SelectSpecFailure:
                print(f"{Fore.RED}Failed {res.spec_uri} {res.triple_store}")
                print(res.message)
                print(res.table_comparison.to_markdown())
            if type(res) == ConstructSpecFailure or type(res) == UpdateSpecFailure:
                print(f"{Fore.RED}Failed {res.spec_uri} {res.triple_store}")
            if type(res) == SpecPassedWithWarning:
                print(f"{Fore.YELLOW}Passed with warning {res.spec_uri} {res.triple_store}")
                print(res.warning)
            if type(res) == TripleStoreConnectionError or type(res) == SparqlExecutionError or \
                    type(res) == SparqlParseFailure:
                print(f"{Fore.RED}Failed {res.spec_uri} {res.triple_store}")
                print(res.exception)
            if type(res) == SpecSkipped:
                print(f"{Fore.YELLOW}Skipped {res.spec_uri} {res.triple_store}")
                print(res.message)
782
-
783
-
784
-
785
-
786
-
787
-
1
+ """
2
+ MIT License
3
+
4
+ Copyright (c) 2023 Semantic Partners Ltd
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+ """
24
+
25
+ import os
26
+ from typing import Tuple, List
27
+
28
+ import tomli
29
+ from rdflib.plugins.parsers.notation3 import BadSyntax
30
+
31
+ from . import logger_setup
32
+ from dataclasses import dataclass
33
+
34
+ from pyparsing import ParseException
35
+ from pathlib import Path
36
+ from requests import ConnectionError, ConnectTimeout, HTTPError, RequestException
37
+
38
+ from rdflib import Graph, URIRef, RDF, XSD, SH, Literal
39
+
40
+ from rdflib.compare import isomorphic, graph_diff
41
+ import pandas
42
+
43
+ from .namespace import MUST, TRIPLESTORE
44
+ import requests
45
+ import json
46
+ from pandas import DataFrame
47
+
48
+ from .spec_component import TableThenSpec, parse_spec_component, WhenSpec, ThenSpec
49
+ from .utils import is_json,get_mustrd_root
50
+ from colorama import Fore, Style
51
+ from tabulate import tabulate
52
+ from collections import defaultdict
53
+ from pyshacl import validate
54
+ import logging
55
+ from http.client import HTTPConnection
56
+ from .steprunner import upload_given, run_when
57
+
58
# Module-level logger for this file.
log = logger_setup.setup_logger(__name__)

# Silence urllib3's InsecureRequestWarning noise (unverified HTTPS requests).
requests.packages.urllib3.disable_warnings()
# Widen the accepted cipher list; excludes DH and anonymous auth.
# NOTE(review): DEFAULT_CIPHERS is a private urllib3 attribute — confirm it still exists in the pinned urllib3 version.
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ':HIGH:!DH:!aNULL'

logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
64
+
65
+
66
def debug_requests_on():
    """Enable verbose HTTP debugging for the requests/urllib3 stack."""
    # Make http.client print request and response headers.
    HTTPConnection.debuglevel = 1

    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)
    urllib3_logger = logging.getLogger("requests.packages.urllib3")
    urllib3_logger.setLevel(logging.DEBUG)
    urllib3_logger.propagate = True
75
+
76
def debug_requests_off():
    """Disable requests-module HTTP debugging; may clear existing root handlers."""
    HTTPConnection.debuglevel = 0

    root = logging.getLogger()
    root.setLevel(logging.WARNING)
    # Dropping the handlers undoes any basicConfig done by debug_requests_on.
    root.handlers = []
    urllib3_logger = logging.getLogger("requests.packages.urllib3")
    urllib3_logger.setLevel(logging.WARNING)
    urllib3_logger.propagate = False
86
+
87
+ debug_requests_off()
88
+
89
@dataclass
class Specification:
    """A single parsed mustrd test: given/when/then plus the target triple store."""
    spec_uri: URIRef
    triple_store: dict
    given: Graph
    when: WhenSpec
    then: ThenSpec
    spec_file_name: str = "default.mustrd.ttl"


@dataclass
class GraphComparison:
    """Three-way diff between an expected and an actual RDF graph."""
    in_expected_not_in_actual: Graph
    in_actual_not_in_expected: Graph
    in_both: Graph


@dataclass
class SpecResult:
    """Base class for the outcome of running one spec against one triple store."""
    spec_uri: URIRef
    triple_store: URIRef


@dataclass
class SpecPassed(SpecResult):
    """The spec ran and the result matched the expectation."""
    pass


@dataclass()
class SpecPassedWithWarning(SpecResult):
    """The spec passed, but with a non-fatal warning (e.g. ignored sh:order)."""
    warning: str


@dataclass
class SelectSpecFailure(SpecResult):
    """A SELECT spec failed; carries the tabular diff and a summary message."""
    table_comparison: pandas.DataFrame
    message: str


@dataclass
class ConstructSpecFailure(SpecResult):
    """A CONSTRUCT spec failed; carries the graph diff."""
    graph_comparison: GraphComparison


@dataclass
class UpdateSpecFailure(SpecResult):
    """An UPDATE spec failed; carries the graph diff."""
    graph_comparison: GraphComparison


@dataclass
class SparqlParseFailure(SpecResult):
    """The SPARQL query could not be parsed."""
    exception: ParseException


@dataclass
class SparqlExecutionError(SpecResult):
    """The SPARQL query failed during execution."""
    exception: Exception


@dataclass
class TripleStoreConnectionError(SpecResult):
    """The triple store could not be reached."""
    exception: ConnectionError


@dataclass
class SpecSkipped(SpecResult):
    """The spec was not run; `message` explains why."""
    message: str
    spec_file_name: str = "default.mustrd.ttl"


@dataclass
class SparqlAction:
    """Base class wrapping a raw SPARQL query string."""
    query: str


@dataclass
class SelectSparqlQuery(SparqlAction):
    """A SPARQL SELECT action."""
    pass


@dataclass
class ConstructSparqlQuery(SparqlAction):
    """A SPARQL CONSTRUCT action."""
    pass


@dataclass
class UpdateSparqlQuery(SparqlAction):
    """A SPARQL UPDATE action."""
    pass
177
+
178
+
179
+ # https://github.com/Semantic-partners/mustrd/issues/19
180
+
181
def validate_specs(run_config: dict, triple_stores: List, shacl_graph: Graph, ont_graph: Graph, file_name: str = "*")\
        -> Tuple[List, Graph, List]:
    """Find, parse and SHACL-validate *.mustrd.ttl spec files under run_config['spec_path'].

    Returns (spec URIs to run, combined spec graph, SpecSkipped entries for
    invalid specs). If any spec is marked must:focus "true", only the focussed
    URIs (and their matching invalid entries) are returned.
    """
    spec_graph = Graph()
    subject_uris = set()
    focus_uris = set()
    invalid_specs = []
    ttl_files = list(run_config['spec_path'].glob(f'**/{file_name}.mustrd.ttl'))
    ttl_files.sort()
    log.info(f"Found {len(ttl_files)} {file_name}.mustrd.ttl files in {run_config['spec_path']}")

    for file in ttl_files:
        # Errors are collected per file; any error skips every spec in the file.
        error_messages = []

        log.info(f"Parse: {file}")
        try:
            file_graph = Graph().parse(file)
        except BadSyntax as e:
            template = "An exception of type {0} occurred when trying to parse a spec file. Arguments:\n{1!r}"
            message = template.format(type(e).__name__, e.args)
            log.error(message)
            error_messages += [f"Could not extract spec from {file} due to exception of type "
                               f"{type(e).__name__} when parsing file"]
            continue
        # run shacl validation
        conforms, results_graph, results_text = validate(file_graph,
                                                         shacl_graph=shacl_graph,
                                                         ont_graph=ont_graph,
                                                         inference='none',
                                                         abort_on_first=False,
                                                         allow_infos=False,
                                                         allow_warnings=False,
                                                         meta_shacl=False,
                                                         advanced=True,
                                                         js=False,
                                                         debug=False)
        if not conforms:
            for msg in results_graph.objects(predicate=SH.resultMessage):
                log.warning(f"{file_graph}")
                log.warning(f"{msg} File: {file.name}")
                error_messages += [f"{msg} File: {file.name}"]

        # collect a list of uris of the tests in focus
        for focus_uri in file_graph.subjects(predicate=MUST.focus, object=Literal("true", datatype=XSD.boolean)):
            if focus_uri in focus_uris:
                # Disambiguate duplicated focus URIs so both entries survive in the set.
                focus_uri = URIRef(str(focus_uri) + "_DUPLICATE")
            focus_uris.add(focus_uri)

        # make sure there are no duplicate test IRIs in the files
        for subject_uri in file_graph.subjects(RDF.type, MUST.TestSpec):
            if subject_uri in subject_uris:
                log.warning(f"Duplicate subject URI found: {file.name} {subject_uri}. File will not be parsed.")
                error_messages += [f"Duplicate subject URI found in {file.name}."]
                subject_uri = URIRef(str(subject_uri) + "_DUPLICATE")
            if len(error_messages) > 0:
                error_messages.sort()
                error_message = "\n".join(msg for msg in error_messages)
                # One SpecSkipped per configured triple store for this invalid spec.
                invalid_specs += [SpecSkipped(subject_uri, triple_store["type"], error_message, file.name) for triple_store in
                                  triple_stores]
            else:
                subject_uris.add(subject_uri)
                this_spec_graph = Graph()
                this_spec_graph.parse(file)
                spec_uris_in_this_file = list(this_spec_graph.subjects(RDF.type, MUST.TestSpec))
                # Record the originating file on each spec so results can name their source.
                for spec in spec_uris_in_this_file:
                    this_spec_graph.add([spec, MUST.specSourceFile, Literal(file)])
                    this_spec_graph.add([spec, MUST.specFileName, Literal(file.name)])
                spec_graph += this_spec_graph

    sourceFiles = list(spec_graph.subject_objects(MUST.specSourceFile))

    valid_spec_uris = list(spec_graph.subjects(RDF.type, MUST.TestSpec))

    if focus_uris:
        # Focus mode: run only the focussed specs; keep their invalid entries.
        invalid_focus_specs = []
        for spec in invalid_specs:
            if spec.spec_uri in focus_uris:
                invalid_focus_specs += [spec]
                focus_uris.remove(spec.spec_uri)
        log.info(f"Collected {len(focus_uris)} focus test spec(s)")
        return focus_uris, spec_graph, invalid_focus_specs
    else:
        log.info(f"Collected {len(valid_spec_uris)} valid test spec(s)")
        return valid_spec_uris, spec_graph, invalid_specs
269
+
270
+
271
def get_specs(spec_uris: List[URIRef], spec_graph: Graph, triple_stores: List[dict],
              run_config: dict):
    """Build Specification objects for every (triple store, spec URI) pair.

    Returns (specs, skipped_results): the specs that parsed cleanly, plus
    SpecSkipped records for stores in error or specs that failed to parse.
    """
    specs = []
    skipped_results = []
    try:
        for triple_store in triple_stores:
            if "error" in triple_store:
                # A broken store configuration skips every spec for that store.
                log.error(f"{triple_store['error']}. No specs run for this triple store.")
                skipped_results.extend(
                    SpecSkipped(spec_uri, triple_store['type'], triple_store['error'],
                                get_spec_file(spec_uri, spec_graph))
                    for spec_uri in spec_uris)
                continue
            for spec_uri in spec_uris:
                try:
                    specs.append(get_spec(spec_uri, spec_graph, run_config, triple_store))
                except (ValueError, FileNotFoundError, ConnectionError) as e:
                    skipped_results.append(
                        SpecSkipped(spec_uri, triple_store['type'], e,
                                    get_spec_file(spec_uri, spec_graph)))

    except (BadSyntax, FileNotFoundError) as e:
        template = "An exception of type {0} occurred when trying to parse the triple store configuration file. " \
                   "Arguments:\n{1!r}"
        message = template.format(type(e).__name__, e.args)
        log.error(message)
        log.error("No specifications will be run.")

    log.info(f"Extracted {len(specs)} specifications that will be run")
    return specs, skipped_results
297
+
298
+
299
def run_specs(specs) -> List[SpecResult]:
    """Execute every specification in turn and collect the per-spec results."""
    # https://github.com/Semantic-partners/mustrd/issues/115
    return [run_spec(specification) for specification in specs]
305
+
306
def get_spec_file(spec_uri: URIRef, spec_graph: Graph):
    """Return the file name recorded for a spec, or the default when unknown."""
    file_name = spec_graph.value(subject=spec_uri, predicate=MUST.specFileName, default="default.mustrd.ttl")
    return str(file_name)
308
+
309
def get_spec(spec_uri: URIRef, spec_graph: Graph, run_config: dict, mustrd_triple_store: dict = None) -> Specification:
    """Parse the given/when/then components of one spec into a Specification.

    Defaults to an in-memory RdfLib store when no triple store is supplied.
    Re-raises ValueError/FileNotFoundError/ConnectionError after logging so the
    caller (get_specs) can record the spec as skipped.
    """
    try:
        if mustrd_triple_store is None:
            mustrd_triple_store = {"type": TRIPLESTORE.RdfLib}
        components = []
        # Order matters: components[0]=given, [1]=when, [2]=then (used below).
        for predicate in MUST.given, MUST.when, MUST.then:
            components.append(parse_spec_component(subject=spec_uri,
                                                   predicate=predicate,
                                                   spec_graph=spec_graph,
                                                   run_config=run_config,
                                                   mustrd_triple_store=mustrd_triple_store))

        spec_file_name = get_spec_file(spec_uri, spec_graph)
        # https://github.com/Semantic-partners/mustrd/issues/92
        return Specification(spec_uri, mustrd_triple_store, components[0].value, components[1], components[2], spec_file_name)

    except (ValueError, FileNotFoundError) as e:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(e).__name__, e.args)
        log.exception(message)
        raise
    except ConnectionError as e:
        log.error(e)
        raise
333
+
334
+
335
def check_result(spec, result):
    """Compare a when-step result against the spec's then expectation."""
    # Tabular expectations go through the dataframe comparison path.
    if type(spec.then) == TableThenSpec:
        return table_comparison(result, spec)

    graph_compare = graph_comparison(spec.then.value, result)
    if isomorphic(result, spec.then.value):
        return SpecPassed(spec.spec_uri, spec.triple_store["type"])
    if spec.when[0].queryType == MUST.ConstructSparql:
        return ConstructSpecFailure(spec.spec_uri, spec.triple_store["type"], graph_compare)
    return UpdateSpecFailure(spec.spec_uri, spec.triple_store["type"], graph_compare)
347
+
348
+
349
def run_spec(spec: Specification) -> SpecResult:
    """Run one specification end to end: upload the given, execute each when,
    then check the result against the then expectation.

    Returns a SpecResult subclass describing pass/fail/skip/error.
    """
    spec_uri = spec.spec_uri
    triple_store = spec.triple_store
    # close_connection = True
    log.debug(f"run_when {spec_uri=}, {triple_store=}, {spec.given=}, {spec.when=}, {spec.then=}")
    if spec.given:
        given_as_turtle = spec.given.serialize(format="turtle")
        log.debug(f"{given_as_turtle}")
        upload_given(triple_store, spec.given)
    else:
        # No given: the test relies on pre-existing (inherited) store state,
        # which an in-memory RdfLib store cannot provide.
        if triple_store['type'] == TRIPLESTORE.RdfLib:
            return SpecSkipped(spec_uri, triple_store['type'], "Unable to run Inherited State tests on Rdflib")
    try:
        # Only the LAST when's result is checked; earlier whens run for their side effects.
        for when in spec.when:
            log.info(f"Running {when.queryType} spec {spec_uri} on {triple_store['type']}")
            try:
                result = run_when(spec_uri, triple_store, when)
            except ParseException as e:
                return SparqlParseFailure(spec_uri, triple_store["type"], e)
            except NotImplementedError as ex:
                return SpecSkipped(spec_uri, triple_store["type"], ex.args[0])
        return check_result(spec, result)
    except (ConnectionError, TimeoutError, HTTPError, ConnectTimeout, OSError) as e:
        # close_connection = False
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(e).__name__, e.args)
        log.error(message)
        return TripleStoreConnectionError(spec_uri, triple_store["type"], message)
    except (TypeError, RequestException) as e:
        log.error(f"{type(e)} {e}")
        return SparqlExecutionError(spec_uri, triple_store["type"], e)

    # https://github.com/Semantic-partners/mustrd/issues/78
    # finally:
    #     if type(mustrd_triple_store) == MustrdAnzo and close_connection:
    #         mustrd_triple_store.clear_graph()
385
+
386
def get_triple_store_graph(triple_store_graph_path: Path, secrets: str):
    """Parse the triple store config, merging secrets supplied inline or from a sibling *_secrets file."""
    config_graph = Graph().parse(triple_store_graph_path)
    if secrets:
        # Secrets were passed in by the caller as serialized RDF.
        return config_graph.parse(data=secrets)
    # Otherwise expect "<stem>_secrets<suffix>" next to the config file.
    secret_path = triple_store_graph_path.parent / Path(
        triple_store_graph_path.stem + "_secrets" + triple_store_graph_path.suffix)
    return config_graph.parse(secret_path)
392
+
393
+
394
def get_triple_stores(triple_store_graph: Graph) -> list[dict]:
    """Turn the triple store configuration graph into a list of config dicts.

    The graph is first SHACL-validated against the bundled shapes; a
    non-conforming graph raises ValueError. Per-store problems (missing
    credentials or parameters) are recorded under the store's "error" key
    instead of raising, so other stores can still run.
    """
    triple_stores = []
    shacl_graph = Graph().parse(Path(os.path.join(get_mustrd_root(), "model/triplestoreshapes.ttl")))
    ont_graph = Graph().parse(Path(os.path.join(get_mustrd_root(), "model/triplestoreOntology.ttl")))
    conforms, results_graph, results_text = validate(
        data_graph= triple_store_graph,
        shacl_graph = shacl_graph,
        ont_graph = ont_graph,
        advanced= True,
        inference= 'none'
        )
    if not conforms:
        raise ValueError(f"Triple store configuration not conform to the shapes. SHACL report: {results_text}", results_graph)
    for triple_store_config, rdf_type, triple_store_type in triple_store_graph.triples((None, RDF.type, None)):
        triple_store = {}
        triple_store["type"] = triple_store_type
        triple_store["uri"] = triple_store_config
        # Anzo graph via anzo
        if triple_store_type == TRIPLESTORE.Anzo:
            triple_store["url"] = triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.url)
            triple_store["port"] = triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.port)
            try:
                triple_store["username"] = str(triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.username))
                triple_store["password"] = str(triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.password))
            except (FileNotFoundError, ValueError) as e:
                triple_store["error"] = e
            triple_store["gqe_uri"] = triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.gqeURI)
            triple_store["input_graph"] = triple_store_graph.value(subject=triple_store_config,
                                                                   predicate=TRIPLESTORE.inputGraph)
            triple_store["output_graph"] = triple_store_graph.value(subject=triple_store_config,
                                                                    predicate=TRIPLESTORE.outputGraph)
            try:
                check_triple_store_params(triple_store, ["url", "port", "username", "password", "input_graph"])
            except ValueError as e:
                triple_store["error"] = e
        # GraphDB
        elif triple_store_type == TRIPLESTORE.GraphDb:
            triple_store["url"] = triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.url)
            triple_store["port"] = triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.port)
            try:
                triple_store["username"] = str(triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.username))
                triple_store["password"] = str(triple_store_graph.value(subject=triple_store_config, predicate=TRIPLESTORE.password))
            except (FileNotFoundError, ValueError) as e:
                log.error(f"Credential retrieval failed {e}")
                triple_store["error"] = e
            triple_store["repository"] = triple_store_graph.value(subject=triple_store_config,
                                                                  predicate=TRIPLESTORE.repository)
            triple_store["input_graph"] = triple_store_graph.value(subject=triple_store_config,
                                                                   predicate=TRIPLESTORE.inputGraph)

            try:
                check_triple_store_params(triple_store, ["url", "port", "repository"])
            except ValueError as e:
                triple_store["error"] = e
        elif triple_store_type != TRIPLESTORE.RdfLib:
            # Unknown store type: recorded as an error, not raised.
            triple_store["error"] = f"Triple store not implemented: {triple_store_type}"

        triple_stores.append(triple_store)
    return triple_stores
453
+
454
+
455
def check_triple_store_params(triple_store: dict, required_params: List[str]):
    """Raise ValueError naming every required connection parameter that is unset."""
    missing = [name for name in required_params if triple_store.get(name) is None]
    if not missing:
        return
    raise ValueError(f"Cannot establish connection to {triple_store['type']}. "
                     f"Missing required parameter(s): {', '.join(missing)}.")
460
+
461
+
462
def get_credential_from_file(triple_store_name: URIRef, credential: str, config_path: Literal) -> str:
    """Fetch one credential (e.g. username or password) for a triple store from a TOML file.

    Raises ValueError when the path is missing or the file is unreadable TOML,
    FileNotFoundError when the file does not exist, and KeyError when the store
    or credential key is absent from the file.
    """
    log.info(f"get_credential_from_file {triple_store_name}, {credential}, {config_path}")
    if config_path is None:
        raise ValueError(f"Cannot establish connection defined in {triple_store_name}. "
                         f"Missing required parameter: {credential}.")
    # NOTE(review): config_path is annotated as rdflib Literal but treated as a
    # filesystem path string here — confirm the intended type.
    path = Path(config_path)
    log.info(f"get_credential_from_file {path}")

    if not os.path.isfile(path):
        log.error(f"couldn't find {path}")
        raise FileNotFoundError(f"Credentials config file not found: {path}")
    try:
        with open(path, "rb") as f:
            config = tomli.load(f)
    except tomli.TOMLDecodeError as e:
        log.error(f"config error {path} {e}")
        raise ValueError(f"Error reading credentials config file: {e}")
    return config[str(triple_store_name)][credential]
482
+
483
def json_results_to_panda_dataframe(result: str) -> pandas.DataFrame:
    """Convert SPARQL JSON results (https://www.w3.org/TR/rdf-sparql-json-res/) to a DataFrame.

    Each bound variable becomes two columns: "<var>" holding the lexical value
    and "<var>_datatype" holding the datatype IRI (xsd:string for plain
    literals, xsd:anyURI for URIs/bnodes). Cells for variables unbound in a
    given solution are blanked to ''. Returns an empty DataFrame when there are
    no bindings.
    """
    json_result = json.loads(result)
    rows = []
    for binding in json_result["results"]["bindings"]:
        row = {}
        for key, value_object in binding.items():
            row[key] = str(value_object["value"])
            if "type" in value_object and value_object["type"] == "literal":
                # Untyped literals default to xsd:string.
                row[key + "_datatype"] = value_object.get("datatype", str(XSD.string))
            else:
                row[key + "_datatype"] = str(XSD.anyURI)
        rows.append(row)

    if not rows:
        return pandas.DataFrame()
    # Build the frame in one shot: the previous implementation ran
    # pandas.concat and fillna inside the loop, which is O(n^2) in the number
    # of result rows. Column order (first-seen) and '' for missing cells are
    # preserved.
    return pandas.DataFrame(rows).fillna('')
509
+
510
+
511
+ # https://github.com/Semantic-partners/mustrd/issues/110
512
+ # https://github.com/Semantic-partners/mustrd/issues/52
513
def table_comparison(result: str, spec: Specification) -> SpecResult:
    """Compare a SELECT query's JSON results against the spec's expected table.

    Four scenarios: (1) expected empty but got rows, (2) both non-empty,
    (3) both empty, (4) expected rows but got none. Returns SpecPassed or
    SpecPassedWithWarning on a match, SelectSpecFailure with a diff frame
    otherwise, SparqlParseFailure when the result is not valid JSON.
    """
    # https://github.com/Semantic-partners/mustrd/issues/110
    # https://github.com/Semantic-partners/mustrd/issues/52
    warning = None
    order_list = ["order by ?", "order by desc", "order by asc"]
    # Did the query itself order its results? Then row order is significant.
    ordered_result = any(pattern in spec.when[0].value.lower() for pattern in order_list)
    then = spec.then.value
    try:
        if is_json(result):
            df = json_results_to_panda_dataframe(result)
            columns = list(df.columns)
        else:
            raise ParseException
        sorted_columns = sorted(columns)
        sorted_then_cols = sorted(list(then))
        if not df.empty:

            if not ordered_result:
                # Normalise row order using the value columns (every other
                # column; the interleaved *_datatype columns are skipped).
                df.sort_values(by=columns[::2], inplace=True)
                df.reset_index(inplace=True, drop=True)
                if spec.then.ordered:
                    warning = f"sh:order in {spec.spec_uri} is ignored, no ORDER BY in query"
                    log.warning(warning)

            # Scenario 1: expected no result but got a result
            if then.empty:
                message = f"Expected 0 row(s) and 0 column(s), got {df.shape[0]} row(s) and {round(df.shape[1] / 2)} column(s)"
                empty_then = create_empty_dataframe_with_columns(df)
                df_diff = empty_then.compare(df, result_names=("expected", "actual"))

            else:
                # Scenario 2: expected a result and got a result
                message = f"Expected {then.shape[0]} row(s) and {round(then.shape[1] / 2)} column(s), " \
                          f"got {df.shape[0]} row(s) and {round(df.shape[1] / 2)} column(s)"
                if ordered_result is True and not spec.then.ordered:
                    # Ordered query but unordered expectation cannot be compared.
                    message += ". Actual result is ordered, must:then must contain sh:order on every row."
                    return SelectSpecFailure(spec.spec_uri, spec.triple_store["type"], None, message)
                else:
                    if len(columns) == len(then.columns):
                        if sorted_columns == sorted_then_cols:
                            # Same columns: align order, then compare cell by cell.
                            then = then[columns]
                            if not ordered_result:
                                then.sort_values(by=columns[::2], inplace=True)
                                then.reset_index(drop=True, inplace=True)
                            if df.shape == then.shape and (df.columns == then.columns).all():
                                df_diff = then.compare(df, result_names=("expected", "actual"))
                            else:
                                df_diff = construct_df_diff(df, then)

                        else:
                            # Same column count, different names: diff on sorted columns.
                            then = then[sorted_then_cols]
                            df = df[sorted_columns]
                            df_diff = construct_df_diff(df, then)
                    else:
                        # Different column counts: diff on sorted columns.
                        then = then[sorted_then_cols]
                        df = df[sorted_columns]
                        df_diff = construct_df_diff(df, then)
        else:

            if then.empty:
                # Scenario 3: expected no result, got no result
                message = f"Expected 0 row(s) and 0 column(s), got 0 row(s) and 0 column(s)"
                df = pandas.DataFrame()
            else:
                # Scenario 4: expected a result, but got an empty result
                message = f"Expected {then.shape[0]} row(s) and {round(then.shape[1] / 2)} column(s), got 0 row(s) and 0 column(s)"
                then = then[sorted_then_cols]
                df = create_empty_dataframe_with_columns(then)
            df_diff = then.compare(df, result_names=("expected", "actual"))
            print(df_diff.to_markdown())

        if df_diff.empty:
            if warning:
                return SpecPassedWithWarning(spec.spec_uri, spec.triple_store["type"], warning)
            else:
                return SpecPassed(spec.spec_uri, spec.triple_store["type"])
        else:
            log.error(message)
            return SelectSpecFailure(spec.spec_uri, spec.triple_store["type"], df_diff, message)

    except ParseException as e:
        return SparqlParseFailure(spec.spec_uri, spec.triple_store["type"], e)
    except NotImplementedError as ex:
        return SpecSkipped(spec.spec_uri, spec.triple_store["type"], ex)
611
+
612
+
613
def graph_comparison(expected_graph: Graph, actual_graph: Graph) -> GraphComparison:
    """Compute the three-way difference between an expected and an actual graph."""
    in_both, in_expected, in_actual = graph_diff(expected_graph, actual_graph)
    return GraphComparison(in_expected - in_actual, in_actual - in_expected, in_both)
621
+
622
+
623
def get_then_update(spec_uri: URIRef, spec_graph: Graph) -> Graph:
    """Extract the expected (must:then) statements of an update spec as a graph.

    Runs a CONSTRUCT over the reified rdf:Statement triples attached to the
    spec's must:then dataset and returns the resulting graph.
    """
    # NOTE(review): the brackets in this query ("] ; ]" and the bare "a"
    # directly after must:then) look unbalanced — confirm it parses as intended.
    then_query = f"""
    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

    CONSTRUCT {{ ?s ?p ?o }}
    {{
        <{spec_uri}> <{MUST.then}>
            a <{MUST.StatementsDataset}> ;
            <{MUST.hasStatement}> [
                a rdf:Statement ;
                rdf:subject ?s ;
                rdf:predicate ?p ;
                rdf:object ?o ;
            ] ; ]
    }}
    """
    expected_results = spec_graph.query(then_query).graph

    return expected_results
642
+
643
+
644
def calculate_row_difference(df1: pandas.DataFrame,
                             df2: pandas.DataFrame) -> pandas.DataFrame:
    """Return the rows of df1 that do not appear anywhere in df2."""
    # A left merge with indicator marks rows found only in df1 as 'left_only'.
    merged = df1.merge(df2.drop_duplicates(), how='left', indicator=True)
    only_in_left = merged[merged['_merge'] == 'left_only']
    return only_in_left.drop('_merge', axis=1)
650
+
651
+
652
def construct_df_diff(df: pandas.DataFrame,
                      then: pandas.DataFrame) -> pandas.DataFrame:
    """Build a readable diff between the actual (df) and expected (then) tables.

    Handles mismatches in rows, in columns, or both: each frame is padded with
    the columns only the other side has (filled with '') before comparing.
    """
    actual_rows = calculate_row_difference(df, then)
    expected_rows = calculate_row_difference(then, df)
    actual_columns = df.columns.difference(then.columns)
    expected_columns = then.columns.difference(df.columns)

    df_diff = pandas.DataFrame()
    modified_df = df
    modified_then = then

    if actual_columns.size > 0:
        # Pad the expectation with the columns only the actual result has.
        modified_then = modified_then.reindex(modified_then.columns.to_list() + actual_columns.to_list(), axis=1)
        modified_then[actual_columns.to_list()] = modified_then[actual_columns.to_list()].fillna('')

    if expected_columns.size > 0:
        # Pad the actual result with the columns only the expectation has.
        modified_df = modified_df.reindex(modified_df.columns.to_list() + expected_columns.to_list(), axis=1)
        modified_df[expected_columns.to_list()] = modified_df[expected_columns.to_list()].fillna('')

    # Align column order so .compare sees identical labels.
    modified_df = modified_df.reindex(modified_then.columns, axis=1)

    if df.shape[0] != then.shape[0] and df.shape[1] != then.shape[1]:
        # take modified columns and add rows
        actual_rows = calculate_row_difference(modified_df, modified_then)
        expected_rows = calculate_row_difference(modified_then, modified_df)
        df_diff = generate_row_diff(actual_rows, expected_rows)
    elif actual_rows.shape[0] > 0 or expected_rows.shape[0] > 0:
        df_diff = generate_row_diff(actual_rows, expected_rows)
    elif actual_columns.size > 0 or expected_columns.size > 0:
        df_diff = modified_then.compare(modified_df, result_names=("expected", "actual"), keep_shape=True,
                                        keep_equal=True)
    df_diff.fillna("", inplace=True)
    return df_diff
685
+
686
+
687
def generate_row_diff(actual_rows: pandas.DataFrame, expected_rows: pandas.DataFrame) -> pandas.DataFrame:
    """Render surplus and missing rows as an expected-vs-actual comparison frame."""
    diff_of_actual = pandas.DataFrame()
    diff_of_expected = pandas.DataFrame()

    if actual_rows.shape[0] > 0:
        # Rows present only in the actual result: the expected side is blank.
        blanks = create_empty_dataframe_with_columns(actual_rows)
        diff_of_actual = blanks.compare(actual_rows, result_names=("expected", "actual"))

    if expected_rows.shape[0] > 0:
        # Rows present only in the expectation: the actual side is blank.
        blanks = create_empty_dataframe_with_columns(expected_rows)
        diff_of_expected = expected_rows.compare(blanks, result_names=("expected", "actual"))

    return pandas.concat([diff_of_actual, diff_of_expected], ignore_index=True)
701
+
702
+
703
def create_empty_dataframe_with_columns(df: pandas.DataFrame) -> pandas.DataFrame:
    """Return a frame shaped like *df* with every cell set to the empty string."""
    blank = pandas.DataFrame().reindex_like(df)
    return blank.fillna("")
707
+
708
+
709
def review_results(results: List[SpecResult], verbose: bool) -> None:
    """Print an overview table of spec results per triple store, plus a summary
    line; with verbose=True also print details for every non-pass result."""
    print("===== Result Overview =====")
    # Init dictionaries
    status_dict = defaultdict(lambda: defaultdict(int))
    status_counts = defaultdict(lambda: defaultdict(int))
    colours = {SpecPassed: Fore.GREEN, SpecPassedWithWarning: Fore.YELLOW, SpecSkipped: Fore.YELLOW}
    # Populate dictionaries from results
    for result in results:
        status_counts[result.triple_store][type(result)] += 1
        status_dict[result.spec_uri][result.triple_store] = type(result)

    # Get the list of statuses and list of unique triple stores
    statuses = list(status for inner_dict in status_dict.values() for status in inner_dict.values())
    triple_stores = list(set(status for inner_dict in status_dict.values() for status in inner_dict.keys()))

    # Convert dictionaries to list for tabulate
    table_rows = [[spec_uri] + [
        f"{colours.get(status_dict[spec_uri][triple_store], Fore.RED)}{status_dict[spec_uri][triple_store].__name__}{Style.RESET_ALL}"
        for triple_store in triple_stores] for spec_uri in set(status_dict.keys())]

    status_rows = [[f"{colours.get(status, Fore.RED)}{status.__name__}{Style.RESET_ALL}"] +
                   [f"{colours.get(status, Fore.RED)}{status_counts[triple_store][status]}{Style.RESET_ALL}"
                    for triple_store in triple_stores] for status in set(statuses)]

    # Display tables with tabulate
    print(tabulate(table_rows, headers=['Spec Uris / triple stores'] + triple_stores, tablefmt="pretty"))
    print(tabulate(status_rows, headers=['Status / triple stores'] + triple_stores, tablefmt="pretty"))

    pass_count = statuses.count(SpecPassed)
    warning_count = statuses.count(SpecPassedWithWarning)
    skipped_count = statuses.count(SpecSkipped)
    # Anything that is not pass/warning/skip counts as a failure.
    fail_count = len(
        list(filter(lambda status: status not in [SpecPassed, SpecPassedWithWarning, SpecSkipped], statuses)))

    if fail_count:
        overview_colour = Fore.RED
    elif warning_count or skipped_count:
        overview_colour = Fore.YELLOW
    else:
        overview_colour = Fore.GREEN

    logger_setup.flush()
    print(f"{overview_colour}===== {fail_count} failures, {skipped_count} skipped, {Fore.GREEN}{pass_count} passed, "
          f"{overview_colour}{warning_count} passed with warnings =====")

    # Detailed per-result output for anything that did not plainly pass.
    if verbose and (fail_count or warning_count or skipped_count):
        for res in results:
            if type(res) == UpdateSpecFailure:
                print(f"{Fore.RED}Failed {res.spec_uri} {res.triple_store}")
                print(f"{Fore.BLUE} In Expected Not In Actual:")
                print(res.graph_comparison.in_expected_not_in_actual.serialize(format="ttl"))
                print()
                print(f"{Fore.RED} in_actual_not_in_expected")
                print(res.graph_comparison.in_actual_not_in_expected.serialize(format="ttl"))
                print(f"{Fore.GREEN} in_both")
                print(res.graph_comparison.in_both.serialize(format="ttl"))

            if type(res) == SelectSpecFailure:
                print(f"{Fore.RED}Failed {res.spec_uri} {res.triple_store}")
                print(res.message)
                print(res.table_comparison.to_markdown())
            if type(res) == ConstructSpecFailure or type(res) == UpdateSpecFailure:
                print(f"{Fore.RED}Failed {res.spec_uri} {res.triple_store}")
            if type(res) == SpecPassedWithWarning:
                print(f"{Fore.YELLOW}Passed with warning {res.spec_uri} {res.triple_store}")
                print(res.warning)
            if type(res) == TripleStoreConnectionError or type(res) == SparqlExecutionError or \
                    type(res) == SparqlParseFailure:
                print(f"{Fore.RED}Failed {res.spec_uri} {res.triple_store}")
                print(res.exception)
            if type(res) == SpecSkipped:
                print(f"{Fore.YELLOW}Skipped {res.spec_uri} {res.triple_store}")
                print(res.message)
782
+
783
+
784
+
785
+
786
+
787
+