mustrd 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
src/mustrd.py ADDED
@@ -0,0 +1,776 @@
1
+ """
2
+ MIT License
3
+
4
+ Copyright (c) 2023 Semantic Partners Ltd
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+ """
24
+
25
+ import os
26
+ from typing import Tuple, List
27
+
28
+ import tomli
29
+ from rdflib.plugins.parsers.notation3 import BadSyntax
30
+
31
+ import logger_setup
32
+ from dataclasses import dataclass
33
+
34
+ from pyparsing import ParseException
35
+ from pathlib import Path
36
+ from requests import ConnectionError, ConnectTimeout, HTTPError, RequestException
37
+
38
+ from rdflib import Graph, URIRef, RDF, XSD, SH, Literal
39
+
40
+ from rdflib.compare import isomorphic, graph_diff
41
+ import pandas
42
+
43
+ from namespace import MUST
44
+ import requests
45
+ import json
46
+ from pandas import DataFrame
47
+
48
+ from spec_component import TableThenSpec, parse_spec_component, WhenSpec, ThenSpec
49
+ from utils import is_json
50
+ from colorama import Fore, Style
51
+ from tabulate import tabulate
52
+ from collections import defaultdict
53
+ from pyshacl import validate
54
+ import logging
55
+ from http.client import HTTPConnection
56
+ from steprunner import upload_given, run_when
57
+
58
log = logger_setup.setup_logger(__name__)

# Silence urllib3's TLS warnings and broaden the accepted cipher list so that
# endpoints with older or self-signed TLS setups remain reachable.
# NOTE(review): both tweaks are process-wide side effects — confirm intended.
requests.packages.urllib3.disable_warnings()
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ':HIGH:!DH:!aNULL'

logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
64
+
65
+
66
def debug_requests_on():
    """Enable verbose logging of HTTP traffic issued via the requests module."""
    # Make http.client echo request/response lines to stdout.
    HTTPConnection.debuglevel = 1

    logging.basicConfig()
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    urllib3_logger = logging.getLogger("requests.packages.urllib3")
    urllib3_logger.setLevel(logging.DEBUG)
    urllib3_logger.propagate = True
75
+
76
def debug_requests_off():
    """Disable requests/urllib3 debug logging enabled by debug_requests_on().

    Note: clears the root logger's handlers and resets its level, so other
    logging configuration may be affected as a side effect.
    """
    HTTPConnection.debuglevel = 0

    root = logging.getLogger()
    root.setLevel(logging.WARNING)
    root.handlers = []
    urllib3_logger = logging.getLogger("requests.packages.urllib3")
    urllib3_logger.setLevel(logging.WARNING)
    urllib3_logger.propagate = False
86
+
87
+ debug_requests_off()
88
+
89
@dataclass
class Specification:
    """A fully parsed test spec: given/when/then plus the target triple store."""
    spec_uri: URIRef      # identifier of the spec in the source ttl file
    triple_store: dict    # connection config; always carries a "type" key
    given: Graph          # initial state to upload (falsy for inherited-state specs)
    when: WhenSpec        # action(s) to run
    then: ThenSpec        # expected outcome
96
+
97
+
98
@dataclass
class GraphComparison:
    """Three-way diff between an expected and an actual RDF graph."""
    in_expected_not_in_actual: Graph
    in_actual_not_in_expected: Graph
    in_both: Graph
103
+
104
+
105
@dataclass
class SpecResult:
    """Base class for all spec outcomes; identifies the spec and triple store."""
    spec_uri: URIRef
    triple_store: URIRef
109
+
110
+
111
@dataclass
class SpecPassed(SpecResult):
    """The spec ran and its result matched the expectation."""
    pass
114
+
115
+
116
@dataclass
class SpecPassedWithWarning(SpecResult):
    """The spec passed but produced a warning (e.g. an ignored sh:order)."""
    # Bare @dataclass decorator for consistency with the sibling result classes
    # (was @dataclass() — behaviourally identical).
    warning: str
119
+
120
+
121
@dataclass
class SelectSpecFailure(SpecResult):
    """A SELECT spec whose tabular result did not match the expectation."""
    table_comparison: pandas.DataFrame  # expected/actual diff table
    message: str                        # human-readable shape summary
125
+
126
+
127
@dataclass
class ConstructSpecFailure(SpecResult):
    """A CONSTRUCT spec whose result graph did not match the expectation."""
    graph_comparison: GraphComparison
130
+
131
+
132
@dataclass
class UpdateSpecFailure(SpecResult):
    """An update spec whose resulting graph did not match the expectation."""
    graph_comparison: GraphComparison
135
+
136
+
137
@dataclass
class SparqlParseFailure(SpecResult):
    """The spec's SPARQL could not be parsed."""
    exception: ParseException
140
+
141
+
142
@dataclass
class SparqlExecutionError(SpecResult):
    """The SPARQL ran but the triple store raised an error during execution."""
    exception: Exception
145
+
146
+
147
@dataclass
class TripleStoreConnectionError(SpecResult):
    """The triple store could not be reached while running the spec."""
    exception: ConnectionError
150
+
151
+
152
@dataclass
class SpecSkipped(SpecResult):
    """The spec was not run; `message` explains why."""
    message: str
155
+
156
+
157
@dataclass
class SparqlAction:
    """Base class for a SPARQL when-action, wrapping the query text."""
    query: str
160
+
161
+
162
@dataclass
class SelectSparqlQuery(SparqlAction):
    """A SPARQL SELECT action."""
    pass
165
+
166
+
167
@dataclass
class ConstructSparqlQuery(SparqlAction):
    """A SPARQL CONSTRUCT action."""
    pass
170
+
171
+
172
@dataclass
class UpdateSparqlQuery(SparqlAction):
    """A SPARQL UPDATE action."""
    pass
175
+
176
+
177
+ # https://github.com/Semantic-partners/mustrd/issues/19
178
+
179
def validate_specs(run_config: dict, triple_stores: List, shacl_graph: Graph, ont_graph: Graph)\
        -> Tuple[List, Graph, List]:
    """Find, parse and SHACL-validate every *.mustrd.ttl spec file.

    :param run_config: run options; 'spec_path' is the root folder searched recursively
    :param triple_stores: triple store configs (one SpecSkipped per store is
        emitted for each invalid spec)
    :param shacl_graph: SHACL shapes used to validate each spec file
    :param ont_graph: ontology graph passed to pyshacl
    :return: (spec URIs to run, combined spec graph, skipped/invalid results).
        When any spec is marked must:focus "true", only focused specs are returned.
    """
    spec_graph = Graph()
    subject_uris = set()
    focus_uris = set()
    invalid_specs = []
    ttl_files = list(run_config['spec_path'].glob('**/*.mustrd.ttl'))
    ttl_files.sort()
    log.info(f"Found {len(ttl_files)} mustrd.ttl files in {run_config['spec_path']}")

    for file in ttl_files:
        error_messages = []

        log.info(f"Parse: {file}")
        try:
            file_graph = Graph().parse(file)
        except BadSyntax as e:
            template = "An exception of type {0} occurred when trying to parse a spec file. Arguments:\n{1!r}"
            message = template.format(type(e).__name__, e.args)
            log.error(message)
            error_messages += [f"Could not extract spec from {file} due to exception of type "
                               f"{type(e).__name__} when parsing file"]
            # NOTE(review): these error messages are discarded by the continue —
            # an unparseable file yields no SpecSkipped result; confirm intended.
            continue
        # run shacl validation
        conforms, results_graph, results_text = validate(file_graph,
                                                         shacl_graph=shacl_graph,
                                                         ont_graph=ont_graph,
                                                         inference='none',
                                                         abort_on_first=False,
                                                         allow_infos=False,
                                                         allow_warnings=False,
                                                         meta_shacl=False,
                                                         advanced=True,
                                                         js=False,
                                                         debug=False)
        if not conforms:
            for msg in results_graph.objects(predicate=SH.resultMessage):
                log.warning(f"{file_graph}")
                log.warning(f"{msg} File: {file.name}")
                error_messages += [f"{msg} File: {file.name}"]

        # collect a list of uris of the tests in focus
        for focus_uri in file_graph.subjects(predicate=MUST.focus, object=Literal("true", datatype=XSD.boolean)):
            if focus_uri in focus_uris:
                # Disambiguate duplicate focus URIs so both are still collected.
                focus_uri = URIRef(str(focus_uri) + "_DUPLICATE")
            focus_uris.add(focus_uri)

        # make sure there are no duplicate test IRIs in the files
        for subject_uri in file_graph.subjects(RDF.type, MUST.TestSpec):
            if subject_uri in subject_uris:
                log.warning(f"Duplicate subject URI found: {file.name} {subject_uri}. File will not be parsed.")
                error_messages += [f"Duplicate subject URI found in {file.name}."]
                subject_uri = URIRef(str(subject_uri) + "_DUPLICATE")
        # NOTE(review): subject_uri below is the leaked loop variable from the
        # loop above — NameError if a file contains no must:TestSpec; confirm.
        if len(error_messages) > 0:
            error_messages.sort()
            error_message = "\n".join(msg for msg in error_messages)
            # One SpecSkipped per configured triple store for this spec.
            invalid_specs += [SpecSkipped(subject_uri, triple_store["type"], error_message) for triple_store in
                              triple_stores]
        else:
            subject_uris.add(subject_uri)
            this_spec_graph = Graph()
            this_spec_graph.parse(file)
            spec_uris_in_this_file = list(this_spec_graph.subjects(RDF.type, MUST.TestSpec))
            for spec in spec_uris_in_this_file:
                # Record the source file on each spec so it can be reported later.
                tripleToAdd = [spec, MUST.specSourceFile, Literal(file)]
                this_spec_graph.add(tripleToAdd)
            spec_graph += this_spec_graph

    sourceFiles = list(spec_graph.subject_objects(MUST.specSourceFile))

    valid_spec_uris = list(spec_graph.subjects(RDF.type, MUST.TestSpec))

    if focus_uris:
        # Focused run: only report invalid specs that are themselves in focus.
        invalid_focus_specs = []
        for spec in invalid_specs:
            if spec.spec_uri in focus_uris:
                invalid_focus_specs += [spec]
                focus_uris.remove(spec.spec_uri)
        log.info(f"Collected {len(focus_uris)} focus test spec(s)")
        return focus_uris, spec_graph, invalid_focus_specs
    else:
        log.info(f"Collected {len(valid_spec_uris)} valid test spec(s)")
        return valid_spec_uris, spec_graph, invalid_specs
269
+
270
+
271
def get_specs(spec_uris: List[URIRef], spec_graph: Graph, triple_stores: List[dict],
              run_config: dict):
    """Build Specification objects for every (triple store, spec URI) pair.

    Stores flagged with an "error" key are skipped wholesale; individual specs
    that fail to parse become SpecSkipped results instead of aborting the run.

    :return: (list of Specification to run, list of SpecSkipped)
    """
    specs = []
    skipped_results = []
    try:
        for triple_store in triple_stores:
            if "error" in triple_store:
                log.error(f"{triple_store['error']}. No specs run for this triple store.")
                skipped_results += [SpecSkipped(spec_uri, triple_store['type'], triple_store['error']) for spec_uri in
                                    spec_uris]
            else:
                for spec_uri in spec_uris:
                    try:
                        specs += [get_spec(spec_uri, spec_graph, run_config, triple_store)]
                    except (ValueError, FileNotFoundError, ConnectionError) as e:
                        # Record the failure but keep processing the other specs.
                        skipped_results += [SpecSkipped(spec_uri, triple_store['type'], e)]

    except (BadSyntax, FileNotFoundError) as e:
        template = "An exception of type {0} occurred when trying to parse the triple store configuration file. " \
                   "Arguments:\n{1!r}"
        message = template.format(type(e).__name__, e.args)
        log.error(message)
        log.error("No specifications will be run.")

    log.info(f"Extracted {len(specs)} specifications that will be run")
    return specs, skipped_results
297
+
298
+
299
def run_specs(specs) -> List[SpecResult]:
    """Run every specification in order and collect the per-spec results."""
    # https://github.com/Semantic-partners/mustrd/issues/115
    return [run_spec(specification) for specification in specs]
305
+
306
+
307
def get_spec(spec_uri: URIRef, spec_graph: Graph, run_config: dict, mustrd_triple_store: dict = None) -> Specification:
    """Assemble one Specification from the combined spec graph.

    :param spec_uri: URI of the spec to extract
    :param spec_graph: graph holding all parsed specs
    :param run_config: run options forwarded to the component parser
    :param mustrd_triple_store: target store config; defaults to in-memory rdflib
    :return: the parsed Specification
    :raises ValueError, FileNotFoundError, ConnectionError: re-raised after logging
    """
    try:
        if mustrd_triple_store is None:
            mustrd_triple_store = {"type": MUST.RdfLib}
        components = []
        # Parse the three spec components in given/when/then order.
        for predicate in MUST.given, MUST.when, MUST.then:
            components.append(parse_spec_component(subject=spec_uri,
                                                   predicate=predicate,
                                                   spec_graph=spec_graph,
                                                   run_config=run_config,
                                                   mustrd_triple_store=mustrd_triple_store))

        # https://github.com/Semantic-partners/mustrd/issues/92
        return Specification(spec_uri, mustrd_triple_store, components[0].value, components[1], components[2])

    except (ValueError, FileNotFoundError) as e:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(e).__name__, e.args)
        log.exception(message)
        raise
    except ConnectionError as e:
        log.error(e)
        raise
331
+
332
+
333
def check_result(spec, result):
    """Compare a when-step result against the spec's `then` expectation.

    Tabular (SELECT) expectations are compared as dataframes; otherwise the
    result graph is checked for isomorphism with the expected graph.

    :param spec: the Specification under test
    :param result: outcome of the when step (JSON string for SELECT results,
        an rdflib Graph otherwise)
    :return: a SpecResult subclass describing pass or failure
    """
    # isinstance instead of type(...) == ... (idiomatic, subclass-friendly).
    if isinstance(spec.then, TableThenSpec):
        return table_comparison(result, spec)

    graph_compare = graph_comparison(spec.then.value, result)
    if isomorphic(result, spec.then.value):
        return SpecPassed(spec.spec_uri, spec.triple_store["type"])
    if spec.when[0].queryType == MUST.ConstructSparql:
        return ConstructSpecFailure(spec.spec_uri, spec.triple_store["type"], graph_compare)
    return UpdateSpecFailure(spec.spec_uri, spec.triple_store["type"], graph_compare)
345
+
346
+
347
def run_spec(spec: Specification) -> SpecResult:
    """Execute one specification end to end.

    Uploads the given graph when present, runs each when action, and compares
    the result against the then expectation. Connection and execution errors
    are converted into result objects rather than raised.

    :param spec: the Specification to run
    :return: a SpecResult subclass describing the outcome
    """
    spec_uri = spec.spec_uri
    triple_store = spec.triple_store
    # close_connection = True
    log.debug(f"run_when {spec_uri=}, {triple_store=}, {spec.given=}, {spec.when=}, {spec.then=}")
    if spec.given:
        given_as_turtle = spec.given.serialize(format="turtle")
        log.debug(f"{given_as_turtle}")
        upload_given(triple_store, spec.given)
    else:
        # No given: the spec relies on pre-existing (inherited) state, which an
        # in-memory rdflib store cannot provide.
        if triple_store['type'] == MUST.RdfLib:
            return SpecSkipped(spec_uri, triple_store['type'], "Unable to run Inherited State tests on Rdflib")
    try:
        for when in spec.when:
            log.info(f"Running {when.queryType} spec {spec_uri} on {triple_store['type']}")
            try:
                result = run_when(spec_uri, triple_store, when)
            except ParseException as e:
                return SparqlParseFailure(spec_uri, triple_store["type"], e)
            except NotImplementedError as ex:
                return SpecSkipped(spec_uri, triple_store["type"], ex.args[0])
        # NOTE(review): only the result of the final when step is compared;
        # `result` is unbound if spec.when is empty — confirm intended.
        return check_result(spec, result)
    except (ConnectionError, TimeoutError, HTTPError, ConnectTimeout, OSError) as e:
        # close_connection = False
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(e).__name__, e.args)
        log.error(message)
        return TripleStoreConnectionError(spec_uri, triple_store["type"], message)
    except (TypeError, RequestException) as e:
        log.error(f"{type(e)} {e}")
        return SparqlExecutionError(spec_uri, triple_store["type"], e)

    # https://github.com/Semantic-partners/mustrd/issues/78
    # finally:
    #     if type(mustrd_triple_store) == MustrdAnzo and close_connection:
    #         mustrd_triple_store.clear_graph()
383
+
384
+
385
+
386
def get_triple_store_graph(triple_store_graph_path: Path):
    """Parse a triple store config file together with its *_secrets companion.

    The secrets file is expected alongside the config file, named
    "<stem>_secrets<suffix>" (e.g. triplestores_secrets.ttl).
    """
    secrets_name = f"{triple_store_graph_path.stem}_secrets{triple_store_graph_path.suffix}"
    secret_path = triple_store_graph_path.parent / secrets_name
    config_graph = Graph().parse(triple_store_graph_path)
    return config_graph.parse(secret_path)
389
+
390
def get_triple_stores(triple_store_graph: Graph) -> list[dict]:
    """Translate triple store configuration RDF into connection dicts.

    Each returned dict always has a "type" key. Configuration problems are
    recorded under an "error" key instead of raised, so callers can skip the
    affected store while running the rest.

    :param triple_store_graph: graph of triple store config resources
    :return: one dict per rdf:type-d config resource
    """
    triple_stores = []
    # NOTE(review): this iterates every (s, rdf:type, o) triple, so any typed
    # resource that is not a known config lands in the "not implemented" branch.
    for triple_store_config, rdf_type, triple_store_type in triple_store_graph.triples((None, RDF.type, None)):
        triple_store = {}
        # Local rdf lib triple store
        if triple_store_type == MUST.RdfLibConfig:
            triple_store["type"] = MUST.RdfLib
        # Anzo graph via anzo
        elif triple_store_type == MUST.AnzoConfig:
            triple_store["type"] = MUST.Anzo
            triple_store["url"] = triple_store_graph.value(subject=triple_store_config, predicate=MUST.url)
            triple_store["port"] = triple_store_graph.value(subject=triple_store_config, predicate=MUST.port)
            try:
                triple_store["username"] = str(triple_store_graph.value(subject=triple_store_config, predicate=MUST.username))
                triple_store["password"] = str(triple_store_graph.value(subject=triple_store_config, predicate=MUST.password))
            except (FileNotFoundError, ValueError) as e:
                triple_store["error"] = e
            triple_store["gqe_uri"] = triple_store_graph.value(subject=triple_store_config, predicate=MUST.gqeURI)
            triple_store["input_graph"] = triple_store_graph.value(subject=triple_store_config,
                                                                   predicate=MUST.inputGraph)
            triple_store["output_graph"] = triple_store_graph.value(subject=triple_store_config,
                                                                    predicate=MUST.outputGraph)
            try:
                check_triple_store_params(triple_store, ["url", "port", "username", "password", "input_graph"])
            except ValueError as e:
                triple_store["error"] = e
        # GraphDB
        elif triple_store_type == MUST.GraphDbConfig:
            triple_store["type"] = MUST.GraphDb
            triple_store["url"] = triple_store_graph.value(subject=triple_store_config, predicate=MUST.url)
            triple_store["port"] = triple_store_graph.value(subject=triple_store_config, predicate=MUST.port)
            try:
                triple_store["username"] = str(triple_store_graph.value(subject=triple_store_config, predicate=MUST.username))
                triple_store["password"] = str(triple_store_graph.value(subject=triple_store_config, predicate=MUST.password))
            except (FileNotFoundError, ValueError) as e:
                log.error(f"Credential retrieval failed {e}")
                triple_store["error"] = e
            triple_store["repository"] = triple_store_graph.value(subject=triple_store_config,
                                                                  predicate=MUST.repository)
            triple_store["input_graph"] = triple_store_graph.value(subject=triple_store_config,
                                                                   predicate=MUST.inputGraph)

            try:
                check_triple_store_params(triple_store, ["url", "port", "repository"])
            except ValueError as e:
                triple_store["error"] = e
        else:
            # Unknown config type: keep it in the list so it is reported, not lost.
            triple_store["type"] = triple_store_type
            triple_store["error"] = f"Triple store not implemented: {triple_store_type}"

        triple_stores.append(triple_store)
    return triple_stores
442
+
443
+
444
def check_triple_store_params(triple_store: dict, required_params: List[str]):
    """Raise ValueError when any required connection parameter is missing or None."""
    missing = [name for name in required_params if triple_store.get(name) is None]
    if not missing:
        return
    raise ValueError(f"Cannot establish connection to {triple_store['type']}. "
                     f"Missing required parameter(s): {', '.join(missing)}.")
449
+
450
+
451
def get_credential_from_file(triple_store_name: URIRef, credential: str, config_path: Literal) -> str:
    """Look up a credential (e.g. username/password) for a triple store in a TOML file.

    :param triple_store_name: config subject URI; used as the TOML table name
    :param credential: key inside that table, e.g. "username"
    :param config_path: path to the TOML credentials file (rdflib Literal or str)
    :return: the credential value
    :raises ValueError: when config_path is missing or the file is not valid TOML
    :raises FileNotFoundError: when the file does not exist
    """
    log.info(f"get_credential_from_file {triple_store_name}, {credential}, {config_path}")
    if config_path is None:
        raise ValueError(f"Cannot establish connection defined in {triple_store_name}. "
                         f"Missing required parameter: {credential}.")
    path = Path(config_path)
    log.info(f"get_credential_from_file {path}")

    # pathlib check instead of os.path for consistency with Path usage above.
    if not path.is_file():
        log.error(f"couldn't find {path}")
        raise FileNotFoundError(f"Credentials config file not found: {path}")
    try:
        with open(path, "rb") as f:
            config = tomli.load(f)
    except tomli.TOMLDecodeError as e:
        log.error(f"config error {path} {e}")
        # Chain the decode error so the original cause stays visible.
        raise ValueError(f"Error reading credentials config file: {e}") from e
    return config[str(triple_store_name)][credential]
471
+
472
+ # Convert sparql json query results as defined in https://www.w3.org/TR/rdf-sparql-json-res/
473
def json_results_to_panda_dataframe(result: str) -> pandas.DataFrame:
    """Convert a SPARQL JSON results string into a DataFrame.

    Each bound variable produces two columns: "<var>" holding the string value
    and "<var>_datatype" holding the datatype IRI (xsd:string for plain
    literals, xsd:anyURI for non-literal terms).

    :param result: SPARQL query results serialized as JSON
        (https://www.w3.org/TR/rdf-sparql-json-res/)
    :return: DataFrame with one row per binding; empty DataFrame when no rows
    """
    json_result = json.loads(result)
    row_frames = []
    for binding in json_result["results"]["bindings"]:
        columns = []
        values = []
        for key, value_object in binding.items():
            columns.append(key)
            values.append(str(value_object["value"]))
            columns.append(key + "_datatype")
            if value_object.get("type") == "literal":
                # Plain literals default to xsd:string unless a datatype is given.
                values.append(str(value_object.get("datatype", XSD.string)))
            else:
                values.append(str(XSD.anyURI))
        row_frames.append(pandas.DataFrame([values], columns=columns))

    if not row_frames:
        return pandas.DataFrame()
    # Concatenate once instead of per row to avoid quadratic rebuilding.
    frames = pandas.concat(objs=row_frames, ignore_index=True)
    frames.fillna('', inplace=True)
    if frames.size == 0:
        frames = pandas.DataFrame()
    return frames
498
+
499
+
500
+ # https://github.com/Semantic-partners/mustrd/issues/110
501
+ # https://github.com/Semantic-partners/mustrd/issues/52
502
def table_comparison(result: str, spec: Specification) -> SpecResult:
    """Compare a SELECT result (JSON string) against the spec's expected table.

    Handles four scenarios: unexpected rows, mismatching rows/columns, an
    expected-and-actual empty result, and missing rows. Row order is only
    enforced when the query contains an ORDER BY clause.

    :param result: SPARQL SELECT results as JSON
    :param spec: the Specification providing the expected table (spec.then.value)
    :return: SpecPassed / SpecPassedWithWarning / SelectSpecFailure /
        SparqlParseFailure / SpecSkipped
    """
    warning = None
    order_list = ["order by ?", "order by desc", "order by asc"]
    # Heuristic: the query text decides whether the result is order-sensitive.
    ordered_result = any(pattern in spec.when[0].value.lower() for pattern in order_list)
    then = spec.then.value
    try:
        if is_json(result):
            df = json_results_to_panda_dataframe(result)
            columns = list(df.columns)
        else:
            raise ParseException
        sorted_columns = sorted(columns)
        sorted_then_cols = sorted(list(then))
        if not df.empty:

            if not ordered_result:
                # Normalise row order on the value columns (every other column
                # is a *_datatype companion, hence the [::2] stride).
                df.sort_values(by=columns[::2], inplace=True)
                df.reset_index(inplace=True, drop=True)
                if spec.then.ordered:
                    warning = f"sh:order in {spec.spec_uri} is ignored, no ORDER BY in query"
                    log.warning(warning)

            # Scenario 1: expected no result but got a result
            if then.empty:
                message = f"Expected 0 row(s) and 0 column(s), got {df.shape[0]} row(s) and {round(df.shape[1] / 2)} column(s)"
                empty_then = create_empty_dataframe_with_columns(df)
                df_diff = empty_then.compare(df, result_names=("expected", "actual"))

            else:
                # Scenario 2: expected a result and got a result
                message = f"Expected {then.shape[0]} row(s) and {round(then.shape[1] / 2)} column(s), " \
                          f"got {df.shape[0]} row(s) and {round(df.shape[1] / 2)} column(s)"
                if ordered_result is True and not spec.then.ordered:
                    # Ordered query but unordered expectation: cannot compare.
                    message += ". Actual result is ordered, must:then must contain sh:order on every row."
                    return SelectSpecFailure(spec.spec_uri, spec.triple_store["type"], None, message)
                if len(columns) == len(then.columns):
                    if sorted_columns == sorted_then_cols:
                        # Same columns: align order, then diff cell by cell.
                        then = then[columns]
                        if not ordered_result:
                            then.sort_values(by=columns[::2], inplace=True)
                            then.reset_index(drop=True, inplace=True)
                        if df.shape == then.shape and (df.columns == then.columns).all():
                            df_diff = then.compare(df, result_names=("expected", "actual"))
                        else:
                            df_diff = construct_df_diff(df, then)
                    else:
                        # Same column count, different names: full structural diff.
                        then = then[sorted_then_cols]
                        df = df[sorted_columns]
                        df_diff = construct_df_diff(df, then)
                else:
                    # Different column counts: full structural diff.
                    then = then[sorted_then_cols]
                    df = df[sorted_columns]
                    df_diff = construct_df_diff(df, then)
        else:

            if then.empty:
                # Scenario 3: expected no result, got no result
                message = f"Expected 0 row(s) and 0 column(s), got 0 row(s) and 0 column(s)"
                df = pandas.DataFrame()
            else:
                # Scenario 4: expected a result, but got an empty result
                message = f"Expected {then.shape[0]} row(s) and {round(then.shape[1] / 2)} column(s), got 0 row(s) and 0 column(s)"
                then = then[sorted_then_cols]
                df = create_empty_dataframe_with_columns(then)
            df_diff = then.compare(df, result_names=("expected", "actual"))
            print(df_diff.to_markdown())

        if df_diff.empty:
            if warning:
                return SpecPassedWithWarning(spec.spec_uri, spec.triple_store["type"], warning)
            else:
                return SpecPassed(spec.spec_uri, spec.triple_store["type"])
        else:
            log.error(message)
            return SelectSpecFailure(spec.spec_uri, spec.triple_store["type"], df_diff, message)

    except ParseException as e:
        return SparqlParseFailure(spec.spec_uri, spec.triple_store["type"], e)
    except NotImplementedError as ex:
        return SpecSkipped(spec.spec_uri, spec.triple_store["type"], ex)
600
+
601
+
602
def graph_comparison(expected_graph: Graph, actual_graph: Graph) -> GraphComparison:
    """Diff two RDF graphs into (expected-only, actual-only, shared) triple sets."""
    in_both, in_expected, in_actual = graph_diff(expected_graph, actual_graph)
    only_in_expected = in_expected - in_actual
    only_in_actual = in_actual - in_expected
    return GraphComparison(only_in_expected, only_in_actual, in_both)
610
+
611
+
612
def get_then_update(spec_uri: URIRef, spec_graph: Graph) -> Graph:
    """Extract the expected statements of an update spec's then clause.

    Builds a CONSTRUCT query over the reified rdf:Statement triples in the
    spec's must:then StatementsDataset.

    :param spec_uri: spec whose must:then dataset is extracted
    :param spec_graph: graph containing the parsed spec
    :return: graph of the expected (?s ?p ?o) triples
    """
    # NOTE(review): the trailing "] ; ]" in the pattern looks malformed but is
    # reproduced verbatim — confirm the query parses as intended.
    then_query = f"""
    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

    CONSTRUCT {{ ?s ?p ?o }}
    {{
        <{spec_uri}> <{MUST.then}>
            a <{MUST.StatementsDataset}> ;
            <{MUST.hasStatement}> [
                a rdf:Statement ;
                rdf:subject ?s ;
                rdf:predicate ?p ;
                rdf:object ?o ;
            ] ; ]
    }}
    """
    expected_results = spec_graph.query(then_query).graph

    return expected_results
631
+
632
+
633
def calculate_row_difference(df1: pandas.DataFrame,
                             df2: pandas.DataFrame) -> pandas.DataFrame:
    """Return the rows of df1 that do not appear anywhere in df2."""
    # A left merge with indicator marks rows unique to df1 as 'left_only'.
    merged = df1.merge(df2.drop_duplicates(), how='left', indicator=True)
    only_left = merged[merged['_merge'] == 'left_only']
    return only_left.drop('_merge', axis=1)
639
+
640
+
641
def construct_df_diff(df: pandas.DataFrame,
                      then: pandas.DataFrame) -> pandas.DataFrame:
    """Build an expected/actual diff for tables whose rows or columns differ.

    Aligns both frames on the union of their columns (filling missing cells
    with ''), then diffs rows, columns, or both as appropriate.

    :param df: actual result table
    :param then: expected result table
    :return: a compare-style DataFrame describing the differences
    """
    actual_rows = calculate_row_difference(df, then)
    expected_rows = calculate_row_difference(then, df)
    actual_columns = df.columns.difference(then.columns)
    expected_columns = then.columns.difference(df.columns)

    df_diff = pandas.DataFrame()
    modified_df = df
    modified_then = then

    if actual_columns.size > 0:
        # Add the surplus actual columns to the expected frame as blanks.
        modified_then = modified_then.reindex(modified_then.columns.to_list() + actual_columns.to_list(), axis=1)
        modified_then[actual_columns.to_list()] = modified_then[actual_columns.to_list()].fillna('')

    if expected_columns.size > 0:
        # Add the missing expected columns to the actual frame as blanks.
        modified_df = modified_df.reindex(modified_df.columns.to_list() + expected_columns.to_list(), axis=1)
        modified_df[expected_columns.to_list()] = modified_df[expected_columns.to_list()].fillna('')

    # Put the actual frame's columns in the same order as the expected frame's.
    modified_df = modified_df.reindex(modified_then.columns, axis=1)

    if df.shape[0] != then.shape[0] and df.shape[1] != then.shape[1]:
        # Both row and column counts differ: recompute row differences on the
        # column-aligned frames, then diff those rows.
        actual_rows = calculate_row_difference(modified_df, modified_then)
        expected_rows = calculate_row_difference(modified_then, modified_df)
        df_diff = generate_row_diff(actual_rows, expected_rows)
    elif actual_rows.shape[0] > 0 or expected_rows.shape[0] > 0:
        # Only rows differ.
        df_diff = generate_row_diff(actual_rows, expected_rows)
    elif actual_columns.size > 0 or expected_columns.size > 0:
        # Only columns differ: full-shape compare keeps equal cells visible.
        df_diff = modified_then.compare(modified_df, result_names=("expected", "actual"), keep_shape=True,
                                        keep_equal=True)
    df_diff.fillna("", inplace=True)
    return df_diff
674
+
675
+
676
def generate_row_diff(actual_rows: pandas.DataFrame, expected_rows: pandas.DataFrame) -> pandas.DataFrame:
    """Build a side-by-side expected/actual diff for surplus and missing rows."""
    diff_of_actual = pandas.DataFrame()
    diff_of_expected = pandas.DataFrame()

    if actual_rows.shape[0] > 0:
        # Rows present in the actual result but not expected: blank vs actual.
        blank_actual = create_empty_dataframe_with_columns(actual_rows)
        diff_of_actual = blank_actual.compare(actual_rows, result_names=("expected", "actual"))

    if expected_rows.shape[0] > 0:
        # Rows expected but absent from the actual result: expected vs blank.
        blank_expected = create_empty_dataframe_with_columns(expected_rows)
        diff_of_expected = expected_rows.compare(blank_expected, result_names=("expected", "actual"))

    return pandas.concat([diff_of_actual, diff_of_expected], ignore_index=True)
690
+
691
+
692
def create_empty_dataframe_with_columns(df: pandas.DataFrame) -> pandas.DataFrame:
    """Return a frame with the same index and columns as *df*, every cell ''."""
    shaped = pandas.DataFrame().reindex_like(df)
    shaped.fillna("", inplace=True)
    return shaped
696
+
697
+
698
def review_results(results: List[SpecResult], verbose: bool) -> None:
    """Print a coloured summary of all spec results to stdout.

    Shows a per-spec and a per-status table, an overall pass/fail line, and —
    when verbose and anything did not plainly pass — per-result details
    (graph/table diffs, warnings, exceptions).

    :param results: results collected by run_specs
    :param verbose: when True, print per-failure details after the overview
    """
    print("===== Result Overview =====")
    # Init dictionaries
    status_dict = defaultdict(lambda: defaultdict(int))
    status_counts = defaultdict(lambda: defaultdict(int))
    # Result classes not listed here are rendered red (failures).
    colours = {SpecPassed: Fore.GREEN, SpecPassedWithWarning: Fore.YELLOW, SpecSkipped: Fore.YELLOW}
    # Populate dictionaries from results
    for result in results:
        status_counts[result.triple_store][type(result)] += 1
        status_dict[result.spec_uri][result.triple_store] = type(result)

    # Get the list of statuses and list of unique triple stores
    statuses = list(status for inner_dict in status_dict.values() for status in inner_dict.values())
    triple_stores = list(set(status for inner_dict in status_dict.values() for status in inner_dict.keys()))

    # Convert dictionaries to list for tabulate
    table_rows = [[spec_uri] + [
        f"{colours.get(status_dict[spec_uri][triple_store], Fore.RED)}{status_dict[spec_uri][triple_store].__name__}{Style.RESET_ALL}"
        for triple_store in triple_stores] for spec_uri in set(status_dict.keys())]

    status_rows = [[f"{colours.get(status, Fore.RED)}{status.__name__}{Style.RESET_ALL}"] +
                   [f"{colours.get(status, Fore.RED)}{status_counts[triple_store][status]}{Style.RESET_ALL}"
                    for triple_store in triple_stores] for status in set(statuses)]

    # Display tables with tabulate
    print(tabulate(table_rows, headers=['Spec Uris / triple stores'] + triple_stores, tablefmt="pretty"))
    print(tabulate(status_rows, headers=['Status / triple stores'] + triple_stores, tablefmt="pretty"))

    pass_count = statuses.count(SpecPassed)
    warning_count = statuses.count(SpecPassedWithWarning)
    skipped_count = statuses.count(SpecSkipped)
    fail_count = len(
        list(filter(lambda status: status not in [SpecPassed, SpecPassedWithWarning, SpecSkipped], statuses)))

    # Overall colour: red for any failure, yellow for warnings/skips only.
    if fail_count:
        overview_colour = Fore.RED
    elif warning_count or skipped_count:
        overview_colour = Fore.YELLOW
    else:
        overview_colour = Fore.GREEN

    logger_setup.flush()
    print(f"{overview_colour}===== {fail_count} failures, {skipped_count} skipped, {Fore.GREEN}{pass_count} passed, "
          f"{overview_colour}{warning_count} passed with warnings =====")

    if verbose and (fail_count or warning_count or skipped_count):
        for res in results:
            if type(res) == UpdateSpecFailure:
                print(f"{Fore.RED}Failed {res.spec_uri} {res.triple_store}")
                print(f"{Fore.BLUE} In Expected Not In Actual:")
                print(res.graph_comparison.in_expected_not_in_actual.serialize(format="ttl"))
                print()
                print(f"{Fore.RED} in_actual_not_in_expected")
                print(res.graph_comparison.in_actual_not_in_expected.serialize(format="ttl"))
                print(f"{Fore.GREEN} in_both")
                print(res.graph_comparison.in_both.serialize(format="ttl"))

            if type(res) == SelectSpecFailure:
                print(f"{Fore.RED}Failed {res.spec_uri} {res.triple_store}")
                print(res.message)
                print(res.table_comparison.to_markdown())
            if type(res) == ConstructSpecFailure or type(res) == UpdateSpecFailure:
                print(f"{Fore.RED}Failed {res.spec_uri} {res.triple_store}")
            if type(res) == SpecPassedWithWarning:
                print(f"{Fore.YELLOW}Passed with warning {res.spec_uri} {res.triple_store}")
                print(res.warning)
            if type(res) == TripleStoreConnectionError or type(res) == SparqlExecutionError or \
                    type(res) == SparqlParseFailure:
                print(f"{Fore.RED}Failed {res.spec_uri} {res.triple_store}")
                print(res.exception)
            if type(res) == SpecSkipped:
                print(f"{Fore.YELLOW}Skipped {res.spec_uri} {res.triple_store}")
                print(res.message)
771
+
772
+
773
+
774
+
775
+
776
+