@lde/pipeline-shacl-validator 0.12.17 → 0.12.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md
CHANGED
|
@@ -70,6 +70,20 @@ detail. Configure at least one writer in production pipelines.
|
|
|
70
70
|
The bundled `FileWriter` and `SparqlUpdateWriter` already implement the
|
|
71
71
|
`Writer` contract; bring your own for custom destinations.
|
|
72
72
|
|
|
73
|
+
##### Blank-node-free reports
|
|
74
|
+
|
|
75
|
+
shacl-engine emits the `sh:ValidationReport`, every `sh:ValidationResult` and
|
|
76
|
+
any anonymous `sh:sourceShape`, `sh:value` or `sh:detail` as blank nodes. Before writing, `ShaclValidator`
|
|
77
|
+
rewrites each one to a dataset-scoped IRI of the form
|
|
78
|
+
`<dataset>/.well-known/shacl#<batch>-<label>`. This keeps a file-based served
|
|
79
|
+
store (e.g. the Dataset Knowledge Graph) from fusing one dataset's results into
|
|
80
|
+
another's when it `cat`s every per-dataset n-quads file into a single index –
|
|
81
|
+
blank-node labels are only document-scoped and recur across files (see
|
|
82
|
+
[ldelements/lde#478](https://github.com/ldelements/lde/issues/478)). The
|
|
83
|
+
dataset IRI rules out fusion across datasets; `<batch>`, a hash of the report's
|
|
84
|
+
quads, rules out fusion across the separate `validate()` batches that land in
|
|
85
|
+
one dataset's validation graph.
|
|
86
|
+
|
|
73
87
|
#### Filesystem collisions with `FileWriter`
|
|
74
88
|
|
|
75
89
|
`FileWriter` derives its filename from `dataset.iri` only. If the pipeline's
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"shacl-validator.d.ts","sourceRoot":"","sources":["../src/shacl-validator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAE5C,OAAO,KAAK,EACV,SAAS,EACT,gBAAgB,EAChB,gBAAgB,EAChB,MAAM,EACP,MAAM,eAAe,CAAC;
|
|
1
|
+
{"version":3,"file":"shacl-validator.d.ts","sourceRoot":"","sources":["../src/shacl-validator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAE5C,OAAO,KAAK,EACV,SAAS,EACT,gBAAgB,EAChB,gBAAgB,EAChB,MAAM,EACP,MAAM,eAAe,CAAC;AAQvB,0CAA0C;AAC1C,MAAM,WAAW,qBAAqB;IACpC,6FAA6F;IAC7F,UAAU,EAAE,MAAM,CAAC;IACnB;;;;;;;;OAQG;IACH,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;CAC1B;AAQD;;;;;;GAMG;AACH,qBAAa,cAAe,YAAW,SAAS;IAC9C,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAS;IACpC,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAW;IAEzC,OAAO,CAAC,aAAa,CAAkB;IACvC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAyC;gBAE1D,OAAO,EAAE,qBAAqB;IAKpC,QAAQ,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE,OAAO,EAAE,OAAO,GAAG,OAAO,CAAC,gBAAgB,CAAC;IAmDpE,MAAM,CAAC,OAAO,EAAE,OAAO,GAAG,OAAO,CAAC,gBAAgB,CAAC;YAkB3C,SAAS;CASxB"}
|
package/dist/shacl-validator.js
CHANGED
|
@@ -3,6 +3,7 @@ import ShaclEngine from 'shacl-engine/Validator.js';
|
|
|
3
3
|
// @ts-expect-error -- rdf-ext has no type declarations.
|
|
4
4
|
import rdf from 'rdf-ext';
|
|
5
5
|
import { rdfDereferencer } from 'rdf-dereference';
|
|
6
|
+
import { skolemizeReport } from './skolemize-report.js';
|
|
6
7
|
/**
|
|
7
8
|
* SHACL-based {@link Validator} for `@lde/pipeline`.
|
|
8
9
|
*
|
|
@@ -43,7 +44,12 @@ export class ShaclValidator {
|
|
|
43
44
|
acc.conforms = false;
|
|
44
45
|
this.accumulators.set(key, acc);
|
|
45
46
|
if (violations > 0 && this.reportWriters.length > 0) {
|
|
46
|
-
|
|
47
|
+
// Skolemise the report's blank nodes to dataset-scoped IRIs before writing.
|
|
48
|
+
// shacl-engine emits the report and every result as blank nodes, whose
|
|
49
|
+
// labels are not unique across the per-dataset n-quads files a file-based
|
|
50
|
+
// store cats into one index — fusing one dataset's violations into
|
|
51
|
+
// another's (see ldelements/lde#478).
|
|
52
|
+
const reportQuads = skolemizeReport(report.dataset, dataset.iri.toString());
|
|
47
53
|
for (const writer of this.reportWriters) {
|
|
48
54
|
await writer.write(dataset, asyncIterableOf(reportQuads));
|
|
49
55
|
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import type { Quad } from '@rdfjs/types';
|
|
2
|
+
/**
|
|
3
|
+
* Rewrite every blank node in a shacl-engine validation report to a
|
|
4
|
+
* deterministic, dataset-scoped IRI, leaving the report otherwise unchanged.
|
|
5
|
+
*
|
|
6
|
+
* shacl-engine emits the `sh:ValidationReport`, every `sh:ValidationResult` and
|
|
7
|
+
* any anonymous `sh:sourceShape`/`sh:value`/`sh:detail` as blank nodes. When a
|
|
8
|
+
* file-based served store such as the Dataset Knowledge Graph concatenates every
|
|
9
|
+
* per-dataset n-quads file into one index (`qlever index` over the `cat` of all
|
|
10
|
+
* files), document-scoped blank-node labels recur across files and fuse one
|
|
11
|
+
* dataset's results into another's — a cross-graph traversal can then reach a
|
|
12
|
+
* foreign dataset's violations (see ldelements/lde#478 and #474, and
|
|
13
|
+
* netwerk-digitaal-erfgoed/dataset-knowledge-graph#352).
|
|
14
|
+
*
|
|
15
|
+
* Each blank node becomes `<dataset>/.well-known/shacl#<batch>-<label>`. The
|
|
16
|
+
* dataset IRI rules out fusion across datasets; `<batch>`, a hash of this
|
|
17
|
+
* report's quads, rules out fusion across the separate `validate()` batches that
|
|
18
|
+
* land in one dataset's validation graph — their labels both restart at `b1`,
|
|
19
|
+
* but a batch carrying different violations hashes differently. Two batches with
|
|
20
|
+
* byte-identical reports collapse onto the same IRIs, which is correct: they are
|
|
21
|
+
* the same violations.
|
|
22
|
+
*
|
|
23
|
+
* @param quads - The report quads (`report.dataset`), possibly with blank nodes.
|
|
24
|
+
* @param datasetIri - The dataset the report is about; scopes every minted IRI.
|
|
25
|
+
* @returns A new array of quads, equal to the input except that every blank node is replaced by a skolem IRI.
|
|
26
|
+
*/
|
|
27
|
+
export declare function skolemizeReport(quads: Iterable<Quad>, datasetIri: string): Quad[];
|
|
28
|
+
//# sourceMappingURL=skolemize-report.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"skolemize-report.d.ts","sourceRoot":"","sources":["../src/skolemize-report.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAQ,MAAM,cAAc,CAAC;AAK/C;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAgB,eAAe,CAC7B,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,EACrB,UAAU,EAAE,MAAM,GACjB,IAAI,EAAE,CAiBR"}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import { hashSuffix, skolemIri } from '@lde/dataset';
|
|
2
|
+
// @ts-expect-error -- rdf-ext has no type declarations.
|
|
3
|
+
import rdf from 'rdf-ext';
|
|
4
|
+
/**
|
|
5
|
+
* Rewrite every blank node in a shacl-engine validation report to a
|
|
6
|
+
* deterministic, dataset-scoped IRI, leaving the report otherwise unchanged.
|
|
7
|
+
*
|
|
8
|
+
* shacl-engine emits the `sh:ValidationReport`, every `sh:ValidationResult` and
|
|
9
|
+
* any anonymous `sh:sourceShape`/`sh:value`/`sh:detail` as blank nodes. When a
|
|
10
|
+
* file-based served store such as the Dataset Knowledge Graph concatenates every
|
|
11
|
+
* per-dataset n-quads file into one index (`qlever index` over the `cat` of all
|
|
12
|
+
* files), document-scoped blank-node labels recur across files and fuse one
|
|
13
|
+
* dataset's results into another's — a cross-graph traversal can then reach a
|
|
14
|
+
* foreign dataset's violations (see ldelements/lde#478 and #474, and
|
|
15
|
+
* netwerk-digitaal-erfgoed/dataset-knowledge-graph#352).
|
|
16
|
+
*
|
|
17
|
+
* Each blank node becomes `<dataset>/.well-known/shacl#<batch>-<label>`. The
|
|
18
|
+
* dataset IRI rules out fusion across datasets; `<batch>`, a hash of this
|
|
19
|
+
* report's quads, rules out fusion across the separate `validate()` batches that
|
|
20
|
+
* land in one dataset's validation graph — their labels both restart at `b1`,
|
|
21
|
+
* but a batch carrying different violations hashes differently. Two batches with
|
|
22
|
+
* byte-identical reports collapse onto the same IRIs, which is correct: they are
|
|
23
|
+
* the same violations.
|
|
24
|
+
*
|
|
25
|
+
* @param quads - The report quads (`report.dataset`), possibly with blank nodes.
|
|
26
|
+
* @param datasetIri - The dataset the report is about; scopes every minted IRI.
|
|
27
|
+
* @returns A new array of quads, equal to the input except that every blank node is replaced by a skolem IRI.
|
|
28
|
+
*/
|
|
29
|
+
export function skolemizeReport(quads, datasetIri) {
  // Materialise the input once: it is read twice, first to fingerprint the
  // report and then to rewrite its terms.
  const materialised = [...quads];

  // The per-batch hash is folded into the base, after which skolemIri appends
  // `-<label>`, matching the skolems minted for provenance and distribution
  // reports (#474).
  const batchHash = hashSuffix(fingerprint(materialised));
  const base = `${datasetIri}/.well-known/shacl#${batchHash}`;

  // Blank nodes are swapped for named nodes; every other term passes through.
  function toSkolem(term) {
    if (term.termType !== 'BlankNode') {
      return term;
    }
    return rdf.namedNode(skolemIri(base, term.value));
  }

  // The predicate is copied as-is; subject, object and graph are rewritten.
  const rewritten = [];
  for (const { subject, predicate, object, graph } of materialised) {
    rewritten.push(
      rdf.quad(toSkolem(subject), predicate, toSkolem(object), toSkolem(graph)),
    );
  }
  return rewritten;
}
|
|
39
|
+
/**
 * Build a deterministic string that identifies a report, for use as a
 * per-batch IRI segment.
 *
 * Each quad contributes one `subject predicate object` line (the graph term is
 * not included); the lines are sorted so the result does not depend on the
 * order in which the quads were supplied.
 */
function fingerprint(quads) {
  const describe = (quad) => {
    const subjectKey = termKey(quad.subject);
    const objectKey = termKey(quad.object);
    return `${subjectKey} ${quad.predicate.value} ${objectKey}`;
  };
  const lines = quads.map(describe);
  lines.sort();
  return lines.join('\n');
}
|
|
46
|
+
/**
 * Render a term as a string key.
 *
 * Blank nodes become `_:<value>`, literals become
 * `"<value>"^^<<datatype>>@<language>`, and every other term type becomes
 * `<<value>>`.
 */
function termKey(term) {
  const { termType, value } = term;
  if (termType === 'BlankNode') {
    return `_:${value}`;
  }
  if (termType === 'Literal') {
    return `"${value}"^^<${term.datatype.value}>@${term.language}`;
  }
  return `<${value}>`;
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lde/pipeline-shacl-validator",
|
|
3
|
-
"version": "0.12.17",
|
|
3
|
+
"version": "0.12.19",
|
|
4
4
|
"description": "SHACL validation for @lde/pipeline",
|
|
5
5
|
"repository": {
|
|
6
6
|
"url": "git+https://github.com/ldelements/lde.git",
|
|
@@ -36,6 +36,6 @@
|
|
|
36
36
|
},
|
|
37
37
|
"peerDependencies": {
|
|
38
38
|
"@lde/dataset": "0.7.7",
|
|
39
|
-
"@lde/pipeline": "0.30.
|
|
39
|
+
"@lde/pipeline": "0.30.18"
|
|
40
40
|
}
|
|
41
41
|
}
|