@lde/pipeline-shacl-validator 0.12.17 → 0.12.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -70,6 +70,20 @@ detail. Configure at least one writer in production pipelines.
70
70
  The bundled `FileWriter` and `SparqlUpdateWriter` already implement the
71
71
  `Writer` contract; bring your own for custom destinations.
72
72
 
73
+ ##### Blank-node-free reports
74
+
75
+ shacl-engine emits the `sh:ValidationReport`, every `sh:ValidationResult` and
76
+ any anonymous `sh:sourceShape`/`sh:value`/`sh:detail` as blank nodes. Before writing, `ShaclValidator`
77
+ rewrites each one to a dataset-scoped IRI of the form
78
+ `<dataset>/.well-known/shacl#<batch>-<label>`. This keeps a file-based served
79
+ store (e.g. the Dataset Knowledge Graph) from fusing one dataset's results into
80
+ another's when it `cat`s every per-dataset n-quads file into a single index –
81
+ blank-node labels are only document-scoped and recur across files (see
82
+ [ldelements/lde#478](https://github.com/ldelements/lde/issues/478)). The
83
+ dataset IRI rules out fusion across datasets; `<batch>`, a hash of the report's
84
+ quads, rules out fusion across the separate `validate()` batches that land in
85
+ one dataset's validation graph.
86
+
73
87
  #### Filesystem collisions with `FileWriter`
74
88
 
75
89
  `FileWriter` derives its filename from `dataset.iri` only. If the pipeline's
@@ -1 +1 @@
1
- {"version":3,"file":"shacl-validator.d.ts","sourceRoot":"","sources":["../src/shacl-validator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAE5C,OAAO,KAAK,EACV,SAAS,EACT,gBAAgB,EAChB,gBAAgB,EAChB,MAAM,EACP,MAAM,eAAe,CAAC;AAOvB,0CAA0C;AAC1C,MAAM,WAAW,qBAAqB;IACpC,6FAA6F;IAC7F,UAAU,EAAE,MAAM,CAAC;IACnB;;;;;;;;OAQG;IACH,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;CAC1B;AAQD;;;;;;GAMG;AACH,qBAAa,cAAe,YAAW,SAAS;IAC9C,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAS;IACpC,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAW;IAEzC,OAAO,CAAC,aAAa,CAAkB;IACvC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAyC;gBAE1D,OAAO,EAAE,qBAAqB;IAKpC,QAAQ,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE,OAAO,EAAE,OAAO,GAAG,OAAO,CAAC,gBAAgB,CAAC;IA2CpE,MAAM,CAAC,OAAO,EAAE,OAAO,GAAG,OAAO,CAAC,gBAAgB,CAAC;YAkB3C,SAAS;CASxB"}
1
+ {"version":3,"file":"shacl-validator.d.ts","sourceRoot":"","sources":["../src/shacl-validator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAE5C,OAAO,KAAK,EACV,SAAS,EACT,gBAAgB,EAChB,gBAAgB,EAChB,MAAM,EACP,MAAM,eAAe,CAAC;AAQvB,0CAA0C;AAC1C,MAAM,WAAW,qBAAqB;IACpC,6FAA6F;IAC7F,UAAU,EAAE,MAAM,CAAC;IACnB;;;;;;;;OAQG;IACH,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;CAC1B;AAQD;;;;;;GAMG;AACH,qBAAa,cAAe,YAAW,SAAS;IAC9C,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAS;IACpC,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAW;IAEzC,OAAO,CAAC,aAAa,CAAkB;IACvC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAyC;gBAE1D,OAAO,EAAE,qBAAqB;IAKpC,QAAQ,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE,OAAO,EAAE,OAAO,GAAG,OAAO,CAAC,gBAAgB,CAAC;IAmDpE,MAAM,CAAC,OAAO,EAAE,OAAO,GAAG,OAAO,CAAC,gBAAgB,CAAC;YAkB3C,SAAS;CASxB"}
@@ -3,6 +3,7 @@ import ShaclEngine from 'shacl-engine/Validator.js';
3
3
  // @ts-expect-error -- rdf-ext has no type declarations.
4
4
  import rdf from 'rdf-ext';
5
5
  import { rdfDereferencer } from 'rdf-dereference';
6
+ import { skolemizeReport } from './skolemize-report.js';
6
7
  /**
7
8
  * SHACL-based {@link Validator} for `@lde/pipeline`.
8
9
  *
@@ -43,7 +44,12 @@ export class ShaclValidator {
43
44
  acc.conforms = false;
44
45
  this.accumulators.set(key, acc);
45
46
  if (violations > 0 && this.reportWriters.length > 0) {
46
- const reportQuads = [...report.dataset];
47
+ // Skolemise the report's blank nodes to dataset-scoped IRIs before writing.
48
+ // shacl-engine emits the report and every result as blank nodes, whose
49
+ // labels are not unique across the per-dataset n-quads files a file-based
50
+ // store cats into one index — fusing one dataset's violations into
51
+ // another's (see ldelements/lde#478).
52
+ const reportQuads = skolemizeReport(report.dataset, dataset.iri.toString());
47
53
  for (const writer of this.reportWriters) {
48
54
  await writer.write(dataset, asyncIterableOf(reportQuads));
49
55
  }
@@ -0,0 +1,28 @@
1
+ import type { Quad } from '@rdfjs/types';
2
+ /**
3
+ * Rewrite every blank node in a shacl-engine validation report to a
4
+ * deterministic, dataset-scoped IRI, leaving the report otherwise unchanged.
5
+ *
6
+ * shacl-engine emits the `sh:ValidationReport`, every `sh:ValidationResult` and
7
+ * any anonymous `sh:sourceShape`/`sh:value`/`sh:detail` as blank nodes. When a
8
+ * file-based served store such as the Dataset Knowledge Graph concatenates every
9
+ * per-dataset n-quads file into one index (`qlever index` over the `cat` of all
10
+ * files), document-scoped blank-node labels recur across files and fuse one
11
+ * dataset's results into another's — a cross-graph traversal can then reach a
12
+ * foreign dataset's violations (see ldelements/lde#478 and #474, and
13
+ * netwerk-digitaal-erfgoed/dataset-knowledge-graph#352).
14
+ *
15
+ * Each blank node becomes `<dataset>/.well-known/shacl#<batch>-<label>`. The
16
+ * dataset IRI rules out fusion across datasets; `<batch>`, a hash of this
17
+ * report's quads, rules out fusion across the separate `validate()` batches that
18
+ * land in one dataset's validation graph — their labels both restart at `b1`,
19
+ * but a batch carrying different violations hashes differently. Two batches with
20
+ * byte-identical reports collapse onto the same IRIs, which is correct: they are
21
+ * the same violations.
22
+ *
23
+ * @param quads - The report quads (`report.dataset`), possibly with blank nodes.
24
+ * @param datasetIri - The dataset the report is about; scopes every minted IRI.
25
+ * @returns The same quads with every blank node replaced by a skolem IRI.
26
+ */
27
+ export declare function skolemizeReport(quads: Iterable<Quad>, datasetIri: string): Quad[];
28
+ //# sourceMappingURL=skolemize-report.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"skolemize-report.d.ts","sourceRoot":"","sources":["../src/skolemize-report.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAQ,MAAM,cAAc,CAAC;AAK/C;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAgB,eAAe,CAC7B,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,EACrB,UAAU,EAAE,MAAM,GACjB,IAAI,EAAE,CAiBR"}
@@ -0,0 +1,55 @@
1
+ import { hashSuffix, skolemIri } from '@lde/dataset';
2
+ // @ts-expect-error -- rdf-ext has no type declarations.
3
+ import rdf from 'rdf-ext';
4
/**
 * Replace each blank node of a shacl-engine validation report with a
 * deterministic IRI scoped to the dataset; every other term is passed through
 * untouched.
 *
 * Why: shacl-engine emits the `sh:ValidationReport`, every
 * `sh:ValidationResult` and any anonymous `sh:sourceShape`/`sh:value`/`sh:detail`
 * as blank nodes. A file-based served store such as the Dataset Knowledge Graph
 * builds one index from the concatenation of all per-dataset n-quads files
 * (`qlever index` over the `cat` of all files). Blank-node labels are only
 * document-scoped, so the same label recurs across files and one dataset's
 * results get fused into another's — a cross-graph traversal can then reach a
 * foreign dataset's violations (see ldelements/lde#478 and #474, and
 * netwerk-digitaal-erfgoed/dataset-knowledge-graph#352).
 *
 * Minted IRIs have the shape `<dataset>/.well-known/shacl#<batch>-<label>`:
 * - the dataset IRI prevents fusion across datasets;
 * - `<batch>` is a hash of this report's quads and prevents fusion across the
 *   separate `validate()` batches that land in one dataset's validation graph.
 *   Labels restart at `b1` in each batch, but a batch carrying different
 *   violations hashes differently. Batches with byte-identical reports share
 *   IRIs, which is correct: they describe the same violations.
 *
 * @param quads - The report quads (`report.dataset`), possibly with blank nodes.
 * @param datasetIri - The dataset the report is about; scopes every minted IRI.
 * @returns The same quads with every blank node replaced by a skolem IRI.
 */
export function skolemizeReport(quads, datasetIri) {
  // Materialise once: the quads are read twice (fingerprint, then rewrite).
  const materialized = [...quads];

  // The per-batch hash is folded into the base; skolemIri then appends
  // `-<label>`, matching the skolems minted for provenance and distribution
  // reports (#474).
  const batch = hashSuffix(fingerprint(materialized));
  const base = `${datasetIri}/.well-known/shacl#${batch}`;

  function toSkolem(term) {
    if (term.termType !== 'BlankNode') {
      return term;
    }
    return rdf.namedNode(skolemIri(base, term.value));
  }

  const rewritten = [];
  for (const quad of materialized) {
    // Predicates are never blank nodes, so they are copied as-is.
    rewritten.push(
      rdf.quad(
        toSkolem(quad.subject),
        quad.predicate,
        toSkolem(quad.object),
        toSkolem(quad.graph),
      ),
    );
  }
  return rewritten;
}
39
/**
 * Build a deterministic string that identifies a report; the caller hashes it
 * to obtain the per-batch IRI segment.
 *
 * Each quad contributes one `subject predicate object` line (the graph term is
 * not part of the key). Lines are sorted so the result does not depend on the
 * order in which the quads were supplied, then joined with newlines.
 */
function fingerprint(quads) {
  const lines = [];
  for (const quad of quads) {
    const subjectKey = termKey(quad.subject);
    const objectKey = termKey(quad.object);
    lines.push(`${subjectKey} ${quad.predicate.value} ${objectKey}`);
  }
  lines.sort();
  return lines.join('\n');
}
46
/**
 * Serialise a single RDF term to a string key, using a distinct shape per
 * term type: `_:label` for blank nodes, `"value"^^<datatype>@language` for
 * literals (the language part is empty when the literal has none), and
 * `<value>` for every other term type.
 */
function termKey(term) {
  const { termType, value } = term;
  if (termType === 'BlankNode') {
    return `_:${value}`;
  }
  if (termType === 'Literal') {
    return `"${value}"^^<${term.datatype.value}>@${term.language}`;
  }
  return `<${value}>`;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lde/pipeline-shacl-validator",
3
- "version": "0.12.17",
3
+ "version": "0.12.19",
4
4
  "description": "SHACL validation for @lde/pipeline",
5
5
  "repository": {
6
6
  "url": "git+https://github.com/ldelements/lde.git",
@@ -36,6 +36,6 @@
36
36
  },
37
37
  "peerDependencies": {
38
38
  "@lde/dataset": "0.7.7",
39
- "@lde/pipeline": "0.30.17"
39
+ "@lde/pipeline": "0.30.18"
40
40
  }
41
41
  }