@lde/pipeline 0.30.13 → 0.30.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -357,7 +357,7 @@ new Pipeline({
357
357
 
358
358
  #### `provenancePlugin()`
359
359
 
360
- Appends [PROV-O](https://www.w3.org/TR/prov-o/) provenance quads (`prov:Entity`, `prov:Activity`, `prov:startedAtTime`, `prov:endedAtTime`) to every stage’s output.
360
+ Appends [PROV-O](https://www.w3.org/TR/prov-o/) provenance quads (`prov:Entity`, `prov:Activity`, `prov:startedAtTime`, `prov:endedAtTime`) to every stage’s output. The `prov:Activity` is a stable IRI keyed on `(dataset, stage)`, not a blank node, so activities stay distinct – and a re-run stays idempotent – when per-dataset outputs are merged into one graph (blank-node labels are not unique across separately serialised documents and would fuse unrelated activities).
361
361
 
362
362
  #### `schemaOrgNormalizationPlugin(options?)`
363
363
 
@@ -1 +1 @@
1
- {"version":3,"file":"report.d.ts","sourceRoot":"","sources":["../../src/distribution/report.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,EAAe,KAAK,IAAI,EAAE,MAAM,IAAI,CAAC;AAC5C,OAAO,EAIL,KAAK,eAAe,EACrB,MAAM,yBAAyB,CAAC;AAUjC;;;;;;;;;GASG;AACH,wBAAuB,mBAAmB,CACxC,YAAY,EAAE,eAAe,EAAE,EAC/B,UAAU,EAAE,MAAM,EAClB,YAAY,CAAC,EAAE,YAAY,GAC1B,aAAa,CAAC,IAAI,CAAC,CA2CrB"}
1
+ {"version":3,"file":"report.d.ts","sourceRoot":"","sources":["../../src/distribution/report.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AAEzD,OAAO,EAAe,KAAK,IAAI,EAAE,MAAM,IAAI,CAAC;AAC5C,OAAO,EAIL,KAAK,eAAe,EACrB,MAAM,yBAAyB,CAAC;AAUjC;;;;;;;;;GASG;AACH,wBAAuB,mBAAmB,CACxC,YAAY,EAAE,eAAe,EAAE,EAC/B,UAAU,EAAE,MAAM,EAClB,YAAY,CAAC,EAAE,YAAY,GAC1B,aAAa,CAAC,IAAI,CAAC,CAiDrB"}
@@ -1,6 +1,7 @@
1
+ import { hashSuffix, skolemIri } from '@lde/dataset';
1
2
  import { DataFactory } from 'n3';
2
3
  import { NetworkError, SparqlProbeResult, } from '@lde/distribution-probe';
3
- const { quad, namedNode, blankNode, literal } = DataFactory;
4
+ const { quad, namedNode, literal } = DataFactory;
4
5
  const RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
5
6
  const SCHEMA = 'https://schema.org/';
6
7
  const VOID = 'http://rdfs.org/ns/void#';
@@ -17,10 +18,16 @@ const HTTP_STATUS = 'https://www.w3.org/2011/http-statusCodes#';
17
18
  * whose `schema:target` matches the failed distribution's access URL.
18
19
  */
19
20
  export async function* probeResultsToQuads(probeResults, datasetIri, importResult) {
20
- // Track blank nodes per URL so import errors can reference the right action.
21
+ // Track each action node per URL so import errors can reference the right
22
+ // action. Each action is a deterministic IRI keyed on (dataset, URL), not a
23
+ // blank node: this output is merged with other datasets' output into one cat-built
24
+ // graph where blank-node labels are not unique across documents and would
25
+ // fuse unrelated actions into one node (see issue #474). The
26
+ // `.well-known/schema#action-<hash>` shape mirrors the linkset skolem.
27
+ const actionBase = `${datasetIri}/.well-known/schema#action`;
21
28
  const actionsByUrl = new Map();
22
29
  for (const result of probeResults) {
23
- const action = blankNode();
30
+ const action = namedNode(skolemIri(actionBase, hashSuffix(result.url)));
24
31
  actionsByUrl.set(result.url, action);
25
32
  yield quad(action, namedNode(`${RDF}type`), namedNode(`${SCHEMA}Action`));
26
33
  yield quad(action, namedNode(`${SCHEMA}target`), namedNode(result.url));
@@ -0,0 +1,38 @@
1
+ import type { Quad } from '@rdfjs/types';
2
+ import type { QuadTransform } from '../stage.js';
3
+ /**
4
+ * Why this guard exists.
5
+ *
6
+ * A file-based served store (e.g. the Dataset Knowledge Graph) rebuilds its
7
+ * index by concatenating every per-dataset n-quads file and parsing the
8
+ * concatenation as ONE RDF document (`qlever index` over
9
+ * `find … -exec cat {} +`). Blank-node labels are only document-scoped, and the
10
+ * pipeline emits deterministic labels (n3 `DataFactory.blankNode()` → `n3-N`,
11
+ * whose counter resets per dataset/run), so the same label recurs across files and
12
+ * the indexer fuses those nodes into one — merging unrelated provenance,
13
+ * measurements and linksets across datasets and runs. Named nodes never fuse.
14
+ *
15
+ * The invariant for any quads the pipeline writes into such a store is therefore:
16
+ * NO blank nodes. Mint stable (skolem) IRIs instead — see `skolemIri` in
17
+ * `@lde/dataset`. These helpers make that invariant testable and enforceable.
18
+ *
19
+ * See ldelements/lde#474 and netwerk-digitaal-erfgoed/dataset-knowledge-graph#352.
20
+ */
21
+ /**
22
+ * The distinct blank-node labels appearing in subject, object, or graph position
23
+ * across `quads`. Empty when the quads are blank-node-free.
24
+ */
25
+ export declare function blankNodes(quads: Iterable<Quad>): string[];
26
+ /**
27
+ * Throw if any quad carries a blank node. Use in producer tests to lock in the
28
+ * no-blank-nodes invariant (see module docs).
29
+ */
30
+ export declare function assertNoBlankNodes(quads: Iterable<Quad>): void;
31
+ /**
32
+ * A {@link QuadTransform} that passes quads through unchanged but throws on the
33
+ * first blank node it sees. Insert it just before the writer to turn the
34
+ * no-blank-nodes invariant into a hard pipeline failure (e.g. in a CI/staging
35
+ * run) rather than a per-test opt-in.
36
+ */
37
+ export declare function failOnBlankNodes<Context>(): QuadTransform<Context>;
38
+ //# sourceMappingURL=blankNodes.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"blankNodes.d.ts","sourceRoot":"","sources":["../../src/guard/blankNodes.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAEjD;;;;;;;;;;;;;;;;;GAiBG;AAEH;;;GAGG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,GAAG,MAAM,EAAE,CAU1D;AAED;;;GAGG;AACH,wBAAgB,kBAAkB,CAAC,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,GAAG,IAAI,CAU9D;AAED;;;;;GAKG;AACH,wBAAgB,gBAAgB,CAAC,OAAO,KAAK,aAAa,CAAC,OAAO,CAAC,CAelE"}
@@ -0,0 +1,66 @@
1
+ /**
2
+ * Why this guard exists.
3
+ *
4
+ * A file-based served store (e.g. the Dataset Knowledge Graph) rebuilds its
5
+ * index by concatenating every per-dataset n-quads file and parsing the
6
+ * concatenation as ONE RDF document (`qlever index` over
7
+ * `find … -exec cat {} +`). Blank-node labels are only document-scoped, and the
8
+ * pipeline emits deterministic labels (n3 `DataFactory.blankNode()` → `n3-N`,
9
+ * whose counter resets per dataset/run), so the same label recurs across files and
10
+ * the indexer fuses those nodes into one — merging unrelated provenance,
11
+ * measurements and linksets across datasets and runs. Named nodes never fuse.
12
+ *
13
+ * The invariant for any quads the pipeline writes into such a store is therefore:
14
+ * NO blank nodes. Mint stable (skolem) IRIs instead — see `skolemIri` in
15
+ * `@lde/dataset`. These helpers make that invariant testable and enforceable.
16
+ *
17
+ * See ldelements/lde#474 and netwerk-digitaal-erfgoed/dataset-knowledge-graph#352.
18
+ */
19
+ /**
20
+ * The distinct blank-node labels appearing in subject, object, or graph position
21
+ * across `quads`. Empty when the quads are blank-node-free.
22
+ */
23
+ export function blankNodes(quads) {
24
+ const offenders = new Set();
25
+ for (const quad of quads) {
26
+ for (const term of [quad.subject, quad.object, quad.graph]) {
27
+ if (term.termType === 'BlankNode') {
28
+ offenders.add(term.value);
29
+ }
30
+ }
31
+ }
32
+ return [...offenders];
33
+ }
34
+ /**
35
+ * Throw if any quad carries a blank node. Use in producer tests to lock in the
36
+ * no-blank-nodes invariant (see module docs).
37
+ */
38
+ export function assertNoBlankNodes(quads) {
39
+ const offenders = blankNodes(quads);
40
+ if (offenders.length > 0) {
41
+ throw new Error(`Output contains ${offenders.length} blank node(s), which fuse across ` +
42
+ `datasets when a file-based store cat-indexes per-dataset files. ` +
43
+ `Mint skolem IRIs instead (see skolemIri in @lde/dataset; ldelements/lde#474). ` +
44
+ `First: ${offenders.slice(0, 10).join(', ')}`);
45
+ }
46
+ }
47
+ /**
48
+ * A {@link QuadTransform} that passes quads through unchanged but throws on the
49
+ * first blank node it sees. Insert it just before the writer to turn the
50
+ * no-blank-nodes invariant into a hard pipeline failure (e.g. in a CI/staging
51
+ * run) rather than a per-test opt-in.
52
+ */
53
+ export function failOnBlankNodes() {
54
+ return async function* (quads) {
55
+ for await (const quad of quads) {
56
+ for (const term of [quad.subject, quad.object, quad.graph]) {
57
+ if (term.termType === 'BlankNode') {
58
+ throw new Error(`Blank node reached the writer (${term.value}); it would fuse ` +
59
+ `across datasets in a cat-built index. Mint a skolem IRI instead ` +
60
+ `(ldelements/lde#474): ${quad.subject.value} ${quad.predicate.value} …`);
61
+ }
62
+ }
63
+ yield quad;
64
+ }
65
+ };
66
+ }
package/dist/index.d.ts CHANGED
@@ -6,6 +6,7 @@ export * from './progressReporter.js';
6
6
  export * from './selector.js';
7
7
  export * from './stage.js';
8
8
  export * from './stageOutputResolver.js';
9
+ export * from './guard/blankNodes.js';
9
10
  export * from './sparql/index.js';
10
11
  export * from './distribution/index.js';
11
12
  export * from './provenance/index.js';
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC;AAC/B,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,eAAe,CAAC;AAC9B,cAAc,uBAAuB,CAAC;AACtC,cAAc,eAAe,CAAC;AAC9B,cAAc,YAAY,CAAC;AAC3B,cAAc,0BAA0B,CAAC;AACzC,cAAc,mBAAmB,CAAC;AAClC,cAAc,yBAAyB,CAAC;AACxC,cAAc,uBAAuB,CAAC;AACtC,cAAc,mBAAmB,CAAC;AAClC,cAAc,oCAAoC,CAAC;AACnD,cAAc,wBAAwB,CAAC;AACvC,cAAc,oCAAoC,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC;AAC/B,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,eAAe,CAAC;AAC9B,cAAc,uBAAuB,CAAC;AACtC,cAAc,eAAe,CAAC;AAC9B,cAAc,YAAY,CAAC;AAC3B,cAAc,0BAA0B,CAAC;AACzC,cAAc,uBAAuB,CAAC;AACtC,cAAc,mBAAmB,CAAC;AAClC,cAAc,yBAAyB,CAAC;AACxC,cAAc,uBAAuB,CAAC;AACtC,cAAc,mBAAmB,CAAC;AAClC,cAAc,oCAAoC,CAAC;AACnD,cAAc,wBAAwB,CAAC;AACvC,cAAc,oCAAoC,CAAC"}
package/dist/index.js CHANGED
@@ -6,6 +6,7 @@ export * from './progressReporter.js';
6
6
  export * from './selector.js';
7
7
  export * from './stage.js';
8
8
  export * from './stageOutputResolver.js';
9
+ export * from './guard/blankNodes.js';
9
10
  export * from './sparql/index.js';
10
11
  export * from './distribution/index.js';
11
12
  export * from './provenance/index.js';
@@ -8,6 +8,18 @@ import type { ProvenanceStore } from './provenance/store.js';
8
8
  import type { StageOutputResolver } from './stageOutputResolver.js';
9
9
  import type { ProgressReporter } from './progressReporter.js';
10
10
  import { type TimeoutPolicy } from './sparql/timeoutPolicy.js';
11
+ /**
12
+ * Context handed to a {@link PipelinePlugin.beforeStageWrite} transform: the
13
+ * `dataset` whose merged output is being written and the name of the `stage`
14
+ * that produced it. This stage identity lets a transform mint stable IRIs keyed on
15
+ * `(dataset, stage)` instead of blank nodes, which would fuse across stages and
16
+ * datasets once the per-dataset outputs are merged into one graph (see issue
17
+ * #474).
18
+ */
19
+ export interface BeforeStageWriteContext {
20
+ dataset: Dataset;
21
+ stage: string;
22
+ }
11
23
  /** Plugin that hooks into pipeline lifecycle events. */
12
24
  export interface PipelinePlugin {
13
25
  name: string;
@@ -17,9 +29,7 @@ export interface PipelinePlugin {
17
29
  * – provenance, namespace normalisation – that apply regardless of which
18
30
  * executor produced a quad.
19
31
  */
20
- beforeStageWrite?: QuadTransform<{
21
- dataset: Dataset;
22
- }>;
32
+ beforeStageWrite?: QuadTransform<BeforeStageWriteContext>;
23
33
  }
24
34
  export interface PipelineOptions {
25
35
  datasetSelector: DatasetSelector;
@@ -66,6 +76,7 @@ export declare class Pipeline {
66
76
  private readonly datasetSelector;
67
77
  private readonly stages;
68
78
  private readonly writer;
79
+ private readonly beforeStageWrite?;
69
80
  private readonly distributionResolver;
70
81
  private readonly chaining?;
71
82
  private readonly reporter?;
@@ -79,6 +90,13 @@ export declare class Pipeline {
79
90
  private recordOutcome;
80
91
  private reportValidators;
81
92
  private collectStages;
93
+ /**
94
+ * The writer a stage's merged output is written through: the user writer
95
+ * wrapped with the plugins' {@link PipelinePlugin.beforeStageWrite}
96
+ * transforms, carrying this `stage`'s identity so a transform can mint stable
97
+ * per-`(dataset, stage)` IRIs rather than blank nodes.
98
+ */
99
+ private stageWriter;
82
100
  /**
83
101
  * Run a stage with reporting and return whether it was supported.
84
102
  * Returns `true` if the stage produced results, `false` if NotSupported.
@@ -1 +1 @@
1
- {"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,OAAO,EAAgB,MAAM,cAAc,CAAC;AAGrD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AACrD,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAChD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO,EACL,KAAK,oBAAoB,EAG1B,MAAM,4BAA4B,CAAC;AAKpC,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAO7D,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,0BAA0B,CAAC;AACpE,OAAO,KAAK,EAEV,gBAAgB,EACjB,MAAM,uBAAuB,CAAC;AAE/B,OAAO,EAEL,KAAK,aAAa,EACnB,MAAM,2BAA2B,CAAC;AAEnC,wDAAwD;AACxD,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb;;;;;OAKG;IACH,gBAAgB,CAAC,EAAE,aAAa,CAAC;QAAE,OAAO,EAAE,OAAO,CAAA;KAAE,CAAC,CAAC;CACxD;AAED,MAAM,WAAW,eAAe;IAC9B,eAAe,EAAE,eAAe,CAAC;IACjC,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IAC3B,OAAO,CAAC,EAAE,cAAc,EAAE,CAAC;IAC3B,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,oBAAoB,CAAC,EAAE,oBAAoB,CAAC;IAC5C,QAAQ,CAAC,EAAE;QACT,mBAAmB,EAAE,mBAAmB,CAAC;QACzC,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,QAAQ,CAAC,EAAE,gBAAgB,CAAC;IAC5B;;;;;;OAMG;IACH,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC;;;;;;OAMG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB;;;;;;;;;OASG;IACH,OAAO,CAAC,EAAE,MAAM,aAAa,CAAC;CAC/B;AAgFD,qBAAa,QAAQ;IACnB,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAS;IAC9B,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAkB;IAClD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAU;IACjC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAS;IAChC,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAuB;IAC5D,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAA8B;IACxD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAmB;IAC7C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAsB;IACrD,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAkB;IACnD,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAS;gBAE9B,OAAO,EAAE,eAAe;IA0C9B,GAAG,IAAI,OAAO,CAAC,IAAI,CAAC;YAoBZ,cAAc;IA0I5B,+EAA+E;YACjE,aAAa;YAmBb,gBAAgB;IAW9B,OAAO,CAAE,aAAa;IAOtB;;;OAGG;YACW,QAAQ;IA0CtB,2EAA2E;YAC7D,eAAe;YAqBf,QAAQ;YA2DP,SAAS;CAczB"}
1
+ {"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,OAAO,EAAgB,MAAM,cAAc,CAAC;AAGrD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AACrD,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAChD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO,EACL,KAAK,oBAAoB,EAG1B,MAAM,4BAA4B,CAAC;AAKpC,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAO7D,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,0BAA0B,CAAC;AACpE,OAAO,KAAK,EAEV,gBAAgB,EACjB,MAAM,uBAAuB,CAAC;AAE/B,OAAO,EAEL,KAAK,aAAa,EACnB,MAAM,2BAA2B,CAAC;AAEnC;;;;;;;GAOG;AACH,MAAM,WAAW,uBAAuB;IACtC,OAAO,EAAE,OAAO,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;CACf;AAED,wDAAwD;AACxD,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb;;;;;OAKG;IACH,gBAAgB,CAAC,EAAE,aAAa,CAAC,uBAAuB,CAAC,CAAC;CAC3D;AAED,MAAM,WAAW,eAAe;IAC9B,eAAe,EAAE,eAAe,CAAC;IACjC,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IAC3B,OAAO,CAAC,EAAE,cAAc,EAAE,CAAC;IAC3B,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,oBAAoB,CAAC,EAAE,oBAAoB,CAAC;IAC5C,QAAQ,CAAC,EAAE;QACT,mBAAmB,EAAE,mBAAmB,CAAC;QACzC,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,QAAQ,CAAC,EAAE,gBAAgB,CAAC;IAC5B;;;;;;OAMG;IACH,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC;;;;;;OAMG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB;;;;;;;;;OASG;IACH,OAAO,CAAC,EAAE,MAAM,aAAa,CAAC;CAC/B;AAkFD,qBAAa,QAAQ;IACnB,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAS;IAC9B,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAkB;IAClD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAU;IACjC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAS;IAChC,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAAyC;IAC3E,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAuB;IAC5D,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAA8B;IACxD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAmB;IAC7C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAsB;IACrD,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAkB;IACnD,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAS;gBAE9B,OAAO,EAAE,eAAe;IA2C9B,GAAG,IAAI,OAAO,CAAC,IAAI,CAAC;YAoBZ,cAAc;IA0I5B,+EAA+E;YACjE,aAAa;YAmBb,gBAAgB;IAW9B,OAAO,CAAE,aAAa;IAOtB;;;;;OAKG;IACH,OAAO,CAAC,WAAW;IAMnB;;;OAGG;YACW,QAAQ;IA0CtB,2EAA2E;YAC7D,eAAe;YAqBf,QAAQ;YA+DP,SAAS;CAczB"}
package/dist/pipeline.js CHANGED
@@ -63,15 +63,14 @@ class FanOutWriter {
63
63
  class TransformWriter {
64
64
  inner;
65
65
  transform;
66
- constructor(inner, transform) {
66
+ stage;
67
+ constructor(inner, transform, stage) {
67
68
  this.inner = inner;
68
69
  this.transform = transform;
70
+ this.stage = stage;
69
71
  }
70
72
  async write(dataset, quads) {
71
- await this.inner.write(dataset, this.transform(quads, { dataset }));
72
- }
73
- async flush(dataset) {
74
- await this.inner.flush?.(dataset);
73
+ await this.inner.write(dataset, this.transform(quads, { dataset, stage: this.stage }));
75
74
  }
76
75
  }
77
76
  export class Pipeline {
@@ -79,6 +78,7 @@ export class Pipeline {
79
78
  datasetSelector;
80
79
  stages;
81
80
  writer;
81
+ beforeStageWrite;
82
82
  distributionResolver;
83
83
  chaining;
84
84
  reporter;
@@ -96,17 +96,18 @@ export class Pipeline {
96
96
  this.name = options.name ?? '';
97
97
  this.datasetSelector = options.datasetSelector;
98
98
  this.stages = options.stages;
99
- let writer = Array.isArray(options.writers)
99
+ // The user writer is the post-merge target; the plugins' beforeStageWrite
100
+ // transforms wrap it per stage (see stageWriter) so each carries the stage
101
+ // identity it needs to mint stable, non-fusing IRIs.
102
+ this.writer = Array.isArray(options.writers)
100
103
  ? new FanOutWriter(options.writers)
101
104
  : options.writers;
102
105
  const transforms = options.plugins
103
106
  ?.map((p) => p.beforeStageWrite)
104
107
  .filter((t) => t !== undefined);
105
- if (transforms?.length) {
106
- const composed = (quads, context) => transforms.reduce((q, fn) => fn(q, context), quads);
107
- writer = new TransformWriter(writer, composed);
108
- }
109
- this.writer = writer;
108
+ this.beforeStageWrite = transforms?.length
109
+ ? (quads, context) => transforms.reduce((q, fn) => fn(q, context), quads)
110
+ : undefined;
110
111
  this.distributionResolver =
111
112
  options.distributionResolver ?? new SparqlDistributionResolver();
112
113
  this.chaining = options.chaining;
@@ -210,7 +211,7 @@ export class Pipeline {
210
211
  await this.runChain(dataset, resolved.distribution, stage, timeout);
211
212
  }
212
213
  else {
213
- await this.runStage(dataset, resolved.distribution, stage, this.writer, timeout);
214
+ await this.runStage(dataset, resolved.distribution, stage, this.stageWriter(stage.name), timeout);
214
215
  }
215
216
  }
216
217
  catch (error) {
@@ -269,6 +270,17 @@ export class Pipeline {
269
270
  yield* this.collectStages(stage.stages);
270
271
  }
271
272
  }
273
+ /**
274
+ * The writer a stage's merged output is written through: the user writer
275
+ * wrapped with the plugins' {@link PipelinePlugin.beforeStageWrite}
276
+ * transforms, carrying this `stage`'s identity so a transform can mint stable
277
+ * per-`(dataset, stage)` IRIs rather than blank nodes.
278
+ */
279
+ stageWriter(stage) {
280
+ return this.beforeStageWrite
281
+ ? new TransformWriter(this.writer, this.beforeStageWrite, stage)
282
+ : this.writer;
283
+ }
272
284
  /**
273
285
  * Run a stage with reporting and return whether it was supported.
274
286
  * Returns `true` if the stage produced results, `false` if NotSupported.
@@ -335,8 +347,9 @@ export class Pipeline {
335
347
  currentDistribution = await stageOutputResolver.resolve(childWriter.getOutputPath(dataset));
336
348
  }
337
349
  }
338
- // 3. Concatenate all output files → user writer.
339
- await this.writer.write(dataset, this.readFiles(outputFiles));
350
+ // 3. Concatenate all output files → user writer, applying the plugins'
351
+ // beforeStageWrite transforms once for the chain under the parent stage.
352
+ await this.stageWriter(stage.name).write(dataset, this.readFiles(outputFiles));
340
353
  }
341
354
  finally {
342
355
  await stageOutputResolver.cleanup();
@@ -1,10 +1,7 @@
1
1
  import type { QuadTransform } from '../stage.js';
2
- import type { PipelinePlugin } from '../pipeline.js';
3
- import type { Dataset } from '@lde/dataset';
2
+ import type { BeforeStageWriteContext, PipelinePlugin } from '../pipeline.js';
4
3
  /** QuadTransform that appends PROV-O provenance quads. */
5
- export declare const provenanceTransform: QuadTransform<{
6
- dataset: Dataset;
7
- }>;
4
+ export declare const provenanceTransform: QuadTransform<BeforeStageWriteContext>;
8
5
  /** Pipeline plugin that appends PROV-O provenance to every stage's output. */
9
6
  export declare function provenancePlugin(): PipelinePlugin;
10
7
  //# sourceMappingURL=provenance.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"provenance.d.ts","sourceRoot":"","sources":["../../src/plugin/provenance.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AACjD,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AACrD,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAkB5C,0DAA0D;AAC1D,eAAO,MAAM,mBAAmB,EAAE,aAAa,CAAC;IAAE,OAAO,EAAE,OAAO,CAAA;CAAE,CAGC,CAAC;AAEtE,8EAA8E;AAC9E,wBAAgB,gBAAgB,IAAI,cAAc,CAKjD"}
1
+ {"version":3,"file":"provenance.d.ts","sourceRoot":"","sources":["../../src/plugin/provenance.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AACjD,OAAO,KAAK,EAAE,uBAAuB,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAmB9E,0DAA0D;AAC1D,eAAO,MAAM,mBAAmB,EAAE,aAAa,CAAC,uBAAuB,CAGK,CAAC;AAE7E,8EAA8E;AAC9E,wBAAgB,gBAAgB,IAAI,cAAc,CAKjD"}
@@ -1,5 +1,6 @@
1
+ import { hashSuffix, skolemIri } from '@lde/dataset';
1
2
  import { DataFactory } from 'n3';
2
- const { namedNode, literal, blankNode, quad } = DataFactory;
3
+ const { namedNode, literal, quad } = DataFactory;
3
4
  const RDF_TYPE = namedNode('http://www.w3.org/1999/02/22-rdf-syntax-ns#type');
4
5
  const PROV_ENTITY = namedNode('http://www.w3.org/ns/prov#Entity');
5
6
  const PROV_ACTIVITY = namedNode('http://www.w3.org/ns/prov#Activity');
@@ -8,7 +9,7 @@ const PROV_STARTED_AT_TIME = namedNode('http://www.w3.org/ns/prov#startedAtTime'
8
9
  const PROV_ENDED_AT_TIME = namedNode('http://www.w3.org/ns/prov#endedAtTime');
9
10
  const XSD_DATE_TIME = namedNode('http://www.w3.org/2001/XMLSchema#dateTime');
10
11
  /** QuadTransform that appends PROV-O provenance quads. */
11
- export const provenanceTransform = (quads, { dataset }) => appendProvenanceQuads(quads, dataset.iri.toString(), new Date());
12
+ export const provenanceTransform = (quads, { dataset, stage }) => appendProvenanceQuads(quads, dataset.iri.toString(), stage, new Date());
12
13
  /** Pipeline plugin that appends PROV-O provenance to every stage's output. */
13
14
  export function provenancePlugin() {
14
15
  return {
@@ -16,13 +17,20 @@ export function provenancePlugin() {
16
17
  beforeStageWrite: provenanceTransform,
17
18
  };
18
19
  }
19
- async function* appendProvenanceQuads(quads, iri, startedAt) {
20
+ async function* appendProvenanceQuads(quads, iri, stage, startedAt) {
20
21
  for await (const q of quads) {
21
22
  yield q;
22
23
  }
23
24
  const endedAt = new Date();
24
25
  const subject = namedNode(iri);
25
- const activity = blankNode();
26
+ // Skolemise the activity to a stable IRI keyed on (dataset, stage) instead of
27
+ // a blank node. Per-dataset outputs are merged into one graph (the DKG index
28
+ // cats every dataset's file together), where blank-node labels are not unique
29
+ // across documents and would fuse unrelated activities into one node — several
30
+ // datasets would wrongly be prov:wasGeneratedBy the same prov:Activity (see issue #474).
31
+ // The IRI also makes a re-run idempotent: same (dataset, stage) → same node.
32
+ // The `.well-known/prov#activity-<hash>` shape mirrors the linkset skolem.
33
+ const activity = namedNode(skolemIri(`${iri}/.well-known/prov#activity`, hashSuffix(stage)));
26
34
  yield quad(subject, RDF_TYPE, PROV_ENTITY);
27
35
  yield quad(subject, PROV_WAS_GENERATED_BY, activity);
28
36
  yield quad(activity, RDF_TYPE, PROV_ACTIVITY);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lde/pipeline",
3
- "version": "0.30.13",
3
+ "version": "0.30.15",
4
4
  "repository": {
5
5
  "url": "git+https://github.com/ldelements/lde.git",
6
6
  "directory": "packages/pipeline"
@@ -24,10 +24,10 @@
24
24
  "!**/*.tsbuildinfo"
25
25
  ],
26
26
  "dependencies": {
27
- "@lde/dataset": "0.7.6",
28
- "@lde/dataset-registry-client": "0.8.2",
29
- "@lde/distribution-probe": "0.1.9",
30
- "@lde/sparql-importer": "0.6.4",
27
+ "@lde/dataset": "0.7.7",
28
+ "@lde/dataset-registry-client": "0.8.3",
29
+ "@lde/distribution-probe": "0.1.10",
30
+ "@lde/sparql-importer": "0.6.5",
31
31
  "@lde/sparql-server": "0.4.11",
32
32
  "@rdfjs/types": "^2.0.1",
33
33
  "@traqula/generator-sparql-1-1": "^1.1.4",