@lde/pipeline 0.30.11 → 0.30.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -276,6 +276,63 @@ Writes generated quads to a destination:
276
276
  - `SparqlUpdateWriter` — writes to a SPARQL endpoint via UPDATE queries
277
277
  - `FileWriter` — writes to local files
278
278
 
279
+ ### Provenance store
280
+
281
+ A `ProvenanceStore` gives the pipeline a small per-dataset memory, so a future run can skip datasets that are genuinely unchanged. It is purely a storage seam: the framework owns the skip decision (see [`sourceFingerprint`](#source-change-fingerprint) and `shouldReprocess`); the store owns only how each record is persisted.
282
+
283
+ ```typescript
284
+ interface ProvenanceStore {
285
+ get(datasetUri: URL): Promise<ProcessingRecord | null>;
286
+ set(datasetUri: URL, record: ProcessingRecord): Promise<void>;
287
+ }
288
+ ```
289
+
290
+ A `ProcessingRecord` holds the two opaque change fields — `sourceFingerprint` (derived automatically from source metadata) and `pipelineVersion` (consumer-declared) — plus `generatedAt` and a `status` of `'success'` or `'failed'`. The two change fields are compared only for equality, never parsed or ordered.
291
+
292
+ #### `FileLoadedSparqlProvenanceStore`
293
+
294
+ The reference implementation targets a triplestore that is served read-only and rebuilt by bulk-loading files (e.g. [QLever](https://github.com/ad-freiburg/qlever)). It reads through SPARQL queries against the live endpoint, and writes records as files for the next bulk-load — because the endpoint accepts no SPARQL UPDATE.
295
+
296
+ ```typescript
297
+ import { FileLoadedSparqlProvenanceStore } from '@lde/pipeline';
298
+
299
+ const store = new FileLoadedSparqlProvenanceStore({
300
+ queryEndpoint: new URL('http://localhost:7001/sparql'),
301
+ pipelineIri: new URL('https://example.org/pipelines/dkg'),
302
+ outputDir: './provenance',
303
+ });
304
+ ```
305
+
306
+ - `get` runs a named-graph-scoped SPARQL `SELECT` against `queryEndpoint`, reading the records a previous run loaded.
307
+ - `set` writes one flat [PROV-O](https://www.w3.org/TR/prov-o/) N-Quads file per dataset into `outputDir`, in the pipeline-scoped named graph, to be bulk-loaded after the run.
308
+
309
+ Each record is stored as flat PROV-O on the dataset entity — `prov:generatedAtTime` plus `sourceFingerprint`, `pipelineVersion` and `status` under the `https://w3id.org/lde/provenance#` namespace. Scoping every record by `pipelineIri` (used as the named graph) lets multiple pipelines share one triplestore without colliding.
310
+
311
+ #### Enabling skipping
312
+
313
+ Skipping is opt-in. Pass a `provenanceStore` and a `pipelineVersion` to the `Pipeline`:
314
+
315
+ ```typescript
316
+ new Pipeline({
317
+ // …
318
+ provenanceStore: store,
319
+ pipelineVersion: 'v3', // rotate only on releases that change output
320
+ });
321
+ ```
322
+
323
+ For each dataset the pipeline probes its distributions, derives the source-change fingerprint, reads the stored record, and **skips before importing** when both change fields match:
324
+
325
+ ```
326
+ skip iff recorded.sourceFingerprint === current.sourceFingerprint
327
+ AND recorded.pipelineVersion === current.pipelineVersion
328
+ ```
329
+
330
+ Otherwise it imports (if needed), runs the stages, and writes an updated record. `pipelineVersion` is consumer-owned and opaque: rotate it only on releases that change output; after a rotation, every dataset is reprocessed on the next run. It is **required** when a `provenanceStore` is configured (a skip-enabled pipeline with no version would silently freeze); when no store is configured, every dataset is reprocessed — today’s behaviour. A dataset whose processing failed is recorded as `'failed'` and is then skipped on later runs until its source changes or the version rotates, so a deterministically failing import is not retried every run.
331
+
332
+ ### Source-change fingerprint
333
+
334
+ `sourceFingerprint(distribution, probeResult)` derives a cheap, opaque change signal for a distribution from metadata the probe already collected — no body download. For a data dump it combines the most recent of the register’s `dct:modified` and the artifact’s HTTP `Last-Modified` with the byte size (the probe’s `Content-Length`, falling back to the declared `dcat:byteSize`). It returns `null` for a live SPARQL endpoint, or when no date and no size can be established — a `null` fingerprint never compares equal, so such a distribution is always reprocessed.
335
+
279
336
  ### Plugins
280
337
 
281
338
  Plugins hook into the pipeline lifecycle via the `PipelinePlugin` interface. Register them in the `plugins` array when constructing a `Pipeline`.
@@ -1,6 +1,7 @@
1
+ import { type Dataset } from '@lde/dataset';
1
2
  import type { Importer } from '@lde/sparql-importer';
2
3
  import type { SparqlServer } from '@lde/sparql-server';
3
- import { type DistributionResolver, NoDistributionAvailable, ResolvedDistribution } from './resolver.js';
4
+ import { type DistributionResolver, type ResolveCallbacks, NoDistributionAvailable, ProbedDistributions, ResolvedDistribution } from './resolver.js';
4
5
  export interface ImportResolverOptions {
5
6
  importer: Importer;
6
7
  server: SparqlServer;
@@ -26,12 +27,28 @@ export interface ImportResolverOptions {
26
27
  * adds the ability to import a data dump into a local SPARQL server. The
27
28
  * {@link ImportResolverOptions.strategy | strategy} option controls whether the
28
29
  * inner resolver's SPARQL endpoint is preferred or bypassed.
30
+ *
31
+ * The split is preserved across both phases: {@link probe} chooses the
32
+ * {@link ProbedSource} (the inner SPARQL endpoint, or the preferred importable
33
+ * data dump) without importing; {@link resolve} performs the import only when
34
+ * that source is a data dump.
29
35
  */
30
36
  export declare class ImportResolver implements DistributionResolver {
31
37
  private readonly inner;
32
38
  private readonly options;
33
39
  constructor(inner: DistributionResolver, options: ImportResolverOptions);
34
- resolve(...args: Parameters<DistributionResolver['resolve']>): Promise<ResolvedDistribution | NoDistributionAvailable>;
40
+ probe(dataset: Dataset, callbacks?: ResolveCallbacks): Promise<ProbedDistributions>;
41
+ resolve(probed: ProbedDistributions, callbacks?: ResolveCallbacks): Promise<ResolvedDistribution | NoDistributionAvailable>;
42
+ /**
43
+ * The preferred importable data dump and its probe result, or `null` if no
44
+ * downloadable distribution passed probing.
45
+ */
46
+ private selectImportCandidate;
47
+ /**
48
+ * Downloadable distributions whose access URL passed probing, in preference
49
+ * order (compressed first, see {@link Dataset.getDownloadDistributions}).
50
+ */
51
+ private importCandidates;
35
52
  private importDataset;
36
53
  cleanup(): Promise<void>;
37
54
  }
@@ -1 +1 @@
1
- {"version":3,"file":"importResolver.d.ts","sourceRoot":"","sources":["../../src/distribution/importResolver.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AAMrD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AACvD,OAAO,EACL,KAAK,oBAAoB,EAEzB,uBAAuB,EACvB,oBAAoB,EACrB,MAAM,eAAe,CAAC;AAGvB,MAAM,WAAW,qBAAqB;IACpC,QAAQ,EAAE,QAAQ,CAAC;IACnB,MAAM,EAAE,YAAY,CAAC;IACrB;;;;;;;;;;;;OAYG;IACH,QAAQ,CAAC,EAAE,QAAQ,GAAG,QAAQ,CAAC;CAChC;AAED;;;;;;;GAOG;AACH,qBAAa,cAAe,YAAW,oBAAoB;IAEvD,OAAO,CAAC,QAAQ,CAAC,KAAK;IACtB,OAAO,CAAC,QAAQ,CAAC,OAAO;gBADP,KAAK,EAAE,oBAAoB,EAC3B,OAAO,EAAE,qBAAqB;IAG3C,OAAO,CACX,GAAG,IAAI,EAAE,UAAU,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,GACnD,OAAO,CAAC,oBAAoB,GAAG,uBAAuB,CAAC;YAgB5C,aAAa;IA2GrB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAG/B"}
1
+ {"version":3,"file":"importResolver.d.ts","sourceRoot":"","sources":["../../src/distribution/importResolver.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,OAAO,EAAgB,MAAM,cAAc,CAAC;AAC1D,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AAMrD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AACvD,OAAO,EACL,KAAK,oBAAoB,EAEzB,KAAK,gBAAgB,EACrB,uBAAuB,EACvB,mBAAmB,EACnB,oBAAoB,EACrB,MAAM,eAAe,CAAC;AAGvB,MAAM,WAAW,qBAAqB;IACpC,QAAQ,EAAE,QAAQ,CAAC;IACnB,MAAM,EAAE,YAAY,CAAC;IACrB;;;;;;;;;;;;OAYG;IACH,QAAQ,CAAC,EAAE,QAAQ,GAAG,QAAQ,CAAC;CAChC;AAED;;;;;;;;;;;;GAYG;AACH,qBAAa,cAAe,YAAW,oBAAoB;IAEvD,OAAO,CAAC,QAAQ,CAAC,KAAK;IACtB,OAAO,CAAC,QAAQ,CAAC,OAAO;gBADP,KAAK,EAAE,oBAAoB,EAC3B,OAAO,EAAE,qBAAqB;IAG3C,KAAK,CACT,OAAO,EAAE,OAAO,EAChB,SAAS,CAAC,EAAE,gBAAgB,GAC3B,OAAO,CAAC,mBAAmB,CAAC;IAgBzB,OAAO,CACX,MAAM,EAAE,mBAAmB,EAC3B,SAAS,CAAC,EAAE,gBAAgB,GAC3B,OAAO,CAAC,oBAAoB,GAAG,uBAAuB,CAAC;IAoB1D;;;OAGG;IACH,OAAO,CAAC,qBAAqB;IAY7B;;;OAGG;IACH,OAAO,CAAC,gBAAgB;YAeV,aAAa;IAmGrB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAG/B"}
@@ -1,6 +1,6 @@
1
1
  import { Distribution } from '@lde/dataset';
2
2
  import { ImportFailed, ImportSuccessful, NotSupported, } from '@lde/sparql-importer';
3
- import { NoDistributionAvailable, ResolvedDistribution, } from './resolver.js';
3
+ import { NoDistributionAvailable, ProbedDistributions, ResolvedDistribution, } from './resolver.js';
4
4
  import { NetworkError } from '@lde/distribution-probe';
5
5
  /**
6
6
  * A {@link DistributionResolver} decorator that adds data-dump import logic.
@@ -9,6 +9,11 @@ import { NetworkError } from '@lde/distribution-probe';
9
9
  * adds the ability to import a data dump into a local SPARQL server. The
10
10
  * {@link ImportResolverOptions.strategy | strategy} option controls whether the
11
11
  * inner resolver's SPARQL endpoint is preferred or bypassed.
12
+ *
13
+ * The split is preserved across both phases: {@link probe} chooses the
14
+ * {@link ProbedSource} (the inner SPARQL endpoint, or the preferred importable
15
+ * data dump) without importing; {@link resolve} performs the import only when
16
+ * that source is a data dump.
12
17
  */
13
18
  export class ImportResolver {
14
19
  inner;
@@ -17,24 +22,54 @@ export class ImportResolver {
17
22
  this.inner = inner;
18
23
  this.options = options;
19
24
  }
20
- async resolve(...args) {
21
- const [dataset, callbacks] = args;
22
- const result = await this.inner.resolve(...args);
23
- // 'sparql' strategy (default): use SPARQL endpoint if inner found one.
24
- if (this.options.strategy !== 'import' &&
25
- result instanceof ResolvedDistribution) {
26
- return result;
25
+ async probe(dataset, callbacks) {
26
+ const probed = await this.inner.probe(dataset, callbacks);
27
+ // 'sparql' strategy (default): keep the inner SPARQL endpoint if found.
28
+ if (this.options.strategy !== 'import' && probed.source) {
29
+ return probed;
27
30
  }
28
- // Either 'import' strategy or inner found nothing: import a data dump.
29
- return this.importDataset(dataset, result.probeResults, callbacks);
31
+ // Either 'import' strategy or no SPARQL endpoint: select a data dump to
32
+ // import. Choosing the candidate here (not in resolve) keeps the import
33
+ // cost out of the probe phase while still letting the pipeline fingerprint
34
+ // the dump it would import.
35
+ const source = this.selectImportCandidate(dataset, probed.probeResults);
36
+ return new ProbedDistributions(dataset, probed.probeResults, source);
30
37
  }
31
- async importDataset(dataset, probeResults, callbacks) {
38
+ async resolve(probed, callbacks) {
39
+ if (!probed.source) {
40
+ return new NoDistributionAvailable(probed.dataset, 'No importable distributions passed probing', probed.probeResults);
41
+ }
42
+ // A SPARQL endpoint source needs no import.
43
+ if (probed.source.distribution.isSparql()) {
44
+ return new ResolvedDistribution(probed.source.distribution, probed.probeResults);
45
+ }
46
+ return this.importDataset(probed.dataset, probed.probeResults, callbacks);
47
+ }
48
+ /**
49
+ * The preferred importable data dump and its probe result, or `null` if no
50
+ * downloadable distribution passed probing.
51
+ */
52
+ selectImportCandidate(dataset, probeResults) {
53
+ const candidate = this.importCandidates(dataset, probeResults)[0];
54
+ if (!candidate)
55
+ return null;
56
+ const probeResult = probeResults.find((result) => result.url === candidate.accessUrl.toString());
57
+ return probeResult ? { distribution: candidate, probeResult } : null;
58
+ }
59
+ /**
60
+ * Downloadable distributions whose access URL passed probing, in preference
61
+ * order (compressed first, see {@link Dataset.getDownloadDistributions}).
62
+ */
63
+ importCandidates(dataset, probeResults) {
32
64
  const successfulUrls = new Set(probeResults
33
65
  .filter((r) => !(r instanceof NetworkError) && r.isSuccess())
34
66
  .map((r) => r.url));
35
- const candidates = dataset
67
+ return dataset
36
68
  .getDownloadDistributions()
37
69
  .filter((d) => d.accessUrl && successfulUrls.has(d.accessUrl.toString()));
70
+ }
71
+ async importDataset(dataset, probeResults, callbacks) {
72
+ const candidates = this.importCandidates(dataset, probeResults);
38
73
  // Establish a trustworthy change signal for the downloader so it can skip
39
74
  // redundant downloads (and preserve the QLever index cache). For a data
40
75
  // dump the authoritative date is the most recent of the register’s declared
@@ -1,6 +1,6 @@
1
1
  export { probe, NetworkError, SparqlProbeResult, DataDumpProbeResult, type ProbeResultType, } from '@lde/distribution-probe';
2
2
  export { probeResultsToQuads } from './report.js';
3
3
  export { ImportResolver, type ImportResolverOptions, } from './importResolver.js';
4
- export { ResolvedDistribution, NoDistributionAvailable, SparqlDistributionResolver, type DistributionResolver, type ResolveCallbacks, type SparqlDistributionResolverOptions, } from './resolver.js';
4
+ export { ResolvedDistribution, NoDistributionAvailable, ProbedDistributions, SparqlDistributionResolver, type DistributionResolver, type ProbedSource, type ResolveCallbacks, type SparqlDistributionResolverOptions, } from './resolver.js';
5
5
  export { resolveDistributions, type DistributionStageResult, } from './resolveDistributions.js';
6
6
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/distribution/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,KAAK,EACL,YAAY,EACZ,iBAAiB,EACjB,mBAAmB,EACnB,KAAK,eAAe,GACrB,MAAM,yBAAyB,CAAC;AAEjC,OAAO,EAAE,mBAAmB,EAAE,MAAM,aAAa,CAAC;AAElD,OAAO,EACL,cAAc,EACd,KAAK,qBAAqB,GAC3B,MAAM,qBAAqB,CAAC;AAE7B,OAAO,EACL,oBAAoB,EACpB,uBAAuB,EACvB,0BAA0B,EAC1B,KAAK,oBAAoB,EACzB,KAAK,gBAAgB,EACrB,KAAK,iCAAiC,GACvC,MAAM,eAAe,CAAC;AAEvB,OAAO,EACL,oBAAoB,EACpB,KAAK,uBAAuB,GAC7B,MAAM,2BAA2B,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/distribution/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,KAAK,EACL,YAAY,EACZ,iBAAiB,EACjB,mBAAmB,EACnB,KAAK,eAAe,GACrB,MAAM,yBAAyB,CAAC;AAEjC,OAAO,EAAE,mBAAmB,EAAE,MAAM,aAAa,CAAC;AAElD,OAAO,EACL,cAAc,EACd,KAAK,qBAAqB,GAC3B,MAAM,qBAAqB,CAAC;AAE7B,OAAO,EACL,oBAAoB,EACpB,uBAAuB,EACvB,mBAAmB,EACnB,0BAA0B,EAC1B,KAAK,oBAAoB,EACzB,KAAK,YAAY,EACjB,KAAK,gBAAgB,EACrB,KAAK,iCAAiC,GACvC,MAAM,eAAe,CAAC;AAEvB,OAAO,EACL,oBAAoB,EACpB,KAAK,uBAAuB,GAC7B,MAAM,2BAA2B,CAAC"}
@@ -1,5 +1,5 @@
1
1
  export { probe, NetworkError, SparqlProbeResult, DataDumpProbeResult, } from '@lde/distribution-probe';
2
2
  export { probeResultsToQuads } from './report.js';
3
3
  export { ImportResolver, } from './importResolver.js';
4
- export { ResolvedDistribution, NoDistributionAvailable, SparqlDistributionResolver, } from './resolver.js';
4
+ export { ResolvedDistribution, NoDistributionAvailable, ProbedDistributions, SparqlDistributionResolver, } from './resolver.js';
5
5
  export { resolveDistributions, } from './resolveDistributions.js';
@@ -1 +1 @@
1
- {"version":3,"file":"resolveDistributions.d.ts","sourceRoot":"","sources":["../../src/distribution/resolveDistributions.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,YAAY,EAAE,KAAK,OAAO,EAAE,MAAM,cAAc,CAAC;AAC/D,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAE/D,OAAO,EACL,KAAK,oBAAoB,EAE1B,MAAM,eAAe,CAAC;AAEvB,MAAM,WAAW,uBAAuB;IACtC,YAAY,EAAE,YAAY,GAAG,IAAI,CAAC;IAClC,YAAY,EAAE,eAAe,EAAE,CAAC;IAChC,KAAK,EAAE,aAAa,CAAC,IAAI,CAAC,CAAC;CAC5B;AAED,wBAAsB,oBAAoB,CACxC,OAAO,EAAE,OAAO,EAChB,QAAQ,EAAE,oBAAoB,GAC7B,OAAO,CAAC,uBAAuB,CAAC,CAoBlC"}
1
+ {"version":3,"file":"resolveDistributions.d.ts","sourceRoot":"","sources":["../../src/distribution/resolveDistributions.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,YAAY,EAAE,KAAK,OAAO,EAAE,MAAM,cAAc,CAAC;AAC/D,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAE/D,OAAO,EACL,KAAK,oBAAoB,EAE1B,MAAM,eAAe,CAAC;AAEvB,MAAM,WAAW,uBAAuB;IACtC,YAAY,EAAE,YAAY,GAAG,IAAI,CAAC;IAClC,YAAY,EAAE,eAAe,EAAE,CAAC;IAChC,KAAK,EAAE,aAAa,CAAC,IAAI,CAAC,CAAC;CAC5B;AAED,wBAAsB,oBAAoB,CACxC,OAAO,EAAE,OAAO,EAChB,QAAQ,EAAE,oBAAoB,GAC7B,OAAO,CAAC,uBAAuB,CAAC,CAqBlC"}
@@ -1,7 +1,8 @@
1
1
  import { probeResultsToQuads } from './report.js';
2
2
  import { NoDistributionAvailable, } from './resolver.js';
3
3
  export async function resolveDistributions(dataset, resolver) {
4
- const result = await resolver.resolve(dataset);
4
+ const probed = await resolver.probe(dataset);
5
+ const result = await resolver.resolve(probed);
5
6
  if (result instanceof NoDistributionAvailable) {
6
7
  return {
7
8
  distribution: null,
@@ -16,34 +16,68 @@ export declare class NoDistributionAvailable {
16
16
  readonly importFailed?: ImportFailed | undefined;
17
17
  constructor(dataset: Dataset, message: string, probeResults: ProbeResultType[], importFailed?: ImportFailed | undefined);
18
18
  }
19
- /** Callbacks fired during distribution resolution. */
19
+ /**
20
+ * The distribution a dataset will be processed from, paired with its probe
21
+ * result. Drives the source-change fingerprint: a live SPARQL endpoint yields
22
+ * `null` (always reprocess), a data dump yields its change fingerprint.
23
+ */
24
+ export interface ProbedSource {
25
+ distribution: Distribution;
26
+ probeResult: ProbeResultType;
27
+ }
28
+ /**
29
+ * The outcome of the probe phase: every distribution’s probe result, plus the
30
+ * {@link ProbedSource} that will be used to process the dataset (or `null` if
31
+ * none is available). Determined without importing, so the pipeline can decide
32
+ * to skip a dataset before paying the import cost.
33
+ */
34
+ export declare class ProbedDistributions {
35
+ readonly dataset: Dataset;
36
+ readonly probeResults: ProbeResultType[];
37
+ readonly source: ProbedSource | null;
38
+ constructor(dataset: Dataset, probeResults: ProbeResultType[], source: ProbedSource | null);
39
+ }
40
+ /** Callbacks fired during distribution probing and resolution. */
20
41
  export interface ResolveCallbacks {
21
- /** Called each time a single distribution probe completes. */
42
+ /** Called each time a single distribution probe completes (probe phase). */
22
43
  onProbe?: (distribution: Distribution, result: ProbeResultType) => void;
23
- /** Called when a data-dump import begins. */
44
+ /** Called when a data-dump import begins (resolve phase). */
24
45
  onImportStart?: () => void;
25
- /** Called when importing a distribution fails. */
46
+ /** Called when importing a distribution fails (resolve phase). */
26
47
  onImportFailed?: (distribution: Distribution, error: string) => void;
27
48
  }
49
+ /**
50
+ * Resolves a dataset to a usable distribution in two phases so the pipeline can
51
+ * gate on a dataset’s source-change fingerprint before paying any import cost:
52
+ *
53
+ * 1. {@link probe} probes every distribution and selects the source-to-be,
54
+ * without importing.
55
+ * 2. {@link resolve} turns that probed source into a usable SPARQL endpoint,
56
+ * importing a data dump only when the source is one.
57
+ */
28
58
  export interface DistributionResolver {
29
- resolve(dataset: Dataset, callbacks?: ResolveCallbacks): Promise<ResolvedDistribution | NoDistributionAvailable>;
59
+ probe(dataset: Dataset, callbacks?: ResolveCallbacks): Promise<ProbedDistributions>;
60
+ resolve(probed: ProbedDistributions, callbacks?: ResolveCallbacks): Promise<ResolvedDistribution | NoDistributionAvailable>;
30
61
  cleanup?(): Promise<void>;
31
62
  }
32
63
  export interface SparqlDistributionResolverOptions {
33
64
  timeout?: number;
34
65
  }
35
66
  /**
36
- * Resolves a dataset to a usable SPARQL distribution by probing its distributions.
67
+ * Resolves a dataset to its own SPARQL endpoint by probing its distributions.
37
68
  *
38
- * 1. Probes all distributions in parallel.
39
- * 2. Returns the first valid SPARQL endpoint as a `ResolvedDistribution`.
40
- * 3. If none: returns `NoDistributionAvailable`.
69
+ * {@link probe} returns the first valid SPARQL endpoint as the
70
+ * {@link ProbedSource}; {@link resolve} returns it as a
71
+ * {@link ResolvedDistribution}, or {@link NoDistributionAvailable} when none
72
+ * responded. Never imports a data dump – wrap with {@link ImportResolver} for
73
+ * that.
41
74
  *
42
75
  * Does not mutate `dataset.distributions`.
43
76
  */
44
77
  export declare class SparqlDistributionResolver implements DistributionResolver {
45
78
  private readonly timeout;
46
79
  constructor(options?: SparqlDistributionResolverOptions);
47
- resolve(dataset: Dataset, callbacks?: ResolveCallbacks): Promise<ResolvedDistribution | NoDistributionAvailable>;
80
+ probe(dataset: Dataset, callbacks?: ResolveCallbacks): Promise<ProbedDistributions>;
81
+ resolve(probed: ProbedDistributions): Promise<ResolvedDistribution | NoDistributionAvailable>;
48
82
  }
49
83
  //# sourceMappingURL=resolver.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"resolver.d.ts","sourceRoot":"","sources":["../../src/distribution/resolver.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACrD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,EAGL,KAAK,eAAe,EACrB,MAAM,yBAAyB,CAAC;AAEjC,qBAAa,oBAAoB;IAE7B,QAAQ,CAAC,YAAY,EAAE,YAAY;IACnC,QAAQ,CAAC,YAAY,EAAE,eAAe,EAAE;IACxC,QAAQ,CAAC,YAAY,CAAC,EAAE,YAAY;IACpC,QAAQ,CAAC,cAAc,CAAC,EAAE,MAAM;IAChC,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM;gBAJpB,YAAY,EAAE,YAAY,EAC1B,YAAY,EAAE,eAAe,EAAE,EAC/B,YAAY,CAAC,EAAE,YAAY,YAAA,EAC3B,cAAc,CAAC,EAAE,MAAM,YAAA,EACvB,WAAW,CAAC,EAAE,MAAM,YAAA;CAEhC;AAED,qBAAa,uBAAuB;IAEhC,QAAQ,CAAC,OAAO,EAAE,OAAO;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM;IACxB,QAAQ,CAAC,YAAY,EAAE,eAAe,EAAE;IACxC,QAAQ,CAAC,YAAY,CAAC,EAAE,YAAY;gBAH3B,OAAO,EAAE,OAAO,EAChB,OAAO,EAAE,MAAM,EACf,YAAY,EAAE,eAAe,EAAE,EAC/B,YAAY,CAAC,EAAE,YAAY,YAAA;CAEvC;AAED,sDAAsD;AACtD,MAAM,WAAW,gBAAgB;IAC/B,8DAA8D;IAC9D,OAAO,CAAC,EAAE,CAAC,YAAY,EAAE,YAAY,EAAE,MAAM,EAAE,eAAe,KAAK,IAAI,CAAC;IACxE,6CAA6C;IAC7C,aAAa,CAAC,EAAE,MAAM,IAAI,CAAC;IAC3B,kDAAkD;IAClD,cAAc,CAAC,EAAE,CAAC,YAAY,EAAE,YAAY,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;CACtE;AAED,MAAM,WAAW,oBAAoB;IACnC,OAAO,CACL,OAAO,EAAE,OAAO,EAChB,SAAS,CAAC,EAAE,gBAAgB,GAC3B,OAAO,CAAC,oBAAoB,GAAG,uBAAuB,CAAC,CAAC;IAC3D,OAAO,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CAC3B;AAED,MAAM,WAAW,iCAAiC;IAChD,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;;;;;;;GAQG;AACH,qBAAa,0BAA2B,YAAW,oBAAoB;IACrE,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;gBAErB,OAAO,CAAC,EAAE,iCAAiC;IAIjD,OAAO,CACX,OAAO,EAAE,OAAO,EAChB,SAAS,CAAC,EAAE,gBAAgB,GAC3B,OAAO,CAAC,oBAAoB,GAAG,uBAAuB,CAAC;CA6B3D"}
1
+ {"version":3,"file":"resolver.d.ts","sourceRoot":"","sources":["../../src/distribution/resolver.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACrD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,EAGL,KAAK,eAAe,EACrB,MAAM,yBAAyB,CAAC;AAEjC,qBAAa,oBAAoB;IAE7B,QAAQ,CAAC,YAAY,EAAE,YAAY;IACnC,QAAQ,CAAC,YAAY,EAAE,eAAe,EAAE;IACxC,QAAQ,CAAC,YAAY,CAAC,EAAE,YAAY;IACpC,QAAQ,CAAC,cAAc,CAAC,EAAE,MAAM;IAChC,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM;gBAJpB,YAAY,EAAE,YAAY,EAC1B,YAAY,EAAE,eAAe,EAAE,EAC/B,YAAY,CAAC,EAAE,YAAY,YAAA,EAC3B,cAAc,CAAC,EAAE,MAAM,YAAA,EACvB,WAAW,CAAC,EAAE,MAAM,YAAA;CAEhC;AAED,qBAAa,uBAAuB;IAEhC,QAAQ,CAAC,OAAO,EAAE,OAAO;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM;IACxB,QAAQ,CAAC,YAAY,EAAE,eAAe,EAAE;IACxC,QAAQ,CAAC,YAAY,CAAC,EAAE,YAAY;gBAH3B,OAAO,EAAE,OAAO,EAChB,OAAO,EAAE,MAAM,EACf,YAAY,EAAE,eAAe,EAAE,EAC/B,YAAY,CAAC,EAAE,YAAY,YAAA;CAEvC;AAED;;;;GAIG;AACH,MAAM,WAAW,YAAY;IAC3B,YAAY,EAAE,YAAY,CAAC;IAC3B,WAAW,EAAE,eAAe,CAAC;CAC9B;AAED;;;;;GAKG;AACH,qBAAa,mBAAmB;IAE5B,QAAQ,CAAC,OAAO,EAAE,OAAO;IACzB,QAAQ,CAAC,YAAY,EAAE,eAAe,EAAE;IACxC,QAAQ,CAAC,MAAM,EAAE,YAAY,GAAG,IAAI;gBAF3B,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,eAAe,EAAE,EAC/B,MAAM,EAAE,YAAY,GAAG,IAAI;CAEvC;AAED,kEAAkE;AAClE,MAAM,WAAW,gBAAgB;IAC/B,4EAA4E;IAC5E,OAAO,CAAC,EAAE,CAAC,YAAY,EAAE,YAAY,EAAE,MAAM,EAAE,eAAe,KAAK,IAAI,CAAC;IACxE,6DAA6D;IAC7D,aAAa,CAAC,EAAE,MAAM,IAAI,CAAC;IAC3B,kEAAkE;IAClE,cAAc,CAAC,EAAE,CAAC,YAAY,EAAE,YAAY,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;CACtE;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,oBAAoB;IACnC,KAAK,CACH,OAAO,EAAE,OAAO,EAChB,SAAS,CAAC,EAAE,gBAAgB,GAC3B,OAAO,CAAC,mBAAmB,CAAC,CAAC;IAChC,OAAO,CACL,MAAM,EAAE,mBAAmB,EAC3B,SAAS,CAAC,EAAE,gBAAgB,GAC3B,OAAO,CAAC,oBAAoB,GAAG,uBAAuB,CAAC,CAAC;IAC3D,OAAO,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CAC3B;AAED,MAAM,WAAW,iCAAiC;IAChD,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;;;;;;;;;GAUG;AACH,qBAAa,0BAA2B,YAAW,oBAAoB;IACrE,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;gBAErB,OAAO,CAAC,EAAE,iCAAiC;IAIjD,KAAK,CACT,OAAO,EAAE,OAAO,EAChB,SAAS,CAAC,EAAE,gBAAgB,GAC3B,OAAO,CAAC,mBAAmB,C
AAC;IA4BzB,OAAO,CACX,MAAM,EAAE,mBAAmB,GAC1B,OAAO,CAAC,oBAAoB,GAAG,uBAAuB,CAAC;CAc3D"}
@@ -26,11 +26,29 @@ export class NoDistributionAvailable {
26
26
  }
27
27
  }
28
28
  /**
29
- * Resolves a dataset to a usable SPARQL distribution by probing its distributions.
29
+ * The outcome of the probe phase: every distribution’s probe result, plus the
30
+ * {@link ProbedSource} that will be used to process the dataset (or `null` if
31
+ * none is available). Determined without importing, so the pipeline can decide
32
+ * to skip a dataset before paying the import cost.
33
+ */
34
+ export class ProbedDistributions {
35
+ dataset;
36
+ probeResults;
37
+ source;
38
+ constructor(dataset, probeResults, source) {
39
+ this.dataset = dataset;
40
+ this.probeResults = probeResults;
41
+ this.source = source;
42
+ }
43
+ }
44
+ /**
45
+ * Resolves a dataset to its own SPARQL endpoint by probing its distributions.
30
46
  *
31
- * 1. Probes all distributions in parallel.
32
- * 2. Returns the first valid SPARQL endpoint as a `ResolvedDistribution`.
33
- * 3. If none: returns `NoDistributionAvailable`.
47
+ * {@link probe} returns the first valid SPARQL endpoint as the
48
+ * {@link ProbedSource}; {@link resolve} returns it as a
49
+ * {@link ResolvedDistribution}, or {@link NoDistributionAvailable} when none
50
+ * responded. Never imports a data dump – wrap with {@link ImportResolver} for
51
+ * that.
34
52
  *
35
53
  * Does not mutate `dataset.distributions`.
36
54
  */
@@ -39,22 +57,30 @@ export class SparqlDistributionResolver {
39
57
  constructor(options) {
40
58
  this.timeout = options?.timeout ?? 5000;
41
59
  }
42
- async resolve(dataset, callbacks) {
60
+ async probe(dataset, callbacks) {
43
61
  const results = await Promise.all(dataset.distributions.map(async (distribution) => {
44
62
  const result = await probe(distribution, { timeoutMs: this.timeout });
45
63
  callbacks?.onProbe?.(distribution, result);
46
64
  return result;
47
65
  }));
48
66
  // Find first valid SPARQL endpoint.
67
+ let source = null;
49
68
  for (let i = 0; i < dataset.distributions.length; i++) {
50
69
  const distribution = dataset.distributions[i];
51
70
  const result = results[i];
52
71
  if (distribution.isSparql() &&
53
72
  result instanceof SparqlProbeResult &&
54
73
  result.isSuccess()) {
55
- return new ResolvedDistribution(distribution, results);
74
+ source = { distribution, probeResult: result };
75
+ break;
56
76
  }
57
77
  }
58
- return new NoDistributionAvailable(dataset, 'No SPARQL endpoint available', results);
78
+ return new ProbedDistributions(dataset, results, source);
79
+ }
80
+ async resolve(probed) {
81
+ if (probed.source && probed.source.distribution.isSparql()) {
82
+ return new ResolvedDistribution(probed.source.distribution, probed.probeResults);
83
+ }
84
+ return new NoDistributionAvailable(probed.dataset, 'No SPARQL endpoint available', probed.probeResults);
59
85
  }
60
86
  }
@@ -4,6 +4,7 @@ import { Stage } from './stage.js';
4
4
  import type { QuadTransform } from './stage.js';
5
5
  import type { Writer } from './writer/writer.js';
6
6
  import { type DistributionResolver } from './distribution/resolver.js';
7
+ import type { ProvenanceStore } from './provenance/store.js';
7
8
  import type { StageOutputResolver } from './stageOutputResolver.js';
8
9
  import type { ProgressReporter } from './progressReporter.js';
9
10
  import { type TimeoutPolicy } from './sparql/timeoutPolicy.js';
@@ -32,6 +33,22 @@ export interface PipelineOptions {
32
33
  outputDir: string;
33
34
  };
34
35
  reporter?: ProgressReporter;
36
+ /**
37
+ * Optional per-dataset processing memory. When set, the pipeline skips a
38
+ * dataset whose source-change fingerprint and {@link pipelineVersion} both
39
+ * match the stored record – before paying the import cost – and writes an
40
+ * updated record after processing. When omitted, every dataset is
41
+ * reprocessed (today’s behaviour).
42
+ */
43
+ provenanceStore?: ProvenanceStore;
44
+ /**
45
+ * Opaque, consumer-declared version of the pipeline’s output-affecting
46
+ * logic, rotated only on releases that change output. Compared for equality,
47
+ * never parsed or ordered. Required when {@link provenanceStore} is set (a
48
+ * skip-enabled pipeline with no version would silently freeze); ignored
49
+ * otherwise.
50
+ */
51
+ pipelineVersion?: string;
35
52
  /**
36
53
  * Factory producing a fresh {@link TimeoutPolicy} per dataset. Defaults
37
54
  * to {@link constantTimeoutPolicy}`(300_000)` so existing call sites
@@ -53,9 +70,13 @@ export declare class Pipeline {
53
70
  private readonly chaining?;
54
71
  private readonly reporter?;
55
72
  private readonly timeoutFactory;
73
+ private readonly provenanceStore?;
74
+ private readonly pipelineVersion?;
56
75
  constructor(options: PipelineOptions);
57
76
  run(): Promise<void>;
58
77
  private processDataset;
78
+ /** Persist the processing record for a dataset, when a store is configured. */
79
+ private recordOutcome;
59
80
  private reportValidators;
60
81
  private collectStages;
61
82
  /**
@@ -1 +1 @@
1
- {"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,OAAO,EAAgB,MAAM,cAAc,CAAC;AAGrD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AACrD,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAChD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO,EACL,KAAK,oBAAoB,EAE1B,MAAM,4BAA4B,CAAC;AAQpC,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,0BAA0B,CAAC;AACpE,OAAO,KAAK,EAEV,gBAAgB,EACjB,MAAM,uBAAuB,CAAC;AAE/B,OAAO,EAEL,KAAK,aAAa,EACnB,MAAM,2BAA2B,CAAC;AAEnC,wDAAwD;AACxD,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb;;;;;OAKG;IACH,gBAAgB,CAAC,EAAE,aAAa,CAAC;QAAE,OAAO,EAAE,OAAO,CAAA;KAAE,CAAC,CAAC;CACxD;AAED,MAAM,WAAW,eAAe;IAC9B,eAAe,EAAE,eAAe,CAAC;IACjC,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IAC3B,OAAO,CAAC,EAAE,cAAc,EAAE,CAAC;IAC3B,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,oBAAoB,CAAC,EAAE,oBAAoB,CAAC;IAC5C,QAAQ,CAAC,EAAE;QACT,mBAAmB,EAAE,mBAAmB,CAAC;QACzC,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,QAAQ,CAAC,EAAE,gBAAgB,CAAC;IAC5B;;;;;;;;;OASG;IACH,OAAO,CAAC,EAAE,MAAM,aAAa,CAAC;CAC/B;AAgFD,qBAAa,QAAQ;IACnB,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAS;IAC9B,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAkB;IAClD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAU;IACjC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAS;IAChC,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAuB;IAC5D,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAA8B;IACxD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAmB;IAC7C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAsB;gBAEzC,OAAO,EAAE,eAAe;IAkC9B,GAAG,IAAI,OAAO,CAAC,IAAI,CAAC;YAoBZ,cAAc;YAgFd,gBAAgB;IAW9B,OAAO,CAAE,aAAa;IAOtB;;;OAGG;YACW,QAAQ;IA0CtB,2EAA2E;YAC7D,eAAe;YAqBf,QAAQ;YA2DP,SAAS;CAczB"}
1
+ {"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,OAAO,EAAgB,MAAM,cAAc,CAAC;AAGrD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AACrD,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAChD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO,EACL,KAAK,oBAAoB,EAG1B,MAAM,4BAA4B,CAAC;AAKpC,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAO7D,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,0BAA0B,CAAC;AACpE,OAAO,KAAK,EAEV,gBAAgB,EACjB,MAAM,uBAAuB,CAAC;AAE/B,OAAO,EAEL,KAAK,aAAa,EACnB,MAAM,2BAA2B,CAAC;AAEnC,wDAAwD;AACxD,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb;;;;;OAKG;IACH,gBAAgB,CAAC,EAAE,aAAa,CAAC;QAAE,OAAO,EAAE,OAAO,CAAA;KAAE,CAAC,CAAC;CACxD;AAED,MAAM,WAAW,eAAe;IAC9B,eAAe,EAAE,eAAe,CAAC;IACjC,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IAC3B,OAAO,CAAC,EAAE,cAAc,EAAE,CAAC;IAC3B,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,oBAAoB,CAAC,EAAE,oBAAoB,CAAC;IAC5C,QAAQ,CAAC,EAAE;QACT,mBAAmB,EAAE,mBAAmB,CAAC;QACzC,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,QAAQ,CAAC,EAAE,gBAAgB,CAAC;IAC5B;;;;;;OAMG;IACH,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC;;;;;;OAMG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB;;;;;;;;;OASG;IACH,OAAO,CAAC,EAAE,MAAM,aAAa,CAAC;CAC/B;AAgFD,qBAAa,QAAQ;IACnB,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAS;IAC9B,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAkB;IAClD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAU;IACjC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAS;IAChC,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAuB;IAC5D,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAA8B;IACxD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAmB;IAC7C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAsB;IACrD,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAkB;IACnD,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAS;gBAE9B,OAAO,EAAE,eAAe;IA0C9B,GAAG,IAAI,OAAO,CAAC,IAAI,CAAC;YAoBZ,cAAc;IA0I5B,+EAA+E;YACjE,aAAa;YAmBb,gBAAgB;IAW9B,OAAO,CAAE,aAAa;IAOtB;;;OAGG;YACW,QAAQ;IA0CtB,2EAA2E;YAC7D,eAAe;YAqBf,QAAQ;YA2DP,SAAS;CAczB"}
package/dist/pipeline.js CHANGED
@@ -3,6 +3,8 @@ import { StreamParser } from 'n3';
3
3
  import { FileWriter } from './writer/fileWriter.js';
4
4
  import { NoDistributionAvailable, } from './distribution/resolver.js';
5
5
  import { SparqlDistributionResolver } from './distribution/index.js';
6
+ import { sourceFingerprint } from './provenance/sourceFingerprint.js';
7
+ import { shouldReprocess } from './provenance/reprocessDecision.js';
6
8
  import { NetworkError, SparqlProbeResult, } from '@lde/distribution-probe';
7
9
  import { NotSupported } from './sparql/executor.js';
8
10
  import { ConstantTimeoutPolicy, } from './sparql/timeoutPolicy.js';
@@ -81,11 +83,16 @@ export class Pipeline {
81
83
  chaining;
82
84
  reporter;
83
85
  timeoutFactory;
86
+ provenanceStore;
87
+ pipelineVersion;
84
88
  constructor(options) {
85
89
  const hasSubStages = options.stages.some((stage) => stage.stages.length > 0);
86
90
  if (hasSubStages && !options.chaining) {
87
91
  throw new Error('chaining is required when any stage has sub-stages');
88
92
  }
93
+ if (options.provenanceStore && options.pipelineVersion === undefined) {
94
+ throw new Error('pipelineVersion is required when a provenanceStore is configured');
95
+ }
89
96
  this.name = options.name ?? '';
90
97
  this.datasetSelector = options.datasetSelector;
91
98
  this.stages = options.stages;
@@ -106,6 +113,8 @@ export class Pipeline {
106
113
  this.reporter = options.reporter;
107
114
  this.timeoutFactory =
108
115
  options.timeout ?? (() => new ConstantTimeoutPolicy(300_000));
116
+ this.provenanceStore = options.provenanceStore;
117
+ this.pipelineVersion = options.pipelineVersion;
109
118
  }
110
119
  async run() {
111
120
  const start = Date.now();
@@ -125,17 +134,48 @@ export class Pipeline {
125
134
  }
126
135
  async processDataset(dataset) {
127
136
  this.reporter?.datasetStart?.(dataset);
128
- const timeout = this.timeoutFactory();
129
- const unsubscribe = timeout.subscribe?.({
130
- onTighten: (event) => this.reporter?.timeoutTightened?.(event),
131
- onRelax: (event) => this.reporter?.timeoutRelaxed?.(event),
132
- });
133
- let resolved;
137
+ // Probe phase: gather probe results and the source-to-be, without importing.
138
+ let probed;
134
139
  try {
135
- resolved = await this.distributionResolver.resolve(dataset, {
140
+ probed = await this.distributionResolver.probe(dataset, {
136
141
  onProbe: (distribution, result) => {
137
142
  this.reporter?.distributionProbed?.(mapProbeResult(distribution, result));
138
143
  },
144
+ });
145
+ }
146
+ catch (error) {
147
+ this.reporter?.datasetSkipped?.(dataset, `Distribution probing failed: ${error instanceof Error ? error.message : String(error)}`);
148
+ return;
149
+ }
150
+ // Derive the source-change fingerprint from the probed source: null for a
151
+ // live SPARQL endpoint (always reprocess) or when no source is available.
152
+ const fingerprint = probed.source
153
+ ? sourceFingerprint(probed.source.distribution, probed.source.probeResult)
154
+ : null;
155
+ // Gate: skip an unchanged dataset before paying any import cost.
156
+ if (this.provenanceStore) {
157
+ let stored = null;
158
+ try {
159
+ stored = await this.provenanceStore.get(dataset.iri);
160
+ }
161
+ catch {
162
+ // An unreadable record must not abort the whole run, nor wrongly skip:
163
+ // treat it as ‘never processed’ so this dataset reprocesses. The
164
+ // periodic full reprocess is the backstop.
165
+ stored = null;
166
+ }
167
+ if (!shouldReprocess({
168
+ sourceFingerprint: fingerprint,
169
+ pipelineVersion: this.pipelineVersion,
170
+ }, stored)) {
171
+ this.reporter?.datasetSkipped?.(dataset, 'Unchanged since last run');
172
+ return;
173
+ }
174
+ }
175
+ // Resolve phase: import a data dump only when the source is one.
176
+ let resolved;
177
+ try {
178
+ resolved = await this.distributionResolver.resolve(probed, {
139
179
  onImportStart: () => {
140
180
  this.reporter?.importStarted?.();
141
181
  },
@@ -149,10 +189,20 @@ export class Pipeline {
149
189
  return;
150
190
  }
151
191
  if (resolved instanceof NoDistributionAvailable) {
192
+ // Record the failure so a dataset whose source is unchanged is not
193
+ // re-imported every run; it is retried at the next fingerprint change or
194
+ // version rotation.
195
+ await this.recordOutcome(dataset, fingerprint, 'failed');
152
196
  this.reporter?.datasetSkipped?.(dataset, resolved.message);
153
197
  return;
154
198
  }
155
199
  this.reporter?.distributionSelected?.(dataset, resolved.distribution, resolved.importedFrom, resolved.importDuration, resolved.tripleCount);
200
+ const timeout = this.timeoutFactory();
201
+ const unsubscribe = timeout.subscribe?.({
202
+ onTighten: (event) => this.reporter?.timeoutTightened?.(event),
203
+ onRelax: (event) => this.reporter?.timeoutRelaxed?.(event),
204
+ });
205
+ let stageFailed = false;
156
206
  try {
157
207
  for (const stage of this.stages) {
158
208
  try {
@@ -164,6 +214,7 @@ export class Pipeline {
164
214
  }
165
215
  }
166
216
  catch (error) {
217
+ stageFailed = true;
167
218
  this.reporter?.stageFailed?.(stage.name, error instanceof Error ? error : new Error(String(error)));
168
219
  }
169
220
  }
@@ -174,12 +225,32 @@ export class Pipeline {
174
225
  }
175
226
  await this.writer.flush?.(dataset);
176
227
  await this.reportValidators(dataset);
228
+ // A dataset whose stages threw produced incomplete output; record it as
229
+ // ‘failed’ rather than freezing a broken result under a ‘success’ record.
230
+ await this.recordOutcome(dataset, fingerprint, stageFailed ? 'failed' : 'success');
177
231
  const datasetMemory = process.memoryUsage();
178
232
  this.reporter?.datasetComplete?.(dataset, {
179
233
  memoryUsageBytes: datasetMemory.rss,
180
234
  heapUsedBytes: datasetMemory.heapUsed,
181
235
  });
182
236
  }
237
+ /** Persist the processing record for a dataset, when a store is configured. */
238
+ async recordOutcome(dataset, fingerprint, status) {
239
+ if (!this.provenanceStore)
240
+ return;
241
+ try {
242
+ await this.provenanceStore.set(dataset.iri, {
243
+ sourceFingerprint: fingerprint,
244
+ pipelineVersion: this.pipelineVersion,
245
+ generatedAt: new Date().toISOString(),
246
+ status,
247
+ });
248
+ }
249
+ catch {
250
+ // A failed write must not abort the run; the dataset simply reprocesses
251
+ // next run, its record not yet updated.
252
+ }
253
+ }
183
254
  async reportValidators(dataset) {
184
255
  const validators = new Set();
185
256
  for (const stage of this.collectStages(this.stages)) {
@@ -0,0 +1,44 @@
1
+ import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
2
+ import type { ProcessingRecord } from './record.js';
3
+ import type { ProvenanceStore } from './store.js';
4
/** Construction options for {@link FileLoadedSparqlProvenanceStore}. */
export interface FileLoadedSparqlProvenanceStoreOptions {
    /** SPARQL endpoint, served read-only, that is queried for records loaded earlier. */
    queryEndpoint: URL;
    /**
     * IRI of the pipeline. It doubles as the named graph holding this
     * pipeline's records, which keeps several pipelines that share one
     * triplestore from colliding.
     */
    pipelineIri: URL;
    /**
     * Directory that receives the record files, which are bulk-loaded into the
     * read-only triplestore once the run is over. Use a directory other than
     * the data output directory: filenames are keyed by dataset URI and would
     * otherwise collide.
     */
    outputDir: string;
    /**
     * {@link SparqlEndpointFetcher} used for the query side. Optional and meant
     * for tests; a fresh instance is created when omitted.
     */
    fetcher?: SparqlEndpointFetcher;
}
24
/**
 * {@link ProvenanceStore} implementation for a triplestore that is served
 * read-only and rebuilt by bulk-loading files (QLever, for example).
 *
 * Lookups are SPARQL queries against the live endpoint and therefore see the
 * records loaded from a previous run. Writes produce files for the next
 * bulk-load, because the endpoint accepts no SPARQL UPDATE. Records are flat
 * PROV-O keyed by the dataset URI and live in the pipeline-scoped provenance
 * named graph.
 */
export declare class FileLoadedSparqlProvenanceStore implements ProvenanceStore {
    private readonly queryEndpoint;
    private readonly pipelineIri;
    private readonly writer;
    private readonly fetcher;
    constructor(options: FileLoadedSparqlProvenanceStoreOptions);
    /** Look up the stored record for a dataset; resolves to `null` when there is none. */
    get(datasetUri: URL): Promise<ProcessingRecord | null>;
    private selectQuery;
    /** Write the record for a dataset as a file for the next bulk-load. */
    set(datasetUri: URL, record: ProcessingRecord): Promise<void>;
    private toQuads;
}
44
+ //# sourceMappingURL=fileLoadedSparqlProvenanceStore.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fileLoadedSparqlProvenanceStore.d.ts","sourceRoot":"","sources":["../../src/provenance/fileLoadedSparqlProvenanceStore.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAC;AAE9D,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AACpD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AASlD,MAAM,WAAW,sCAAsC;IACrD,wEAAwE;IACxE,aAAa,EAAE,GAAG,CAAC;IACnB;;;OAGG;IACH,WAAW,EAAE,GAAG,CAAC;IACjB;;;;OAIG;IACH,SAAS,EAAE,MAAM,CAAC;IAClB;;;OAGG;IACH,OAAO,CAAC,EAAE,qBAAqB,CAAC;CACjC;AAED;;;;;;;;GAQG;AACH,qBAAa,+BAAgC,YAAW,eAAe;IACrE,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAM;IACpC,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAM;IAClC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAa;IACpC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAwB;gBAEpC,OAAO,EAAE,sCAAsC;IAWrD,GAAG,CAAC,UAAU,EAAE,GAAG,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC;IAoB5D,OAAO,CAAC,WAAW;IAmBb,GAAG,CAAC,UAAU,EAAE,GAAG,EAAE,MAAM,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;YAMpD,OAAO;CA0BvB"}
@@ -0,0 +1,81 @@
1
+ import { Dataset, assertSafeIri } from '@lde/dataset';
2
+ import { DataFactory } from 'n3';
3
+ import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
4
+ import { FileWriter } from '../writer/fileWriter.js';
5
+ const { namedNode, literal, quad } = DataFactory;
6
+ const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
7
+ const PROV = 'http://www.w3.org/ns/prov#';
8
+ const LDE = 'https://w3id.org/lde/provenance#';
9
+ const XSD_DATE_TIME = 'http://www.w3.org/2001/XMLSchema#dateTime';
10
/**
 * A {@link ProvenanceStore} for a triplestore that is served read-only and
 * rebuilt by bulk-loading files (e.g. QLever).
 *
 * Reads go through SPARQL SELECT queries against the live endpoint (records
 * loaded from a previous run). Writes go to N-Quads files for the next
 * bulk-load, since the endpoint accepts no SPARQL UPDATE. Records are flat
 * PROV-O keyed by the dataset URI, written into the pipeline-scoped
 * provenance named graph.
 */
export class FileLoadedSparqlProvenanceStore {
    queryEndpoint;
    pipelineIri;
    writer;
    fetcher;
    /**
     * @param options Query endpoint, pipeline IRI (used as the named graph),
     *   output directory for the record files, and an optional fetcher.
     */
    constructor(options) {
        this.queryEndpoint = options.queryEndpoint;
        this.pipelineIri = options.pipelineIri;
        // Record files are N-Quads; the writer is handed the pipeline IRI as the
        // graph for every dataset.
        this.writer = new FileWriter({
            outputDir: options.outputDir,
            format: 'n-quads',
            graphIri: () => this.pipelineIri,
        });
        this.fetcher = options.fetcher ?? new SparqlEndpointFetcher();
    }
    /**
     * Fetch the stored record for a dataset from the live endpoint.
     *
     * @param datasetUri IRI of the dataset whose record is wanted.
     * @returns The record, or `null` when the query yields no row or the row
     *   carries a status other than `'success'` / `'failed'`.
     * @throws Whatever the fetcher throws on a failed query, and whatever
     *   `assertSafeIri` throws for an unsafe IRI.
     */
    async get(datasetUri) {
        const stream = await this.fetcher.fetchBindings(this.queryEndpoint.toString(), this.selectQuery(datasetUri));
        for await (const binding of stream) {
            // The store content is external data: never hand an unknown status to
            // the skip decision as if it were valid. A malformed record is treated
            // as 'never processed', which drives a reprocess rather than a skip.
            const status = binding.status.value;
            if (status !== 'success' && status !== 'failed') {
                return null;
            }
            // A record exists iff the mandatory fields bound; the fingerprint is
            // optional and absent for a run with no establishable signal.
            return {
                sourceFingerprint: binding.fingerprint?.value ?? null,
                pipelineVersion: binding.version.value,
                generatedAt: binding.generatedAt.value,
                status,
            };
        }
        return null;
    }
    /**
     * Build the SELECT query that reads one record from the pipeline's graph.
     *
     * @param datasetUri IRI of the dataset, used as the subject of the pattern.
     * @returns The SPARQL query text.
     */
    selectQuery(datasetUri) {
        // Guard before interpolating into a SPARQL `<...>` reference. URL
        // normalisation already encodes unsafe characters, so this is
        // defence-in-depth against a non-normalised IRI reaching here.
        const datasetIri = datasetUri.toString();
        const pipelineIri = this.pipelineIri.toString();
        assertSafeIri(datasetIri);
        assertSafeIri(pipelineIri);
        const dataset = `<${datasetIri}>`;
        // Version, status and timestamp are mandatory; only the fingerprint is
        // OPTIONAL, because set() omits it when the record has none.
        return `SELECT ?fingerprint ?version ?status ?generatedAt WHERE {
  GRAPH <${pipelineIri}> {
    ${dataset} <${LDE}pipelineVersion> ?version ;
      <${LDE}status> ?status ;
      <${PROV}generatedAtTime> ?generatedAt .
    OPTIONAL { ${dataset} <${LDE}sourceFingerprint> ?fingerprint }
  }
} LIMIT 1`;
    }
    /**
     * Write the record for a dataset through the file writer and flush it.
     *
     * @param datasetUri IRI of the dataset the record belongs to.
     * @param record The processing record to persist.
     */
    async set(datasetUri, record) {
        // The writer API is dataset-based, so wrap the IRI in a bare Dataset
        // without distributions.
        const dataset = new Dataset({ iri: datasetUri, distributions: [] });
        await this.writer.write(dataset, this.toQuads(datasetUri, record));
        await this.writer.flush(dataset);
    }
    /**
     * Serialise a record as quads with the dataset IRI as subject.
     *
     * No graph term is set on the quads here; the writer was configured with
     * the pipeline IRI as graph.
     *
     * @param datasetUri IRI used as the subject of every quad.
     * @param record The record to serialise.
     */
    async *toQuads(datasetUri, record) {
        const subject = namedNode(datasetUri.toString());
        yield quad(subject, namedNode(RDF_TYPE), namedNode(`${PROV}Entity`));
        yield quad(subject, namedNode(`${PROV}generatedAtTime`), literal(record.generatedAt, namedNode(XSD_DATE_TIME)));
        // A null fingerprint is represented by the absence of the triple.
        if (record.sourceFingerprint !== null) {
            yield quad(subject, namedNode(`${LDE}sourceFingerprint`), literal(record.sourceFingerprint));
        }
        yield quad(subject, namedNode(`${LDE}pipelineVersion`), literal(record.pipelineVersion));
        yield quad(subject, namedNode(`${LDE}status`), literal(record.status));
    }
}
@@ -1,4 +1,6 @@
1
1
  export { sourceFingerprint } from './sourceFingerprint.js';
2
2
  export { shouldReprocess } from './reprocessDecision.js';
3
3
  export type { ProcessingRecord, ChangeKey } from './record.js';
4
+ export type { ProvenanceStore } from './store.js';
5
+ export { FileLoadedSparqlProvenanceStore, type FileLoadedSparqlProvenanceStoreOptions, } from './fileLoadedSparqlProvenanceStore.js';
4
6
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/provenance/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAC3D,OAAO,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/provenance/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAC3D,OAAO,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAC/D,YAAY,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAClD,OAAO,EACL,+BAA+B,EAC/B,KAAK,sCAAsC,GAC5C,MAAM,sCAAsC,CAAC"}
@@ -1,2 +1,3 @@
1
1
  export { sourceFingerprint } from './sourceFingerprint.js';
2
2
  export { shouldReprocess } from './reprocessDecision.js';
3
+ export { FileLoadedSparqlProvenanceStore, } from './fileLoadedSparqlProvenanceStore.js';
@@ -0,0 +1,20 @@
1
+ import type { ProcessingRecord } from './record.js';
2
/**
 * Per-dataset processing memory of the pipeline.
 *
 * Skip semantics belong to the framework (see `shouldReprocess`). A
 * `ProvenanceStore` is responsible only for physically storing
 * {@link ProcessingRecord}s keyed by dataset URI, and may be backed by a
 * triplestore, files, or anything else.
 */
export interface ProvenanceStore {
    /**
     * Resolve the record written when the dataset was last processed. Resolves
     * to `null` when the dataset has never been processed or the store was
     * wiped; a `null` result drives a reprocess.
     */
    get(datasetUri: URL): Promise<ProcessingRecord | null>;
    /** Store the record for a dataset, replacing whatever record was there before. */
    set(datasetUri: URL, record: ProcessingRecord): Promise<void>;
}
20
+ //# sourceMappingURL=store.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"store.d.ts","sourceRoot":"","sources":["../../src/provenance/store.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAEpD;;;;;;;GAOG;AACH,MAAM,WAAW,eAAe;IAC9B;;;;OAIG;IACH,GAAG,CAAC,UAAU,EAAE,GAAG,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC,CAAC;IACvD,oEAAoE;IACpE,GAAG,CAAC,UAAU,EAAE,GAAG,EAAE,MAAM,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;CAC/D"}
@@ -0,0 +1 @@
1
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lde/pipeline",
3
- "version": "0.30.11",
3
+ "version": "0.30.13",
4
4
  "repository": {
5
5
  "url": "git+https://github.com/ldelements/lde.git",
6
6
  "directory": "packages/pipeline"
@@ -24,14 +24,14 @@
24
24
  "!**/*.tsbuildinfo"
25
25
  ],
26
26
  "dependencies": {
27
- "@lde/dataset": "0.7.5",
28
- "@lde/dataset-registry-client": "0.8.1",
29
- "@lde/distribution-probe": "0.1.8",
30
- "@lde/sparql-importer": "0.6.3",
27
+ "@lde/dataset": "0.7.6",
28
+ "@lde/dataset-registry-client": "0.8.2",
29
+ "@lde/distribution-probe": "0.1.9",
30
+ "@lde/sparql-importer": "0.6.4",
31
31
  "@lde/sparql-server": "0.4.11",
32
32
  "@rdfjs/types": "^2.0.1",
33
- "@traqula/generator-sparql-1-1": "^1.1.1",
34
- "@traqula/parser-sparql-1-1": "^1.1.1",
33
+ "@traqula/generator-sparql-1-1": "^1.1.4",
34
+ "@traqula/parser-sparql-1-1": "^1.1.4",
35
35
  "@traqula/rules-sparql-1-1": "^1.1.0",
36
36
  "fetch-sparql-endpoint": "^7.1.0",
37
37
  "filenamify-url": "^4.0.0",