@lde/pipeline 0.30.10 → 0.30.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -0
- package/dist/distribution/importResolver.d.ts +19 -2
- package/dist/distribution/importResolver.d.ts.map +1 -1
- package/dist/distribution/importResolver.js +47 -12
- package/dist/distribution/index.d.ts +1 -1
- package/dist/distribution/index.d.ts.map +1 -1
- package/dist/distribution/index.js +1 -1
- package/dist/distribution/resolveDistributions.d.ts.map +1 -1
- package/dist/distribution/resolveDistributions.js +2 -1
- package/dist/distribution/resolver.d.ts +44 -10
- package/dist/distribution/resolver.d.ts.map +1 -1
- package/dist/distribution/resolver.js +33 -7
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/pipeline.d.ts +21 -0
- package/dist/pipeline.d.ts.map +1 -1
- package/dist/pipeline.js +78 -7
- package/dist/provenance/fileLoadedSparqlProvenanceStore.d.ts +44 -0
- package/dist/provenance/fileLoadedSparqlProvenanceStore.d.ts.map +1 -0
- package/dist/provenance/fileLoadedSparqlProvenanceStore.js +81 -0
- package/dist/provenance/index.d.ts +6 -0
- package/dist/provenance/index.d.ts.map +1 -0
- package/dist/provenance/index.js +3 -0
- package/dist/provenance/record.d.ts +35 -0
- package/dist/provenance/record.d.ts.map +1 -0
- package/dist/provenance/record.js +1 -0
- package/dist/provenance/reprocessDecision.d.ts +21 -0
- package/dist/provenance/reprocessDecision.d.ts.map +1 -0
- package/dist/provenance/reprocessDecision.js +30 -0
- package/dist/provenance/sourceFingerprint.d.ts +29 -0
- package/dist/provenance/sourceFingerprint.d.ts.map +1 -0
- package/dist/provenance/sourceFingerprint.js +57 -0
- package/dist/provenance/store.d.ts +20 -0
- package/dist/provenance/store.d.ts.map +1 -0
- package/dist/provenance/store.js +1 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -276,6 +276,63 @@ Writes generated quads to a destination:
|
|
|
276
276
|
- `SparqlUpdateWriter` — writes to a SPARQL endpoint via UPDATE queries
|
|
277
277
|
- `FileWriter` — writes to local files
|
|
278
278
|
|
|
279
|
+
### Provenance store
|
|
280
|
+
|
|
281
|
+
A `ProvenanceStore` gives the pipeline a small per-dataset memory, so a future run can skip datasets that are genuinely unchanged. It is purely a storage seam: the framework owns the skip decision (see [`sourceFingerprint`](#source-change-fingerprint) and `shouldReprocess`), the store owns only how each record is persisted.
|
|
282
|
+
|
|
283
|
+
```typescript
|
|
284
|
+
interface ProvenanceStore {
|
|
285
|
+
get(datasetUri: URL): Promise<ProcessingRecord | null>;
|
|
286
|
+
set(datasetUri: URL, record: ProcessingRecord): Promise<void>;
|
|
287
|
+
}
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
A `ProcessingRecord` holds the two opaque change fields — `sourceFingerprint` (derived automatically from source metadata) and `pipelineVersion` (consumer-declared) — plus `generatedAt` and a `status` of `'success'` or `'failed'`. The two change fields are compared only for equality, never parsed or ordered.
|
|
291
|
+
|
|
292
|
+
#### `FileLoadedSparqlProvenanceStore`
|
|
293
|
+
|
|
294
|
+
The reference implementation targets a triplestore that is served read-only and rebuilt by bulk-loading files (e.g. [QLever](https://github.com/ad-freiburg/qlever)). It reads records with SPARQL queries against the live endpoint and, because that endpoint accepts no SPARQL UPDATE, writes them as files to be picked up by the next bulk load.
|
|
295
|
+
|
|
296
|
+
```typescript
|
|
297
|
+
import { FileLoadedSparqlProvenanceStore } from '@lde/pipeline';
|
|
298
|
+
|
|
299
|
+
const store = new FileLoadedSparqlProvenanceStore({
|
|
300
|
+
queryEndpoint: new URL('http://localhost:7001/sparql'),
|
|
301
|
+
pipelineIri: new URL('https://example.org/pipelines/dkg'),
|
|
302
|
+
outputDir: './provenance',
|
|
303
|
+
});
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
- `get` runs a named-graph-scoped SPARQL `SELECT` against `queryEndpoint`, reading the records a previous run loaded.
|
|
307
|
+
- `set` writes one flat [PROV-O](https://www.w3.org/TR/prov-o/) N-Quads file per dataset into `outputDir`, in the pipeline-scoped named graph, to be bulk-loaded after the run.
|
|
308
|
+
|
|
309
|
+
Each record is stored as flat PROV-O on the dataset entity — `prov:generatedAtTime` plus `sourceFingerprint`, `pipelineVersion` and `status` under the `https://w3id.org/lde/provenance#` namespace. Scoping every record by `pipelineIri` (used as the named graph) lets multiple pipelines share one triplestore without colliding.
|
|
310
|
+
|
|
311
|
+
#### Enabling skipping
|
|
312
|
+
|
|
313
|
+
Skipping is opt-in. Pass a `provenanceStore` and a `pipelineVersion` to the `Pipeline`:
|
|
314
|
+
|
|
315
|
+
```typescript
|
|
316
|
+
new Pipeline({
|
|
317
|
+
// …
|
|
318
|
+
provenanceStore: store,
|
|
319
|
+
pipelineVersion: 'v3', // rotate only on releases that change output
|
|
320
|
+
});
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
For each dataset the pipeline probes its distributions, derives the source-change fingerprint, reads the stored record, and **skips before importing** when both change fields match:
|
|
324
|
+
|
|
325
|
+
```
|
|
326
|
+
skip iff recorded.sourceFingerprint === current.sourceFingerprint
|
|
327
|
+
AND recorded.pipelineVersion === current.pipelineVersion
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
Otherwise it imports (if needed), runs the stages, and writes an updated record. `pipelineVersion` is consumer-owned and opaque: rotate it only on releases that change output; once it is rotated, every dataset is reprocessed on the next run. It is **required** when a `provenanceStore` is configured (a skip-enabled pipeline with no version would silently freeze); when no store is configured, every dataset is reprocessed — today’s behaviour. A dataset that failed but whose source is unchanged is recorded as `'failed'` and skipped on later runs until its source changes or the version rotates, so a deterministically failing import is not retried every run.
|
|
331
|
+
|
|
332
|
+
### Source-change fingerprint
|
|
333
|
+
|
|
334
|
+
`sourceFingerprint(distribution, probeResult)` derives a cheap, opaque change signal for a distribution from metadata the probe already collected — no body download. For a data dump it combines the most recent of the register’s `dct:modified` and the artifact’s HTTP `Last-Modified` with the byte size (the probe’s `Content-Length`, falling back to the declared `dcat:byteSize`). It returns `null` for a live SPARQL endpoint, or when no date and no size can be established — a `null` fingerprint never compares equal, so such a distribution is always reprocessed.
|
|
335
|
+
|
|
279
336
|
### Plugins
|
|
280
337
|
|
|
281
338
|
Plugins hook into the pipeline lifecycle via the `PipelinePlugin` interface. Register them in the `plugins` array when constructing a `Pipeline`.
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
+
import { type Dataset } from '@lde/dataset';
|
|
1
2
|
import type { Importer } from '@lde/sparql-importer';
|
|
2
3
|
import type { SparqlServer } from '@lde/sparql-server';
|
|
3
|
-
import { type DistributionResolver, NoDistributionAvailable, ResolvedDistribution } from './resolver.js';
|
|
4
|
+
import { type DistributionResolver, type ResolveCallbacks, NoDistributionAvailable, ProbedDistributions, ResolvedDistribution } from './resolver.js';
|
|
4
5
|
export interface ImportResolverOptions {
|
|
5
6
|
importer: Importer;
|
|
6
7
|
server: SparqlServer;
|
|
@@ -26,12 +27,28 @@ export interface ImportResolverOptions {
|
|
|
26
27
|
* adds the ability to import a data dump into a local SPARQL server. The
|
|
27
28
|
* {@link ImportResolverOptions.strategy | strategy} option controls whether the
|
|
28
29
|
* inner resolver's SPARQL endpoint is preferred or bypassed.
|
|
30
|
+
*
|
|
31
|
+
* The split is preserved across both phases: {@link probe} chooses the
|
|
32
|
+
* {@link ProbedSource} (the inner SPARQL endpoint, or the preferred importable
|
|
33
|
+
* data dump) without importing; {@link resolve} performs the import only when
|
|
34
|
+
* that source is a data dump.
|
|
29
35
|
*/
|
|
30
36
|
export declare class ImportResolver implements DistributionResolver {
|
|
31
37
|
private readonly inner;
|
|
32
38
|
private readonly options;
|
|
33
39
|
constructor(inner: DistributionResolver, options: ImportResolverOptions);
|
|
34
|
-
|
|
40
|
+
probe(dataset: Dataset, callbacks?: ResolveCallbacks): Promise<ProbedDistributions>;
|
|
41
|
+
resolve(probed: ProbedDistributions, callbacks?: ResolveCallbacks): Promise<ResolvedDistribution | NoDistributionAvailable>;
|
|
42
|
+
/**
|
|
43
|
+
* The preferred importable data dump and its probe result, or `null` if no
|
|
44
|
+
* downloadable distribution passed probing.
|
|
45
|
+
*/
|
|
46
|
+
private selectImportCandidate;
|
|
47
|
+
/**
|
|
48
|
+
* Downloadable distributions whose access URL passed probing, in preference
|
|
49
|
+
* order (compressed first, see {@link Dataset.getDownloadDistributions}).
|
|
50
|
+
*/
|
|
51
|
+
private importCandidates;
|
|
35
52
|
private importDataset;
|
|
36
53
|
cleanup(): Promise<void>;
|
|
37
54
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"importResolver.d.ts","sourceRoot":"","sources":["../../src/distribution/importResolver.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"importResolver.d.ts","sourceRoot":"","sources":["../../src/distribution/importResolver.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,OAAO,EAAgB,MAAM,cAAc,CAAC;AAC1D,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AAMrD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AACvD,OAAO,EACL,KAAK,oBAAoB,EAEzB,KAAK,gBAAgB,EACrB,uBAAuB,EACvB,mBAAmB,EACnB,oBAAoB,EACrB,MAAM,eAAe,CAAC;AAGvB,MAAM,WAAW,qBAAqB;IACpC,QAAQ,EAAE,QAAQ,CAAC;IACnB,MAAM,EAAE,YAAY,CAAC;IACrB;;;;;;;;;;;;OAYG;IACH,QAAQ,CAAC,EAAE,QAAQ,GAAG,QAAQ,CAAC;CAChC;AAED;;;;;;;;;;;;GAYG;AACH,qBAAa,cAAe,YAAW,oBAAoB;IAEvD,OAAO,CAAC,QAAQ,CAAC,KAAK;IACtB,OAAO,CAAC,QAAQ,CAAC,OAAO;gBADP,KAAK,EAAE,oBAAoB,EAC3B,OAAO,EAAE,qBAAqB;IAG3C,KAAK,CACT,OAAO,EAAE,OAAO,EAChB,SAAS,CAAC,EAAE,gBAAgB,GAC3B,OAAO,CAAC,mBAAmB,CAAC;IAgBzB,OAAO,CACX,MAAM,EAAE,mBAAmB,EAC3B,SAAS,CAAC,EAAE,gBAAgB,GAC3B,OAAO,CAAC,oBAAoB,GAAG,uBAAuB,CAAC;IAoB1D;;;OAGG;IACH,OAAO,CAAC,qBAAqB;IAY7B;;;OAGG;IACH,OAAO,CAAC,gBAAgB;YAeV,aAAa;IAmGrB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAG/B"}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { Distribution } from '@lde/dataset';
|
|
2
2
|
import { ImportFailed, ImportSuccessful, NotSupported, } from '@lde/sparql-importer';
|
|
3
|
-
import { NoDistributionAvailable, ResolvedDistribution, } from './resolver.js';
|
|
3
|
+
import { NoDistributionAvailable, ProbedDistributions, ResolvedDistribution, } from './resolver.js';
|
|
4
4
|
import { NetworkError } from '@lde/distribution-probe';
|
|
5
5
|
/**
|
|
6
6
|
* A {@link DistributionResolver} decorator that adds data-dump import logic.
|
|
@@ -9,6 +9,11 @@ import { NetworkError } from '@lde/distribution-probe';
|
|
|
9
9
|
* adds the ability to import a data dump into a local SPARQL server. The
|
|
10
10
|
* {@link ImportResolverOptions.strategy | strategy} option controls whether the
|
|
11
11
|
* inner resolver's SPARQL endpoint is preferred or bypassed.
|
|
12
|
+
*
|
|
13
|
+
* The split is preserved across both phases: {@link probe} chooses the
|
|
14
|
+
* {@link ProbedSource} (the inner SPARQL endpoint, or the preferred importable
|
|
15
|
+
* data dump) without importing; {@link resolve} performs the import only when
|
|
16
|
+
* that source is a data dump.
|
|
12
17
|
*/
|
|
13
18
|
export class ImportResolver {
|
|
14
19
|
inner;
|
|
@@ -17,24 +22,54 @@ export class ImportResolver {
|
|
|
17
22
|
this.inner = inner;
|
|
18
23
|
this.options = options;
|
|
19
24
|
}
|
|
20
|
-
async
|
|
21
|
-
const
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
result instanceof ResolvedDistribution) {
|
|
26
|
-
return result;
|
|
25
|
+
async probe(dataset, callbacks) {
|
|
26
|
+
const probed = await this.inner.probe(dataset, callbacks);
|
|
27
|
+
// 'sparql' strategy (default): keep the inner SPARQL endpoint if found.
|
|
28
|
+
if (this.options.strategy !== 'import' && probed.source) {
|
|
29
|
+
return probed;
|
|
27
30
|
}
|
|
28
|
-
// Either 'import' strategy or
|
|
29
|
-
|
|
31
|
+
// Either 'import' strategy or no SPARQL endpoint: select a data dump to
|
|
32
|
+
// import. Choosing the candidate here (not in resolve) keeps the import
|
|
33
|
+
// cost out of the probe phase while still letting the pipeline fingerprint
|
|
34
|
+
// the dump it would import.
|
|
35
|
+
const source = this.selectImportCandidate(dataset, probed.probeResults);
|
|
36
|
+
return new ProbedDistributions(dataset, probed.probeResults, source);
|
|
30
37
|
}
|
|
31
|
-
async
|
|
38
|
+
async resolve(probed, callbacks) {
|
|
39
|
+
if (!probed.source) {
|
|
40
|
+
return new NoDistributionAvailable(probed.dataset, 'No importable distributions passed probing', probed.probeResults);
|
|
41
|
+
}
|
|
42
|
+
// A SPARQL endpoint source needs no import.
|
|
43
|
+
if (probed.source.distribution.isSparql()) {
|
|
44
|
+
return new ResolvedDistribution(probed.source.distribution, probed.probeResults);
|
|
45
|
+
}
|
|
46
|
+
return this.importDataset(probed.dataset, probed.probeResults, callbacks);
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* The preferred importable data dump and its probe result, or `null` if no
|
|
50
|
+
* downloadable distribution passed probing.
|
|
51
|
+
*/
|
|
52
|
+
selectImportCandidate(dataset, probeResults) {
|
|
53
|
+
const candidate = this.importCandidates(dataset, probeResults)[0];
|
|
54
|
+
if (!candidate)
|
|
55
|
+
return null;
|
|
56
|
+
const probeResult = probeResults.find((result) => result.url === candidate.accessUrl.toString());
|
|
57
|
+
return probeResult ? { distribution: candidate, probeResult } : null;
|
|
58
|
+
}
|
|
59
|
+
/**
|
|
60
|
+
* Downloadable distributions whose access URL passed probing, in preference
|
|
61
|
+
* order (compressed first, see {@link Dataset.getDownloadDistributions}).
|
|
62
|
+
*/
|
|
63
|
+
importCandidates(dataset, probeResults) {
|
|
32
64
|
const successfulUrls = new Set(probeResults
|
|
33
65
|
.filter((r) => !(r instanceof NetworkError) && r.isSuccess())
|
|
34
66
|
.map((r) => r.url));
|
|
35
|
-
|
|
67
|
+
return dataset
|
|
36
68
|
.getDownloadDistributions()
|
|
37
69
|
.filter((d) => d.accessUrl && successfulUrls.has(d.accessUrl.toString()));
|
|
70
|
+
}
|
|
71
|
+
async importDataset(dataset, probeResults, callbacks) {
|
|
72
|
+
const candidates = this.importCandidates(dataset, probeResults);
|
|
38
73
|
// Establish a trustworthy change signal for the downloader so it can skip
|
|
39
74
|
// redundant downloads (and preserve the QLever index cache). For a data
|
|
40
75
|
// dump the authoritative date is the most recent of the register’s declared
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
export { probe, NetworkError, SparqlProbeResult, DataDumpProbeResult, type ProbeResultType, } from '@lde/distribution-probe';
|
|
2
2
|
export { probeResultsToQuads } from './report.js';
|
|
3
3
|
export { ImportResolver, type ImportResolverOptions, } from './importResolver.js';
|
|
4
|
-
export { ResolvedDistribution, NoDistributionAvailable, SparqlDistributionResolver, type DistributionResolver, type ResolveCallbacks, type SparqlDistributionResolverOptions, } from './resolver.js';
|
|
4
|
+
export { ResolvedDistribution, NoDistributionAvailable, ProbedDistributions, SparqlDistributionResolver, type DistributionResolver, type ProbedSource, type ResolveCallbacks, type SparqlDistributionResolverOptions, } from './resolver.js';
|
|
5
5
|
export { resolveDistributions, type DistributionStageResult, } from './resolveDistributions.js';
|
|
6
6
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/distribution/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,KAAK,EACL,YAAY,EACZ,iBAAiB,EACjB,mBAAmB,EACnB,KAAK,eAAe,GACrB,MAAM,yBAAyB,CAAC;AAEjC,OAAO,EAAE,mBAAmB,EAAE,MAAM,aAAa,CAAC;AAElD,OAAO,EACL,cAAc,EACd,KAAK,qBAAqB,GAC3B,MAAM,qBAAqB,CAAC;AAE7B,OAAO,EACL,oBAAoB,EACpB,uBAAuB,EACvB,0BAA0B,EAC1B,KAAK,oBAAoB,EACzB,KAAK,gBAAgB,EACrB,KAAK,iCAAiC,GACvC,MAAM,eAAe,CAAC;AAEvB,OAAO,EACL,oBAAoB,EACpB,KAAK,uBAAuB,GAC7B,MAAM,2BAA2B,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/distribution/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,KAAK,EACL,YAAY,EACZ,iBAAiB,EACjB,mBAAmB,EACnB,KAAK,eAAe,GACrB,MAAM,yBAAyB,CAAC;AAEjC,OAAO,EAAE,mBAAmB,EAAE,MAAM,aAAa,CAAC;AAElD,OAAO,EACL,cAAc,EACd,KAAK,qBAAqB,GAC3B,MAAM,qBAAqB,CAAC;AAE7B,OAAO,EACL,oBAAoB,EACpB,uBAAuB,EACvB,mBAAmB,EACnB,0BAA0B,EAC1B,KAAK,oBAAoB,EACzB,KAAK,YAAY,EACjB,KAAK,gBAAgB,EACrB,KAAK,iCAAiC,GACvC,MAAM,eAAe,CAAC;AAEvB,OAAO,EACL,oBAAoB,EACpB,KAAK,uBAAuB,GAC7B,MAAM,2BAA2B,CAAC"}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
export { probe, NetworkError, SparqlProbeResult, DataDumpProbeResult, } from '@lde/distribution-probe';
|
|
2
2
|
export { probeResultsToQuads } from './report.js';
|
|
3
3
|
export { ImportResolver, } from './importResolver.js';
|
|
4
|
-
export { ResolvedDistribution, NoDistributionAvailable, SparqlDistributionResolver, } from './resolver.js';
|
|
4
|
+
export { ResolvedDistribution, NoDistributionAvailable, ProbedDistributions, SparqlDistributionResolver, } from './resolver.js';
|
|
5
5
|
export { resolveDistributions, } from './resolveDistributions.js';
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"resolveDistributions.d.ts","sourceRoot":"","sources":["../../src/distribution/resolveDistributions.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,YAAY,EAAE,KAAK,OAAO,EAAE,MAAM,cAAc,CAAC;AAC/D,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAE/D,OAAO,EACL,KAAK,oBAAoB,EAE1B,MAAM,eAAe,CAAC;AAEvB,MAAM,WAAW,uBAAuB;IACtC,YAAY,EAAE,YAAY,GAAG,IAAI,CAAC;IAClC,YAAY,EAAE,eAAe,EAAE,CAAC;IAChC,KAAK,EAAE,aAAa,CAAC,IAAI,CAAC,CAAC;CAC5B;AAED,wBAAsB,oBAAoB,CACxC,OAAO,EAAE,OAAO,EAChB,QAAQ,EAAE,oBAAoB,GAC7B,OAAO,CAAC,uBAAuB,CAAC,
|
|
1
|
+
{"version":3,"file":"resolveDistributions.d.ts","sourceRoot":"","sources":["../../src/distribution/resolveDistributions.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,YAAY,EAAE,KAAK,OAAO,EAAE,MAAM,cAAc,CAAC;AAC/D,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAE/D,OAAO,EACL,KAAK,oBAAoB,EAE1B,MAAM,eAAe,CAAC;AAEvB,MAAM,WAAW,uBAAuB;IACtC,YAAY,EAAE,YAAY,GAAG,IAAI,CAAC;IAClC,YAAY,EAAE,eAAe,EAAE,CAAC;IAChC,KAAK,EAAE,aAAa,CAAC,IAAI,CAAC,CAAC;CAC5B;AAED,wBAAsB,oBAAoB,CACxC,OAAO,EAAE,OAAO,EAChB,QAAQ,EAAE,oBAAoB,GAC7B,OAAO,CAAC,uBAAuB,CAAC,CAqBlC"}
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import { probeResultsToQuads } from './report.js';
|
|
2
2
|
import { NoDistributionAvailable, } from './resolver.js';
|
|
3
3
|
export async function resolveDistributions(dataset, resolver) {
|
|
4
|
-
const
|
|
4
|
+
const probed = await resolver.probe(dataset);
|
|
5
|
+
const result = await resolver.resolve(probed);
|
|
5
6
|
if (result instanceof NoDistributionAvailable) {
|
|
6
7
|
return {
|
|
7
8
|
distribution: null,
|
|
@@ -16,34 +16,68 @@ export declare class NoDistributionAvailable {
|
|
|
16
16
|
readonly importFailed?: ImportFailed | undefined;
|
|
17
17
|
constructor(dataset: Dataset, message: string, probeResults: ProbeResultType[], importFailed?: ImportFailed | undefined);
|
|
18
18
|
}
|
|
19
|
-
/**
|
|
19
|
+
/**
|
|
20
|
+
* The distribution a dataset will be processed from, paired with its probe
|
|
21
|
+
* result. Drives the source-change fingerprint: a live SPARQL endpoint yields
|
|
22
|
+
* `null` (always reprocess), a data dump yields its change fingerprint.
|
|
23
|
+
*/
|
|
24
|
+
export interface ProbedSource {
|
|
25
|
+
distribution: Distribution;
|
|
26
|
+
probeResult: ProbeResultType;
|
|
27
|
+
}
|
|
28
|
+
/**
|
|
29
|
+
* The outcome of the probe phase: every distribution’s probe result, plus the
|
|
30
|
+
* {@link ProbedSource} that will be used to process the dataset (or `null` if
|
|
31
|
+
* none is available). Determined without importing, so the pipeline can decide
|
|
32
|
+
* to skip a dataset before paying the import cost.
|
|
33
|
+
*/
|
|
34
|
+
export declare class ProbedDistributions {
|
|
35
|
+
readonly dataset: Dataset;
|
|
36
|
+
readonly probeResults: ProbeResultType[];
|
|
37
|
+
readonly source: ProbedSource | null;
|
|
38
|
+
constructor(dataset: Dataset, probeResults: ProbeResultType[], source: ProbedSource | null);
|
|
39
|
+
}
|
|
40
|
+
/** Callbacks fired during distribution probing and resolution. */
|
|
20
41
|
export interface ResolveCallbacks {
|
|
21
|
-
/** Called each time a single distribution probe completes. */
|
|
42
|
+
/** Called each time a single distribution probe completes (probe phase). */
|
|
22
43
|
onProbe?: (distribution: Distribution, result: ProbeResultType) => void;
|
|
23
|
-
/** Called when a data-dump import begins. */
|
|
44
|
+
/** Called when a data-dump import begins (resolve phase). */
|
|
24
45
|
onImportStart?: () => void;
|
|
25
|
-
/** Called when importing a distribution fails. */
|
|
46
|
+
/** Called when importing a distribution fails (resolve phase). */
|
|
26
47
|
onImportFailed?: (distribution: Distribution, error: string) => void;
|
|
27
48
|
}
|
|
49
|
+
/**
|
|
50
|
+
* Resolves a dataset to a usable distribution in two phases so the pipeline can
|
|
51
|
+
* gate on a dataset’s source-change fingerprint before paying any import cost:
|
|
52
|
+
*
|
|
53
|
+
* 1. {@link probe} probes every distribution and selects the source-to-be,
|
|
54
|
+
* without importing.
|
|
55
|
+
* 2. {@link resolve} turns that probed source into a usable SPARQL endpoint,
|
|
56
|
+
* importing a data dump only when the source is one.
|
|
57
|
+
*/
|
|
28
58
|
export interface DistributionResolver {
|
|
29
|
-
|
|
59
|
+
probe(dataset: Dataset, callbacks?: ResolveCallbacks): Promise<ProbedDistributions>;
|
|
60
|
+
resolve(probed: ProbedDistributions, callbacks?: ResolveCallbacks): Promise<ResolvedDistribution | NoDistributionAvailable>;
|
|
30
61
|
cleanup?(): Promise<void>;
|
|
31
62
|
}
|
|
32
63
|
export interface SparqlDistributionResolverOptions {
|
|
33
64
|
timeout?: number;
|
|
34
65
|
}
|
|
35
66
|
/**
|
|
36
|
-
* Resolves a dataset to
|
|
67
|
+
* Resolves a dataset to its own SPARQL endpoint by probing its distributions.
|
|
37
68
|
*
|
|
38
|
-
*
|
|
39
|
-
*
|
|
40
|
-
*
|
|
69
|
+
* {@link probe} returns the first valid SPARQL endpoint as the
|
|
70
|
+
* {@link ProbedSource}; {@link resolve} returns it as a
|
|
71
|
+
* {@link ResolvedDistribution}, or {@link NoDistributionAvailable} when none
|
|
72
|
+
* responded. Never imports a data dump – wrap with {@link ImportResolver} for
|
|
73
|
+
* that.
|
|
41
74
|
*
|
|
42
75
|
* Does not mutate `dataset.distributions`.
|
|
43
76
|
*/
|
|
44
77
|
export declare class SparqlDistributionResolver implements DistributionResolver {
|
|
45
78
|
private readonly timeout;
|
|
46
79
|
constructor(options?: SparqlDistributionResolverOptions);
|
|
47
|
-
|
|
80
|
+
probe(dataset: Dataset, callbacks?: ResolveCallbacks): Promise<ProbedDistributions>;
|
|
81
|
+
resolve(probed: ProbedDistributions): Promise<ResolvedDistribution | NoDistributionAvailable>;
|
|
48
82
|
}
|
|
49
83
|
//# sourceMappingURL=resolver.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"resolver.d.ts","sourceRoot":"","sources":["../../src/distribution/resolver.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACrD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,EAGL,KAAK,eAAe,EACrB,MAAM,yBAAyB,CAAC;AAEjC,qBAAa,oBAAoB;IAE7B,QAAQ,CAAC,YAAY,EAAE,YAAY;IACnC,QAAQ,CAAC,YAAY,EAAE,eAAe,EAAE;IACxC,QAAQ,CAAC,YAAY,CAAC,EAAE,YAAY;IACpC,QAAQ,CAAC,cAAc,CAAC,EAAE,MAAM;IAChC,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM;gBAJpB,YAAY,EAAE,YAAY,EAC1B,YAAY,EAAE,eAAe,EAAE,EAC/B,YAAY,CAAC,EAAE,YAAY,YAAA,EAC3B,cAAc,CAAC,EAAE,MAAM,YAAA,EACvB,WAAW,CAAC,EAAE,MAAM,YAAA;CAEhC;AAED,qBAAa,uBAAuB;IAEhC,QAAQ,CAAC,OAAO,EAAE,OAAO;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM;IACxB,QAAQ,CAAC,YAAY,EAAE,eAAe,EAAE;IACxC,QAAQ,CAAC,YAAY,CAAC,EAAE,YAAY;gBAH3B,OAAO,EAAE,OAAO,EAChB,OAAO,EAAE,MAAM,EACf,YAAY,EAAE,eAAe,EAAE,EAC/B,YAAY,CAAC,EAAE,YAAY,YAAA;CAEvC;AAED,
|
|
1
|
+
{"version":3,"file":"resolver.d.ts","sourceRoot":"","sources":["../../src/distribution/resolver.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACrD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,EAGL,KAAK,eAAe,EACrB,MAAM,yBAAyB,CAAC;AAEjC,qBAAa,oBAAoB;IAE7B,QAAQ,CAAC,YAAY,EAAE,YAAY;IACnC,QAAQ,CAAC,YAAY,EAAE,eAAe,EAAE;IACxC,QAAQ,CAAC,YAAY,CAAC,EAAE,YAAY;IACpC,QAAQ,CAAC,cAAc,CAAC,EAAE,MAAM;IAChC,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM;gBAJpB,YAAY,EAAE,YAAY,EAC1B,YAAY,EAAE,eAAe,EAAE,EAC/B,YAAY,CAAC,EAAE,YAAY,YAAA,EAC3B,cAAc,CAAC,EAAE,MAAM,YAAA,EACvB,WAAW,CAAC,EAAE,MAAM,YAAA;CAEhC;AAED,qBAAa,uBAAuB;IAEhC,QAAQ,CAAC,OAAO,EAAE,OAAO;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM;IACxB,QAAQ,CAAC,YAAY,EAAE,eAAe,EAAE;IACxC,QAAQ,CAAC,YAAY,CAAC,EAAE,YAAY;gBAH3B,OAAO,EAAE,OAAO,EAChB,OAAO,EAAE,MAAM,EACf,YAAY,EAAE,eAAe,EAAE,EAC/B,YAAY,CAAC,EAAE,YAAY,YAAA;CAEvC;AAED;;;;GAIG;AACH,MAAM,WAAW,YAAY;IAC3B,YAAY,EAAE,YAAY,CAAC;IAC3B,WAAW,EAAE,eAAe,CAAC;CAC9B;AAED;;;;;GAKG;AACH,qBAAa,mBAAmB;IAE5B,QAAQ,CAAC,OAAO,EAAE,OAAO;IACzB,QAAQ,CAAC,YAAY,EAAE,eAAe,EAAE;IACxC,QAAQ,CAAC,MAAM,EAAE,YAAY,GAAG,IAAI;gBAF3B,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,eAAe,EAAE,EAC/B,MAAM,EAAE,YAAY,GAAG,IAAI;CAEvC;AAED,kEAAkE;AAClE,MAAM,WAAW,gBAAgB;IAC/B,4EAA4E;IAC5E,OAAO,CAAC,EAAE,CAAC,YAAY,EAAE,YAAY,EAAE,MAAM,EAAE,eAAe,KAAK,IAAI,CAAC;IACxE,6DAA6D;IAC7D,aAAa,CAAC,EAAE,MAAM,IAAI,CAAC;IAC3B,kEAAkE;IAClE,cAAc,CAAC,EAAE,CAAC,YAAY,EAAE,YAAY,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;CACtE;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,oBAAoB;IACnC,KAAK,CACH,OAAO,EAAE,OAAO,EAChB,SAAS,CAAC,EAAE,gBAAgB,GAC3B,OAAO,CAAC,mBAAmB,CAAC,CAAC;IAChC,OAAO,CACL,MAAM,EAAE,mBAAmB,EAC3B,SAAS,CAAC,EAAE,gBAAgB,GAC3B,OAAO,CAAC,oBAAoB,GAAG,uBAAuB,CAAC,CAAC;IAC3D,OAAO,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CAC3B;AAED,MAAM,WAAW,iCAAiC;IAChD,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;;;;;;;;;GAUG;AACH,qBAAa,0BAA2B,YAAW,oBAAoB;IACrE,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;gBAErB,OAAO,CAAC,EAAE,iCAAiC;IAIjD,KAAK,CACT,OAAO,EAAE,OAAO,EAChB,SAAS,CAAC,EAAE,gBAAgB,GAC3B,OAAO,CAAC,mBAAmB,CAA
C;IA4BzB,OAAO,CACX,MAAM,EAAE,mBAAmB,GAC1B,OAAO,CAAC,oBAAoB,GAAG,uBAAuB,CAAC;CAc3D"}
|
|
@@ -26,11 +26,29 @@ export class NoDistributionAvailable {
|
|
|
26
26
|
}
|
|
27
27
|
}
|
|
28
28
|
/**
|
|
29
|
-
*
|
|
29
|
+
* The outcome of the probe phase: every distribution’s probe result, plus the
|
|
30
|
+
* {@link ProbedSource} that will be used to process the dataset (or `null` if
|
|
31
|
+
* none is available). Determined without importing, so the pipeline can decide
|
|
32
|
+
* to skip a dataset before paying the import cost.
|
|
33
|
+
*/
|
|
34
|
+
export class ProbedDistributions {
|
|
35
|
+
dataset;
|
|
36
|
+
probeResults;
|
|
37
|
+
source;
|
|
38
|
+
constructor(dataset, probeResults, source) {
|
|
39
|
+
this.dataset = dataset;
|
|
40
|
+
this.probeResults = probeResults;
|
|
41
|
+
this.source = source;
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
/**
|
|
45
|
+
* Resolves a dataset to its own SPARQL endpoint by probing its distributions.
|
|
30
46
|
*
|
|
31
|
-
*
|
|
32
|
-
*
|
|
33
|
-
*
|
|
47
|
+
* {@link probe} returns the first valid SPARQL endpoint as the
|
|
48
|
+
* {@link ProbedSource}; {@link resolve} returns it as a
|
|
49
|
+
* {@link ResolvedDistribution}, or {@link NoDistributionAvailable} when none
|
|
50
|
+
* responded. Never imports a data dump – wrap with {@link ImportResolver} for
|
|
51
|
+
* that.
|
|
34
52
|
*
|
|
35
53
|
* Does not mutate `dataset.distributions`.
|
|
36
54
|
*/
|
|
@@ -39,22 +57,30 @@ export class SparqlDistributionResolver {
|
|
|
39
57
|
constructor(options) {
|
|
40
58
|
this.timeout = options?.timeout ?? 5000;
|
|
41
59
|
}
|
|
42
|
-
async
|
|
60
|
+
async probe(dataset, callbacks) {
|
|
43
61
|
const results = await Promise.all(dataset.distributions.map(async (distribution) => {
|
|
44
62
|
const result = await probe(distribution, { timeoutMs: this.timeout });
|
|
45
63
|
callbacks?.onProbe?.(distribution, result);
|
|
46
64
|
return result;
|
|
47
65
|
}));
|
|
48
66
|
// Find first valid SPARQL endpoint.
|
|
67
|
+
let source = null;
|
|
49
68
|
for (let i = 0; i < dataset.distributions.length; i++) {
|
|
50
69
|
const distribution = dataset.distributions[i];
|
|
51
70
|
const result = results[i];
|
|
52
71
|
if (distribution.isSparql() &&
|
|
53
72
|
result instanceof SparqlProbeResult &&
|
|
54
73
|
result.isSuccess()) {
|
|
55
|
-
|
|
74
|
+
source = { distribution, probeResult: result };
|
|
75
|
+
break;
|
|
56
76
|
}
|
|
57
77
|
}
|
|
58
|
-
return new
|
|
78
|
+
return new ProbedDistributions(dataset, results, source);
|
|
79
|
+
}
|
|
80
|
+
async resolve(probed) {
|
|
81
|
+
if (probed.source && probed.source.distribution.isSparql()) {
|
|
82
|
+
return new ResolvedDistribution(probed.source.distribution, probed.probeResults);
|
|
83
|
+
}
|
|
84
|
+
return new NoDistributionAvailable(probed.dataset, 'No SPARQL endpoint available', probed.probeResults);
|
|
59
85
|
}
|
|
60
86
|
}
|
package/dist/index.d.ts
CHANGED
|
@@ -8,6 +8,7 @@ export * from './stage.js';
|
|
|
8
8
|
export * from './stageOutputResolver.js';
|
|
9
9
|
export * from './sparql/index.js';
|
|
10
10
|
export * from './distribution/index.js';
|
|
11
|
+
export * from './provenance/index.js';
|
|
11
12
|
export * from './writer/index.js';
|
|
12
13
|
export * from './plugin/namespaceNormalization.js';
|
|
13
14
|
export * from './plugin/provenance.js';
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC;AAC/B,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,eAAe,CAAC;AAC9B,cAAc,uBAAuB,CAAC;AACtC,cAAc,eAAe,CAAC;AAC9B,cAAc,YAAY,CAAC;AAC3B,cAAc,0BAA0B,CAAC;AACzC,cAAc,mBAAmB,CAAC;AAClC,cAAc,yBAAyB,CAAC;AACxC,cAAc,mBAAmB,CAAC;AAClC,cAAc,oCAAoC,CAAC;AACnD,cAAc,wBAAwB,CAAC;AACvC,cAAc,oCAAoC,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC;AAC/B,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,eAAe,CAAC;AAC9B,cAAc,uBAAuB,CAAC;AACtC,cAAc,eAAe,CAAC;AAC9B,cAAc,YAAY,CAAC;AAC3B,cAAc,0BAA0B,CAAC;AACzC,cAAc,mBAAmB,CAAC;AAClC,cAAc,yBAAyB,CAAC;AACxC,cAAc,uBAAuB,CAAC;AACtC,cAAc,mBAAmB,CAAC;AAClC,cAAc,oCAAoC,CAAC;AACnD,cAAc,wBAAwB,CAAC;AACvC,cAAc,oCAAoC,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -8,6 +8,7 @@ export * from './stage.js';
|
|
|
8
8
|
export * from './stageOutputResolver.js';
|
|
9
9
|
export * from './sparql/index.js';
|
|
10
10
|
export * from './distribution/index.js';
|
|
11
|
+
export * from './provenance/index.js';
|
|
11
12
|
export * from './writer/index.js';
|
|
12
13
|
export * from './plugin/namespaceNormalization.js';
|
|
13
14
|
export * from './plugin/provenance.js';
|
package/dist/pipeline.d.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { Stage } from './stage.js';
|
|
|
4
4
|
import type { QuadTransform } from './stage.js';
|
|
5
5
|
import type { Writer } from './writer/writer.js';
|
|
6
6
|
import { type DistributionResolver } from './distribution/resolver.js';
|
|
7
|
+
import type { ProvenanceStore } from './provenance/store.js';
|
|
7
8
|
import type { StageOutputResolver } from './stageOutputResolver.js';
|
|
8
9
|
import type { ProgressReporter } from './progressReporter.js';
|
|
9
10
|
import { type TimeoutPolicy } from './sparql/timeoutPolicy.js';
|
|
@@ -32,6 +33,22 @@ export interface PipelineOptions {
|
|
|
32
33
|
outputDir: string;
|
|
33
34
|
};
|
|
34
35
|
reporter?: ProgressReporter;
|
|
36
|
+
/**
|
|
37
|
+
* Optional per-dataset processing memory. When set, the pipeline skips a
|
|
38
|
+
* dataset whose source-change fingerprint and {@link pipelineVersion} both
|
|
39
|
+
* match the stored record – before paying the import cost – and writes an
|
|
40
|
+
* updated record after processing. When omitted, every dataset is
|
|
41
|
+
* reprocessed (today’s behaviour).
|
|
42
|
+
*/
|
|
43
|
+
provenanceStore?: ProvenanceStore;
|
|
44
|
+
/**
|
|
45
|
+
* Opaque, consumer-declared version of the pipeline’s output-affecting
|
|
46
|
+
* logic, rotated only on releases that change output. Compared for equality,
|
|
47
|
+
* never parsed or ordered. Required when {@link provenanceStore} is set (a
|
|
48
|
+
* skip-enabled pipeline with no version would silently freeze); ignored
|
|
49
|
+
* otherwise.
|
|
50
|
+
*/
|
|
51
|
+
pipelineVersion?: string;
|
|
35
52
|
/**
|
|
36
53
|
* Factory producing a fresh {@link TimeoutPolicy} per dataset. Defaults
|
|
37
54
|
* to {@link constantTimeoutPolicy}`(300_000)` so existing call sites
|
|
@@ -53,9 +70,13 @@ export declare class Pipeline {
|
|
|
53
70
|
private readonly chaining?;
|
|
54
71
|
private readonly reporter?;
|
|
55
72
|
private readonly timeoutFactory;
|
|
73
|
+
private readonly provenanceStore?;
|
|
74
|
+
private readonly pipelineVersion?;
|
|
56
75
|
constructor(options: PipelineOptions);
|
|
57
76
|
run(): Promise<void>;
|
|
58
77
|
private processDataset;
|
|
78
|
+
/** Persist the processing record for a dataset, when a store is configured. */
|
|
79
|
+
private recordOutcome;
|
|
59
80
|
private reportValidators;
|
|
60
81
|
private collectStages;
|
|
61
82
|
/**
|
package/dist/pipeline.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,OAAO,EAAgB,MAAM,cAAc,CAAC;AAGrD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AACrD,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAChD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO,EACL,KAAK,oBAAoB,
|
|
1
|
+
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,OAAO,EAAgB,MAAM,cAAc,CAAC;AAGrD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AACrD,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAChD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO,EACL,KAAK,oBAAoB,EAG1B,MAAM,4BAA4B,CAAC;AAKpC,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAO7D,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,0BAA0B,CAAC;AACpE,OAAO,KAAK,EAEV,gBAAgB,EACjB,MAAM,uBAAuB,CAAC;AAE/B,OAAO,EAEL,KAAK,aAAa,EACnB,MAAM,2BAA2B,CAAC;AAEnC,wDAAwD;AACxD,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb;;;;;OAKG;IACH,gBAAgB,CAAC,EAAE,aAAa,CAAC;QAAE,OAAO,EAAE,OAAO,CAAA;KAAE,CAAC,CAAC;CACxD;AAED,MAAM,WAAW,eAAe;IAC9B,eAAe,EAAE,eAAe,CAAC;IACjC,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IAC3B,OAAO,CAAC,EAAE,cAAc,EAAE,CAAC;IAC3B,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,oBAAoB,CAAC,EAAE,oBAAoB,CAAC;IAC5C,QAAQ,CAAC,EAAE;QACT,mBAAmB,EAAE,mBAAmB,CAAC;QACzC,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,QAAQ,CAAC,EAAE,gBAAgB,CAAC;IAC5B;;;;;;OAMG;IACH,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC;;;;;;OAMG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB;;;;;;;;;OASG;IACH,OAAO,CAAC,EAAE,MAAM,aAAa,CAAC;CAC/B;AAgFD,qBAAa,QAAQ;IACnB,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAS;IAC9B,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAkB;IAClD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAU;IACjC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAS;IAChC,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAuB;IAC5D,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAA8B;IACxD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAmB;IAC7C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAsB;IACrD,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAkB;IACnD,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAS;gBAE9B,OAAO,EAAE,eAAe;IA0C9B,GAAG,IAAI,OAAO,CAAC,IAAI,CAAC;YAoBZ,cAAc;IA0I5B,+EAA+E;YACjE,aAAa;YAmBb,gBAAgB;IAW9B,OAAO,CAAE,aAAa;IAOtB;;;OAGG;YACW,QAAQ;IA0CtB,2EAA2E;YAC7D,eAAe;YAqBf,QAAQ;YA2DP,SAAS;CAczB"}
|
package/dist/pipeline.js
CHANGED
|
@@ -3,6 +3,8 @@ import { StreamParser } from 'n3';
|
|
|
3
3
|
import { FileWriter } from './writer/fileWriter.js';
|
|
4
4
|
import { NoDistributionAvailable, } from './distribution/resolver.js';
|
|
5
5
|
import { SparqlDistributionResolver } from './distribution/index.js';
|
|
6
|
+
import { sourceFingerprint } from './provenance/sourceFingerprint.js';
|
|
7
|
+
import { shouldReprocess } from './provenance/reprocessDecision.js';
|
|
6
8
|
import { NetworkError, SparqlProbeResult, } from '@lde/distribution-probe';
|
|
7
9
|
import { NotSupported } from './sparql/executor.js';
|
|
8
10
|
import { ConstantTimeoutPolicy, } from './sparql/timeoutPolicy.js';
|
|
@@ -81,11 +83,16 @@ export class Pipeline {
|
|
|
81
83
|
chaining;
|
|
82
84
|
reporter;
|
|
83
85
|
timeoutFactory;
|
|
86
|
+
provenanceStore;
|
|
87
|
+
pipelineVersion;
|
|
84
88
|
constructor(options) {
|
|
85
89
|
const hasSubStages = options.stages.some((stage) => stage.stages.length > 0);
|
|
86
90
|
if (hasSubStages && !options.chaining) {
|
|
87
91
|
throw new Error('chaining is required when any stage has sub-stages');
|
|
88
92
|
}
|
|
93
|
+
if (options.provenanceStore && options.pipelineVersion === undefined) {
|
|
94
|
+
throw new Error('pipelineVersion is required when a provenanceStore is configured');
|
|
95
|
+
}
|
|
89
96
|
this.name = options.name ?? '';
|
|
90
97
|
this.datasetSelector = options.datasetSelector;
|
|
91
98
|
this.stages = options.stages;
|
|
@@ -106,6 +113,8 @@ export class Pipeline {
|
|
|
106
113
|
this.reporter = options.reporter;
|
|
107
114
|
this.timeoutFactory =
|
|
108
115
|
options.timeout ?? (() => new ConstantTimeoutPolicy(300_000));
|
|
116
|
+
this.provenanceStore = options.provenanceStore;
|
|
117
|
+
this.pipelineVersion = options.pipelineVersion;
|
|
109
118
|
}
|
|
110
119
|
async run() {
|
|
111
120
|
const start = Date.now();
|
|
@@ -125,17 +134,48 @@ export class Pipeline {
|
|
|
125
134
|
}
|
|
126
135
|
async processDataset(dataset) {
|
|
127
136
|
this.reporter?.datasetStart?.(dataset);
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
onTighten: (event) => this.reporter?.timeoutTightened?.(event),
|
|
131
|
-
onRelax: (event) => this.reporter?.timeoutRelaxed?.(event),
|
|
132
|
-
});
|
|
133
|
-
let resolved;
|
|
137
|
+
// Probe phase: gather probe results and the source-to-be, without importing.
|
|
138
|
+
let probed;
|
|
134
139
|
try {
|
|
135
|
-
|
|
140
|
+
probed = await this.distributionResolver.probe(dataset, {
|
|
136
141
|
onProbe: (distribution, result) => {
|
|
137
142
|
this.reporter?.distributionProbed?.(mapProbeResult(distribution, result));
|
|
138
143
|
},
|
|
144
|
+
});
|
|
145
|
+
}
|
|
146
|
+
catch (error) {
|
|
147
|
+
this.reporter?.datasetSkipped?.(dataset, `Distribution probing failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
148
|
+
return;
|
|
149
|
+
}
|
|
150
|
+
// Derive the source-change fingerprint from the probed source: null for a
|
|
151
|
+
// live SPARQL endpoint (always reprocess) or when no source is available.
|
|
152
|
+
const fingerprint = probed.source
|
|
153
|
+
? sourceFingerprint(probed.source.distribution, probed.source.probeResult)
|
|
154
|
+
: null;
|
|
155
|
+
// Gate: skip an unchanged dataset before paying any import cost.
|
|
156
|
+
if (this.provenanceStore) {
|
|
157
|
+
let stored = null;
|
|
158
|
+
try {
|
|
159
|
+
stored = await this.provenanceStore.get(dataset.iri);
|
|
160
|
+
}
|
|
161
|
+
catch {
|
|
162
|
+
// An unreadable record must not abort the whole run, nor wrongly skip:
|
|
163
|
+
// treat it as ‘never processed’ so this dataset reprocesses. The
|
|
164
|
+
// periodic full reprocess is the backstop.
|
|
165
|
+
stored = null;
|
|
166
|
+
}
|
|
167
|
+
if (!shouldReprocess({
|
|
168
|
+
sourceFingerprint: fingerprint,
|
|
169
|
+
pipelineVersion: this.pipelineVersion,
|
|
170
|
+
}, stored)) {
|
|
171
|
+
this.reporter?.datasetSkipped?.(dataset, 'Unchanged since last run');
|
|
172
|
+
return;
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
// Resolve phase: import a data dump only when the source is one.
|
|
176
|
+
let resolved;
|
|
177
|
+
try {
|
|
178
|
+
resolved = await this.distributionResolver.resolve(probed, {
|
|
139
179
|
onImportStart: () => {
|
|
140
180
|
this.reporter?.importStarted?.();
|
|
141
181
|
},
|
|
@@ -149,10 +189,20 @@ export class Pipeline {
|
|
|
149
189
|
return;
|
|
150
190
|
}
|
|
151
191
|
if (resolved instanceof NoDistributionAvailable) {
|
|
192
|
+
// Record the failure so a dataset whose source is unchanged is not
|
|
193
|
+
// re-imported every run; it is retried at the next fingerprint change or
|
|
194
|
+
// version rotation.
|
|
195
|
+
await this.recordOutcome(dataset, fingerprint, 'failed');
|
|
152
196
|
this.reporter?.datasetSkipped?.(dataset, resolved.message);
|
|
153
197
|
return;
|
|
154
198
|
}
|
|
155
199
|
this.reporter?.distributionSelected?.(dataset, resolved.distribution, resolved.importedFrom, resolved.importDuration, resolved.tripleCount);
|
|
200
|
+
const timeout = this.timeoutFactory();
|
|
201
|
+
const unsubscribe = timeout.subscribe?.({
|
|
202
|
+
onTighten: (event) => this.reporter?.timeoutTightened?.(event),
|
|
203
|
+
onRelax: (event) => this.reporter?.timeoutRelaxed?.(event),
|
|
204
|
+
});
|
|
205
|
+
let stageFailed = false;
|
|
156
206
|
try {
|
|
157
207
|
for (const stage of this.stages) {
|
|
158
208
|
try {
|
|
@@ -164,6 +214,7 @@ export class Pipeline {
|
|
|
164
214
|
}
|
|
165
215
|
}
|
|
166
216
|
catch (error) {
|
|
217
|
+
stageFailed = true;
|
|
167
218
|
this.reporter?.stageFailed?.(stage.name, error instanceof Error ? error : new Error(String(error)));
|
|
168
219
|
}
|
|
169
220
|
}
|
|
@@ -174,12 +225,32 @@ export class Pipeline {
|
|
|
174
225
|
}
|
|
175
226
|
await this.writer.flush?.(dataset);
|
|
176
227
|
await this.reportValidators(dataset);
|
|
228
|
+
// A dataset whose stages threw produced incomplete output; record it as
|
|
229
|
+
// ‘failed’ rather than freezing a broken result under a ‘success’ record.
|
|
230
|
+
await this.recordOutcome(dataset, fingerprint, stageFailed ? 'failed' : 'success');
|
|
177
231
|
const datasetMemory = process.memoryUsage();
|
|
178
232
|
this.reporter?.datasetComplete?.(dataset, {
|
|
179
233
|
memoryUsageBytes: datasetMemory.rss,
|
|
180
234
|
heapUsedBytes: datasetMemory.heapUsed,
|
|
181
235
|
});
|
|
182
236
|
}
|
|
237
|
+
/** Persist the processing record for a dataset, when a store is configured: the given fingerprint and status, the configured pipelineVersion, and the current time as an ISO 8601 string. Does nothing without a store; an error thrown by the store is swallowed. */
|
|
238
|
+
async recordOutcome(dataset, fingerprint, status) {
|
|
239
|
+
if (!this.provenanceStore)
|
|
240
|
+
return;
|
|
241
|
+
try {
|
|
242
|
+
await this.provenanceStore.set(dataset.iri, {
|
|
243
|
+
sourceFingerprint: fingerprint,
|
|
244
|
+
pipelineVersion: this.pipelineVersion,
|
|
245
|
+
generatedAt: new Date().toISOString(),
|
|
246
|
+
status,
|
|
247
|
+
});
|
|
248
|
+
}
|
|
249
|
+
catch {
|
|
250
|
+
// A failed write must not abort the run; the dataset simply reprocesses
|
|
251
|
+
// next run, its record not yet updated.
|
|
252
|
+
}
|
|
253
|
+
}
|
|
183
254
|
async reportValidators(dataset) {
|
|
184
255
|
const validators = new Set();
|
|
185
256
|
for (const stage of this.collectStages(this.stages)) {
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
|
|
2
|
+
import type { ProcessingRecord } from './record.js';
|
|
3
|
+
import type { ProvenanceStore } from './store.js';
|
|
4
|
+
export interface FileLoadedSparqlProvenanceStoreOptions {
|
|
5
|
+
/** Read-only SPARQL endpoint to query for previously-loaded records. */
|
|
6
|
+
queryEndpoint: URL;
|
|
7
|
+
/**
|
|
8
|
+
* The pipeline’s IRI, used as the named graph that scopes this pipeline’s
|
|
9
|
+
* records so multiple pipelines sharing one triplestore do not collide.
|
|
10
|
+
*/
|
|
11
|
+
pipelineIri: URL;
|
|
12
|
+
/**
|
|
13
|
+
* Directory the records are written to as files, to be bulk-loaded into the
|
|
14
|
+
* read-only triplestore after the run. Kept separate from the data output
|
|
15
|
+
* directory so filenames (keyed by dataset URI) never collide.
|
|
16
|
+
*/
|
|
17
|
+
outputDir: string;
|
|
18
|
+
/**
|
|
19
|
+
* Optional {@link SparqlEndpointFetcher} for the query side, intended for
|
|
20
|
+
* tests. Defaults to a fresh instance.
|
|
21
|
+
*/
|
|
22
|
+
fetcher?: SparqlEndpointFetcher;
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* A {@link ProvenanceStore} for a triplestore that is served read-only and
|
|
26
|
+
* rebuilt by bulk-loading files (e.g. QLever).
|
|
27
|
+
*
|
|
28
|
+
* Reads through SPARQL queries against the live endpoint (records loaded from
|
|
29
|
+
* a previous run); writes the records as files for the next bulk-load, since
|
|
30
|
+
* the endpoint accepts no SPARQL UPDATE. Records are flat PROV-O keyed by the
|
|
31
|
+
* dataset URI, written into the pipeline-scoped provenance named graph.
|
|
32
|
+
*/
|
|
33
|
+
export declare class FileLoadedSparqlProvenanceStore implements ProvenanceStore {
|
|
34
|
+
private readonly queryEndpoint;
|
|
35
|
+
private readonly pipelineIri;
|
|
36
|
+
private readonly writer;
|
|
37
|
+
private readonly fetcher;
|
|
38
|
+
constructor(options: FileLoadedSparqlProvenanceStoreOptions);
|
|
39
|
+
get(datasetUri: URL): Promise<ProcessingRecord | null>;
|
|
40
|
+
private selectQuery;
|
|
41
|
+
set(datasetUri: URL, record: ProcessingRecord): Promise<void>;
|
|
42
|
+
private toQuads;
|
|
43
|
+
}
|
|
44
|
+
//# sourceMappingURL=fileLoadedSparqlProvenanceStore.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fileLoadedSparqlProvenanceStore.d.ts","sourceRoot":"","sources":["../../src/provenance/fileLoadedSparqlProvenanceStore.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAC;AAE9D,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AACpD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AASlD,MAAM,WAAW,sCAAsC;IACrD,wEAAwE;IACxE,aAAa,EAAE,GAAG,CAAC;IACnB;;;OAGG;IACH,WAAW,EAAE,GAAG,CAAC;IACjB;;;;OAIG;IACH,SAAS,EAAE,MAAM,CAAC;IAClB;;;OAGG;IACH,OAAO,CAAC,EAAE,qBAAqB,CAAC;CACjC;AAED;;;;;;;;GAQG;AACH,qBAAa,+BAAgC,YAAW,eAAe;IACrE,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAM;IACpC,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAM;IAClC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAa;IACpC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAwB;gBAEpC,OAAO,EAAE,sCAAsC;IAWrD,GAAG,CAAC,UAAU,EAAE,GAAG,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC;IAoB5D,OAAO,CAAC,WAAW;IAmBb,GAAG,CAAC,UAAU,EAAE,GAAG,EAAE,MAAM,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;YAMpD,OAAO;CA0BvB"}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import { Dataset, assertSafeIri } from '@lde/dataset';
|
|
2
|
+
import { DataFactory } from 'n3';
|
|
3
|
+
import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
|
|
4
|
+
import { FileWriter } from '../writer/fileWriter.js';
|
|
5
|
+
const { namedNode, literal, quad } = DataFactory;
|
|
6
|
+
const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
|
|
7
|
+
const PROV = 'http://www.w3.org/ns/prov#';
|
|
8
|
+
const LDE = 'https://w3id.org/lde/provenance#';
|
|
9
|
+
const XSD_DATE_TIME = 'http://www.w3.org/2001/XMLSchema#dateTime';
|
|
10
|
+
/**
|
|
11
|
+
* A {@link ProvenanceStore} for a triplestore that is served read-only and
|
|
12
|
+
* rebuilt by bulk-loading files (e.g. QLever).
|
|
13
|
+
*
|
|
14
|
+
* Reads through SPARQL queries against the live endpoint (records loaded from
|
|
15
|
+
* a previous run); writes the records as files for the next bulk-load, since
|
|
16
|
+
* the endpoint accepts no SPARQL UPDATE. Records are flat PROV-O keyed by the
|
|
17
|
+
* dataset URI, written into the pipeline-scoped provenance named graph.
|
|
18
|
+
*/
|
|
19
|
+
export class FileLoadedSparqlProvenanceStore {
|
|
20
|
+
queryEndpoint;
|
|
21
|
+
pipelineIri;
|
|
22
|
+
writer;
|
|
23
|
+
fetcher;
|
|
24
|
+
/** Keep the query endpoint and pipeline IRI, create an n-quads FileWriter on options.outputDir whose graph IRI is always the pipeline IRI, and default the fetcher to a new SparqlEndpointFetcher. */
constructor(options) {
|
|
25
|
+
this.queryEndpoint = options.queryEndpoint;
|
|
26
|
+
this.pipelineIri = options.pipelineIri;
|
|
27
|
+
this.writer = new FileWriter({
|
|
28
|
+
outputDir: options.outputDir,
|
|
29
|
+
format: 'n-quads',
|
|
30
|
+
graphIri: () => this.pipelineIri,
|
|
31
|
+
});
|
|
32
|
+
this.fetcher = options.fetcher ?? new SparqlEndpointFetcher();
|
|
33
|
+
}
|
|
34
|
+
/** Run the select query against the query endpoint and map the first binding to a record; resolves to null when the query yields no binding. */
async get(datasetUri) {
|
|
35
|
+
const stream = (await this.fetcher.fetchBindings(this.queryEndpoint.toString(), this.selectQuery(datasetUri)));
|
|
36
|
+
for await (const binding of stream) {
|
|
37
|
+
// A record exists iff the mandatory fields bound; the fingerprint is
|
|
38
|
+
// optional and absent for a run with no establishable signal.
|
|
39
|
+
return {
|
|
40
|
+
sourceFingerprint: binding.fingerprint?.value ?? null,
|
|
41
|
+
pipelineVersion: binding.version.value,
|
|
42
|
+
generatedAt: binding.generatedAt.value,
|
|
43
|
+
status: binding.status.value,
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
return null;
|
|
47
|
+
}
|
|
48
|
+
/** Build the SELECT query (LIMIT 1) for the dataset's record inside the pipeline's named graph; version, status and generatedAt are required, the fingerprint is OPTIONAL. Both IRIs pass through assertSafeIri first. */
selectQuery(datasetUri) {
|
|
49
|
+
// Guard before interpolating into a SPARQL `<…>` reference, mirroring
|
|
50
|
+
// SparqlUpdateWriter. URL normalisation already encodes unsafe characters,
|
|
51
|
+
// so this is defence-in-depth against a non-normalised IRI reaching here.
|
|
52
|
+
const datasetIri = datasetUri.toString();
|
|
53
|
+
const pipelineIri = this.pipelineIri.toString();
|
|
54
|
+
assertSafeIri(datasetIri);
|
|
55
|
+
assertSafeIri(pipelineIri);
|
|
56
|
+
const dataset = `<${datasetIri}>`;
|
|
57
|
+
return `SELECT ?fingerprint ?version ?status ?generatedAt WHERE {
|
|
58
|
+
GRAPH <${pipelineIri}> {
|
|
59
|
+
${dataset} <${LDE}pipelineVersion> ?version ;
|
|
60
|
+
<${LDE}status> ?status ;
|
|
61
|
+
<${PROV}generatedAtTime> ?generatedAt .
|
|
62
|
+
OPTIONAL { ${dataset} <${LDE}sourceFingerprint> ?fingerprint }
|
|
63
|
+
}
|
|
64
|
+
} LIMIT 1`;
|
|
65
|
+
}
|
|
66
|
+
/** Wrap the dataset IRI in a Dataset without distributions, write the record's quads through the file writer and flush it; nothing is sent to the query endpoint. */
async set(datasetUri, record) {
|
|
67
|
+
const dataset = new Dataset({ iri: datasetUri, distributions: [] });
|
|
68
|
+
await this.writer.write(dataset, this.toQuads(datasetUri, record));
|
|
69
|
+
await this.writer.flush(dataset);
|
|
70
|
+
}
|
|
71
|
+
/** Yield the record as quads about the dataset IRI: rdf:type prov:Entity, prov:generatedAtTime (xsd:dateTime), lde:pipelineVersion and lde:status, plus lde:sourceFingerprint only when the fingerprint is not null. */
async *toQuads(datasetUri, record) {
|
|
72
|
+
const subject = namedNode(datasetUri.toString());
|
|
73
|
+
yield quad(subject, namedNode(RDF_TYPE), namedNode(`${PROV}Entity`));
|
|
74
|
+
yield quad(subject, namedNode(`${PROV}generatedAtTime`), literal(record.generatedAt, namedNode(XSD_DATE_TIME)));
|
|
75
|
+
if (record.sourceFingerprint !== null) {
|
|
76
|
+
yield quad(subject, namedNode(`${LDE}sourceFingerprint`), literal(record.sourceFingerprint));
|
|
77
|
+
}
|
|
78
|
+
yield quad(subject, namedNode(`${LDE}pipelineVersion`), literal(record.pipelineVersion));
|
|
79
|
+
yield quad(subject, namedNode(`${LDE}status`), literal(record.status));
|
|
80
|
+
}
|
|
81
|
+
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { sourceFingerprint } from './sourceFingerprint.js';
|
|
2
|
+
export { shouldReprocess } from './reprocessDecision.js';
|
|
3
|
+
export type { ProcessingRecord, ChangeKey } from './record.js';
|
|
4
|
+
export type { ProvenanceStore } from './store.js';
|
|
5
|
+
export { FileLoadedSparqlProvenanceStore, type FileLoadedSparqlProvenanceStoreOptions, } from './fileLoadedSparqlProvenanceStore.js';
|
|
6
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/provenance/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAC3D,OAAO,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAC/D,YAAY,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAClD,OAAO,EACL,+BAA+B,EAC/B,KAAK,sCAAsC,GAC5C,MAAM,sCAAsC,CAAC"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* The per-dataset processing memory the pipeline keeps to decide whether a
|
|
3
|
+
* dataset can be skipped on the next run.
|
|
4
|
+
*
|
|
5
|
+
* Both change fields ({@link sourceFingerprint} and {@link pipelineVersion})
|
|
6
|
+
* are opaque strings, compared only for equality – never parsed or ordered.
|
|
7
|
+
*/
|
|
8
|
+
export interface ProcessingRecord {
|
|
9
|
+
/**
|
|
10
|
+
* The source-change fingerprint at the time of processing (see
|
|
11
|
+
* `sourceFingerprint`), or `null` when none could be established (e.g. a live
|
|
12
|
+
* SPARQL endpoint). Derived automatically from observed source metadata, not
|
|
13
|
+
* a declared version. A `null` fingerprint never compares equal, so the
|
|
14
|
+
* dataset is always reprocessed.
|
|
15
|
+
*/
|
|
16
|
+
sourceFingerprint: string | null;
|
|
17
|
+
/**
|
|
18
|
+
* The consumer-declared pipeline version under which the dataset was
|
|
19
|
+
* processed. Kept separate from {@link sourceFingerprint}, never combined
|
|
20
|
+
* into a single fingerprint: the data side is observed, the logic side is
|
|
21
|
+
* intentionally declared.
|
|
22
|
+
*/
|
|
23
|
+
pipelineVersion: string;
|
|
24
|
+
/** ISO timestamp of when the record was written. */
|
|
25
|
+
generatedAt: string;
|
|
26
|
+
/**
|
|
27
|
+
* Whether processing succeeded. Recorded so a dataset that failed but whose
|
|
28
|
+
* source is unchanged is skipped on subsequent runs rather than re-imported
|
|
29
|
+
* every run; it is retried at the next source change or version rotation.
|
|
30
|
+
*/
|
|
31
|
+
status: 'success' | 'failed';
|
|
32
|
+
}
|
|
33
|
+
/** The two fields the skip rule compares for equality. */
|
|
34
|
+
export type ChangeKey = Pick<ProcessingRecord, 'sourceFingerprint' | 'pipelineVersion'>;
|
|
35
|
+
//# sourceMappingURL=record.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"record.d.ts","sourceRoot":"","sources":["../../src/provenance/record.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;;OAMG;IACH,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC;;;;;OAKG;IACH,eAAe,EAAE,MAAM,CAAC;IACxB,oDAAoD;IACpD,WAAW,EAAE,MAAM,CAAC;IACpB;;;;OAIG;IACH,MAAM,EAAE,SAAS,GAAG,QAAQ,CAAC;CAC9B;AAED,0DAA0D;AAC1D,MAAM,MAAM,SAAS,GAAG,IAAI,CAC1B,gBAAgB,EAChB,mBAAmB,GAAG,iBAAiB,CACxC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { ChangeKey, ProcessingRecord } from './record.js';
|
|
2
|
+
/**
|
|
3
|
+
* Decide whether a dataset must be reprocessed, given its current change
|
|
4
|
+
* fields and the record from the last run (or `null` if it has never been
|
|
5
|
+
* processed).
|
|
6
|
+
*
|
|
7
|
+
* The rule is pure equality on the two change fields:
|
|
8
|
+
*
|
|
9
|
+
* ```
|
|
10
|
+
* skip iff stored !== null AND current.sourceFingerprint !== null
|
|
11
|
+
* AND current.sourceFingerprint === stored.sourceFingerprint
|
|
12
|
+
* AND current.pipelineVersion === stored.pipelineVersion
|
|
13
|
+
* ```
|
|
14
|
+
*
|
|
15
|
+
* Equality, never ordering – any opaque version representation works, a
|
|
16
|
+
* rollback to identical logic correctly skips, and a partial run resumes
|
|
17
|
+
* cleanly. A `null` source fingerprint never compares equal, so a dataset with
|
|
18
|
+
* no establishable fingerprint is always reprocessed.
|
|
19
|
+
*/
|
|
20
|
+
export declare function shouldReprocess(current: ChangeKey, stored: ProcessingRecord | null): boolean;
|
|
21
|
+
//# sourceMappingURL=reprocessDecision.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"reprocessDecision.d.ts","sourceRoot":"","sources":["../../src/provenance/reprocessDecision.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAE/D;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,eAAe,CAC7B,OAAO,EAAE,SAAS,EAClB,MAAM,EAAE,gBAAgB,GAAG,IAAI,GAC9B,OAAO,CAOT"}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
 * Decide whether a dataset must be reprocessed, given its current change
 * fields and the record from the last run (`null` – or `undefined` – if it has
 * never been processed).
 *
 * The rule is pure equality on the two change fields:
 *
 * ```
 * skip iff stored is a record
 *      AND current.sourceFingerprint is not null
 *      AND current.sourceFingerprint === stored.sourceFingerprint
 *      AND current.pipelineVersion   === stored.pipelineVersion
 * ```
 *
 * Equality, never ordering – any opaque version representation works, a
 * rollback to identical logic correctly skips, and a partial run resumes
 * cleanly. A `null` source fingerprint never compares equal, so a dataset with
 * no establishable fingerprint is always reprocessed.
 *
 * @param current - The change fields observed for this run.
 * @param stored - The record kept from the previous run, or `null`/`undefined`
 *   when there is none.
 * @returns `true` when the dataset must be reprocessed, `false` when it can be
 *   skipped.
 */
export function shouldReprocess(current, stored) {
    // `== null` also matches `undefined`: a store written in plain JavaScript
    // (for instance one backed by `Map.get`) hands back `undefined` for an
    // unknown dataset, and reading a property of it below would throw.
    if (stored == null)
        return true;
    // An absent source fingerprint never compares equal, even to a stored
    // absent one – otherwise a dataset without any signal would freeze.
    if (current.sourceFingerprint == null)
        return true;
    return (current.sourceFingerprint !== stored.sourceFingerprint ||
        current.pipelineVersion !== stored.pipelineVersion);
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import type { Distribution } from '@lde/dataset';
|
|
2
|
+
import { type ProbeResultType } from '@lde/distribution-probe';
|
|
3
|
+
/**
|
|
4
|
+
* Derive a cheap source-change fingerprint for a distribution from metadata the
|
|
5
|
+
* probe already collected – no body download.
|
|
6
|
+
*
|
|
7
|
+
* For a data dump the fingerprint combines the most recent of the register’s
|
|
8
|
+
* declared `dct:modified` and the artifact’s HTTP `Last-Modified` with the
|
|
9
|
+
* artifact’s byte size (the probe’s `Content-Length`, falling back to the
|
|
10
|
+
* register’s declared `dcat:byteSize`). Taking the maximum date errs toward
|
|
11
|
+
* reprocessing rather than serving stale output, and mirrors the change signal
|
|
12
|
+
* {@link ImportResolver} computes for the downloader so the skip layer and the
|
|
13
|
+
* download/import layer agree.
|
|
14
|
+
*
|
|
15
|
+
* The returned string is opaque: it is only ever compared for equality, never
|
|
16
|
+
* parsed or ordered.
|
|
17
|
+
*
|
|
18
|
+
* Returns `null` when no fingerprint can be established – a live SPARQL
|
|
19
|
+
* endpoint (which exposes none), or a distribution whose probe yielded neither
|
|
20
|
+
* a usable date nor a byte size. A `null` fingerprint never compares equal, so
|
|
21
|
+
* those distributions are always reprocessed.
|
|
22
|
+
*
|
|
23
|
+
* Robust against malformed third-party metadata: an unparseable HTTP
|
|
24
|
+
* `Last-Modified` or `dct:modified` (an Invalid Date) and a non-numeric
|
|
25
|
+
* `Content-Length` (`NaN`) are both treated as absent rather than producing a
|
|
26
|
+
* throw or an unstable fingerprint.
|
|
27
|
+
*/
|
|
28
|
+
export declare function sourceFingerprint(distribution: Distribution, probeResult: ProbeResultType): string | null;
|
|
29
|
+
//# sourceMappingURL=sourceFingerprint.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sourceFingerprint.d.ts","sourceRoot":"","sources":["../../src/provenance/sourceFingerprint.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACjD,OAAO,EAEL,KAAK,eAAe,EACrB,MAAM,yBAAyB,CAAC;AAEjC;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAgB,iBAAiB,CAC/B,YAAY,EAAE,YAAY,EAC1B,WAAW,EAAE,eAAe,GAC3B,MAAM,GAAG,IAAI,CAwBf"}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { DataDumpProbeResult, } from '@lde/distribution-probe';
|
|
2
|
+
/**
|
|
3
|
+
* Derive a cheap source-change fingerprint for a distribution from metadata the
|
|
4
|
+
* probe already collected – no body download.
|
|
5
|
+
*
|
|
6
|
+
* For a data dump the fingerprint combines the most recent of the register’s
|
|
7
|
+
* declared `dct:modified` and the artifact’s HTTP `Last-Modified` with the
|
|
8
|
+
* artifact’s byte size (the probe’s `Content-Length`, falling back to the
|
|
9
|
+
* register’s declared `dcat:byteSize`). Taking the maximum date errs toward
|
|
10
|
+
* reprocessing rather than serving stale output, and mirrors the change signal
|
|
11
|
+
* {@link ImportResolver} computes for the downloader so the skip layer and the
|
|
12
|
+
* download/import layer agree.
|
|
13
|
+
*
|
|
14
|
+
* The returned string is opaque: it is only ever compared for equality, never
|
|
15
|
+
* parsed or ordered.
|
|
16
|
+
*
|
|
17
|
+
* Returns `null` when no fingerprint can be established – a live SPARQL
|
|
18
|
+
* endpoint (which exposes none), or a distribution whose probe yielded neither
|
|
19
|
+
* a usable date nor a byte size. A `null` fingerprint never compares equal, so
|
|
20
|
+
* those distributions are always reprocessed.
|
|
21
|
+
*
|
|
22
|
+
* Robust against malformed third-party metadata: an unparseable HTTP
|
|
23
|
+
* `Last-Modified` or `dct:modified` (an Invalid Date) and a non-numeric
|
|
24
|
+
* `Content-Length` (`NaN`) are both treated as absent rather than producing a
|
|
25
|
+
* throw or an unstable fingerprint.
|
|
26
|
+
*/
|
|
27
|
+
export function sourceFingerprint(distribution, probeResult) {
|
|
28
|
+
if (distribution.isSparql()) {
|
|
29
|
+
return null;
|
|
30
|
+
}
|
|
31
|
+
const modifiedDate = mostRecent(distribution.lastModified, probeResult instanceof DataDumpProbeResult
|
|
32
|
+
? (probeResult.lastModified ?? undefined)
|
|
33
|
+
: undefined);
|
|
34
|
+
const probeSize = probeResult instanceof DataDumpProbeResult ? probeResult.contentSize : null;
|
|
35
|
+
const byteSize = probeSize !== null && !Number.isNaN(probeSize)
|
|
36
|
+
? probeSize
|
|
37
|
+
: distribution.byteSize;
|
|
38
|
+
if (modifiedDate === undefined && byteSize === undefined) {
|
|
39
|
+
return null;
|
|
40
|
+
}
|
|
41
|
+
return `${modifiedDate?.toISOString() ?? ''}|${byteSize ?? ''}`;
|
|
42
|
+
}
|
|
43
|
+
/**
 * The most recent of the given dates, ignoring absent values (`undefined` or
 * `null`) and Invalid Dates.
 *
 * Filtering invalid dates keeps a malformed metadata value from being selected
 * (which would make `toISOString` throw) and from sticking ahead of a valid
 * date – `validDate > invalidDate` is `number > NaN`, i.e. always `false`.
 * Filtering `null` keeps a metadata source that reports “no date” as `null`
 * from throwing on `valueOf`.
 *
 * @param dates - Candidate dates; any of them may be absent or invalid.
 * @returns The latest valid date, or `undefined` when there is none.
 */
function mostRecent(...dates) {
    let latest;
    for (const date of dates) {
        if (date === undefined || date === null || Number.isNaN(date.valueOf()))
            continue;
        if (latest === undefined || date > latest)
            latest = date;
    }
    return latest;
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { ProcessingRecord } from './record.js';
|
|
2
|
+
/**
|
|
3
|
+
* The pipeline’s per-dataset processing memory.
|
|
4
|
+
*
|
|
5
|
+
* The framework owns the skip semantics (see `shouldReprocess`); a
|
|
6
|
+
* `ProvenanceStore` owns only the physical storage of {@link ProcessingRecord}s,
|
|
7
|
+
* keyed by dataset URI. Implementations are free to back this with a
|
|
8
|
+
* triplestore, files, or anything else.
|
|
9
|
+
*/
|
|
10
|
+
export interface ProvenanceStore {
|
|
11
|
+
/**
|
|
12
|
+
* The record from the dataset’s last processing, or `null` if it has never
|
|
13
|
+
* been processed (or the store was wiped). A `null` result drives a
|
|
14
|
+
* reprocess. If the returned promise rejects, the pipeline likewise treats the dataset as never processed.
|
|
15
|
+
*/
|
|
16
|
+
get(datasetUri: URL): Promise<ProcessingRecord | null>;
|
|
17
|
+
/** Persist the record for a dataset, replacing any previous one. The pipeline ignores a rejection, leaving the previous record (if any) in place for the next run. */
|
|
18
|
+
set(datasetUri: URL, record: ProcessingRecord): Promise<void>;
|
|
19
|
+
}
|
|
20
|
+
//# sourceMappingURL=store.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"store.d.ts","sourceRoot":"","sources":["../../src/provenance/store.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAEpD;;;;;;;GAOG;AACH,MAAM,WAAW,eAAe;IAC9B;;;;OAIG;IACH,GAAG,CAAC,UAAU,EAAE,GAAG,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC,CAAC;IACvD,oEAAoE;IACpE,GAAG,CAAC,UAAU,EAAE,GAAG,EAAE,MAAM,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;CAC/D"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|