@lde/pipeline 0.30.10 → 0.30.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/provenance/index.d.ts +4 -0
- package/dist/provenance/index.d.ts.map +1 -0
- package/dist/provenance/index.js +2 -0
- package/dist/provenance/record.d.ts +35 -0
- package/dist/provenance/record.d.ts.map +1 -0
- package/dist/provenance/record.js +1 -0
- package/dist/provenance/reprocessDecision.d.ts +21 -0
- package/dist/provenance/reprocessDecision.d.ts.map +1 -0
- package/dist/provenance/reprocessDecision.js +30 -0
- package/dist/provenance/sourceFingerprint.d.ts +29 -0
- package/dist/provenance/sourceFingerprint.d.ts.map +1 -0
- package/dist/provenance/sourceFingerprint.js +57 -0
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -8,6 +8,7 @@ export * from './stage.js';
|
|
|
8
8
|
export * from './stageOutputResolver.js';
|
|
9
9
|
export * from './sparql/index.js';
|
|
10
10
|
export * from './distribution/index.js';
|
|
11
|
+
export * from './provenance/index.js';
|
|
11
12
|
export * from './writer/index.js';
|
|
12
13
|
export * from './plugin/namespaceNormalization.js';
|
|
13
14
|
export * from './plugin/provenance.js';
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC;AAC/B,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,eAAe,CAAC;AAC9B,cAAc,uBAAuB,CAAC;AACtC,cAAc,eAAe,CAAC;AAC9B,cAAc,YAAY,CAAC;AAC3B,cAAc,0BAA0B,CAAC;AACzC,cAAc,mBAAmB,CAAC;AAClC,cAAc,yBAAyB,CAAC;AACxC,cAAc,mBAAmB,CAAC;AAClC,cAAc,oCAAoC,CAAC;AACnD,cAAc,wBAAwB,CAAC;AACvC,cAAc,oCAAoC,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC;AAC/B,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,eAAe,CAAC;AAC9B,cAAc,uBAAuB,CAAC;AACtC,cAAc,eAAe,CAAC;AAC9B,cAAc,YAAY,CAAC;AAC3B,cAAc,0BAA0B,CAAC;AACzC,cAAc,mBAAmB,CAAC;AAClC,cAAc,yBAAyB,CAAC;AACxC,cAAc,uBAAuB,CAAC;AACtC,cAAc,mBAAmB,CAAC;AAClC,cAAc,oCAAoC,CAAC;AACnD,cAAc,wBAAwB,CAAC;AACvC,cAAc,oCAAoC,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -8,6 +8,7 @@ export * from './stage.js';
|
|
|
8
8
|
export * from './stageOutputResolver.js';
|
|
9
9
|
export * from './sparql/index.js';
|
|
10
10
|
export * from './distribution/index.js';
|
|
11
|
+
export * from './provenance/index.js';
|
|
11
12
|
export * from './writer/index.js';
|
|
12
13
|
export * from './plugin/namespaceNormalization.js';
|
|
13
14
|
export * from './plugin/provenance.js';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/provenance/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAC3D,OAAO,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
 * What the pipeline remembers about one dataset between runs, so that the
 * next run can decide whether that dataset may be skipped.
 *
 * {@link sourceFingerprint} and {@link pipelineVersion} are the two change
 * fields. Both are opaque strings: they are compared for equality only and
 * are never parsed or ordered.
 */
export interface ProcessingRecord {
  /**
   * Source-change fingerprint observed when the dataset was processed (see
   * `sourceFingerprint`). It is derived automatically from observed source
   * metadata rather than from a declared version.
   *
   * `null` means no fingerprint could be established (e.g. a live SPARQL
   * endpoint). A `null` fingerprint never compares equal, so such a dataset
   * is always reprocessed.
   */
  sourceFingerprint: string | null;

  /**
   * Pipeline version, as declared by the consumer, under which the dataset
   * was processed. It is deliberately kept apart from
   * {@link sourceFingerprint} instead of being folded into one combined
   * fingerprint: the data side is observed, whereas the logic side is
   * intentionally declared.
   */
  pipelineVersion: string;

  /** ISO timestamp of the moment the record was written. */
  generatedAt: string;

  /**
   * Outcome of the processing run. Storing it means a dataset that failed
   * while its source stayed unchanged is skipped on later runs instead of
   * being re-imported on every run; it is retried at the next source change
   * or pipeline version rotation.
   */
  status: 'success' | 'failed';
}
|
|
33
|
+
/** The pair of record fields that the skip rule compares for equality. */
export type ChangeKey = Pick<
  ProcessingRecord,
  'sourceFingerprint' | 'pipelineVersion'
>;
|
|
35
|
+
//# sourceMappingURL=record.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"record.d.ts","sourceRoot":"","sources":["../../src/provenance/record.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;;OAMG;IACH,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC;;;;;OAKG;IACH,eAAe,EAAE,MAAM,CAAC;IACxB,oDAAoD;IACpD,WAAW,EAAE,MAAM,CAAC;IACpB;;;;OAIG;IACH,MAAM,EAAE,SAAS,GAAG,QAAQ,CAAC;CAC9B;AAED,0DAA0D;AAC1D,MAAM,MAAM,SAAS,GAAG,IAAI,CAC1B,gBAAgB,EAChB,mBAAmB,GAAG,iBAAiB,CACxC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { ChangeKey, ProcessingRecord } from './record.js';
|
|
2
|
+
/**
 * Tell whether a dataset has to be processed again.
 *
 * The decision compares the dataset's current change fields with the record
 * kept from the previous run, using nothing but equality:
 *
 * ```
 * skip  iff  stored !== null
 *       AND  current.sourceFingerprint === stored.sourceFingerprint
 *       AND  current.pipelineVersion   === stored.pipelineVersion
 * ```
 *
 * Because the rule is equality and never ordering, any opaque version
 * representation works, rolling back to identical logic correctly skips, and
 * a partial run resumes cleanly. A `null` source fingerprint never compares
 * equal, so a dataset for which no fingerprint can be established is always
 * reprocessed.
 *
 * @param current - The dataset's current change fields.
 * @param stored - The record from the last run, or `null` if the dataset has
 *   never been processed.
 * @returns `true` when the dataset must be reprocessed, `false` when it can
 *   be skipped.
 */
export declare function shouldReprocess(
  current: ChangeKey,
  stored: ProcessingRecord | null,
): boolean;
|
|
21
|
+
//# sourceMappingURL=reprocessDecision.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"reprocessDecision.d.ts","sourceRoot":"","sources":["../../src/provenance/reprocessDecision.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAE/D;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,eAAe,CAC7B,OAAO,EAAE,SAAS,EAClB,MAAM,EAAE,gBAAgB,GAAG,IAAI,GAC9B,OAAO,CAOT"}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
 * Decide whether a dataset must be reprocessed, given its current change
 * fields and the record from the last run.
 *
 * The rule is pure equality on the two change fields:
 *
 * ```
 * skip iff stored is present
 *   AND current.sourceFingerprint is present
 *   AND current.sourceFingerprint === stored.sourceFingerprint
 *   AND current.pipelineVersion === stored.pipelineVersion
 * ```
 *
 * Equality, never ordering – any opaque version representation works, a
 * rollback to identical logic correctly skips, and a partial run resumes
 * cleanly. The stored record's `status` is not consulted, so a failed run with
 * unchanged change fields is skipped as well.
 *
 * "Present" means neither `null` nor `undefined`. The declared contract uses
 * `null`, but `undefined` is accepted too so that a missed lookup or a record
 * that lacks the field leads to reprocessing instead of a thrown `TypeError`
 * or a silent skip.
 *
 * @param current - The dataset's current `sourceFingerprint` and
 *   `pipelineVersion`.
 * @param stored - The record from the last run, or `null`/`undefined` if the
 *   dataset has never been processed.
 * @returns `true` when the dataset must be reprocessed, `false` when it can be
 *   skipped.
 */
export function shouldReprocess(current, stored) {
    // Never processed before (or the lookup produced nothing): always process.
    if (stored == null)
        return true;
    // An absent source fingerprint never compares equal, even to a stored
    // absent one, so the dataset cannot be skipped.
    if (current.sourceFingerprint == null)
        return true;
    return (current.sourceFingerprint !== stored.sourceFingerprint ||
        current.pipelineVersion !== stored.pipelineVersion);
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import type { Distribution } from '@lde/dataset';
|
|
2
|
+
import { type ProbeResultType } from '@lde/distribution-probe';
|
|
3
|
+
/**
 * Compute a cheap source-change fingerprint for a distribution, using only
 * metadata the probe has already gathered – the body is never downloaded.
 *
 * For a data dump, the fingerprint pairs a date with a size:
 *
 * - the date is the most recent of the register’s declared `dct:modified` and
 *   the artifact’s HTTP `Last-Modified`;
 * - the size is the artifact’s byte size: the probe’s `Content-Length`, or,
 *   failing that, the register’s declared `dcat:byteSize`.
 *
 * Picking the maximum date errs toward reprocessing rather than serving stale
 * output. It also mirrors the change signal that {@link ImportResolver}
 * computes for the downloader, so the skip layer and the download/import layer
 * agree.
 *
 * The result is an opaque string: it is only ever compared for equality and is
 * never parsed or ordered.
 *
 * Malformed third-party metadata is tolerated: an unparseable HTTP
 * `Last-Modified` or `dct:modified` (an Invalid Date) and a non-numeric
 * `Content-Length` (`NaN`) are treated as absent instead of causing a throw or
 * an unstable fingerprint.
 *
 * @param distribution - The distribution to fingerprint.
 * @param probeResult - The probe result already obtained for that
 *   distribution.
 * @returns The fingerprint, or `null` when none can be established – for a
 *   live SPARQL endpoint (which exposes none), or when the probe yielded
 *   neither a usable date nor a byte size. A `null` fingerprint never compares
 *   equal, so those distributions are always reprocessed.
 */
export declare function sourceFingerprint(
  distribution: Distribution,
  probeResult: ProbeResultType,
): string | null;
|
|
29
|
+
//# sourceMappingURL=sourceFingerprint.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sourceFingerprint.d.ts","sourceRoot":"","sources":["../../src/provenance/sourceFingerprint.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACjD,OAAO,EAEL,KAAK,eAAe,EACrB,MAAM,yBAAyB,CAAC;AAEjC;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAgB,iBAAiB,CAC/B,YAAY,EAAE,YAAY,EAC1B,WAAW,EAAE,eAAe,GAC3B,MAAM,GAAG,IAAI,CAwBf"}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { DataDumpProbeResult, } from '@lde/distribution-probe';
|
|
2
|
+
/**
 * Derive a cheap source-change fingerprint for a distribution from metadata the
 * probe already collected – no body download.
 *
 * For a data dump the fingerprint combines the most recent of the register’s
 * declared `dct:modified` and the artifact’s HTTP `Last-Modified` with the
 * artifact’s byte size (the probe’s `Content-Length`, falling back to the
 * register’s declared `dcat:byteSize`). Taking the maximum date errs toward
 * reprocessing rather than serving stale output, and mirrors the change signal
 * {@link ImportResolver} computes for the downloader so the skip layer and the
 * download/import layer agree.
 *
 * The returned string has the form `<ISO date or empty>|<byte size or empty>`,
 * but callers must treat it as opaque: it is only ever compared for equality,
 * never parsed or ordered.
 *
 * Robust against malformed or missing third-party metadata: an unparseable
 * HTTP `Last-Modified` or `dct:modified` (an Invalid Date), a non-numeric
 * `Content-Length` (`NaN`), and any `null`/`undefined` date or size are all
 * treated as absent rather than producing a throw or a degenerate fingerprint.
 *
 * @param distribution - The distribution to fingerprint; its `isSparql()`,
 *   `lastModified` and `byteSize` are read.
 * @param probeResult - The probe result for the distribution; its
 *   `lastModified` and `contentSize` are read only when it is a
 *   `DataDumpProbeResult`.
 * @returns The fingerprint, or `null` when none can be established – a live
 *   SPARQL endpoint, or a distribution with neither a usable date nor a byte
 *   size. A `null` fingerprint never compares equal, so those distributions
 *   are always reprocessed.
 */
export function sourceFingerprint(distribution, probeResult) {
    if (distribution.isSparql()) {
        return null;
    }
    // Only a data-dump probe result contributes artifact-level metadata.
    const dumpProbe = probeResult instanceof DataDumpProbeResult ? probeResult : undefined;
    // Normalise absent dates to `undefined` on both sides before comparing.
    const modifiedDate = mostRecent(distribution.lastModified ?? undefined, dumpProbe?.lastModified ?? undefined);
    const probeSize = dumpProbe?.contentSize;
    // Normalise a `null` declared size to `undefined` so the "nothing known"
    // check below cannot be bypassed into the constant fingerprint "|".
    const declaredSize = distribution.byteSize ?? undefined;
    // `!= null` rejects both `null` and `undefined`, so a missing or NaN
    // Content-Length always falls back to the declared size.
    const byteSize = probeSize != null && !Number.isNaN(probeSize) ? probeSize : declaredSize;
    if (modifiedDate === undefined && byteSize === undefined) {
        return null;
    }
    return `${modifiedDate?.toISOString() ?? ''}|${byteSize ?? ''}`;
}
|
|
43
|
+
/**
 * The most recent of the given dates, ignoring `undefined`, `null` and Invalid
 * Dates.
 *
 * Filtering invalid dates keeps a malformed metadata value from being selected
 * (which would make `toISOString` throw) and from sticking ahead of a valid
 * date – `validDate > invalidDate` is `number > NaN`, i.e. always `false`.
 * Absent values are skipped before `valueOf()` is called, so a `null` date
 * cannot cause a `TypeError`.
 *
 * @param dates - Candidate dates; any of them may be absent or invalid.
 * @returns The latest valid date, or `undefined` when there is none.
 */
function mostRecent(...dates) {
    let latest;
    for (const date of dates) {
        // `== null` matches both `null` and `undefined`.
        if (date == null || Number.isNaN(date.valueOf()))
            continue;
        if (latest === undefined || date > latest)
            latest = date;
    }
    return latest;
}
|