@lde/pipeline 0.30.10 → 0.30.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -8,6 +8,7 @@ export * from './stage.js';
8
8
  export * from './stageOutputResolver.js';
9
9
  export * from './sparql/index.js';
10
10
  export * from './distribution/index.js';
11
+ export * from './provenance/index.js';
11
12
  export * from './writer/index.js';
12
13
  export * from './plugin/namespaceNormalization.js';
13
14
  export * from './plugin/provenance.js';
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC;AAC/B,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,eAAe,CAAC;AAC9B,cAAc,uBAAuB,CAAC;AACtC,cAAc,eAAe,CAAC;AAC9B,cAAc,YAAY,CAAC;AAC3B,cAAc,0BAA0B,CAAC;AACzC,cAAc,mBAAmB,CAAC;AAClC,cAAc,yBAAyB,CAAC;AACxC,cAAc,mBAAmB,CAAC;AAClC,cAAc,oCAAoC,CAAC;AACnD,cAAc,wBAAwB,CAAC;AACvC,cAAc,oCAAoC,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC;AAC/B,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,eAAe,CAAC;AAC9B,cAAc,uBAAuB,CAAC;AACtC,cAAc,eAAe,CAAC;AAC9B,cAAc,YAAY,CAAC;AAC3B,cAAc,0BAA0B,CAAC;AACzC,cAAc,mBAAmB,CAAC;AAClC,cAAc,yBAAyB,CAAC;AACxC,cAAc,uBAAuB,CAAC;AACtC,cAAc,mBAAmB,CAAC;AAClC,cAAc,oCAAoC,CAAC;AACnD,cAAc,wBAAwB,CAAC;AACvC,cAAc,oCAAoC,CAAC"}
package/dist/index.js CHANGED
@@ -8,6 +8,7 @@ export * from './stage.js';
8
8
  export * from './stageOutputResolver.js';
9
9
  export * from './sparql/index.js';
10
10
  export * from './distribution/index.js';
11
+ export * from './provenance/index.js';
11
12
  export * from './writer/index.js';
12
13
  export * from './plugin/namespaceNormalization.js';
13
14
  export * from './plugin/provenance.js';
@@ -0,0 +1,4 @@
1
+ export { sourceFingerprint } from './sourceFingerprint.js';
2
+ export { shouldReprocess } from './reprocessDecision.js';
3
+ export type { ProcessingRecord, ChangeKey } from './record.js';
4
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/provenance/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAC3D,OAAO,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC"}
@@ -0,0 +1,2 @@
1
+ export { sourceFingerprint } from './sourceFingerprint.js';
2
+ export { shouldReprocess } from './reprocessDecision.js';
@@ -0,0 +1,35 @@
1
+ /**
2
+ * The per-dataset processing memory the pipeline keeps to decide whether a
3
+ * dataset can be skipped on the next run.
4
+ *
5
+ * Both change fields ({@link sourceFingerprint} and {@link pipelineVersion})
6
+ * are opaque strings, compared only for equality – never parsed or ordered.
7
+ */
8
+ export interface ProcessingRecord {
9
+ /**
10
+ * The source-change fingerprint at the time of processing (see
11
+ * `sourceFingerprint`), or `null` when none could be established (e.g. a live
12
+ * SPARQL endpoint). Derived automatically from observed source metadata, not
13
+ * a declared version. A `null` fingerprint never compares equal, so the
14
+ * dataset is always reprocessed.
15
+ */
16
+ sourceFingerprint: string | null;
17
+ /**
18
+ * The consumer-declared pipeline version under which the dataset was
19
+ * processed. Kept separate from {@link sourceFingerprint}, never combined
20
+ * into a single fingerprint: the data side is observed, the logic side is
21
+ * intentionally declared.
22
+ */
23
+ pipelineVersion: string;
24
+ /** ISO timestamp of when the record was written. */
25
+ generatedAt: string;
26
+ /**
27
+ * Whether processing succeeded. Recorded so a dataset that failed but whose
28
+ * source is unchanged is skipped on subsequent runs rather than re-imported
29
+ * every run; it is retried at the next source change or version rotation.
30
+ */
31
+ status: 'success' | 'failed';
32
+ }
33
+ /** The two fields the skip rule compares for equality. */
34
+ export type ChangeKey = Pick<ProcessingRecord, 'sourceFingerprint' | 'pipelineVersion'>;
35
+ //# sourceMappingURL=record.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"record.d.ts","sourceRoot":"","sources":["../../src/provenance/record.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;;OAMG;IACH,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC;;;;;OAKG;IACH,eAAe,EAAE,MAAM,CAAC;IACxB,oDAAoD;IACpD,WAAW,EAAE,MAAM,CAAC;IACpB;;;;OAIG;IACH,MAAM,EAAE,SAAS,GAAG,QAAQ,CAAC;CAC9B;AAED,0DAA0D;AAC1D,MAAM,MAAM,SAAS,GAAG,IAAI,CAC1B,gBAAgB,EAChB,mBAAmB,GAAG,iBAAiB,CACxC,CAAC"}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,21 @@
1
+ import type { ChangeKey, ProcessingRecord } from './record.js';
2
+ /**
3
+ * Decide whether a dataset must be reprocessed, given its current change
4
+ * fields and the record from the last run (or `null` if it has never been
5
+ * processed).
6
+ *
7
+ * The rule is pure equality on the two change fields:
8
+ *
9
+ * ```
10
+ * skip iff stored !== null
11
+ * AND current.sourceFingerprint === stored.sourceFingerprint
12
+ * AND current.pipelineVersion === stored.pipelineVersion
13
+ * ```
14
+ *
15
+ * Equality, never ordering – any opaque version representation works, a
16
+ * rollback to identical logic correctly skips, and a partial run resumes
17
+ * cleanly. A `null` source fingerprint never compares equal, so a dataset with
18
+ * no establishable fingerprint is always reprocessed.
19
+ */
20
+ export declare function shouldReprocess(current: ChangeKey, stored: ProcessingRecord | null): boolean;
21
+ //# sourceMappingURL=reprocessDecision.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"reprocessDecision.d.ts","sourceRoot":"","sources":["../../src/provenance/reprocessDecision.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAE/D;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,eAAe,CAC7B,OAAO,EAAE,SAAS,EAClB,MAAM,EAAE,gBAAgB,GAAG,IAAI,GAC9B,OAAO,CAOT"}
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Decide whether a dataset must be reprocessed, given its current change
3
+ * fields and the record from the last run (or `null` if it has never been
4
+ * processed).
5
+ *
6
+ * The rule is pure equality on the two change fields:
7
+ *
8
+ * ```
9
+ * skip iff stored !== null
10
+ * AND current.sourceFingerprint === stored.sourceFingerprint
11
+ * AND current.pipelineVersion === stored.pipelineVersion
12
+ * ```
13
+ *
14
+ * Equality, never ordering – any opaque version representation works, a
15
+ * rollback to identical logic correctly skips, and a partial run resumes
16
+ * cleanly. A `null` source fingerprint never compares equal, so a dataset with
17
+ * no establishable fingerprint is always reprocessed.
18
+ */
19
+ export function shouldReprocess(current, stored) {
20
+ if (stored === null)
21
+ return true;
22
+ // A null source fingerprint never compares equal, even to a stored null.
23
+ if (current.sourceFingerprint === null)
24
+ return true;
25
+ if (current.sourceFingerprint !== stored.sourceFingerprint)
26
+ return true;
27
+ if (current.pipelineVersion !== stored.pipelineVersion)
28
+ return true;
29
+ return false;
30
+ }
@@ -0,0 +1,29 @@
1
+ import type { Distribution } from '@lde/dataset';
2
+ import { type ProbeResultType } from '@lde/distribution-probe';
3
+ /**
4
+ * Derive a cheap source-change fingerprint for a distribution from metadata the
5
+ * probe already collected – no body download.
6
+ *
7
+ * For a data dump the fingerprint combines the most recent of the register’s
8
+ * declared `dct:modified` and the artifact’s HTTP `Last-Modified` with the
9
+ * artifact’s byte size (the probe’s `Content-Length`, falling back to the
10
+ * register’s declared `dcat:byteSize`). Taking the maximum date errs toward
11
+ * reprocessing rather than serving stale output, and mirrors the change signal
12
+ * {@link ImportResolver} computes for the downloader so the skip layer and the
13
+ * download/import layer agree.
14
+ *
15
+ * The returned string is opaque: it is only ever compared for equality, never
16
+ * parsed or ordered.
17
+ *
18
+ * Returns `null` when no fingerprint can be established – a live SPARQL
19
+ * endpoint (which exposes none), or a distribution with no usable date and no
20
+ * byte size from either the probe or the register. A `null` fingerprint never compares equal, so
21
+ * those distributions are always reprocessed.
22
+ *
23
+ * Robust against malformed third-party metadata: an unparseable HTTP
24
+ * `Last-Modified` or `dct:modified` (an Invalid Date) and a non-numeric
25
+ * `Content-Length` (`NaN`) are both treated as absent rather than producing a
26
+ * throw or an unstable fingerprint.
27
+ */
28
+ export declare function sourceFingerprint(distribution: Distribution, probeResult: ProbeResultType): string | null;
29
+ //# sourceMappingURL=sourceFingerprint.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"sourceFingerprint.d.ts","sourceRoot":"","sources":["../../src/provenance/sourceFingerprint.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACjD,OAAO,EAEL,KAAK,eAAe,EACrB,MAAM,yBAAyB,CAAC;AAEjC;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAgB,iBAAiB,CAC/B,YAAY,EAAE,YAAY,EAC1B,WAAW,EAAE,eAAe,GAC3B,MAAM,GAAG,IAAI,CAwBf"}
@@ -0,0 +1,57 @@
1
+ import { DataDumpProbeResult, } from '@lde/distribution-probe';
2
+ /**
3
+ * Derive a cheap source-change fingerprint for a distribution from metadata the
4
+ * probe already collected – no body download.
5
+ *
6
+ * For a data dump the fingerprint combines the most recent of the register’s
7
+ * declared `dct:modified` and the artifact’s HTTP `Last-Modified` with the
8
+ * artifact’s byte size (the probe’s `Content-Length`, falling back to the
9
+ * register’s declared `dcat:byteSize`). Taking the maximum date errs toward
10
+ * reprocessing rather than serving stale output, and mirrors the change signal
11
+ * {@link ImportResolver} computes for the downloader so the skip layer and the
12
+ * download/import layer agree.
13
+ *
14
+ * The returned string is opaque: it is only ever compared for equality, never
15
+ * parsed or ordered.
16
+ *
17
+ * Returns `null` when no fingerprint can be established – a live SPARQL
18
+ * endpoint (which exposes none), or a distribution with no usable date and no
19
+ * byte size from either the probe or the register. A `null` fingerprint never compares equal, so
20
+ * those distributions are always reprocessed.
21
+ *
22
+ * Robust against malformed third-party metadata: an unparseable HTTP
23
+ * `Last-Modified` or `dct:modified` (an Invalid Date) and a non-numeric
24
+ * `Content-Length` (`NaN`) are both treated as absent rather than producing a
25
+ * throw or an unstable fingerprint.
26
+ */
27
+ export function sourceFingerprint(distribution, probeResult) {
28
+ if (distribution.isSparql()) {
29
+ return null;
30
+ }
31
+ const modifiedDate = mostRecent(distribution.lastModified, probeResult instanceof DataDumpProbeResult
32
+ ? (probeResult.lastModified ?? undefined)
33
+ : undefined);
34
+ const probeSize = probeResult instanceof DataDumpProbeResult ? probeResult.contentSize : null;
35
+ const byteSize = probeSize !== null && !Number.isNaN(probeSize)
36
+ ? probeSize
37
+ : distribution.byteSize;
38
+ if (modifiedDate === undefined && byteSize === undefined) {
39
+ return null;
40
+ }
41
+ return `${modifiedDate?.toISOString() ?? ''}|${byteSize ?? ''}`;
42
+ }
43
+ /**
44
+ * The most recent of the given dates, ignoring `undefined` and Invalid Dates.
45
+ * Filtering invalid dates keeps a malformed metadata value from being selected
46
+ * (which would make `toISOString` throw) and from sticking ahead of a valid
47
+ * date – `validDate > invalidDate` is `number > NaN`, i.e. always `false`.
48
+ */
49
+ function mostRecent(...dates) {
50
+ return dates.reduce((latest, date) => {
51
+ if (date === undefined || Number.isNaN(date.valueOf()))
52
+ return latest;
53
+ if (latest === undefined || date > latest)
54
+ return date;
55
+ return latest;
56
+ }, undefined);
57
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lde/pipeline",
3
- "version": "0.30.10",
3
+ "version": "0.30.11",
4
4
  "repository": {
5
5
  "url": "git+https://github.com/ldelements/lde.git",
6
6
  "directory": "packages/pipeline"