@lde/pipeline 0.30.9 → 0.30.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/provenance/index.d.ts +4 -0
- package/dist/provenance/index.d.ts.map +1 -0
- package/dist/provenance/index.js +2 -0
- package/dist/provenance/record.d.ts +35 -0
- package/dist/provenance/record.d.ts.map +1 -0
- package/dist/provenance/record.js +1 -0
- package/dist/provenance/reprocessDecision.d.ts +21 -0
- package/dist/provenance/reprocessDecision.d.ts.map +1 -0
- package/dist/provenance/reprocessDecision.js +30 -0
- package/dist/provenance/sourceFingerprint.d.ts +29 -0
- package/dist/provenance/sourceFingerprint.d.ts.map +1 -0
- package/dist/provenance/sourceFingerprint.js +57 -0
- package/dist/writer/fileWriter.d.ts +11 -0
- package/dist/writer/fileWriter.d.ts.map +1 -1
- package/dist/writer/fileWriter.js +43 -17
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -8,6 +8,7 @@ export * from './stage.js';
|
|
|
8
8
|
export * from './stageOutputResolver.js';
|
|
9
9
|
export * from './sparql/index.js';
|
|
10
10
|
export * from './distribution/index.js';
|
|
11
|
+
export * from './provenance/index.js';
|
|
11
12
|
export * from './writer/index.js';
|
|
12
13
|
export * from './plugin/namespaceNormalization.js';
|
|
13
14
|
export * from './plugin/provenance.js';
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC;AAC/B,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,eAAe,CAAC;AAC9B,cAAc,uBAAuB,CAAC;AACtC,cAAc,eAAe,CAAC;AAC9B,cAAc,YAAY,CAAC;AAC3B,cAAc,0BAA0B,CAAC;AACzC,cAAc,mBAAmB,CAAC;AAClC,cAAc,yBAAyB,CAAC;AACxC,cAAc,mBAAmB,CAAC;AAClC,cAAc,oCAAoC,CAAC;AACnD,cAAc,wBAAwB,CAAC;AACvC,cAAc,oCAAoC,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC;AAC/B,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,eAAe,CAAC;AAC9B,cAAc,uBAAuB,CAAC;AACtC,cAAc,eAAe,CAAC;AAC9B,cAAc,YAAY,CAAC;AAC3B,cAAc,0BAA0B,CAAC;AACzC,cAAc,mBAAmB,CAAC;AAClC,cAAc,yBAAyB,CAAC;AACxC,cAAc,uBAAuB,CAAC;AACtC,cAAc,mBAAmB,CAAC;AAClC,cAAc,oCAAoC,CAAC;AACnD,cAAc,wBAAwB,CAAC;AACvC,cAAc,oCAAoC,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -8,6 +8,7 @@ export * from './stage.js';
|
|
|
8
8
|
export * from './stageOutputResolver.js';
|
|
9
9
|
export * from './sparql/index.js';
|
|
10
10
|
export * from './distribution/index.js';
|
|
11
|
+
export * from './provenance/index.js';
|
|
11
12
|
export * from './writer/index.js';
|
|
12
13
|
export * from './plugin/namespaceNormalization.js';
|
|
13
14
|
export * from './plugin/provenance.js';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/provenance/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAC3D,OAAO,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* The per-dataset processing memory the pipeline keeps to decide whether a
|
|
3
|
+
* dataset can be skipped on the next run.
|
|
4
|
+
*
|
|
5
|
+
* Both change fields ({@link sourceFingerprint} and {@link pipelineVersion})
|
|
6
|
+
* are opaque strings, compared only for equality – never parsed or ordered.
|
|
7
|
+
*/
|
|
8
|
+
export interface ProcessingRecord {
|
|
9
|
+
/**
|
|
10
|
+
* The source-change fingerprint at the time of processing (see
|
|
11
|
+
* `sourceFingerprint`), or `null` when none could be established (e.g. a live
|
|
12
|
+
* SPARQL endpoint). Derived automatically from observed source metadata, not
|
|
13
|
+
* a declared version. A `null` fingerprint never compares equal, so the
|
|
14
|
+
* dataset is always reprocessed.
|
|
15
|
+
*/
|
|
16
|
+
sourceFingerprint: string | null;
|
|
17
|
+
/**
|
|
18
|
+
* The consumer-declared pipeline version under which the dataset was
|
|
19
|
+
* processed. Kept separate from {@link sourceFingerprint}, never combined
|
|
20
|
+
* into a single fingerprint: the data side is observed, the logic side is
|
|
21
|
+
* intentionally declared.
|
|
22
|
+
*/
|
|
23
|
+
pipelineVersion: string;
|
|
24
|
+
/** ISO timestamp of when the record was written. */
|
|
25
|
+
generatedAt: string;
|
|
26
|
+
/**
|
|
27
|
+
* Whether processing succeeded. Recorded so a dataset that failed but whose
|
|
28
|
+
* source is unchanged is skipped on subsequent runs rather than re-imported
|
|
29
|
+
* every run; it is retried at the next source change or version rotation.
|
|
30
|
+
*/
|
|
31
|
+
status: 'success' | 'failed';
|
|
32
|
+
}
|
|
33
|
+
/** The two fields the skip rule compares for equality. */
|
|
34
|
+
export type ChangeKey = Pick<ProcessingRecord, 'sourceFingerprint' | 'pipelineVersion'>;
|
|
35
|
+
//# sourceMappingURL=record.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"record.d.ts","sourceRoot":"","sources":["../../src/provenance/record.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;;OAMG;IACH,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC;;;;;OAKG;IACH,eAAe,EAAE,MAAM,CAAC;IACxB,oDAAoD;IACpD,WAAW,EAAE,MAAM,CAAC;IACpB;;;;OAIG;IACH,MAAM,EAAE,SAAS,GAAG,QAAQ,CAAC;CAC9B;AAED,0DAA0D;AAC1D,MAAM,MAAM,SAAS,GAAG,IAAI,CAC1B,gBAAgB,EAChB,mBAAmB,GAAG,iBAAiB,CACxC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { ChangeKey, ProcessingRecord } from './record.js';
|
|
2
|
+
/**
|
|
3
|
+
* Decide whether a dataset must be reprocessed, given its current change
|
|
4
|
+
* fields and the record from the last run (or `null` if it has never been
|
|
5
|
+
* processed).
|
|
6
|
+
*
|
|
7
|
+
* The rule is pure equality on the two change fields:
|
|
8
|
+
*
|
|
9
|
+
* ```
|
|
10
|
+
* skip iff stored !== null
|
|
11
|
+
* AND current.sourceFingerprint === stored.sourceFingerprint
|
|
12
|
+
* AND current.pipelineVersion === stored.pipelineVersion
|
|
13
|
+
* ```
|
|
14
|
+
*
|
|
15
|
+
* Equality, never ordering – any opaque version representation works, a
|
|
16
|
+
* rollback to identical logic correctly skips, and a partial run resumes
|
|
17
|
+
* cleanly. A `null` source fingerprint never compares equal, so a dataset with
|
|
18
|
+
* no establishable fingerprint is always reprocessed.
|
|
19
|
+
*/
|
|
20
|
+
export declare function shouldReprocess(current: ChangeKey, stored: ProcessingRecord | null): boolean;
|
|
21
|
+
//# sourceMappingURL=reprocessDecision.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"reprocessDecision.d.ts","sourceRoot":"","sources":["../../src/provenance/reprocessDecision.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAE/D;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,eAAe,CAC7B,OAAO,EAAE,SAAS,EAClB,MAAM,EAAE,gBAAgB,GAAG,IAAI,GAC9B,OAAO,CAOT"}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
 * Decide whether a dataset must be reprocessed, given its current change
 * fields and the record from the last run (or `null` if it has never been
 * processed).
 *
 * The rule is pure equality on the two change fields:
 *
 * ```
 * skip  iff  stored is present
 *        AND current.sourceFingerprint is present
 *        AND current.sourceFingerprint === stored.sourceFingerprint
 *        AND current.pipelineVersion === stored.pipelineVersion
 * ```
 *
 * Equality, never ordering – any opaque version representation works, a
 * rollback to identical logic correctly skips, and a partial run resumes
 * cleanly. An absent source fingerprint never compares equal, so a dataset with
 * no establishable fingerprint is always reprocessed.
 *
 * "Absent" covers both `null` and `undefined`: the record usually comes from a
 * lookup or from deserialised storage, where a miss or a missing field shows up
 * as `undefined` rather than `null`.
 *
 * @param current The change fields observed for the dataset in this run.
 * @param stored The record written by the previous run, or `null`/`undefined`
 *   when the dataset has never been processed.
 * @returns `true` when the dataset must be reprocessed, `false` when it can be
 *   skipped.
 */
export function shouldReprocess(current, stored) {
    // No previous record (null, or undefined from a lookup miss): never processed.
    if (stored == null)
        return true;
    // An absent source fingerprint never compares equal, even to a stored
    // absent one; otherwise `undefined === undefined` would silently skip.
    if (current.sourceFingerprint == null)
        return true;
    // Reprocess as soon as either change field differs from the stored record.
    return (current.sourceFingerprint !== stored.sourceFingerprint ||
        current.pipelineVersion !== stored.pipelineVersion);
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import type { Distribution } from '@lde/dataset';
|
|
2
|
+
import { type ProbeResultType } from '@lde/distribution-probe';
|
|
3
|
+
/**
|
|
4
|
+
* Derive a cheap source-change fingerprint for a distribution from metadata the
|
|
5
|
+
* probe already collected – no body download.
|
|
6
|
+
*
|
|
7
|
+
* For a data dump the fingerprint combines the most recent of the register’s
|
|
8
|
+
* declared `dct:modified` and the artifact’s HTTP `Last-Modified` with the
|
|
9
|
+
* artifact’s byte size (the probe’s `Content-Length`, falling back to the
|
|
10
|
+
* register’s declared `dcat:byteSize`). Taking the maximum date errs toward
|
|
11
|
+
* reprocessing rather than serving stale output, and mirrors the change signal
|
|
12
|
+
* {@link ImportResolver} computes for the downloader so the skip layer and the
|
|
13
|
+
* download/import layer agree.
|
|
14
|
+
*
|
|
15
|
+
* The returned string is opaque: it is only ever compared for equality, never
|
|
16
|
+
* parsed or ordered.
|
|
17
|
+
*
|
|
18
|
+
* Returns `null` when no fingerprint can be established – a live SPARQL
|
|
19
|
+
* endpoint (which exposes none), or a distribution whose probe yielded neither
|
|
20
|
+
* a usable date nor a byte size. A `null` fingerprint never compares equal, so
|
|
21
|
+
* those distributions are always reprocessed.
|
|
22
|
+
*
|
|
23
|
+
* Robust against malformed third-party metadata: an unparseable HTTP
|
|
24
|
+
* `Last-Modified` or `dct:modified` (an Invalid Date) and a non-numeric
|
|
25
|
+
* `Content-Length` (`NaN`) are both treated as absent rather than producing a
|
|
26
|
+
* throw or an unstable fingerprint.
|
|
27
|
+
*/
|
|
28
|
+
export declare function sourceFingerprint(distribution: Distribution, probeResult: ProbeResultType): string | null;
|
|
29
|
+
//# sourceMappingURL=sourceFingerprint.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sourceFingerprint.d.ts","sourceRoot":"","sources":["../../src/provenance/sourceFingerprint.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACjD,OAAO,EAEL,KAAK,eAAe,EACrB,MAAM,yBAAyB,CAAC;AAEjC;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAgB,iBAAiB,CAC/B,YAAY,EAAE,YAAY,EAC1B,WAAW,EAAE,eAAe,GAC3B,MAAM,GAAG,IAAI,CAwBf"}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { DataDumpProbeResult, } from '@lde/distribution-probe';
|
|
2
|
+
/**
 * Derive a cheap source-change fingerprint for a distribution from metadata the
 * probe already collected – no body download.
 *
 * For a data dump the fingerprint combines the most recent of the register’s
 * declared `dct:modified` and the artifact’s HTTP `Last-Modified` with the
 * artifact’s byte size (the probe’s `Content-Length`, falling back to the
 * register’s declared `dcat:byteSize`). Taking the maximum date errs toward
 * reprocessing rather than serving stale output, and mirrors the change signal
 * {@link ImportResolver} computes for the downloader so the skip layer and the
 * download/import layer agree.
 *
 * The returned string is opaque: it is only ever compared for equality, never
 * parsed or ordered. Its layout is `<ISO date or empty>|<byte size or empty>`.
 *
 * Returns `null` when no fingerprint can be established – a live SPARQL
 * endpoint (which exposes none), or a distribution whose metadata yielded
 * neither a usable date nor a usable byte size. A `null` fingerprint never
 * compares equal, so those distributions are always reprocessed.
 *
 * Robust against malformed third-party metadata: a missing or unparseable
 * HTTP `Last-Modified` or `dct:modified` (an Invalid Date) and a missing or
 * non-numeric size (`NaN`) – from the probe *or* from the register – are all
 * treated as absent rather than producing a throw or a constant fingerprint
 * that would make the dataset look permanently unchanged.
 *
 * @param distribution The distribution as declared by the register.
 * @param probeResult The outcome of probing the distribution; only a
 *   `DataDumpProbeResult` contributes `lastModified` and `contentSize`.
 * @returns The opaque fingerprint, or `null` when none can be established.
 */
export function sourceFingerprint(distribution, probeResult) {
    if (distribution.isSparql()) {
        return null;
    }
    const isDataDump = probeResult instanceof DataDumpProbeResult;
    const modifiedDate = mostRecent(distribution.lastModified, isDataDump ? probeResult.lastModified : undefined);
    // Prefer the size the server reported; fall back to the declared size. Both
    // are validated, so a NaN/null declared size cannot leak into the result.
    const byteSize = usableSize(isDataDump ? probeResult.contentSize : undefined) ??
        usableSize(distribution.byteSize);
    if (modifiedDate === undefined && byteSize === undefined) {
        return null;
    }
    return `${modifiedDate?.toISOString() ?? ''}|${byteSize ?? ''}`;
}
/**
 * The given size when it is usable, otherwise `undefined`. `null`, `undefined`
 * and `NaN` all count as absent.
 */
function usableSize(size) {
    return size == null || Number.isNaN(size) ? undefined : size;
}
/**
 * The most recent of the given dates, ignoring anything that is not a valid
 * `Date` (`undefined`, `null`, Invalid Dates). Filtering invalid dates keeps a
 * malformed metadata value from being selected (which would make `toISOString`
 * throw) and from sticking ahead of a valid date – `validDate > invalidDate`
 * is `number > NaN`, i.e. always `false`.
 */
function mostRecent(...dates) {
    return dates.reduce((latest, date) => {
        if (!(date instanceof Date) || Number.isNaN(date.getTime()))
            return latest;
        return latest === undefined || date > latest ? date : latest;
    }, undefined);
}
|
|
@@ -21,12 +21,23 @@ export interface FileWriterOptions {
|
|
|
21
21
|
* Only used when format is 'turtle'.
|
|
22
22
|
*/
|
|
23
23
|
prefixes?: Record<string, string>;
|
|
24
|
+
/**
|
|
25
|
+
* Derive the named-graph IRI each quad is written into. Only meaningful for
|
|
26
|
+
* format `'n-quads'`; ignored for `'turtle'` and `'n-triples'`, which have no
|
|
27
|
+
* graph slot. When set, every quad is re-emitted with this graph term,
|
|
28
|
+
* regardless of the quad's own graph — mirroring
|
|
29
|
+
* {@link SparqlUpdateWriter}'s `graphIri`, so the same callback produces the
|
|
30
|
+
* same named-graph structure whether you write to a SPARQL store or to files.
|
|
31
|
+
* Defaults to undefined, in which case quads are written as-is, keeping whatever graph term they already carry (normally the default graph).
|
|
32
|
+
*/
|
|
33
|
+
graphIri?: (dataset: Dataset) => URL;
|
|
24
34
|
}
|
|
25
35
|
export declare class FileWriter implements Writer {
|
|
26
36
|
private readonly outputDir;
|
|
27
37
|
readonly format: 'turtle' | 'n-triples' | 'n-quads';
|
|
28
38
|
private readonly replacementCharacter;
|
|
29
39
|
private readonly prefixes?;
|
|
40
|
+
private readonly graphIri?;
|
|
30
41
|
private readonly activeWriters;
|
|
31
42
|
constructor(options: FileWriterOptions);
|
|
32
43
|
write(dataset: Dataset, quads: AsyncIterable<Quad>): Promise<void>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fileWriter.d.ts","sourceRoot":"","sources":["../../src/writer/fileWriter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AAMzC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAErC,MAAM,WAAW,iBAAiB;IAChC;;OAEG;IACH,SAAS,EAAE,MAAM,CAAC;IAClB;;;OAGG;IACH,MAAM,CAAC,EAAE,QAAQ,GAAG,WAAW,GAAG,SAAS,CAAC;IAC5C;;;OAGG;IACH,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B;;;OAGG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"fileWriter.d.ts","sourceRoot":"","sources":["../../src/writer/fileWriter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AAMzC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAErC,MAAM,WAAW,iBAAiB;IAChC;;OAEG;IACH,SAAS,EAAE,MAAM,CAAC;IAClB;;;OAGG;IACH,MAAM,CAAC,EAAE,QAAQ,GAAG,WAAW,GAAG,SAAS,CAAC;IAC5C;;;OAGG;IACH,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B;;;OAGG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAClC;;;;;;;;OAQG;IACH,QAAQ,CAAC,EAAE,CAAC,OAAO,EAAE,OAAO,KAAK,GAAG,CAAC;CACtC;AAiBD,qBAAa,UAAW,YAAW,MAAM;IACvC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,QAAQ,CAAC,MAAM,EAAE,QAAQ,GAAG,WAAW,GAAG,SAAS,CAAC;IACpD,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAS;IAC9C,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAyB;IACnD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAA4B;IACtD,OAAO,CAAC,QAAQ,CAAC,aAAa,CAG1B;gBAEQ,OAAO,EAAE,iBAAiB;IAQhC,KAAK,CAAC,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,aAAa,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;IAiClE,KAAK,CAAC,OAAO,EAAE,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC;IA8B5C,aAAa,CAAC,OAAO,EAAE,OAAO,GAAG,MAAM;IAIvC,WAAW,CAAC,OAAO,EAAE,OAAO,GAAG,MAAM;IAQrC,OAAO,CAAC,WAAW;YAIL,iBAAiB;IA4B/B,OAAO,CAAC,YAAY;CAUrB"}
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { createWriteStream } from 'node:fs';
|
|
2
|
-
import { mkdir } from 'node:fs/promises';
|
|
2
|
+
import { mkdir, rename, rm } from 'node:fs/promises';
|
|
3
3
|
import { join, dirname } from 'node:path';
|
|
4
4
|
import filenamifyUrl from 'filenamify-url';
|
|
5
|
-
import { Writer as N3Writer } from 'n3';
|
|
5
|
+
import { DataFactory, Writer as N3Writer } from 'n3';
|
|
6
6
|
/**
|
|
7
7
|
* Streams RDF quads to files on disk using N3 Writer.
|
|
8
8
|
*
|
|
@@ -22,12 +22,14 @@ export class FileWriter {
|
|
|
22
22
|
format;
|
|
23
23
|
replacementCharacter;
|
|
24
24
|
prefixes;
|
|
25
|
+
graphIri;
|
|
25
26
|
activeWriters = new Map();
|
|
26
27
|
constructor(options) {
|
|
27
28
|
this.outputDir = options.outputDir;
|
|
28
29
|
this.format = options.format ?? 'n-triples';
|
|
29
30
|
this.replacementCharacter = options.replacementCharacter ?? '-';
|
|
30
31
|
this.prefixes = options.prefixes;
|
|
32
|
+
this.graphIri = options.graphIri;
|
|
31
33
|
}
|
|
32
34
|
async write(dataset, quads) {
|
|
33
35
|
// Peek at the first quad to avoid creating empty files.
|
|
@@ -36,9 +38,18 @@ export class FileWriter {
|
|
|
36
38
|
if (first.done)
|
|
37
39
|
return;
|
|
38
40
|
const { n3Writer } = await this.getOrCreateWriter(dataset);
|
|
39
|
-
|
|
41
|
+
// Re-emit each quad into the configured named graph (n-quads only). The
|
|
42
|
+
// pipeline's quads carry no graph context, so the graph is supplied here
|
|
43
|
+
// exactly as SparqlUpdateWriter supplies it via INSERT DATA { GRAPH … }.
|
|
44
|
+
const graphNode = this.format === 'n-quads' && this.graphIri
|
|
45
|
+
? DataFactory.namedNode(this.graphIri(dataset).toString())
|
|
46
|
+
: undefined;
|
|
47
|
+
const addQuad = (quad) => n3Writer.addQuad(graphNode
|
|
48
|
+
? DataFactory.quad(quad.subject, quad.predicate, quad.object, graphNode)
|
|
49
|
+
: quad);
|
|
50
|
+
addQuad(first.value);
|
|
40
51
|
for await (const quad of { [Symbol.asyncIterator]: () => iterator }) {
|
|
41
|
-
|
|
52
|
+
addQuad(quad);
|
|
42
53
|
}
|
|
43
54
|
}
|
|
44
55
|
async flush(dataset) {
|
|
@@ -47,18 +58,29 @@ export class FileWriter {
|
|
|
47
58
|
if (!entry)
|
|
48
59
|
return;
|
|
49
60
|
this.activeWriters.delete(key);
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
if (
|
|
57
|
-
reject(
|
|
58
|
-
|
|
59
|
-
|
|
61
|
+
// Quads are streamed to a sibling temp file; only on a clean flush is it
|
|
62
|
+
// atomically renamed onto the final path. A crash therefore leaves at most
|
|
63
|
+
// a stale `*.tmp` — never a truncated final file — so a downstream index
|
|
64
|
+
// rebuild that globs the final extension never reads a half-written file.
|
|
65
|
+
try {
|
|
66
|
+
await new Promise((resolve, reject) => {
|
|
67
|
+
if (entry.stream.errored) {
|
|
68
|
+
reject(entry.stream.errored);
|
|
69
|
+
return;
|
|
70
|
+
}
|
|
71
|
+
entry.n3Writer.end((error) => {
|
|
72
|
+
if (error)
|
|
73
|
+
reject(error);
|
|
74
|
+
else
|
|
75
|
+
resolve();
|
|
76
|
+
});
|
|
60
77
|
});
|
|
61
|
-
}
|
|
78
|
+
}
|
|
79
|
+
catch (error) {
|
|
80
|
+
await rm(entry.tempPath, { force: true, recursive: true });
|
|
81
|
+
throw error;
|
|
82
|
+
}
|
|
83
|
+
await rename(entry.tempPath, key);
|
|
62
84
|
}
|
|
63
85
|
getOutputPath(dataset) {
|
|
64
86
|
return this.getFilePath(dataset);
|
|
@@ -79,7 +101,11 @@ export class FileWriter {
|
|
|
79
101
|
if (existing)
|
|
80
102
|
return existing;
|
|
81
103
|
await mkdir(dirname(key), { recursive: true });
|
|
82
|
-
|
|
104
|
+
// Write to a sibling temp file (same directory, so the flush rename stays on
|
|
105
|
+
// one filesystem and is atomic). The `.tmp` suffix keeps it out of any glob
|
|
106
|
+
// on the final extension.
|
|
107
|
+
const tempPath = `${key}.tmp`;
|
|
108
|
+
const stream = createWriteStream(tempPath, { flags: 'w' });
|
|
83
109
|
stream.on('error', (error) => {
|
|
84
110
|
// Surface stream errors when flushing; prevents 'unhandled error' crashes.
|
|
85
111
|
stream.destroy(error);
|
|
@@ -88,7 +114,7 @@ export class FileWriter {
|
|
|
88
114
|
format: formatMap[this.format],
|
|
89
115
|
prefixes: this.prefixes,
|
|
90
116
|
});
|
|
91
|
-
const entry = { n3Writer, stream };
|
|
117
|
+
const entry = { n3Writer, stream, tempPath };
|
|
92
118
|
this.activeWriters.set(key, entry);
|
|
93
119
|
return entry;
|
|
94
120
|
}
|