@lde/pipeline 0.6.28 → 0.6.29
This diff shows the content of publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
- package/README.md +1 -3
- package/dist/pipeline.d.ts +15 -7
- package/dist/pipeline.d.ts.map +1 -1
- package/dist/pipeline.js +55 -30
- package/package.json +1 -1
package/README.md
CHANGED

@@ -86,9 +86,7 @@ import {
 } from '@lde/pipeline';

 const pipeline = new Pipeline({
-  name: 'example',
   datasetSelector: new ManualDatasetSelection([dataset]),
-  distributionResolver: new SparqlDistributionResolver(),
   stages: [
     new Stage({
       name: 'per-class',
@@ -101,7 +99,7 @@ const pipeline = new Pipeline({
       }),
     }),
   ],
-
+  writers: new SparqlUpdateWriter({
     endpoint: new URL('http://localhost:7200/repositories/lde/statements'),
   }),
 });
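The README edit tracks the new constructor contract in this release: `name` and `distributionResolver` are optional (the resolver defaults to `SparqlDistributionResolver`, and `name` to an empty string), and the writer is now passed as `writers`. A minimal sketch of the updated example; `dataset` and `perClassStage` (the README's 'per-class' `Stage`) are hypothetical stand-ins for values defined earlier in the README and unchanged by this release:

```ts
import {
  ManualDatasetSelection,
  Pipeline,
  SparqlUpdateWriter,
} from '@lde/pipeline';

// Sketch only: `dataset` and `perClassStage` come from the surrounding
// README example, which is not fully shown in this hunk.
const pipeline = new Pipeline({
  datasetSelector: new ManualDatasetSelection([dataset]),
  stages: [perClassStage],
  writers: new SparqlUpdateWriter({
    endpoint: new URL('http://localhost:7200/repositories/lde/statements'),
  }),
});

await pipeline.run();
```

Passing an array to `writers` is also accepted; the pipeline then fans the same output out to every writer (see the `FanOutWriter` added in `pipeline.js` below).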
package/dist/pipeline.d.ts
CHANGED

@@ -5,18 +5,26 @@ import { type DistributionResolver } from './distribution/resolver.js';
 import type { StageOutputResolver } from './stageOutputResolver.js';
 import type { ProgressReporter } from './progressReporter.js';
 export interface PipelineOptions {
-    name: string;
     datasetSelector: DatasetSelector;
     stages: Stage[];
-
-
-
-
-
+    writers: Writer | Writer[];
+    name?: string;
+    distributionResolver?: DistributionResolver;
+    chaining?: {
+        stageOutputResolver: StageOutputResolver;
+        outputDir: string;
+        outputFormat?: 'turtle' | 'n-triples' | 'n-quads';
+    };
     reporter?: ProgressReporter;
 }
 export declare class Pipeline {
-    private readonly
+    private readonly name;
+    private readonly datasetSelector;
+    private readonly stages;
+    private readonly writer;
+    private readonly distributionResolver;
+    private readonly chaining?;
+    private readonly reporter?;
     constructor(options: PipelineOptions);
     run(): Promise<void>;
     private processDataset;
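The reshaped `PipelineOptions` makes `writers` the only new required option and groups the chained-stage settings under an optional `chaining` object, which `pipeline.js` below requires whenever a stage has sub-stages. A hedged sketch of the new shape, assuming `Pipeline` and `PipelineOptions` are re-exported from the package root; the `declare const` placeholders are hypothetical and typed off the interface rather than copied from the package:

```ts
import { Pipeline, type PipelineOptions } from '@lde/pipeline';

// Hypothetical placeholders; substitute concrete DatasetSelector, Stage,
// Writer, and StageOutputResolver implementations from your own setup.
declare const myDatasetSelector: PipelineOptions['datasetSelector'];
declare const myStages: PipelineOptions['stages'];
declare const myWriter: Exclude<PipelineOptions['writers'], unknown[]>;
declare const myStageOutputResolver: NonNullable<
  PipelineOptions['chaining']
>['stageOutputResolver'];

const pipeline = new Pipeline({
  datasetSelector: myDatasetSelector,
  stages: myStages,
  writers: [myWriter],          // a single Writer is also accepted
  name: 'example',              // now optional
  // distributionResolver may be omitted; it defaults to SparqlDistributionResolver
  chaining: {                   // required when any stage has sub-stages
    stageOutputResolver: myStageOutputResolver,
    outputDir: './output',
    outputFormat: 'n-quads',    // or 'turtle' | 'n-triples'
  },
});
```

Grouping the chained-stage settings into one object matches the constructor's single `!options.chaining` check in `pipeline.js` below, replacing the separate per-option checks of 0.6.28.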
package/dist/pipeline.d.ts.map
CHANGED

@@ -1 +1 @@
-{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AACrD,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO,EACL,KAAK,oBAAoB,EAE1B,MAAM,4BAA4B,CAAC;
+{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AACrD,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO,EACL,KAAK,oBAAoB,EAE1B,MAAM,4BAA4B,CAAC;AAGpC,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,0BAA0B,CAAC;AACpE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAE9D,MAAM,WAAW,eAAe;IAC9B,eAAe,EAAE,eAAe,CAAC;IACjC,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IAC3B,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,oBAAoB,CAAC,EAAE,oBAAoB,CAAC;IAC5C,QAAQ,CAAC,EAAE;QACT,mBAAmB,EAAE,mBAAmB,CAAC;QACzC,SAAS,EAAE,MAAM,CAAC;QAClB,YAAY,CAAC,EAAE,QAAQ,GAAG,WAAW,GAAG,SAAS,CAAC;KACnD,CAAC;IACF,QAAQ,CAAC,EAAE,gBAAgB,CAAC;CAC7B;AAmBD,qBAAa,QAAQ;IACnB,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAS;IAC9B,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAkB;IAClD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAU;IACjC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAS;IAChC,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAuB;IAC5D,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAA8B;IACxD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAmB;gBAEjC,OAAO,EAAE,eAAe;IAoB9B,GAAG,IAAI,OAAO,CAAC,IAAI,CAAC;YAaZ,cAAc;YA0Bd,QAAQ;YA8BR,QAAQ;YAmDR,eAAe;YAkCd,SAAS;CAUzB"}
package/dist/pipeline.js
CHANGED

@@ -2,40 +2,67 @@ import { createReadStream } from 'node:fs';
 import { StreamParser } from 'n3';
 import { FileWriter } from './writer/fileWriter.js';
 import { NoDistributionAvailable, } from './distribution/resolver.js';
+import { SparqlDistributionResolver } from './distribution/index.js';
 import { NotSupported } from './sparql/executor.js';
+class FanOutWriter {
+    writers;
+    constructor(writers) {
+        this.writers = writers;
+    }
+    async write(dataset, quads) {
+        const collected = [];
+        for await (const quad of quads)
+            collected.push(quad);
+        for (const w of this.writers) {
+            await w.write(dataset, (async function* () {
+                yield* collected;
+            })());
+        }
+    }
+}
 export class Pipeline {
-
+    name;
+    datasetSelector;
+    stages;
+    writer;
+    distributionResolver;
+    chaining;
+    reporter;
     constructor(options) {
         const hasSubStages = options.stages.some((stage) => stage.stages.length > 0);
-        if (hasSubStages && !options.
-            throw new Error('
-        }
-        if (hasSubStages && !options.outputDir) {
-            throw new Error('outputDir is required when any stage has sub-stages');
+        if (hasSubStages && !options.chaining) {
+            throw new Error('chaining is required when any stage has sub-stages');
         }
-        this.
+        this.name = options.name ?? '';
+        this.datasetSelector = options.datasetSelector;
+        this.stages = options.stages;
+        this.writer = Array.isArray(options.writers)
+            ? new FanOutWriter(options.writers)
+            : options.writers;
+        this.distributionResolver =
+            options.distributionResolver ?? new SparqlDistributionResolver();
+        this.chaining = options.chaining;
+        this.reporter = options.reporter;
     }
     async run() {
-        const { datasetSelector, reporter, name } = this.options;
         const start = Date.now();
-        reporter?.pipelineStart(name);
-        const datasets = await datasetSelector.select();
+        this.reporter?.pipelineStart(this.name);
+        const datasets = await this.datasetSelector.select();
         for await (const dataset of datasets) {
             await this.processDataset(dataset);
         }
-        reporter?.pipelineComplete({ duration: Date.now() - start });
+        this.reporter?.pipelineComplete({ duration: Date.now() - start });
     }
     async processDataset(dataset) {
-        const { distributionResolver, reporter } = this.options;
         const datasetIri = dataset.iri.toString();
-        reporter?.datasetStart(datasetIri);
-        const resolved = await distributionResolver.resolve(dataset);
+        this.reporter?.datasetStart(datasetIri);
+        const resolved = await this.distributionResolver.resolve(dataset);
         if (resolved instanceof NoDistributionAvailable) {
-            reporter?.datasetSkipped(datasetIri, resolved.message);
+            this.reporter?.datasetSkipped(datasetIri, resolved.message);
             return;
         }
         try {
-            for (const stage of this.
+            for (const stage of this.stages) {
                 if (stage.stages.length > 0) {
                     await this.runChain(dataset, resolved.distribution, stage);
                 }
@@ -47,26 +74,25 @@ export class Pipeline {
         catch {
             // Stage error for this dataset; continue to next dataset.
         }
-        reporter?.datasetComplete(datasetIri);
+        this.reporter?.datasetComplete(datasetIri);
     }
     async runStage(dataset, distribution, stage) {
-
-        reporter?.stageStart(stage.name);
+        this.reporter?.stageStart(stage.name);
         const stageStart = Date.now();
         let elementsProcessed = 0;
         let quadsGenerated = 0;
-        const result = await stage.run(dataset, distribution, writer, {
+        const result = await stage.run(dataset, distribution, this.writer, {
            onProgress: (elements, quads) => {
                elementsProcessed = elements;
                quadsGenerated = quads;
-                reporter?.stageProgress({ elementsProcessed, quadsGenerated });
+                this.reporter?.stageProgress({ elementsProcessed, quadsGenerated });
            },
        });
        if (result instanceof NotSupported) {
-            reporter?.stageSkipped(stage.name, result.message);
+            this.reporter?.stageSkipped(stage.name, result.message);
        }
        else {
-            reporter?.stageComplete(stage.name, {
+            this.reporter?.stageComplete(stage.name, {
                elementsProcessed,
                quadsGenerated,
                duration: Date.now() - stageStart,
@@ -74,7 +100,7 @@ export class Pipeline {
         }
     }
     async runChain(dataset, distribution, stage) {
-        const {
+        const { stageOutputResolver, outputDir, outputFormat } = this.chaining;
         const outputFiles = [];
         try {
             // 1. Run parent stage → FileWriter.
@@ -99,15 +125,14 @@ export class Pipeline {
             }
         }
         // 3. Concatenate all output files → user writer.
-        await writer.write(dataset, this.readFiles(outputFiles));
+        await this.writer.write(dataset, this.readFiles(outputFiles));
     }
     finally {
         await stageOutputResolver.cleanup();
     }
     }
     async runChainedStage(dataset, distribution, stage, stageWriter) {
-
-        reporter?.stageStart(stage.name);
+        this.reporter?.stageStart(stage.name);
        const stageStart = Date.now();
        let elementsProcessed = 0;
        let quadsGenerated = 0;
@@ -115,14 +140,14 @@ export class Pipeline {
        onProgress: (elements, quads) => {
            elementsProcessed = elements;
            quadsGenerated = quads;
-            reporter?.stageProgress({ elementsProcessed, quadsGenerated });
+            this.reporter?.stageProgress({ elementsProcessed, quadsGenerated });
        },
        });
        if (result instanceof NotSupported) {
-            reporter?.stageSkipped(stage.name, result.message);
+            this.reporter?.stageSkipped(stage.name, result.message);
            throw new Error(`Stage '${stage.name}' returned NotSupported in chained mode`);
        }
-        reporter?.stageComplete(stage.name, {
+        this.reporter?.stageComplete(stage.name, {
            elementsProcessed,
            quadsGenerated,
            duration: Date.now() - stageStart,
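The compiled output shows how the `writers` option is handled: when it is an array, the constructor wraps it in the internal `FanOutWriter`, which buffers the stage's async quad stream once and replays it to each writer in turn; a single writer is used directly. Below is a typed restatement of that fan-out logic; the `write(dataset, quads)` shape is inferred from the calls in the compiled code above, not taken from the package's own `Writer` interface:

```ts
// Minimal typed sketch of the FanOutWriter logic in pipeline.js above.
interface QuadSink<D, Q> {
  write(dataset: D, quads: AsyncIterable<Q>): Promise<void>;
}

class FanOut<D, Q> implements QuadSink<D, Q> {
  constructor(private readonly writers: QuadSink<D, Q>[]) {}

  async write(dataset: D, quads: AsyncIterable<Q>): Promise<void> {
    // Buffer the single-pass stream once so it can be replayed to every
    // writer; for very large stage outputs this trades memory for simplicity.
    const collected: Q[] = [];
    for await (const quad of quads) {
      collected.push(quad);
    }
    for (const writer of this.writers) {
      await writer.write(
        dataset,
        (async function* () {
          yield* collected;
        })(),
      );
    }
  }
}
```

As in the compiled code, writers run sequentially, so an error thrown by one writer stops the remaining writers from receiving that dataset's output.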