@lde/pipeline 0.6.23 → 0.6.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/asyncQueue.d.ts +29 -0
- package/dist/asyncQueue.d.ts.map +1 -0
- package/dist/asyncQueue.js +106 -0
- package/dist/index.d.ts +3 -4
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -4
- package/dist/pipeline.d.ts +24 -7
- package/dist/pipeline.d.ts.map +1 -1
- package/dist/pipeline.js +129 -30
- package/dist/progressReporter.d.ts +21 -0
- package/dist/progressReporter.d.ts.map +1 -0
- package/dist/progressReporter.js +1 -0
- package/dist/sparql/executor.d.ts +7 -2
- package/dist/sparql/executor.d.ts.map +1 -1
- package/dist/sparql/executor.js +9 -2
- package/dist/stage.d.ts +11 -2
- package/dist/stage.d.ts.map +1 -1
- package/dist/stage.js +99 -17
- package/dist/stageOutputResolver.d.ts +6 -0
- package/dist/stageOutputResolver.d.ts.map +1 -0
- package/dist/stageOutputResolver.js +1 -0
- package/dist/writer/fileWriter.d.ts +3 -2
- package/dist/writer/fileWriter.d.ts.map +1 -1
- package/dist/writer/fileWriter.js +3 -0
- package/package.json +5 -7
- package/dist/builder.d.ts +0 -120
- package/dist/builder.d.ts.map +0 -1
- package/dist/builder.js +0 -116
- package/dist/config.d.ts +0 -71
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js +0 -114
- package/dist/import.d.ts +0 -30
- package/dist/import.d.ts.map +0 -1
- package/dist/import.js +0 -44
- package/dist/step/sparqlQuery.d.ts +0 -35
- package/dist/step/sparqlQuery.d.ts.map +0 -1
- package/dist/step/sparqlQuery.js +0 -38
- package/dist/step.d.ts +0 -55
- package/dist/step.d.ts.map +0 -1
- package/dist/step.js +0 -39
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* A bounded async channel: producers `push()` items, a single consumer
|
|
3
|
+
* iterates with `for await...of`. Backpressure is applied when the buffer
|
|
4
|
+
* reaches `capacity` — `push()` will block until the consumer pulls.
|
|
5
|
+
*/
|
|
6
|
+
export declare class AsyncQueue<T> implements AsyncIterable<T> {
|
|
7
|
+
private buffer;
|
|
8
|
+
private readonly capacity;
|
|
9
|
+
private closed;
|
|
10
|
+
private error;
|
|
11
|
+
/** Resolvers for a blocked consumer waiting for data or close/abort. */
|
|
12
|
+
private consumerResolve?;
|
|
13
|
+
private consumerReject?;
|
|
14
|
+
/** Resolvers for blocked producers waiting for buffer space. */
|
|
15
|
+
private producerResolvers;
|
|
16
|
+
constructor(capacity?: number);
|
|
17
|
+
/**
|
|
18
|
+
* Push an item into the queue. Blocks (returns a Promise) when the buffer
|
|
19
|
+
* is full. Throws if the queue has been closed or aborted.
|
|
20
|
+
*/
|
|
21
|
+
push(item: T): Promise<void>;
|
|
22
|
+
/** Signal that no more items will be pushed. */
|
|
23
|
+
close(): void;
|
|
24
|
+
/** Signal an error. Unblocks all waiting producers and the consumer. */
|
|
25
|
+
abort(error: unknown): void;
|
|
26
|
+
[Symbol.asyncIterator](): AsyncIterator<T, undefined>;
|
|
27
|
+
private pull;
|
|
28
|
+
}
|
|
29
|
+
//# sourceMappingURL=asyncQueue.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"asyncQueue.d.ts","sourceRoot":"","sources":["../src/asyncQueue.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH,qBAAa,UAAU,CAAC,CAAC,CAAE,YAAW,aAAa,CAAC,CAAC,CAAC;IACpD,OAAO,CAAC,MAAM,CAAW;IACzB,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAClC,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,KAAK,CAAsB;IAEnC,wEAAwE;IACxE,OAAO,CAAC,eAAe,CAAC,CAAgD;IACxE,OAAO,CAAC,cAAc,CAAC,CAA4B;IAEnD,gEAAgE;IAChE,OAAO,CAAC,iBAAiB,CAGjB;gBAEI,QAAQ,SAAM;IAI1B;;;OAGG;IACG,IAAI,CAAC,IAAI,EAAE,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;IA2BlC,gDAAgD;IAChD,KAAK,IAAI,IAAI;IAab,wEAAwE;IACxE,KAAK,CAAC,KAAK,EAAE,OAAO,GAAG,IAAI;IAoB3B,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,aAAa,CAAC,CAAC,EAAE,SAAS,CAAC;IAMrD,OAAO,CAAC,IAAI;CA2Bb"}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* A bounded async channel: producers `push()` items, a single consumer
|
|
3
|
+
* iterates with `for await...of`. Backpressure is applied when the buffer
|
|
4
|
+
* reaches `capacity` — `push()` will block until the consumer pulls.
|
|
5
|
+
*/
|
|
6
|
+
export class AsyncQueue {
|
|
7
|
+
buffer = [];
|
|
8
|
+
capacity;
|
|
9
|
+
closed = false;
|
|
10
|
+
error = undefined;
|
|
11
|
+
/** Resolvers for a blocked consumer waiting for data or close/abort. */
|
|
12
|
+
consumerResolve;
|
|
13
|
+
consumerReject;
|
|
14
|
+
/** Resolvers for blocked producers waiting for buffer space. */
|
|
15
|
+
producerResolvers = [];
|
|
16
|
+
constructor(capacity = 128) {
|
|
17
|
+
this.capacity = capacity;
|
|
18
|
+
}
|
|
19
|
+
/**
|
|
20
|
+
* Push an item into the queue. Blocks (returns a Promise) when the buffer
|
|
21
|
+
* is full. Throws if the queue has been closed or aborted.
|
|
22
|
+
*/
|
|
23
|
+
async push(item) {
|
|
24
|
+
if (this.error !== undefined) {
|
|
25
|
+
throw this.error;
|
|
26
|
+
}
|
|
27
|
+
if (this.closed) {
|
|
28
|
+
throw new Error('Cannot push to a closed queue');
|
|
29
|
+
}
|
|
30
|
+
// If a consumer is already waiting, deliver directly.
|
|
31
|
+
if (this.consumerResolve) {
|
|
32
|
+
const resolve = this.consumerResolve;
|
|
33
|
+
this.consumerResolve = undefined;
|
|
34
|
+
this.consumerReject = undefined;
|
|
35
|
+
resolve({ value: item, done: false });
|
|
36
|
+
return;
|
|
37
|
+
}
|
|
38
|
+
// Wait for space if buffer is at capacity.
|
|
39
|
+
if (this.buffer.length >= this.capacity) {
|
|
40
|
+
await new Promise((resolve, reject) => {
|
|
41
|
+
this.producerResolvers.push({ resolve, reject });
|
|
42
|
+
});
|
|
43
|
+
}
|
|
44
|
+
this.buffer.push(item);
|
|
45
|
+
}
|
|
46
|
+
/** Signal that no more items will be pushed. */
|
|
47
|
+
close() {
|
|
48
|
+
if (this.closed)
|
|
49
|
+
return;
|
|
50
|
+
this.closed = true;
|
|
51
|
+
// Wake a waiting consumer with done signal if buffer is empty.
|
|
52
|
+
if (this.buffer.length === 0 && this.consumerResolve) {
|
|
53
|
+
const resolve = this.consumerResolve;
|
|
54
|
+
this.consumerResolve = undefined;
|
|
55
|
+
this.consumerReject = undefined;
|
|
56
|
+
resolve({ value: undefined, done: true });
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
/** Signal an error. Unblocks all waiting producers and the consumer. */
|
|
60
|
+
abort(error) {
|
|
61
|
+
if (this.error !== undefined)
|
|
62
|
+
return; // first error wins
|
|
63
|
+
this.error = error;
|
|
64
|
+
this.closed = true;
|
|
65
|
+
// Reject all blocked producers.
|
|
66
|
+
for (const { reject } of this.producerResolvers) {
|
|
67
|
+
reject(error);
|
|
68
|
+
}
|
|
69
|
+
this.producerResolvers = [];
|
|
70
|
+
// Reject or resolve the consumer depending on buffered items.
|
|
71
|
+
if (this.consumerReject) {
|
|
72
|
+
const reject = this.consumerReject;
|
|
73
|
+
this.consumerResolve = undefined;
|
|
74
|
+
this.consumerReject = undefined;
|
|
75
|
+
reject(error);
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
[Symbol.asyncIterator]() {
|
|
79
|
+
return {
|
|
80
|
+
next: () => this.pull(),
|
|
81
|
+
};
|
|
82
|
+
}
|
|
83
|
+
pull() {
|
|
84
|
+
// Drain buffer first.
|
|
85
|
+
if (this.buffer.length > 0) {
|
|
86
|
+
const item = this.buffer.shift();
|
|
87
|
+
// Unblock one waiting producer.
|
|
88
|
+
if (this.producerResolvers.length > 0) {
|
|
89
|
+
this.producerResolvers.shift().resolve();
|
|
90
|
+
}
|
|
91
|
+
return Promise.resolve({ value: item, done: false });
|
|
92
|
+
}
|
|
93
|
+
// Buffer empty — check for error or closed.
|
|
94
|
+
if (this.error !== undefined) {
|
|
95
|
+
return Promise.reject(this.error);
|
|
96
|
+
}
|
|
97
|
+
if (this.closed) {
|
|
98
|
+
return Promise.resolve({ value: undefined, done: true });
|
|
99
|
+
}
|
|
100
|
+
// Wait for a producer to push or for close/abort.
|
|
101
|
+
return new Promise((resolve, reject) => {
|
|
102
|
+
this.consumerResolve = resolve;
|
|
103
|
+
this.consumerReject = reject;
|
|
104
|
+
});
|
|
105
|
+
}
|
|
106
|
+
}
|
package/dist/index.d.ts
CHANGED
|
@@ -1,11 +1,10 @@
|
|
|
1
|
+
export * from './asyncQueue.js';
|
|
1
2
|
export * from './batch.js';
|
|
2
3
|
export * from './pipeline.js';
|
|
4
|
+
export * from './progressReporter.js';
|
|
3
5
|
export * from './selector.js';
|
|
4
6
|
export * from './stage.js';
|
|
5
|
-
export * from './
|
|
6
|
-
export * from './step/sparqlQuery.js';
|
|
7
|
-
export * from './builder.js';
|
|
8
|
-
export * from './config.js';
|
|
7
|
+
export * from './stageOutputResolver.js';
|
|
9
8
|
export * from './sparql/index.js';
|
|
10
9
|
export * from './distribution/index.js';
|
|
11
10
|
export * from './writer/index.js';
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,eAAe,CAAC;AAC9B,cAAc,uBAAuB,CAAC;AACtC,cAAc,eAAe,CAAC;AAC9B,cAAc,YAAY,CAAC;AAC3B,cAAc,0BAA0B,CAAC;AACzC,cAAc,mBAAmB,CAAC;AAClC,cAAc,yBAAyB,CAAC;AACxC,cAAc,mBAAmB,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -1,11 +1,10 @@
|
|
|
1
|
+
export * from './asyncQueue.js';
|
|
1
2
|
export * from './batch.js';
|
|
2
3
|
export * from './pipeline.js';
|
|
4
|
+
export * from './progressReporter.js';
|
|
3
5
|
export * from './selector.js';
|
|
4
6
|
export * from './stage.js';
|
|
5
|
-
export * from './
|
|
6
|
-
export * from './step/sparqlQuery.js';
|
|
7
|
-
export * from './builder.js';
|
|
8
|
-
export * from './config.js';
|
|
7
|
+
export * from './stageOutputResolver.js';
|
|
9
8
|
export * from './sparql/index.js';
|
|
10
9
|
export * from './distribution/index.js';
|
|
11
10
|
export * from './writer/index.js';
|
package/dist/pipeline.d.ts
CHANGED
|
@@ -1,11 +1,28 @@
|
|
|
1
|
-
import { Selector } from './selector.js';
|
|
2
|
-
import {
|
|
1
|
+
import type { Selector } from './selector.js';
|
|
2
|
+
import { Stage } from './stage.js';
|
|
3
|
+
import type { Writer } from './writer/writer.js';
|
|
4
|
+
import { type DistributionResolver } from './distribution/resolver.js';
|
|
5
|
+
import type { StageOutputResolver } from './stageOutputResolver.js';
|
|
6
|
+
import type { ProgressReporter } from './progressReporter.js';
|
|
7
|
+
/** Configuration accepted by the `Pipeline` constructor. */
export interface PipelineOptions {
|
|
8
|
+
/** Pipeline name; passed to `reporter.pipelineStart()` when a run begins. */
name: string;
|
|
9
|
+
/** Source of the datasets to process; `run()` awaits `selector.select()` and iterates the result. */
selector: Selector;
|
|
10
|
+
/** Stages run in order for every dataset. A stage with sub-stages is run as a chain. */
stages: Stage[];
|
|
11
|
+
/** Destination for stage output. For a chain it receives the concatenated output files of the parent and its sub-stages. */
writer: Writer;
|
|
12
|
+
/** Resolves the distribution used for a dataset; a `NoDistributionAvailable` result causes the dataset to be skipped. */
distributionResolver: DistributionResolver;
|
|
13
|
+
/** Turns a stage's output file into the distribution for the next stage in a chain. Required when any stage has sub-stages. */
stageOutputResolver?: StageOutputResolver;
|
|
14
|
+
/** Base directory for intermediate chain output, written under `<outputDir>/<stage name>`. Required when any stage has sub-stages. */
outputDir?: string;
|
|
15
|
+
/** Serialization format handed to the intermediate file writers of a chain. */
outputFormat?: 'turtle' | 'n-triples' | 'n-quads';
|
|
16
|
+
/** Optional receiver of progress callbacks; nothing is reported when omitted. */
reporter?: ProgressReporter;
|
|
17
|
+
}
|
|
3
18
|
export declare class Pipeline {
|
|
4
|
-
private readonly
|
|
5
|
-
constructor(
|
|
6
|
-
selector: Selector;
|
|
7
|
-
steps: Step[];
|
|
8
|
-
});
|
|
19
|
+
private readonly options;
|
|
20
|
+
constructor(options: PipelineOptions);
|
|
9
21
|
run(): Promise<void>;
|
|
22
|
+
private processDataset;
|
|
23
|
+
private runStage;
|
|
24
|
+
private runChain;
|
|
25
|
+
private runChainedStage;
|
|
26
|
+
private readFiles;
|
|
10
27
|
}
|
|
11
28
|
//# sourceMappingURL=pipeline.d.ts.map
|
package/dist/pipeline.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AAC9C,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO,EACL,KAAK,oBAAoB,EAE1B,MAAM,4BAA4B,CAAC;AAEpC,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,0BAA0B,CAAC;AACpE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAE9D,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,QAAQ,CAAC;IACnB,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,oBAAoB,EAAE,oBAAoB,CAAC;IAC3C,mBAAmB,CAAC,EAAE,mBAAmB,CAAC;IAC1C,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,YAAY,CAAC,EAAE,QAAQ,GAAG,WAAW,GAAG,SAAS,CAAC;IAClD,QAAQ,CAAC,EAAE,gBAAgB,CAAC;CAC7B;AAED,qBAAa,QAAQ;IACnB,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAkB;gBAE9B,OAAO,EAAE,eAAe;IAe9B,GAAG,IAAI,OAAO,CAAC,IAAI,CAAC;YAcZ,cAAc;YA2Bd,QAAQ;YAgCR,QAAQ;YAoDR,eAAe;YAoCd,SAAS;CAUzB"}
|
package/dist/pipeline.js
CHANGED
|
@@ -1,42 +1,141 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
1
|
+
import { createReadStream } from 'node:fs';
|
|
2
|
+
import { StreamParser } from 'n3';
|
|
3
|
+
import { FileWriter } from './writer/fileWriter.js';
|
|
4
|
+
import { NoDistributionAvailable, } from './distribution/resolver.js';
|
|
5
|
+
import { NotSupported } from './sparql/executor.js';
|
|
3
6
|
export class Pipeline {
|
|
4
|
-
|
|
5
|
-
constructor(
|
|
6
|
-
|
|
7
|
+
options;
|
|
8
|
+
constructor(options) {
|
|
9
|
+
const hasSubStages = options.stages.some((stage) => stage.stages.length > 0);
|
|
10
|
+
if (hasSubStages && !options.stageOutputResolver) {
|
|
11
|
+
throw new Error('stageOutputResolver is required when any stage has sub-stages');
|
|
12
|
+
}
|
|
13
|
+
if (hasSubStages && !options.outputDir) {
|
|
14
|
+
throw new Error('outputDir is required when any stage has sub-stages');
|
|
15
|
+
}
|
|
16
|
+
this.options = options;
|
|
7
17
|
}
|
|
8
18
|
async run() {
|
|
9
|
-
const
|
|
19
|
+
const { selector, reporter, name } = this.options;
|
|
20
|
+
const start = Date.now();
|
|
21
|
+
reporter?.pipelineStart(name);
|
|
22
|
+
const datasets = await selector.select();
|
|
10
23
|
for await (const dataset of datasets) {
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
24
|
+
await this.processDataset(dataset);
|
|
25
|
+
}
|
|
26
|
+
reporter?.pipelineComplete({ duration: Date.now() - start });
|
|
27
|
+
}
|
|
28
|
+
async processDataset(dataset) {
|
|
29
|
+
const { distributionResolver, reporter } = this.options;
|
|
30
|
+
const datasetIri = dataset.iri.toString();
|
|
31
|
+
reporter?.datasetStart(datasetIri);
|
|
32
|
+
const resolved = await distributionResolver.resolve(dataset);
|
|
33
|
+
if (resolved instanceof NoDistributionAvailable) {
|
|
34
|
+
reporter?.datasetSkipped(datasetIri, resolved.message);
|
|
35
|
+
return;
|
|
36
|
+
}
|
|
37
|
+
try {
|
|
38
|
+
for (const stage of this.options.stages) {
|
|
39
|
+
if (stage.stages.length > 0) {
|
|
40
|
+
await this.runChain(dataset, resolved.distribution, stage);
|
|
16
41
|
}
|
|
17
|
-
else
|
|
18
|
-
|
|
19
|
-
result.on('data', (data) => {
|
|
20
|
-
// TODO: pipe to writers.
|
|
21
|
-
console.log('Data:', data);
|
|
22
|
-
});
|
|
23
|
-
result.on('error', (error) => {
|
|
24
|
-
console.error('rejecting');
|
|
25
|
-
reject(error);
|
|
26
|
-
});
|
|
27
|
-
result.on('end', resolve);
|
|
28
|
-
});
|
|
29
|
-
await promise;
|
|
42
|
+
else {
|
|
43
|
+
await this.runStage(dataset, resolved.distribution, stage);
|
|
30
44
|
}
|
|
31
45
|
}
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
46
|
+
}
|
|
47
|
+
catch {
|
|
48
|
+
// Stage error for this dataset; continue to next dataset.
|
|
49
|
+
}
|
|
50
|
+
reporter?.datasetComplete(datasetIri);
|
|
51
|
+
}
|
|
52
|
+
async runStage(dataset, distribution, stage) {
|
|
53
|
+
const { writer, reporter } = this.options;
|
|
54
|
+
reporter?.stageStart(stage.name);
|
|
55
|
+
const stageStart = Date.now();
|
|
56
|
+
let elementsProcessed = 0;
|
|
57
|
+
let quadsGenerated = 0;
|
|
58
|
+
const result = await stage.run(dataset, distribution, writer, {
|
|
59
|
+
onProgress: (elements, quads) => {
|
|
60
|
+
elementsProcessed = elements;
|
|
61
|
+
quadsGenerated = quads;
|
|
62
|
+
reporter?.stageProgress({ elementsProcessed, quadsGenerated });
|
|
63
|
+
},
|
|
64
|
+
});
|
|
65
|
+
if (result instanceof NotSupported) {
|
|
66
|
+
reporter?.stageSkipped(stage.name, result.message);
|
|
67
|
+
}
|
|
68
|
+
else {
|
|
69
|
+
reporter?.stageComplete(stage.name, {
|
|
70
|
+
elementsProcessed,
|
|
71
|
+
quadsGenerated,
|
|
72
|
+
duration: Date.now() - stageStart,
|
|
73
|
+
});
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
async runChain(dataset, distribution, stage) {
|
|
77
|
+
const { writer, stageOutputResolver, outputDir, outputFormat } = this.options;
|
|
78
|
+
const outputFiles = [];
|
|
79
|
+
try {
|
|
80
|
+
// 1. Run parent stage → FileWriter.
|
|
81
|
+
const parentWriter = new FileWriter({
|
|
82
|
+
outputDir: `${outputDir}/${stage.name}`,
|
|
83
|
+
format: outputFormat,
|
|
84
|
+
});
|
|
85
|
+
await this.runChainedStage(dataset, distribution, stage, parentWriter);
|
|
86
|
+
outputFiles.push(parentWriter.getOutputPath(dataset));
|
|
87
|
+
// 2. Chain through children.
|
|
88
|
+
let currentDistribution = await stageOutputResolver.resolve(parentWriter.getOutputPath(dataset));
|
|
89
|
+
for (let i = 0; i < stage.stages.length; i++) {
|
|
90
|
+
const child = stage.stages[i];
|
|
91
|
+
const childWriter = new FileWriter({
|
|
92
|
+
outputDir: `${outputDir}/${child.name}`,
|
|
93
|
+
format: outputFormat,
|
|
94
|
+
});
|
|
95
|
+
await this.runChainedStage(dataset, currentDistribution, child, childWriter);
|
|
96
|
+
outputFiles.push(childWriter.getOutputPath(dataset));
|
|
97
|
+
if (i < stage.stages.length - 1) {
|
|
98
|
+
currentDistribution = await stageOutputResolver.resolve(childWriter.getOutputPath(dataset));
|
|
35
99
|
}
|
|
36
100
|
}
|
|
101
|
+
// 3. Concatenate all output files → user writer.
|
|
102
|
+
await writer.write(dataset, this.readFiles(outputFiles));
|
|
103
|
+
}
|
|
104
|
+
finally {
|
|
105
|
+
await stageOutputResolver.cleanup();
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
async runChainedStage(dataset, distribution, stage, stageWriter) {
|
|
109
|
+
const { reporter } = this.options;
|
|
110
|
+
reporter?.stageStart(stage.name);
|
|
111
|
+
const stageStart = Date.now();
|
|
112
|
+
let elementsProcessed = 0;
|
|
113
|
+
let quadsGenerated = 0;
|
|
114
|
+
const result = await stage.run(dataset, distribution, stageWriter, {
|
|
115
|
+
onProgress: (elements, quads) => {
|
|
116
|
+
elementsProcessed = elements;
|
|
117
|
+
quadsGenerated = quads;
|
|
118
|
+
reporter?.stageProgress({ elementsProcessed, quadsGenerated });
|
|
119
|
+
},
|
|
120
|
+
});
|
|
121
|
+
if (result instanceof NotSupported) {
|
|
122
|
+
reporter?.stageSkipped(stage.name, result.message);
|
|
123
|
+
throw new Error(`Stage '${stage.name}' returned NotSupported in chained mode`);
|
|
124
|
+
}
|
|
125
|
+
reporter?.stageComplete(stage.name, {
|
|
126
|
+
elementsProcessed,
|
|
127
|
+
quadsGenerated,
|
|
128
|
+
duration: Date.now() - stageStart,
|
|
129
|
+
});
|
|
130
|
+
}
|
|
131
|
+
async *readFiles(paths) {
|
|
132
|
+
for (const path of paths) {
|
|
133
|
+
const stream = createReadStream(path);
|
|
134
|
+
const parser = new StreamParser();
|
|
135
|
+
stream.pipe(parser);
|
|
136
|
+
for await (const quad of parser) {
|
|
137
|
+
yield quad;
|
|
138
|
+
}
|
|
37
139
|
}
|
|
38
140
|
}
|
|
39
141
|
}
|
|
40
|
-
const isFinishable = (step) => {
|
|
41
|
-
return typeof step.finish === 'function';
|
|
42
|
-
};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
 * Callbacks through which a `Pipeline` reports its progress. All callbacks
 * are synchronous notifications; their return values are ignored.
 */
export interface ProgressReporter {
|
|
2
|
+
/** Called once at the start of a run, before datasets are selected, with the pipeline's name. */
pipelineStart(name: string): void;
|
|
3
|
+
/** Called with the dataset's IRI before its distribution is resolved. */
datasetStart(dataset: string): void;
|
|
4
|
+
/** Called with the stage's name immediately before the stage is run. */
stageStart(stage: string): void;
|
|
5
|
+
/** Called whenever the running stage reports progress; the counters are the stage's latest reported totals. */
stageProgress(update: {
|
|
6
|
+
elementsProcessed: number;
|
|
7
|
+
quadsGenerated: number;
|
|
8
|
+
}): void;
|
|
9
|
+
/** Called when a stage finishes with output. `duration` is elapsed wall-clock time in milliseconds; the counters are the last reported totals (zero if the stage never reported progress). */
stageComplete(stage: string, result: {
|
|
10
|
+
elementsProcessed: number;
|
|
11
|
+
quadsGenerated: number;
|
|
12
|
+
duration: number;
|
|
13
|
+
}): void;
|
|
14
|
+
/** Called when a stage returns `NotSupported`; `reason` is that result's message. */
stageSkipped(stage: string, reason: string): void;
|
|
15
|
+
/** Called after the stage loop for a dataset has ended, including when a stage failed. Not called for skipped datasets. */
datasetComplete(dataset: string): void;
|
|
16
|
+
/** Called instead of running any stage when no distribution is available for the dataset; `reason` is the resolver's message. */
datasetSkipped(dataset: string, reason: string): void;
|
|
17
|
+
/** Called once after all datasets have been processed. `duration` is elapsed wall-clock time in milliseconds. */
pipelineComplete(result: {
|
|
18
|
+
duration: number;
|
|
19
|
+
}): void;
|
|
20
|
+
}
|
|
21
|
+
//# sourceMappingURL=progressReporter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"progressReporter.d.ts","sourceRoot":"","sources":["../src/progressReporter.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,gBAAgB;IAC/B,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,YAAY,CAAC,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACpC,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IAChC,aAAa,CAAC,MAAM,EAAE;QACpB,iBAAiB,EAAE,MAAM,CAAC;QAC1B,cAAc,EAAE,MAAM,CAAC;KACxB,GAAG,IAAI,CAAC;IACT,aAAa,CACX,KAAK,EAAE,MAAM,EACb,MAAM,EAAE;QACN,iBAAiB,EAAE,MAAM,CAAC;QAC1B,cAAc,EAAE,MAAM,CAAC;QACvB,QAAQ,EAAE,MAAM,CAAC;KAClB,GACA,IAAI,CAAC;IACR,YAAY,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IAClD,eAAe,CAAC,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvC,cAAc,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtD,gBAAgB,CAAC,MAAM,EAAE;QAAE,QAAQ,EAAE,MAAM,CAAA;KAAE,GAAG,IAAI,CAAC;CACtD"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -2,8 +2,13 @@ import { Dataset, Distribution } from '@lde/dataset';
|
|
|
2
2
|
import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
|
|
3
3
|
import type { NamedNode, Quad, Stream } from '@rdfjs/types';
|
|
4
4
|
import type { Readable } from 'node:stream';
|
|
5
|
-
|
|
6
|
-
|
|
5
|
+
/**
|
|
6
|
+
* An executor could not run because the dataset lacks a supported distribution.
|
|
7
|
+
*/
|
|
8
|
+
export declare class NotSupported {
|
|
9
|
+
readonly message: string;
|
|
10
|
+
constructor(message: string);
|
|
11
|
+
}
|
|
7
12
|
/** A single row of variable bindings (variable name → NamedNode). */
|
|
8
13
|
export type VariableBindings = Record<string, NamedNode>;
|
|
9
14
|
export interface ExecuteOptions {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"executor.d.ts","sourceRoot":"","sources":["../../src/sparql/executor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACrD,OAAO,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAC;AAC9D,OAAO,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,cAAc,CAAC;AAC5D,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;
|
|
1
|
+
{"version":3,"file":"executor.d.ts","sourceRoot":"","sources":["../../src/sparql/executor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACrD,OAAO,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAC;AAC9D,OAAO,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,cAAc,CAAC;AAC5D,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAO5C;;GAEG;AACH,qBAAa,YAAY;aACK,OAAO,EAAE,MAAM;gBAAf,OAAO,EAAE,MAAM;CAC5C;AAED,qEAAqE;AACrE,MAAM,MAAM,gBAAgB,GAAG,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;AAEzD,MAAM,WAAW,cAAc;IAC7B;;;OAGG;IACH,QAAQ,CAAC,EAAE,gBAAgB,EAAE,CAAC;CAC/B;AAED,MAAM,WAAW,QAAQ;IACvB,OAAO,CACL,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,YAAY,EAC1B,OAAO,CAAC,EAAE,cAAc,GACvB,OAAO,CAAC,aAAa,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,CAAC;CAChD;AAED;;;GAGG;AACH,MAAM,MAAM,UAAU,GAAG,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC;AAEjD;;GAEG;AACH,MAAM,WAAW,8BAA8B;IAC7C;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IAEd;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;OAEG;IACH,OAAO,CAAC,EAAE,qBAAqB,CAAC;CACjC;AAED;;;;;;;;;;;;;;;;;;;;;;GAsBG;AACH,qBAAa,uBAAwB,YAAW,QAAQ;IACtD,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAiB;IACvC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAwB;IAChD,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAmB;gBAEjC,OAAO,EAAE,8BAA8B;IAcnD;;;;;;;OAOG;IACG,OAAO,CACX,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,YAAY,EAC1B,OAAO,CAAC,EAAE,cAAc,GACvB,OAAO,CAAC,UAAU,CAAC;IAoBtB;;;;;OAKG;WACiB,QAAQ,CAC1B,QAAQ,EAAE,MAAM,EAChB,OAAO,CAAC,EAAE,IAAI,CAAC,8BAA8B,EAAE,OAAO,CAAC,GACtD,OAAO,CAAC,uBAAuB,CAAC;CAIpC;AAED;;;;;;GAMG;AACH,wBAAgB,wBAAwB,CACtC,KAAK,EAAE,MAAM,EACb,YAAY,EAAE,YAAY,GAAG,IAAI,EACjC,OAAO,EAAE,OAAO,GACf,MAAM,CAWR;AAED;;GAEG;AACH,wBAAsB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAErE"}
|
package/dist/sparql/executor.js
CHANGED
|
@@ -4,8 +4,15 @@ import { resolve } from 'node:path';
|
|
|
4
4
|
import { Generator, Parser } from 'sparqljs';
|
|
5
5
|
import { withDefaultGraph } from './graph.js';
|
|
6
6
|
import { injectValues } from './values.js';
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
/**
|
|
8
|
+
* An executor could not run because the dataset lacks a supported distribution.
*
* This is a plain result object, not an `Error` subclass: it is returned as a
* value and callers detect it with `instanceof NotSupported`.
|
|
9
|
+
*/
|
|
10
|
+
export class NotSupported {
|
|
11
|
+
/** Human-readable reason, surfaced to progress reporters as the skip reason. */
message;
|
|
12
|
+
/** @param message Explanation of why execution was not possible. */
constructor(message) {
|
|
13
|
+
this.message = message;
|
|
14
|
+
}
|
|
15
|
+
}
|
|
9
16
|
/**
|
|
10
17
|
* A streaming SPARQL CONSTRUCT executor that parses the query once (in the
|
|
11
18
|
* constructor) and operates on the AST for graph and VALUES injection.
|
package/dist/stage.d.ts
CHANGED
|
@@ -8,15 +8,24 @@ export interface StageOptions {
|
|
|
8
8
|
selector?: StageSelector;
|
|
9
9
|
/** Maximum number of bindings per executor call. @default 10 */
|
|
10
10
|
batchSize?: number;
|
|
11
|
+
/** Maximum concurrent in-flight executor batches. @default 10 */
|
|
12
|
+
maxConcurrency?: number;
|
|
13
|
+
/** Child stages that chain off this stage's output. */
|
|
14
|
+
stages?: Stage[];
|
|
15
|
+
}
|
|
16
|
+
export interface RunOptions {
|
|
17
|
+
onProgress?: (elementsProcessed: number, quadsGenerated: number) => void;
|
|
11
18
|
}
|
|
12
19
|
export declare class Stage {
|
|
13
20
|
readonly name: string;
|
|
21
|
+
readonly stages: readonly Stage[];
|
|
14
22
|
private readonly executors;
|
|
15
23
|
private readonly selector?;
|
|
16
24
|
private readonly batchSize;
|
|
25
|
+
private readonly maxConcurrency;
|
|
17
26
|
constructor(options: StageOptions);
|
|
18
|
-
run(dataset: Dataset, distribution: Distribution, writer: Writer): Promise<NotSupported | void>;
|
|
19
|
-
private
|
|
27
|
+
run(dataset: Dataset, distribution: Distribution, writer: Writer, options?: RunOptions): Promise<NotSupported | void>;
|
|
28
|
+
private runWithSelector;
|
|
20
29
|
private executeAll;
|
|
21
30
|
}
|
|
22
31
|
/** Stage-level selector that yields variable bindings for use in executor queries. Pagination is an implementation detail. */
|
package/dist/stage.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"stage.d.ts","sourceRoot":"","sources":["../src/stage.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAErD,OAAO,KAAK,EAAE,QAAQ,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AAEpD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;
|
|
1
|
+
{"version":3,"file":"stage.d.ts","sourceRoot":"","sources":["../src/stage.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAErD,OAAO,KAAK,EAAE,QAAQ,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AAEpD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAGjD,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,QAAQ,GAAG,QAAQ,EAAE,CAAC;IACjC,QAAQ,CAAC,EAAE,aAAa,CAAC;IACzB,gEAAgE;IAChE,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,iEAAiE;IACjE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,uDAAuD;IACvD,MAAM,CAAC,EAAE,KAAK,EAAE,CAAC;CAClB;AAED,MAAM,WAAW,UAAU;IACzB,UAAU,CAAC,EAAE,CAAC,iBAAiB,EAAE,MAAM,EAAE,cAAc,EAAE,MAAM,KAAK,IAAI,CAAC;CAC1E;AAED,qBAAa,KAAK;IAChB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,MAAM,EAAE,SAAS,KAAK,EAAE,CAAC;IAClC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAa;IACvC,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAgB;IAC1C,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAS;gBAE5B,OAAO,EAAE,YAAY;IAW3B,GAAG,CACP,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,YAAY,EAC1B,MAAM,EAAE,MAAM,EACd,OAAO,CAAC,EAAE,UAAU,GACnB,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC;YAajB,eAAe;YA8Gf,UAAU;CAqBzB;AAUD,8HAA8H;AAE9H,MAAM,WAAW,aAAc,SAAQ,aAAa,CAAC,gBAAgB,CAAC;CAAG"}
|
package/dist/stage.js
CHANGED
|
@@ -1,48 +1,130 @@
|
|
|
1
1
|
import { NotSupported } from './sparql/executor.js';
|
|
2
2
|
import { batch } from './batch.js';
|
|
3
|
+
import { AsyncQueue } from './asyncQueue.js';
|
|
3
4
|
export class Stage {
|
|
4
5
|
name;
|
|
6
|
+
stages;
|
|
5
7
|
executors;
|
|
6
8
|
selector;
|
|
7
9
|
batchSize;
|
|
10
|
+
maxConcurrency;
|
|
8
11
|
constructor(options) {
|
|
9
12
|
this.name = options.name;
|
|
13
|
+
this.stages = options.stages ?? [];
|
|
10
14
|
this.executors = Array.isArray(options.executors)
|
|
11
15
|
? options.executors
|
|
12
16
|
: [options.executors];
|
|
13
17
|
this.selector = options.selector;
|
|
14
18
|
this.batchSize = options.batchSize ?? 10;
|
|
19
|
+
this.maxConcurrency = options.maxConcurrency ?? 10;
|
|
15
20
|
}
|
|
16
|
-
async run(dataset, distribution, writer) {
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
21
|
+
async run(dataset, distribution, writer, options) {
|
|
22
|
+
if (this.selector) {
|
|
23
|
+
return this.runWithSelector(dataset, distribution, writer, options);
|
|
24
|
+
}
|
|
25
|
+
const streams = await this.executeAll(dataset, distribution);
|
|
20
26
|
if (streams instanceof NotSupported) {
|
|
21
27
|
return streams;
|
|
22
28
|
}
|
|
23
29
|
await writer.write(dataset, mergeStreams(streams));
|
|
24
30
|
}
|
|
25
|
-
async
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
+
async runWithSelector(dataset, distribution, writer, options) {
|
|
32
|
+
// Peek the first batch to detect an empty selector before starting the
|
|
33
|
+
// writer (important because e.g. SparqlUpdateWriter does CLEAR GRAPH).
|
|
34
|
+
const batches = batch(this.selector, this.batchSize);
|
|
35
|
+
const iter = batches[Symbol.asyncIterator]();
|
|
36
|
+
const first = await iter.next();
|
|
37
|
+
if (first.done) {
|
|
38
|
+
return new NotSupported('All executors returned NotSupported');
|
|
39
|
+
}
|
|
40
|
+
// Reconstruct a full iterable including the peeked first batch.
|
|
41
|
+
const allBatches = (async function* () {
|
|
42
|
+
yield first.value;
|
|
43
|
+
// Continue yielding remaining batches from the same iterator.
|
|
44
|
+
for (;;) {
|
|
45
|
+
const next = await iter.next();
|
|
46
|
+
if (next.done)
|
|
47
|
+
break;
|
|
48
|
+
yield next.value;
|
|
49
|
+
}
|
|
50
|
+
})();
|
|
51
|
+
const queue = new AsyncQueue();
|
|
52
|
+
let elementsProcessed = 0;
|
|
53
|
+
let quadsGenerated = 0;
|
|
54
|
+
let hasResults = false;
|
|
55
|
+
const dispatch = async () => {
|
|
56
|
+
const inFlight = new Set();
|
|
57
|
+
let firstError;
|
|
58
|
+
const track = (promise) => {
|
|
59
|
+
const p = promise.then(() => {
|
|
60
|
+
inFlight.delete(p);
|
|
61
|
+
}, (err) => {
|
|
62
|
+
inFlight.delete(p);
|
|
63
|
+
firstError ??= err;
|
|
31
64
|
});
|
|
32
|
-
|
|
33
|
-
|
|
65
|
+
inFlight.add(p);
|
|
66
|
+
};
|
|
67
|
+
try {
|
|
68
|
+
for await (const bindings of allBatches) {
|
|
69
|
+
if (firstError)
|
|
70
|
+
break;
|
|
71
|
+
for (const executor of this.executors) {
|
|
72
|
+
if (firstError)
|
|
73
|
+
break;
|
|
74
|
+
// Respect maxConcurrency: wait for a slot to open.
|
|
75
|
+
if (inFlight.size >= this.maxConcurrency) {
|
|
76
|
+
await Promise.race(inFlight);
|
|
77
|
+
if (firstError)
|
|
78
|
+
break;
|
|
79
|
+
}
|
|
80
|
+
track((async () => {
|
|
81
|
+
const result = await executor.execute(dataset, distribution, {
|
|
82
|
+
bindings,
|
|
83
|
+
});
|
|
84
|
+
if (!(result instanceof NotSupported)) {
|
|
85
|
+
hasResults = true;
|
|
86
|
+
for await (const quad of result) {
|
|
87
|
+
await queue.push(quad);
|
|
88
|
+
quadsGenerated++;
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
elementsProcessed += bindings.length;
|
|
92
|
+
options?.onProgress?.(elementsProcessed, quadsGenerated);
|
|
93
|
+
})());
|
|
94
|
+
}
|
|
34
95
|
}
|
|
35
96
|
}
|
|
36
|
-
|
|
37
|
-
|
|
97
|
+
catch (err) {
|
|
98
|
+
firstError ??= err;
|
|
99
|
+
}
|
|
100
|
+
// Wait for all remaining in-flight tasks to settle.
|
|
101
|
+
await Promise.all(inFlight);
|
|
102
|
+
if (firstError) {
|
|
103
|
+
queue.abort(firstError);
|
|
104
|
+
}
|
|
105
|
+
else {
|
|
106
|
+
queue.close();
|
|
107
|
+
}
|
|
108
|
+
};
|
|
109
|
+
const dispatchPromise = dispatch();
|
|
110
|
+
const writePromise = (async () => {
|
|
111
|
+
try {
|
|
112
|
+
await writer.write(dataset, queue);
|
|
113
|
+
}
|
|
114
|
+
catch (err) {
|
|
115
|
+
queue.abort(err);
|
|
116
|
+
throw err;
|
|
117
|
+
}
|
|
118
|
+
})();
|
|
119
|
+
await Promise.all([dispatchPromise, writePromise]);
|
|
120
|
+
if (!hasResults) {
|
|
38
121
|
return new NotSupported('All executors returned NotSupported');
|
|
39
122
|
}
|
|
40
|
-
return streams;
|
|
41
123
|
}
|
|
42
124
|
async executeAll(dataset, distribution) {
|
|
125
|
+
const results = await Promise.all(this.executors.map((executor) => executor.execute(dataset, distribution)));
|
|
43
126
|
const streams = [];
|
|
44
|
-
for (const
|
|
45
|
-
const result = await executor.execute(dataset, distribution);
|
|
127
|
+
for (const result of results) {
|
|
46
128
|
if (!(result instanceof NotSupported)) {
|
|
47
129
|
streams.push(result);
|
|
48
130
|
}
|