@lde/pipeline 0.6.27 → 0.6.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -31
- package/dist/pipeline.d.ts +15 -7
- package/dist/pipeline.d.ts.map +1 -1
- package/dist/pipeline.js +55 -30
- package/dist/sparql/selector.d.ts +3 -4
- package/dist/sparql/selector.d.ts.map +1 -1
- package/dist/sparql/selector.js +4 -4
- package/dist/stage.d.ts +4 -5
- package/dist/stage.d.ts.map +1 -1
- package/dist/stage.js +4 -7
- package/package.json +1 -1
package/README.md
CHANGED
````diff
@@ -2,63 +2,109 @@

 Framework for building RDF data processing pipelines with SPARQL.

-## Features
-
-- **Pipeline** — orchestrates steps that process DCAT datasets
-- **PipelineBuilder** — fluent API for constructing pipelines from steps and selectors
-- **PipelineConfig** — load pipeline configuration from YAML/JSON files
-- **SparqlConstructExecutor** — streaming SPARQL CONSTRUCT with template substitution and variable bindings
-- **Distribution analysis** — probe and analyze dataset distributions
-
 ## Components

 A **Pipeline** consists of:

--
--
+- a **Dataset Selector** that selects which datasets to process
+- a **Distribution Resolver** that resolves each dataset to a usable SPARQL endpoint
 - one or more **Stages**, each consisting of:
-  - an optional **Selector** that
-  - one or more **Executors** that generate triples
+  - an optional **Item Selector** that selects resources (as variable bindings) for fan-out
+  - one or more **Executors** that generate triples

 ### Dataset Selector

-Selects datasets, either manually
+Selects datasets, either manually or by querying a DCAT Dataset Registry:

-
+```typescript
+// From a registry
+const selector = new RegistrySelector({
+  registry: new Client(new URL('https://example.com/sparql')),
+});

-
+// Manual
+const selector = new ManualDatasetSelection([dataset]);
+```

-
+### Item Selector

-
-If not, and a valid RDF datadump is available, that is imported to a local SPARQL server.
+Selects resources from the distribution and fans out executor calls per batch of results. Implements the `ItemSelector` interface:

-
+```typescript
+interface ItemSelector {
+  select(distribution: Distribution): AsyncIterable<VariableBindings>;
+}
+```

-
+The distribution is received at run time, so selectors don't need the endpoint URL at construction time. Use `SparqlItemSelector` for SPARQL-based selection with automatic pagination:

-
-
+```typescript
+new SparqlItemSelector({
+  query: 'SELECT DISTINCT ?class WHERE { ?s a ?class }',
+});
+```
+
+For dynamic queries that depend on the distribution, implement `ItemSelector` directly:
+
+```typescript
+const itemSelector: ItemSelector = {
+  select: (distribution) => {
+    const query = buildQuery(distribution);
+    return new SparqlItemSelector({ query }).select(distribution);
+  },
+};
+```

 ### Executor

+Generates RDF triples. `SparqlConstructExecutor` runs a SPARQL CONSTRUCT query with template substitution and variable bindings:
+
+```typescript
+const executor = new SparqlConstructExecutor({
+  query: 'CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }',
+});
+```
+
+### Writer
+
+Writes generated quads to a destination:
+
+- `SparqlUpdateWriter` — writes to a SPARQL endpoint via UPDATE queries
+- `FileWriter` — writes to local files
+
 ## Usage

 ```typescript
 import {
-
+  Pipeline,
+  Stage,
   SparqlConstructExecutor,
-
+  SparqlItemSelector,
+  SparqlUpdateWriter,
+  ManualDatasetSelection,
+  SparqlDistributionResolver,
 } from '@lde/pipeline';

-
-
-
-
-
-
+const pipeline = new Pipeline({
+  datasetSelector: new ManualDatasetSelection([dataset]),
+  stages: [
+    new Stage({
+      name: 'per-class',
+      itemSelector: new SparqlItemSelector({
+        query: 'SELECT DISTINCT ?class WHERE { ?s a ?class }',
+      }),
+      executors: new SparqlConstructExecutor({
+        query:
+          'CONSTRUCT { ?class a <http://example.org/Class> } WHERE { ?s a ?class }',
+      }),
+    }),
+  ],
+  writers: new SparqlUpdateWriter({
+    endpoint: new URL('http://localhost:7200/repositories/lde/statements'),
+  }),
 });
-
+
+await pipeline.run();
 ```

 ## Validation
````
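The new Writer section names only the two built-in destinations. A custom destination can be sketched as follows, assuming the `Writer` contract is `write(dataset, quads)` with an async iterable of quads, the shape the internal `FanOutWriter` in `dist/pipeline.js` below consumes; `CountingWriter` is illustrative and not part of the package:

```typescript
import type { Quad } from '@rdfjs/types';
import type { Dataset } from '@lde/dataset';

// Hypothetical writer that counts quads per dataset instead of persisting them.
// Assumes Writer.write(dataset, quads), matching how FanOutWriter calls writers.
class CountingWriter {
  async write(dataset: Dataset, quads: AsyncIterable<Quad>): Promise<void> {
    let count = 0;
    for await (const _quad of quads) {
      count += 1;
    }
    console.log(`${dataset.iri.toString()}: ${count} quads`);
  }
}
```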
package/dist/pipeline.d.ts
CHANGED
```diff
@@ -5,18 +5,26 @@ import { type DistributionResolver } from './distribution/resolver.js';
 import type { StageOutputResolver } from './stageOutputResolver.js';
 import type { ProgressReporter } from './progressReporter.js';
 export interface PipelineOptions {
-    name: string;
     datasetSelector: DatasetSelector;
     stages: Stage[];
-
-
-
-
-
+    writers: Writer | Writer[];
+    name?: string;
+    distributionResolver?: DistributionResolver;
+    chaining?: {
+        stageOutputResolver: StageOutputResolver;
+        outputDir: string;
+        outputFormat?: 'turtle' | 'n-triples' | 'n-quads';
+    };
     reporter?: ProgressReporter;
 }
 export declare class Pipeline {
-    private readonly
+    private readonly name;
+    private readonly datasetSelector;
+    private readonly stages;
+    private readonly writer;
+    private readonly distributionResolver;
+    private readonly chaining?;
+    private readonly reporter?;
     constructor(options: PipelineOptions);
     run(): Promise<void>;
     private processDataset;
```
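The reshaped `PipelineOptions` makes `writers` required (a single writer or an array) and moves the sub-stage settings under an optional `chaining` object; per `dist/pipeline.js` below, the constructor rejects configurations where a stage has sub-stages but `chaining` is missing. A sketch of a chained configuration, assuming `resolver` is some `StageOutputResolver` implementation and `parentStage` a `Stage` with sub-stages, neither of which is shown in this diff:

```typescript
const pipeline = new Pipeline({
  datasetSelector: new ManualDatasetSelection([dataset]),
  stages: [parentStage], // has sub-stages, so `chaining` is mandatory
  writers: new SparqlUpdateWriter({
    endpoint: new URL('http://localhost:7200/repositories/lde/statements'),
  }),
  chaining: {
    stageOutputResolver: resolver, // assumed StageOutputResolver instance
    outputDir: '/tmp/lde-output', // illustrative path
    outputFormat: 'n-quads',
  },
});
```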
package/dist/pipeline.d.ts.map
CHANGED
```diff
@@ -1 +1 @@
-{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AACrD,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO,EACL,KAAK,oBAAoB,EAE1B,MAAM,4BAA4B,CAAC;
+{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AACrD,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO,EACL,KAAK,oBAAoB,EAE1B,MAAM,4BAA4B,CAAC;AAGpC,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,0BAA0B,CAAC;AACpE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAE9D,MAAM,WAAW,eAAe;IAC9B,eAAe,EAAE,eAAe,CAAC;IACjC,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IAC3B,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,oBAAoB,CAAC,EAAE,oBAAoB,CAAC;IAC5C,QAAQ,CAAC,EAAE;QACT,mBAAmB,EAAE,mBAAmB,CAAC;QACzC,SAAS,EAAE,MAAM,CAAC;QAClB,YAAY,CAAC,EAAE,QAAQ,GAAG,WAAW,GAAG,SAAS,CAAC;KACnD,CAAC;IACF,QAAQ,CAAC,EAAE,gBAAgB,CAAC;CAC7B;AAmBD,qBAAa,QAAQ;IACnB,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAS;IAC9B,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAkB;IAClD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAU;IACjC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAS;IAChC,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAuB;IAC5D,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAA8B;IACxD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAmB;gBAEjC,OAAO,EAAE,eAAe;IAoB9B,GAAG,IAAI,OAAO,CAAC,IAAI,CAAC;YAaZ,cAAc;YA0Bd,QAAQ;YA8BR,QAAQ;YAmDR,eAAe;YAkCd,SAAS;CAUzB"}
```
package/dist/pipeline.js
CHANGED
```diff
@@ -2,40 +2,67 @@ import { createReadStream } from 'node:fs';
 import { StreamParser } from 'n3';
 import { FileWriter } from './writer/fileWriter.js';
 import { NoDistributionAvailable, } from './distribution/resolver.js';
+import { SparqlDistributionResolver } from './distribution/index.js';
 import { NotSupported } from './sparql/executor.js';
+class FanOutWriter {
+    writers;
+    constructor(writers) {
+        this.writers = writers;
+    }
+    async write(dataset, quads) {
+        const collected = [];
+        for await (const quad of quads)
+            collected.push(quad);
+        for (const w of this.writers) {
+            await w.write(dataset, (async function* () {
+                yield* collected;
+            })());
+        }
+    }
+}
 export class Pipeline {
-
+    name;
+    datasetSelector;
+    stages;
+    writer;
+    distributionResolver;
+    chaining;
+    reporter;
     constructor(options) {
         const hasSubStages = options.stages.some((stage) => stage.stages.length > 0);
-        if (hasSubStages && !options.
-            throw new Error('
-        }
-        if (hasSubStages && !options.outputDir) {
-            throw new Error('outputDir is required when any stage has sub-stages');
+        if (hasSubStages && !options.chaining) {
+            throw new Error('chaining is required when any stage has sub-stages');
         }
-        this.
+        this.name = options.name ?? '';
+        this.datasetSelector = options.datasetSelector;
+        this.stages = options.stages;
+        this.writer = Array.isArray(options.writers)
+            ? new FanOutWriter(options.writers)
+            : options.writers;
+        this.distributionResolver =
+            options.distributionResolver ?? new SparqlDistributionResolver();
+        this.chaining = options.chaining;
+        this.reporter = options.reporter;
     }
     async run() {
-        const { datasetSelector, reporter, name } = this.options;
         const start = Date.now();
-        reporter?.pipelineStart(name);
-        const datasets = await datasetSelector.select();
+        this.reporter?.pipelineStart(this.name);
+        const datasets = await this.datasetSelector.select();
         for await (const dataset of datasets) {
             await this.processDataset(dataset);
         }
-        reporter?.pipelineComplete({ duration: Date.now() - start });
+        this.reporter?.pipelineComplete({ duration: Date.now() - start });
     }
     async processDataset(dataset) {
-        const { distributionResolver, reporter } = this.options;
         const datasetIri = dataset.iri.toString();
-        reporter?.datasetStart(datasetIri);
-        const resolved = await distributionResolver.resolve(dataset);
+        this.reporter?.datasetStart(datasetIri);
+        const resolved = await this.distributionResolver.resolve(dataset);
         if (resolved instanceof NoDistributionAvailable) {
-            reporter?.datasetSkipped(datasetIri, resolved.message);
+            this.reporter?.datasetSkipped(datasetIri, resolved.message);
             return;
         }
         try {
-            for (const stage of this.
+            for (const stage of this.stages) {
                 if (stage.stages.length > 0) {
                     await this.runChain(dataset, resolved.distribution, stage);
                 }
@@ -47,26 +74,25 @@ export class Pipeline {
         catch {
             // Stage error for this dataset; continue to next dataset.
         }
-        reporter?.datasetComplete(datasetIri);
+        this.reporter?.datasetComplete(datasetIri);
     }
     async runStage(dataset, distribution, stage) {
-
-        reporter?.stageStart(stage.name);
+        this.reporter?.stageStart(stage.name);
         const stageStart = Date.now();
         let elementsProcessed = 0;
         let quadsGenerated = 0;
-        const result = await stage.run(dataset, distribution, writer, {
+        const result = await stage.run(dataset, distribution, this.writer, {
            onProgress: (elements, quads) => {
                elementsProcessed = elements;
                quadsGenerated = quads;
-                reporter?.stageProgress({ elementsProcessed, quadsGenerated });
+                this.reporter?.stageProgress({ elementsProcessed, quadsGenerated });
            },
        });
        if (result instanceof NotSupported) {
-            reporter?.stageSkipped(stage.name, result.message);
+            this.reporter?.stageSkipped(stage.name, result.message);
        }
        else {
-            reporter?.stageComplete(stage.name, {
+            this.reporter?.stageComplete(stage.name, {
                elementsProcessed,
                quadsGenerated,
                duration: Date.now() - stageStart,
@@ -74,7 +100,7 @@ export class Pipeline {
         }
     }
     async runChain(dataset, distribution, stage) {
-        const {
+        const { stageOutputResolver, outputDir, outputFormat } = this.chaining;
         const outputFiles = [];
         try {
             // 1. Run parent stage → FileWriter.
@@ -99,15 +125,14 @@ export class Pipeline {
             }
         }
         // 3. Concatenate all output files → user writer.
-        await writer.write(dataset, this.readFiles(outputFiles));
+        await this.writer.write(dataset, this.readFiles(outputFiles));
         }
         finally {
             await stageOutputResolver.cleanup();
         }
     }
     async runChainedStage(dataset, distribution, stage, stageWriter) {
-
-        reporter?.stageStart(stage.name);
+        this.reporter?.stageStart(stage.name);
         const stageStart = Date.now();
         let elementsProcessed = 0;
         let quadsGenerated = 0;
@@ -115,14 +140,14 @@ export class Pipeline {
            onProgress: (elements, quads) => {
                elementsProcessed = elements;
                quadsGenerated = quads;
-                reporter?.stageProgress({ elementsProcessed, quadsGenerated });
+                this.reporter?.stageProgress({ elementsProcessed, quadsGenerated });
            },
        });
        if (result instanceof NotSupported) {
-            reporter?.stageSkipped(stage.name, result.message);
+            this.reporter?.stageSkipped(stage.name, result.message);
            throw new Error(`Stage '${stage.name}' returned NotSupported in chained mode`);
        }
-        reporter?.stageComplete(stage.name, {
+        this.reporter?.stageComplete(stage.name, {
            elementsProcessed,
            quadsGenerated,
            duration: Date.now() - stageStart,
```
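As `FanOutWriter` above shows, an array of `writers` is handled by buffering every quad in memory and then replaying the full set to each writer in turn; an async iterable can only be consumed once, so fanning it out requires materializing it first. A sketch of the resulting call shape, with inline stand-in writers that are illustrative only:

```typescript
// Each stand-in writer receives the complete, buffered quad stream per dataset.
const persist = {
  async write(dataset, quads) {
    for await (const quad of quads) {
      // store the quad somewhere
    }
  },
};
const mirror = {
  async write(dataset, quads) {
    for await (const quad of quads) {
      // send the same quad to a second destination
    }
  },
};

const pipeline = new Pipeline({
  datasetSelector: new ManualDatasetSelection([dataset]),
  stages: [stage],
  writers: [persist, mirror], // wrapped in a FanOutWriter internally
});
```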
package/dist/sparql/selector.d.ts
CHANGED
```diff
@@ -1,11 +1,10 @@
+import type { Distribution } from '@lde/dataset';
 import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
 import type { ItemSelector } from '../stage.js';
 import type { VariableBindings } from './executor.js';
 export interface SparqlItemSelectorOptions {
     /** SELECT query projecting at least one named variable. A LIMIT in the query sets the default page size. */
     query: string;
-    /** SPARQL endpoint URL. */
-    endpoint: URL;
     /** Results per page. Overrides any LIMIT in the query. @default 10 */
     pageSize?: number;
     /** Custom fetcher instance. */
@@ -15,6 +14,7 @@ export interface SparqlItemSelectorOptions {
  * {@link ItemSelector} that pages through SPARQL SELECT results,
  * yielding all projected variable bindings (NamedNode values only) per row.
  *
+ * The endpoint URL comes from the {@link Distribution} passed to {@link select}.
  * Pagination is an internal detail — consumers iterate binding rows directly.
  * If the query contains a LIMIT, it is used as the default page size
  * (can be overridden by the `pageSize` option). Pagination continues
@@ -22,10 +22,9 @@ export interface SparqlItemSelectorOptions {
  */
 export declare class SparqlItemSelector implements ItemSelector {
     private readonly parsed;
-    private readonly endpoint;
     private readonly pageSize;
     private readonly fetcher;
     constructor(options: SparqlItemSelectorOptions);
-
+    select(distribution: Distribution): AsyncIterableIterator<VariableBindings>;
 }
 //# sourceMappingURL=selector.d.ts.map
```
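The `endpoint` option is gone from `SparqlItemSelectorOptions`: the endpoint now arrives with the `Distribution` passed to `select()`. A before-and-after sketch, where the 0.6.27 call shape is inferred from the removed option and `distribution` is assumed to come from the pipeline's distribution resolver:

```typescript
// 0.6.27: endpoint fixed at construction time
const before = new SparqlItemSelector({
  query: 'SELECT DISTINCT ?class WHERE { ?s a ?class }',
  endpoint: new URL('https://example.com/sparql'),
});

// 0.6.29: endpoint supplied per call through the distribution
const after = new SparqlItemSelector({
  query: 'SELECT DISTINCT ?class WHERE { ?s a ?class }',
});
for await (const bindings of after.select(distribution)) {
  // one VariableBindings row per result
}
```

The same selector instance can now be reused across datasets with different endpoints.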
package/dist/sparql/selector.d.ts.map
CHANGED
```diff
@@ -1 +1 @@
-{"version":3,"file":"selector.d.ts","sourceRoot":"","sources":["../../src/sparql/selector.ts"],"names":[],"mappings":"
+{"version":3,"file":"selector.d.ts","sourceRoot":"","sources":["../../src/sparql/selector.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAEjD,OAAO,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAC;AAQ9D,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAChD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,eAAe,CAAC;AAKtD,MAAM,WAAW,yBAAyB;IACxC,4GAA4G;IAC5G,KAAK,EAAE,MAAM,CAAC;IACd,sEAAsE;IACtE,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,+BAA+B;IAC/B,OAAO,CAAC,EAAE,qBAAqB,CAAC;CACjC;AAED;;;;;;;;;GASG;AACH,qBAAa,kBAAmB,YAAW,YAAY;IACrD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAc;IACrC,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAClC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAwB;gBAEpC,OAAO,EAAE,yBAAyB;IAkBvC,MAAM,CACX,YAAY,EAAE,YAAY,GACzB,qBAAqB,CAAC,gBAAgB,CAAC;CAmC3C"}
```
package/dist/sparql/selector.js
CHANGED
```diff
@@ -6,6 +6,7 @@ const generator = new Generator();
  * {@link ItemSelector} that pages through SPARQL SELECT results,
  * yielding all projected variable bindings (NamedNode values only) per row.
  *
+ * The endpoint URL comes from the {@link Distribution} passed to {@link select}.
  * Pagination is an internal detail — consumers iterate binding rows directly.
  * If the query contains a LIMIT, it is used as the default page size
  * (can be overridden by the `pageSize` option). Pagination continues
@@ -13,7 +14,6 @@ const generator = new Generator();
  */
 export class SparqlItemSelector {
     parsed;
-    endpoint;
     pageSize;
     fetcher;
     constructor(options) {
@@ -26,17 +26,17 @@ export class SparqlItemSelector {
             throw new Error('Query must project at least one named variable (SELECT * is not supported)');
         }
         this.parsed = parsed;
-        this.endpoint = options.endpoint;
         this.pageSize = options.pageSize ?? parsed.limit ?? 10;
         this.fetcher = options.fetcher ?? new SparqlEndpointFetcher();
     }
-    async *
+    async *select(distribution) {
+        const endpoint = distribution.accessUrl;
         let offset = 0;
         while (true) {
             this.parsed.limit = this.pageSize;
             this.parsed.offset = offset;
             const paginatedQuery = generator.stringify(this.parsed);
-            const stream = (await this.fetcher.fetchBindings(
+            const stream = (await this.fetcher.fetchBindings(endpoint.toString(), paginatedQuery));
             let pageSize = 0;
             for await (const record of stream) {
                 const row = Object.fromEntries(Object.entries(record).filter(([, term]) => term.termType === 'NamedNode'));
```
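Per the constructor, the page size resolves as `options.pageSize ?? parsed.limit ?? 10`, and `select()` now reads the endpoint from `distribution.accessUrl`. A sketch of the three page-size cases:

```typescript
// A LIMIT in the query doubles as the page size: pages of 100.
new SparqlItemSelector({ query: 'SELECT ?s WHERE { ?s ?p ?o } LIMIT 100' });

// An explicit pageSize overrides the LIMIT: pages of 25.
new SparqlItemSelector({
  query: 'SELECT ?s WHERE { ?s ?p ?o } LIMIT 100',
  pageSize: 25,
});

// Neither given: the default of 10 applies.
new SparqlItemSelector({ query: 'SELECT ?s WHERE { ?s ?p ?o }' });
```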
package/dist/stage.d.ts
CHANGED
```diff
@@ -2,12 +2,10 @@ import { Dataset, Distribution } from '@lde/dataset';
 import type { Executor, VariableBindings } from './sparql/executor.js';
 import { NotSupported } from './sparql/executor.js';
 import type { Writer } from './writer/writer.js';
-/** An item selector, or a factory that receives the runtime distribution. */
-export type ItemSelectorInput = ItemSelector | ((distribution: Distribution) => ItemSelector);
 export interface StageOptions {
     name: string;
     executors: Executor | Executor[];
-    itemSelector?:
+    itemSelector?: ItemSelector;
     /** Maximum number of bindings per executor call. @default 10 */
     batchSize?: number;
     /** Maximum concurrent in-flight executor batches. @default 10 */
@@ -22,7 +20,7 @@ export declare class Stage {
     readonly name: string;
     readonly stages: readonly Stage[];
     private readonly executors;
-    private readonly
+    private readonly itemSelector?;
     private readonly batchSize;
     private readonly maxConcurrency;
     constructor(options: StageOptions);
@@ -31,6 +29,7 @@ export declare class Stage {
     private executeAll;
 }
 /** Selects items (as variable bindings) for executors to process. Pagination is an implementation detail. */
-export interface ItemSelector
+export interface ItemSelector {
+    select(distribution: Distribution): AsyncIterable<VariableBindings>;
 }
 //# sourceMappingURL=stage.d.ts.map
```
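With the `ItemSelectorInput` factory type removed, `StageOptions.itemSelector` takes a plain `ItemSelector`, and anything distribution-dependent now lives inside `select()` itself. A minimal in-memory implementation, assuming a `VariableBindings` row is a record of variable name to RDF term, which matches how `SparqlItemSelector` builds its rows; the IRIs and the use of n3's `DataFactory` are illustrative:

```typescript
import { DataFactory } from 'n3';

// Hypothetical fixed selector: yields two binding rows, ignoring the distribution.
const fixedSelector: ItemSelector = {
  async *select(_distribution) {
    yield { class: DataFactory.namedNode('http://example.org/Person') };
    yield { class: DataFactory.namedNode('http://example.org/Place') };
  },
};
```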
package/dist/stage.d.ts.map
CHANGED
```diff
@@ -1 +1 @@
-{"version":3,"file":"stage.d.ts","sourceRoot":"","sources":["../src/stage.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAErD,OAAO,KAAK,EAAE,QAAQ,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AAEpD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAGjD,
+{"version":3,"file":"stage.d.ts","sourceRoot":"","sources":["../src/stage.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAErD,OAAO,KAAK,EAAE,QAAQ,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AAEpD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAGjD,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,QAAQ,GAAG,QAAQ,EAAE,CAAC;IACjC,YAAY,CAAC,EAAE,YAAY,CAAC;IAC5B,gEAAgE;IAChE,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,iEAAiE;IACjE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,uDAAuD;IACvD,MAAM,CAAC,EAAE,KAAK,EAAE,CAAC;CAClB;AAED,MAAM,WAAW,UAAU;IACzB,UAAU,CAAC,EAAE,CAAC,iBAAiB,EAAE,MAAM,EAAE,cAAc,EAAE,MAAM,KAAK,IAAI,CAAC;CAC1E;AAED,qBAAa,KAAK;IAChB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,MAAM,EAAE,SAAS,KAAK,EAAE,CAAC;IAClC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAa;IACvC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAe;IAC7C,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAS;gBAE5B,OAAO,EAAE,YAAY;IAW3B,GAAG,CACP,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,YAAY,EAC1B,MAAM,EAAE,MAAM,EACd,OAAO,CAAC,EAAE,UAAU,GACnB,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC;YAmBjB,eAAe;YA+Gf,UAAU;CAqBzB;AAUD,6GAA6G;AAC7G,MAAM,WAAW,YAAY;IAC3B,MAAM,CAAC,YAAY,EAAE,YAAY,GAAG,aAAa,CAAC,gBAAgB,CAAC,CAAC;CACrE"}
```
package/dist/stage.js
CHANGED
```diff
@@ -5,7 +5,7 @@ export class Stage {
     name;
     stages;
     executors;
-
+    itemSelector;
     batchSize;
     maxConcurrency;
     constructor(options) {
@@ -14,16 +14,13 @@ export class Stage {
         this.executors = Array.isArray(options.executors)
             ? options.executors
             : [options.executors];
-        this.
+        this.itemSelector = options.itemSelector;
         this.batchSize = options.batchSize ?? 10;
         this.maxConcurrency = options.maxConcurrency ?? 10;
     }
     async run(dataset, distribution, writer, options) {
-        if (this.
-
-            ? this.itemSelectorInput(distribution)
-            : this.itemSelectorInput;
-        return this.runWithSelector(selector, dataset, distribution, writer, options);
+        if (this.itemSelector) {
+            return this.runWithSelector(this.itemSelector.select(distribution), dataset, distribution, writer, options);
         }
         const streams = await this.executeAll(dataset, distribution);
         if (streams instanceof NotSupported) {
```
|