@lde/pipeline 0.28.0 → 0.28.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +47 -4
- package/dist/sparql/executor.d.ts +44 -0
- package/dist/sparql/executor.d.ts.map +1 -1
- package/dist/sparql/executor.js +57 -1
- package/dist/sparql/index.d.ts +1 -1
- package/dist/sparql/index.d.ts.map +1 -1
- package/dist/sparql/index.js +1 -1
- package/dist/sparql/selector.d.ts +13 -8
- package/dist/sparql/selector.d.ts.map +1 -1
- package/dist/sparql/selector.js +14 -14
- package/dist/stage.d.ts +10 -2
- package/dist/stage.d.ts.map +1 -1
- package/dist/stage.js +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -30,17 +30,51 @@ const selector = new RegistrySelector({
|
|
|
30
30
|
const selector = new ManualDatasetSelection([dataset]);
|
|
31
31
|
```
|
|
32
32
|
|
|
33
|
+
### Stage
|
|
34
|
+
|
|
35
|
+
A stage groups an item selector, one or more executors, and configuration:
|
|
36
|
+
|
|
37
|
+
```typescript
|
|
38
|
+
new Stage({
|
|
39
|
+
name: 'per-class',
|
|
40
|
+
itemSelector: new SparqlItemSelector({
|
|
41
|
+
query: 'SELECT DISTINCT ?class WHERE { ?s a ?class }',
|
|
42
|
+
}),
|
|
43
|
+
executors: executor,
|
|
44
|
+
batchSize: 100,
|
|
45
|
+
maxConcurrency: 5,
|
|
46
|
+
});
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
#### Batch size
|
|
50
|
+
|
|
51
|
+
`batchSize` (default: 10) controls how many variable bindings are passed to each executor call as a `VALUES` clause. It also sets the page size for the item selector's SPARQL requests, so that each paginated request fills exactly one executor batch.
|
|
52
|
+
|
|
53
|
+
A `LIMIT` clause in the selector query overrides `batchSize` as the page size — use this when the SPARQL endpoint enforces a hard result limit:
|
|
54
|
+
|
|
55
|
+
```typescript
|
|
56
|
+
// Endpoint caps results at 1000, but process in batches of 100.
|
|
57
|
+
new Stage({
|
|
58
|
+
name: 'per-class',
|
|
59
|
+
itemSelector: new SparqlItemSelector({
|
|
60
|
+
query: 'SELECT DISTINCT ?class WHERE { ?s a ?class } LIMIT 1000',
|
|
61
|
+
}),
|
|
62
|
+
executors: executor,
|
|
63
|
+
batchSize: 100,
|
|
64
|
+
});
|
|
65
|
+
```
|
|
66
|
+
|
|
33
67
|
### Item Selector
|
|
34
68
|
|
|
35
69
|
Selects resources from the distribution and fans out executor calls per batch of results. Implements the `ItemSelector` interface:
|
|
36
70
|
|
|
37
71
|
```typescript
|
|
38
72
|
interface ItemSelector {
|
|
39
|
-
select(distribution: Distribution): AsyncIterable<VariableBindings>;
|
|
73
|
+
select(distribution: Distribution, batchSize?: number): AsyncIterable<VariableBindings>;
|
|
40
74
|
}
|
|
41
75
|
```
|
|
42
76
|
|
|
43
|
-
The distribution is received at run time, so selectors don't need the endpoint URL at construction time. Use `SparqlItemSelector` for SPARQL-based selection with automatic pagination:
|
|
77
|
+
The distribution is received at run time, so selectors don't need the endpoint URL at construction time. The `batchSize` parameter is set by the stage. Use `SparqlItemSelector` for SPARQL-based selection with automatic pagination:
|
|
44
78
|
|
|
45
79
|
```typescript
|
|
46
80
|
new SparqlItemSelector({
|
|
@@ -52,9 +86,9 @@ For dynamic queries that depend on the distribution, implement `ItemSelector` di
|
|
|
52
86
|
|
|
53
87
|
```typescript
|
|
54
88
|
const itemSelector: ItemSelector = {
|
|
55
|
-
select: (distribution) => {
|
|
89
|
+
select: (distribution, batchSize) => {
|
|
56
90
|
const query = buildQuery(distribution);
|
|
57
|
-
return new SparqlItemSelector({ query }).select(distribution);
|
|
91
|
+
return new SparqlItemSelector({ query }).select(distribution, batchSize);
|
|
58
92
|
},
|
|
59
93
|
};
|
|
60
94
|
```
|
|
@@ -69,6 +103,15 @@ const executor = new SparqlConstructExecutor({
|
|
|
69
103
|
});
|
|
70
104
|
```
|
|
71
105
|
|
|
106
|
+
When querying endpoints that return line-oriented formats like N-Triples (e.g. QLever), enable `lineBuffer` to work around an [N3.js chunk-splitting bug](https://github.com/rdfjs/N3.js/issues/578) that causes intermittent parse errors on large responses:
|
|
107
|
+
|
|
108
|
+
```typescript
|
|
109
|
+
const executor = new SparqlConstructExecutor({
|
|
110
|
+
query: 'CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }',
|
|
111
|
+
lineBuffer: true,
|
|
112
|
+
});
|
|
113
|
+
```
|
|
114
|
+
|
|
72
115
|
`Executor` is an interface, so you can implement your own for logic that's hard to express in pure SPARQL — for example, cleaning up messy date notations or converting locale-specific dates to ISO 8601. The decorator pattern lets you wrap a SPARQL executor and post-process its quad stream in TypeScript:
|
|
73
116
|
|
|
74
117
|
```typescript
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { Dataset, Distribution } from '@lde/dataset';
|
|
2
2
|
import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
|
|
3
3
|
import type { NamedNode, Quad } from '@rdfjs/types';
|
|
4
|
+
import { Transform } from 'node:stream';
|
|
4
5
|
/**
|
|
5
6
|
* An executor could not run because the dataset lacks a supported distribution.
|
|
6
7
|
*/
|
|
@@ -42,6 +43,17 @@ export interface SparqlConstructExecutorOptions {
|
|
|
42
43
|
* Optional custom SparqlEndpointFetcher instance.
|
|
43
44
|
*/
|
|
44
45
|
fetcher?: SparqlEndpointFetcher;
|
|
46
|
+
/**
|
|
47
|
+
* Buffer complete lines before passing them to the N3 parser.
|
|
48
|
+
*
|
|
49
|
+
* Works around an [N3.js bug](https://github.com/rdfjs/N3.js/issues/578)
|
|
50
|
+
* where language tags (e.g. `@nl-nl`) split across HTTP chunk boundaries
|
|
51
|
+
* cause parse errors. Enable this when querying endpoints that return
|
|
52
|
+
* line-oriented formats such as N-Triples (e.g. QLever).
|
|
53
|
+
*
|
|
54
|
+
* @default false
|
|
55
|
+
*/
|
|
56
|
+
lineBuffer?: boolean;
|
|
45
57
|
}
|
|
46
58
|
/**
|
|
47
59
|
* A streaming SPARQL CONSTRUCT executor.
|
|
@@ -75,6 +87,7 @@ export declare class SparqlConstructExecutor implements Executor {
|
|
|
75
87
|
private readonly preParsed?;
|
|
76
88
|
private readonly fetcher;
|
|
77
89
|
private readonly retries;
|
|
90
|
+
private readonly lineBuffer;
|
|
78
91
|
private readonly generator;
|
|
79
92
|
constructor(options: SparqlConstructExecutorOptions);
|
|
80
93
|
/**
|
|
@@ -86,6 +99,12 @@ export declare class SparqlConstructExecutor implements Executor {
|
|
|
86
99
|
* @returns AsyncIterable<Quad> stream of results.
|
|
87
100
|
*/
|
|
88
101
|
execute(dataset: Dataset, distribution: Distribution, options?: ExecuteOptions): Promise<AsyncIterable<Quad>>;
|
|
102
|
+
/**
|
|
103
|
+
* Fetch quads from the endpoint, optionally line-buffering the response
|
|
104
|
+
* stream before it reaches the N3 parser to work around
|
|
105
|
+
* {@link https://github.com/rdfjs/N3.js/issues/578 | N3.js#578}.
|
|
106
|
+
*/
|
|
107
|
+
private fetchQuads;
|
|
89
108
|
/**
|
|
90
109
|
* Create an executor from a query file.
|
|
91
110
|
*
|
|
@@ -98,4 +117,29 @@ export declare class SparqlConstructExecutor implements Executor {
|
|
|
98
117
|
* Read a SPARQL query from a file.
|
|
99
118
|
*/
|
|
100
119
|
export declare function readQueryFile(filename: string): Promise<string>;
|
|
120
|
+
/**
|
|
121
|
+
* Buffers incoming data until complete lines (`\n`-terminated) are available,
|
|
122
|
+
* then pushes them downstream as a single chunk.
|
|
123
|
+
*
|
|
124
|
+
* **Why this exists:** `fetch-sparql-endpoint` pipes the raw HTTP response
|
|
125
|
+
* stream directly into N3.js's `StreamParser`. N3.js has a bug
|
|
126
|
+
* ({@link https://github.com/rdfjs/N3.js/issues/578 | N3.js#578}) where
|
|
127
|
+
* tokens that straddle chunk boundaries — most commonly language tags like
|
|
128
|
+
* `@nl-nl` — cause spurious `Unexpected "-nl"` parse errors. The error is
|
|
129
|
+
* non-deterministic and typically surfaces only on responses larger than
|
|
130
|
+
* ~12 MB, because that is when HTTP chunking starts splitting mid-token.
|
|
131
|
+
*
|
|
132
|
+
* By ensuring each chunk passed to the parser ends on a line boundary, we
|
|
133
|
+
* prevent any N-Triples token from being split. Memory overhead is minimal:
|
|
134
|
+
* at most one partial line is buffered at a time.
|
|
135
|
+
*
|
|
136
|
+
* This transform can be removed once N3.js#578 is fixed upstream.
|
|
137
|
+
*
|
|
138
|
+
* @see https://github.com/rdfjs/N3.js/issues/578
|
|
139
|
+
*/
|
|
140
|
+
export declare class LineBufferTransform extends Transform {
|
|
141
|
+
private remainder;
|
|
142
|
+
_transform(chunk: Buffer, _encoding: string, callback: () => void): void;
|
|
143
|
+
_flush(callback: () => void): void;
|
|
144
|
+
}
|
|
101
145
|
//# sourceMappingURL=executor.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"executor.d.ts","sourceRoot":"","sources":["../../src/sparql/executor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAiB,MAAM,cAAc,CAAC;AACpE,OAAO,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAC;AAC9D,OAAO,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;
|
|
1
|
+
{"version":3,"file":"executor.d.ts","sourceRoot":"","sources":["../../src/sparql/executor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAiB,MAAM,cAAc,CAAC;AACpE,OAAO,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAC;AAC9D,OAAO,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AAGpD,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAUxC;;GAEG;AACH,qBAAa,YAAY;aACK,OAAO,EAAE,MAAM;gBAAf,OAAO,EAAE,MAAM;CAC5C;AAED,qEAAqE;AACrE,MAAM,MAAM,gBAAgB,GAAG,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;AAEzD,MAAM,WAAW,cAAc;IAC7B;;;OAGG;IACH,QAAQ,CAAC,EAAE,gBAAgB,EAAE,CAAC;CAC/B;AAED,MAAM,WAAW,QAAQ;IACvB,OAAO,CACL,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,YAAY,EAC1B,OAAO,CAAC,EAAE,cAAc,GACvB,OAAO,CAAC,aAAa,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,CAAC;CAChD;AAED;;GAEG;AACH,MAAM,WAAW,8BAA8B;IAC7C;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IAEd;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;OAEG;IACH,OAAO,CAAC,EAAE,qBAAqB,CAAC;IAEhC;;;;;;;;;OASG;IACH,UAAU,CAAC,EAAE,OAAO,CAAC;CACtB;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AACH,qBAAa,uBAAwB,YAAW,QAAQ;IACtD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAClC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAiB;IAC5C,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAwB;IAChD,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAU;IACrC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAmB;gBAEjC,OAAO,EAAE,8BAA8B;IAoBnD;;;;;;;OAOG;IACG,OAAO,CACX,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,YAAY,EAC1B,OAAO,CAAC,EAAE,cAAc,GACvB,OAAO,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;IAwC/B;;;;OAIG;YACW,UAAU;IAmBxB;;;;;OAKG;WACiB,QAAQ,CAC1B,QAAQ,EAAE,MAAM,EAChB,OAAO,CAAC,EAAE,IAAI,CAAC,8BAA8B,EAAE,OAAO,CAAC,GACtD,OAAO,CAAC,uBAAuB,CAAC;CAIpC;AAED;;GAEG;AACH,wBAAsB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAErE;AAED;;;;;;;;;;;;;;;;;;;GAmBG;AACH,qBAAa,mBAAoB,SAAQ,SAAS;IAChD,OAAO,CAAC,SAAS,CAAM;IAEd,UAAU,CACjB,KAAK,EAAE,MAAM,EACb,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,IAAI;IAWb,MAAM,CAAC,QAAQ,EAAE,MAAM,IAAI;CAMrC"}
|
package/dist/sparql/executor.js
CHANGED
|
@@ -2,6 +2,8 @@ import { assertSafeIri } from '@lde/dataset';
|
|
|
2
2
|
import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
|
|
3
3
|
import { readFile } from 'node:fs/promises';
|
|
4
4
|
import { resolve } from 'node:path';
|
|
5
|
+
import { Transform } from 'node:stream';
|
|
6
|
+
import { StreamParser } from 'n3';
|
|
5
7
|
import { Parser } from '@traqula/parser-sparql-1-1';
|
|
6
8
|
import { Generator } from '@traqula/generator-sparql-1-1';
|
|
7
9
|
import isNetworkError from 'is-network-error';
|
|
@@ -49,10 +51,12 @@ export class SparqlConstructExecutor {
|
|
|
49
51
|
preParsed;
|
|
50
52
|
fetcher;
|
|
51
53
|
retries;
|
|
54
|
+
lineBuffer;
|
|
52
55
|
generator = new Generator();
|
|
53
56
|
constructor(options) {
|
|
54
57
|
this.rawQuery = options.query;
|
|
55
58
|
this.retries = options.retries ?? 3;
|
|
59
|
+
this.lineBuffer = options.lineBuffer ?? false;
|
|
56
60
|
if (!options.query.includes('#subjectFilter#')) {
|
|
57
61
|
const parsed = new Parser().parse(options.query);
|
|
58
62
|
if (parsed.type !== 'query' || parsed.subType !== 'construct') {
|
|
@@ -98,11 +102,25 @@ export class SparqlConstructExecutor {
|
|
|
98
102
|
let query = this.generator.generate(ast);
|
|
99
103
|
assertSafeIri(dataset.iri.toString());
|
|
100
104
|
query = query.replaceAll('?dataset', `<${dataset.iri}>`);
|
|
101
|
-
return await pRetry(() => this.
|
|
105
|
+
return await pRetry(() => this.fetchQuads(endpoint.toString(), query), {
|
|
102
106
|
retries: this.retries,
|
|
103
107
|
shouldRetry: ({ error }) => isTransientError(error),
|
|
104
108
|
});
|
|
105
109
|
}
|
|
110
|
+
/**
|
|
111
|
+
* Fetch quads from the endpoint, optionally line-buffering the response
|
|
112
|
+
* stream before it reaches the N3 parser to work around
|
|
113
|
+
* {@link https://github.com/rdfjs/N3.js/issues/578 | N3.js#578}.
|
|
114
|
+
*/
|
|
115
|
+
async fetchQuads(endpoint, query) {
|
|
116
|
+
if (!this.lineBuffer) {
|
|
117
|
+
return this.fetcher.fetchTriples(endpoint, query);
|
|
118
|
+
}
|
|
119
|
+
const [contentType, , responseStream] = await this.fetcher.fetchRawStream(endpoint, query, SparqlEndpointFetcher.CONTENTTYPE_TURTLE);
|
|
120
|
+
return responseStream
|
|
121
|
+
.pipe(new LineBufferTransform())
|
|
122
|
+
.pipe(new StreamParser({ format: contentType }));
|
|
123
|
+
}
|
|
106
124
|
/**
|
|
107
125
|
* Create an executor from a query file.
|
|
108
126
|
*
|
|
@@ -120,6 +138,44 @@ export class SparqlConstructExecutor {
|
|
|
120
138
|
export async function readQueryFile(filename) {
|
|
121
139
|
return (await readFile(resolve(filename))).toString();
|
|
122
140
|
}
|
|
141
|
+
/**
|
|
142
|
+
* Buffers incoming data until complete lines (`\n`-terminated) are available,
|
|
143
|
+
* then pushes them downstream as a single chunk.
|
|
144
|
+
*
|
|
145
|
+
* **Why this exists:** `fetch-sparql-endpoint` pipes the raw HTTP response
|
|
146
|
+
* stream directly into N3.js's `StreamParser`. N3.js has a bug
|
|
147
|
+
* ({@link https://github.com/rdfjs/N3.js/issues/578 | N3.js#578}) where
|
|
148
|
+
* tokens that straddle chunk boundaries — most commonly language tags like
|
|
149
|
+
* `@nl-nl` — cause spurious `Unexpected "-nl"` parse errors. The error is
|
|
150
|
+
* non-deterministic and typically surfaces only on responses larger than
|
|
151
|
+
* ~12 MB, because that is when HTTP chunking starts splitting mid-token.
|
|
152
|
+
*
|
|
153
|
+
* By ensuring each chunk passed to the parser ends on a line boundary, we
|
|
154
|
+
* prevent any N-Triples token from being split. Memory overhead is minimal:
|
|
155
|
+
* at most one partial line is buffered at a time.
|
|
156
|
+
*
|
|
157
|
+
* This transform can be removed once N3.js#578 is fixed upstream.
|
|
158
|
+
*
|
|
159
|
+
* @see https://github.com/rdfjs/N3.js/issues/578
|
|
160
|
+
*/
|
|
161
|
+
export class LineBufferTransform extends Transform {
|
|
162
|
+
remainder = '';
|
|
163
|
+
_transform(chunk, _encoding, callback) {
|
|
164
|
+
const data = this.remainder + chunk.toString();
|
|
165
|
+
const lines = data.split('\n');
|
|
166
|
+
this.remainder = lines.pop() ?? '';
|
|
167
|
+
if (lines.length > 0) {
|
|
168
|
+
this.push(lines.join('\n') + '\n');
|
|
169
|
+
}
|
|
170
|
+
callback();
|
|
171
|
+
}
|
|
172
|
+
_flush(callback) {
|
|
173
|
+
if (this.remainder.length > 0) {
|
|
174
|
+
this.push(this.remainder);
|
|
175
|
+
}
|
|
176
|
+
callback();
|
|
177
|
+
}
|
|
178
|
+
}
|
|
123
179
|
const transientStatusPattern = /HTTP status (\d+)/;
|
|
124
180
|
function isTransientError(error) {
|
|
125
181
|
if (isNetworkError(error))
|
package/dist/sparql/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export { SparqlConstructExecutor, NotSupported, readQueryFile, type ExecuteOptions, type Executor, type SparqlConstructExecutorOptions, type VariableBindings, } from './executor.js';
|
|
1
|
+
export { SparqlConstructExecutor, LineBufferTransform, NotSupported, readQueryFile, type ExecuteOptions, type Executor, type SparqlConstructExecutorOptions, type VariableBindings, } from './executor.js';
|
|
2
2
|
export { SparqlItemSelector, type SparqlItemSelectorOptions, } from './selector.js';
|
|
3
3
|
export { injectValues } from './values.js';
|
|
4
4
|
export { withDefaultGraph } from './graph.js';
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/sparql/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,uBAAuB,EACvB,YAAY,EACZ,aAAa,EACb,KAAK,cAAc,EACnB,KAAK,QAAQ,EACb,KAAK,8BAA8B,EACnC,KAAK,gBAAgB,GACtB,MAAM,eAAe,CAAC;AACvB,OAAO,EACL,kBAAkB,EAClB,KAAK,yBAAyB,GAC/B,MAAM,eAAe,CAAC;AAEvB,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3C,OAAO,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/sparql/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,uBAAuB,EACvB,mBAAmB,EACnB,YAAY,EACZ,aAAa,EACb,KAAK,cAAc,EACnB,KAAK,QAAQ,EACb,KAAK,8BAA8B,EACnC,KAAK,gBAAgB,GACtB,MAAM,eAAe,CAAC;AACvB,OAAO,EACL,kBAAkB,EAClB,KAAK,yBAAyB,GAC/B,MAAM,eAAe,CAAC;AAEvB,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3C,OAAO,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC"}
|
package/dist/sparql/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export { SparqlConstructExecutor, NotSupported, readQueryFile, } from './executor.js';
|
|
1
|
+
export { SparqlConstructExecutor, LineBufferTransform, NotSupported, readQueryFile, } from './executor.js';
|
|
2
2
|
export { SparqlItemSelector, } from './selector.js';
|
|
3
3
|
export { injectValues } from './values.js';
|
|
4
4
|
export { withDefaultGraph } from './graph.js';
|
|
@@ -3,10 +3,13 @@ import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
|
|
|
3
3
|
import type { ItemSelector } from '../stage.js';
|
|
4
4
|
import type { VariableBindings } from './executor.js';
|
|
5
5
|
export interface SparqlItemSelectorOptions {
|
|
6
|
-
/**
|
|
6
|
+
/**
|
|
7
|
+
* SELECT query projecting at least one named variable.
|
|
8
|
+
*
|
|
9
|
+
* A `LIMIT` clause in the query overrides the stage's `batchSize` as the
|
|
10
|
+
* page size — use this when the SPARQL endpoint enforces a result limit.
|
|
11
|
+
*/
|
|
7
12
|
query: string;
|
|
8
|
-
/** Results per page. Overrides any LIMIT in the query. @default 10 */
|
|
9
|
-
pageSize?: number;
|
|
10
13
|
/** Custom fetcher instance. */
|
|
11
14
|
fetcher?: SparqlEndpointFetcher;
|
|
12
15
|
}
|
|
@@ -16,15 +19,17 @@ export interface SparqlItemSelectorOptions {
|
|
|
16
19
|
*
|
|
17
20
|
* The endpoint URL comes from the {@link Distribution} passed to {@link select}.
|
|
18
21
|
* Pagination is an internal detail — consumers iterate binding rows directly.
|
|
19
|
-
*
|
|
20
|
-
*
|
|
21
|
-
*
|
|
22
|
+
*
|
|
23
|
+
* The page size (results per SPARQL request) is determined by, in order:
|
|
24
|
+
* 1. A `LIMIT` clause in the selector query (for endpoints with hard result limits)
|
|
25
|
+
* 2. The stage's {@link StageOptions.batchSize} (passed via {@link select})
|
|
26
|
+
* 3. A default of 10
|
|
22
27
|
*/
|
|
23
28
|
export declare class SparqlItemSelector implements ItemSelector {
|
|
24
29
|
private readonly parsed;
|
|
25
|
-
private readonly
|
|
30
|
+
private readonly queryLimit?;
|
|
26
31
|
private readonly fetcher;
|
|
27
32
|
constructor(options: SparqlItemSelectorOptions);
|
|
28
|
-
select(distribution: Distribution): AsyncIterableIterator<VariableBindings>;
|
|
33
|
+
select(distribution: Distribution, batchSize?: number): AsyncIterableIterator<VariableBindings>;
|
|
29
34
|
}
|
|
30
35
|
//# sourceMappingURL=selector.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"selector.d.ts","sourceRoot":"","sources":["../../src/sparql/selector.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAEjD,OAAO,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAC;AAQ9D,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAChD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,eAAe,CAAC;AAMtD,MAAM,WAAW,yBAAyB;IACxC
|
|
1
|
+
{"version":3,"file":"selector.d.ts","sourceRoot":"","sources":["../../src/sparql/selector.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAEjD,OAAO,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAC;AAQ9D,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAChD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,eAAe,CAAC;AAMtD,MAAM,WAAW,yBAAyB;IACxC;;;;;OAKG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,+BAA+B;IAC/B,OAAO,CAAC,EAAE,qBAAqB,CAAC;CACjC;AAED;;;;;;;;;;;GAWG;AACH,qBAAa,kBAAmB,YAAW,YAAY;IACrD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAc;IACrC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAS;IACrC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAwB;gBAEpC,OAAO,EAAE,yBAAyB;IAkBvC,MAAM,CACX,YAAY,EAAE,YAAY,EAC1B,SAAS,CAAC,EAAE,MAAM,GACjB,qBAAqB,CAAC,gBAAgB,CAAC;CAuC3C"}
|
package/dist/sparql/selector.js
CHANGED
|
@@ -11,13 +11,15 @@ const F = new AstFactory();
|
|
|
11
11
|
*
|
|
12
12
|
* The endpoint URL comes from the {@link Distribution} passed to {@link select}.
|
|
13
13
|
* Pagination is an internal detail — consumers iterate binding rows directly.
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
14
|
+
*
|
|
15
|
+
* The page size (results per SPARQL request) is determined by, in order:
|
|
16
|
+
* 1. A `LIMIT` clause in the selector query (for endpoints with hard result limits)
|
|
17
|
+
* 2. The stage's {@link StageOptions.batchSize} (passed via {@link select})
|
|
18
|
+
* 3. A default of 10
|
|
17
19
|
*/
|
|
18
20
|
export class SparqlItemSelector {
|
|
19
21
|
parsed;
|
|
20
|
-
|
|
22
|
+
queryLimit;
|
|
21
23
|
fetcher;
|
|
22
24
|
constructor(options) {
|
|
23
25
|
const parsed = parser.parse(options.query);
|
|
@@ -29,31 +31,29 @@ export class SparqlItemSelector {
|
|
|
29
31
|
throw new Error('Query must project at least one named variable (SELECT * is not supported)');
|
|
30
32
|
}
|
|
31
33
|
this.parsed = parsed;
|
|
32
|
-
this.
|
|
33
|
-
options.pageSize ??
|
|
34
|
-
this.parsed.solutionModifiers.limitOffset?.limit ??
|
|
35
|
-
10;
|
|
34
|
+
this.queryLimit = this.parsed.solutionModifiers.limitOffset?.limit;
|
|
36
35
|
this.fetcher = options.fetcher ?? new SparqlEndpointFetcher();
|
|
37
36
|
}
|
|
38
|
-
async *select(distribution) {
|
|
37
|
+
async *select(distribution, batchSize) {
|
|
38
|
+
const effectivePageSize = this.queryLimit ?? batchSize ?? 10;
|
|
39
39
|
const endpoint = distribution.accessUrl;
|
|
40
40
|
let offset = 0;
|
|
41
41
|
while (true) {
|
|
42
|
-
this.parsed.solutionModifiers.limitOffset = F.solutionModifierLimitOffset(
|
|
42
|
+
this.parsed.solutionModifiers.limitOffset = F.solutionModifierLimitOffset(effectivePageSize, offset, F.gen());
|
|
43
43
|
const paginatedQuery = generator.generate(this.parsed);
|
|
44
44
|
const stream = (await this.fetcher.fetchBindings(endpoint.toString(), paginatedQuery));
|
|
45
|
-
let
|
|
45
|
+
let count = 0;
|
|
46
46
|
for await (const record of stream) {
|
|
47
47
|
const row = Object.fromEntries(Object.entries(record).filter(([, term]) => term.termType === 'NamedNode'));
|
|
48
48
|
if (Object.keys(row).length > 0) {
|
|
49
49
|
yield row;
|
|
50
|
-
|
|
50
|
+
count++;
|
|
51
51
|
}
|
|
52
52
|
}
|
|
53
|
-
if (
|
|
53
|
+
if (count === 0 || count < effectivePageSize) {
|
|
54
54
|
return;
|
|
55
55
|
}
|
|
56
|
-
offset +=
|
|
56
|
+
offset += count;
|
|
57
57
|
}
|
|
58
58
|
}
|
|
59
59
|
}
|
package/dist/stage.d.ts
CHANGED
|
@@ -10,7 +10,15 @@ export interface StageOptions {
|
|
|
10
10
|
name: string;
|
|
11
11
|
executors: Executor | Executor[];
|
|
12
12
|
itemSelector?: ItemSelector;
|
|
13
|
-
/**
|
|
13
|
+
/**
|
|
14
|
+
* Maximum number of bindings per executor call.
|
|
15
|
+
*
|
|
16
|
+
* Also used as the selector's page size so that each paginated request
|
|
17
|
+
* fills exactly one batch. A `LIMIT` clause in the selector query
|
|
18
|
+
* overrides this for endpoints with hard result limits.
|
|
19
|
+
*
|
|
20
|
+
* @default 10
|
|
21
|
+
*/
|
|
14
22
|
batchSize?: number;
|
|
15
23
|
/** Maximum concurrent in-flight executor batches. @default 10 */
|
|
16
24
|
maxConcurrency?: number;
|
|
@@ -48,6 +56,6 @@ export declare class Stage {
|
|
|
48
56
|
}
|
|
49
57
|
/** Selects items (as variable bindings) for executors to process. Pagination is an implementation detail. */
|
|
50
58
|
export interface ItemSelector {
|
|
51
|
-
select(distribution: Distribution): AsyncIterable<VariableBindings>;
|
|
59
|
+
select(distribution: Distribution, batchSize?: number): AsyncIterable<VariableBindings>;
|
|
52
60
|
}
|
|
53
61
|
//# sourceMappingURL=stage.d.ts.map
|
package/dist/stage.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"stage.d.ts","sourceRoot":"","sources":["../src/stage.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACrD,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,KAAK,EAAE,QAAQ,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AAEpD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAChD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAGjD,mEAAmE;AACnE,MAAM,MAAM,aAAa,GAAG,CAC1B,KAAK,EAAE,aAAa,CAAC,IAAI,CAAC,EAC1B,OAAO,EAAE,OAAO,KACb,aAAa,CAAC,IAAI,CAAC,CAAC;AAEzB,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,QAAQ,GAAG,QAAQ,EAAE,CAAC;IACjC,YAAY,CAAC,EAAE,YAAY,CAAC;IAC5B
|
|
1
|
+
{"version":3,"file":"stage.d.ts","sourceRoot":"","sources":["../src/stage.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACrD,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,KAAK,EAAE,QAAQ,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AAEpD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAChD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAGjD,mEAAmE;AACnE,MAAM,MAAM,aAAa,GAAG,CAC1B,KAAK,EAAE,aAAa,CAAC,IAAI,CAAC,EAC1B,OAAO,EAAE,OAAO,KACb,aAAa,CAAC,IAAI,CAAC,CAAC;AAEzB,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,QAAQ,GAAG,QAAQ,EAAE,CAAC;IACjC,YAAY,CAAC,EAAE,YAAY,CAAC;IAC5B;;;;;;;;OAQG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,iEAAiE;IACjE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,uDAAuD;IACvD,MAAM,CAAC,EAAE,KAAK,EAAE,CAAC;IACjB,qFAAqF;IACrF,UAAU,CAAC,EAAE;QACX,SAAS,EAAE,SAAS,CAAC;QACrB,iEAAiE;QACjE,SAAS,CAAC,EAAE,OAAO,GAAG,MAAM,GAAG,MAAM,CAAC;KACvC,CAAC;CACH;AAED,MAAM,WAAW,UAAU;IACzB,UAAU,CAAC,EAAE,CAAC,cAAc,EAAE,MAAM,EAAE,cAAc,EAAE,MAAM,KAAK,IAAI,CAAC;CACvE;AAED,qBAAa,KAAK;IAChB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,MAAM,EAAE,SAAS,KAAK,EAAE,CAAC;IAClC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAa;IACvC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAe;IAC7C,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAS;IACxC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,CAA6B;gBAE7C,OAAO,EAAE,YAAY;IAYjC,mDAAmD;IACnD,IAAI,SAAS,IAAI,SAAS,GAAG,SAAS,CAErC;IAEK,GAAG,CACP,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,YAAY,EAC1B,MAAM,EAAE,MAAM,EACd,OAAO,CAAC,EAAE,UAAU,GACnB,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC;YAkDjB,eAAe;IA2I7B;;;OAGG;YACW,cAAc;YAqBd,UAAU;CAqBzB;AAUD,6GAA6G;AAC7G,MAAM,WAAW,YAAY;IAC3B,MAAM,CACJ,YAAY,EAAE,YAAY,EAC1B,SAAS,CAAC,EAAE,MAAM,GACjB,aAAa,CAAC,gBAAgB,CAAC,CAAC;CACpC"}
|
package/dist/stage.js
CHANGED
|
@@ -26,7 +26,7 @@ export class Stage {
|
|
|
26
26
|
}
|
|
27
27
|
async run(dataset, distribution, writer, options) {
|
|
28
28
|
if (this.itemSelector) {
|
|
29
|
-
return this.runWithSelector(this.itemSelector.select(distribution), dataset, distribution, writer, options);
|
|
29
|
+
return this.runWithSelector(this.itemSelector.select(distribution, this.batchSize), dataset, distribution, writer, options);
|
|
30
30
|
}
|
|
31
31
|
const streams = await this.executeAll(dataset, distribution);
|
|
32
32
|
if (streams instanceof NotSupported) {
|