@lde/pipeline 0.27.0 → 0.28.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -30,17 +30,51 @@ const selector = new RegistrySelector({
30
30
  const selector = new ManualDatasetSelection([dataset]);
31
31
  ```
32
32
 
33
+ ### Stage
34
+
35
+ A stage groups an item selector, one or more executors, and configuration:
36
+
37
+ ```typescript
38
+ new Stage({
39
+ name: 'per-class',
40
+ itemSelector: new SparqlItemSelector({
41
+ query: 'SELECT DISTINCT ?class WHERE { ?s a ?class }',
42
+ }),
43
+ executors: executor,
44
+ batchSize: 100,
45
+ maxConcurrency: 5,
46
+ });
47
+ ```
48
+
49
+ #### Batch size
50
+
51
+ `batchSize` (default: 10) controls how many variable bindings are passed to each executor call as a `VALUES` clause. It also sets the page size for the item selector's SPARQL requests, so that each paginated request fills exactly one executor batch.
52
+
53
+ A `LIMIT` clause in the selector query overrides `batchSize` as the page size — use this when the SPARQL endpoint enforces a hard result limit:
54
+
55
+ ```typescript
56
+ // Endpoint caps results at 1000, but process in batches of 100.
57
+ new Stage({
58
+ name: 'per-class',
59
+ itemSelector: new SparqlItemSelector({
60
+ query: 'SELECT DISTINCT ?class WHERE { ?s a ?class } LIMIT 1000',
61
+ }),
62
+ executors: executor,
63
+ batchSize: 100,
64
+ });
65
+ ```
66
+
33
67
  ### Item Selector
34
68
 
35
69
  Selects resources from the distribution and fans out executor calls per batch of results. Implements the `ItemSelector` interface:
36
70
 
37
71
  ```typescript
38
72
  interface ItemSelector {
39
- select(distribution: Distribution): AsyncIterable<VariableBindings>;
73
+ select(distribution: Distribution, batchSize?: number): AsyncIterable<VariableBindings>;
40
74
  }
41
75
  ```
42
76
 
43
- The distribution is received at run time, so selectors don't need the endpoint URL at construction time. Use `SparqlItemSelector` for SPARQL-based selection with automatic pagination:
77
+ The distribution is received at run time, so selectors don't need the endpoint URL at construction time. The `batchSize` parameter is set by the stage. Use `SparqlItemSelector` for SPARQL-based selection with automatic pagination:
44
78
 
45
79
  ```typescript
46
80
  new SparqlItemSelector({
@@ -52,9 +86,9 @@ For dynamic queries that depend on the distribution, implement `ItemSelector` di
52
86
 
53
87
  ```typescript
54
88
  const itemSelector: ItemSelector = {
55
- select: (distribution) => {
89
+ select: (distribution, batchSize) => {
56
90
  const query = buildQuery(distribution);
57
- return new SparqlItemSelector({ query }).select(distribution);
91
+ return new SparqlItemSelector({ query }).select(distribution, batchSize);
58
92
  },
59
93
  };
60
94
  ```
@@ -1 +1 @@
1
- {"version":3,"file":"importResolver.d.ts","sourceRoot":"","sources":["../../src/distribution/importResolver.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AAMrD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AACvD,OAAO,EACL,KAAK,oBAAoB,EAEzB,uBAAuB,EACvB,oBAAoB,EACrB,MAAM,eAAe,CAAC;AAGvB,MAAM,WAAW,qBAAqB;IACpC,QAAQ,EAAE,QAAQ,CAAC;IACnB,MAAM,EAAE,YAAY,CAAC;IACrB;;;;;;;;;;;;OAYG;IACH,QAAQ,CAAC,EAAE,QAAQ,GAAG,QAAQ,CAAC;CAChC;AAED;;;;;;;GAOG;AACH,qBAAa,cAAe,YAAW,oBAAoB;IAEvD,OAAO,CAAC,QAAQ,CAAC,KAAK;IACtB,OAAO,CAAC,QAAQ,CAAC,OAAO;gBADP,KAAK,EAAE,oBAAoB,EAC3B,OAAO,EAAE,qBAAqB;IAG3C,OAAO,CACX,GAAG,IAAI,EAAE,UAAU,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,GACnD,OAAO,CAAC,oBAAoB,GAAG,uBAAuB,CAAC;YAgB5C,aAAa;IAkFrB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAG/B"}
1
+ {"version":3,"file":"importResolver.d.ts","sourceRoot":"","sources":["../../src/distribution/importResolver.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AAMrD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AACvD,OAAO,EACL,KAAK,oBAAoB,EAEzB,uBAAuB,EACvB,oBAAoB,EACrB,MAAM,eAAe,CAAC;AAGvB,MAAM,WAAW,qBAAqB;IACpC,QAAQ,EAAE,QAAQ,CAAC;IACnB,MAAM,EAAE,YAAY,CAAC;IACrB;;;;;;;;;;;;OAYG;IACH,QAAQ,CAAC,EAAE,QAAQ,GAAG,QAAQ,CAAC;CAChC;AAED;;;;;;;GAOG;AACH,qBAAa,cAAe,YAAW,oBAAoB;IAEvD,OAAO,CAAC,QAAQ,CAAC,KAAK;IACtB,OAAO,CAAC,QAAQ,CAAC,OAAO;gBADP,KAAK,EAAE,oBAAoB,EAC3B,OAAO,EAAE,qBAAqB;IAG3C,OAAO,CACX,GAAG,IAAI,EAAE,UAAU,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,GACnD,OAAO,CAAC,oBAAoB,GAAG,uBAAuB,CAAC;YAgB5C,aAAa;IAqFrB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAG/B"}
@@ -17,6 +17,7 @@ declare abstract class ProbeResult {
17
17
  readonly lastModified: Date | null;
18
18
  readonly contentType: string | null;
19
19
  readonly failureReason: string | null;
20
+ readonly warnings: string[];
20
21
  constructor(url: string, response: Response, failureReason?: string | null);
21
22
  isSuccess(): boolean;
22
23
  }
@@ -1 +1 @@
1
- {"version":3,"file":"probe.d.ts","sourceRoot":"","sources":["../../src/distribution/probe.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAG5C;;GAEG;AACH,qBAAa,YAAY;aAEL,GAAG,EAAE,MAAM;aACX,OAAO,EAAE,MAAM;gBADf,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,MAAM;CAElC;AAED;;GAEG;AACH,uBAAe,WAAW;aAQN,GAAG,EAAE,MAAM;IAP7B,SAAgB,UAAU,EAAE,MAAM,CAAC;IACnC,SAAgB,UAAU,EAAE,MAAM,CAAC;IACnC,SAAgB,YAAY,EAAE,IAAI,GAAG,IAAI,CAAQ;IACjD,SAAgB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3C,SAAgB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;gBAG3B,GAAG,EAAE,MAAM,EAC3B,QAAQ,EAAE,QAAQ,EAClB,aAAa,GAAE,MAAM,GAAG,IAAW;IAY9B,SAAS,IAAI,OAAO;CAO5B;AAID;;GAEG;AACH,qBAAa,iBAAkB,SAAQ,WAAW;IAChD,SAAgB,mBAAmB,qCAAuB;IAEjD,SAAS,IAAI,OAAO;CAM9B;AAED;;GAEG;AACH,qBAAa,mBAAoB,SAAQ,WAAW;IAClD,SAAgB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAQ;gBAGhD,GAAG,EAAE,MAAM,EACX,QAAQ,EAAE,QAAQ,EAClB,aAAa,GAAE,MAAM,GAAG,IAAW;CAQtC;AAED,MAAM,MAAM,eAAe,GACvB,iBAAiB,GACjB,mBAAmB,GACnB,YAAY,CAAC;AAEjB;;;;;;;GAOG;AACH,wBAAsB,KAAK,CACzB,YAAY,EAAE,YAAY,EAC1B,OAAO,SAAO,GACb,OAAO,CAAC,eAAe,CAAC,CAY1B"}
1
+ {"version":3,"file":"probe.d.ts","sourceRoot":"","sources":["../../src/distribution/probe.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAG5C;;GAEG;AACH,qBAAa,YAAY;aAEL,GAAG,EAAE,MAAM;aACX,OAAO,EAAE,MAAM;gBADf,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,MAAM;CAElC;AAED;;GAEG;AACH,uBAAe,WAAW;aASN,GAAG,EAAE,MAAM;IAR7B,SAAgB,UAAU,EAAE,MAAM,CAAC;IACnC,SAAgB,UAAU,EAAE,MAAM,CAAC;IACnC,SAAgB,YAAY,EAAE,IAAI,GAAG,IAAI,CAAQ;IACjD,SAAgB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3C,SAAgB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7C,SAAgB,QAAQ,EAAE,MAAM,EAAE,CAAM;gBAGtB,GAAG,EAAE,MAAM,EAC3B,QAAQ,EAAE,QAAQ,EAClB,aAAa,GAAE,MAAM,GAAG,IAAW;IAY9B,SAAS,IAAI,OAAO;CAO5B;AAID;;GAEG;AACH,qBAAa,iBAAkB,SAAQ,WAAW;IAChD,SAAgB,mBAAmB,qCAAuB;IAEjD,SAAS,IAAI,OAAO;CAM9B;AAED;;GAEG;AACH,qBAAa,mBAAoB,SAAQ,WAAW;IAClD,SAAgB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAQ;gBAGhD,GAAG,EAAE,MAAM,EACX,QAAQ,EAAE,QAAQ,EAClB,aAAa,GAAE,MAAM,GAAG,IAAW;CAQtC;AAED,MAAM,MAAM,eAAe,GACvB,iBAAiB,GACjB,mBAAmB,GACnB,YAAY,CAAC;AAEjB;;;;;;;GAOG;AACH,wBAAsB,KAAK,CACzB,YAAY,EAAE,YAAY,EAC1B,OAAO,SAAO,GACb,OAAO,CAAC,eAAe,CAAC,CAY1B"}
@@ -20,6 +20,7 @@ class ProbeResult {
20
20
  lastModified = null;
21
21
  contentType;
22
22
  failureReason;
23
+ warnings = [];
23
24
  constructor(url, response, failureReason = null) {
24
25
  this.url = url;
25
26
  this.statusCode = response.status;
@@ -147,9 +148,13 @@ async function probeDataDump(distribution, timeout) {
147
148
  const failureReason = isHttpSuccess
148
149
  ? validateBody(body, getResponse.headers.get('Content-Type'))
149
150
  : null;
150
- return new DataDumpProbeResult(url, getResponse, failureReason);
151
+ const result = new DataDumpProbeResult(url, getResponse, failureReason);
152
+ checkContentTypeMismatch(result, distribution.mimeType);
153
+ return result;
151
154
  }
152
- return new DataDumpProbeResult(url, headResponse);
155
+ const result = new DataDumpProbeResult(url, headResponse);
156
+ checkContentTypeMismatch(result, distribution.mimeType);
157
+ return result;
153
158
  }
154
159
  const rdfContentTypes = [
155
160
  'text/turtle',
@@ -174,3 +179,23 @@ function validateBody(body, contentType) {
174
179
  }
175
180
  return null;
176
181
  }
182
+ /** Content types that indicate compression, not the RDF serialization format. */
183
+ const compressionTypes = new Set([
184
+ 'application/gzip',
185
+ 'application/x-gzip',
186
+ 'application/octet-stream',
187
+ ]);
188
+ /**
189
+ * Compare the declared MIME type from the dataset registry against the
190
+ * server's Content-Type header. Adds a warning when they disagree.
191
+ */
192
+ function checkContentTypeMismatch(result, declaredMimeType) {
193
+ if (!result.isSuccess() || !declaredMimeType || !result.contentType)
194
+ return;
195
+ const actual = result.contentType.split(';')[0].trim();
196
+ if (compressionTypes.has(actual))
197
+ return;
198
+ if (actual !== declaredMimeType) {
199
+ result.warnings.push(`Server Content-Type ${actual} does not match declared media type ${declaredMimeType}`);
200
+ }
201
+ }
@@ -1 +1 @@
1
- {"version":3,"file":"report.d.ts","sourceRoot":"","sources":["../../src/distribution/report.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,EAAe,KAAK,IAAI,EAAE,MAAM,IAAI,CAAC;AAC5C,OAAO,EAIL,KAAK,eAAe,EACrB,MAAM,YAAY,CAAC;AAUpB;;;;;;;;;GASG;AACH,wBAAuB,mBAAmB,CACxC,YAAY,EAAE,eAAe,EAAE,EAC/B,UAAU,EAAE,MAAM,EAClB,YAAY,CAAC,EAAE,YAAY,GAC1B,aAAa,CAAC,IAAI,CAAC,CAwCrB"}
1
+ {"version":3,"file":"report.d.ts","sourceRoot":"","sources":["../../src/distribution/report.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,EAAe,KAAK,IAAI,EAAE,MAAM,IAAI,CAAC;AAC5C,OAAO,EAIL,KAAK,eAAe,EACrB,MAAM,YAAY,CAAC;AAUpB;;;;;;;;;GASG;AACH,wBAAuB,mBAAmB,CACxC,YAAY,EAAE,eAAe,EAAE,EAC/B,UAAU,EAAE,MAAM,EAClB,YAAY,CAAC,EAAE,YAAY,GAC1B,aAAa,CAAC,IAAI,CAAC,CA2CrB"}
@@ -29,6 +29,9 @@ export async function* probeResultsToQuads(probeResults, datasetIri, importResul
29
29
  }
30
30
  else if (result.isSuccess()) {
31
31
  yield* successQuads(action, result, datasetIri);
32
+ for (const warning of result.warnings) {
33
+ yield quad(action, namedNode(`${SCHEMA}error`), literal(warning));
34
+ }
32
35
  }
33
36
  else if (result.failureReason) {
34
37
  yield quad(action, namedNode(`${SCHEMA}error`), literal(result.failureReason));
package/dist/pipeline.js CHANGED
@@ -257,6 +257,7 @@ function mapProbeResult(distribution, result) {
257
257
  type: 'network-error',
258
258
  available: false,
259
259
  error: result.message,
260
+ warnings: [],
260
261
  };
261
262
  }
262
263
  return {
@@ -266,5 +267,7 @@ function mapProbeResult(distribution, result) {
266
267
  : 'data-dump',
267
268
  available: result.isSuccess(),
268
269
  statusCode: result.statusCode,
270
+ error: result.failureReason ?? undefined,
271
+ warnings: result.warnings,
269
272
  };
270
273
  }
@@ -6,6 +6,7 @@ export interface DistributionAnalysisResult {
6
6
  available: boolean;
7
7
  statusCode?: number;
8
8
  error?: string;
9
+ warnings: string[];
9
10
  }
10
11
  export interface ProgressReporter {
11
12
  pipelineStart?(name: string): void;
@@ -1 +1 @@
1
- {"version":3,"file":"progressReporter.d.ts","sourceRoot":"","sources":["../src/progressReporter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC1D,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,gBAAgB,CAAC;AAEvD,MAAM,WAAW,0BAA0B;IACzC,YAAY,EAAE,YAAY,CAAC;IAC3B,IAAI,EAAE,QAAQ,GAAG,WAAW,GAAG,eAAe,CAAC;IAC/C,SAAS,EAAE,OAAO,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,gBAAgB;IAC/B,aAAa,CAAC,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI,CAAC;IACnC,gBAAgB,CAAC,CAAC,KAAK,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACzD,YAAY,CAAC,CAAC,OAAO,EAAE,OAAO,GAAG,IAAI,CAAC;IACtC,8DAA8D;IAC9D,kBAAkB,CAAC,CAAC,MAAM,EAAE,0BAA0B,GAAG,IAAI,CAAC;IAC9D,6CAA6C;IAC7C,aAAa,CAAC,IAAI,IAAI,CAAC;IACvB,kDAAkD;IAClD,YAAY,CAAC,CAAC,YAAY,EAAE,YAAY,EAAE,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/D,oBAAoB,CAAC,CACnB,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,YAAY,EAC1B,YAAY,CAAC,EAAE,YAAY,EAC3B,cAAc,CAAC,EAAE,MAAM,EACvB,WAAW,CAAC,EAAE,MAAM,GACnB,IAAI,CAAC;IACR,UAAU,CAAC,CAAC,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,aAAa,CAAC,CAAC,MAAM,EAAE;QACrB,cAAc,EAAE,MAAM,CAAC;QACvB,cAAc,EAAE,MAAM,CAAC;QACvB,gBAAgB,EAAE,MAAM,CAAC;QACzB,aAAa,EAAE,MAAM,CAAC;KACvB,GAAG,IAAI,CAAC;IACT,aAAa,CAAC,CACZ,KAAK,EAAE,MAAM,EACb,MAAM,EAAE;QACN,cAAc,EAAE,MAAM,CAAC;QACvB,cAAc,EAAE,MAAM,CAAC;QACvB,QAAQ,EAAE,MAAM,CAAC;KAClB,GACA,IAAI,CAAC;IACR,WAAW,CAAC,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,GAAG,IAAI,CAAC;IAChD,4DAA4D;IAC5D,cAAc,CAAC,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,gBAAgB,GAAG,IAAI,CAAC;IAC/D,YAAY,CAAC,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACnD,eAAe,CAAC,CACd,OAAO,EAAE,OAAO,EAChB,MAAM,EAAE;QAAE,gBAAgB,EAAE,MAAM,CAAC;QAAC,aAAa,EAAE,MAAM,CAAA;KAAE,GAC1D,IAAI,CAAC;IACR,cAAc,CAAC,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACxD,gBAAgB,CAAC,CAAC,MAAM,EAAE;QACxB,QAAQ,EAAE,MAAM,CAAC;QACjB,gBAAgB,EAAE,MAAM,CAAC;QACzB,aAAa,EAAE,MAAM,CAAC;KACvB,GAAG,IAAI,CAAC;CACV"}
1
+ {"version":3,"file":"progressReporter.d.ts","sourceRoot":"","sources":["../src/progressReporter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC1D,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,gBAAgB,CAAC;AAEvD,MAAM,WAAW,0BAA0B;IACzC,YAAY,EAAE,YAAY,CAAC;IAC3B,IAAI,EAAE,QAAQ,GAAG,WAAW,GAAG,eAAe,CAAC;IAC/C,SAAS,EAAE,OAAO,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED,MAAM,WAAW,gBAAgB;IAC/B,aAAa,CAAC,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI,CAAC;IACnC,gBAAgB,CAAC,CAAC,KAAK,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACzD,YAAY,CAAC,CAAC,OAAO,EAAE,OAAO,GAAG,IAAI,CAAC;IACtC,8DAA8D;IAC9D,kBAAkB,CAAC,CAAC,MAAM,EAAE,0BAA0B,GAAG,IAAI,CAAC;IAC9D,6CAA6C;IAC7C,aAAa,CAAC,IAAI,IAAI,CAAC;IACvB,kDAAkD;IAClD,YAAY,CAAC,CAAC,YAAY,EAAE,YAAY,EAAE,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/D,oBAAoB,CAAC,CACnB,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,YAAY,EAC1B,YAAY,CAAC,EAAE,YAAY,EAC3B,cAAc,CAAC,EAAE,MAAM,EACvB,WAAW,CAAC,EAAE,MAAM,GACnB,IAAI,CAAC;IACR,UAAU,CAAC,CAAC,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,aAAa,CAAC,CAAC,MAAM,EAAE;QACrB,cAAc,EAAE,MAAM,CAAC;QACvB,cAAc,EAAE,MAAM,CAAC;QACvB,gBAAgB,EAAE,MAAM,CAAC;QACzB,aAAa,EAAE,MAAM,CAAC;KACvB,GAAG,IAAI,CAAC;IACT,aAAa,CAAC,CACZ,KAAK,EAAE,MAAM,EACb,MAAM,EAAE;QACN,cAAc,EAAE,MAAM,CAAC;QACvB,cAAc,EAAE,MAAM,CAAC;QACvB,QAAQ,EAAE,MAAM,CAAC;KAClB,GACA,IAAI,CAAC;IACR,WAAW,CAAC,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,GAAG,IAAI,CAAC;IAChD,4DAA4D;IAC5D,cAAc,CAAC,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,gBAAgB,GAAG,IAAI,CAAC;IAC/D,YAAY,CAAC,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACnD,eAAe,CAAC,CACd,OAAO,EAAE,OAAO,EAChB,MAAM,EAAE;QAAE,gBAAgB,EAAE,MAAM,CAAC;QAAC,aAAa,EAAE,MAAM,CAAA;KAAE,GAC1D,IAAI,CAAC;IACR,cAAc,CAAC,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACxD,gBAAgB,CAAC,CAAC,MAAM,EAAE;QACxB,QAAQ,EAAE,MAAM,CAAC;QACjB,gBAAgB,EAAE,MAAM,CAAC;QACzB,aAAa,EAAE,MAAM,CAAC;KACvB,GAAG,IAAI,CAAC;CACV"}
@@ -3,10 +3,13 @@ import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
3
3
  import type { ItemSelector } from '../stage.js';
4
4
  import type { VariableBindings } from './executor.js';
5
5
  export interface SparqlItemSelectorOptions {
6
- /** SELECT query projecting at least one named variable. A LIMIT in the query sets the default page size. */
6
+ /**
7
+ * SELECT query projecting at least one named variable.
8
+ *
9
+ * A `LIMIT` clause in the query overrides the stage's `batchSize` as the
10
+ * page size — use this when the SPARQL endpoint enforces a result limit.
11
+ */
7
12
  query: string;
8
- /** Results per page. Overrides any LIMIT in the query. @default 10 */
9
- pageSize?: number;
10
13
  /** Custom fetcher instance. */
11
14
  fetcher?: SparqlEndpointFetcher;
12
15
  }
@@ -16,15 +19,17 @@ export interface SparqlItemSelectorOptions {
16
19
  *
17
20
  * The endpoint URL comes from the {@link Distribution} passed to {@link select}.
18
21
  * Pagination is an internal detail — consumers iterate binding rows directly.
19
- * If the query contains a LIMIT, it is used as the default page size
20
- * (can be overridden by the `pageSize` option). Pagination continues
21
- * until a page returns fewer results than the page size.
22
+ *
23
+ * The page size (results per SPARQL request) is determined by, in order:
24
+ * 1. A `LIMIT` clause in the selector query (for endpoints with hard result limits)
25
+ * 2. The stage's {@link StageOptions.batchSize} (passed via {@link select})
26
+ * 3. A default of 10
22
27
  */
23
28
  export declare class SparqlItemSelector implements ItemSelector {
24
29
  private readonly parsed;
25
- private readonly pageSize;
30
+ private readonly queryLimit?;
26
31
  private readonly fetcher;
27
32
  constructor(options: SparqlItemSelectorOptions);
28
- select(distribution: Distribution): AsyncIterableIterator<VariableBindings>;
33
+ select(distribution: Distribution, batchSize?: number): AsyncIterableIterator<VariableBindings>;
29
34
  }
30
35
  //# sourceMappingURL=selector.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"selector.d.ts","sourceRoot":"","sources":["../../src/sparql/selector.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAEjD,OAAO,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAC;AAQ9D,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAChD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,eAAe,CAAC;AAMtD,MAAM,WAAW,yBAAyB;IACxC,4GAA4G;IAC5G,KAAK,EAAE,MAAM,CAAC;IACd,sEAAsE;IACtE,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,+BAA+B;IAC/B,OAAO,CAAC,EAAE,qBAAqB,CAAC;CACjC;AAED;;;;;;;;;GASG;AACH,qBAAa,kBAAmB,YAAW,YAAY;IACrD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAc;IACrC,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAClC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAwB;gBAEpC,OAAO,EAAE,yBAAyB;IAqBvC,MAAM,CACX,YAAY,EAAE,YAAY,GACzB,qBAAqB,CAAC,gBAAgB,CAAC;CAsC3C"}
1
+ {"version":3,"file":"selector.d.ts","sourceRoot":"","sources":["../../src/sparql/selector.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAEjD,OAAO,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAC;AAQ9D,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAChD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,eAAe,CAAC;AAMtD,MAAM,WAAW,yBAAyB;IACxC;;;;;OAKG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,+BAA+B;IAC/B,OAAO,CAAC,EAAE,qBAAqB,CAAC;CACjC;AAED;;;;;;;;;;;GAWG;AACH,qBAAa,kBAAmB,YAAW,YAAY;IACrD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAc;IACrC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAS;IACrC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAwB;gBAEpC,OAAO,EAAE,yBAAyB;IAkBvC,MAAM,CACX,YAAY,EAAE,YAAY,EAC1B,SAAS,CAAC,EAAE,MAAM,GACjB,qBAAqB,CAAC,gBAAgB,CAAC;CAuC3C"}
@@ -11,13 +11,15 @@ const F = new AstFactory();
11
11
  *
12
12
  * The endpoint URL comes from the {@link Distribution} passed to {@link select}.
13
13
  * Pagination is an internal detail — consumers iterate binding rows directly.
14
- * If the query contains a LIMIT, it is used as the default page size
15
- * (can be overridden by the `pageSize` option). Pagination continues
16
- * until a page returns fewer results than the page size.
14
+ *
15
+ * The page size (results per SPARQL request) is determined by, in order:
16
+ * 1. A `LIMIT` clause in the selector query (for endpoints with hard result limits)
17
+ * 2. The stage's {@link StageOptions.batchSize} (passed via {@link select})
18
+ * 3. A default of 10
17
19
  */
18
20
  export class SparqlItemSelector {
19
21
  parsed;
20
- pageSize;
22
+ queryLimit;
21
23
  fetcher;
22
24
  constructor(options) {
23
25
  const parsed = parser.parse(options.query);
@@ -29,31 +31,29 @@ export class SparqlItemSelector {
29
31
  throw new Error('Query must project at least one named variable (SELECT * is not supported)');
30
32
  }
31
33
  this.parsed = parsed;
32
- this.pageSize =
33
- options.pageSize ??
34
- this.parsed.solutionModifiers.limitOffset?.limit ??
35
- 10;
34
+ this.queryLimit = this.parsed.solutionModifiers.limitOffset?.limit;
36
35
  this.fetcher = options.fetcher ?? new SparqlEndpointFetcher();
37
36
  }
38
- async *select(distribution) {
37
+ async *select(distribution, batchSize) {
38
+ const effectivePageSize = this.queryLimit ?? batchSize ?? 10;
39
39
  const endpoint = distribution.accessUrl;
40
40
  let offset = 0;
41
41
  while (true) {
42
- this.parsed.solutionModifiers.limitOffset = F.solutionModifierLimitOffset(this.pageSize, offset, F.gen());
42
+ this.parsed.solutionModifiers.limitOffset = F.solutionModifierLimitOffset(effectivePageSize, offset, F.gen());
43
43
  const paginatedQuery = generator.generate(this.parsed);
44
44
  const stream = (await this.fetcher.fetchBindings(endpoint.toString(), paginatedQuery));
45
- let pageSize = 0;
45
+ let count = 0;
46
46
  for await (const record of stream) {
47
47
  const row = Object.fromEntries(Object.entries(record).filter(([, term]) => term.termType === 'NamedNode'));
48
48
  if (Object.keys(row).length > 0) {
49
49
  yield row;
50
- pageSize++;
50
+ count++;
51
51
  }
52
52
  }
53
- if (pageSize === 0 || pageSize < this.pageSize) {
53
+ if (count === 0 || count < effectivePageSize) {
54
54
  return;
55
55
  }
56
- offset += pageSize;
56
+ offset += count;
57
57
  }
58
58
  }
59
59
  }
package/dist/stage.d.ts CHANGED
@@ -10,7 +10,15 @@ export interface StageOptions {
10
10
  name: string;
11
11
  executors: Executor | Executor[];
12
12
  itemSelector?: ItemSelector;
13
- /** Maximum number of bindings per executor call. @default 10 */
13
+ /**
14
+ * Maximum number of bindings per executor call.
15
+ *
16
+ * Also used as the selector's page size so that each paginated request
17
+ * fills exactly one batch. A `LIMIT` clause in the selector query
18
+ * overrides this for endpoints with hard result limits.
19
+ *
20
+ * @default 10
21
+ */
14
22
  batchSize?: number;
15
23
  /** Maximum concurrent in-flight executor batches. @default 10 */
16
24
  maxConcurrency?: number;
@@ -48,6 +56,6 @@ export declare class Stage {
48
56
  }
49
57
  /** Selects items (as variable bindings) for executors to process. Pagination is an implementation detail. */
50
58
  export interface ItemSelector {
51
- select(distribution: Distribution): AsyncIterable<VariableBindings>;
59
+ select(distribution: Distribution, batchSize?: number): AsyncIterable<VariableBindings>;
52
60
  }
53
61
  //# sourceMappingURL=stage.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"stage.d.ts","sourceRoot":"","sources":["../src/stage.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACrD,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,KAAK,EAAE,QAAQ,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AAEpD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAChD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAGjD,mEAAmE;AACnE,MAAM,MAAM,aAAa,GAAG,CAC1B,KAAK,EAAE,aAAa,CAAC,IAAI,CAAC,EAC1B,OAAO,EAAE,OAAO,KACb,aAAa,CAAC,IAAI,CAAC,CAAC;AAEzB,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,QAAQ,GAAG,QAAQ,EAAE,CAAC;IACjC,YAAY,CAAC,EAAE,YAAY,CAAC;IAC5B,gEAAgE;IAChE,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,iEAAiE;IACjE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,uDAAuD;IACvD,MAAM,CAAC,EAAE,KAAK,EAAE,CAAC;IACjB,qFAAqF;IACrF,UAAU,CAAC,EAAE;QACX,SAAS,EAAE,SAAS,CAAC;QACrB,iEAAiE;QACjE,SAAS,CAAC,EAAE,OAAO,GAAG,MAAM,GAAG,MAAM,CAAC;KACvC,CAAC;CACH;AAED,MAAM,WAAW,UAAU;IACzB,UAAU,CAAC,EAAE,CAAC,cAAc,EAAE,MAAM,EAAE,cAAc,EAAE,MAAM,KAAK,IAAI,CAAC;CACvE;AAED,qBAAa,KAAK;IAChB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,MAAM,EAAE,SAAS,KAAK,EAAE,CAAC;IAClC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAa;IACvC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAe;IAC7C,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAS;IACxC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,CAA6B;gBAE7C,OAAO,EAAE,YAAY;IAYjC,mDAAmD;IACnD,IAAI,SAAS,IAAI,SAAS,GAAG,SAAS,CAErC;IAEK,GAAG,CACP,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,YAAY,EAC1B,MAAM,EAAE,MAAM,EACd,OAAO,CAAC,EAAE,UAAU,GACnB,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC;YAkDjB,eAAe;IA2I7B;;;OAGG;YACW,cAAc;YAqBd,UAAU;CAqBzB;AAUD,6GAA6G;AAC7G,MAAM,WAAW,YAAY;IAC3B,MAAM,CAAC,YAAY,EAAE,YAAY,GAAG,aAAa,CAAC,gBAAgB,CAAC,CAAC;CACrE"}
1
+ {"version":3,"file":"stage.d.ts","sourceRoot":"","sources":["../src/stage.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACrD,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,KAAK,EAAE,QAAQ,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AAEpD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAChD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAGjD,mEAAmE;AACnE,MAAM,MAAM,aAAa,GAAG,CAC1B,KAAK,EAAE,aAAa,CAAC,IAAI,CAAC,EAC1B,OAAO,EAAE,OAAO,KACb,aAAa,CAAC,IAAI,CAAC,CAAC;AAEzB,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,QAAQ,GAAG,QAAQ,EAAE,CAAC;IACjC,YAAY,CAAC,EAAE,YAAY,CAAC;IAC5B;;;;;;;;OAQG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,iEAAiE;IACjE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,uDAAuD;IACvD,MAAM,CAAC,EAAE,KAAK,EAAE,CAAC;IACjB,qFAAqF;IACrF,UAAU,CAAC,EAAE;QACX,SAAS,EAAE,SAAS,CAAC;QACrB,iEAAiE;QACjE,SAAS,CAAC,EAAE,OAAO,GAAG,MAAM,GAAG,MAAM,CAAC;KACvC,CAAC;CACH;AAED,MAAM,WAAW,UAAU;IACzB,UAAU,CAAC,EAAE,CAAC,cAAc,EAAE,MAAM,EAAE,cAAc,EAAE,MAAM,KAAK,IAAI,CAAC;CACvE;AAED,qBAAa,KAAK;IAChB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,MAAM,EAAE,SAAS,KAAK,EAAE,CAAC;IAClC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAa;IACvC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAe;IAC7C,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAS;IACxC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,CAA6B;gBAE7C,OAAO,EAAE,YAAY;IAYjC,mDAAmD;IACnD,IAAI,SAAS,IAAI,SAAS,GAAG,SAAS,CAErC;IAEK,GAAG,CACP,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,YAAY,EAC1B,MAAM,EAAE,MAAM,EACd,OAAO,CAAC,EAAE,UAAU,GACnB,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC;YAkDjB,eAAe;IA2I7B;;;OAGG;YACW,cAAc;YAqBd,UAAU;CAqBzB;AAUD,6GAA6G;AAC7G,MAAM,WAAW,YAAY;IAC3B,MAAM,CACJ,YAAY,EAAE,YAAY,EAC1B,SAAS,CAAC,EAAE,MAAM,GACjB,aAAa,CAAC,gBAAgB,CAAC,CAAC;CACpC"}
package/dist/stage.js CHANGED
@@ -26,7 +26,7 @@ export class Stage {
26
26
  }
27
27
  async run(dataset, distribution, writer, options) {
28
28
  if (this.itemSelector) {
29
- return this.runWithSelector(this.itemSelector.select(distribution), dataset, distribution, writer, options);
29
+ return this.runWithSelector(this.itemSelector.select(distribution, this.batchSize), dataset, distribution, writer, options);
30
30
  }
31
31
  const streams = await this.executeAll(dataset, distribution);
32
32
  if (streams instanceof NotSupported) {
package/package.json CHANGED
@@ -1,10 +1,11 @@
1
1
  {
2
2
  "name": "@lde/pipeline",
3
- "version": "0.27.0",
3
+ "version": "0.28.1",
4
4
  "repository": {
5
5
  "url": "git+https://github.com/ldelements/lde.git",
6
6
  "directory": "packages/pipeline"
7
7
  },
8
+ "license": "MIT",
8
9
  "type": "module",
9
10
  "exports": {
10
11
  "./package.json": "./package.json",
@@ -25,7 +26,7 @@
25
26
  "dependencies": {
26
27
  "@lde/dataset": "0.7.2",
27
28
  "@lde/dataset-registry-client": "0.7.4",
28
- "@lde/sparql-importer": "0.5.0",
29
+ "@lde/sparql-importer": "0.6.0",
29
30
  "@lde/sparql-server": "0.4.10",
30
31
  "@rdfjs/types": "^2.0.1",
31
32
  "@traqula/generator-sparql-1-1": "^1.0.3",