@lde/pipeline 0.6.19 → 0.6.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/stage.d.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import { Dataset, Distribution } from '@lde/dataset';
2
- import type { Quad } from '@rdfjs/types';
3
2
  import type { Executor, VariableBindings } from './sparql/executor.js';
4
3
  import { NotSupported } from './sparql/executor.js';
4
+ import type { Writer } from './writer/writer.js';
5
5
  export interface StageOptions {
6
6
  name: string;
7
7
  executors: Executor | Executor[];
@@ -15,7 +15,8 @@ export declare class Stage {
15
15
  private readonly selector?;
16
16
  private readonly batchSize;
17
17
  constructor(options: StageOptions);
18
- run(dataset: Dataset, distribution: Distribution): Promise<AsyncIterable<Quad> | NotSupported>;
18
+ run(dataset: Dataset, distribution: Distribution, writer: Writer): Promise<NotSupported | void>;
19
+ private executeWithSelector;
19
20
  private executeAll;
20
21
  }
21
22
  /** Stage-level selector that yields variable bindings for use in executor queries. Pagination is an implementation detail. */
@@ -1 +1 @@
1
- {"version":3,"file":"stage.d.ts","sourceRoot":"","sources":["../src/stage.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AACrD,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,KAAK,EAAE,QAAQ,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AAGpD,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,QAAQ,GAAG,QAAQ,EAAE,CAAC;IACjC,QAAQ,CAAC,EAAE,aAAa,CAAC;IACzB,gEAAgE;IAChE,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,qBAAa,KAAK;IAChB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAa;IACvC,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAgB;IAC1C,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;gBAEvB,OAAO,EAAE,YAAY;IAS3B,GAAG,CACP,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,YAAY,GACzB,OAAO,CAAC,aAAa,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC;YAwBhC,UAAU;CAkBzB;AAUD,8HAA8H;AAE9H,MAAM,WAAW,aAAc,SAAQ,aAAa,CAAC,gBAAgB,CAAC;CAAG"}
1
+ {"version":3,"file":"stage.d.ts","sourceRoot":"","sources":["../src/stage.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAErD,OAAO,KAAK,EAAE,QAAQ,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AAEpD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEjD,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,QAAQ,GAAG,QAAQ,EAAE,CAAC;IACjC,QAAQ,CAAC,EAAE,aAAa,CAAC;IACzB,gEAAgE;IAChE,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,qBAAa,KAAK;IAChB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAa;IACvC,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAgB;IAC1C,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;gBAEvB,OAAO,EAAE,YAAY;IAS3B,GAAG,CACP,OAAO,EAAE,OAAO,EAChB,YAAY,EAAE,YAAY,EAC1B,MAAM,EAAE,MAAM,GACb,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC;YAYjB,mBAAmB;YAuBnB,UAAU;CAkBzB;AAUD,8HAA8H;AAE9H,MAAM,WAAW,aAAc,SAAQ,aAAa,CAAC,gBAAgB,CAAC;CAAG"}
package/dist/stage.js CHANGED
@@ -13,10 +13,16 @@ export class Stage {
13
13
  this.selector = options.selector;
14
14
  this.batchSize = options.batchSize ?? 10;
15
15
  }
16
- async run(dataset, distribution) {
17
- if (!this.selector) {
18
- return this.executeAll(dataset, distribution);
16
+ async run(dataset, distribution, writer) {
17
+ const streams = this.selector
18
+ ? await this.executeWithSelector(dataset, distribution)
19
+ : await this.executeAll(dataset, distribution);
20
+ if (streams instanceof NotSupported) {
21
+ return streams;
19
22
  }
23
+ await writer.write(dataset, mergeStreams(streams));
24
+ }
25
+ async executeWithSelector(dataset, distribution) {
20
26
  const streams = [];
21
27
  for await (const bindings of batch(this.selector, this.batchSize)) {
22
28
  for (const executor of this.executors) {
@@ -31,7 +37,7 @@ export class Stage {
31
37
  if (streams.length === 0) {
32
38
  return new NotSupported('All executors returned NotSupported');
33
39
  }
34
- return mergeStreams(streams);
40
+ return streams;
35
41
  }
36
42
  async executeAll(dataset, distribution) {
37
43
  const streams = [];
@@ -44,7 +50,7 @@ export class Stage {
44
50
  if (streams.length === 0) {
45
51
  return new NotSupported('All executors returned NotSupported');
46
52
  }
47
- return mergeStreams(streams);
53
+ return streams;
48
54
  }
49
55
  }
50
56
  async function* mergeStreams(streams) {
@@ -1,5 +1,5 @@
1
1
  import { Dataset } from '@lde/dataset';
2
- import type { DatasetCore } from '@rdfjs/types';
2
+ import type { Quad } from '@rdfjs/types';
3
3
  import { Writer } from './writer.js';
4
4
  export interface FileWriterOptions {
5
5
  /**
@@ -16,7 +16,7 @@ export declare class FileWriter implements Writer {
16
16
  private readonly outputDir;
17
17
  private readonly format;
18
18
  constructor(options: FileWriterOptions);
19
- write(dataset: Dataset, data: DatasetCore): Promise<void>;
19
+ write(dataset: Dataset, quads: AsyncIterable<Quad>): Promise<void>;
20
20
  private getFilename;
21
21
  private getExtension;
22
22
  }
@@ -1 +1 @@
1
- {"version":3,"file":"fileWriter.d.ts","sourceRoot":"","sources":["../../src/writer/fileWriter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAIhD,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAGrC,MAAM,WAAW,iBAAiB;IAChC;;OAEG;IACH,SAAS,EAAE,MAAM,CAAC;IAClB;;;OAGG;IACH,MAAM,CAAC,EAAE,QAAQ,GAAG,WAAW,GAAG,SAAS,CAAC;CAC7C;AAaD,qBAAa,UAAW,YAAW,MAAM;IACvC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAqC;gBAEhD,OAAO,EAAE,iBAAiB;IAKhC,KAAK,CAAC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC;IAiB/D,OAAO,CAAC,WAAW;IAQnB,OAAO,CAAC,YAAY;CAUrB"}
1
+ {"version":3,"file":"fileWriter.d.ts","sourceRoot":"","sources":["../../src/writer/fileWriter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AAMzC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAErC,MAAM,WAAW,iBAAiB;IAChC;;OAEG;IACH,SAAS,EAAE,MAAM,CAAC;IAClB;;;OAGG;IACH,MAAM,CAAC,EAAE,QAAQ,GAAG,WAAW,GAAG,SAAS,CAAC;CAC7C;AAaD,qBAAa,UAAW,YAAW,MAAM;IACvC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAqC;gBAEhD,OAAO,EAAE,iBAAiB;IAKhC,KAAK,CAAC,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,aAAa,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;IAyBxE,OAAO,CAAC,WAAW;IAQnB,OAAO,CAAC,YAAY;CAUrB"}
@@ -1,9 +1,10 @@
1
- import { mkdir, writeFile } from 'node:fs/promises';
1
+ import { createWriteStream } from 'node:fs';
2
+ import { mkdir } from 'node:fs/promises';
2
3
  import { join, dirname } from 'node:path';
3
4
  import filenamifyUrl from 'filenamify-url';
4
- import { serializeQuads } from './serialize.js';
5
+ import { Writer as N3Writer } from 'n3';
5
6
  /**
6
- * Writes RDF data to files on disk.
7
+ * Streams RDF quads to files on disk using N3 Writer.
7
8
  *
8
9
  * Files are named based on the dataset IRI using filenamify-url.
9
10
  */
@@ -19,17 +20,28 @@ export class FileWriter {
19
20
  this.outputDir = options.outputDir;
20
21
  this.format = options.format ?? 'turtle';
21
22
  }
22
- async write(dataset, data) {
23
- const quads = [...data];
24
- if (quads.length === 0) {
23
+ async write(dataset, quads) {
24
+ // Peek at the first quad to avoid creating empty files.
25
+ const iterator = quads[Symbol.asyncIterator]();
26
+ const first = await iterator.next();
27
+ if (first.done)
25
28
  return;
26
- }
27
- const filename = this.getFilename(dataset);
28
- const filePath = join(this.outputDir, filename);
29
- // Ensure the output directory exists.
29
+ const filePath = join(this.outputDir, this.getFilename(dataset));
30
30
  await mkdir(dirname(filePath), { recursive: true });
31
- const content = await serializeQuads(quads, formatMap[this.format]);
32
- await writeFile(filePath, content, 'utf-8');
31
+ const stream = createWriteStream(filePath);
32
+ const writer = new N3Writer(stream, { format: formatMap[this.format] });
33
+ writer.addQuad(first.value);
34
+ for await (const quad of { [Symbol.asyncIterator]: () => iterator }) {
35
+ writer.addQuad(quad);
36
+ }
37
+ await new Promise((resolve, reject) => {
38
+ writer.end((error) => {
39
+ if (error)
40
+ reject(error);
41
+ else
42
+ resolve();
43
+ });
44
+ });
33
45
  }
34
46
  getFilename(dataset) {
35
47
  const extension = this.getExtension();
@@ -1,5 +1,5 @@
1
1
  import { Dataset } from '@lde/dataset';
2
- import type { DatasetCore } from '@rdfjs/types';
2
+ import type { Quad } from '@rdfjs/types';
3
3
  import { Writer } from './writer.js';
4
4
  export interface SparqlWriterOptions {
5
5
  /**
@@ -21,14 +21,17 @@ export interface SparqlWriterOptions {
21
21
  /**
22
22
  * Writes RDF data to a SPARQL endpoint using SPARQL UPDATE INSERT DATA queries.
23
23
  *
24
- * Each dataset's data is written to a named graph based on the dataset IRI.
24
+ * Clears the named graph before writing, then streams quads in batches
25
+ * to avoid accumulating the entire dataset in memory.
25
26
  */
26
27
  export declare class SparqlUpdateWriter implements Writer {
27
28
  private readonly endpoint;
28
29
  private readonly fetch;
29
30
  private readonly batchSize;
30
31
  constructor(options: SparqlWriterOptions);
31
- write(dataset: Dataset, data: DatasetCore): Promise<void>;
32
+ write(dataset: Dataset, quads: AsyncIterable<Quad>): Promise<void>;
33
+ private clearGraph;
32
34
  private insertBatch;
35
+ private executeUpdate;
33
36
  }
34
37
  //# sourceMappingURL=sparqlUpdateWriter.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"sparqlUpdateWriter.d.ts","sourceRoot":"","sources":["../../src/writer/sparqlUpdateWriter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,OAAO,KAAK,EAAE,WAAW,EAAQ,MAAM,cAAc,CAAC;AACtD,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAGrC,MAAM,WAAW,mBAAmB;IAClC;;OAEG;IACH,QAAQ,EAAE,GAAG,CAAC;IACd;;;OAGG;IACH,KAAK,CAAC,EAAE,OAAO,UAAU,CAAC,KAAK,CAAC;IAChC;;;;OAIG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED;;;;GAIG;AACH,qBAAa,kBAAmB,YAAW,MAAM;IAC/C,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAM;IAC/B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAA0B;IAChD,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;gBAEvB,OAAO,EAAE,mBAAmB;IAMlC,KAAK,CAAC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC;YAejD,WAAW;CAmB1B"}
1
+ {"version":3,"file":"sparqlUpdateWriter.d.ts","sourceRoot":"","sources":["../../src/writer/sparqlUpdateWriter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AAEzC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAGrC,MAAM,WAAW,mBAAmB;IAClC;;OAEG;IACH,QAAQ,EAAE,GAAG,CAAC;IACd;;;OAGG;IACH,KAAK,CAAC,EAAE,OAAO,UAAU,CAAC,KAAK,CAAC;IAChC;;;;OAIG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED;;;;;GAKG;AACH,qBAAa,kBAAmB,YAAW,MAAM;IAC/C,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAM;IAC/B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAA0B;IAChD,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;gBAEvB,OAAO,EAAE,mBAAmB;IAMlC,KAAK,CAAC,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,aAAa,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;YAS1D,UAAU;YAIV,WAAW;YAOX,aAAa;CAgB5B"}
@@ -1,8 +1,10 @@
1
+ import { batch } from '../batch.js';
1
2
  import { serializeQuads } from './serialize.js';
2
3
  /**
3
4
  * Writes RDF data to a SPARQL endpoint using SPARQL UPDATE INSERT DATA queries.
4
5
  *
5
- * Each dataset's data is written to a named graph based on the dataset IRI.
6
+ * Clears the named graph before writing, then streams quads in batches
7
+ * to avoid accumulating the entire dataset in memory.
6
8
  */
7
9
  export class SparqlUpdateWriter {
8
10
  endpoint;
@@ -13,21 +15,21 @@ export class SparqlUpdateWriter {
13
15
  this.fetch = options.fetch ?? globalThis.fetch;
14
16
  this.batchSize = options.batchSize ?? 10000;
15
17
  }
16
- async write(dataset, data) {
18
+ async write(dataset, quads) {
17
19
  const graphUri = dataset.iri.toString();
18
- const quads = [...data];
19
- if (quads.length === 0) {
20
- return;
21
- }
22
- // Process in batches to avoid hitting endpoint size limits.
23
- for (let i = 0; i < quads.length; i += this.batchSize) {
24
- const batch = quads.slice(i, i + this.batchSize);
25
- await this.insertBatch(graphUri, batch);
20
+ await this.clearGraph(graphUri);
21
+ for await (const chunk of batch(quads, this.batchSize)) {
22
+ await this.insertBatch(graphUri, chunk);
26
23
  }
27
24
  }
25
+ async clearGraph(graphUri) {
26
+ await this.executeUpdate(`CLEAR GRAPH <${graphUri}>`);
27
+ }
28
28
  async insertBatch(graphUri, quads) {
29
29
  const turtleData = await serializeQuads(quads, 'N-Triples');
30
- const query = `INSERT DATA { GRAPH <${graphUri}> { ${turtleData} } }`;
30
+ await this.executeUpdate(`INSERT DATA { GRAPH <${graphUri}> { ${turtleData} } }`);
31
+ }
32
+ async executeUpdate(query) {
31
33
  const response = await this.fetch(this.endpoint.toString(), {
32
34
  method: 'POST',
33
35
  headers: {
@@ -1,5 +1,5 @@
1
1
  import { Dataset } from '@lde/dataset';
2
- import type { DatasetCore } from '@rdfjs/types';
2
+ import type { Quad } from '@rdfjs/types';
3
3
  /**
4
4
  * Interface for writing RDF data to a destination.
5
5
  */
@@ -8,8 +8,8 @@ export interface Writer {
8
8
  * Write RDF data for a dataset to the destination.
9
9
  *
10
10
  * @param dataset The dataset metadata
11
- * @param data The RDF data to write
11
+ * @param quads The RDF quads to write
12
12
  */
13
- write(dataset: Dataset, data: DatasetCore): Promise<void>;
13
+ write(dataset: Dataset, quads: AsyncIterable<Quad>): Promise<void>;
14
14
  }
15
15
  //# sourceMappingURL=writer.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"writer.d.ts","sourceRoot":"","sources":["../../src/writer/writer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAEhD;;GAEG;AACH,MAAM,WAAW,MAAM;IACrB;;;;;OAKG;IACH,KAAK,CAAC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;CAC3D"}
1
+ {"version":3,"file":"writer.d.ts","sourceRoot":"","sources":["../../src/writer/writer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AAEzC;;GAEG;AACH,MAAM,WAAW,MAAM;IACrB;;;;;OAKG;IACH,KAAK,CAAC,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,aAAa,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;CACpE"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lde/pipeline",
3
- "version": "0.6.19",
3
+ "version": "0.6.21",
4
4
  "repository": {
5
5
  "url": "https://github.com/ldengine/lde",
6
6
  "directory": "packages/pipeline"