@gscdump/engine 0.26.9 → 0.27.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,8 @@
1
1
  import { arrowToRows } from "../arrow-utils.mjs";
2
2
  import { createRequire } from "node:module";
3
3
  import { join } from "node:path";
4
- import process from "node:process";
5
4
  import { fileURLToPath } from "node:url";
5
+ import process from "node:process";
6
6
  import { unlinkSync } from "node:fs";
7
7
  import { tmpdir } from "node:os";
8
8
  import { ConsoleLogger, NODE_RUNTIME, VoidLogger, createDuckDB } from "@duckdb/duckdb-wasm/dist/duckdb-node-blocking.cjs";
@@ -8,6 +8,7 @@ interface LocalIcebergSinkFullOptions extends LocalIcebergSinkOptions {
8
8
  /** Override the writer-script path. Defaults to `scripts/iceberg-writer.py`. */
9
9
  writerScript?: string;
10
10
  }
11
+ /** Run the PyIceberg writer subprocess for one job, return its parsed result. */
11
12
  interface LocalIcebergSink extends Sink {
12
13
  /** The catalog namespace the 5 tables live under. */
13
14
  readonly namespace: string;
@@ -1,44 +1,47 @@
1
1
  import { ICEBERG_SCHEMAS } from "./_chunks/schema2.mjs";
2
- import { execFile } from "node:child_process";
3
2
  import { dirname, join } from "node:path";
4
- import process from "node:process";
5
3
  import { fileURLToPath } from "node:url";
6
- const POC_S3 = {
7
- endpoint: "localhost:9100",
8
- accessKeyId: "poc",
9
- secretAccessKey: "pocpocpoc",
10
- region: "us-east-1"
11
- };
12
- function resolveWriterScript(override) {
13
- if (override) return override;
14
- return join(dirname(fileURLToPath(import.meta.url)), "..", "..", "scripts", "iceberg-writer.py");
4
+ import process from "node:process";
5
+ function resolvePyIcebergPython(override) {
6
+ return override ?? process.env["GSCDUMP_ICEBERG_PYTHON"] ?? "python3";
15
7
  }
16
- function runWriter(python, script, job) {
8
+ async function runPyIcebergWriter(options) {
9
+ const { execFile } = await import("node:child_process");
17
10
  return new Promise((resolve, reject) => {
18
- execFile(python, [script], { maxBuffer: 64 * 1024 * 1024 }, (err, stdout, stderr) => {
11
+ execFile(options.python, [options.script], { maxBuffer: 64 * 1024 * 1024 }, (err, stdout, stderr) => {
19
12
  let parsed;
20
13
  if (stdout.trim()) try {
21
14
  parsed = JSON.parse(stdout);
22
15
  } catch {}
23
- if (parsed?.error) {
24
- reject(/* @__PURE__ */ new Error(`LocalIcebergSink writer failed: ${parsed.error}`));
16
+ if (parsed && !(err && options.rejectOnProcessError)) {
17
+ resolve(parsed);
25
18
  return;
26
19
  }
27
20
  if (err) {
28
- reject(/* @__PURE__ */ new Error(`LocalIcebergSink writer process failed (${err.message})${stderr ? `: ${stderr}` : ""}`));
21
+ if (options.processErrorAsParseFailure) {
22
+ reject(/* @__PURE__ */ new Error(`${options.label} produced no parseable output (${err.message})${stderr ? `: ${stderr}` : ""}`));
23
+ return;
24
+ }
25
+ reject(/* @__PURE__ */ new Error(`${options.label} process failed (${err.message})${stderr ? `: ${stderr}` : ""}`));
29
26
  return;
30
27
  }
31
- if (!parsed) {
32
- reject(/* @__PURE__ */ new Error(`LocalIcebergSink writer produced no parseable output: ${stdout || stderr}`));
33
- return;
34
- }
35
- resolve(parsed);
36
- }).stdin?.end(JSON.stringify(job));
28
+ reject(/* @__PURE__ */ new Error(`${options.label} produced no parseable output: ${stdout || stderr}`));
29
+ }).stdin?.end(JSON.stringify(options.job));
37
30
  });
38
31
  }
32
+ const POC_S3 = {
33
+ endpoint: "localhost:9100",
34
+ accessKeyId: "poc",
35
+ secretAccessKey: "pocpocpoc",
36
+ region: "us-east-1"
37
+ };
38
+ function resolveWriterScript(override) {
39
+ if (override) return override;
40
+ return join(dirname(fileURLToPath(import.meta.url)), "..", "..", "scripts", "iceberg-writer.py");
41
+ }
39
42
  function createLocalIcebergSink(options) {
40
43
  const s3 = options.s3 ?? POC_S3;
41
- const python = options.python ?? process.env.GSCDUMP_ICEBERG_PYTHON ?? "python3";
44
+ const python = resolvePyIcebergPython(options.python);
42
45
  const script = resolveWriterScript(options.writerScript);
43
46
  function buildJob(op, slice, rows) {
44
47
  return {
@@ -61,7 +64,14 @@ function createLocalIcebergSink(options) {
61
64
  capabilities: { appendOnly: true },
62
65
  async emit(slice, rows) {
63
66
  if (rows.length === 0) return { rowCount: 0 };
64
- const res = await runWriter(python, script, buildJob("emit", slice, rows));
67
+ const res = await runPyIcebergWriter({
68
+ python,
69
+ script,
70
+ job: buildJob("emit", slice, rows),
71
+ label: "LocalIcebergSink writer",
72
+ rejectOnProcessError: true
73
+ });
74
+ if (res.error) throw new Error(`LocalIcebergSink writer failed: ${res.error}`);
65
75
  touched.add(slice.table);
66
76
  return { rowCount: res.rowCount ?? 0 };
67
77
  },
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.26.9",
4
+ "version": "0.27.0",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -190,8 +190,8 @@
190
190
  "drizzle-orm": "1.0.0-rc.3",
191
191
  "icebird": "^0.8.10",
192
192
  "proper-lockfile": "^4.1.2",
193
- "gscdump": "0.26.9",
194
- "@gscdump/contracts": "0.26.9"
193
+ "@gscdump/contracts": "0.27.0",
194
+ "gscdump": "0.27.0"
195
195
  },
196
196
  "devDependencies": {
197
197
  "@duckdb/duckdb-wasm": "^1.32.0",
@@ -200,7 +200,7 @@
200
200
  "hyparquet": "^1.26.0",
201
201
  "hyparquet-writer": "^0.15.6",
202
202
  "tsx": "^4.22.4",
203
- "vitest": "^4.1.8"
203
+ "vitest": "^4.1.9"
204
204
  },
205
205
  "scripts": {
206
206
  "dev": "obuild --stub",