npm - @isidorus/cpu - Versions diffs - 0.0.0-alpha.2 → 0.0.0-alpha.4 - Mend

@isidorus/cpu 0.0.0-alpha.2 → 0.0.0-alpha.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/binding.gyp +1 -1
package/dist/graph.d.ts +19 -12
package/dist/graph.d.ts.map +1 -1
package/dist/graph.js +23 -16
package/dist/graph.js.map +1 -1
package/dist/inference-pool.d.ts +35 -12
package/dist/inference-pool.d.ts.map +1 -1
package/dist/inference-pool.js +226 -259
package/dist/inference-pool.js.map +1 -1
package/dist/tsconfig.tsbuildinfo +1 -1
package/package.json +2 -3
package/prebuilds/darwin-arm64/@isidorus+cpu.node +0 -0
package/prebuilds/linux-x64/@isidorus+cpu.node +0 -0
package/prebuilds/win32-x64/@isidorus+cpu.node +0 -0
package/scripts/test-install.js +1 -1
package/src/native/graph.cc +356 -255
package/src/native/graph.h +2 -0

package/dist/inference-pool.js (changed)

@@ -2,71 +2,101 @@
  * InferencePool — strategy-aware inference execution with CPU affinity.
  *
  * Strategies:
- *
- *   worker-pool  N Workers × Session(intra=1, inter=1)
+ *   worker-pool  N Workers × native Session(intra=1, inter=1)
  *                JS controls parallelism. Each Worker owns one Session.
  *                N concurrent requests run on N cores simultaneously.
  *                Best: small/medium models, high concurrency.
  *
- *   tf-parallel  1 Session × Session(intra=hw, inter=1)
+ *   tf-parallel  1 native Session × (intra=hw−reserveCores, inter=1)
  *                TF's eigen threadpool owns all TF cores for one request.
  *                Concurrent requests queue behind each other.
  *                Best: large models where one matmul fills all cores.
  *
  *   auto         Probe-based selection:
- *                  model < 150 MB                   → worker-pool (no probe)
- *                  model ≥ 150 MB + probeShape       → warm probe → threshold
- *                  model ≥ 150 MB, no probeShape     → tf-parallel (fallback)
+ *                  model < 150 MB                 → worker-pool (no probe)
+ *                  model ≥ 150 MB + probeShape     → warm probe → threshold
+ *                  model ≥ 150 MB, no probeShape   → tf-parallel (fallback)
  *
  * CPU affinity (reserveCores):
- *   reserveCores = R pins TF computation to the LAST (N-R) cores.
- *   The FIRST R cores stay free for the event loop, libuv I/O, opencv, etc.
- *   The fence is applied in OnRunWork immediately before/after TF_SessionRun.
+ *   Pins TF compute to the LAST (N−reserveCores) cores via OS affinity fence
+ *   applied in OnRunWork immediately before/after TF_SessionRun.
+ *
+ * Transport:
+ *   Control plane (state machine) — 4-slot Int32Array over a SharedArrayBuffer.
+ *     Main stores WORK, Worker Atomics.wait()s and observes it. Tiny, exact.
+ *
+ *   Data plane (tensor bytes) — one SharedTensorSegment (jude-map) per slot.
+ *     Main calls seg.write(shape, dtype, bytes) — seqlock write, zero copy.
+ *     Worker calls seg.read()                  — seqlock read, zero copy.
+ *     No postMessage for input data. No SAB size limit concern (control SAB
+ *     is 16 bytes per worker; data segments are sized to the model's input).
+ *     Atomics are slower than seqlocks — jude-map's seqlock handles the data
+ *     plane; Atomics handle only the 4-slot state machine.
  */
 import { Worker, isMainThread, parentPort, workerData } from "worker_threads";
 import { availableParallelism } from "os";
-import { statSync } from "fs";
+import { statSync, readFileSync } from "fs";
 import { performance } from "perf_hooks";
+import { fileURLToPath } from "url";
+import { dirname, join } from "path";
+import nodeGypBuild from "node-gyp-build";
+import { SharedTensorSegment } from "jude-map";
 // ─── Constants ──────────────────────────────────────────────────────────────
 const IDLE = 0;
 const WORK = 1;
 const DONE = 2;
 const SHUTDOWN = 3;
-const CTRL_SLOTS = 4; // Int32 slots per worker in the control SAB
+const CTRL_SLOTS = 4; // state-machine only — IDLE/WORK/DONE/SHUTDOWN
 const SIZE_THRESHOLD_BYTES = 150 * 1024 * 1024; // 150 MB
 const DEFAULT_AUTO_THRESHOLD = 20; // ms
+// Default SharedTensorSegment capacity per Worker slot (bytes).
+// 4 MB covers MobileNetV2 (224×224×3×4 ≈ 602 KB) and ResNet50 single-image
+// inputs with comfortable headroom. Increase via PoolOptions.maxInputBytes
+// for large batch sizes or video frame inputs.
+const DEFAULT_MAX_INPUT_BYTES = 4 * 1024 * 1024;
+// ─── jude-map DType bridge ───────────────────────────────────────────────────
+// TF_DataType integers used by the native Session match jude-map's DType enum
+// values — both mirror the TensorFlow wire format. We cast directly.
+function tfDtypeToJudeMap(dtype) {
+    return dtype;
+}
 // ─── Worker-side logic ──────────────────────────────────────────────────────
 //
-// This block runs when the same file is loaded as a Worker thread.
-// The worker owns exactly one TFSession with intra_op=1, so all parallelism
-// is expressed at the Worker level (N workers = N cores).
+// Runs when this file is loaded by inference-pool-worker.mjs (tsx bootstrap)
+// or by the compiled .js entry as a Worker thread.
 //
-// Control protocol (per-worker Int32Array over a SharedArrayBuffer):
-//   slot[0] = IDLE      → parked, waiting for work
-//             WORK      → main thread has a request ready
-//             DONE      → worker finished, result sent via postMessage
-//             SHUTDOWN  → main thread requests exit
+// Transport:
+//   Control plane  — Int32Array over ctrlSab (4 slots, state machine only)
+//   Data plane     — SharedTensorSegment reconstructed from segSab (jude-map)
 //
-// Message protocol (postMessage, ordered relative to Atomics):
-//   main → worker:  { inputData: Buffer, inputShape: number[], inputDtype: number }
-//   worker → main:  { type: "ready" }
-//                   { type: "result", outputs: TensorValue[], inferenceMs: number }
-//                   { type: "work_error", error: string }
-//                   { type: "shutdown_ack" }
+// Init:
+//   Loads the native addon directly (no import from @isidorus/cpu — that would
+//   re-run ensureTf() and create a circular module reference).
+//   Loads the frozen graph via importGraphDef(readFileSync(modelPath)).
 //
-// Ordering guarantee:
-//   Main posts the input message BEFORE storing WORK + notifying,
-//   so the worker's Atomics.wait wakes AFTER the message is queued.
-//   Node.js buffers port messages until a listener is registered,
-//   so parentPort.once("message", ...) safely receives the queued message.
+// Work loop:
+//   Atomics.wait(ctrl, 0, IDLE)         ← park
+//   const { data, shape, dtype } = seg.read()  ← seqlock read, zero copy
+//   results = await sess.runAsync(feeds, fetches)
+//   Atomics.store(ctrl, 0, DONE) + notify
+//   postMessage({ type: "result", ... })
 if (!isMainThread) {
-    const { ctrlSab, workerIndex, modelPath, inputOp, outputOps } = workerData;
+    const { ctrlSab, segSab, maxInputBytes, workerIndex, modelPath, inputOp, outputOps, reserveCores, } = workerData;
     const ctrl = new Int32Array(ctrlSab, workerIndex * CTRL_SLOTS * 4, CTRL_SLOTS);
-    // ── Init ──────────────────────────────────────────────────────────────────
+    // ── Init ─────────────────────────────────────────────────────────────────
     let sess;
     try {
-        const { TFSession } = await import("jude-tf");
-        sess = await TFSession.loadFrozenGraph(modelPath);
+        // Load the native addon from the package root. Workers inherit
+        // LIBTENSORFLOW_PATH and PATH so the addon finds libtensorflow without
+        // re-running ensureTf().
+        const pkgRoot = join(dirname(fileURLToPath(import.meta.url)), "..", "..");
+        const workerAddon = nodeGypBuild(pkgRoot);
+        const nativeGraph = new workerAddon.Graph();
+        nativeGraph.importGraphDef(readFileSync(modelPath));
+        sess = new workerAddon.Session(nativeGraph, {
+            strategy: "worker-pool", // intra=1, inter=1
+            reserveCores,
+        });
     }
     catch (err) {
         parentPort.postMessage({
@@ -75,72 +105,62 @@ if (!isMainThread) {
         });
         process.exit(1);
     }
+    // Reconstruct the SharedTensorSegment from the SAB passed via workerData.
+    // jude-map segments are SAB-backed — the same underlying memory is accessible
+    // from both the main thread and this Worker.
+    const seg = SharedTensorSegment.fromSharedBuffer(segSab, maxInputBytes);
     Atomics.store(ctrl, 0, IDLE);
     parentPort.postMessage({ type: "ready" });
-    // Reinterpret raw bytes as the correct typed array for jude-tf.
-    // postMessage structured-clone strips the Buffer prototype, so the worker
-    // receives a plain Uint8Array. We reinterpret the underlying bytes as the
-    // correct typed array so jude-tf sees the right TF_DataType.
-    function asTypedArray(buf, dtype) {
-        const ab = buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
-        switch (dtype) {
-            case 1:
-                return new Float32Array(ab); // TF_FLOAT
-            case 2:
-                return new Float64Array(ab); // TF_DOUBLE
-            case 3:
-                return new Int32Array(ab); // TF_INT32
-            case 4:
-                return new Uint8Array(ab); // TF_UINT8
-            case 9:
-                return new BigInt64Array(ab); // TF_INT64
-            default:
-                return new Uint8Array(ab);
-        }
-    }
     // ── Work loop ─────────────────────────────────────────────────────────────
-    // Atomics.wait BLOCKS the worker thread (allowed in Worker threads, not in
-    // the main thread). The block is intentional — the worker is dedicated to
-    // inference and has no other async work to process while parked.
+    // Atomics.wait BLOCKS — intentional. This Worker is dedicated to inference
+    // and has no other async work to process while parked.
     while (true) {
-        // Park until main thread sets ctrl to WORK or SHUTDOWN.
         Atomics.wait(ctrl, 0, IDLE);
         const state = Atomics.load(ctrl, 0);
         if (state === SHUTDOWN) {
             sess.destroy();
+            seg.destroy();
             parentPort.postMessage({ type: "shutdown_ack" });
             break;
         }
         if (state === WORK) {
-            // Receive input data. Main posted it BEFORE storing WORK,
-            // so the message is already queued in the port's buffer.
-            const msg = await new Promise((resolve) => parentPort.once("message", resolve));
             try {
                 const t0 = performance.now();
-                const inputArray = asTypedArray(msg.inputData, msg.inputDtype);
-                const results = await sess.run({ [inputOp]: inputArray }, outputOps);
+                // Zero-copy read — seqlock guarantees a consistent snapshot.
+                // Main thread called seg.write() before storing WORK, so the data
+                // is fully committed before we observe WORK here.
+                const tensor = seg.read();
+                if (!tensor)
+                    throw new Error("seg.read() returned null — segment not written");
+                const feeds = [
+                    {
+                        opName: inputOp,
+                        index: 0,
+                        tensor: {
+                            dtype: tensor.dtype,
+                            shape: tensor.shape,
+                            data: Buffer.from(tensor.data.buffer, tensor.data.byteOffset, tensor.data.byteLength),
+                        },
+                    },
+                ];
+                const fetches = outputOps.map((op) => ({
+                    opName: op,
+                    index: 0,
+                }));
+                // runAsync pushes TF_SessionRun onto the Worker's own libuv thread
+                // pool, keeping the Worker's event loop free for Atomics signalling.
+                const rawOutputs = (await sess.runAsync(feeds, fetches));
                 const inferenceMs = performance.now() - t0;
+                // Signal done before postMessage so main thread can unblock.
                 Atomics.store(ctrl, 0, DONE);
                 Atomics.notify(ctrl, 0, 1);
                 parentPort.postMessage({
                     type: "result",
-                    outputs: outputOps.map((k) => {
-                        const r = results[k];
-                        const view = r.data;
-                        return {
-                            dtype: r.dtype,
-                            shape: r.shape,
-                            // Copy into a Buffer — postMessage will structured-clone it as
-                            // Uint8Array on the receiving end; the main thread wraps it back
-                            // into a Buffer in handleMessage.
-                            data: Buffer.from(view.buffer, view.byteOffset, view.byteLength),
-                        };
-                    }),
+                    outputs: outputOps.map((_, i) => rawOutputs[i]),
                     inferenceMs,
                 });
             }
             catch (err) {
-                console.error(`[worker ${workerIndex}] error:`, err?.stack ?? String(err));
                 Atomics.store(ctrl, 0, DONE);
                 Atomics.notify(ctrl, 0, 1);
                 parentPort.postMessage({
@@ -148,10 +168,7 @@ if (!isMainThread) {
                     error: err?.stack ?? String(err),
                 });
             }
-            // Use compareExchange instead of store — if the main thread wrote
-            // SHUTDOWN between our DONE store and here, don't overwrite it.
-            // Otherwise the next Atomics.wait(ctrl, 0, IDLE) would block forever
-            // waiting for a notify that never comes.
+            // compareExchange: if main wrote SHUTDOWN between DONE and here, keep it.
             Atomics.compareExchange(ctrl, 0, DONE, IDLE);
         }
     }
@@ -162,11 +179,6 @@ export class InferencePool {
     workerSlots;
     queue;
     ctrlSab;
-    // tf-parallel path — exactly one of these pairs is non-null:
-    //   (tfParallelGraph, tfParallelSess) — native @isidorus/cpu Session
-    //                                       uses Graph.getOp() for feed/fetch
-    //   (null, tfParallelSess)            — jude-tf TFSession fallback
-    //                                       uses dict API { [opName]: data }
     tfParallelGraph;
     tfParallelSess;
     tfParallelBusy;
@@ -190,24 +202,34 @@ export class InferencePool {
     }
     // ── Factory ────────────────────────────────────────────────────────────────
     static async create(opts) {
-        // Auto-discover input/output op names if not provided.
-        // Loads the frozen graph once via jude-tf, reads inferred Placeholder
-        // names and sink op names, then destroys the probe session.
+        // ── Auto-discover inputOp / outputOps if not provided ──────────────────
+        // Load the graph once, scan for Placeholder ops (inputs) and sink ops
+        // (outputs whose results nothing else consumes). No jude-tf needed.
         if (!opts.inputOp || !opts.outputOps?.length) {
-            const { TFSession } = await import("jude-tf");
-            const probe = await TFSession.loadFrozenGraph(opts.modelPath);
-            opts.inputOp ??= probe.inputs[0];
-            opts.outputOps ??= probe.outputs;
-            probe.destroy();
-            if (!opts.inputOp)
-                throw new Error(`Could not infer inputOp from ${opts.modelPath}`);
-            if (!opts.outputOps?.length)
-                throw new Error(`Could not infer outputOps from ${opts.modelPath}`);
+            const { getAddon } = await import("./_native.js");
+            const { Graph: GCls } = await import("./graph.js");
+            const addon = getAddon();
+            const g = new GCls(new addon.Graph());
+            g.importGraphDef(readFileSync(opts.modelPath));
+            if (!opts.inputOp) {
+                const placeholders = g.listOpsOfType("Placeholder");
+                if (!placeholders.length)
+                    throw new Error(`No Placeholder ops found in ${opts.modelPath}`);
+                opts.inputOp = placeholders[0];
+            }
+            if (!opts.outputOps?.length) {
+                const sinks = g.listSinkOps();
+                if (!sinks.length)
+                    throw new Error(`No sink ops found in ${opts.modelPath}`);
+                opts.outputOps = sinks;
+            }
+            // g is garbage-collected — no explicit destroy needed for a probe graph.
         }
         const requestedStrategy = opts.strategy ?? "auto";
         const concurrency = opts.concurrency ?? availableParallelism();
         const autoThreshold = opts.autoThresholdMs ?? DEFAULT_AUTO_THRESHOLD;
         const reserveCores = opts.reserveCores ?? 0;
+        const maxInputBytes = opts.maxInputBytes ?? DEFAULT_MAX_INPUT_BYTES;
         let resolved;
         if (requestedStrategy === "worker-pool") {
             resolved = "worker-pool";
@@ -216,7 +238,6 @@ export class InferencePool {
             resolved = "tf-parallel";
         }
         else {
-            // auto — file size as cheap first signal, probe if ambiguous
             const modelBytes = statSync(opts.modelPath).size;
             if (modelBytes < SIZE_THRESHOLD_BYTES) {
                 resolved = "worker-pool";
@@ -227,13 +248,33 @@ export class InferencePool {
                     `threshold, no probeShape → tf-parallel\n`);
             }
             else {
-                // Warm probe with intra=1 — measure single-core inference time
-                const { TFSession } = await import("jude-tf");
-                const probeSess = await TFSession.loadFrozenGraph(opts.modelPath);
-                const probeInput = Buffer.alloc(opts.probeShape.reduce((a, b) => a * b, 1) * 4);
-                await probeSess.runAsync({ [opts.inputOp]: probeInput }, opts.outputOps);
+                // Warm probe via native Session (intra=1) to measure single-core time.
+                const { getAddon } = await import("./_native.js");
+                const { Graph: GCls } = await import("./graph.js");
+                const addon = getAddon();
+                const probeG = new GCls(new addon.Graph());
+                probeG.importGraphDef(readFileSync(opts.modelPath));
+                const probeSess = new addon.Session(probeG._native, {
+                    strategy: "worker-pool",
+                    reserveCores: 0,
+                });
+                const probeElems = opts.probeShape.reduce((a, b) => a * b, 1);
+                const probeInput = Buffer.alloc(probeElems * 4);
+                const probeFeeds = [
+                    {
+                        opName: opts.inputOp,
+                        index: 0,
+                        tensor: { dtype: 1, shape: opts.probeShape, data: probeInput },
+                    },
+                ];
+                const probeFetches = opts.outputOps.map((op) => ({
+                    opName: op,
+                    index: 0,
+                }));
+                // Warmup
+                await probeSess.runAsync(probeFeeds, probeFetches);
                 const t0 = performance.now();
-                await probeSess.runAsync({ [opts.inputOp]: probeInput }, opts.outputOps);
+                await probeSess.runAsync(probeFeeds, probeFetches);
                 const probeMs = performance.now() - t0;
                 probeSess.destroy();
                 resolved = probeMs >= autoThreshold ? "tf-parallel" : "worker-pool";
@@ -242,25 +283,22 @@ export class InferencePool {
             }
         }
         if (reserveCores > 0) {
-            const hw = availableParallelism();
-            const tfCores = Math.max(1, hw - reserveCores);
+            const tfCores = Math.max(1, availableParallelism() - reserveCores);
             process.stderr.write(`[isidorus] CPU affinity: reserving ${reserveCores} core(s), ` +
                 `TF gets ${tfCores} core(s)\n`);
         }
         return resolved === "worker-pool"
-            ? InferencePool.createWorkerPool(opts, concurrency, reserveCores)
+            ? InferencePool.createWorkerPool(opts, concurrency, reserveCores, maxInputBytes)
             : InferencePool.createTfParallel(opts, reserveCores);
     }
     // ── worker-pool init ───────────────────────────────────────────────────────
-    static async createWorkerPool(opts, concurrency, reserveCores) {
+    static async createWorkerPool(opts, concurrency, reserveCores, maxInputBytes) {
         const ctrlSab = new SharedArrayBuffer(concurrency * CTRL_SLOTS * 4);
         const slots = [];
         const startedWorkers = [];
-        // In dev/test we run TypeScript source directly via tsx. Workers don't
-        // inherit --import tsx from the parent, so we use a small .mjs bootstrap
-        // (inference-pool-worker.mjs) that calls register() from tsx/esm/api
-        // before importing this .ts file. In production the compiled .js entry
-        // is used directly with no extra loader needed.
+        // In dev/test we run TypeScript source directly via tsx.
+        // inference-pool-worker.mjs calls register() from tsx/esm/api before
+        // importing this .ts file. In production the compiled .js is used directly.
         const isTsSource = import.meta.url.endsWith(".ts");
         const workerEntry = isTsSource
             ? new URL("./inference-pool-worker.mjs", import.meta.url)
@@ -269,9 +307,20 @@ export class InferencePool {
             for (let i = 0; i < concurrency; i++) {
                 const ctrl = new Int32Array(ctrlSab, i * CTRL_SLOTS * 4, CTRL_SLOTS);
                 Atomics.store(ctrl, 0, IDLE);
+                // One SharedTensorSegment per slot — zero-copy data transport.
+                // createShared() allocates a SharedArrayBuffer backing the seqlock
+                // + data region. The SAB is passed to the Worker so both sides
+                // share the same physical memory with no cross-thread copy.
+                // Must use createShared(), not new SharedTensorSegment() — the
+                // mmap constructor produces a process-local mapping that has no SAB
+                // and cannot be transferred to a Worker via workerData.
+                const seg = SharedTensorSegment.createShared(maxInputBytes);
+                const segSab = seg.sharedBuffer; // the backing SAB
                 const worker = new Worker(workerEntry, {
                     workerData: {
                         ctrlSab,
+                        segSab,
+                        maxInputBytes,
                         workerIndex: i,
                         modelPath: opts.modelPath,
                         inputOp: opts.inputOp,
@@ -291,11 +340,17 @@ export class InferencePool {
                     });
                     worker.once("error", reject);
                 });
-                slots.push({ worker, ctrl, busy: false, resolve: null, reject: null });
+                slots.push({
+                    worker,
+                    ctrl,
+                    seg,
+                    busy: false,
+                    resolve: null,
+                    reject: null,
+                });
             }
         }
         catch (err) {
-            // Terminate any workers that were already started before the failure.
             await Promise.allSettled(startedWorkers.map((w) => w.terminate()));
             throw err;
         }
@@ -316,47 +371,26 @@ export class InferencePool {
     static async createTfParallel(opts, reserveCores) {
         const hw = availableParallelism();
         const tfCores = Math.max(1, hw - reserveCores);
-        // Try the native @isidorus/cpu Session path first (ConfigProto thread
-        // config + OnRunWork affinity fence). Falls back to jude-tf if the addon
-        // hasn't been initialised (e.g. when called from outside @isidorus/cpu).
-        //
-        // We import _native.js rather than "@isidorus/cpu" to avoid a circular
-        // dependency — this file IS part of @isidorus/cpu, so importing the
-        // package entry point would re-run ensureTf() + node-gyp-build.
-        let tfParallelGraph = null;
-        let tfParallelSess = null;
-        try {
-            const { getAddon } = await import("./_native.js");
-            const { readFileSync } = await import("fs");
-            const { Graph: GraphClass } = await import("./graph.js");
-            const { Session: SessionClass } = await import("./session.js");
-            const addon = getAddon();
-            const g = new GraphClass(new addon.Graph());
-            g.importGraphDef(readFileSync(opts.modelPath));
-            tfParallelGraph = g;
-            tfParallelSess = new SessionClass(new addon.Session(g._native, {
-                strategy: "tf-parallel",
-                reserveCores,
-            }));
-            process.stderr.write(`[isidorus] tf-parallel: intra_op=${tfCores} ` +
-                `(${reserveCores} core(s) reserved, native Session)\n`);
-        }
-        catch {
-            // Native addon not available — fall back to jude-tf TFSession.
-            // This path lacks the affinity fence but is otherwise correct.
-            const { TFSession } = await import("jude-tf");
-            tfParallelSess = await TFSession.loadFrozenGraph(opts.modelPath);
-            process.stderr.write(`[isidorus] tf-parallel: intra_op=${tfCores} ` +
-                `(${reserveCores} core(s) reserved, jude-tf fallback)\n`);
-        }
+        const { getAddon } = await import("./_native.js");
+        const { Graph: GCls } = await import("./graph.js");
+        const { Session: SCls } = await import("./session.js");
+        const addon = getAddon();
+        const g = new GCls(new addon.Graph());
+        g.importGraphDef(readFileSync(opts.modelPath));
+        const sess = new SCls(new addon.Session(g._native, {
+            strategy: "tf-parallel",
+            reserveCores,
+        }));
+        process.stderr.write(`[isidorus] tf-parallel: intra_op=${tfCores} ` +
+            `(${reserveCores} core(s) reserved, native Session)\n`);
         return new InferencePool({
             strategy: "tf-parallel",
             reserveCores,
             workerSlots: [],
             queue: [],
             ctrlSab: null,
-            tfParallelGraph,
-            tfParallelSess,
+            tfParallelGraph: g,
+            tfParallelSess: sess,
             modelPath: opts.modelPath,
             inputOp: opts.inputOp,
             outputOps: opts.outputOps,
@@ -371,12 +405,10 @@ export class InferencePool {
     inferWorkerPool(inputBuf, inputShape, inputDtype) {
         return new Promise((resolve, reject) => {
             const slot = this.workerSlots.find((w) => !w.busy);
-            if (slot) {
+            if (slot)
                 this.dispatchToWorker(slot, inputBuf, inputShape, inputDtype, resolve, reject);
-            }
-            else {
+            else
                 this.queue.push({ inputBuf, inputShape, inputDtype, resolve, reject });
-            }
         });
     }
     dispatchToWorker(slot, inputBuf, inputShape, inputDtype, resolve, reject) {
@@ -384,21 +416,13 @@ export class InferencePool {
         slot.busy = true;
         slot.resolve = resolve;
         slot.reject = reject;
-        // ── ORDERING CRITICAL ──────────────────────────────────────────────────
-        // 1. Register the result listener FIRST — before the worker can possibly
-        //    send a response. Node.js buffers port messages until a listener is
-        //    registered, but registering after notify introduces a thread-level
-        //    race where an extremely fast result could be missed.
-        // 2. Post the input data SECOND — the worker awaits this message after
-        //    waking from Atomics.wait, so it must arrive before WORK is stored.
-        // 3. Store WORK + notify LAST — wakes the worker.
+        // Register result listener BEFORE writing data or signalling WORK.
         const handleMessage = (msg) => {
             if (msg.type === "work_error") {
                 this.settleSlot(slot, null, new Error(msg.error));
                 return;
             }
             if (msg.type !== "result") {
-                // Unexpected message type — re-register to wait for the actual result.
                 slot.worker.once("message", handleMessage);
                 return;
             }
@@ -408,26 +432,21 @@ export class InferencePool {
                 outputs: msg.outputs.map((o) => ({
                     dtype: o.dtype,
                     shape: o.shape,
-                    // postMessage structured-clones Buffer as plain Uint8Array —
-                    // wrap it back into a Buffer so callers can use Buffer.isBuffer().
+                    // postMessage structured-clones Buffer as Uint8Array — rewrap.
                     data: Buffer.isBuffer(o.data) ? o.data : Buffer.from(o.data),
                 })),
                 inferenceMs: msg.inferenceMs,
             }, null);
         };
-        // Register listener before waking the worker.
         slot.worker.once("message", handleMessage);
-        // Register a one-shot error listener so an uncaught worker crash rejects
-        // the promise instead of leaving it hanging.
-        slot.worker.once("error", (err) => {
-            this.settleSlot(slot, null, err);
-        });
-        // Post input, then wake worker.
-        slot.worker.postMessage({ inputData: inputBuf, inputShape, inputDtype });
+        slot.worker.once("error", (err) => this.settleSlot(slot, null, err));
+        // Zero-copy write — seqlock ensures the Worker sees a consistent snapshot.
+        // No postMessage for input data. The Worker reads via seg.read() after
+        // observing WORK on the control SAB.
+        slot.seg.write(inputShape, tfDtypeToJudeMap(inputDtype), inputBuf);
         Atomics.store(slot.ctrl, 0, WORK);
         Atomics.notify(slot.ctrl, 0, 1);
     }
-    /** Settle a worker slot's in-flight promise and drain the queue. */
     settleSlot(slot, result, err) {
         const resolve = slot.resolve;
         const reject = slot.reject;
@@ -438,11 +457,9 @@ export class InferencePool {
             reject?.(err);
         else
             resolve?.(result);
-        // Drain one queued request now that the slot is free.
         const next = this.queue.shift();
-        if (next) {
+        if (next)
             this.dispatchToWorker(slot, next.inputBuf, next.inputShape, next.inputDtype, next.resolve, next.reject);
-        }
     }
     // ── tf-parallel path ───────────────────────────────────────────────────────
     inferTfParallel(inputBuf, inputShape, inputDtype) {
@@ -463,100 +480,50 @@ export class InferencePool {
     runTfParallel(inputBuf, inputShape, inputDtype, resolve, reject) {
         this.tfParallelBusy = true;
         const t0 = performance.now();
-        let inferencePromise;
-        if (this.tfParallelGraph) {
-            // ── Native @isidorus/cpu Session path ──────────────────────────────
-            // Build feed/fetch arrays using Graph.getOp() to resolve op names to
-            // Tensor references, which Session.runAsync expects.
-            const g = this.tfParallelGraph;
-            const inputTensor = g.getOp(this.inputOp);
-            if (!inputTensor) {
-                this.tfParallelBusy = false;
-                reject(new Error(`tf-parallel: input op not found in graph: ${this.inputOp}`));
-                return;
-            }
-            const outputTensors = this.outputOps.map((name) => {
-                const t = g.getOp(name);
-                if (!t)
-                    throw new Error(`tf-parallel: output op not found in graph: ${name}`);
-                return t;
-            });
-            // Reinterpret the raw Buffer bytes as the correct TypedArray dtype.
-            // asTypedArray is only defined inside the !isMainThread block, so we
-            // inline the same logic here for the main-thread tf-parallel path.
-            const ab = inputBuf.buffer.slice(inputBuf.byteOffset, inputBuf.byteOffset + inputBuf.byteLength);
-            let typedInput;
-            switch (inputDtype) {
-                case 1:
-                    typedInput = new Float32Array(ab);
-                    break;
-                case 2:
-                    typedInput = new Float64Array(ab);
-                    break;
-                case 3:
-                    typedInput = new Int32Array(ab);
-                    break;
-                case 4:
-                    typedInput = new Uint8Array(ab);
-                    break;
-                case 9:
-                    typedInput = new BigInt64Array(ab);
-                    break;
-                default:
-                    typedInput = new Uint8Array(ab);
-            }
-            const feedValue = {
-                dtype: inputDtype,
-                shape: inputShape,
-                data: Buffer.from(typedInput.buffer, typedInput.byteOffset, typedInput.byteLength),
-            };
-            inferencePromise = this.tfParallelSess.runAsync([[inputTensor, feedValue]], outputTensors).then((outputs) => {
-                // Map back to { [outputKey]: TensorResult } for uniform handling below
-                const result = {};
-                this.outputOps.forEach((key, i) => {
-                    result[key] = outputs[i];
-                });
-                return result;
-            });
-        }
-        else {
-            // ── jude-tf TFSession fallback path ───────────────────────────────
-            inferencePromise = this.tfParallelSess.runAsync({ [this.inputOp]: inputBuf }, this.outputOps);
+        // Native Session feed/fetch format — Graph.getOp() resolves op names to
+        // Tensor descriptors that Session.runAsync() expects.
+        const g = this.tfParallelGraph;
+        const inputTensor = g.getOp(this.inputOp);
+        if (!inputTensor) {
+            this.tfParallelBusy = false;
+            reject(new Error(`tf-parallel: input op not found: ${this.inputOp}`));
+            return;
         }
-        inferencePromise
-            .then((results) => {
+        const outputTensors = this.outputOps.map((name) => {
+            const t = g.getOp(name);
+            if (!t)
+                throw new Error(`tf-parallel: output op not found: ${name}`);
+            return t;
+        });
+        const feedValue = {
+            dtype: inputDtype,
+            shape: inputShape,
+            data: inputBuf,
+        };
+        this.tfParallelSess.runAsync([[inputTensor, feedValue]], outputTensors)
+            .then((outputs) => {
             const inferenceMs = performance.now() - t0;
             this.tfParallelBusy = false;
             resolve({
                 workerId: 0,
                 strategy: "tf-parallel",
-                outputs: this.outputOps.map((k) => {
-                    const r = results[k];
-                    if (!r)
-                        return { dtype: 0, shape: [], data: Buffer.alloc(0) };
-                    const view = r.data;
-                    return {
-                        dtype: r.dtype,
-                        shape: r.shape,
-                        data: Buffer.isBuffer(r.data)
-                            ? r.data
-                            : Buffer.from(view.buffer, view.byteOffset, view.byteLength),
-                    };
-                }),
+                outputs: outputs.map((o) => ({
+                    dtype: o.dtype,
+                    shape: o.shape,
+                    data: Buffer.isBuffer(o.data) ? o.data : Buffer.from(o.data),
+                })),
                 inferenceMs,
             });
             const next = this.tfParallelQueue.shift();
-            if (next) {
+            if (next)
                 this.runTfParallel(next.inputBuf, next.inputShape, next.inputDtype, next.resolve, next.reject);
-            }
         })
             .catch((err) => {
             this.tfParallelBusy = false;
             reject(err);
             const next = this.tfParallelQueue.shift();
-            if (next) {
+            if (next)
                 this.runTfParallel(next.inputBuf, next.inputShape, next.inputDtype, next.resolve, next.reject);
-            }
         });
     }
     // ── Introspection ──────────────────────────────────────────────────────────
@@ -578,17 +545,17 @@ export class InferencePool {
         if (this.strategy === "worker-pool") {
             await Promise.all(this.workerSlots.map((slot) => new Promise((resolve, reject) => {
                 const doShutdown = () => {
-                    // Register shutdown_ack listener before storing SHUTDOWN.
                     slot.worker.once("message", (msg) => {
-                        if (msg.type === "shutdown_ack")
+                        if (msg.type === "shutdown_ack") {
+                            slot.seg.destroy(); // release jude-map segment after Worker exits
                             resolve();
+                        }
                     });
                     slot.worker.once("error", reject);
                     Atomics.store(slot.ctrl, 0, SHUTDOWN);
                     Atomics.notify(slot.ctrl, 0, 1);
                 };
                 if (slot.busy) {
-                    // Wait for the current in-flight request to finish first.
                     const origResolve = slot.resolve;
                     const origReject = slot.reject;
                     slot.resolve = (r) => {