@isidorus/cpu 0.0.0-alpha.2 → 0.0.0-alpha.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,71 +2,101 @@
2
2
  * InferencePool — strategy-aware inference execution with CPU affinity.
3
3
  *
4
4
  * Strategies:
5
- *
6
- * worker-pool N Workers × Session(intra=1, inter=1)
5
+ * worker-pool N Workers × native Session(intra=1, inter=1)
7
6
  * JS controls parallelism. Each Worker owns one Session.
8
7
  * N concurrent requests run on N cores simultaneously.
9
8
  * Best: small/medium models, high concurrency.
10
9
  *
11
- * tf-parallel 1 Session × Session(intra=hw, inter=1)
10
+ * tf-parallel 1 native Session × (intra=hw−reserveCores, inter=1)
12
11
  * TF's eigen threadpool owns all TF cores for one request.
13
12
  * Concurrent requests queue behind each other.
14
13
  * Best: large models where one matmul fills all cores.
15
14
  *
16
15
  * auto Probe-based selection:
17
- * model < 150 MB → worker-pool (no probe)
18
- * model ≥ 150 MB + probeShape → warm probe → threshold
19
- * model ≥ 150 MB, no probeShape → tf-parallel (fallback)
16
+ * model < 150 MB → worker-pool (no probe)
17
+ * model ≥ 150 MB + probeShape → warm probe → threshold
18
+ * model ≥ 150 MB, no probeShape → tf-parallel (fallback)
20
19
  *
21
20
  * CPU affinity (reserveCores):
22
- * reserveCores = R pins TF computation to the LAST (N-R) cores.
23
- * The FIRST R cores stay free for the event loop, libuv I/O, opencv, etc.
24
- * The fence is applied in OnRunWork immediately before/after TF_SessionRun.
21
+ * Pins TF compute to the LAST (N−reserveCores) cores via OS affinity fence
22
+ * applied in OnRunWork immediately before/after TF_SessionRun.
23
+ *
24
+ * Transport:
25
+ * Control plane (state machine) — 4-slot Int32Array over a SharedArrayBuffer.
26
+ * Main stores WORK, Worker Atomics.wait()s and observes it. Tiny, exact.
27
+ *
28
+ * Data plane (tensor bytes) — one SharedTensorSegment (jude-map) per slot.
29
+ * Main calls seg.write(shape, dtype, bytes) — seqlock write, zero copy.
30
+ * Worker calls seg.read() — seqlock read, zero copy.
31
+ * No postMessage for input data. No SAB size limit concern (control SAB
32
+ * is 16 bytes per worker; data segments are sized to the model's input).
33
+ * Atomics are slower than seqlocks — jude-map's seqlock handles the data
34
+ * plane; Atomics handle only the 4-slot state machine.
25
35
  */
26
36
  import { Worker, isMainThread, parentPort, workerData } from "worker_threads";
27
37
  import { availableParallelism } from "os";
28
- import { statSync } from "fs";
38
+ import { statSync, readFileSync } from "fs";
29
39
  import { performance } from "perf_hooks";
40
+ import { fileURLToPath } from "url";
41
+ import { dirname, join } from "path";
42
+ import nodeGypBuild from "node-gyp-build";
43
+ import { SharedTensorSegment } from "jude-map";
30
44
  // ─── Constants ──────────────────────────────────────────────────────────────
31
45
  const IDLE = 0;
32
46
  const WORK = 1;
33
47
  const DONE = 2;
34
48
  const SHUTDOWN = 3;
35
- const CTRL_SLOTS = 4; // Int32 slots per worker in the control SAB
49
+ const CTRL_SLOTS = 4; // state-machine only IDLE/WORK/DONE/SHUTDOWN
36
50
  const SIZE_THRESHOLD_BYTES = 150 * 1024 * 1024; // 150 MB
37
51
  const DEFAULT_AUTO_THRESHOLD = 20; // ms
52
+ // Default SharedTensorSegment capacity per Worker slot (bytes).
53
+ // 4 MB covers MobileNetV2 (224×224×3×4 ≈ 602 KB) and ResNet50 single-image
54
+ // inputs with comfortable headroom. Increase via PoolOptions.maxInputBytes
55
+ // for large batch sizes or video frame inputs.
56
+ const DEFAULT_MAX_INPUT_BYTES = 4 * 1024 * 1024;
57
+ // ─── jude-map DType bridge ───────────────────────────────────────────────────
58
+ // TF_DataType integers used by the native Session match jude-map's DType enum
59
+ // values — both mirror the TensorFlow wire format. We cast directly.
60
+ function tfDtypeToJudeMap(dtype) {
61
+ return dtype;
62
+ }
38
63
  // ─── Worker-side logic ──────────────────────────────────────────────────────
39
64
  //
40
- // This block runs when the same file is loaded as a Worker thread.
41
- // The worker owns exactly one TFSession with intra_op=1, so all parallelism
42
- // is expressed at the Worker level (N workers = N cores).
65
+ // Runs when this file is loaded by inference-pool-worker.mjs (tsx bootstrap)
66
+ // or by the compiled .js entry as a Worker thread.
43
67
  //
44
- // Control protocol (per-worker Int32Array over a SharedArrayBuffer):
45
- // slot[0] = IDLE → parked, waiting for work
46
- // WORK → main thread has a request ready
47
- // DONE → worker finished, result sent via postMessage
48
- // SHUTDOWN → main thread requests exit
68
+ // Transport:
69
+ // Control plane — Int32Array over ctrlSab (4 slots, state machine only)
70
+ // Data plane — SharedTensorSegment reconstructed from segSab (jude-map)
49
71
  //
50
- // Message protocol (postMessage, ordered relative to Atomics):
51
- // main → worker: { inputData: Buffer, inputShape: number[], inputDtype: number }
52
- // worker → main: { type: "ready" }
53
- // { type: "result", outputs: TensorValue[], inferenceMs: number }
54
- // { type: "work_error", error: string }
55
- // { type: "shutdown_ack" }
72
+ // Init:
73
+ // Loads the native addon directly (no import from @isidorus/cpu — that would
74
+ // re-run ensureTf() and create a circular module reference).
75
+ // Loads the frozen graph via importGraphDef(readFileSync(modelPath)).
56
76
  //
57
- // Ordering guarantee:
58
- // Main posts the input message BEFORE storing WORK + notifying,
59
- // so the worker's Atomics.wait wakes AFTER the message is queued.
60
- // Node.js buffers port messages until a listener is registered,
61
- // so parentPort.once("message", ...) safely receives the queued message.
77
+ // Work loop:
78
+ // Atomics.wait(ctrl, 0, IDLE) ← park
79
+ // const { data, shape, dtype } = seg.read() ← seqlock read, zero copy
80
+ // results = await sess.runAsync(feeds, fetches)
81
+ // Atomics.store(ctrl, 0, DONE) + notify
82
+ // postMessage({ type: "result", ... })
62
83
  if (!isMainThread) {
63
- const { ctrlSab, workerIndex, modelPath, inputOp, outputOps } = workerData;
84
+ const { ctrlSab, segSab, maxInputBytes, workerIndex, modelPath, inputOp, outputOps, reserveCores, } = workerData;
64
85
  const ctrl = new Int32Array(ctrlSab, workerIndex * CTRL_SLOTS * 4, CTRL_SLOTS);
65
- // ── Init ──────────────────────────────────────────────────────────────────
86
+ // ── Init ─────────────────────────────────────────────────────────────────
66
87
  let sess;
67
88
  try {
68
- const { TFSession } = await import("jude-tf");
69
- sess = await TFSession.loadFrozenGraph(modelPath);
89
+ // Load the native addon from the package root. Workers inherit
90
+ // LIBTENSORFLOW_PATH and PATH so the addon finds libtensorflow without
91
+ // re-running ensureTf().
92
+ const pkgRoot = join(dirname(fileURLToPath(import.meta.url)), "..", "..");
93
+ const workerAddon = nodeGypBuild(pkgRoot);
94
+ const nativeGraph = new workerAddon.Graph();
95
+ nativeGraph.importGraphDef(readFileSync(modelPath));
96
+ sess = new workerAddon.Session(nativeGraph, {
97
+ strategy: "worker-pool", // intra=1, inter=1
98
+ reserveCores,
99
+ });
70
100
  }
71
101
  catch (err) {
72
102
  parentPort.postMessage({
@@ -75,72 +105,62 @@ if (!isMainThread) {
75
105
  });
76
106
  process.exit(1);
77
107
  }
108
+ // Reconstruct the SharedTensorSegment from the SAB passed via workerData.
109
+ // jude-map segments are SAB-backed — the same underlying memory is accessible
110
+ // from both the main thread and this Worker.
111
+ const seg = SharedTensorSegment.fromSharedBuffer(segSab, maxInputBytes);
78
112
  Atomics.store(ctrl, 0, IDLE);
79
113
  parentPort.postMessage({ type: "ready" });
80
- // Reinterpret raw bytes as the correct typed array for jude-tf.
81
- // postMessage structured-clone strips the Buffer prototype, so the worker
82
- // receives a plain Uint8Array. We reinterpret the underlying bytes as the
83
- // correct typed array so jude-tf sees the right TF_DataType.
84
- function asTypedArray(buf, dtype) {
85
- const ab = buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
86
- switch (dtype) {
87
- case 1:
88
- return new Float32Array(ab); // TF_FLOAT
89
- case 2:
90
- return new Float64Array(ab); // TF_DOUBLE
91
- case 3:
92
- return new Int32Array(ab); // TF_INT32
93
- case 4:
94
- return new Uint8Array(ab); // TF_UINT8
95
- case 9:
96
- return new BigInt64Array(ab); // TF_INT64
97
- default:
98
- return new Uint8Array(ab);
99
- }
100
- }
101
114
  // ── Work loop ─────────────────────────────────────────────────────────────
102
- // Atomics.wait BLOCKS the worker thread (allowed in Worker threads, not in
103
- // the main thread). The block is intentional — the worker is dedicated to
104
- // inference and has no other async work to process while parked.
115
+ // Atomics.wait BLOCKS — intentional. This Worker is dedicated to inference
116
+ // and has no other async work to process while parked.
105
117
  while (true) {
106
- // Park until main thread sets ctrl to WORK or SHUTDOWN.
107
118
  Atomics.wait(ctrl, 0, IDLE);
108
119
  const state = Atomics.load(ctrl, 0);
109
120
  if (state === SHUTDOWN) {
110
121
  sess.destroy();
122
+ seg.destroy();
111
123
  parentPort.postMessage({ type: "shutdown_ack" });
112
124
  break;
113
125
  }
114
126
  if (state === WORK) {
115
- // Receive input data. Main posted it BEFORE storing WORK,
116
- // so the message is already queued in the port's buffer.
117
- const msg = await new Promise((resolve) => parentPort.once("message", resolve));
118
127
  try {
119
128
  const t0 = performance.now();
120
- const inputArray = asTypedArray(msg.inputData, msg.inputDtype);
121
- const results = await sess.run({ [inputOp]: inputArray }, outputOps);
129
+ // Zero-copy read — seqlock guarantees a consistent snapshot.
130
+ // Main thread called seg.write() before storing WORK, so the data
131
+ // is fully committed before we observe WORK here.
132
+ const tensor = seg.read();
133
+ if (!tensor)
134
+ throw new Error("seg.read() returned null — segment not written");
135
+ const feeds = [
136
+ {
137
+ opName: inputOp,
138
+ index: 0,
139
+ tensor: {
140
+ dtype: tensor.dtype,
141
+ shape: tensor.shape,
142
+ data: Buffer.from(tensor.data.buffer, tensor.data.byteOffset, tensor.data.byteLength),
143
+ },
144
+ },
145
+ ];
146
+ const fetches = outputOps.map((op) => ({
147
+ opName: op,
148
+ index: 0,
149
+ }));
150
+ // runAsync pushes TF_SessionRun onto the Worker's own libuv thread
151
+ // pool, keeping the Worker's event loop free for Atomics signalling.
152
+ const rawOutputs = (await sess.runAsync(feeds, fetches));
122
153
  const inferenceMs = performance.now() - t0;
154
+ // Signal done before postMessage so main thread can unblock.
123
155
  Atomics.store(ctrl, 0, DONE);
124
156
  Atomics.notify(ctrl, 0, 1);
125
157
  parentPort.postMessage({
126
158
  type: "result",
127
- outputs: outputOps.map((k) => {
128
- const r = results[k];
129
- const view = r.data;
130
- return {
131
- dtype: r.dtype,
132
- shape: r.shape,
133
- // Copy into a Buffer — postMessage will structured-clone it as
134
- // Uint8Array on the receiving end; the main thread wraps it back
135
- // into a Buffer in handleMessage.
136
- data: Buffer.from(view.buffer, view.byteOffset, view.byteLength),
137
- };
138
- }),
159
+ outputs: outputOps.map((_, i) => rawOutputs[i]),
139
160
  inferenceMs,
140
161
  });
141
162
  }
142
163
  catch (err) {
143
- console.error(`[worker ${workerIndex}] error:`, err?.stack ?? String(err));
144
164
  Atomics.store(ctrl, 0, DONE);
145
165
  Atomics.notify(ctrl, 0, 1);
146
166
  parentPort.postMessage({
@@ -148,10 +168,7 @@ if (!isMainThread) {
148
168
  error: err?.stack ?? String(err),
149
169
  });
150
170
  }
151
- // Use compareExchange instead of store — if the main thread wrote
152
- // SHUTDOWN between our DONE store and here, don't overwrite it.
153
- // Otherwise the next Atomics.wait(ctrl, 0, IDLE) would block forever
154
- // waiting for a notify that never comes.
171
+ // compareExchange: if main wrote SHUTDOWN between DONE and here, keep it.
155
172
  Atomics.compareExchange(ctrl, 0, DONE, IDLE);
156
173
  }
157
174
  }
@@ -162,11 +179,6 @@ export class InferencePool {
162
179
  workerSlots;
163
180
  queue;
164
181
  ctrlSab;
165
- // tf-parallel path — exactly one of these pairs is non-null:
166
- // (tfParallelGraph, tfParallelSess) — native @isidorus/cpu Session
167
- // uses Graph.getOp() for feed/fetch
168
- // (null, tfParallelSess) — jude-tf TFSession fallback
169
- // uses dict API { [opName]: data }
170
182
  tfParallelGraph;
171
183
  tfParallelSess;
172
184
  tfParallelBusy;
@@ -190,24 +202,34 @@ export class InferencePool {
190
202
  }
191
203
  // ── Factory ────────────────────────────────────────────────────────────────
192
204
  static async create(opts) {
193
- // Auto-discover input/output op names if not provided.
194
- // Loads the frozen graph once via jude-tf, reads inferred Placeholder
195
- // names and sink op names, then destroys the probe session.
205
+ // ── Auto-discover inputOp / outputOps if not provided ──────────────────
206
+ // Load the graph once, scan for Placeholder ops (inputs) and sink ops
207
+ // (outputs whose results nothing else consumes). No jude-tf needed.
196
208
  if (!opts.inputOp || !opts.outputOps?.length) {
197
- const { TFSession } = await import("jude-tf");
198
- const probe = await TFSession.loadFrozenGraph(opts.modelPath);
199
- opts.inputOp ??= probe.inputs[0];
200
- opts.outputOps ??= probe.outputs;
201
- probe.destroy();
202
- if (!opts.inputOp)
203
- throw new Error(`Could not infer inputOp from ${opts.modelPath}`);
204
- if (!opts.outputOps?.length)
205
- throw new Error(`Could not infer outputOps from ${opts.modelPath}`);
209
+ const { getAddon } = await import("./_native.js");
210
+ const { Graph: GCls } = await import("./graph.js");
211
+ const addon = getAddon();
212
+ const g = new GCls(new addon.Graph());
213
+ g.importGraphDef(readFileSync(opts.modelPath));
214
+ if (!opts.inputOp) {
215
+ const placeholders = g.listOpsOfType("Placeholder");
216
+ if (!placeholders.length)
217
+ throw new Error(`No Placeholder ops found in ${opts.modelPath}`);
218
+ opts.inputOp = placeholders[0];
219
+ }
220
+ if (!opts.outputOps?.length) {
221
+ const sinks = g.listSinkOps();
222
+ if (!sinks.length)
223
+ throw new Error(`No sink ops found in ${opts.modelPath}`);
224
+ opts.outputOps = sinks;
225
+ }
226
+ // g is garbage-collected — no explicit destroy needed for a probe graph.
206
227
  }
207
228
  const requestedStrategy = opts.strategy ?? "auto";
208
229
  const concurrency = opts.concurrency ?? availableParallelism();
209
230
  const autoThreshold = opts.autoThresholdMs ?? DEFAULT_AUTO_THRESHOLD;
210
231
  const reserveCores = opts.reserveCores ?? 0;
232
+ const maxInputBytes = opts.maxInputBytes ?? DEFAULT_MAX_INPUT_BYTES;
211
233
  let resolved;
212
234
  if (requestedStrategy === "worker-pool") {
213
235
  resolved = "worker-pool";
@@ -216,7 +238,6 @@ export class InferencePool {
216
238
  resolved = "tf-parallel";
217
239
  }
218
240
  else {
219
- // auto — file size as cheap first signal, probe if ambiguous
220
241
  const modelBytes = statSync(opts.modelPath).size;
221
242
  if (modelBytes < SIZE_THRESHOLD_BYTES) {
222
243
  resolved = "worker-pool";
@@ -227,13 +248,33 @@ export class InferencePool {
227
248
  `threshold, no probeShape → tf-parallel\n`);
228
249
  }
229
250
  else {
230
- // Warm probe with intra=1 — measure single-core inference time
231
- const { TFSession } = await import("jude-tf");
232
- const probeSess = await TFSession.loadFrozenGraph(opts.modelPath);
233
- const probeInput = Buffer.alloc(opts.probeShape.reduce((a, b) => a * b, 1) * 4);
234
- await probeSess.runAsync({ [opts.inputOp]: probeInput }, opts.outputOps);
251
+ // Warm probe via native Session (intra=1) to measure single-core time.
252
+ const { getAddon } = await import("./_native.js");
253
+ const { Graph: GCls } = await import("./graph.js");
254
+ const addon = getAddon();
255
+ const probeG = new GCls(new addon.Graph());
256
+ probeG.importGraphDef(readFileSync(opts.modelPath));
257
+ const probeSess = new addon.Session(probeG._native, {
258
+ strategy: "worker-pool",
259
+ reserveCores: 0,
260
+ });
261
+ const probeElems = opts.probeShape.reduce((a, b) => a * b, 1);
262
+ const probeInput = Buffer.alloc(probeElems * 4);
263
+ const probeFeeds = [
264
+ {
265
+ opName: opts.inputOp,
266
+ index: 0,
267
+ tensor: { dtype: 1, shape: opts.probeShape, data: probeInput },
268
+ },
269
+ ];
270
+ const probeFetches = opts.outputOps.map((op) => ({
271
+ opName: op,
272
+ index: 0,
273
+ }));
274
+ // Warmup
275
+ await probeSess.runAsync(probeFeeds, probeFetches);
235
276
  const t0 = performance.now();
236
- await probeSess.runAsync({ [opts.inputOp]: probeInput }, opts.outputOps);
277
+ await probeSess.runAsync(probeFeeds, probeFetches);
237
278
  const probeMs = performance.now() - t0;
238
279
  probeSess.destroy();
239
280
  resolved = probeMs >= autoThreshold ? "tf-parallel" : "worker-pool";
@@ -242,25 +283,22 @@ export class InferencePool {
242
283
  }
243
284
  }
244
285
  if (reserveCores > 0) {
245
- const hw = availableParallelism();
246
- const tfCores = Math.max(1, hw - reserveCores);
286
+ const tfCores = Math.max(1, availableParallelism() - reserveCores);
247
287
  process.stderr.write(`[isidorus] CPU affinity: reserving ${reserveCores} core(s), ` +
248
288
  `TF gets ${tfCores} core(s)\n`);
249
289
  }
250
290
  return resolved === "worker-pool"
251
- ? InferencePool.createWorkerPool(opts, concurrency, reserveCores)
291
+ ? InferencePool.createWorkerPool(opts, concurrency, reserveCores, maxInputBytes)
252
292
  : InferencePool.createTfParallel(opts, reserveCores);
253
293
  }
254
294
  // ── worker-pool init ───────────────────────────────────────────────────────
255
- static async createWorkerPool(opts, concurrency, reserveCores) {
295
+ static async createWorkerPool(opts, concurrency, reserveCores, maxInputBytes) {
256
296
  const ctrlSab = new SharedArrayBuffer(concurrency * CTRL_SLOTS * 4);
257
297
  const slots = [];
258
298
  const startedWorkers = [];
259
- // In dev/test we run TypeScript source directly via tsx. Workers don't
260
- // inherit --import tsx from the parent, so we use a small .mjs bootstrap
261
- // (inference-pool-worker.mjs) that calls register() from tsx/esm/api
262
- // before importing this .ts file. In production the compiled .js entry
263
- // is used directly with no extra loader needed.
299
+ // In dev/test we run TypeScript source directly via tsx.
300
+ // inference-pool-worker.mjs calls register() from tsx/esm/api before
301
+ // importing this .ts file. In production the compiled .js is used directly.
264
302
  const isTsSource = import.meta.url.endsWith(".ts");
265
303
  const workerEntry = isTsSource
266
304
  ? new URL("./inference-pool-worker.mjs", import.meta.url)
@@ -269,9 +307,20 @@ export class InferencePool {
269
307
  for (let i = 0; i < concurrency; i++) {
270
308
  const ctrl = new Int32Array(ctrlSab, i * CTRL_SLOTS * 4, CTRL_SLOTS);
271
309
  Atomics.store(ctrl, 0, IDLE);
310
+ // One SharedTensorSegment per slot — zero-copy data transport.
311
+ // createShared() allocates a SharedArrayBuffer backing the seqlock
312
+ // + data region. The SAB is passed to the Worker so both sides
313
+ // share the same physical memory with no cross-thread copy.
314
+ // Must use createShared(), not new SharedTensorSegment() — the
315
+ // mmap constructor produces a process-local mapping that has no SAB
316
+ // and cannot be transferred to a Worker via workerData.
317
+ const seg = SharedTensorSegment.createShared(maxInputBytes);
318
+ const segSab = seg.sharedBuffer; // the backing SAB
272
319
  const worker = new Worker(workerEntry, {
273
320
  workerData: {
274
321
  ctrlSab,
322
+ segSab,
323
+ maxInputBytes,
275
324
  workerIndex: i,
276
325
  modelPath: opts.modelPath,
277
326
  inputOp: opts.inputOp,
@@ -291,11 +340,17 @@ export class InferencePool {
291
340
  });
292
341
  worker.once("error", reject);
293
342
  });
294
- slots.push({ worker, ctrl, busy: false, resolve: null, reject: null });
343
+ slots.push({
344
+ worker,
345
+ ctrl,
346
+ seg,
347
+ busy: false,
348
+ resolve: null,
349
+ reject: null,
350
+ });
295
351
  }
296
352
  }
297
353
  catch (err) {
298
- // Terminate any workers that were already started before the failure.
299
354
  await Promise.allSettled(startedWorkers.map((w) => w.terminate()));
300
355
  throw err;
301
356
  }
@@ -316,47 +371,26 @@ export class InferencePool {
316
371
  static async createTfParallel(opts, reserveCores) {
317
372
  const hw = availableParallelism();
318
373
  const tfCores = Math.max(1, hw - reserveCores);
319
- // Try the native @isidorus/cpu Session path first (ConfigProto thread
320
- // config + OnRunWork affinity fence). Falls back to jude-tf if the addon
321
- // hasn't been initialised (e.g. when called from outside @isidorus/cpu).
322
- //
323
- // We import _native.js rather than "@isidorus/cpu" to avoid a circular
324
- // dependency — this file IS part of @isidorus/cpu, so importing the
325
- // package entry point would re-run ensureTf() + node-gyp-build.
326
- let tfParallelGraph = null;
327
- let tfParallelSess = null;
328
- try {
329
- const { getAddon } = await import("./_native.js");
330
- const { readFileSync } = await import("fs");
331
- const { Graph: GraphClass } = await import("./graph.js");
332
- const { Session: SessionClass } = await import("./session.js");
333
- const addon = getAddon();
334
- const g = new GraphClass(new addon.Graph());
335
- g.importGraphDef(readFileSync(opts.modelPath));
336
- tfParallelGraph = g;
337
- tfParallelSess = new SessionClass(new addon.Session(g._native, {
338
- strategy: "tf-parallel",
339
- reserveCores,
340
- }));
341
- process.stderr.write(`[isidorus] tf-parallel: intra_op=${tfCores} ` +
342
- `(${reserveCores} core(s) reserved, native Session)\n`);
343
- }
344
- catch {
345
- // Native addon not available — fall back to jude-tf TFSession.
346
- // This path lacks the affinity fence but is otherwise correct.
347
- const { TFSession } = await import("jude-tf");
348
- tfParallelSess = await TFSession.loadFrozenGraph(opts.modelPath);
349
- process.stderr.write(`[isidorus] tf-parallel: intra_op=${tfCores} ` +
350
- `(${reserveCores} core(s) reserved, jude-tf fallback)\n`);
351
- }
374
+ const { getAddon } = await import("./_native.js");
375
+ const { Graph: GCls } = await import("./graph.js");
376
+ const { Session: SCls } = await import("./session.js");
377
+ const addon = getAddon();
378
+ const g = new GCls(new addon.Graph());
379
+ g.importGraphDef(readFileSync(opts.modelPath));
380
+ const sess = new SCls(new addon.Session(g._native, {
381
+ strategy: "tf-parallel",
382
+ reserveCores,
383
+ }));
384
+ process.stderr.write(`[isidorus] tf-parallel: intra_op=${tfCores} ` +
385
+ `(${reserveCores} core(s) reserved, native Session)\n`);
352
386
  return new InferencePool({
353
387
  strategy: "tf-parallel",
354
388
  reserveCores,
355
389
  workerSlots: [],
356
390
  queue: [],
357
391
  ctrlSab: null,
358
- tfParallelGraph,
359
- tfParallelSess,
392
+ tfParallelGraph: g,
393
+ tfParallelSess: sess,
360
394
  modelPath: opts.modelPath,
361
395
  inputOp: opts.inputOp,
362
396
  outputOps: opts.outputOps,
@@ -371,12 +405,10 @@ export class InferencePool {
371
405
  inferWorkerPool(inputBuf, inputShape, inputDtype) {
372
406
  return new Promise((resolve, reject) => {
373
407
  const slot = this.workerSlots.find((w) => !w.busy);
374
- if (slot) {
408
+ if (slot)
375
409
  this.dispatchToWorker(slot, inputBuf, inputShape, inputDtype, resolve, reject);
376
- }
377
- else {
410
+ else
378
411
  this.queue.push({ inputBuf, inputShape, inputDtype, resolve, reject });
379
- }
380
412
  });
381
413
  }
382
414
  dispatchToWorker(slot, inputBuf, inputShape, inputDtype, resolve, reject) {
@@ -384,21 +416,13 @@ export class InferencePool {
384
416
  slot.busy = true;
385
417
  slot.resolve = resolve;
386
418
  slot.reject = reject;
387
- // ── ORDERING CRITICAL ──────────────────────────────────────────────────
388
- // 1. Register the result listener FIRST — before the worker can possibly
389
- // send a response. Node.js buffers port messages until a listener is
390
- // registered, but registering after notify introduces a thread-level
391
- // race where an extremely fast result could be missed.
392
- // 2. Post the input data SECOND — the worker awaits this message after
393
- // waking from Atomics.wait, so it must arrive before WORK is stored.
394
- // 3. Store WORK + notify LAST — wakes the worker.
419
+ // Register result listener BEFORE writing data or signalling WORK.
395
420
  const handleMessage = (msg) => {
396
421
  if (msg.type === "work_error") {
397
422
  this.settleSlot(slot, null, new Error(msg.error));
398
423
  return;
399
424
  }
400
425
  if (msg.type !== "result") {
401
- // Unexpected message type — re-register to wait for the actual result.
402
426
  slot.worker.once("message", handleMessage);
403
427
  return;
404
428
  }
@@ -408,26 +432,21 @@ export class InferencePool {
408
432
  outputs: msg.outputs.map((o) => ({
409
433
  dtype: o.dtype,
410
434
  shape: o.shape,
411
- // postMessage structured-clones Buffer as plain Uint8Array —
412
- // wrap it back into a Buffer so callers can use Buffer.isBuffer().
435
+ // postMessage structured-clones Buffer as Uint8Array — rewrap.
413
436
  data: Buffer.isBuffer(o.data) ? o.data : Buffer.from(o.data),
414
437
  })),
415
438
  inferenceMs: msg.inferenceMs,
416
439
  }, null);
417
440
  };
418
- // Register listener before waking the worker.
419
441
  slot.worker.once("message", handleMessage);
420
- // Register a one-shot error listener so an uncaught worker crash rejects
421
- // the promise instead of leaving it hanging.
422
- slot.worker.once("error", (err) => {
423
- this.settleSlot(slot, null, err);
424
- });
425
- // Post input, then wake worker.
426
- slot.worker.postMessage({ inputData: inputBuf, inputShape, inputDtype });
442
+ slot.worker.once("error", (err) => this.settleSlot(slot, null, err));
443
+ // Zero-copy write — seqlock ensures the Worker sees a consistent snapshot.
444
+ // No postMessage for input data. The Worker reads via seg.read() after
445
+ // observing WORK on the control SAB.
446
+ slot.seg.write(inputShape, tfDtypeToJudeMap(inputDtype), inputBuf);
427
447
  Atomics.store(slot.ctrl, 0, WORK);
428
448
  Atomics.notify(slot.ctrl, 0, 1);
429
449
  }
430
- /** Settle a worker slot's in-flight promise and drain the queue. */
431
450
  settleSlot(slot, result, err) {
432
451
  const resolve = slot.resolve;
433
452
  const reject = slot.reject;
@@ -438,11 +457,9 @@ export class InferencePool {
438
457
  reject?.(err);
439
458
  else
440
459
  resolve?.(result);
441
- // Drain one queued request now that the slot is free.
442
460
  const next = this.queue.shift();
443
- if (next) {
461
+ if (next)
444
462
  this.dispatchToWorker(slot, next.inputBuf, next.inputShape, next.inputDtype, next.resolve, next.reject);
445
- }
446
463
  }
447
464
  // ── tf-parallel path ───────────────────────────────────────────────────────
448
465
  inferTfParallel(inputBuf, inputShape, inputDtype) {
@@ -463,100 +480,50 @@ export class InferencePool {
463
480
  runTfParallel(inputBuf, inputShape, inputDtype, resolve, reject) {
464
481
  this.tfParallelBusy = true;
465
482
  const t0 = performance.now();
466
- let inferencePromise;
467
- if (this.tfParallelGraph) {
468
- // ── Native @isidorus/cpu Session path ──────────────────────────────
469
- // Build feed/fetch arrays using Graph.getOp() to resolve op names to
470
- // Tensor references, which Session.runAsync expects.
471
- const g = this.tfParallelGraph;
472
- const inputTensor = g.getOp(this.inputOp);
473
- if (!inputTensor) {
474
- this.tfParallelBusy = false;
475
- reject(new Error(`tf-parallel: input op not found in graph: ${this.inputOp}`));
476
- return;
477
- }
478
- const outputTensors = this.outputOps.map((name) => {
479
- const t = g.getOp(name);
480
- if (!t)
481
- throw new Error(`tf-parallel: output op not found in graph: ${name}`);
482
- return t;
483
- });
484
- // Reinterpret the raw Buffer bytes as the correct TypedArray dtype.
485
- // asTypedArray is only defined inside the !isMainThread block, so we
486
- // inline the same logic here for the main-thread tf-parallel path.
487
- const ab = inputBuf.buffer.slice(inputBuf.byteOffset, inputBuf.byteOffset + inputBuf.byteLength);
488
- let typedInput;
489
- switch (inputDtype) {
490
- case 1:
491
- typedInput = new Float32Array(ab);
492
- break;
493
- case 2:
494
- typedInput = new Float64Array(ab);
495
- break;
496
- case 3:
497
- typedInput = new Int32Array(ab);
498
- break;
499
- case 4:
500
- typedInput = new Uint8Array(ab);
501
- break;
502
- case 9:
503
- typedInput = new BigInt64Array(ab);
504
- break;
505
- default:
506
- typedInput = new Uint8Array(ab);
507
- }
508
- const feedValue = {
509
- dtype: inputDtype,
510
- shape: inputShape,
511
- data: Buffer.from(typedInput.buffer, typedInput.byteOffset, typedInput.byteLength),
512
- };
513
- inferencePromise = this.tfParallelSess.runAsync([[inputTensor, feedValue]], outputTensors).then((outputs) => {
514
- // Map back to { [outputKey]: TensorResult } for uniform handling below
515
- const result = {};
516
- this.outputOps.forEach((key, i) => {
517
- result[key] = outputs[i];
518
- });
519
- return result;
520
- });
521
- }
522
- else {
523
- // ── jude-tf TFSession fallback path ───────────────────────────────
524
- inferencePromise = this.tfParallelSess.runAsync({ [this.inputOp]: inputBuf }, this.outputOps);
483
+ // Native Session feed/fetch format — Graph.getOp() resolves op names to
484
+ // Tensor descriptors that Session.runAsync() expects.
485
+ const g = this.tfParallelGraph;
486
+ const inputTensor = g.getOp(this.inputOp);
487
+ if (!inputTensor) {
488
+ this.tfParallelBusy = false;
489
+ reject(new Error(`tf-parallel: input op not found: ${this.inputOp}`));
490
+ return;
525
491
  }
526
- inferencePromise
527
- .then((results) => {
492
+ const outputTensors = this.outputOps.map((name) => {
493
+ const t = g.getOp(name);
494
+ if (!t)
495
+ throw new Error(`tf-parallel: output op not found: ${name}`);
496
+ return t;
497
+ });
498
+ const feedValue = {
499
+ dtype: inputDtype,
500
+ shape: inputShape,
501
+ data: inputBuf,
502
+ };
503
+ this.tfParallelSess.runAsync([[inputTensor, feedValue]], outputTensors)
504
+ .then((outputs) => {
528
505
  const inferenceMs = performance.now() - t0;
529
506
  this.tfParallelBusy = false;
530
507
  resolve({
531
508
  workerId: 0,
532
509
  strategy: "tf-parallel",
533
- outputs: this.outputOps.map((k) => {
534
- const r = results[k];
535
- if (!r)
536
- return { dtype: 0, shape: [], data: Buffer.alloc(0) };
537
- const view = r.data;
538
- return {
539
- dtype: r.dtype,
540
- shape: r.shape,
541
- data: Buffer.isBuffer(r.data)
542
- ? r.data
543
- : Buffer.from(view.buffer, view.byteOffset, view.byteLength),
544
- };
545
- }),
510
+ outputs: outputs.map((o) => ({
511
+ dtype: o.dtype,
512
+ shape: o.shape,
513
+ data: Buffer.isBuffer(o.data) ? o.data : Buffer.from(o.data),
514
+ })),
546
515
  inferenceMs,
547
516
  });
548
517
  const next = this.tfParallelQueue.shift();
549
- if (next) {
518
+ if (next)
550
519
  this.runTfParallel(next.inputBuf, next.inputShape, next.inputDtype, next.resolve, next.reject);
551
- }
552
520
  })
553
521
  .catch((err) => {
554
522
  this.tfParallelBusy = false;
555
523
  reject(err);
556
524
  const next = this.tfParallelQueue.shift();
557
- if (next) {
525
+ if (next)
558
526
  this.runTfParallel(next.inputBuf, next.inputShape, next.inputDtype, next.resolve, next.reject);
559
- }
560
527
  });
561
528
  }
562
529
  // ── Introspection ──────────────────────────────────────────────────────────
@@ -578,17 +545,17 @@ export class InferencePool {
578
545
  if (this.strategy === "worker-pool") {
579
546
  await Promise.all(this.workerSlots.map((slot) => new Promise((resolve, reject) => {
580
547
  const doShutdown = () => {
581
- // Register shutdown_ack listener before storing SHUTDOWN.
582
548
  slot.worker.once("message", (msg) => {
583
- if (msg.type === "shutdown_ack")
549
+ if (msg.type === "shutdown_ack") {
550
+ slot.seg.destroy(); // release jude-map segment after Worker exits
584
551
  resolve();
552
+ }
585
553
  });
586
554
  slot.worker.once("error", reject);
587
555
  Atomics.store(slot.ctrl, 0, SHUTDOWN);
588
556
  Atomics.notify(slot.ctrl, 0, 1);
589
557
  };
590
558
  if (slot.busy) {
591
- // Wait for the current in-flight request to finish first.
592
559
  const origResolve = slot.resolve;
593
560
  const origReject = slot.reject;
594
561
  slot.resolve = (r) => {