@isidorus/cpu 0.0.0-alpha.2 → 0.0.0-alpha.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +1 -1
- package/dist/graph.d.ts +19 -12
- package/dist/graph.d.ts.map +1 -1
- package/dist/graph.js +23 -16
- package/dist/graph.js.map +1 -1
- package/dist/inference-pool.d.ts +35 -12
- package/dist/inference-pool.d.ts.map +1 -1
- package/dist/inference-pool.js +226 -259
- package/dist/inference-pool.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +2 -3
- package/prebuilds/darwin-arm64/@isidorus+cpu.node +0 -0
- package/prebuilds/linux-x64/@isidorus+cpu.node +0 -0
- package/prebuilds/win32-x64/@isidorus+cpu.node +0 -0
- package/scripts/test-install.js +1 -1
- package/src/native/graph.cc +356 -255
- package/src/native/graph.h +2 -0
package/dist/inference-pool.js
CHANGED
|
@@ -2,71 +2,101 @@
|
|
|
2
2
|
* InferencePool — strategy-aware inference execution with CPU affinity.
|
|
3
3
|
*
|
|
4
4
|
* Strategies:
|
|
5
|
-
*
|
|
6
|
-
* worker-pool N Workers × Session(intra=1, inter=1)
|
|
5
|
+
* worker-pool N Workers × native Session(intra=1, inter=1)
|
|
7
6
|
* JS controls parallelism. Each Worker owns one Session.
|
|
8
7
|
* N concurrent requests run on N cores simultaneously.
|
|
9
8
|
* Best: small/medium models, high concurrency.
|
|
10
9
|
*
|
|
11
|
-
* tf-parallel 1 Session ×
|
|
10
|
+
* tf-parallel 1 native Session × (intra=hw−reserveCores, inter=1)
|
|
12
11
|
* TF's eigen threadpool owns all TF cores for one request.
|
|
13
12
|
* Concurrent requests queue behind each other.
|
|
14
13
|
* Best: large models where one matmul fills all cores.
|
|
15
14
|
*
|
|
16
15
|
* auto Probe-based selection:
|
|
17
|
-
* model < 150 MB
|
|
18
|
-
* model ≥ 150 MB + probeShape
|
|
19
|
-
* model ≥ 150 MB, no probeShape
|
|
16
|
+
* model < 150 MB → worker-pool (no probe)
|
|
17
|
+
* model ≥ 150 MB + probeShape → warm probe → threshold
|
|
18
|
+
* model ≥ 150 MB, no probeShape → tf-parallel (fallback)
|
|
20
19
|
*
|
|
21
20
|
* CPU affinity (reserveCores):
|
|
22
|
-
*
|
|
23
|
-
*
|
|
24
|
-
*
|
|
21
|
+
* Pins TF compute to the LAST (N−reserveCores) cores via OS affinity fence
|
|
22
|
+
* applied in OnRunWork immediately before/after TF_SessionRun.
|
|
23
|
+
*
|
|
24
|
+
* Transport:
|
|
25
|
+
* Control plane (state machine) — 4-slot Int32Array over a SharedArrayBuffer.
|
|
26
|
+
* Main stores WORK, Worker Atomics.wait()s and observes it. Tiny, exact.
|
|
27
|
+
*
|
|
28
|
+
* Data plane (tensor bytes) — one SharedTensorSegment (jude-map) per slot.
|
|
29
|
+
* Main calls seg.write(shape, dtype, bytes) — seqlock write, zero copy.
|
|
30
|
+
* Worker calls seg.read() — seqlock read, zero copy.
|
|
31
|
+
* No postMessage for input data. No SAB size limit concern (control SAB
|
|
32
|
+
* is 16 bytes per worker; data segments are sized to the model's input).
|
|
33
|
+
* Atomics are slower than seqlocks — jude-map's seqlock handles the data
|
|
34
|
+
* plane; Atomics handle only the 4-slot state machine.
|
|
25
35
|
*/
|
|
26
36
|
import { Worker, isMainThread, parentPort, workerData } from "worker_threads";
|
|
27
37
|
import { availableParallelism } from "os";
|
|
28
|
-
import { statSync } from "fs";
|
|
38
|
+
import { statSync, readFileSync } from "fs";
|
|
29
39
|
import { performance } from "perf_hooks";
|
|
40
|
+
import { fileURLToPath } from "url";
|
|
41
|
+
import { dirname, join } from "path";
|
|
42
|
+
import nodeGypBuild from "node-gyp-build";
|
|
43
|
+
import { SharedTensorSegment } from "jude-map";
|
|
30
44
|
// ─── Constants ──────────────────────────────────────────────────────────────
|
|
31
45
|
const IDLE = 0;
|
|
32
46
|
const WORK = 1;
|
|
33
47
|
const DONE = 2;
|
|
34
48
|
const SHUTDOWN = 3;
|
|
35
|
-
const CTRL_SLOTS = 4; //
|
|
49
|
+
const CTRL_SLOTS = 4; // state-machine only — IDLE/WORK/DONE/SHUTDOWN
|
|
36
50
|
const SIZE_THRESHOLD_BYTES = 150 * 1024 * 1024; // 150 MB
|
|
37
51
|
const DEFAULT_AUTO_THRESHOLD = 20; // ms
|
|
52
|
+
// Default SharedTensorSegment capacity per Worker slot (bytes).
|
|
53
|
+
// 4 MB covers MobileNetV2 (224×224×3×4 ≈ 602 KB) and ResNet50 single-image
|
|
54
|
+
// inputs with comfortable headroom. Increase via PoolOptions.maxInputBytes
|
|
55
|
+
// for large batch sizes or video frame inputs.
|
|
56
|
+
const DEFAULT_MAX_INPUT_BYTES = 4 * 1024 * 1024;
|
|
57
|
+
// ─── jude-map DType bridge ───────────────────────────────────────────────────
|
|
58
|
+
// TF_DataType integers used by the native Session match jude-map's DType enum
|
|
59
|
+
// values — both mirror the TensorFlow wire format. We cast directly.
|
|
60
|
+
function tfDtypeToJudeMap(dtype) {
|
|
61
|
+
return dtype;
|
|
62
|
+
}
|
|
38
63
|
// ─── Worker-side logic ──────────────────────────────────────────────────────
|
|
39
64
|
//
|
|
40
|
-
//
|
|
41
|
-
//
|
|
42
|
-
// is expressed at the Worker level (N workers = N cores).
|
|
65
|
+
// Runs when this file is loaded by inference-pool-worker.mjs (tsx bootstrap)
|
|
66
|
+
// or by the compiled .js entry as a Worker thread.
|
|
43
67
|
//
|
|
44
|
-
//
|
|
45
|
-
//
|
|
46
|
-
//
|
|
47
|
-
// DONE → worker finished, result sent via postMessage
|
|
48
|
-
// SHUTDOWN → main thread requests exit
|
|
68
|
+
// Transport:
|
|
69
|
+
// Control plane — Int32Array over ctrlSab (4 slots, state machine only)
|
|
70
|
+
// Data plane — SharedTensorSegment reconstructed from segSab (jude-map)
|
|
49
71
|
//
|
|
50
|
-
//
|
|
51
|
-
//
|
|
52
|
-
//
|
|
53
|
-
//
|
|
54
|
-
// { type: "work_error", error: string }
|
|
55
|
-
// { type: "shutdown_ack" }
|
|
72
|
+
// Init:
|
|
73
|
+
// Loads the native addon directly (no import from @isidorus/cpu — that would
|
|
74
|
+
// re-run ensureTf() and create a circular module reference).
|
|
75
|
+
// Loads the frozen graph via importGraphDef(readFileSync(modelPath)).
|
|
56
76
|
//
|
|
57
|
-
//
|
|
58
|
-
//
|
|
59
|
-
//
|
|
60
|
-
//
|
|
61
|
-
//
|
|
77
|
+
// Work loop:
|
|
78
|
+
// Atomics.wait(ctrl, 0, IDLE) ← park
|
|
79
|
+
// const { data, shape, dtype } = seg.read() ← seqlock read, zero copy
|
|
80
|
+
// results = await sess.runAsync(feeds, fetches)
|
|
81
|
+
// Atomics.store(ctrl, 0, DONE) + notify
|
|
82
|
+
// postMessage({ type: "result", ... })
|
|
62
83
|
if (!isMainThread) {
|
|
63
|
-
const { ctrlSab, workerIndex, modelPath, inputOp, outputOps } = workerData;
|
|
84
|
+
const { ctrlSab, segSab, maxInputBytes, workerIndex, modelPath, inputOp, outputOps, reserveCores, } = workerData;
|
|
64
85
|
const ctrl = new Int32Array(ctrlSab, workerIndex * CTRL_SLOTS * 4, CTRL_SLOTS);
|
|
65
|
-
// ── Init
|
|
86
|
+
// ── Init ─────────────────────────────────────────────────────────────────
|
|
66
87
|
let sess;
|
|
67
88
|
try {
|
|
68
|
-
|
|
69
|
-
|
|
89
|
+
// Load the native addon from the package root. Workers inherit
|
|
90
|
+
// LIBTENSORFLOW_PATH and PATH so the addon finds libtensorflow without
|
|
91
|
+
// re-running ensureTf().
|
|
92
|
+
const pkgRoot = join(dirname(fileURLToPath(import.meta.url)), "..", "..");
|
|
93
|
+
const workerAddon = nodeGypBuild(pkgRoot);
|
|
94
|
+
const nativeGraph = new workerAddon.Graph();
|
|
95
|
+
nativeGraph.importGraphDef(readFileSync(modelPath));
|
|
96
|
+
sess = new workerAddon.Session(nativeGraph, {
|
|
97
|
+
strategy: "worker-pool", // intra=1, inter=1
|
|
98
|
+
reserveCores,
|
|
99
|
+
});
|
|
70
100
|
}
|
|
71
101
|
catch (err) {
|
|
72
102
|
parentPort.postMessage({
|
|
@@ -75,72 +105,62 @@ if (!isMainThread) {
|
|
|
75
105
|
});
|
|
76
106
|
process.exit(1);
|
|
77
107
|
}
|
|
108
|
+
// Reconstruct the SharedTensorSegment from the SAB passed via workerData.
|
|
109
|
+
// jude-map segments are SAB-backed — the same underlying memory is accessible
|
|
110
|
+
// from both the main thread and this Worker.
|
|
111
|
+
const seg = SharedTensorSegment.fromSharedBuffer(segSab, maxInputBytes);
|
|
78
112
|
Atomics.store(ctrl, 0, IDLE);
|
|
79
113
|
parentPort.postMessage({ type: "ready" });
|
|
80
|
-
// Reinterpret raw bytes as the correct typed array for jude-tf.
|
|
81
|
-
// postMessage structured-clone strips the Buffer prototype, so the worker
|
|
82
|
-
// receives a plain Uint8Array. We reinterpret the underlying bytes as the
|
|
83
|
-
// correct typed array so jude-tf sees the right TF_DataType.
|
|
84
|
-
function asTypedArray(buf, dtype) {
|
|
85
|
-
const ab = buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
86
|
-
switch (dtype) {
|
|
87
|
-
case 1:
|
|
88
|
-
return new Float32Array(ab); // TF_FLOAT
|
|
89
|
-
case 2:
|
|
90
|
-
return new Float64Array(ab); // TF_DOUBLE
|
|
91
|
-
case 3:
|
|
92
|
-
return new Int32Array(ab); // TF_INT32
|
|
93
|
-
case 4:
|
|
94
|
-
return new Uint8Array(ab); // TF_UINT8
|
|
95
|
-
case 9:
|
|
96
|
-
return new BigInt64Array(ab); // TF_INT64
|
|
97
|
-
default:
|
|
98
|
-
return new Uint8Array(ab);
|
|
99
|
-
}
|
|
100
|
-
}
|
|
101
114
|
// ── Work loop ─────────────────────────────────────────────────────────────
|
|
102
|
-
// Atomics.wait BLOCKS
|
|
103
|
-
//
|
|
104
|
-
// inference and has no other async work to process while parked.
|
|
115
|
+
// Atomics.wait BLOCKS — intentional. This Worker is dedicated to inference
|
|
116
|
+
// and has no other async work to process while parked.
|
|
105
117
|
while (true) {
|
|
106
|
-
// Park until main thread sets ctrl to WORK or SHUTDOWN.
|
|
107
118
|
Atomics.wait(ctrl, 0, IDLE);
|
|
108
119
|
const state = Atomics.load(ctrl, 0);
|
|
109
120
|
if (state === SHUTDOWN) {
|
|
110
121
|
sess.destroy();
|
|
122
|
+
seg.destroy();
|
|
111
123
|
parentPort.postMessage({ type: "shutdown_ack" });
|
|
112
124
|
break;
|
|
113
125
|
}
|
|
114
126
|
if (state === WORK) {
|
|
115
|
-
// Receive input data. Main posted it BEFORE storing WORK,
|
|
116
|
-
// so the message is already queued in the port's buffer.
|
|
117
|
-
const msg = await new Promise((resolve) => parentPort.once("message", resolve));
|
|
118
127
|
try {
|
|
119
128
|
const t0 = performance.now();
|
|
120
|
-
|
|
121
|
-
|
|
129
|
+
// Zero-copy read — seqlock guarantees a consistent snapshot.
|
|
130
|
+
// Main thread called seg.write() before storing WORK, so the data
|
|
131
|
+
// is fully committed before we observe WORK here.
|
|
132
|
+
const tensor = seg.read();
|
|
133
|
+
if (!tensor)
|
|
134
|
+
throw new Error("seg.read() returned null — segment not written");
|
|
135
|
+
const feeds = [
|
|
136
|
+
{
|
|
137
|
+
opName: inputOp,
|
|
138
|
+
index: 0,
|
|
139
|
+
tensor: {
|
|
140
|
+
dtype: tensor.dtype,
|
|
141
|
+
shape: tensor.shape,
|
|
142
|
+
data: Buffer.from(tensor.data.buffer, tensor.data.byteOffset, tensor.data.byteLength),
|
|
143
|
+
},
|
|
144
|
+
},
|
|
145
|
+
];
|
|
146
|
+
const fetches = outputOps.map((op) => ({
|
|
147
|
+
opName: op,
|
|
148
|
+
index: 0,
|
|
149
|
+
}));
|
|
150
|
+
// runAsync pushes TF_SessionRun onto libuv's thread pool (process-wide,
|
|
151
|
+
// shared by all Workers), keeping this Worker's event loop free for Atomics signalling.
|
|
152
|
+
const rawOutputs = (await sess.runAsync(feeds, fetches));
|
|
122
153
|
const inferenceMs = performance.now() - t0;
|
|
154
|
+
// Signal done before postMessage so main thread can unblock.
|
|
123
155
|
Atomics.store(ctrl, 0, DONE);
|
|
124
156
|
Atomics.notify(ctrl, 0, 1);
|
|
125
157
|
parentPort.postMessage({
|
|
126
158
|
type: "result",
|
|
127
|
-
outputs: outputOps.map((
|
|
128
|
-
const r = results[k];
|
|
129
|
-
const view = r.data;
|
|
130
|
-
return {
|
|
131
|
-
dtype: r.dtype,
|
|
132
|
-
shape: r.shape,
|
|
133
|
-
// Copy into a Buffer — postMessage will structured-clone it as
|
|
134
|
-
// Uint8Array on the receiving end; the main thread wraps it back
|
|
135
|
-
// into a Buffer in handleMessage.
|
|
136
|
-
data: Buffer.from(view.buffer, view.byteOffset, view.byteLength),
|
|
137
|
-
};
|
|
138
|
-
}),
|
|
159
|
+
outputs: outputOps.map((_, i) => rawOutputs[i]),
|
|
139
160
|
inferenceMs,
|
|
140
161
|
});
|
|
141
162
|
}
|
|
142
163
|
catch (err) {
|
|
143
|
-
console.error(`[worker ${workerIndex}] error:`, err?.stack ?? String(err));
|
|
144
164
|
Atomics.store(ctrl, 0, DONE);
|
|
145
165
|
Atomics.notify(ctrl, 0, 1);
|
|
146
166
|
parentPort.postMessage({
|
|
@@ -148,10 +168,7 @@ if (!isMainThread) {
|
|
|
148
168
|
error: err?.stack ?? String(err),
|
|
149
169
|
});
|
|
150
170
|
}
|
|
151
|
-
//
|
|
152
|
-
// SHUTDOWN between our DONE store and here, don't overwrite it.
|
|
153
|
-
// Otherwise the next Atomics.wait(ctrl, 0, IDLE) would block forever
|
|
154
|
-
// waiting for a notify that never comes.
|
|
171
|
+
// compareExchange: if main wrote SHUTDOWN between DONE and here, keep it.
|
|
155
172
|
Atomics.compareExchange(ctrl, 0, DONE, IDLE);
|
|
156
173
|
}
|
|
157
174
|
}
|
|
@@ -162,11 +179,6 @@ export class InferencePool {
|
|
|
162
179
|
workerSlots;
|
|
163
180
|
queue;
|
|
164
181
|
ctrlSab;
|
|
165
|
-
// tf-parallel path — exactly one of these pairs is non-null:
|
|
166
|
-
// (tfParallelGraph, tfParallelSess) — native @isidorus/cpu Session
|
|
167
|
-
// uses Graph.getOp() for feed/fetch
|
|
168
|
-
// (null, tfParallelSess) — jude-tf TFSession fallback
|
|
169
|
-
// uses dict API { [opName]: data }
|
|
170
182
|
tfParallelGraph;
|
|
171
183
|
tfParallelSess;
|
|
172
184
|
tfParallelBusy;
|
|
@@ -190,24 +202,34 @@ export class InferencePool {
|
|
|
190
202
|
}
|
|
191
203
|
// ── Factory ────────────────────────────────────────────────────────────────
|
|
192
204
|
static async create(opts) {
|
|
193
|
-
// Auto-discover
|
|
194
|
-
//
|
|
195
|
-
//
|
|
205
|
+
// ── Auto-discover inputOp / outputOps if not provided ──────────────────
|
|
206
|
+
// Load the graph once, scan for Placeholder ops (inputs) and sink ops
|
|
207
|
+
// (outputs whose results nothing else consumes). No jude-tf needed.
|
|
196
208
|
if (!opts.inputOp || !opts.outputOps?.length) {
|
|
197
|
-
const {
|
|
198
|
-
const
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
if (!opts.inputOp)
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
209
|
+
const { getAddon } = await import("./_native.js");
|
|
210
|
+
const { Graph: GCls } = await import("./graph.js");
|
|
211
|
+
const addon = getAddon();
|
|
212
|
+
const g = new GCls(new addon.Graph());
|
|
213
|
+
g.importGraphDef(readFileSync(opts.modelPath));
|
|
214
|
+
if (!opts.inputOp) {
|
|
215
|
+
const placeholders = g.listOpsOfType("Placeholder");
|
|
216
|
+
if (!placeholders.length)
|
|
217
|
+
throw new Error(`No Placeholder ops found in ${opts.modelPath}`);
|
|
218
|
+
opts.inputOp = placeholders[0];
|
|
219
|
+
}
|
|
220
|
+
if (!opts.outputOps?.length) {
|
|
221
|
+
const sinks = g.listSinkOps();
|
|
222
|
+
if (!sinks.length)
|
|
223
|
+
throw new Error(`No sink ops found in ${opts.modelPath}`);
|
|
224
|
+
opts.outputOps = sinks;
|
|
225
|
+
}
|
|
226
|
+
// g is garbage-collected — no explicit destroy needed for a probe graph.
|
|
206
227
|
}
|
|
207
228
|
const requestedStrategy = opts.strategy ?? "auto";
|
|
208
229
|
const concurrency = opts.concurrency ?? availableParallelism();
|
|
209
230
|
const autoThreshold = opts.autoThresholdMs ?? DEFAULT_AUTO_THRESHOLD;
|
|
210
231
|
const reserveCores = opts.reserveCores ?? 0;
|
|
232
|
+
const maxInputBytes = opts.maxInputBytes ?? DEFAULT_MAX_INPUT_BYTES;
|
|
211
233
|
let resolved;
|
|
212
234
|
if (requestedStrategy === "worker-pool") {
|
|
213
235
|
resolved = "worker-pool";
|
|
@@ -216,7 +238,6 @@ export class InferencePool {
|
|
|
216
238
|
resolved = "tf-parallel";
|
|
217
239
|
}
|
|
218
240
|
else {
|
|
219
|
-
// auto — file size as cheap first signal, probe if ambiguous
|
|
220
241
|
const modelBytes = statSync(opts.modelPath).size;
|
|
221
242
|
if (modelBytes < SIZE_THRESHOLD_BYTES) {
|
|
222
243
|
resolved = "worker-pool";
|
|
@@ -227,13 +248,33 @@ export class InferencePool {
|
|
|
227
248
|
`threshold, no probeShape → tf-parallel\n`);
|
|
228
249
|
}
|
|
229
250
|
else {
|
|
230
|
-
// Warm probe
|
|
231
|
-
const {
|
|
232
|
-
const
|
|
233
|
-
const
|
|
234
|
-
|
|
251
|
+
// Warm probe via native Session (intra=1) to measure single-core time.
|
|
252
|
+
const { getAddon } = await import("./_native.js");
|
|
253
|
+
const { Graph: GCls } = await import("./graph.js");
|
|
254
|
+
const addon = getAddon();
|
|
255
|
+
const probeG = new GCls(new addon.Graph());
|
|
256
|
+
probeG.importGraphDef(readFileSync(opts.modelPath));
|
|
257
|
+
const probeSess = new addon.Session(probeG._native, {
|
|
258
|
+
strategy: "worker-pool",
|
|
259
|
+
reserveCores: 0,
|
|
260
|
+
});
|
|
261
|
+
const probeElems = opts.probeShape.reduce((a, b) => a * b, 1);
|
|
262
|
+
const probeInput = Buffer.alloc(probeElems * 4);
|
|
263
|
+
const probeFeeds = [
|
|
264
|
+
{
|
|
265
|
+
opName: opts.inputOp,
|
|
266
|
+
index: 0,
|
|
267
|
+
tensor: { dtype: 1, shape: opts.probeShape, data: probeInput },
|
|
268
|
+
},
|
|
269
|
+
];
|
|
270
|
+
const probeFetches = opts.outputOps.map((op) => ({
|
|
271
|
+
opName: op,
|
|
272
|
+
index: 0,
|
|
273
|
+
}));
|
|
274
|
+
// Warmup
|
|
275
|
+
await probeSess.runAsync(probeFeeds, probeFetches);
|
|
235
276
|
const t0 = performance.now();
|
|
236
|
-
await probeSess.runAsync(
|
|
277
|
+
await probeSess.runAsync(probeFeeds, probeFetches);
|
|
237
278
|
const probeMs = performance.now() - t0;
|
|
238
279
|
probeSess.destroy();
|
|
239
280
|
resolved = probeMs >= autoThreshold ? "tf-parallel" : "worker-pool";
|
|
@@ -242,25 +283,22 @@ export class InferencePool {
|
|
|
242
283
|
}
|
|
243
284
|
}
|
|
244
285
|
if (reserveCores > 0) {
|
|
245
|
-
const
|
|
246
|
-
const tfCores = Math.max(1, hw - reserveCores);
|
|
286
|
+
const tfCores = Math.max(1, availableParallelism() - reserveCores);
|
|
247
287
|
process.stderr.write(`[isidorus] CPU affinity: reserving ${reserveCores} core(s), ` +
|
|
248
288
|
`TF gets ${tfCores} core(s)\n`);
|
|
249
289
|
}
|
|
250
290
|
return resolved === "worker-pool"
|
|
251
|
-
? InferencePool.createWorkerPool(opts, concurrency, reserveCores)
|
|
291
|
+
? InferencePool.createWorkerPool(opts, concurrency, reserveCores, maxInputBytes)
|
|
252
292
|
: InferencePool.createTfParallel(opts, reserveCores);
|
|
253
293
|
}
|
|
254
294
|
// ── worker-pool init ───────────────────────────────────────────────────────
|
|
255
|
-
static async createWorkerPool(opts, concurrency, reserveCores) {
|
|
295
|
+
static async createWorkerPool(opts, concurrency, reserveCores, maxInputBytes) {
|
|
256
296
|
const ctrlSab = new SharedArrayBuffer(concurrency * CTRL_SLOTS * 4);
|
|
257
297
|
const slots = [];
|
|
258
298
|
const startedWorkers = [];
|
|
259
|
-
// In dev/test we run TypeScript source directly via tsx.
|
|
260
|
-
//
|
|
261
|
-
//
|
|
262
|
-
// before importing this .ts file. In production the compiled .js entry
|
|
263
|
-
// is used directly with no extra loader needed.
|
|
299
|
+
// In dev/test we run TypeScript source directly via tsx.
|
|
300
|
+
// inference-pool-worker.mjs calls register() from tsx/esm/api before
|
|
301
|
+
// importing this .ts file. In production the compiled .js is used directly.
|
|
264
302
|
const isTsSource = import.meta.url.endsWith(".ts");
|
|
265
303
|
const workerEntry = isTsSource
|
|
266
304
|
? new URL("./inference-pool-worker.mjs", import.meta.url)
|
|
@@ -269,9 +307,20 @@ export class InferencePool {
|
|
|
269
307
|
for (let i = 0; i < concurrency; i++) {
|
|
270
308
|
const ctrl = new Int32Array(ctrlSab, i * CTRL_SLOTS * 4, CTRL_SLOTS);
|
|
271
309
|
Atomics.store(ctrl, 0, IDLE);
|
|
310
|
+
// One SharedTensorSegment per slot — zero-copy data transport.
|
|
311
|
+
// createShared() allocates a SharedArrayBuffer backing the seqlock
|
|
312
|
+
// + data region. The SAB is passed to the Worker so both sides
|
|
313
|
+
// share the same physical memory with no cross-thread copy.
|
|
314
|
+
// Must use createShared(), not new SharedTensorSegment() — the
|
|
315
|
+
// mmap constructor produces a process-local mapping that has no SAB
|
|
316
|
+
// and cannot be transferred to a Worker via workerData.
|
|
317
|
+
const seg = SharedTensorSegment.createShared(maxInputBytes);
|
|
318
|
+
const segSab = seg.sharedBuffer; // the backing SAB
|
|
272
319
|
const worker = new Worker(workerEntry, {
|
|
273
320
|
workerData: {
|
|
274
321
|
ctrlSab,
|
|
322
|
+
segSab,
|
|
323
|
+
maxInputBytes,
|
|
275
324
|
workerIndex: i,
|
|
276
325
|
modelPath: opts.modelPath,
|
|
277
326
|
inputOp: opts.inputOp,
|
|
@@ -291,11 +340,17 @@ export class InferencePool {
|
|
|
291
340
|
});
|
|
292
341
|
worker.once("error", reject);
|
|
293
342
|
});
|
|
294
|
-
slots.push({
|
|
343
|
+
slots.push({
|
|
344
|
+
worker,
|
|
345
|
+
ctrl,
|
|
346
|
+
seg,
|
|
347
|
+
busy: false,
|
|
348
|
+
resolve: null,
|
|
349
|
+
reject: null,
|
|
350
|
+
});
|
|
295
351
|
}
|
|
296
352
|
}
|
|
297
353
|
catch (err) {
|
|
298
|
-
// Terminate any workers that were already started before the failure.
|
|
299
354
|
await Promise.allSettled(startedWorkers.map((w) => w.terminate()));
|
|
300
355
|
throw err;
|
|
301
356
|
}
|
|
@@ -316,47 +371,26 @@ export class InferencePool {
|
|
|
316
371
|
static async createTfParallel(opts, reserveCores) {
|
|
317
372
|
const hw = availableParallelism();
|
|
318
373
|
const tfCores = Math.max(1, hw - reserveCores);
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
const { Graph: GraphClass } = await import("./graph.js");
|
|
332
|
-
const { Session: SessionClass } = await import("./session.js");
|
|
333
|
-
const addon = getAddon();
|
|
334
|
-
const g = new GraphClass(new addon.Graph());
|
|
335
|
-
g.importGraphDef(readFileSync(opts.modelPath));
|
|
336
|
-
tfParallelGraph = g;
|
|
337
|
-
tfParallelSess = new SessionClass(new addon.Session(g._native, {
|
|
338
|
-
strategy: "tf-parallel",
|
|
339
|
-
reserveCores,
|
|
340
|
-
}));
|
|
341
|
-
process.stderr.write(`[isidorus] tf-parallel: intra_op=${tfCores} ` +
|
|
342
|
-
`(${reserveCores} core(s) reserved, native Session)\n`);
|
|
343
|
-
}
|
|
344
|
-
catch {
|
|
345
|
-
// Native addon not available — fall back to jude-tf TFSession.
|
|
346
|
-
// This path lacks the affinity fence but is otherwise correct.
|
|
347
|
-
const { TFSession } = await import("jude-tf");
|
|
348
|
-
tfParallelSess = await TFSession.loadFrozenGraph(opts.modelPath);
|
|
349
|
-
process.stderr.write(`[isidorus] tf-parallel: intra_op=${tfCores} ` +
|
|
350
|
-
`(${reserveCores} core(s) reserved, jude-tf fallback)\n`);
|
|
351
|
-
}
|
|
374
|
+
const { getAddon } = await import("./_native.js");
|
|
375
|
+
const { Graph: GCls } = await import("./graph.js");
|
|
376
|
+
const { Session: SCls } = await import("./session.js");
|
|
377
|
+
const addon = getAddon();
|
|
378
|
+
const g = new GCls(new addon.Graph());
|
|
379
|
+
g.importGraphDef(readFileSync(opts.modelPath));
|
|
380
|
+
const sess = new SCls(new addon.Session(g._native, {
|
|
381
|
+
strategy: "tf-parallel",
|
|
382
|
+
reserveCores,
|
|
383
|
+
}));
|
|
384
|
+
process.stderr.write(`[isidorus] tf-parallel: intra_op=${tfCores} ` +
|
|
385
|
+
`(${reserveCores} core(s) reserved, native Session)\n`);
|
|
352
386
|
return new InferencePool({
|
|
353
387
|
strategy: "tf-parallel",
|
|
354
388
|
reserveCores,
|
|
355
389
|
workerSlots: [],
|
|
356
390
|
queue: [],
|
|
357
391
|
ctrlSab: null,
|
|
358
|
-
tfParallelGraph,
|
|
359
|
-
tfParallelSess,
|
|
392
|
+
tfParallelGraph: g,
|
|
393
|
+
tfParallelSess: sess,
|
|
360
394
|
modelPath: opts.modelPath,
|
|
361
395
|
inputOp: opts.inputOp,
|
|
362
396
|
outputOps: opts.outputOps,
|
|
@@ -371,12 +405,10 @@ export class InferencePool {
|
|
|
371
405
|
inferWorkerPool(inputBuf, inputShape, inputDtype) {
|
|
372
406
|
return new Promise((resolve, reject) => {
|
|
373
407
|
const slot = this.workerSlots.find((w) => !w.busy);
|
|
374
|
-
if (slot)
|
|
408
|
+
if (slot)
|
|
375
409
|
this.dispatchToWorker(slot, inputBuf, inputShape, inputDtype, resolve, reject);
|
|
376
|
-
|
|
377
|
-
else {
|
|
410
|
+
else
|
|
378
411
|
this.queue.push({ inputBuf, inputShape, inputDtype, resolve, reject });
|
|
379
|
-
}
|
|
380
412
|
});
|
|
381
413
|
}
|
|
382
414
|
dispatchToWorker(slot, inputBuf, inputShape, inputDtype, resolve, reject) {
|
|
@@ -384,21 +416,13 @@ export class InferencePool {
|
|
|
384
416
|
slot.busy = true;
|
|
385
417
|
slot.resolve = resolve;
|
|
386
418
|
slot.reject = reject;
|
|
387
|
-
//
|
|
388
|
-
// 1. Register the result listener FIRST — before the worker can possibly
|
|
389
|
-
// send a response. Node.js buffers port messages until a listener is
|
|
390
|
-
// registered, but registering after notify introduces a thread-level
|
|
391
|
-
// race where an extremely fast result could be missed.
|
|
392
|
-
// 2. Post the input data SECOND — the worker awaits this message after
|
|
393
|
-
// waking from Atomics.wait, so it must arrive before WORK is stored.
|
|
394
|
-
// 3. Store WORK + notify LAST — wakes the worker.
|
|
419
|
+
// Register result listener BEFORE writing data or signalling WORK.
|
|
395
420
|
const handleMessage = (msg) => {
|
|
396
421
|
if (msg.type === "work_error") {
|
|
397
422
|
this.settleSlot(slot, null, new Error(msg.error));
|
|
398
423
|
return;
|
|
399
424
|
}
|
|
400
425
|
if (msg.type !== "result") {
|
|
401
|
-
// Unexpected message type — re-register to wait for the actual result.
|
|
402
426
|
slot.worker.once("message", handleMessage);
|
|
403
427
|
return;
|
|
404
428
|
}
|
|
@@ -408,26 +432,21 @@ export class InferencePool {
|
|
|
408
432
|
outputs: msg.outputs.map((o) => ({
|
|
409
433
|
dtype: o.dtype,
|
|
410
434
|
shape: o.shape,
|
|
411
|
-
// postMessage structured-clones Buffer as
|
|
412
|
-
// wrap it back into a Buffer so callers can use Buffer.isBuffer().
|
|
435
|
+
// postMessage structured-clones Buffer as Uint8Array — rewrap.
|
|
413
436
|
data: Buffer.isBuffer(o.data) ? o.data : Buffer.from(o.data),
|
|
414
437
|
})),
|
|
415
438
|
inferenceMs: msg.inferenceMs,
|
|
416
439
|
}, null);
|
|
417
440
|
};
|
|
418
|
-
// Register listener before waking the worker.
|
|
419
441
|
slot.worker.once("message", handleMessage);
|
|
420
|
-
|
|
421
|
-
// the
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
// Post input, then wake worker.
|
|
426
|
-
slot.worker.postMessage({ inputData: inputBuf, inputShape, inputDtype });
|
|
442
|
+
slot.worker.once("error", (err) => this.settleSlot(slot, null, err));
|
|
443
|
+
// Zero-copy write — seqlock ensures the Worker sees a consistent snapshot.
|
|
444
|
+
// No postMessage for input data. The Worker reads via seg.read() after
|
|
445
|
+
// observing WORK on the control SAB.
|
|
446
|
+
slot.seg.write(inputShape, tfDtypeToJudeMap(inputDtype), inputBuf);
|
|
427
447
|
Atomics.store(slot.ctrl, 0, WORK);
|
|
428
448
|
Atomics.notify(slot.ctrl, 0, 1);
|
|
429
449
|
}
|
|
430
|
-
/** Settle a worker slot's in-flight promise and drain the queue. */
|
|
431
450
|
settleSlot(slot, result, err) {
|
|
432
451
|
const resolve = slot.resolve;
|
|
433
452
|
const reject = slot.reject;
|
|
@@ -438,11 +457,9 @@ export class InferencePool {
|
|
|
438
457
|
reject?.(err);
|
|
439
458
|
else
|
|
440
459
|
resolve?.(result);
|
|
441
|
-
// Drain one queued request now that the slot is free.
|
|
442
460
|
const next = this.queue.shift();
|
|
443
|
-
if (next)
|
|
461
|
+
if (next)
|
|
444
462
|
this.dispatchToWorker(slot, next.inputBuf, next.inputShape, next.inputDtype, next.resolve, next.reject);
|
|
445
|
-
}
|
|
446
463
|
}
|
|
447
464
|
// ── tf-parallel path ───────────────────────────────────────────────────────
|
|
448
465
|
inferTfParallel(inputBuf, inputShape, inputDtype) {
|
|
@@ -463,100 +480,50 @@ export class InferencePool {
|
|
|
463
480
|
runTfParallel(inputBuf, inputShape, inputDtype, resolve, reject) {
|
|
464
481
|
this.tfParallelBusy = true;
|
|
465
482
|
const t0 = performance.now();
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
this.tfParallelBusy = false;
|
|
475
|
-
reject(new Error(`tf-parallel: input op not found in graph: ${this.inputOp}`));
|
|
476
|
-
return;
|
|
477
|
-
}
|
|
478
|
-
const outputTensors = this.outputOps.map((name) => {
|
|
479
|
-
const t = g.getOp(name);
|
|
480
|
-
if (!t)
|
|
481
|
-
throw new Error(`tf-parallel: output op not found in graph: ${name}`);
|
|
482
|
-
return t;
|
|
483
|
-
});
|
|
484
|
-
// Reinterpret the raw Buffer bytes as the correct TypedArray dtype.
|
|
485
|
-
// asTypedArray is only defined inside the !isMainThread block, so we
|
|
486
|
-
// inline the same logic here for the main-thread tf-parallel path.
|
|
487
|
-
const ab = inputBuf.buffer.slice(inputBuf.byteOffset, inputBuf.byteOffset + inputBuf.byteLength);
|
|
488
|
-
let typedInput;
|
|
489
|
-
switch (inputDtype) {
|
|
490
|
-
case 1:
|
|
491
|
-
typedInput = new Float32Array(ab);
|
|
492
|
-
break;
|
|
493
|
-
case 2:
|
|
494
|
-
typedInput = new Float64Array(ab);
|
|
495
|
-
break;
|
|
496
|
-
case 3:
|
|
497
|
-
typedInput = new Int32Array(ab);
|
|
498
|
-
break;
|
|
499
|
-
case 4:
|
|
500
|
-
typedInput = new Uint8Array(ab);
|
|
501
|
-
break;
|
|
502
|
-
case 9:
|
|
503
|
-
typedInput = new BigInt64Array(ab);
|
|
504
|
-
break;
|
|
505
|
-
default:
|
|
506
|
-
typedInput = new Uint8Array(ab);
|
|
507
|
-
}
|
|
508
|
-
const feedValue = {
|
|
509
|
-
dtype: inputDtype,
|
|
510
|
-
shape: inputShape,
|
|
511
|
-
data: Buffer.from(typedInput.buffer, typedInput.byteOffset, typedInput.byteLength),
|
|
512
|
-
};
|
|
513
|
-
inferencePromise = this.tfParallelSess.runAsync([[inputTensor, feedValue]], outputTensors).then((outputs) => {
|
|
514
|
-
// Map back to { [outputKey]: TensorResult } for uniform handling below
|
|
515
|
-
const result = {};
|
|
516
|
-
this.outputOps.forEach((key, i) => {
|
|
517
|
-
result[key] = outputs[i];
|
|
518
|
-
});
|
|
519
|
-
return result;
|
|
520
|
-
});
|
|
521
|
-
}
|
|
522
|
-
else {
|
|
523
|
-
// ── jude-tf TFSession fallback path ───────────────────────────────
|
|
524
|
-
inferencePromise = this.tfParallelSess.runAsync({ [this.inputOp]: inputBuf }, this.outputOps);
|
|
483
|
+
// Native Session feed/fetch format — Graph.getOp() resolves op names to
|
|
484
|
+
// Tensor descriptors that Session.runAsync() expects.
|
|
485
|
+
const g = this.tfParallelGraph;
|
|
486
|
+
const inputTensor = g.getOp(this.inputOp);
|
|
487
|
+
if (!inputTensor) {
|
|
488
|
+
this.tfParallelBusy = false;
|
|
489
|
+
reject(new Error(`tf-parallel: input op not found: ${this.inputOp}`));
|
|
490
|
+
return;
|
|
525
491
|
}
|
|
526
|
-
|
|
527
|
-
.
|
|
492
|
+
const outputTensors = this.outputOps.map((name) => {
|
|
493
|
+
const t = g.getOp(name);
|
|
494
|
+
if (!t)
|
|
495
|
+
throw new Error(`tf-parallel: output op not found: ${name}`);
|
|
496
|
+
return t;
|
|
497
|
+
});
|
|
498
|
+
const feedValue = {
|
|
499
|
+
dtype: inputDtype,
|
|
500
|
+
shape: inputShape,
|
|
501
|
+
data: inputBuf,
|
|
502
|
+
};
|
|
503
|
+
this.tfParallelSess.runAsync([[inputTensor, feedValue]], outputTensors)
|
|
504
|
+
.then((outputs) => {
|
|
528
505
|
const inferenceMs = performance.now() - t0;
|
|
529
506
|
this.tfParallelBusy = false;
|
|
530
507
|
resolve({
|
|
531
508
|
workerId: 0,
|
|
532
509
|
strategy: "tf-parallel",
|
|
533
|
-
outputs:
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
return {
|
|
539
|
-
dtype: r.dtype,
|
|
540
|
-
shape: r.shape,
|
|
541
|
-
data: Buffer.isBuffer(r.data)
|
|
542
|
-
? r.data
|
|
543
|
-
: Buffer.from(view.buffer, view.byteOffset, view.byteLength),
|
|
544
|
-
};
|
|
545
|
-
}),
|
|
510
|
+
outputs: outputs.map((o) => ({
|
|
511
|
+
dtype: o.dtype,
|
|
512
|
+
shape: o.shape,
|
|
513
|
+
data: Buffer.isBuffer(o.data) ? o.data : Buffer.from(o.data),
|
|
514
|
+
})),
|
|
546
515
|
inferenceMs,
|
|
547
516
|
});
|
|
548
517
|
const next = this.tfParallelQueue.shift();
|
|
549
|
-
if (next)
|
|
518
|
+
if (next)
|
|
550
519
|
this.runTfParallel(next.inputBuf, next.inputShape, next.inputDtype, next.resolve, next.reject);
|
|
551
|
-
}
|
|
552
520
|
})
|
|
553
521
|
.catch((err) => {
|
|
554
522
|
this.tfParallelBusy = false;
|
|
555
523
|
reject(err);
|
|
556
524
|
const next = this.tfParallelQueue.shift();
|
|
557
|
-
if (next)
|
|
525
|
+
if (next)
|
|
558
526
|
this.runTfParallel(next.inputBuf, next.inputShape, next.inputDtype, next.resolve, next.reject);
|
|
559
|
-
}
|
|
560
527
|
});
|
|
561
528
|
}
|
|
562
529
|
// ── Introspection ──────────────────────────────────────────────────────────
|
|
@@ -578,17 +545,17 @@ export class InferencePool {
|
|
|
578
545
|
if (this.strategy === "worker-pool") {
|
|
579
546
|
await Promise.all(this.workerSlots.map((slot) => new Promise((resolve, reject) => {
|
|
580
547
|
const doShutdown = () => {
|
|
581
|
-
// Register shutdown_ack listener before storing SHUTDOWN.
|
|
582
548
|
slot.worker.once("message", (msg) => {
|
|
583
|
-
if (msg.type === "shutdown_ack")
|
|
549
|
+
if (msg.type === "shutdown_ack") {
|
|
550
|
+
slot.seg.destroy(); // release jude-map segment after Worker exits
|
|
584
551
|
resolve();
|
|
552
|
+
}
|
|
585
553
|
});
|
|
586
554
|
slot.worker.once("error", reject);
|
|
587
555
|
Atomics.store(slot.ctrl, 0, SHUTDOWN);
|
|
588
556
|
Atomics.notify(slot.ctrl, 0, 1);
|
|
589
557
|
};
|
|
590
558
|
if (slot.busy) {
|
|
591
|
-
// Wait for the current in-flight request to finish first.
|
|
592
559
|
const origResolve = slot.resolve;
|
|
593
560
|
const origReject = slot.reject;
|
|
594
561
|
slot.resolve = (r) => {
|