@camstack/addon-embedding-encoder 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,959 @@
+ "use strict";
+ var __create = Object.create;
+ var __defProp = Object.defineProperty;
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
+ var __getOwnPropNames = Object.getOwnPropertyNames;
+ var __getProtoOf = Object.getPrototypeOf;
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
+ var __export = (target, all) => {
+   for (var name in all)
+     __defProp(target, name, { get: all[name], enumerable: true });
+ };
+ var __copyProps = (to, from, except, desc) => {
+   if (from && typeof from === "object" || typeof from === "function") {
+     for (let key of __getOwnPropNames(from))
+       if (!__hasOwnProp.call(to, key) && key !== except)
+         __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
+   }
+   return to;
+ };
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
+   // If the importer is in node compatibility mode or this is not an ESM
+   // file that has been converted to a CommonJS file using a Babel-
+   // compatible transform (i.e. "__esModule" has not been set), then set
+   // "default" to the CommonJS "module.exports" for node compatibility.
+   isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
+   mod
+ ));
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
+
+ // src/index.ts
+ var src_exports = {};
+ __export(src_exports, {
+   EmbeddingEncoderAddon: () => EmbeddingEncoderAddon
+ });
+ module.exports = __toCommonJS(src_exports);
+
+ // src/addon/index.ts
+ var import_types2 = require("@camstack/types");
+ var import_core = require("@camstack/core");
+
+ // src/catalogs/embedding-models.ts
+ var CLIP_IMAGE_MODELS = [
+   {
+     id: "clip-vit-b32",
+     name: "CLIP ViT-B/32",
+     description: "OpenAI CLIP ViT-B/32 \u2014 fast, 512-dim, int8 quantized (85 MB)",
+     inputSize: { width: 224, height: 224 },
+     labels: [],
+     inputLayout: "nchw",
+     inputNormalization: "none",
+     formats: {
+       onnx: {
+         url: "https://huggingface.co/Xenova/clip-vit-base-patch32/resolve/main/onnx/vision_model_quantized.onnx",
+         sizeMB: 85
+       }
+     }
+   },
+   {
+     id: "clip-vit-b16",
+     name: "CLIP ViT-B/16",
+     description: "OpenAI CLIP ViT-B/16 \u2014 higher accuracy, 512-dim, int8 quantized (83 MB)",
+     inputSize: { width: 224, height: 224 },
+     labels: [],
+     inputLayout: "nchw",
+     inputNormalization: "none",
+     formats: {
+       onnx: {
+         url: "https://huggingface.co/Xenova/clip-vit-base-patch16/resolve/main/onnx/vision_model_quantized.onnx",
+         sizeMB: 83
+       }
+     }
+   },
+   {
+     id: "siglip2-b16-256",
+     name: "SigLIP2 Base/16 256",
+     description: "Google SigLIP2 \u2014 superior scene understanding, 768-dim, int8 quantized (90 MB)",
+     inputSize: { width: 256, height: 256 },
+     labels: [],
+     inputLayout: "nchw",
+     inputNormalization: "none",
+     formats: {
+       onnx: {
+         url: "https://huggingface.co/onnx-community/siglip2-base-patch16-256-ONNX/resolve/main/onnx/vision_model_quantized.onnx",
+         sizeMB: 90
+       }
+     }
+   }
+ ];
+ var CLIP_TEXT_MODELS = [
+   {
+     id: "clip-vit-b32-text",
+     name: "CLIP ViT-B/32 Text Encoder",
+     description: "Text encoder for CLIP ViT-B/32, int8 quantized (62 MB)",
+     inputSize: { width: 0, height: 0 },
+     labels: [],
+     formats: {
+       onnx: {
+         url: "https://huggingface.co/Xenova/clip-vit-base-patch32/resolve/main/onnx/text_model_quantized.onnx",
+         sizeMB: 62
+       }
+     }
+   },
+   {
+     id: "clip-vit-b16-text",
+     name: "CLIP ViT-B/16 Text Encoder",
+     description: "Text encoder for CLIP ViT-B/16, int8 quantized (62 MB)",
+     inputSize: { width: 0, height: 0 },
+     labels: [],
+     formats: {
+       onnx: {
+         url: "https://huggingface.co/Xenova/clip-vit-base-patch16/resolve/main/onnx/text_model_quantized.onnx",
+         sizeMB: 62
+       }
+     }
+   },
+   {
+     id: "siglip2-b16-256-text",
+     name: "SigLIP2 Base/16 256 Text Encoder",
+     description: "Text encoder for SigLIP2, int8 quantized (270 MB)",
+     inputSize: { width: 0, height: 0 },
+     labels: [],
+     formats: {
+       onnx: {
+         url: "https://huggingface.co/onnx-community/siglip2-base-patch16-256-ONNX/resolve/main/onnx/text_model_quantized.onnx",
+         sizeMB: 270
+       }
+     }
+   }
+ ];
+
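Note: each catalog entry carries only a download URL; the on-disk filename is derived from the URL's last path segment by the `modelFilePath()` helper further down in this bundle. A minimal sketch of that mapping, assuming the `camstack-data/models` directory that the addon falls back to below:

```js
const path = require("path");

// Hypothetical illustration of how a catalog URL maps to a local file.
const entry = CLIP_IMAGE_MODELS.find((m) => m.id === "clip-vit-b32");
const filename = entry.formats.onnx.url.split("/").pop();
// -> "vision_model_quantized.onnx"
const localPath = path.join("camstack-data/models", filename);
```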
+ // src/shared/noop-logger.ts
+ var noop = () => {
+ };
+ function createNoopLogger() {
+   const logger = {
+     debug: noop,
+     info: noop,
+     warn: noop,
+     error: noop,
+     child: () => logger,
+     withTags: (_tags) => logger
+   };
+   return logger;
+ }
+
+ // src/shared/node-raw-tensor-engine.ts
+ var path = __toESM(require("path"));
+ var BACKEND_TO_DEVICE = {
+   cpu: "cpu",
+   coreml: "gpu-mps",
+   cuda: "gpu-cuda",
+   tensorrt: "tensorrt"
+ };
+ var NodeRawTensorEngine = class {
+   constructor(modelPath, backend, logger) {
+     this.modelPath = modelPath;
+     this.backend = backend;
+     this.device = BACKEND_TO_DEVICE[backend] ?? "cpu";
+     this.log = logger ?? createNoopLogger();
+   }
+   modelPath;
+   backend;
+   runtime = "onnx";
+   device;
+   session = null;
+   log;
+   async initialize() {
+     const ort = await import("onnxruntime-node");
+     const provider = this.backend === "coreml" ? "coreml" : this.backend === "cuda" ? "cuda" : "cpu";
+     const absModelPath = path.isAbsolute(this.modelPath) ? this.modelPath : path.resolve(process.cwd(), this.modelPath);
+     this.session = await ort.InferenceSession.create(absModelPath, {
+       executionProviders: [provider]
+     });
+     this.log.info("ONNX session loaded", { meta: { modelPath: absModelPath, backend: this.backend, provider } });
+   }
+   async run(input, inputShape) {
+     if (!this.session) {
+       throw new Error("NodeRawTensorEngine: not initialized \u2014 call initialize() first");
+     }
+     const ort = await import("onnxruntime-node");
+     const sess = this.session;
+     const inputName = sess.inputNames[0];
+     const tensor = new ort.Tensor("float32", input, [...inputShape]);
+     const feeds = { [inputName]: tensor };
+     const start = Date.now();
+     let results;
+     try {
+       results = await sess.run(feeds);
+     } catch (err) {
+       const error = err instanceof Error ? err : new Error(String(err));
+       this.log.error("Inference failed", { meta: { error: error.message } });
+       throw error;
+     }
+     const outputName = sess.outputNames[0];
+     this.log.debug("Inference complete", { meta: { durationMs: Date.now() - start, outputKeys: [outputName], preprocessMode: "raw-tensor" } });
+     return results[outputName].data;
+   }
+   async dispose() {
+     this.session = null;
+     this.log.debug("Session disposed");
+   }
+ };
+
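Note: `NodeRawTensorEngine` performs no image preprocessing, so the caller owns layout and normalization. A minimal usage sketch, assuming `onnxruntime-node` is installed and a CLIP vision model exists at the given path (the path and the 224×224 input size are assumptions for illustration):

```js
const engine = new NodeRawTensorEngine("camstack-data/models/vision_model_quantized.onnx", "cpu");
await engine.initialize();

// Caller-preprocessed NCHW float32 tensor: 1 batch x 3 channels x 224 x 224.
const input = new Float32Array(1 * 3 * 224 * 224);
const output = await engine.run(input, [1, 3, 224, 224]);

await engine.dispose();
```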
+ // src/shared/engine-resolver.ts
+ var import_types = require("@camstack/types");
+ var fs = __toESM(require("fs"));
+ var path3 = __toESM(require("path"));
+
+ // src/shared/image-utils.ts
+ var import_sharp = __toESM(require("sharp"));
+ async function letterbox(jpeg, targetSize) {
+   const meta = await (0, import_sharp.default)(jpeg).metadata();
+   const originalWidth = meta.width ?? 0;
+   const originalHeight = meta.height ?? 0;
+   const scale = Math.min(targetSize / originalWidth, targetSize / originalHeight);
+   const scaledWidth = Math.round(originalWidth * scale);
+   const scaledHeight = Math.round(originalHeight * scale);
+   const padX = Math.floor((targetSize - scaledWidth) / 2);
+   const padY = Math.floor((targetSize - scaledHeight) / 2);
+   const { data } = await (0, import_sharp.default)(jpeg).resize(scaledWidth, scaledHeight).extend({
+     top: padY,
+     bottom: targetSize - scaledHeight - padY,
+     left: padX,
+     right: targetSize - scaledWidth - padX,
+     background: { r: 114, g: 114, b: 114 }
+   }).removeAlpha().raw().toBuffer({ resolveWithObject: true });
+   const numPixels = targetSize * targetSize;
+   const float32 = new Float32Array(3 * numPixels);
+   for (let i = 0; i < numPixels; i++) {
+     const srcBase = i * 3;
+     float32[0 * numPixels + i] = data[srcBase] / 255;
+     float32[1 * numPixels + i] = data[srcBase + 1] / 255;
+     float32[2 * numPixels + i] = data[srcBase + 2] / 255;
+   }
+   return { data: float32, scale, padX, padY, originalWidth, originalHeight };
+ }
+ async function resizeAndNormalize(jpeg, targetWidth, targetHeight, normalization, layout) {
+   const { data } = await (0, import_sharp.default)(jpeg).resize(targetWidth, targetHeight, { fit: "fill" }).removeAlpha().raw().toBuffer({ resolveWithObject: true });
+   const numPixels = targetWidth * targetHeight;
+   const float32 = new Float32Array(3 * numPixels);
+   const mean = [0.485, 0.456, 0.406];
+   const std = [0.229, 0.224, 0.225];
+   if (layout === "nchw") {
+     for (let i = 0; i < numPixels; i++) {
+       const srcBase = i * 3;
+       for (let c = 0; c < 3; c++) {
+         const raw = data[srcBase + c] / 255;
+         let val;
+         if (normalization === "zero-one") {
+           val = raw;
+         } else if (normalization === "imagenet") {
+           val = (raw - mean[c]) / std[c];
+         } else {
+           val = data[srcBase + c];
+         }
+         float32[c * numPixels + i] = val;
+       }
+     }
+   } else {
+     for (let i = 0; i < numPixels; i++) {
+       const srcBase = i * 3;
+       for (let c = 0; c < 3; c++) {
+         const raw = data[srcBase + c] / 255;
+         let val;
+         if (normalization === "zero-one") {
+           val = raw;
+         } else if (normalization === "imagenet") {
+           val = (raw - mean[c]) / std[c];
+         } else {
+           val = data[srcBase + c];
+         }
+         float32[i * 3 + c] = val;
+       }
+     }
+   }
+   return float32;
+ }
+
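Note: the two branches above differ only in the output index. For pixel `i` (row-major, `numPixels = height * width`) and channel `c`, NCHW (planar) writes to `c * numPixels + i` while NHWC (interleaved) writes to `i * 3 + c`. A usage sketch, with the JPEG path purely illustrative:

```js
const fs = require("fs");

const jpeg = fs.readFileSync("frame.jpg"); // hypothetical input image
// Stretch-resize to 224x224 with ImageNet mean/std, planar NCHW output.
const nchw = await resizeAndNormalize(jpeg, 224, 224, "imagenet", "nchw");
// Aspect-preserving resize onto a gray 640x640 canvas, values in [0, 1].
const { data, scale, padX, padY } = await letterbox(jpeg, 640);
```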
+ // src/shared/node-engine.ts
+ var path2 = __toESM(require("path"));
+ var BACKEND_TO_PROVIDER = {
+   cpu: "cpu",
+   coreml: "coreml",
+   cuda: "cuda",
+   tensorrt: "tensorrt",
+   dml: "dml"
+ };
+ var BACKEND_TO_DEVICE2 = {
+   cpu: "cpu",
+   coreml: "gpu-mps",
+   cuda: "gpu-cuda",
+   tensorrt: "tensorrt"
+ };
+ var NodeInferenceEngine = class {
+   constructor(modelPath, backend, modelMeta, logger) {
+     this.modelPath = modelPath;
+     this.backend = backend;
+     this.modelMeta = modelMeta;
+     this.device = BACKEND_TO_DEVICE2[backend] ?? "cpu";
+     this.log = logger ?? createNoopLogger();
+   }
+   modelPath;
+   backend;
+   modelMeta;
+   runtime = "onnx";
+   device;
+   session = null;
+   log;
+   async initialize() {
+     const ort = await import("onnxruntime-node");
+     const provider = BACKEND_TO_PROVIDER[this.backend] ?? "cpu";
+     const absModelPath = path2.isAbsolute(this.modelPath) ? this.modelPath : path2.resolve(process.cwd(), this.modelPath);
+     const sessionOptions = {
+       executionProviders: [provider]
+     };
+     this.session = await ort.InferenceSession.create(absModelPath, sessionOptions);
+     this.log.info("ONNX session loaded", { meta: { modelPath: absModelPath, backend: this.backend, provider } });
+   }
+   async infer(input) {
+     const jpeg = input.kind === "jpeg" ? input.data : await this.encodeRawAsJpeg(input.data, input.width, input.height, input.format);
+     const { data, letterboxMeta } = await this.preprocess(jpeg);
+     const { inputSize } = this.modelMeta;
+     const inputShape = [1, 3, inputSize.height, inputSize.width];
+     const start = Date.now();
+     let result;
+     try {
+       result = await this.runSession(data, inputShape);
+     } catch (err) {
+       const error = err instanceof Error ? err : new Error(String(err));
+       this.log.error("Inference failed", { meta: { error: error.message } });
+       throw error;
+     }
+     const durationMs = Date.now() - start;
+     if ("tensor" in result) {
+       this.log.debug("Inference complete", { meta: { durationMs, outputKeys: ["tensor"], preprocessMode: this.modelMeta.preprocessMode } });
+       return { tensor: result.tensor, letterbox: letterboxMeta, inferenceMs: durationMs };
+     }
+     this.log.debug("Inference complete", { meta: { durationMs, outputKeys: Object.keys(result.tensors), preprocessMode: this.modelMeta.preprocessMode } });
+     return { tensors: result.tensors, letterbox: letterboxMeta, inferenceMs: durationMs };
+   }
+   /** Preprocess JPEG to Float32Array using the configured mode */
+   async preprocess(jpeg) {
+     const { inputSize, inputNormalization, inputLayout, preprocessMode } = this.modelMeta;
+     if (preprocessMode === "letterbox") {
+       const targetSize = Math.max(inputSize.width, inputSize.height);
+       const result = await letterbox(jpeg, targetSize);
+       const letterboxMeta = {
+         scale: result.scale,
+         padX: result.padX,
+         padY: result.padY,
+         originalWidth: result.originalWidth,
+         originalHeight: result.originalHeight
+       };
+       return { data: result.data, letterboxMeta };
+     }
+     const data = await resizeAndNormalize(
+       jpeg,
+       inputSize.width,
+       inputSize.height,
+       inputNormalization,
+       inputLayout
+     );
+     return { data };
+   }
+   async encodeRawAsJpeg(raw, width, height, format) {
+     const sharp2 = (await import("sharp")).default;
+     const channels = format === "gray" ? 1 : 3;
+     return sharp2(raw, { raw: { width, height, channels } }).jpeg({ quality: 80, mozjpeg: false }).toBuffer();
+   }
+   /** Run an ONNX session with a single input, handling both single and multi-output models */
+   async runSession(input, inputShape) {
+     if (!this.session) {
+       throw new Error("NodeInferenceEngine: not initialized \u2014 call initialize() first");
+     }
+     const ort = await import("onnxruntime-node");
+     const sess = this.session;
+     const inputName = sess.inputNames[0];
+     const tensor = new ort.Tensor("float32", input, [...inputShape]);
+     const feeds = { [inputName]: tensor };
+     const results = await sess.run(feeds);
+     const outputNames = sess.outputNames;
+     if (outputNames.length === 1) {
+       const outputName = outputNames[0];
+       return { tensor: results[outputName].data };
+     }
+     const tensors = {};
+     for (const name of outputNames) {
+       tensors[name] = results[name].data;
+     }
+     return { tensors };
+   }
+   async run(input, inputShape) {
+     const result = await this.runSession(input, inputShape);
+     if ("tensor" in result) return result.tensor;
+     const firstKey = Object.keys(result.tensors)[0];
+     return result.tensors[firstKey];
+   }
+   async dispose() {
+     this.session = null;
+     this.log.debug("Session disposed");
+   }
+ };
+
+ // src/shared/python-engine.ts
+ var import_node_child_process = require("child_process");
+ var PythonInferenceEngine = class {
+   constructor(pythonPath, scriptPath, runtime, modelPath, extraArgs = [], logger) {
+     this.pythonPath = pythonPath;
+     this.scriptPath = scriptPath;
+     this.modelPath = modelPath;
+     this.extraArgs = extraArgs;
+     this.runtime = runtime;
+     const runtimeDeviceMap = {
+       onnx: "cpu",
+       coreml: "gpu-mps",
+       pytorch: "cpu",
+       openvino: "cpu",
+       tflite: "cpu"
+     };
+     this.device = runtimeDeviceMap[runtime];
+     this.log = logger ?? createNoopLogger();
+   }
+   pythonPath;
+   scriptPath;
+   modelPath;
+   extraArgs;
+   runtime;
+   device;
+   process = null;
+   receiveBuffer = Buffer.alloc(0);
+   pendingResolve = null;
+   pendingReject = null;
+   log;
+   async initialize() {
+     const args = [this.scriptPath, this.modelPath, ...this.extraArgs];
+     this.process = (0, import_node_child_process.spawn)(this.pythonPath, args, {
+       stdio: ["pipe", "pipe", "pipe"]
+     });
+     if (!this.process.stdout || !this.process.stdin) {
+       throw new Error("PythonInferenceEngine: failed to create process pipes");
+     }
+     this.log.info("Python process started", { meta: { pythonPath: this.pythonPath, scriptPath: this.scriptPath, modelPath: this.modelPath } });
+     this.process.stderr?.on("data", (chunk) => {
+       const lines = chunk.toString().split("\n");
+       for (const line of lines) {
+         const trimmed = line.trim();
+         if (trimmed) {
+           this.log.warn(trimmed);
+         }
+       }
+     });
+     this.process.on("error", (err) => {
+       this.log.error("Process error", { meta: { error: err.message } });
+       this.pendingReject?.(err);
+       this.pendingReject = null;
+       this.pendingResolve = null;
+     });
+     this.process.on("exit", (code) => {
+       if (code !== 0) {
+         this.log.error("Process exited", { meta: { code } });
+         const err = new Error(`PythonInferenceEngine: process exited with code ${code}`);
+         this.pendingReject?.(err);
+         this.pendingReject = null;
+         this.pendingResolve = null;
+       }
+     });
+     this.process.stdout.on("data", (chunk) => {
+       this.receiveBuffer = Buffer.concat([this.receiveBuffer, chunk]);
+       this._tryReceive();
+     });
+     await new Promise((resolve3, reject) => {
+       const timeout = setTimeout(() => resolve3(), 2e3);
+       this.process?.on("error", (err) => {
+         clearTimeout(timeout);
+         reject(err);
+       });
+       this.process?.on("exit", (code) => {
+         clearTimeout(timeout);
+         if (code !== 0) {
+           reject(new Error(`PythonInferenceEngine: process exited early with code ${code}`));
+         }
+       });
+     });
+   }
+   _tryReceive() {
+     if (this.receiveBuffer.length < 4) return;
+     const length = this.receiveBuffer.readUInt32LE(0);
+     if (this.receiveBuffer.length < 4 + length) return;
+     const jsonBytes = this.receiveBuffer.subarray(4, 4 + length);
+     this.receiveBuffer = this.receiveBuffer.subarray(4 + length);
+     const resolve3 = this.pendingResolve;
+     const reject = this.pendingReject;
+     this.pendingResolve = null;
+     this.pendingReject = null;
+     if (!resolve3) return;
+     try {
+       const parsed = JSON.parse(jsonBytes.toString("utf8"));
+       resolve3(parsed);
+     } catch (err) {
+       reject?.(err instanceof Error ? err : new Error(String(err)));
+     }
+   }
+   /** Run inference, returning structured detection results. Encodes raw input to JPEG when needed. */
+   async infer(input) {
+     const start = Date.now();
+     const jpeg = input.kind === "jpeg" ? input.data : await this.encodeRawAsJpeg(input.data, input.width, input.height, input.format);
+     const result = await this.sendJpeg(jpeg);
+     const durationMs = Date.now() - start;
+     this.log.debug("Inference complete", { meta: { durationMs } });
+     return { structured: result, inferenceMs: durationMs };
+   }
+   async encodeRawAsJpeg(raw, width, height, format) {
+     const sharp2 = (await import("sharp")).default;
+     const channels = format === "gray" ? 1 : 3;
+     return sharp2(raw, { raw: { width, height, channels } }).jpeg({ quality: 80, mozjpeg: false }).toBuffer();
+   }
+   /** Send JPEG buffer via binary IPC, receive JSON detection results */
+   async sendJpeg(jpeg) {
+     if (!this.process?.stdin) {
+       throw new Error("PythonInferenceEngine: process not initialized");
+     }
+     return new Promise((resolve3, reject) => {
+       this.pendingResolve = resolve3;
+       this.pendingReject = reject;
+       const lengthBuf = Buffer.allocUnsafe(4);
+       lengthBuf.writeUInt32LE(jpeg.length, 0);
+       this.process.stdin.write(Buffer.concat([lengthBuf, jpeg]));
+     });
+   }
+   async dispose() {
+     const proc = this.process;
+     if (!proc) return;
+     this.process = null;
+     proc.stdin?.end();
+     proc.kill("SIGTERM");
+     const exited = await new Promise((resolve3) => {
+       const timer = setTimeout(() => {
+         resolve3(false);
+       }, 5e3);
+       proc.once("exit", () => {
+         clearTimeout(timer);
+         resolve3(true);
+       });
+     });
+     if (!exited) {
+       try {
+         proc.kill("SIGKILL");
+       } catch {
+       }
+       this.log.warn("Python process did not exit gracefully \u2014 sent SIGKILL");
+     } else {
+       this.log.debug("Python process terminated");
+     }
+   }
+ };
+
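Note: the stdin/stdout protocol above is simple length-prefixed framing. Every message, in either direction, is a 4-byte little-endian byte count followed by the payload (JPEG bytes going out, UTF-8 JSON coming back). A self-contained sketch of the framing logic that `sendJpeg()` and `_tryReceive()` implement:

```js
// Encode: 4-byte little-endian length prefix + payload.
function frame(payload) {
  const header = Buffer.allocUnsafe(4);
  header.writeUInt32LE(payload.length, 0);
  return Buffer.concat([header, payload]);
}

// Decode: returns null until a whole message has been buffered.
function tryParse(buf) {
  if (buf.length < 4) return null; // header incomplete
  const length = buf.readUInt32LE(0);
  if (buf.length < 4 + length) return null; // body incomplete
  return { message: buf.subarray(4, 4 + length), rest: buf.subarray(4 + length) };
}
```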
+ // src/shared/engine-resolver.ts
+ var AUTO_BACKEND_PRIORITY = ["coreml", "cuda", "tensorrt", "cpu"];
+ var BACKEND_TO_FORMAT = import_types.BACKEND_TO_FORMAT;
+ var RUNTIME_TO_FORMAT = import_types.RUNTIME_TO_FORMAT;
+ function extractModelMeta(entry) {
+   return {
+     inputSize: entry.inputSize,
+     inputNormalization: entry.inputNormalization ?? "zero-one",
+     inputLayout: entry.inputLayout ?? "nchw",
+     preprocessMode: entry.preprocessMode ?? "letterbox"
+   };
+ }
+ function modelFilePath(modelsDir, modelEntry, format) {
+   const formatEntry = modelEntry.formats[format];
+   if (!formatEntry) {
+     throw new Error(`Model ${modelEntry.id} has no ${format} format`);
+   }
+   const urlParts = formatEntry.url.split("/");
+   const filename = urlParts[urlParts.length - 1] ?? `${modelEntry.id}.${format}`;
+   return path3.join(modelsDir, filename);
+ }
+ function modelExists(filePath) {
+   try {
+     return fs.existsSync(filePath);
+   } catch {
+     return false;
+   }
+ }
+ async function resolveEngine(options) {
+   const { runtime, backend, modelEntry, modelsDir, models } = options;
+   const log = options.logger ?? createNoopLogger();
+   let selectedFormat;
+   let selectedBackend;
+   if (runtime === "auto") {
+     const available = await probeOnnxBackends();
+     let chosen = null;
+     for (const b of AUTO_BACKEND_PRIORITY) {
+       if (!available.includes(b)) continue;
+       const fmt = BACKEND_TO_FORMAT[b];
+       if (!fmt) continue;
+       if (!modelEntry.formats[fmt]) continue;
+       chosen = { backend: b, format: fmt };
+       break;
+     }
+     if (!chosen) {
+       throw new Error(
+         `resolveEngine: no compatible backend found for model ${modelEntry.id}. Available backends: ${available.join(", ")}`
+       );
+     }
+     selectedFormat = chosen.format;
+     selectedBackend = chosen.backend;
+   } else {
+     const fmt = RUNTIME_TO_FORMAT[runtime];
+     if (!fmt) {
+       throw new Error(`resolveEngine: unsupported runtime "${runtime}"`);
+     }
+     if (!modelEntry.formats[fmt]) {
+       if (fmt !== "onnx" && modelEntry.formats["onnx"]) {
+         selectedFormat = "onnx";
+         selectedBackend = backend || "cpu";
+       } else {
+         throw new Error(
+           `resolveEngine: model ${modelEntry.id} has no ${fmt} format for runtime ${runtime}`
+         );
+       }
+     } else {
+       selectedFormat = fmt;
+       selectedBackend = runtime === "onnx" ? backend || "cpu" : runtime;
+     }
+   }
+   let modelPath;
+   if (models) {
+     modelPath = await models.ensure(modelEntry.id, selectedFormat);
+   } else {
+     modelPath = modelFilePath(modelsDir, modelEntry, selectedFormat);
+     if (!modelExists(modelPath)) {
+       throw new Error(
+         `resolveEngine: model file not found at ${modelPath} and no model service provided`
+       );
+     }
+   }
+   log.info("Engine resolved", { meta: { format: selectedFormat, backend: selectedBackend, modelId: modelEntry.id } });
+   if (selectedFormat === "onnx") {
+     const engine = new NodeInferenceEngine(modelPath, selectedBackend, extractModelMeta(modelEntry), options.logger);
+     await engine.initialize();
+     return { engine, format: selectedFormat, modelPath };
+   }
+   const effectiveRuntime = runtime === "auto" ? selectedBackend : runtime;
+   let { pythonPath } = options;
+   if (!pythonPath) {
+     const { execFileSync: efs } = await import("child_process");
+     for (const cmd of ["python3", "python"]) {
+       try {
+         efs(cmd, ["--version"], { timeout: 3e3, stdio: "ignore" });
+         pythonPath = cmd;
+         break;
+       } catch {
+       }
+     }
+   }
+   const scriptName = import_types.PYTHON_SCRIPT[effectiveRuntime];
+   if (scriptName && pythonPath) {
+     const candidates = [
+       path3.join(__dirname, "../../python", scriptName),
+       path3.join(__dirname, "../python", scriptName),
+       path3.join(__dirname, "../../../python", scriptName)
+     ];
+     const scriptPath = candidates.find((p) => fs.existsSync(p));
+     if (!scriptPath) {
+       throw new Error(
+         `resolveEngine: Python script "${scriptName}" not found. Searched:
+ ${candidates.join("\n")}`
+       );
+     }
+     const inputSize = Math.max(modelEntry.inputSize.width, modelEntry.inputSize.height);
+     const engine = new PythonInferenceEngine(pythonPath, scriptPath, effectiveRuntime, modelPath, [
+       `--input-size=${inputSize}`,
+       `--confidence=0.25`
+     ], options.logger);
+     await engine.initialize();
+     return { engine, format: selectedFormat, modelPath };
+   }
+   const fallbackPath = modelFilePath(modelsDir, modelEntry, "onnx");
+   if (modelEntry.formats["onnx"] && modelExists(fallbackPath)) {
+     const engine = new NodeInferenceEngine(fallbackPath, "cpu", extractModelMeta(modelEntry), options.logger);
+     await engine.initialize();
+     return { engine, format: "onnx", modelPath: fallbackPath };
+   }
+   throw new Error(
+     `resolveEngine: format ${selectedFormat} is not yet supported by NodeInferenceEngine, no Python runtime is available, and no ONNX fallback exists`
+   );
+ }
+ async function probeOnnxBackends() {
+   const available = ["cpu"];
+   try {
+     const ort = await import("onnxruntime-node");
+     const providers = ort.env?.webgl?.disabled !== void 0 ? ort.InferenceSession.getAvailableProviders?.() ?? [] : [];
+     for (const p of providers) {
+       const normalized = p.toLowerCase().replace("executionprovider", "");
+       if (normalized === "coreml") available.push("coreml");
+       else if (normalized === "cuda") available.push("cuda");
+       else if (normalized === "tensorrt") available.push("tensorrt");
+     }
+   } catch {
+   }
+   if (process.platform === "darwin" && !available.includes("coreml")) {
+     available.push("coreml");
+   }
+   return [...new Set(available)];
+ }
+
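Note: `resolveEngine()` is the single entry point that picks a format/backend pair, ensures the model file exists, and returns an initialized engine. A hedged call sketch, assuming the ONNX file has already been downloaded into the models directory:

```js
const { engine, format, modelPath } = await resolveEngine({
  runtime: "auto", // probe CoreML/CUDA/TensorRT, fall back to CPU
  backend: "cpu",
  modelEntry: CLIP_IMAGE_MODELS[0],
  modelsDir: "camstack-data/models", // assumed location
});
```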
+ // src/addon/clip-models.ts
+ var CLIP_MODEL_META = {
+   "clip-vit-b32": {
+     imageModelId: "clip-vit-b32",
+     textModelId: "clip-vit-b32-text",
+     embeddingDim: 512,
+     inputSize: 224,
+     tokenizerType: "clip"
+   },
+   "clip-vit-b16": {
+     imageModelId: "clip-vit-b16",
+     textModelId: "clip-vit-b16-text",
+     embeddingDim: 512,
+     inputSize: 224,
+     tokenizerType: "clip"
+   },
+   "siglip2-b16-256": {
+     imageModelId: "siglip2-b16-256",
+     textModelId: "siglip2-b16-256-text",
+     embeddingDim: 768,
+     inputSize: 256,
+     tokenizerType: "siglip"
+   }
+ };
+ var DEFAULT_CLIP_MODEL = "clip-vit-b32";
+ function getModelMeta(modelId) {
+   return CLIP_MODEL_META[modelId] ?? CLIP_MODEL_META[DEFAULT_CLIP_MODEL];
+ }
+
+ // src/addon/clip-preprocessing.ts
+ var CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073];
+ var CLIP_STD = [0.26862954, 0.26130258, 0.27577711];
+ function preprocessForClip(rgb, srcWidth, srcHeight, targetWidth, targetHeight) {
+   const pixels = targetWidth * targetHeight;
+   const result = new Float32Array(3 * pixels);
+   for (let y = 0; y < targetHeight; y++) {
+     for (let x = 0; x < targetWidth; x++) {
+       const srcX = Math.min(Math.floor(x / targetWidth * srcWidth), srcWidth - 1);
+       const srcY = Math.min(Math.floor(y / targetHeight * srcHeight), srcHeight - 1);
+       const srcIdx = (srcY * srcWidth + srcX) * 3;
+       const dstIdx = y * targetWidth + x;
+       for (let c = 0; c < 3; c++) {
+         const val = (rgb[srcIdx + c] ?? 0) / 255;
+         result[c * pixels + dstIdx] = (val - CLIP_MEAN[c]) / CLIP_STD[c];
+       }
+     }
+   }
+   return result;
+ }
+ function l2Normalize(vec) {
+   let norm = 0;
+   for (let i = 0; i < vec.length; i++) norm += vec[i] * vec[i];
+   norm = Math.sqrt(norm);
+   if (norm > 0) {
+     for (let i = 0; i < vec.length; i++) vec[i] /= norm;
+   }
+   return vec;
+ }
+
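Note: because every embedding returned by this addon passes through `l2Normalize()`, cosine similarity between an image embedding and a text embedding reduces to a plain dot product:

```js
// Valid only for unit-length vectors, which l2Normalize() guarantees.
function cosineSimilarity(a, b) {
  let dot = 0;
  for (let i = 0; i < a.length; i++) dot += a[i] * b[i];
  return dot; // in [-1, 1]; higher means more similar
}
```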
+ // src/addon/index.ts
+ var EmbeddingEncoderAddon = class extends import_types2.BaseAddon {
+   imageRawEngine = null;
+   textRawEngine = null;
+   imagePythonEngine = null;
+   textPythonEngine = null;
+   models = null;
+   isPython = false;
+   constructor() {
+     super({ modelId: DEFAULT_CLIP_MODEL, runtime: "auto", backend: "cpu" });
+   }
+   async onInitialize() {
+     const modelsDir = await this.ctx.api.storage.resolve.query({ location: "models", relativePath: "" }).catch(() => "camstack-data/models");
+     this.models = new import_core.ModelDownloadService(modelsDir, []);
+     return [{ capability: import_types2.embeddingEncoderCapability, provider: this }];
+   }
+   async encode(crop, width, height) {
+     await this.ensureImageEngine();
+     const meta = getModelMeta(this.config.modelId);
+     const start = Date.now();
+     if (this.isPython && this.imagePythonEngine) {
+       const result = await this.imagePythonEngine.infer({ kind: "jpeg", data: crop });
+       const rawEmbedding = result.structured?.["embedding"];
+       const embedding = new Float32Array(rawEmbedding);
+       return {
+         embedding: l2Normalize(embedding),
+         inferenceMs: result.inferenceMs ?? Date.now() - start
+       };
+     }
+     const preprocessed = preprocessForClip(crop, width, height, meta.inputSize, meta.inputSize);
+     const output = await this.imageRawEngine.run(preprocessed, [1, 3, meta.inputSize, meta.inputSize]);
+     const sliced = output.length > meta.embeddingDim ? output.slice(0, meta.embeddingDim) : output;
+     return {
+       embedding: l2Normalize(new Float32Array(sliced)),
+       inferenceMs: Date.now() - start
+     };
+   }
+   async encodeText(text) {
+     await this.ensureTextEngine();
+     const meta = getModelMeta(this.config.modelId);
+     const start = Date.now();
+     if (this.isPython && this.textPythonEngine) {
+       const textBuffer = Buffer.from(JSON.stringify({ text }), "utf-8");
+       const result = await this.textPythonEngine.infer({ kind: "jpeg", data: textBuffer });
+       const rawEmbedding = result.structured?.["embedding"];
+       const embedding = new Float32Array(rawEmbedding);
+       return {
+         embedding: l2Normalize(embedding),
+         inferenceMs: result.inferenceMs ?? Date.now() - start
+       };
+     }
+     const tokenIds = clipTokenize(text);
+     const inputTensor = new Float32Array(tokenIds);
+     const output = await this.textRawEngine.run(inputTensor, [1, tokenIds.length]);
+     const sliced = output.length > meta.embeddingDim ? output.slice(0, meta.embeddingDim) : output;
+     return {
+       embedding: l2Normalize(new Float32Array(sliced)),
+       inferenceMs: Date.now() - start
+     };
+   }
+   getInfo() {
+     const meta = getModelMeta(this.config.modelId);
+     return {
+       modelId: this.config.modelId,
+       embeddingDim: meta.embeddingDim,
+       ready: this.imageRawEngine !== null || this.imagePythonEngine !== null
+     };
+   }
+   async ensureImageEngine() {
+     if (this.imageRawEngine || this.imagePythonEngine) return;
+     const meta = getModelMeta(this.config.modelId);
+     const imageEntry = CLIP_IMAGE_MODELS.find((m) => m.id === meta.imageModelId);
+     if (!imageEntry) {
+       throw new Error(`EmbeddingEncoderAddon: unknown image model "${meta.imageModelId}"`);
+     }
+     await this.resolveForEntry(imageEntry, "image");
+   }
+   async ensureTextEngine() {
+     if (this.textRawEngine || this.textPythonEngine) return;
+     const meta = getModelMeta(this.config.modelId);
+     const textEntry = CLIP_TEXT_MODELS.find((m) => m.id === meta.textModelId);
+     if (!textEntry) {
+       throw new Error(`EmbeddingEncoderAddon: unknown text model "${meta.textModelId}"`);
+     }
+     await this.resolveForEntry(textEntry, "text");
+   }
+   async resolveForEntry(entry, target) {
+     const runtime = this.config.runtime === "auto" ? "auto" : this.config.runtime === "node" ? "onnx" : this.config.runtime;
+     const modelsDir = this.models.getModelsDir();
+     const engineLogger = this.ctx.logger.withTags({
+       modelId: entry.id,
+       runtime: this.config.runtime,
+       backend: this.config.backend
+     });
+     await this.models.ensure(entry.id, "onnx");
+     const resolved = await resolveEngine({
+       runtime,
+       backend: this.config.backend,
+       modelEntry: entry,
+       modelsDir,
+       models: this.models ?? void 0,
+       logger: engineLogger
+     });
+     if (resolved.format !== "onnx") {
+       this.isPython = true;
+       if (target === "image") {
+         this.imagePythonEngine = resolved.engine;
+       } else {
+         this.textPythonEngine = resolved.engine;
+       }
+     } else {
+       const rawEngine = new NodeRawTensorEngine(resolved.modelPath, this.config.backend, engineLogger);
+       await rawEngine.initialize();
+       await resolved.engine.dispose();
+       if (target === "image") {
+         this.imageRawEngine = rawEngine;
+       } else {
+         this.textRawEngine = rawEngine;
+       }
+     }
+   }
+   async onShutdown() {
+     await this.imageRawEngine?.dispose();
+     await this.textRawEngine?.dispose();
+     await this.imagePythonEngine?.dispose();
+     await this.textPythonEngine?.dispose();
+   }
+   // ── Three-level settings API (Phase 3) ──────────────────────────────
+   globalSettingsSchema() {
+     return this.schema({
+       sections: [
+         {
+           id: "embedding-encoder-settings",
+           title: "Embedding Encoder",
+           columns: 2,
+           fields: [
+             {
+               type: "text",
+               key: "modelId",
+               label: "Model ID",
+               description: "CLIP model identifier to use for image/text embedding",
+               default: DEFAULT_CLIP_MODEL
+             },
+             {
+               type: "select",
+               key: "runtime",
+               label: "Runtime",
+               description: "Inference runtime (auto selects the best available)",
+               default: "auto",
+               options: [
+                 { label: "Auto", value: "auto" },
+                 { label: "Node (ONNX)", value: "node" },
+                 { label: "Python", value: "python" }
+               ]
+             },
+             {
+               type: "select",
+               key: "backend",
+               label: "Backend",
+               description: "Hardware backend for inference acceleration",
+               default: "cpu",
+               options: [
+                 { label: "CPU", value: "cpu" },
+                 { label: "CUDA", value: "cuda" },
+                 { label: "CoreML", value: "coreml" }
+               ]
+             }
+           ]
+         }
+       ]
+     });
+   }
+   async onConfigChanged() {
+   }
+ };
+ function clipTokenize(text, maxLength = 77) {
+   const SOT_TOKEN = 49406;
+   const EOT_TOKEN = 49407;
+   const tokens = [SOT_TOKEN];
+   for (let i = 0; i < text.length && tokens.length < maxLength - 1; i++) {
+     tokens.push(text.charCodeAt(i) + 256);
+   }
+   tokens.push(EOT_TOKEN);
+   while (tokens.length < maxLength) {
+     tokens.push(0);
+   }
+   return tokens;
+ }
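Note: `clipTokenize()` above is a naive byte-offset stand-in, not CLIP's BPE tokenizer, so ONNX-path text embeddings will not match the token distribution the text encoder was trained on (and CLIP text ONNX models typically expect int64 token ids rather than the float32 tensor fed in `encodeText()`). A sketch of real tokenization using transformers.js, which is not a dependency of this package and is shown purely as an assumed alternative:

```js
// Assumed alternative: @xenova/transformers provides CLIP's actual BPE tokenizer.
const { AutoTokenizer } = await import("@xenova/transformers");

const tokenizer = await AutoTokenizer.from_pretrained("Xenova/clip-vit-base-patch32");
const { input_ids } = await tokenizer("a photo of a cat", { padding: true, truncation: true });
```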
+ // Annotate the CommonJS export names for ESM import in node:
+ 0 && (module.exports = {
+   EmbeddingEncoderAddon
+ });
+ //# sourceMappingURL=index.js.map