@camstack/addon-scene-intelligence 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +87 -0
- package/dist/index.d.ts +87 -0
- package/dist/index.js +525 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +479 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +55 -0
package/dist/index.d.mts
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import { ICamstackAddon, ISceneIntelligence, AddonManifest, AddonContext, CapabilityProviderMap, CropInput, ClassifierOutput, EmbeddingMetadata, EmbeddingFilter, VectorSearchResult, SceneStateResult, IScopedLogger, IEmbeddingsBackend, ModelCatalogEntry } from '@camstack/types';

/**
 * Addon entry point. Implements the pipeline classifier slot (producing CLIP
 * embeddings) and the "scene-intelligence" capability (semantic text/image
 * search plus debounced scene-state detection).
 */
declare class SceneIntelligenceAddon implements ICamstackAddon, ISceneIntelligence {
    readonly manifest: AddonManifest;
    private logger;
    private imageEncoder;
    private textEncoder;
    private sceneStateMachine;
    private referenceStore;
    private searchService;
    private ctx;
    initialize(context: AddonContext): Promise<void>;
    shutdown(): Promise<void>;
    getCapabilityProvider<K extends keyof CapabilityProviderMap>(name: K): CapabilityProviderMap[K] | null;
    classify(input: CropInput): Promise<ClassifierOutput>;
    embed(deviceId: string, crop: Buffer, metadata: EmbeddingMetadata): Promise<string>;
    search(query: string, topK: number, filter?: EmbeddingFilter): Promise<VectorSearchResult[]>;
    searchByImage(image: Buffer, topK: number, filter?: EmbeddingFilter): Promise<VectorSearchResult[]>;
    evaluateSceneState(deviceId: string, crop: Buffer): Promise<SceneStateResult | null>;
    private ensureTextEncoder;
}

/** ONNX-backed CLIP image encoder; load() must be called before encode(). */
declare class ClipImageEncoder {
    private session;
    private readonly modelId;
    private readonly inputSize;
    private readonly logger;
    constructor(modelId: string, logger: IScopedLogger);
    load(modelPath: string): Promise<void>;
    /**
     * Encode a raw RGB buffer into a CLIP embedding.
     * Caller must provide RGB buffer (use sharp to decode JPEG first).
     */
    encode(rgb: Buffer, width: number, height: number): Promise<Float32Array>;
    dispose(): Promise<void>;
}

/** ONNX-backed CLIP text encoder (loaded lazily on first text search). */
declare class ClipTextEncoder {
    private session;
    private readonly logger;
    constructor(logger: IScopedLogger);
    load(modelPath: string): Promise<void>;
    encode(text: string): Promise<Float32Array>;
    dispose(): Promise<void>;
}

/** Vector-index facade over the host's embeddings backend. */
declare class SearchService {
    private readonly backend;
    private index;
    constructor(backend: IEmbeddingsBackend);
    initialize(): Promise<void>;
    storeEmbedding(id: string, embedding: Float32Array, metadata: EmbeddingMetadata): Promise<void>;
    searchByVector(query: Float32Array, topK: number, filter?: EmbeddingFilter): Promise<readonly VectorSearchResult[]>;
    count(): Promise<number>;
    shutdown(): Promise<void>;
}

/** A reference scene definition with its cosine-similarity match threshold. */
interface SceneStateDefinition {
    readonly id: string;
    readonly name: string;
    readonly referenceEmbedding: Float32Array;
    readonly threshold: number;
}
/** Debounced per-camera scene-state tracker; reports only state transitions. */
declare class SceneStateMachine {
    private readonly debounceFrames;
    private readonly cameraStates;
    constructor(debounceFrames?: number);
    evaluate(deviceId: string, embedding: Float32Array, referenceStates: readonly SceneStateDefinition[]): SceneStateResult | null;
}

/** Catalog of downloadable CLIP image/text encoder models. */
declare const CLIP_MODELS: readonly ModelCatalogEntry[];
declare const DEFAULT_CLIP_MODEL = "mobileclip-s0";
declare const CLIP_EMBEDDING_DIM = 512;

/**
 * Preprocess raw RGB buffer for CLIP inference.
 * Resizes (nearest-neighbor for speed), normalizes with CLIP mean/std, outputs NCHW Float32Array.
 * For production use, the caller should use sharp to resize the JPEG to targetW×targetH
 * before calling this with the raw RGB. This function handles normalization + layout.
 */
declare function preprocessForClip(rgb: Buffer, srcWidth: number, srcHeight: number, targetWidth: number, targetHeight: number): Float32Array;
/**
 * L2-normalize a vector in-place and return it.
 */
declare function l2Normalize(vec: Float32Array): Float32Array;

export { CLIP_EMBEDDING_DIM, CLIP_MODELS, ClipImageEncoder, ClipTextEncoder, DEFAULT_CLIP_MODEL, SceneIntelligenceAddon, SceneStateMachine, SearchService, SceneIntelligenceAddon as default, l2Normalize, preprocessForClip };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import { ICamstackAddon, ISceneIntelligence, AddonManifest, AddonContext, CapabilityProviderMap, CropInput, ClassifierOutput, EmbeddingMetadata, EmbeddingFilter, VectorSearchResult, SceneStateResult, IScopedLogger, IEmbeddingsBackend, ModelCatalogEntry } from '@camstack/types';

/**
 * Addon entry point. Implements the pipeline classifier slot (producing CLIP
 * embeddings) and the "scene-intelligence" capability (semantic text/image
 * search plus debounced scene-state detection).
 */
declare class SceneIntelligenceAddon implements ICamstackAddon, ISceneIntelligence {
    readonly manifest: AddonManifest;
    private logger;
    private imageEncoder;
    private textEncoder;
    private sceneStateMachine;
    private referenceStore;
    private searchService;
    private ctx;
    initialize(context: AddonContext): Promise<void>;
    shutdown(): Promise<void>;
    getCapabilityProvider<K extends keyof CapabilityProviderMap>(name: K): CapabilityProviderMap[K] | null;
    classify(input: CropInput): Promise<ClassifierOutput>;
    embed(deviceId: string, crop: Buffer, metadata: EmbeddingMetadata): Promise<string>;
    search(query: string, topK: number, filter?: EmbeddingFilter): Promise<VectorSearchResult[]>;
    searchByImage(image: Buffer, topK: number, filter?: EmbeddingFilter): Promise<VectorSearchResult[]>;
    evaluateSceneState(deviceId: string, crop: Buffer): Promise<SceneStateResult | null>;
    private ensureTextEncoder;
}

/** ONNX-backed CLIP image encoder; load() must be called before encode(). */
declare class ClipImageEncoder {
    private session;
    private readonly modelId;
    private readonly inputSize;
    private readonly logger;
    constructor(modelId: string, logger: IScopedLogger);
    load(modelPath: string): Promise<void>;
    /**
     * Encode a raw RGB buffer into a CLIP embedding.
     * Caller must provide RGB buffer (use sharp to decode JPEG first).
     */
    encode(rgb: Buffer, width: number, height: number): Promise<Float32Array>;
    dispose(): Promise<void>;
}

/** ONNX-backed CLIP text encoder (loaded lazily on first text search). */
declare class ClipTextEncoder {
    private session;
    private readonly logger;
    constructor(logger: IScopedLogger);
    load(modelPath: string): Promise<void>;
    encode(text: string): Promise<Float32Array>;
    dispose(): Promise<void>;
}

/** Vector-index facade over the host's embeddings backend. */
declare class SearchService {
    private readonly backend;
    private index;
    constructor(backend: IEmbeddingsBackend);
    initialize(): Promise<void>;
    storeEmbedding(id: string, embedding: Float32Array, metadata: EmbeddingMetadata): Promise<void>;
    searchByVector(query: Float32Array, topK: number, filter?: EmbeddingFilter): Promise<readonly VectorSearchResult[]>;
    count(): Promise<number>;
    shutdown(): Promise<void>;
}

/** A reference scene definition with its cosine-similarity match threshold. */
interface SceneStateDefinition {
    readonly id: string;
    readonly name: string;
    readonly referenceEmbedding: Float32Array;
    readonly threshold: number;
}
/** Debounced per-camera scene-state tracker; reports only state transitions. */
declare class SceneStateMachine {
    private readonly debounceFrames;
    private readonly cameraStates;
    constructor(debounceFrames?: number);
    evaluate(deviceId: string, embedding: Float32Array, referenceStates: readonly SceneStateDefinition[]): SceneStateResult | null;
}

/** Catalog of downloadable CLIP image/text encoder models. */
declare const CLIP_MODELS: readonly ModelCatalogEntry[];
declare const DEFAULT_CLIP_MODEL = "mobileclip-s0";
declare const CLIP_EMBEDDING_DIM = 512;

/**
 * Preprocess raw RGB buffer for CLIP inference.
 * Resizes (nearest-neighbor for speed), normalizes with CLIP mean/std, outputs NCHW Float32Array.
 * For production use, the caller should use sharp to resize the JPEG to targetW×targetH
 * before calling this with the raw RGB. This function handles normalization + layout.
 */
declare function preprocessForClip(rgb: Buffer, srcWidth: number, srcHeight: number, targetWidth: number, targetHeight: number): Float32Array;
/**
 * L2-normalize a vector in-place and return it.
 */
declare function l2Normalize(vec: Float32Array): Float32Array;

export { CLIP_EMBEDDING_DIM, CLIP_MODELS, ClipImageEncoder, ClipTextEncoder, DEFAULT_CLIP_MODEL, SceneIntelligenceAddon, SceneStateMachine, SearchService, SceneIntelligenceAddon as default, l2Normalize, preprocessForClip };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,525 @@
|
|
|
1
|
+
"use strict";
// esbuild-generated CommonJS interop helpers (machine-generated; do not edit by hand).
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Defines a lazy enumerable getter on `target` for every entry of `all`.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
// Copies enumerable own properties from `from` onto `to` as live getters,
// skipping `except` and any key already present on `to`.
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
// Wraps a CommonJS module so it can be consumed through ESM import syntax.
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
  mod
));
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);

// src/index.ts
// Public export surface of the bundle; each entry resolves lazily via a getter,
// so the order of definitions below does not matter.
var src_exports = {};
__export(src_exports, {
  CLIP_EMBEDDING_DIM: () => CLIP_EMBEDDING_DIM,
  CLIP_MODELS: () => CLIP_MODELS,
  ClipImageEncoder: () => ClipImageEncoder,
  ClipTextEncoder: () => ClipTextEncoder,
  DEFAULT_CLIP_MODEL: () => DEFAULT_CLIP_MODEL,
  SceneIntelligenceAddon: () => SceneIntelligenceAddon,
  SceneStateMachine: () => SceneStateMachine,
  SearchService: () => SearchService,
  default: () => SceneIntelligenceAddon,
  l2Normalize: () => l2Normalize,
  preprocessForClip: () => preprocessForClip
});
module.exports = __toCommonJS(src_exports);
|
|
46
|
+
|
|
47
|
+
// src/clip/preprocessing.ts
var CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073];
var CLIP_STD = [0.26862954, 0.26130258, 0.27577711];
/**
 * Convert a raw interleaved RGB buffer into a planar (NCHW) Float32Array
 * normalized with the standard CLIP mean/std. Resampling is nearest-neighbor
 * (fast, low quality) — callers wanting better quality should resize with
 * sharp first and pass matching src/target dimensions.
 */
function preprocessForClip(rgb, srcWidth, srcHeight, targetWidth, targetHeight) {
  const planeSize = targetWidth * targetHeight;
  const out = new Float32Array(3 * planeSize);
  for (let row = 0; row < targetHeight; row++) {
    // Nearest source row for this destination row.
    const sampleRow = Math.min(Math.floor(row / targetHeight * srcHeight), srcHeight - 1);
    for (let col = 0; col < targetWidth; col++) {
      const sampleCol = Math.min(Math.floor(col / targetWidth * srcWidth), srcWidth - 1);
      const src = (sampleRow * srcWidth + sampleCol) * 3;
      const dst = row * targetWidth + col;
      // Channel-planar output: full R plane, then G, then B.
      for (let ch = 0; ch < 3; ch++) {
        const v = (rgb[src + ch] ?? 0) / 255;
        out[ch * planeSize + dst] = (v - CLIP_MEAN[ch]) / CLIP_STD[ch];
      }
    }
  }
  return out;
}
|
|
69
|
+
/**
 * L2-normalize a vector in place and return it.
 * A zero vector is returned unchanged (avoids division by zero).
 */
function l2Normalize(vec) {
  const norm = Math.sqrt(vec.reduce((acc, v) => acc + v * v, 0));
  if (norm > 0) {
    for (let i = 0; i < vec.length; i++) vec[i] /= norm;
  }
  return vec;
}
|
|
78
|
+
|
|
79
|
+
// src/clip/models.ts
/**
 * Catalog of CLIP encoder models downloadable by the host's model manager.
 * Image/text encoder pairs share an id prefix; the text variant is the image
 * id plus a "-text" suffix (see getTextModelId below).
 */
var CLIP_MODELS = [
  {
    id: "mobileclip-s0",
    name: "MobileCLIP-S0",
    description: "Lightweight CLIP model optimized for edge devices (~60MB)",
    inputSize: { width: 256, height: 256 },
    labels: [],
    inputLayout: "nchw",
    inputNormalization: "none",
    // We handle CLIP normalization ourselves
    formats: {
      onnx: {
        url: "https://huggingface.co/nicovdw/mobileclip-s0-onnx/resolve/main/image_encoder.onnx",
        sizeMB: 60
      }
    }
  },
  {
    id: "mobileclip-s0-text",
    name: "MobileCLIP-S0 Text Encoder",
    description: "Text encoder for MobileCLIP-S0",
    inputSize: { width: 0, height: 0 },
    labels: [],
    formats: {
      onnx: {
        url: "https://huggingface.co/nicovdw/mobileclip-s0-onnx/resolve/main/text_encoder.onnx",
        sizeMB: 65
      }
    }
  },
  {
    id: "clip-vit-b32",
    name: "CLIP ViT-B/32",
    description: "Standard OpenAI CLIP model, higher accuracy (~340MB)",
    inputSize: { width: 224, height: 224 },
    labels: [],
    inputLayout: "nchw",
    inputNormalization: "none",
    formats: {
      onnx: {
        url: "https://huggingface.co/nicovdw/clip-vit-b32-onnx/resolve/main/image_encoder.onnx",
        sizeMB: 340
      }
    }
  },
  {
    id: "clip-vit-b32-text",
    name: "CLIP ViT-B/32 Text Encoder",
    description: "Text encoder for CLIP ViT-B/32",
    inputSize: { width: 0, height: 0 },
    labels: [],
    formats: {
      onnx: {
        url: "https://huggingface.co/nicovdw/clip-vit-b32-onnx/resolve/main/text_encoder.onnx",
        sizeMB: 170
      }
    }
  }
];
var DEFAULT_CLIP_MODEL = "mobileclip-s0";
var CLIP_EMBEDDING_DIM = 512;
var MOBILECLIP_INPUT_SIZE = 256;
var VITB32_INPUT_SIZE = 224;
/** Square input resolution expected by the given image-encoder model. */
function getInputSize(modelId) {
  return modelId.startsWith("mobileclip") ? MOBILECLIP_INPUT_SIZE : VITB32_INPUT_SIZE;
}
/** Id of the text-encoder counterpart for an image-encoder model id. */
function getTextModelId(imageModelId) {
  return imageModelId + "-text";
}
|
|
150
|
+
|
|
151
|
+
// src/clip/image-encoder.ts
/**
 * CLIP image encoder backed by onnxruntime-node. The runtime is imported
 * dynamically so the bundle loads even when onnxruntime is unavailable;
 * load() must succeed before encode() may be called.
 */
var ClipImageEncoder = class {
  session = null;
  // onnxruntime InferenceSession
  modelId;
  inputSize;
  logger;
  constructor(modelId, logger) {
    this.modelId = modelId;
    this.inputSize = getInputSize(modelId);
    this.logger = logger;
  }
  /** Load the ONNX image-encoder model from disk (CPU execution provider). */
  async load(modelPath) {
    const runtime = await import("onnxruntime-node");
    this.session = await runtime.InferenceSession.create(modelPath, {
      executionProviders: ["cpu"]
    });
    this.logger.info(`CLIP image encoder loaded: ${this.modelId} (${this.inputSize}\xD7${this.inputSize})`);
  }
  /**
   * Encode a raw RGB buffer into a CLIP embedding.
   * Caller must provide RGB buffer (use sharp to decode JPEG first).
   */
  async encode(rgb, width, height) {
    if (!this.session) throw new Error("Image encoder not loaded");
    const runtime = await import("onnxruntime-node");
    const pixels = preprocessForClip(rgb, width, height, this.inputSize, this.inputSize);
    const tensor = new runtime.Tensor("float32", pixels, [1, 3, this.inputSize, this.inputSize]);
    const outputs = await this.session.run({ [this.session.inputNames[0]]: tensor });
    const raw = outputs[this.session.outputNames[0]].data;
    // NOTE(review): assumes the model's first output holds at least
    // CLIP_EMBEDDING_DIM values — confirm against the exported ONNX graphs.
    return l2Normalize(new Float32Array(raw.slice(0, CLIP_EMBEDDING_DIM)));
  }
  /** Release the ONNX session (idempotent). */
  async dispose() {
    if (this.session) {
      await this.session.release?.();
      this.session = null;
    }
  }
};
|
|
195
|
+
|
|
196
|
+
// src/clip/tokenizer.ts
var SOT_TOKEN = 49406;
var EOT_TOKEN = 49407;
var MAX_LENGTH = 77;
/**
 * Tokenize text into a fixed-length (77) int64 sequence:
 * [SOT, per-character codes, EOT, zero padding].
 *
 * Fix: content is truncated BEFORE appending EOT so the end-of-text marker
 * always survives. Previously, inputs of ~76+ chars pushed all codes, then
 * EOT, then truncated to 77 — silently dropping the EOT token.
 *
 * NOTE(review): this byte-level scheme (charCode + 1, chars >= U+0100
 * skipped) is NOT the real CLIP BPE tokenizer; confirm the deployed text
 * encoder was exported to accept this vocabulary.
 */
function tokenize(text) {
  const tokens = [SOT_TOKEN];
  const cleaned = text.toLowerCase().trim();
  for (const char of cleaned) {
    if (tokens.length >= MAX_LENGTH - 1) break;
    // Reserve the final slot for EOT.
    const code = char.codePointAt(0) ?? 0;
    if (code < 256) {
      tokens.push(code + 1);
    }
  }
  tokens.push(EOT_TOKEN);
  while (tokens.length < MAX_LENGTH) tokens.push(0);
  return BigInt64Array.from(tokens.map((t) => BigInt(t)));
}
|
|
214
|
+
|
|
215
|
+
// src/clip/text-encoder.ts
/**
 * CLIP text encoder backed by onnxruntime-node. Mirrors ClipImageEncoder:
 * load() must succeed before encode() may be called.
 */
var ClipTextEncoder = class {
  session = null;
  logger;
  constructor(logger) {
    this.logger = logger;
  }
  /** Load the ONNX text-encoder model from disk (CPU execution provider). */
  async load(modelPath) {
    const runtime = await import("onnxruntime-node");
    this.session = await runtime.InferenceSession.create(modelPath, {
      executionProviders: ["cpu"]
    });
    this.logger.info("CLIP text encoder loaded");
  }
  /** Encode a text query into an L2-normalized CLIP embedding. */
  async encode(text) {
    if (!this.session) throw new Error("Text encoder not loaded");
    const runtime = await import("onnxruntime-node");
    const tensor = new runtime.Tensor("int64", tokenize(text), [1, 77]);
    const outputs = await this.session.run({ [this.session.inputNames[0]]: tensor });
    const raw = outputs[this.session.outputNames[0]].data;
    return l2Normalize(new Float32Array(raw.slice(0, CLIP_EMBEDDING_DIM)));
  }
  /** Release the ONNX session (idempotent). */
  async dispose() {
    if (this.session) {
      await this.session.release?.();
      this.session = null;
    }
  }
};
|
|
250
|
+
|
|
251
|
+
// src/scene-state/state-machine.ts
var import_types = require("@camstack/types");
/**
 * Debounced per-camera scene-state tracker. Each frame's embedding is
 * compared against reference-state embeddings by cosine similarity; a
 * transition is reported only after the same state wins for
 * `debounceFrames` consecutive evaluations, and only when it differs from
 * the camera's current state.
 */
var SceneStateMachine = class {
  debounceFrames;
  cameraStates = /* @__PURE__ */ new Map();
  constructor(debounceFrames = 3) {
    this.debounceFrames = debounceFrames;
  }
  /**
   * Evaluate one frame. Returns a transition result, or null when nothing
   * matched, the debounce window is still filling, or the state is unchanged.
   */
  evaluate(deviceId, embedding, referenceStates) {
    if (referenceStates.length === 0) return null;
    let tracker = this.cameraStates.get(deviceId);
    if (!tracker) {
      tracker = { currentState: null, pendingState: null, pendingCount: 0, pendingConfidence: 0 };
      this.cameraStates.set(deviceId, tracker);
    }
    // Pick the highest-similarity reference that clears its own threshold.
    const cosine = import_types.cosineSimilarity;
    let winner = null;
    let winnerScore = -1;
    for (const ref of referenceStates) {
      const score = cosine(embedding, ref.referenceEmbedding);
      if (score >= ref.threshold && score > winnerScore) {
        winnerScore = score;
        winner = ref.name;
      }
    }
    if (!winner) return null;
    // Debounce: the same winner must repeat across consecutive frames.
    if (winner === tracker.pendingState) {
      tracker.pendingCount++;
    } else {
      tracker.pendingState = winner;
      tracker.pendingCount = 1;
    }
    tracker.pendingConfidence = winnerScore;
    if (tracker.pendingCount < this.debounceFrames) return null;
    if (winner === tracker.currentState) return null;
    const previousState = tracker.currentState ?? "unknown";
    tracker.currentState = winner;
    tracker.pendingState = null;
    tracker.pendingCount = 0;
    return {
      previousState,
      currentState: winner,
      confidence: winnerScore
    };
  }
};
|
|
297
|
+
|
|
298
|
+
// src/scene-state/reference-store.ts
var COLLECTION = "device-settings";
/**
 * Persists per-camera scene-state definitions in the host settings backend.
 * Embeddings are round-tripped through plain number arrays (presumably the
 * backend is JSON-based — confirm) and restored as Float32Array, with a
 * default threshold of 0.7 for legacy entries missing one.
 */
var ReferenceStore = class {
  settingsBackend;
  constructor(settingsBackend) {
    this.settingsBackend = settingsBackend;
  }
  /** Load scene-state definitions for one camera; [] when none are stored. */
  async getStatesForCamera(deviceId) {
    const raw = await this.settingsBackend.get(COLLECTION, `scene-states:${deviceId}`);
    if (!raw || !Array.isArray(raw)) return [];
    return raw.map((entry) => ({
      id: entry.id,
      name: entry.name,
      referenceEmbedding: new Float32Array(entry.referenceEmbedding),
      threshold: entry.threshold ?? 0.7
    }));
  }
  /** Replace the stored scene-state definitions for one camera. */
  async setStatesForCamera(deviceId, states) {
    const payload = states.map((state) => ({
      id: state.id,
      name: state.name,
      referenceEmbedding: Array.from(state.referenceEmbedding),
      threshold: state.threshold
    }));
    await this.settingsBackend.set(COLLECTION, `scene-states:${deviceId}`, payload);
  }
};
|
|
325
|
+
|
|
326
|
+
// src/search/search-service.ts
var INDEX_NAME = "clip-embeddings";
/**
 * Thin facade over the host's embeddings backend for the CLIP vector index.
 * Every method is a safe no-op (empty result / zero) until initialize() has
 * opened the index, and again after shutdown().
 */
var SearchService = class {
  backend;
  index = null;
  constructor(backend) {
    this.backend = backend;
  }
  /** Open (or create) the CLIP embedding index on the backend. */
  async initialize() {
    this.index = await this.backend.openIndex(INDEX_NAME, CLIP_EMBEDDING_DIM);
  }
  /** Insert one embedding; silently skipped when the index is not open. */
  async storeEmbedding(id, embedding, metadata) {
    if (this.index) {
      await this.index.insert(id, embedding, metadata);
    }
  }
  /** k-NN search by vector; [] when the index is not open. */
  async searchByVector(query, topK, filter) {
    return this.index ? this.index.search(query, topK, filter) : [];
  }
  /** Number of stored embeddings; 0 when the index is not open. */
  async count() {
    return this.index ? this.index.count() : 0;
  }
  /** Flush pending writes and drop the index handle. */
  async shutdown() {
    await this.index?.flush();
    this.index = null;
  }
};
|
|
353
|
+
|
|
354
|
+
// src/addon.ts
/**
 * Scene Intelligence addon. Wires the CLIP encoders, vector search and
 * scene-state machine into the host: acts as the pipeline "classifier" slot
 * (emitting embeddings as pseudo-classifications) and exposes the
 * "scene-intelligence" capability (embed / search / searchByImage /
 * evaluateSceneState).
 *
 * Fixes vs previous revision:
 * - classify() now reports the model id actually in effect
 *   (ctx.addonConfig["modelId"], as used by initialize/ensureTextEncoder)
 *   instead of always reporting the manifest default.
 * - the fire-and-forget embedding store logs failures at debug level
 *   instead of swallowing them silently (still best-effort).
 */
var SceneIntelligenceAddon = class {
  manifest = {
    id: "scene-intelligence",
    name: "Scene Intelligence",
    version: "0.1.0",
    description: "CLIP embeddings, semantic search, and scene state detection",
    slot: "classifier",
    inputClasses: [],
    outputClasses: [],
    labelOutputType: "classification",
    passive: false,
    capabilities: [
      { name: "scene-intelligence", mode: "singleton" }
    ],
    defaultConfig: {
      modelId: DEFAULT_CLIP_MODEL,
      minConfidence: 0.5
    }
  };
  logger;
  imageEncoder = null;
  textEncoder = null;
  sceneStateMachine = null;
  referenceStore = null;
  searchService = null;
  ctx = null;
  /**
   * Load the image encoder (best-effort) and wire optional host backends.
   * A missing model or backend degrades features rather than failing init.
   */
  async initialize(context) {
    this.ctx = context;
    this.logger = context.logger;
    const modelId = context.addonConfig["modelId"] ?? DEFAULT_CLIP_MODEL;
    if (context.models) {
      try {
        const imagePath = await context.models.ensure(modelId, "onnx");
        this.imageEncoder = new ClipImageEncoder(modelId, this.logger);
        await this.imageEncoder.load(imagePath);
      } catch (err) {
        // Degrade gracefully: classify/embed become no-ops without the encoder.
        this.logger.warn(`Failed to load CLIP image encoder: ${err}`);
      }
    }
    this.sceneStateMachine = new SceneStateMachine(3);
    if (context.settingsBackend) {
      this.referenceStore = new ReferenceStore(context.settingsBackend);
    }
    if (context.embeddingsBackend) {
      this.searchService = new SearchService(context.embeddingsBackend);
      await this.searchService.initialize();
    }
    this.logger.info(`Scene Intelligence initialized (model=${modelId})`);
  }
  /** Dispose encoders, flush the search index, and drop all references. */
  async shutdown() {
    await this.imageEncoder?.dispose();
    await this.textEncoder?.dispose();
    await this.searchService?.shutdown();
    this.imageEncoder = null;
    this.textEncoder = null;
    this.searchService = null;
    this.sceneStateMachine = null;
    this.referenceStore = null;
    this.ctx = null;
  }
  /** The addon itself implements the scene-intelligence capability. */
  getCapabilityProvider(name) {
    if (name === "scene-intelligence") {
      return this;
    }
    return null;
  }
  // --- IClassifierProvider (pipeline classifier slot) ---
  /**
   * Crop the detection ROI, CLIP-encode it, and return the embedding as a
   * single pseudo-classification. Best-effort: any failure yields an empty
   * result rather than breaking the pipeline.
   */
  async classify(input) {
    if (!this.imageEncoder) {
      return { classifications: [], inferenceMs: 0, modelId: "none" };
    }
    const start = performance.now();
    try {
      const sharp = await import("sharp").then((m) => m.default ?? m);
      const { data, info } = await sharp(input.frame.data).extract({
        left: Math.round(input.roi.x),
        top: Math.round(input.roi.y),
        width: Math.round(input.roi.w),
        height: Math.round(input.roi.h)
      }).removeAlpha().raw().toBuffer({ resolveWithObject: true });
      const embedding = await this.imageEncoder.encode(data, info.width, info.height);
      const inferenceMs = performance.now() - start;
      if (this.searchService) {
        // Fire-and-forget persistence: indexing failures must not fail
        // classification, but they should at least be visible in debug logs.
        const embeddingId = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
        void this.searchService.storeEmbedding(embeddingId, embedding, {
          timestamp: Date.now(),
          className: input.parentDetection.class,
          // NOTE(review): deviceId is not available on CropInput here — confirm
          // whether it can be plumbed through from the pipeline.
          deviceId: ""
        }).catch((err) => {
          this.logger.debug(`Embedding store failed: ${err}`);
        });
      }
      return {
        classifications: [{
          class: "clip-embedding",
          score: 1,
          embedding,
          metadata: { embeddingDim: embedding.length }
        }],
        inferenceMs,
        // Report the model actually configured, not the manifest default.
        modelId: this.configuredModelId()
      };
    } catch (err) {
      this.logger.debug(`CLIP classify failed: ${err}`);
      return { classifications: [], inferenceMs: performance.now() - start, modelId: "error" };
    }
  }
  // --- ISceneIntelligence ---
  /** Encode a crop and persist its embedding; returns the new embedding id. */
  async embed(deviceId, crop, metadata) {
    if (!this.imageEncoder || !this.searchService) throw new Error("Not initialized");
    const { data, info } = await this.decodeRgb(crop);
    const embedding = await this.imageEncoder.encode(data, info.width, info.height);
    const id = `${deviceId}/${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
    await this.searchService.storeEmbedding(id, embedding, metadata);
    return id;
  }
  /** Semantic text search over stored embeddings (lazy-loads the text encoder). */
  async search(query, topK, filter) {
    if (!this.searchService) return [];
    const textEncoder = await this.ensureTextEncoder();
    if (!textEncoder) return [];
    const queryEmbedding = await textEncoder.encode(query);
    const results = await this.searchService.searchByVector(queryEmbedding, topK, filter);
    return [...results];
  }
  /** Similarity search using an example image as the query. */
  async searchByImage(image, topK, filter) {
    if (!this.imageEncoder || !this.searchService) return [];
    const { data, info } = await this.decodeRgb(image);
    const embedding = await this.imageEncoder.encode(data, info.width, info.height);
    const results = await this.searchService.searchByVector(embedding, topK, filter);
    return [...results];
  }
  /** Run the debounced scene-state machine for one camera frame. */
  async evaluateSceneState(deviceId, crop) {
    if (!this.imageEncoder || !this.sceneStateMachine || !this.referenceStore) return null;
    const { data, info } = await this.decodeRgb(crop);
    const embedding = await this.imageEncoder.encode(data, info.width, info.height);
    const states = await this.referenceStore.getStatesForCamera(deviceId);
    return this.sceneStateMachine.evaluate(deviceId, embedding, states);
  }
  // --- Private ---
  /** Decode an encoded image (e.g. JPEG) to raw interleaved RGB via sharp. */
  async decodeRgb(image) {
    const sharp = await import("sharp").then((m) => m.default ?? m);
    return sharp(image).removeAlpha().raw().toBuffer({ resolveWithObject: true });
  }
  /** The model id actually in effect (addon config overrides the default). */
  configuredModelId() {
    return this.ctx?.addonConfig["modelId"] ?? DEFAULT_CLIP_MODEL;
  }
  /** Lazily load the text encoder matching the configured image model. */
  async ensureTextEncoder() {
    if (this.textEncoder) return this.textEncoder;
    if (!this.ctx?.models) return null;
    const textModelId = getTextModelId(this.configuredModelId());
    try {
      const textPath = await this.ctx.models.ensure(textModelId, "onnx");
      this.textEncoder = new ClipTextEncoder(this.logger);
      await this.textEncoder.load(textPath);
      return this.textEncoder;
    } catch (err) {
      this.logger.warn(`Failed to load CLIP text encoder: ${err}`);
      return null;
    }
  }
};
|
|
512
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
513
|
+
0 && (module.exports = {
|
|
514
|
+
CLIP_EMBEDDING_DIM,
|
|
515
|
+
CLIP_MODELS,
|
|
516
|
+
ClipImageEncoder,
|
|
517
|
+
ClipTextEncoder,
|
|
518
|
+
DEFAULT_CLIP_MODEL,
|
|
519
|
+
SceneIntelligenceAddon,
|
|
520
|
+
SceneStateMachine,
|
|
521
|
+
SearchService,
|
|
522
|
+
l2Normalize,
|
|
523
|
+
preprocessForClip
|
|
524
|
+
});
|
|
525
|
+
//# sourceMappingURL=index.js.map
|