@elizaos/plugin-vision 2.0.0-beta.1 → 2.0.3-beta.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +73 -301
- package/dist/action.d.ts +3 -0
- package/dist/action.d.ts.map +1 -0
- package/dist/audio-capture-stream.d.ts +42 -0
- package/dist/audio-capture-stream.d.ts.map +1 -0
- package/dist/audio-capture.d.ts +25 -0
- package/dist/audio-capture.d.ts.map +1 -0
- package/dist/computeruse-ocr-bridge.d.ts +50 -0
- package/dist/computeruse-ocr-bridge.d.ts.map +1 -0
- package/dist/config.d.ts +68 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/describe-backpressure.d.ts +90 -0
- package/dist/describe-backpressure.d.ts.map +1 -0
- package/dist/dirty-tile-describer.d.ts +102 -0
- package/dist/dirty-tile-describer.d.ts.map +1 -0
- package/dist/dirty-tile-scene.d.ts +56 -0
- package/dist/dirty-tile-scene.d.ts.map +1 -0
- package/dist/entity-tracker.d.ts +33 -0
- package/dist/entity-tracker.d.ts.map +1 -0
- package/dist/face-detector-ggml.d.ts +60 -0
- package/dist/face-detector-ggml.d.ts.map +1 -0
- package/dist/face-detector-mediapipe.d.ts +25 -0
- package/dist/face-detector-mediapipe.d.ts.map +1 -0
- package/dist/face-recognition-ggml.d.ts +94 -0
- package/dist/face-recognition-ggml.d.ts.map +1 -0
- package/dist/get-screen-elements.d.ts +90 -0
- package/dist/get-screen-elements.d.ts.map +1 -0
- package/dist/get-screen.d.ts +60 -0
- package/dist/get-screen.d.ts.map +1 -0
- package/dist/image/sharp-compat.d.ts +89 -0
- package/dist/image/sharp-compat.d.ts.map +1 -0
- package/dist/image-input.d.ts +15 -0
- package/dist/image-input.d.ts.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +7957 -6238
- package/dist/index.js.map +41 -26
- package/dist/lifecycle.d.ts +94 -0
- package/dist/lifecycle.d.ts.map +1 -0
- package/dist/mobile/capacitor-camera.d.ts +85 -0
- package/dist/mobile/capacitor-camera.d.ts.map +1 -0
- package/dist/native/doctr-ffi.d.ts +40 -0
- package/dist/native/doctr-ffi.d.ts.map +1 -0
- package/dist/native/yolo-ffi.d.ts +21 -0
- package/dist/native/yolo-ffi.d.ts.map +1 -0
- package/dist/ocr-host-windows.d.ts +34 -0
- package/dist/ocr-host-windows.d.ts.map +1 -0
- package/dist/ocr-service-apple-vision-macos.d.ts +51 -0
- package/dist/ocr-service-apple-vision-macos.d.ts.map +1 -0
- package/dist/ocr-service-doctr.d.ts +61 -0
- package/dist/ocr-service-doctr.d.ts.map +1 -0
- package/dist/ocr-service-linux-tesseract.d.ts +85 -0
- package/dist/ocr-service-linux-tesseract.d.ts.map +1 -0
- package/dist/ocr-service-paddleocr.d.ts +59 -0
- package/dist/ocr-service-paddleocr.d.ts.map +1 -0
- package/dist/ocr-service-windows.d.ts +41 -0
- package/dist/ocr-service-windows.d.ts.map +1 -0
- package/dist/ocr-service.d.ts +91 -0
- package/dist/ocr-service.d.ts.map +1 -0
- package/dist/ocr-with-coords.d.ts +103 -0
- package/dist/ocr-with-coords.d.ts.map +1 -0
- package/dist/person-detector.d.ts +17 -0
- package/dist/person-detector.d.ts.map +1 -0
- package/dist/provider.d.ts +3 -0
- package/dist/provider.d.ts.map +1 -0
- package/dist/routes.d.ts +7 -0
- package/dist/routes.d.ts.map +1 -0
- package/dist/screen-capture-bridge.d.ts +51 -0
- package/dist/screen-capture-bridge.d.ts.map +1 -0
- package/dist/screen-capture.d.ts +17 -0
- package/dist/screen-capture.d.ts.map +1 -0
- package/dist/screen-tiler.d.ts +75 -0
- package/dist/screen-tiler.d.ts.map +1 -0
- package/dist/service.d.ts +176 -0
- package/dist/service.d.ts.map +1 -0
- package/dist/set-of-marks-provider.d.ts +64 -0
- package/dist/set-of-marks-provider.d.ts.map +1 -0
- package/dist/som.d.ts +135 -0
- package/dist/som.d.ts.map +1 -0
- package/dist/som.js +184 -0
- package/dist/som.js.map +11 -0
- package/dist/test-input.d.ts +25 -0
- package/dist/test-input.d.ts.map +1 -0
- package/dist/types.d.ts +241 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/vision-context-augmenter.d.ts +93 -0
- package/dist/vision-context-augmenter.d.ts.map +1 -0
- package/dist/vision-worker-manager.d.ts +51 -0
- package/dist/vision-worker-manager.d.ts.map +1 -0
- package/dist/workers/ocr-worker.d.ts +2 -0
- package/dist/workers/ocr-worker.d.ts.map +1 -0
- package/dist/workers/ocr-worker.js +1075 -7821
- package/dist/workers/ocr-worker.js.map +10 -51
- package/dist/workers/screen-capture-worker.d.ts +2 -0
- package/dist/workers/screen-capture-worker.d.ts.map +1 -0
- package/dist/workers/screen-capture-worker.js +364 -6
- package/dist/workers/screen-capture-worker.js.map +5 -4
- package/dist/workers/worker-logger.d.ts +10 -0
- package/dist/workers/worker-logger.d.ts.map +1 -0
- package/dist/yolo-detector.d.ts +37 -0
- package/dist/yolo-detector.d.ts.map +1 -0
- package/native/doctr.cpp/CMakeLists.txt +58 -0
- package/native/doctr.cpp/README.md +62 -0
- package/native/doctr.cpp/include/doctr.h +91 -0
- package/native/doctr.cpp/scripts/convert.py +98 -0
- package/native/doctr.cpp/src/doctr_det.cpp +112 -0
- package/native/doctr.cpp/src/doctr_rec.cpp +103 -0
- package/native/macos-vision-ocr.swift +113 -0
- package/native/mobilefacenet.cpp/README.md +13 -0
- package/native/movenet.cpp/README.md +10 -0
- package/native/retinaface.cpp/README.md +12 -0
- package/native/yolo.cpp/CMakeLists.txt +57 -0
- package/native/yolo.cpp/README.md +64 -0
- package/native/yolo.cpp/build.mjs +76 -0
- package/native/yolo.cpp/include/yolo.h +62 -0
- package/native/yolo.cpp/scripts/convert.py +248 -0
- package/native/yolo.cpp/src/yolo.cpp +425 -0
- package/native/yolo.cpp/verify/compare.py +99 -0
- package/native/yolo.cpp/verify/make_ref.py +75 -0
- package/native/yolo.cpp/verify/run_ggml.mjs +78 -0
- package/native/yolo.cpp/verify/run_ts.mjs +26 -0
- package/package.json +39 -21
- package/registry-entry.json +43 -0
- package/scripts/vendor-tesseract-linux.mjs +177 -0
- package/build.config.ts +0 -89
- package/dist/workers/florence2-worker.js +0 -779
- package/dist/workers/florence2-worker.js.map +0 -13
|
@@ -1,779 +0,0 @@
|
|
|
1
|
-
var __create = Object.create;
|
|
2
|
-
var __getProtoOf = Object.getPrototypeOf;
|
|
3
|
-
var __defProp = Object.defineProperty;
|
|
4
|
-
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
-
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
-
function __accessProp(key) {
|
|
7
|
-
return this[key];
|
|
8
|
-
}
|
|
9
|
-
var __toESMCache_node;
|
|
10
|
-
var __toESMCache_esm;
|
|
11
|
-
var __toESM = (mod, isNodeMode, target) => {
|
|
12
|
-
var canCache = mod != null && typeof mod === "object";
|
|
13
|
-
if (canCache) {
|
|
14
|
-
var cache = isNodeMode ? __toESMCache_node ??= new WeakMap : __toESMCache_esm ??= new WeakMap;
|
|
15
|
-
var cached = cache.get(mod);
|
|
16
|
-
if (cached)
|
|
17
|
-
return cached;
|
|
18
|
-
}
|
|
19
|
-
target = mod != null ? __create(__getProtoOf(mod)) : {};
|
|
20
|
-
const to = isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target;
|
|
21
|
-
for (let key of __getOwnPropNames(mod))
|
|
22
|
-
if (!__hasOwnProp.call(to, key))
|
|
23
|
-
__defProp(to, key, {
|
|
24
|
-
get: __accessProp.bind(mod, key),
|
|
25
|
-
enumerable: true
|
|
26
|
-
});
|
|
27
|
-
if (canCache)
|
|
28
|
-
cache.set(mod, to);
|
|
29
|
-
return to;
|
|
30
|
-
};
|
|
31
|
-
var __commonJS = (cb, mod) => () => (mod || cb((mod = { exports: {} }).exports, mod), mod.exports);
|
|
32
|
-
|
|
33
|
-
// src/workers/florence2-worker.ts
|
|
34
|
-
var import_node_worker_threads2 = require("node:worker_threads");
|
|
35
|
-
var import_sharp2 = __toESM(require("sharp"));
|
|
36
|
-
|
|
37
|
-
// src/florence2-model.ts
|
|
38
|
-
var import_core2 = require("@elizaos/core");
|
|
39
|
-
|
|
40
|
-
// src/florence2-local.ts
|
|
41
|
-
var import_core = require("@elizaos/core");
|
|
42
|
-
var import_sharp = __toESM(require("sharp"));
|
|
43
|
-
var tf = null;
|
|
44
|
-
async function ensureTf() {
|
|
45
|
-
if (tf)
|
|
46
|
-
return true;
|
|
47
|
-
try {
|
|
48
|
-
tf = await import("@tensorflow/tfjs-node");
|
|
49
|
-
return true;
|
|
50
|
-
} catch {
|
|
51
|
-
import_core.logger.warn("[Florence2Local] TensorFlow.js native addon not available — using heuristic analysis.");
|
|
52
|
-
return false;
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
class Florence2Local {
|
|
57
|
-
model = null;
|
|
58
|
-
initialized = false;
|
|
59
|
-
config;
|
|
60
|
-
constructor(config) {
|
|
61
|
-
this.config = {
|
|
62
|
-
modelUrl: config?.modelUrl || "https://tfhub.dev/google/tfjs-model/imagenet/mobilenet_v3_small_100_224/feature_vector/5/default/1",
|
|
63
|
-
cacheDir: config?.cacheDir || "./models/cache"
|
|
64
|
-
};
|
|
65
|
-
}
|
|
66
|
-
async initialize() {
|
|
67
|
-
if (this.initialized) {
|
|
68
|
-
return;
|
|
69
|
-
}
|
|
70
|
-
const tfAvailable = await ensureTf();
|
|
71
|
-
if (!tfAvailable || !tf) {
|
|
72
|
-
this.initialized = true;
|
|
73
|
-
return;
|
|
74
|
-
}
|
|
75
|
-
try {
|
|
76
|
-
import_core.logger.info("[VisionModel] Initializing MobileNet model for image analysis...");
|
|
77
|
-
this.model = await tf.loadGraphModel(this.config.modelUrl);
|
|
78
|
-
this.initialized = true;
|
|
79
|
-
import_core.logger.info("[VisionModel] Model initialized successfully");
|
|
80
|
-
} catch (error) {
|
|
81
|
-
import_core.logger.error("[VisionModel] Failed to initialize model:", error);
|
|
82
|
-
this.initialized = true;
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
async analyzeImage(imageBuffer) {
|
|
86
|
-
if (!this.initialized) {
|
|
87
|
-
await this.initialize();
|
|
88
|
-
}
|
|
89
|
-
if (!tf || !this.model) {
|
|
90
|
-
return await this.enhancedFallback(imageBuffer);
|
|
91
|
-
}
|
|
92
|
-
try {
|
|
93
|
-
const preprocessed = await this.preprocessImage(imageBuffer);
|
|
94
|
-
const predictions = await this.runInference(preprocessed);
|
|
95
|
-
preprocessed.dispose?.();
|
|
96
|
-
return this.parseModelOutput(predictions);
|
|
97
|
-
} catch (error) {
|
|
98
|
-
import_core.logger.error("[VisionModel] Analysis failed:", error);
|
|
99
|
-
return await this.enhancedFallback(imageBuffer);
|
|
100
|
-
}
|
|
101
|
-
}
|
|
102
|
-
async preprocessImage(imageBuffer) {
|
|
103
|
-
if (!tf)
|
|
104
|
-
throw new Error("TensorFlow.js not available");
|
|
105
|
-
const resized = await import_sharp.default(imageBuffer).resize(224, 224).raw().toBuffer();
|
|
106
|
-
const tensor = tf.node.decodeImage(resized, 3);
|
|
107
|
-
const normalized = tf.div(tensor, 255);
|
|
108
|
-
return normalized;
|
|
109
|
-
}
|
|
110
|
-
async runInference(input) {
|
|
111
|
-
if (!this.model) {
|
|
112
|
-
throw new Error("Model not loaded");
|
|
113
|
-
}
|
|
114
|
-
const batched = input.expandDims(0);
|
|
115
|
-
const output = this.model.predict(batched);
|
|
116
|
-
batched.dispose();
|
|
117
|
-
return output;
|
|
118
|
-
}
|
|
119
|
-
async parseModelOutput(predictions) {
|
|
120
|
-
const values = await predictions.array();
|
|
121
|
-
predictions.dispose();
|
|
122
|
-
const caption = this.generateCaptionFromFeatures(values);
|
|
123
|
-
return {
|
|
124
|
-
caption,
|
|
125
|
-
objects: [],
|
|
126
|
-
regions: [],
|
|
127
|
-
tags: this.extractTagsFromCaption(caption)
|
|
128
|
-
};
|
|
129
|
-
}
|
|
130
|
-
generateCaptionFromFeatures(features) {
|
|
131
|
-
const scenes = [
|
|
132
|
-
"Indoor scene with various objects visible",
|
|
133
|
-
"Person in a room with furniture",
|
|
134
|
-
"Computer workspace with monitor and desk",
|
|
135
|
-
"Living space with natural lighting",
|
|
136
|
-
"Office environment with equipment"
|
|
137
|
-
];
|
|
138
|
-
const index = Math.abs(features[0][0]) * scenes.length;
|
|
139
|
-
return scenes[Math.floor(index) % scenes.length];
|
|
140
|
-
}
|
|
141
|
-
extractTagsFromCaption(caption) {
|
|
142
|
-
const words = caption.toLowerCase().split(/\s+/);
|
|
143
|
-
const validTags = [
|
|
144
|
-
"indoor",
|
|
145
|
-
"outdoor",
|
|
146
|
-
"person",
|
|
147
|
-
"computer",
|
|
148
|
-
"desk",
|
|
149
|
-
"office",
|
|
150
|
-
"room",
|
|
151
|
-
"furniture",
|
|
152
|
-
"monitor",
|
|
153
|
-
"workspace"
|
|
154
|
-
];
|
|
155
|
-
return words.filter((word) => validTags.includes(word));
|
|
156
|
-
}
|
|
157
|
-
async enhancedFallback(imageBuffer) {
|
|
158
|
-
const metadata = await import_sharp.default(imageBuffer).metadata();
|
|
159
|
-
const stats = await import_sharp.default(imageBuffer).stats();
|
|
160
|
-
const brightness = (stats.channels[0].mean + stats.channels[1].mean + stats.channels[2].mean) / 3;
|
|
161
|
-
const isIndoor = brightness < 180;
|
|
162
|
-
let caption = isIndoor ? "Indoor scene" : "Outdoor scene";
|
|
163
|
-
if (metadata.width && metadata.height) {
|
|
164
|
-
const aspectRatio = metadata.width / metadata.height;
|
|
165
|
-
if (aspectRatio > 1.5) {
|
|
166
|
-
caption += " with wide field of view";
|
|
167
|
-
} else if (aspectRatio < 0.7) {
|
|
168
|
-
caption += " in portrait orientation";
|
|
169
|
-
}
|
|
170
|
-
}
|
|
171
|
-
const dominantColor = stats.dominant;
|
|
172
|
-
if (dominantColor.r > 200 && dominantColor.g > 200 && dominantColor.b > 200) {
|
|
173
|
-
caption += ", well-lit environment";
|
|
174
|
-
} else if (dominantColor.r < 100 && dominantColor.g < 100 && dominantColor.b < 100) {
|
|
175
|
-
caption += ", dimly lit conditions";
|
|
176
|
-
}
|
|
177
|
-
return {
|
|
178
|
-
caption,
|
|
179
|
-
objects: [],
|
|
180
|
-
regions: [],
|
|
181
|
-
tags: this.extractTagsFromCaption(caption)
|
|
182
|
-
};
|
|
183
|
-
}
|
|
184
|
-
isInitialized() {
|
|
185
|
-
return this.initialized;
|
|
186
|
-
}
|
|
187
|
-
async dispose() {
|
|
188
|
-
if (this.model) {
|
|
189
|
-
this.model.dispose();
|
|
190
|
-
this.model = null;
|
|
191
|
-
}
|
|
192
|
-
this.initialized = false;
|
|
193
|
-
import_core.logger.info("[VisionModel] Model disposed");
|
|
194
|
-
}
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
// src/florence2-model.ts
|
|
198
|
-
class Florence2Model {
|
|
199
|
-
initialized = false;
|
|
200
|
-
localModel;
|
|
201
|
-
constructor() {
|
|
202
|
-
this.localModel = new Florence2Local;
|
|
203
|
-
}
|
|
204
|
-
async initialize() {
|
|
205
|
-
if (this.initialized) {
|
|
206
|
-
return;
|
|
207
|
-
}
|
|
208
|
-
try {
|
|
209
|
-
import_core2.logger.info("[Florence2] Initializing local Florence-2 model with TensorFlow.js...");
|
|
210
|
-
await this.localModel.initialize();
|
|
211
|
-
this.initialized = true;
|
|
212
|
-
import_core2.logger.info("[Florence2] Local model initialized successfully");
|
|
213
|
-
} catch (error) {
|
|
214
|
-
import_core2.logger.error("[Florence2] Failed to initialize local model:", error);
|
|
215
|
-
this.initialized = true;
|
|
216
|
-
import_core2.logger.warn("[Florence2] Running with enhanced fallback mode");
|
|
217
|
-
}
|
|
218
|
-
}
|
|
219
|
-
async analyzeTile(tile) {
|
|
220
|
-
if (!this.initialized) {
|
|
221
|
-
await this.initialize();
|
|
222
|
-
}
|
|
223
|
-
if (!tile.data) {
|
|
224
|
-
throw new Error("Tile has no image data");
|
|
225
|
-
}
|
|
226
|
-
try {
|
|
227
|
-
try {
|
|
228
|
-
const result2 = await this.localModel.analyzeImage(tile.data);
|
|
229
|
-
import_core2.logger.debug(`[Florence2] Analyzed tile ${tile.id}: ${result2.caption}`);
|
|
230
|
-
return result2;
|
|
231
|
-
} catch (_modelError) {
|
|
232
|
-
import_core2.logger.warn("[Florence2] Local model analysis failed, falling back:", _modelError);
|
|
233
|
-
}
|
|
234
|
-
const result = await this.mockAnalyze(tile);
|
|
235
|
-
import_core2.logger.debug(`[Florence2] Mock analyzed tile ${tile.id}: ${result.caption}`);
|
|
236
|
-
return result;
|
|
237
|
-
} catch (error) {
|
|
238
|
-
import_core2.logger.error("[Florence2] Analysis failed:", error);
|
|
239
|
-
throw error;
|
|
240
|
-
}
|
|
241
|
-
}
|
|
242
|
-
async analyzeImage(imageBuffer) {
|
|
243
|
-
if (!this.initialized) {
|
|
244
|
-
await this.initialize();
|
|
245
|
-
}
|
|
246
|
-
try {
|
|
247
|
-
try {
|
|
248
|
-
const result2 = await this.localModel.analyzeImage(imageBuffer);
|
|
249
|
-
import_core2.logger.debug(`[Florence2] Analyzed image: ${result2.caption}`);
|
|
250
|
-
return result2;
|
|
251
|
-
} catch (_modelError) {
|
|
252
|
-
import_core2.logger.warn("[Florence2] Local model analysis failed, falling back:", _modelError);
|
|
253
|
-
}
|
|
254
|
-
const result = await this.mockAnalyzeBuffer(imageBuffer);
|
|
255
|
-
import_core2.logger.debug(`[Florence2] Mock analyzed image: ${result.caption}`);
|
|
256
|
-
return result;
|
|
257
|
-
} catch (error) {
|
|
258
|
-
import_core2.logger.error("[Florence2] Image analysis failed:", error);
|
|
259
|
-
throw error;
|
|
260
|
-
}
|
|
261
|
-
}
|
|
262
|
-
async mockAnalyze(tile) {
|
|
263
|
-
const isUpperRegion = tile.row < 2;
|
|
264
|
-
const isLeftRegion = tile.col < 2;
|
|
265
|
-
let caption = "Desktop screen region";
|
|
266
|
-
const objects = [];
|
|
267
|
-
const regions = [];
|
|
268
|
-
const tags = [];
|
|
269
|
-
if (isUpperRegion) {
|
|
270
|
-
caption = "Application window with menu bar";
|
|
271
|
-
objects.push({
|
|
272
|
-
label: "window",
|
|
273
|
-
bbox: { x: 0, y: 0, width: tile.width, height: 50 },
|
|
274
|
-
confidence: 0.9
|
|
275
|
-
});
|
|
276
|
-
objects.push({
|
|
277
|
-
label: "menu_bar",
|
|
278
|
-
bbox: { x: 0, y: 0, width: tile.width, height: 30 },
|
|
279
|
-
confidence: 0.85
|
|
280
|
-
});
|
|
281
|
-
tags.push("ui", "application", "desktop");
|
|
282
|
-
}
|
|
283
|
-
if (isLeftRegion) {
|
|
284
|
-
caption = "Sidebar or navigation area";
|
|
285
|
-
objects.push({
|
|
286
|
-
label: "sidebar",
|
|
287
|
-
bbox: { x: 0, y: 0, width: 100, height: tile.height },
|
|
288
|
-
confidence: 0.8
|
|
289
|
-
});
|
|
290
|
-
tags.push("navigation", "sidebar");
|
|
291
|
-
}
|
|
292
|
-
const buttonCount = Math.floor(Math.random() * 3) + 1;
|
|
293
|
-
for (let i = 0;i < buttonCount; i++) {
|
|
294
|
-
objects.push({
|
|
295
|
-
label: "button",
|
|
296
|
-
bbox: {
|
|
297
|
-
x: Math.random() * (tile.width - 100),
|
|
298
|
-
y: Math.random() * (tile.height - 40),
|
|
299
|
-
width: 100,
|
|
300
|
-
height: 40
|
|
301
|
-
},
|
|
302
|
-
confidence: 0.7 + Math.random() * 0.2
|
|
303
|
-
});
|
|
304
|
-
}
|
|
305
|
-
const textRegions = Math.floor(Math.random() * 2) + 1;
|
|
306
|
-
for (let i = 0;i < textRegions; i++) {
|
|
307
|
-
regions.push({
|
|
308
|
-
description: "Text content area",
|
|
309
|
-
bbox: {
|
|
310
|
-
x: Math.random() * (tile.width - 200),
|
|
311
|
-
y: Math.random() * (tile.height - 100),
|
|
312
|
-
width: 200,
|
|
313
|
-
height: 100
|
|
314
|
-
}
|
|
315
|
-
});
|
|
316
|
-
}
|
|
317
|
-
tags.push("screen", "interface", "computer");
|
|
318
|
-
return {
|
|
319
|
-
caption,
|
|
320
|
-
objects,
|
|
321
|
-
regions,
|
|
322
|
-
tags
|
|
323
|
-
};
|
|
324
|
-
}
|
|
325
|
-
async detectUIElements(imageBuffer) {
|
|
326
|
-
if (!this.initialized) {
|
|
327
|
-
await this.initialize();
|
|
328
|
-
}
|
|
329
|
-
try {
|
|
330
|
-
let result;
|
|
331
|
-
try {
|
|
332
|
-
result = await this.localModel.analyzeImage(imageBuffer);
|
|
333
|
-
} catch (_modelError) {
|
|
334
|
-
import_core2.logger.warn("[Florence2] Local model failed for UI detection, using fallback");
|
|
335
|
-
result = await this.mockAnalyzeBuffer(imageBuffer);
|
|
336
|
-
}
|
|
337
|
-
return (result.objects || []).map((obj) => ({
|
|
338
|
-
type: this.mapToUIElementType(obj.label),
|
|
339
|
-
bbox: obj.bbox,
|
|
340
|
-
confidence: obj.confidence
|
|
341
|
-
}));
|
|
342
|
-
} catch (error) {
|
|
343
|
-
import_core2.logger.error("[Florence2] UI element detection failed:", error);
|
|
344
|
-
return [];
|
|
345
|
-
}
|
|
346
|
-
}
|
|
347
|
-
async mockAnalyzeBuffer(_imageBuffer) {
|
|
348
|
-
const scenarios = [
|
|
349
|
-
{
|
|
350
|
-
caption: "Indoor scene with a person in front of a computer",
|
|
351
|
-
objects: [
|
|
352
|
-
{
|
|
353
|
-
label: "person",
|
|
354
|
-
bbox: { x: 300, y: 200, width: 200, height: 300 },
|
|
355
|
-
confidence: 0.9
|
|
356
|
-
},
|
|
357
|
-
{
|
|
358
|
-
label: "computer",
|
|
359
|
-
bbox: { x: 400, y: 350, width: 150, height: 100 },
|
|
360
|
-
confidence: 0.85
|
|
361
|
-
},
|
|
362
|
-
{
|
|
363
|
-
label: "desk",
|
|
364
|
-
bbox: { x: 350, y: 400, width: 250, height: 100 },
|
|
365
|
-
confidence: 0.8
|
|
366
|
-
}
|
|
367
|
-
],
|
|
368
|
-
tags: ["indoor", "office", "workspace", "person", "computer"]
|
|
369
|
-
},
|
|
370
|
-
{
|
|
371
|
-
caption: "Room interior with furniture and lighting",
|
|
372
|
-
objects: [
|
|
373
|
-
{
|
|
374
|
-
label: "chair",
|
|
375
|
-
bbox: { x: 200, y: 300, width: 100, height: 150 },
|
|
376
|
-
confidence: 0.85
|
|
377
|
-
},
|
|
378
|
-
{
|
|
379
|
-
label: "table",
|
|
380
|
-
bbox: { x: 350, y: 350, width: 150, height: 100 },
|
|
381
|
-
confidence: 0.8
|
|
382
|
-
},
|
|
383
|
-
{
|
|
384
|
-
label: "lamp",
|
|
385
|
-
bbox: { x: 500, y: 200, width: 50, height: 100 },
|
|
386
|
-
confidence: 0.75
|
|
387
|
-
}
|
|
388
|
-
],
|
|
389
|
-
tags: ["indoor", "room", "furniture", "interior"]
|
|
390
|
-
},
|
|
391
|
-
{
|
|
392
|
-
caption: "Person working at a desk with computer monitor",
|
|
393
|
-
objects: [
|
|
394
|
-
{
|
|
395
|
-
label: "person",
|
|
396
|
-
bbox: { x: 250, y: 150, width: 250, height: 350 },
|
|
397
|
-
confidence: 0.92
|
|
398
|
-
},
|
|
399
|
-
{
|
|
400
|
-
label: "monitor",
|
|
401
|
-
bbox: { x: 450, y: 300, width: 120, height: 80 },
|
|
402
|
-
confidence: 0.88
|
|
403
|
-
},
|
|
404
|
-
{
|
|
405
|
-
label: "keyboard",
|
|
406
|
-
bbox: { x: 430, y: 380, width: 100, height: 30 },
|
|
407
|
-
confidence: 0.82
|
|
408
|
-
}
|
|
409
|
-
],
|
|
410
|
-
tags: ["person", "working", "computer", "desk", "office"]
|
|
411
|
-
}
|
|
412
|
-
];
|
|
413
|
-
const scenario = scenarios[Math.floor(Math.random() * scenarios.length)];
|
|
414
|
-
return {
|
|
415
|
-
caption: scenario.caption,
|
|
416
|
-
objects: scenario.objects,
|
|
417
|
-
regions: [],
|
|
418
|
-
tags: scenario.tags
|
|
419
|
-
};
|
|
420
|
-
}
|
|
421
|
-
mapToUIElementType(label) {
|
|
422
|
-
const mapping = {
|
|
423
|
-
button: "button",
|
|
424
|
-
text_field: "input",
|
|
425
|
-
text_area: "textarea",
|
|
426
|
-
checkbox: "checkbox",
|
|
427
|
-
radio_button: "radio",
|
|
428
|
-
dropdown: "select",
|
|
429
|
-
menu: "menu",
|
|
430
|
-
menu_bar: "menubar",
|
|
431
|
-
toolbar: "toolbar",
|
|
432
|
-
window: "window",
|
|
433
|
-
dialog: "dialog",
|
|
434
|
-
icon: "icon",
|
|
435
|
-
image: "image",
|
|
436
|
-
video: "video",
|
|
437
|
-
link: "link",
|
|
438
|
-
heading: "heading",
|
|
439
|
-
paragraph: "text",
|
|
440
|
-
list: "list",
|
|
441
|
-
table: "table",
|
|
442
|
-
scrollbar: "scrollbar",
|
|
443
|
-
tab: "tab",
|
|
444
|
-
panel: "panel"
|
|
445
|
-
};
|
|
446
|
-
return mapping[label.toLowerCase()] || "unknown";
|
|
447
|
-
}
|
|
448
|
-
async generateSceneGraph(tiles) {
|
|
449
|
-
const nodes = [];
|
|
450
|
-
const edges = [];
|
|
451
|
-
for (const tile of tiles) {
|
|
452
|
-
if (!tile.data) {
|
|
453
|
-
continue;
|
|
454
|
-
}
|
|
455
|
-
const analysis = await this.analyzeTile(tile);
|
|
456
|
-
if (analysis.objects) {
|
|
457
|
-
for (const obj of analysis.objects) {
|
|
458
|
-
const nodeId = `${tile.id}-${obj.label}-${nodes.length}`;
|
|
459
|
-
nodes.push({
|
|
460
|
-
id: nodeId,
|
|
461
|
-
type: obj.label,
|
|
462
|
-
label: obj.label,
|
|
463
|
-
position: {
|
|
464
|
-
x: tile.x + obj.bbox.x,
|
|
465
|
-
y: tile.y + obj.bbox.y,
|
|
466
|
-
width: obj.bbox.width,
|
|
467
|
-
height: obj.bbox.height
|
|
468
|
-
}
|
|
469
|
-
});
|
|
470
|
-
}
|
|
471
|
-
}
|
|
472
|
-
}
|
|
473
|
-
for (let i = 0;i < nodes.length; i++) {
|
|
474
|
-
for (let j = i + 1;j < nodes.length; j++) {
|
|
475
|
-
const relation = this.inferSpatialRelation(nodes[i].position, nodes[j].position);
|
|
476
|
-
if (relation) {
|
|
477
|
-
edges.push({
|
|
478
|
-
source: nodes[i].id,
|
|
479
|
-
target: nodes[j].id,
|
|
480
|
-
relation
|
|
481
|
-
});
|
|
482
|
-
}
|
|
483
|
-
}
|
|
484
|
-
}
|
|
485
|
-
return { nodes, edges };
|
|
486
|
-
}
|
|
487
|
-
inferSpatialRelation(box1, box2) {
|
|
488
|
-
const center1 = {
|
|
489
|
-
x: box1.x + box1.width / 2,
|
|
490
|
-
y: box1.y + box1.height / 2
|
|
491
|
-
};
|
|
492
|
-
const center2 = {
|
|
493
|
-
x: box2.x + box2.width / 2,
|
|
494
|
-
y: box2.y + box2.height / 2
|
|
495
|
-
};
|
|
496
|
-
if (this.contains(box1, box2)) {
|
|
497
|
-
return "contains";
|
|
498
|
-
}
|
|
499
|
-
if (this.contains(box2, box1)) {
|
|
500
|
-
return "contained_by";
|
|
501
|
-
}
|
|
502
|
-
if (this.overlaps(box1, box2)) {
|
|
503
|
-
return "overlaps";
|
|
504
|
-
}
|
|
505
|
-
const dx = center2.x - center1.x;
|
|
506
|
-
const dy = center2.y - center1.y;
|
|
507
|
-
const distance = Math.sqrt(dx * dx + dy * dy);
|
|
508
|
-
if (distance < 100) {
|
|
509
|
-
if (Math.abs(dx) > Math.abs(dy)) {
|
|
510
|
-
return dx > 0 ? "right_of" : "left_of";
|
|
511
|
-
} else {
|
|
512
|
-
return dy > 0 ? "below" : "above";
|
|
513
|
-
}
|
|
514
|
-
}
|
|
515
|
-
return null;
|
|
516
|
-
}
|
|
517
|
-
contains(box1, box2) {
|
|
518
|
-
return box1.x <= box2.x && box1.y <= box2.y && box1.x + box1.width >= box2.x + box2.width && box1.y + box1.height >= box2.y + box2.height;
|
|
519
|
-
}
|
|
520
|
-
overlaps(box1, box2) {
|
|
521
|
-
return !(box1.x + box1.width < box2.x || box2.x + box2.width < box1.x || box1.y + box1.height < box2.y || box2.y + box2.height < box1.y);
|
|
522
|
-
}
|
|
523
|
-
isInitialized() {
|
|
524
|
-
return this.initialized;
|
|
525
|
-
}
|
|
526
|
-
async dispose() {
|
|
527
|
-
this.initialized = false;
|
|
528
|
-
import_core2.logger.info("[Florence2] Model disposed");
|
|
529
|
-
}
|
|
530
|
-
}
|
|
531
|
-
|
|
532
|
-
// src/workers/worker-logger.ts
|
|
533
|
-
var import_node_worker_threads = require("node:worker_threads");
|
|
534
|
-
var logger3 = {
|
|
535
|
-
info: (message, ...args) => {
|
|
536
|
-
const logMessage = {
|
|
537
|
-
type: "log",
|
|
538
|
-
level: "info",
|
|
539
|
-
message,
|
|
540
|
-
args,
|
|
541
|
-
timestamp: new Date().toISOString()
|
|
542
|
-
};
|
|
543
|
-
if (import_node_worker_threads.parentPort) {
|
|
544
|
-
import_node_worker_threads.parentPort.postMessage(logMessage);
|
|
545
|
-
} else {
|
|
546
|
-
console.log(`[INFO] ${message}`, ...args);
|
|
547
|
-
}
|
|
548
|
-
},
|
|
549
|
-
warn: (message, ...args) => {
|
|
550
|
-
const logMessage = {
|
|
551
|
-
type: "log",
|
|
552
|
-
level: "warn",
|
|
553
|
-
message,
|
|
554
|
-
args,
|
|
555
|
-
timestamp: new Date().toISOString()
|
|
556
|
-
};
|
|
557
|
-
if (import_node_worker_threads.parentPort) {
|
|
558
|
-
import_node_worker_threads.parentPort.postMessage(logMessage);
|
|
559
|
-
} else {
|
|
560
|
-
console.warn(`[WARN] ${message}`, ...args);
|
|
561
|
-
}
|
|
562
|
-
},
|
|
563
|
-
error: (message, ...args) => {
|
|
564
|
-
const logMessage = {
|
|
565
|
-
type: "log",
|
|
566
|
-
level: "error",
|
|
567
|
-
message,
|
|
568
|
-
args,
|
|
569
|
-
timestamp: new Date().toISOString()
|
|
570
|
-
};
|
|
571
|
-
if (import_node_worker_threads.parentPort) {
|
|
572
|
-
import_node_worker_threads.parentPort.postMessage(logMessage);
|
|
573
|
-
} else {
|
|
574
|
-
console.error(`[ERROR] ${message}`, ...args);
|
|
575
|
-
}
|
|
576
|
-
},
|
|
577
|
-
debug: (message, ...args) => {
|
|
578
|
-
const logMessage = {
|
|
579
|
-
type: "log",
|
|
580
|
-
level: "debug",
|
|
581
|
-
message,
|
|
582
|
-
args,
|
|
583
|
-
timestamp: new Date().toISOString()
|
|
584
|
-
};
|
|
585
|
-
if (import_node_worker_threads.parentPort) {
|
|
586
|
-
import_node_worker_threads.parentPort.postMessage(logMessage);
|
|
587
|
-
} else {
|
|
588
|
-
console.debug(`[DEBUG] ${message}`, ...args);
|
|
589
|
-
}
|
|
590
|
-
}
|
|
591
|
-
};
|
|
592
|
-
|
|
593
|
-
// src/workers/florence2-worker.ts
|
|
594
|
-
class Florence2Worker {
|
|
595
|
-
config;
|
|
596
|
-
dataView;
|
|
597
|
-
atomicState;
|
|
598
|
-
resultsView;
|
|
599
|
-
florence2;
|
|
600
|
-
isRunning = true;
|
|
601
|
-
frameCount = 0;
|
|
602
|
-
lastFPSReport = Date.now();
|
|
603
|
-
lastFrameId = -1;
|
|
604
|
-
FRAME_ID_INDEX = 0;
|
|
605
|
-
WIDTH_INDEX = 2;
|
|
606
|
-
HEIGHT_INDEX = 3;
|
|
607
|
-
DISPLAY_INDEX = 4;
|
|
608
|
-
TIMESTAMP_INDEX = 5;
|
|
609
|
-
DATA_OFFSET = 24;
|
|
610
|
-
RESULTS_HEADER_SIZE = 16;
|
|
611
|
-
MAX_RESULT_SIZE = 4096;
|
|
612
|
-
constructor(config, sharedBuffer, resultsBuffer) {
|
|
613
|
-
this.config = config;
|
|
614
|
-
this.dataView = new DataView(sharedBuffer);
|
|
615
|
-
this.atomicState = new Int32Array(sharedBuffer, 0, 6);
|
|
616
|
-
this.resultsView = new DataView(resultsBuffer);
|
|
617
|
-
this.florence2 = new Florence2Model;
|
|
618
|
-
}
|
|
619
|
-
async initialize() {
|
|
620
|
-
await this.florence2.initialize();
|
|
621
|
-
logger3.info("[Florence2Worker] Initialized and ready");
|
|
622
|
-
}
|
|
623
|
-
async run() {
|
|
624
|
-
await this.initialize();
|
|
625
|
-
logger3.info("[Florence2Worker] Starting analysis loop...");
|
|
626
|
-
while (this.isRunning) {
|
|
627
|
-
try {
|
|
628
|
-
const currentFrameId = Atomics.load(this.atomicState, this.FRAME_ID_INDEX);
|
|
629
|
-
if (currentFrameId > this.lastFrameId) {
|
|
630
|
-
await this.processFrame();
|
|
631
|
-
this.lastFrameId = currentFrameId;
|
|
632
|
-
this.frameCount++;
|
|
633
|
-
const now = Date.now();
|
|
634
|
-
if (now - this.lastFPSReport >= 1000) {
|
|
635
|
-
const fps = this.frameCount / ((now - this.lastFPSReport) / 1000);
|
|
636
|
-
logger3.info(`[Florence2Worker] Analysis FPS: ${fps.toFixed(2)}`);
|
|
637
|
-
import_node_worker_threads2.parentPort?.postMessage({
|
|
638
|
-
type: "fps",
|
|
639
|
-
fps,
|
|
640
|
-
frameCount: this.frameCount
|
|
641
|
-
});
|
|
642
|
-
this.frameCount = 0;
|
|
643
|
-
this.lastFPSReport = now;
|
|
644
|
-
}
|
|
645
|
-
} else {
|
|
646
|
-
await new Promise((resolve) => setImmediate(resolve));
|
|
647
|
-
}
|
|
648
|
-
} catch (error) {
|
|
649
|
-
logger3.error("[Florence2Worker] Processing error:", error);
|
|
650
|
-
await new Promise((resolve) => setTimeout(resolve, 100));
|
|
651
|
-
}
|
|
652
|
-
}
|
|
653
|
-
}
|
|
654
|
-
async processFrame() {
|
|
655
|
-
const metadata = {
|
|
656
|
-
frameId: Atomics.load(this.atomicState, this.FRAME_ID_INDEX),
|
|
657
|
-
width: Atomics.load(this.atomicState, this.WIDTH_INDEX),
|
|
658
|
-
height: Atomics.load(this.atomicState, this.HEIGHT_INDEX),
|
|
659
|
-
displayIndex: Atomics.load(this.atomicState, this.DISPLAY_INDEX),
|
|
660
|
-
timestamp: Atomics.load(this.atomicState, this.TIMESTAMP_INDEX)
|
|
661
|
-
};
|
|
662
|
-
const tiles = this.calculateTiles(metadata.width, metadata.height);
|
|
663
|
-
const tilesToProcess = this.config.priorityTiles ? this.config.priorityTiles.map((i) => tiles[i]).filter(Boolean) : tiles;
|
|
664
|
-
for (let i = 0;i < tilesToProcess.length; i++) {
|
|
665
|
-
const tile = tilesToProcess[i];
|
|
666
|
-
if (!tile) {
|
|
667
|
-
continue;
|
|
668
|
-
}
|
|
669
|
-
try {
|
|
670
|
-
const tileBuffer = await this.extractTileFromSharedBuffer(tile, metadata);
|
|
671
|
-
const result = await this.florence2.analyzeTile({
|
|
672
|
-
...tile,
|
|
673
|
-
data: tileBuffer
|
|
674
|
-
});
|
|
675
|
-
await this.writeResultToBuffer(tile.id, result, metadata.frameId);
|
|
676
|
-
import_node_worker_threads2.parentPort?.postMessage({
|
|
677
|
-
type: "tile_analyzed",
|
|
678
|
-
tileId: tile.id,
|
|
679
|
-
frameId: metadata.frameId,
|
|
680
|
-
displayIndex: metadata.displayIndex,
|
|
681
|
-
hasObjects: (result.objects?.length || 0) > 0,
|
|
682
|
-
caption: result.caption
|
|
683
|
-
});
|
|
684
|
-
} catch (error) {
|
|
685
|
-
logger3.error(`[Florence2Worker] Failed to analyze tile ${tile.id}:`, error);
|
|
686
|
-
}
|
|
687
|
-
}
|
|
688
|
-
}
|
|
689
|
-
calculateTiles(width, height) {
|
|
690
|
-
const tileSize = this.config.tileSize;
|
|
691
|
-
const tiles = [];
|
|
692
|
-
for (let row = 0;row < Math.ceil(height / tileSize); row++) {
|
|
693
|
-
for (let col = 0;col < Math.ceil(width / tileSize); col++) {
|
|
694
|
-
const x = col * tileSize;
|
|
695
|
-
const y = row * tileSize;
|
|
696
|
-
const tileWidth = Math.min(tileSize, width - x);
|
|
697
|
-
const tileHeight = Math.min(tileSize, height - y);
|
|
698
|
-
tiles.push({
|
|
699
|
-
id: `tile-${row}-${col}`,
|
|
700
|
-
row,
|
|
701
|
-
col,
|
|
702
|
-
x,
|
|
703
|
-
y,
|
|
704
|
-
width: tileWidth,
|
|
705
|
-
height: tileHeight
|
|
706
|
-
});
|
|
707
|
-
}
|
|
708
|
-
}
|
|
709
|
-
return tiles;
|
|
710
|
-
}
|
|
711
|
-
async extractTileFromSharedBuffer(tile, metadata) {
|
|
712
|
-
const bytesPerPixel = 4;
|
|
713
|
-
const rowStride = metadata.width * bytesPerPixel;
|
|
714
|
-
const tileData = Buffer.allocUnsafe(tile.width * tile.height * bytesPerPixel);
|
|
715
|
-
for (let row = 0;row < tile.height; row++) {
|
|
716
|
-
const sourceY = tile.y + row;
|
|
717
|
-
const sourceOffset = this.DATA_OFFSET + sourceY * rowStride + tile.x * bytesPerPixel;
|
|
718
|
-
const destOffset = row * tile.width * bytesPerPixel;
|
|
719
|
-
for (let i = 0;i < tile.width * bytesPerPixel; i++) {
|
|
720
|
-
tileData[destOffset + i] = this.dataView.getUint8(sourceOffset + i);
|
|
721
|
-
}
|
|
722
|
-
}
|
|
723
|
-
const pngBuffer = await import_sharp2.default(tileData, {
|
|
724
|
-
raw: {
|
|
725
|
-
width: tile.width,
|
|
726
|
-
height: tile.height,
|
|
727
|
-
channels: 4
|
|
728
|
-
}
|
|
729
|
-
}).png().toBuffer();
|
|
730
|
-
return pngBuffer;
|
|
731
|
-
}
|
|
732
|
-
async writeResultToBuffer(tileId, result, frameId) {
|
|
733
|
-
const resultJson = JSON.stringify({
|
|
734
|
-
tileId,
|
|
735
|
-
frameId,
|
|
736
|
-
timestamp: Date.now(),
|
|
737
|
-
...result
|
|
738
|
-
});
|
|
739
|
-
const resultBytes = Buffer.from(resultJson, "utf-8");
|
|
740
|
-
const match = tileId.match(/tile-(\d+)-(\d+)/);
|
|
741
|
-
if (!match) {
|
|
742
|
-
return;
|
|
743
|
-
}
|
|
744
|
-
const row = parseInt(match[1], 10);
|
|
745
|
-
const col = parseInt(match[2], 10);
|
|
746
|
-
const tileIndex = row * 10 + col;
|
|
747
|
-
const offset = this.RESULTS_HEADER_SIZE + tileIndex * this.MAX_RESULT_SIZE;
|
|
748
|
-
this.resultsView.setUint32(offset, resultBytes.length, true);
|
|
749
|
-
for (let i = 0;i < Math.min(resultBytes.length, this.MAX_RESULT_SIZE - 4); i++) {
|
|
750
|
-
this.resultsView.setUint8(offset + 4 + i, resultBytes[i]);
|
|
751
|
-
}
|
|
752
|
-
}
|
|
753
|
-
stop() {
|
|
754
|
-
this.isRunning = false;
|
|
755
|
-
}
|
|
756
|
-
async dispose() {
|
|
757
|
-
await this.florence2.dispose();
|
|
758
|
-
}
|
|
759
|
-
}
|
|
760
|
-
if (import_node_worker_threads2.parentPort) {
|
|
761
|
-
const { config, sharedBuffer, resultsBuffer } = import_node_worker_threads2.workerData;
|
|
762
|
-
const worker = new Florence2Worker(config, sharedBuffer, resultsBuffer);
|
|
763
|
-
import_node_worker_threads2.parentPort.on("message", (msg) => {
|
|
764
|
-
if (msg.type === "stop") {
|
|
765
|
-
worker.stop();
|
|
766
|
-
worker.dispose().then(() => {
|
|
767
|
-
import_node_worker_threads2.parentPort?.postMessage({ type: "stopped" });
|
|
768
|
-
});
|
|
769
|
-
}
|
|
770
|
-
});
|
|
771
|
-
worker.run().catch((error) => {
|
|
772
|
-
logger3.error("[Florence2Worker] Fatal error:", error);
|
|
773
|
-
import_node_worker_threads2.parentPort?.postMessage({ type: "error", error: error.message });
|
|
774
|
-
process.exit(1);
|
|
775
|
-
});
|
|
776
|
-
}
|
|
777
|
-
|
|
778
|
-
//# debugId=759CBCF921774D7464756E2164756E21
|
|
779
|
-
//# sourceMappingURL=florence2-worker.js.map
|