@elizaos/plugin-vision 2.0.0-alpha.3 → 2.0.0-alpha.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +91 -35
- package/dist/index.js.map +12 -12
- package/package.json +11 -7
- package/dist/workers/florence2-worker.js +0 -123036
- package/dist/workers/florence2-worker.js.map +0 -92
- package/dist/workers/ocr-worker.js +0 -130444
- package/dist/workers/ocr-worker.js.map +0 -137
- package/dist/workers/screen-capture-worker.js +0 -359
- package/dist/workers/screen-capture-worker.js.map +0 -11
package/dist/index.js
CHANGED
|
@@ -1,4 +1,20 @@
|
|
|
1
1
|
import { createRequire } from "node:module";
|
|
2
|
+
var __create = Object.create;
|
|
3
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
4
|
+
var __defProp = Object.defineProperty;
|
|
5
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
7
|
+
var __toESM = (mod, isNodeMode, target) => {
|
|
8
|
+
target = mod != null ? __create(__getProtoOf(mod)) : {};
|
|
9
|
+
const to = isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target;
|
|
10
|
+
for (let key of __getOwnPropNames(mod))
|
|
11
|
+
if (!__hasOwnProp.call(to, key))
|
|
12
|
+
__defProp(to, key, {
|
|
13
|
+
get: () => mod[key],
|
|
14
|
+
enumerable: true
|
|
15
|
+
});
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
2
18
|
var __require = /* @__PURE__ */ createRequire(import.meta.url);
|
|
3
19
|
|
|
4
20
|
// src/action.ts
|
|
@@ -2264,8 +2280,19 @@ import { logger as logger8 } from "@elizaos/core";
|
|
|
2264
2280
|
|
|
2265
2281
|
// src/florence2-local.ts
|
|
2266
2282
|
import { logger as logger7 } from "@elizaos/core";
|
|
2267
|
-
import * as tf from "@tensorflow/tfjs-node";
|
|
2268
2283
|
import sharp from "sharp";
|
|
2284
|
+
var tf = null;
|
|
2285
|
+
async function ensureTf() {
|
|
2286
|
+
if (tf)
|
|
2287
|
+
return true;
|
|
2288
|
+
try {
|
|
2289
|
+
tf = await import("@tensorflow/tfjs-node");
|
|
2290
|
+
return true;
|
|
2291
|
+
} catch {
|
|
2292
|
+
logger7.warn("[Florence2Local] TensorFlow.js native addon not available — using heuristic analysis.");
|
|
2293
|
+
return false;
|
|
2294
|
+
}
|
|
2295
|
+
}
|
|
2269
2296
|
|
|
2270
2297
|
class Florence2Local {
|
|
2271
2298
|
model = null;
|
|
@@ -2281,6 +2308,11 @@ class Florence2Local {
|
|
|
2281
2308
|
if (this.initialized) {
|
|
2282
2309
|
return;
|
|
2283
2310
|
}
|
|
2311
|
+
const tfAvailable = await ensureTf();
|
|
2312
|
+
if (!tfAvailable || !tf) {
|
|
2313
|
+
this.initialized = true;
|
|
2314
|
+
return;
|
|
2315
|
+
}
|
|
2284
2316
|
try {
|
|
2285
2317
|
logger7.info("[VisionModel] Initializing MobileNet model for image analysis...");
|
|
2286
2318
|
this.model = await tf.loadGraphModel(this.config.modelUrl);
|
|
@@ -2295,22 +2327,22 @@ class Florence2Local {
|
|
|
2295
2327
|
if (!this.initialized) {
|
|
2296
2328
|
await this.initialize();
|
|
2297
2329
|
}
|
|
2330
|
+
if (!tf || !this.model) {
|
|
2331
|
+
return await this.enhancedFallback(imageBuffer);
|
|
2332
|
+
}
|
|
2298
2333
|
try {
|
|
2299
2334
|
const preprocessed = await this.preprocessImage(imageBuffer);
|
|
2300
|
-
|
|
2301
|
-
|
|
2302
|
-
|
|
2303
|
-
return this.parseModelOutput(predictions);
|
|
2304
|
-
} else {
|
|
2305
|
-
preprocessed.dispose();
|
|
2306
|
-
return await this.enhancedFallback(imageBuffer);
|
|
2307
|
-
}
|
|
2335
|
+
const predictions = await this.runInference(preprocessed);
|
|
2336
|
+
preprocessed.dispose?.();
|
|
2337
|
+
return this.parseModelOutput(predictions);
|
|
2308
2338
|
} catch (error) {
|
|
2309
2339
|
logger7.error("[VisionModel] Analysis failed:", error);
|
|
2310
2340
|
return await this.enhancedFallback(imageBuffer);
|
|
2311
2341
|
}
|
|
2312
2342
|
}
|
|
2313
2343
|
async preprocessImage(imageBuffer) {
|
|
2344
|
+
if (!tf)
|
|
2345
|
+
throw new Error("TensorFlow.js not available");
|
|
2314
2346
|
const resized = await sharp(imageBuffer).resize(224, 224).raw().toBuffer();
|
|
2315
2347
|
const tensor = tf.node.decodeImage(resized, 3);
|
|
2316
2348
|
const normalized = tf.div(tensor, 255);
|
|
@@ -3252,19 +3284,39 @@ class ScreenCaptureService {
|
|
|
3252
3284
|
|
|
3253
3285
|
// src/vision-models.ts
|
|
3254
3286
|
import { logger as logger12 } from "@elizaos/core";
|
|
3255
|
-
|
|
3256
|
-
|
|
3257
|
-
|
|
3287
|
+
var tf2 = null;
|
|
3288
|
+
var cocoSsd = null;
|
|
3289
|
+
var poseDetection = null;
|
|
3290
|
+
async function loadTfModules() {
|
|
3291
|
+
if (tf2)
|
|
3292
|
+
return true;
|
|
3293
|
+
try {
|
|
3294
|
+
tf2 = await import("@tensorflow/tfjs-node");
|
|
3295
|
+
cocoSsd = await import("@tensorflow-models/coco-ssd");
|
|
3296
|
+
poseDetection = await import("@tensorflow-models/pose-detection");
|
|
3297
|
+
return true;
|
|
3298
|
+
} catch (err) {
|
|
3299
|
+
logger12.warn("[VisionModels] TensorFlow.js native addon not available — " + "falling back to description-based detection. Run `npm rebuild @tensorflow/tfjs-node --build-addon-from-source` to enable hardware-accelerated vision.");
|
|
3300
|
+
return false;
|
|
3301
|
+
}
|
|
3302
|
+
}
|
|
3258
3303
|
|
|
3259
3304
|
class VisionModels {
|
|
3260
3305
|
objectDetectionModel = null;
|
|
3261
3306
|
poseDetector = null;
|
|
3262
3307
|
initialized = false;
|
|
3308
|
+
tfAvailable = false;
|
|
3263
3309
|
async initialize(config) {
|
|
3264
3310
|
if (this.initialized) {
|
|
3265
3311
|
return;
|
|
3266
3312
|
}
|
|
3267
3313
|
logger12.info("[VisionModels] Initializing vision models...");
|
|
3314
|
+
this.tfAvailable = await loadTfModules();
|
|
3315
|
+
if (!this.tfAvailable || !tf2 || !cocoSsd || !poseDetection) {
|
|
3316
|
+
this.initialized = true;
|
|
3317
|
+
logger12.info("[VisionModels] Initialized without TensorFlow (fallback mode)");
|
|
3318
|
+
return;
|
|
3319
|
+
}
|
|
3268
3320
|
try {
|
|
3269
3321
|
await tf2.ready();
|
|
3270
3322
|
logger12.info("[VisionModels] TensorFlow.js backend ready");
|
|
@@ -3308,7 +3360,7 @@ class VisionModels {
|
|
|
3308
3360
|
return this.poseDetector !== null;
|
|
3309
3361
|
}
|
|
3310
3362
|
async detectObjects(imageData, _width, _height, description) {
|
|
3311
|
-
if (!this.objectDetectionModel) {
|
|
3363
|
+
if (!this.objectDetectionModel || !tf2) {
|
|
3312
3364
|
logger12.warn("[VisionModels] Object detection model not loaded");
|
|
3313
3365
|
return this.enhancedObjectDetection(description);
|
|
3314
3366
|
}
|
|
@@ -3394,7 +3446,7 @@ class VisionModels {
|
|
|
3394
3446
|
};
|
|
3395
3447
|
}
|
|
3396
3448
|
async detectPoses(imageData, width, height, description) {
|
|
3397
|
-
if (!this.poseDetector) {
|
|
3449
|
+
if (!this.poseDetector || !tf2) {
|
|
3398
3450
|
logger12.warn("[VisionModels] Pose detection model not loaded");
|
|
3399
3451
|
return this.enhancedPoseDetection(description);
|
|
3400
3452
|
}
|
|
@@ -3601,7 +3653,7 @@ import * as path4 from "node:path";
|
|
|
3601
3653
|
import { TextDecoder } from "node:util";
|
|
3602
3654
|
import { Worker } from "node:worker_threads";
|
|
3603
3655
|
import { logger as logger13 } from "@elizaos/core";
|
|
3604
|
-
var __dirname = "/Users/shawwalters/eliza-
|
|
3656
|
+
var __dirname = "/Users/shawwalters/eliza-workspace/plugins/plugin-vision/typescript/src";
|
|
3605
3657
|
|
|
3606
3658
|
class VisionWorkerManager {
|
|
3607
3659
|
config;
|
|
@@ -6041,11 +6093,11 @@ ${Object.entries(captureData.statistics.poseCounts).map(([pose, count]) => `- **
|
|
|
6041
6093
|
`) || "- No people detected"}
|
|
6042
6094
|
|
|
6043
6095
|
## Sample Scene Descriptions
|
|
6044
|
-
${captureData.captures.filter((c) => c.scene?.description).slice(0, 5).map((c, _i) => `### Capture ${c.index} (${c.elapsedMs}ms)
|
|
6045
|
-
"${c.scene
|
|
6046
|
-
- Change: ${c.scene
|
|
6047
|
-
- Objects: ${c.scene
|
|
6048
|
-
- People: ${c.scene
|
|
6096
|
+
${captureData.captures.filter((c) => c.scene?.description).slice(0, 5).map((c, _i) => `### Capture ${c.index} (${c.elapsedMs ?? 0}ms)
|
|
6097
|
+
"${c.scene?.description ?? ""}"
|
|
6098
|
+
- Change: ${c.scene?.changePercentage?.toFixed(1) ?? "0"}%
|
|
6099
|
+
- Objects: ${c.scene?.objectCount ?? 0}
|
|
6100
|
+
- People: ${c.scene?.peopleCount ?? 0}`).join(`
|
|
6049
6101
|
|
|
6050
6102
|
`)}
|
|
6051
6103
|
|
|
@@ -6078,7 +6130,6 @@ ${captureData.captures.filter((c) => c.scene?.description).slice(0, 5).map((c, _
|
|
|
6078
6130
|
var vision_capture_log_default = new VisionCaptureLogTestSuite;
|
|
6079
6131
|
// src/tests/e2e/vision-runtime.ts
|
|
6080
6132
|
import { logger as logger15 } from "@elizaos/core";
|
|
6081
|
-
|
|
6082
6133
|
class VisionRuntimeTestSuite {
|
|
6083
6134
|
name = "vision-runtime-tests";
|
|
6084
6135
|
description = "Real runtime tests for vision plugin functionality";
|
|
@@ -6087,7 +6138,7 @@ class VisionRuntimeTestSuite {
|
|
|
6087
6138
|
name: "Vision service initialization",
|
|
6088
6139
|
fn: async (runtime) => {
|
|
6089
6140
|
logger15.info("[Test] Testing vision service initialization...");
|
|
6090
|
-
const visionService = runtime.getService(
|
|
6141
|
+
const visionService = runtime.getService(VisionServiceType.VISION);
|
|
6091
6142
|
if (!visionService) {
|
|
6092
6143
|
throw new Error("Vision service not found in runtime");
|
|
6093
6144
|
}
|
|
@@ -6096,7 +6147,7 @@ class VisionRuntimeTestSuite {
|
|
|
6096
6147
|
}
|
|
6097
6148
|
const isActive = visionService.isActive();
|
|
6098
6149
|
logger15.info(`[Test] Vision service active: ${isActive}`);
|
|
6099
|
-
if (!isActive && runtime.getSetting("VISION_MODE") !== "OFF") {
|
|
6150
|
+
if (!isActive && runtime.getSetting("VISION_MODE") !== "OFF" /* OFF */) {
|
|
6100
6151
|
throw new Error("Vision service should be active but is not");
|
|
6101
6152
|
}
|
|
6102
6153
|
logger15.info("[Test] ✅ Vision service initialization test passed");
|
|
@@ -6106,7 +6157,7 @@ class VisionRuntimeTestSuite {
|
|
|
6106
6157
|
name: "Scene description functionality",
|
|
6107
6158
|
fn: async (runtime) => {
|
|
6108
6159
|
logger15.info("[Test] Testing scene description...");
|
|
6109
|
-
const visionService = runtime.getService(
|
|
6160
|
+
const visionService = runtime.getService(VisionServiceType.VISION);
|
|
6110
6161
|
if (!visionService) {
|
|
6111
6162
|
throw new Error("Vision service not found");
|
|
6112
6163
|
}
|
|
@@ -6136,13 +6187,18 @@ class VisionRuntimeTestSuite {
|
|
|
6136
6187
|
name: "Vision mode switching",
|
|
6137
6188
|
fn: async (runtime) => {
|
|
6138
6189
|
logger15.info("[Test] Testing vision mode switching...");
|
|
6139
|
-
const visionService = runtime.getService(
|
|
6190
|
+
const visionService = runtime.getService(VisionServiceType.VISION);
|
|
6140
6191
|
if (!visionService) {
|
|
6141
6192
|
throw new Error("Vision service not found");
|
|
6142
6193
|
}
|
|
6143
6194
|
const originalMode = visionService.getVisionMode();
|
|
6144
6195
|
logger15.info(`[Test] Original mode: ${originalMode}`);
|
|
6145
|
-
const testModes = [
|
|
6196
|
+
const testModes = [
|
|
6197
|
+
"CAMERA" /* CAMERA */,
|
|
6198
|
+
"SCREEN" /* SCREEN */,
|
|
6199
|
+
"BOTH" /* BOTH */,
|
|
6200
|
+
"OFF" /* OFF */
|
|
6201
|
+
];
|
|
6146
6202
|
for (const mode of testModes) {
|
|
6147
6203
|
logger15.info(`[Test] Switching to mode: ${mode}`);
|
|
6148
6204
|
await visionService.setVisionMode(mode);
|
|
@@ -6185,7 +6241,7 @@ class VisionRuntimeTestSuite {
|
|
|
6185
6241
|
}
|
|
6186
6242
|
return [];
|
|
6187
6243
|
};
|
|
6188
|
-
await action.handler(runtime, message, {}, {}, callback);
|
|
6244
|
+
await action.handler(runtime, message, { values: {}, data: {}, text: "" }, {}, callback);
|
|
6189
6245
|
if (!responseReceived) {
|
|
6190
6246
|
throw new Error("DESCRIBE_SCENE action did not produce a response");
|
|
6191
6247
|
}
|
|
@@ -6226,7 +6282,7 @@ class VisionRuntimeTestSuite {
|
|
|
6226
6282
|
name: "Florence-2 model initialization",
|
|
6227
6283
|
fn: async (runtime) => {
|
|
6228
6284
|
logger15.info("[Test] Testing Florence-2 model...");
|
|
6229
|
-
const visionService = runtime.getService(
|
|
6285
|
+
const visionService = runtime.getService(VisionServiceType.VISION);
|
|
6230
6286
|
if (!visionService) {
|
|
6231
6287
|
throw new Error("Vision service not found");
|
|
6232
6288
|
}
|
|
@@ -6236,7 +6292,7 @@ class VisionRuntimeTestSuite {
|
|
|
6236
6292
|
return;
|
|
6237
6293
|
}
|
|
6238
6294
|
const mode = visionService.getVisionMode();
|
|
6239
|
-
if (mode === "SCREEN" || mode === "BOTH") {
|
|
6295
|
+
if (mode === "SCREEN" /* SCREEN */ || mode === "BOTH" /* BOTH */) {
|
|
6240
6296
|
const screenCapture = await visionService.getScreenCapture();
|
|
6241
6297
|
if (screenCapture) {
|
|
6242
6298
|
logger15.info("[Test] Screen capture available");
|
|
@@ -6251,7 +6307,7 @@ class VisionRuntimeTestSuite {
|
|
|
6251
6307
|
name: "OCR service functionality",
|
|
6252
6308
|
fn: async (runtime) => {
|
|
6253
6309
|
logger15.info("[Test] Testing OCR service...");
|
|
6254
|
-
const visionService = runtime.getService(
|
|
6310
|
+
const visionService = runtime.getService(VisionServiceType.VISION);
|
|
6255
6311
|
if (!visionService) {
|
|
6256
6312
|
throw new Error("Vision service not found");
|
|
6257
6313
|
}
|
|
@@ -6261,7 +6317,7 @@ class VisionRuntimeTestSuite {
|
|
|
6261
6317
|
return;
|
|
6262
6318
|
}
|
|
6263
6319
|
const mode = visionService.getVisionMode();
|
|
6264
|
-
if (mode === "SCREEN" || mode === "BOTH") {
|
|
6320
|
+
if (mode === "SCREEN" /* SCREEN */ || mode === "BOTH" /* BOTH */) {
|
|
6265
6321
|
const enhancedScene = await visionService.getEnhancedSceneDescription();
|
|
6266
6322
|
if (enhancedScene?.screenAnalysis) {
|
|
6267
6323
|
const ocrText = enhancedScene.screenAnalysis.fullScreenOCR;
|
|
@@ -6278,7 +6334,7 @@ class VisionRuntimeTestSuite {
|
|
|
6278
6334
|
name: "Entity tracking system",
|
|
6279
6335
|
fn: async (runtime) => {
|
|
6280
6336
|
logger15.info("[Test] Testing entity tracking...");
|
|
6281
|
-
const visionService = runtime.getService(
|
|
6337
|
+
const visionService = runtime.getService(VisionServiceType.VISION);
|
|
6282
6338
|
if (!visionService) {
|
|
6283
6339
|
throw new Error("Vision service not found");
|
|
6284
6340
|
}
|
|
@@ -6289,10 +6345,10 @@ class VisionRuntimeTestSuite {
|
|
|
6289
6345
|
const entities = entityTracker.getActiveEntities();
|
|
6290
6346
|
logger15.info(`[Test] Active entities: ${entities.length}`);
|
|
6291
6347
|
for (const entity of entities) {
|
|
6292
|
-
if (!entity.id || !entity.
|
|
6348
|
+
if (!entity.id || !entity.entityType || !entity.lastSeen) {
|
|
6293
6349
|
throw new Error("Entity missing required fields");
|
|
6294
6350
|
}
|
|
6295
|
-
logger15.info(`[Test] Entity ${entity.id}: type=${entity.
|
|
6351
|
+
logger15.info(`[Test] Entity ${entity.id}: type=${entity.entityType}, tracked=${entity.lastSeen - entity.firstSeen}ms`);
|
|
6296
6352
|
}
|
|
6297
6353
|
logger15.info("[Test] ✅ Entity tracking test passed");
|
|
6298
6354
|
}
|
|
@@ -6703,4 +6759,4 @@ export {
|
|
|
6703
6759
|
src_default as default
|
|
6704
6760
|
};
|
|
6705
6761
|
|
|
6706
|
-
//# debugId=
|
|
6762
|
+
//# debugId=4F137AC378EA73BB64756E2164756E21
|