@elizaos/plugin-vision 2.0.0-alpha.4 → 2.0.0-alpha.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +99 -46
- package/dist/index.js.map +5 -5
- package/package.json +8 -6
- package/dist/workers/florence2-worker.js +0 -121102
- package/dist/workers/florence2-worker.js.map +0 -92
- package/dist/workers/ocr-worker.js +0 -128510
- package/dist/workers/ocr-worker.js.map +0 -137
- package/dist/workers/screen-capture-worker.js +0 -359
- package/dist/workers/screen-capture-worker.js.map +0 -11
package/dist/index.js
CHANGED
|
@@ -1,4 +1,20 @@
|
|
|
1
1
|
import { createRequire } from "node:module";
|
|
2
|
+
var __create = Object.create;
|
|
3
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
4
|
+
var __defProp = Object.defineProperty;
|
|
5
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
7
|
+
var __toESM = (mod, isNodeMode, target) => {
|
|
8
|
+
target = mod != null ? __create(__getProtoOf(mod)) : {};
|
|
9
|
+
const to = isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target;
|
|
10
|
+
for (let key of __getOwnPropNames(mod))
|
|
11
|
+
if (!__hasOwnProp.call(to, key))
|
|
12
|
+
__defProp(to, key, {
|
|
13
|
+
get: () => mod[key],
|
|
14
|
+
enumerable: true
|
|
15
|
+
});
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
2
18
|
var __require = /* @__PURE__ */ createRequire(import.meta.url);
|
|
3
19
|
|
|
4
20
|
// src/action.ts
|
|
@@ -1836,7 +1852,7 @@ class StreamingAudioCaptureService extends EventEmitter {
|
|
|
1836
1852
|
}
|
|
1837
1853
|
|
|
1838
1854
|
// src/entity-tracker.ts
|
|
1839
|
-
import { logger as logger5 } from "@elizaos/core";
|
|
1855
|
+
import { createUniqueUuid as createUniqueUuid2, logger as logger5 } from "@elizaos/core";
|
|
1840
1856
|
|
|
1841
1857
|
class EntityTracker {
|
|
1842
1858
|
worldState;
|
|
@@ -2014,8 +2030,9 @@ class EntityTracker {
|
|
|
2014
2030
|
async syncWithRuntime(runtime, frameEntities) {
|
|
2015
2031
|
try {
|
|
2016
2032
|
for (const entity of frameEntities) {
|
|
2033
|
+
const entityUuid = createUniqueUuid2(runtime, entity.id);
|
|
2017
2034
|
const elizaEntity = {
|
|
2018
|
-
id:
|
|
2035
|
+
id: entityUuid,
|
|
2019
2036
|
names: [entity.attributes.name || entity.id],
|
|
2020
2037
|
metadata: {
|
|
2021
2038
|
type: entity.entityType,
|
|
@@ -2027,16 +2044,16 @@ class EntityTracker {
|
|
|
2027
2044
|
agentId: runtime.agentId
|
|
2028
2045
|
};
|
|
2029
2046
|
try {
|
|
2030
|
-
const existing = await runtime.getEntityById(
|
|
2047
|
+
const existing = await runtime.getEntityById(entityUuid);
|
|
2031
2048
|
if (!existing) {
|
|
2032
2049
|
await runtime.createEntity(elizaEntity);
|
|
2033
|
-
logger5.debug(`[EntityTracker] Created entity ${entity.id} in runtime`);
|
|
2050
|
+
logger5.debug(`[EntityTracker] Created entity ${entity.id} (${entityUuid}) in runtime`);
|
|
2034
2051
|
} else {
|
|
2035
2052
|
await runtime.updateEntity({
|
|
2036
2053
|
...elizaEntity,
|
|
2037
|
-
id:
|
|
2054
|
+
id: entityUuid
|
|
2038
2055
|
});
|
|
2039
|
-
logger5.debug(`[EntityTracker] Updated entity ${entity.id} in runtime`);
|
|
2056
|
+
logger5.debug(`[EntityTracker] Updated entity ${entity.id} (${entityUuid}) in runtime`);
|
|
2040
2057
|
}
|
|
2041
2058
|
} catch (err) {
|
|
2042
2059
|
logger5.debug(`[EntityTracker] Could not sync entity ${entity.id}: ${err instanceof Error ? err.message : String(err)}`);
|
|
@@ -2264,8 +2281,19 @@ import { logger as logger8 } from "@elizaos/core";
|
|
|
2264
2281
|
|
|
2265
2282
|
// src/florence2-local.ts
|
|
2266
2283
|
import { logger as logger7 } from "@elizaos/core";
|
|
2267
|
-
import * as tf from "@tensorflow/tfjs-node";
|
|
2268
2284
|
import sharp from "sharp";
|
|
2285
|
+
var tf = null;
|
|
2286
|
+
async function ensureTf() {
|
|
2287
|
+
if (tf)
|
|
2288
|
+
return true;
|
|
2289
|
+
try {
|
|
2290
|
+
tf = await import("@tensorflow/tfjs-node");
|
|
2291
|
+
return true;
|
|
2292
|
+
} catch {
|
|
2293
|
+
logger7.warn("[Florence2Local] TensorFlow.js native addon not available — using heuristic analysis.");
|
|
2294
|
+
return false;
|
|
2295
|
+
}
|
|
2296
|
+
}
|
|
2269
2297
|
|
|
2270
2298
|
class Florence2Local {
|
|
2271
2299
|
model = null;
|
|
@@ -2281,6 +2309,11 @@ class Florence2Local {
|
|
|
2281
2309
|
if (this.initialized) {
|
|
2282
2310
|
return;
|
|
2283
2311
|
}
|
|
2312
|
+
const tfAvailable = await ensureTf();
|
|
2313
|
+
if (!tfAvailable || !tf) {
|
|
2314
|
+
this.initialized = true;
|
|
2315
|
+
return;
|
|
2316
|
+
}
|
|
2284
2317
|
try {
|
|
2285
2318
|
logger7.info("[VisionModel] Initializing MobileNet model for image analysis...");
|
|
2286
2319
|
this.model = await tf.loadGraphModel(this.config.modelUrl);
|
|
@@ -2295,22 +2328,22 @@ class Florence2Local {
|
|
|
2295
2328
|
if (!this.initialized) {
|
|
2296
2329
|
await this.initialize();
|
|
2297
2330
|
}
|
|
2331
|
+
if (!tf || !this.model) {
|
|
2332
|
+
return await this.enhancedFallback(imageBuffer);
|
|
2333
|
+
}
|
|
2298
2334
|
try {
|
|
2299
2335
|
const preprocessed = await this.preprocessImage(imageBuffer);
|
|
2300
|
-
|
|
2301
|
-
|
|
2302
|
-
|
|
2303
|
-
return this.parseModelOutput(predictions);
|
|
2304
|
-
} else {
|
|
2305
|
-
preprocessed.dispose();
|
|
2306
|
-
return await this.enhancedFallback(imageBuffer);
|
|
2307
|
-
}
|
|
2336
|
+
const predictions = await this.runInference(preprocessed);
|
|
2337
|
+
preprocessed.dispose?.();
|
|
2338
|
+
return this.parseModelOutput(predictions);
|
|
2308
2339
|
} catch (error) {
|
|
2309
2340
|
logger7.error("[VisionModel] Analysis failed:", error);
|
|
2310
2341
|
return await this.enhancedFallback(imageBuffer);
|
|
2311
2342
|
}
|
|
2312
2343
|
}
|
|
2313
2344
|
async preprocessImage(imageBuffer) {
|
|
2345
|
+
if (!tf)
|
|
2346
|
+
throw new Error("TensorFlow.js not available");
|
|
2314
2347
|
const resized = await sharp(imageBuffer).resize(224, 224).raw().toBuffer();
|
|
2315
2348
|
const tensor = tf.node.decodeImage(resized, 3);
|
|
2316
2349
|
const normalized = tf.div(tensor, 255);
|
|
@@ -3252,19 +3285,39 @@ class ScreenCaptureService {
|
|
|
3252
3285
|
|
|
3253
3286
|
// src/vision-models.ts
|
|
3254
3287
|
import { logger as logger12 } from "@elizaos/core";
|
|
3255
|
-
|
|
3256
|
-
|
|
3257
|
-
|
|
3288
|
+
var tf2 = null;
|
|
3289
|
+
var cocoSsd = null;
|
|
3290
|
+
var poseDetection = null;
|
|
3291
|
+
async function loadTfModules() {
|
|
3292
|
+
if (tf2)
|
|
3293
|
+
return true;
|
|
3294
|
+
try {
|
|
3295
|
+
tf2 = await import("@tensorflow/tfjs-node");
|
|
3296
|
+
cocoSsd = await import("@tensorflow-models/coco-ssd");
|
|
3297
|
+
poseDetection = await import("@tensorflow-models/pose-detection");
|
|
3298
|
+
return true;
|
|
3299
|
+
} catch (err) {
|
|
3300
|
+
logger12.warn("[VisionModels] TensorFlow.js native addon not available — " + "falling back to description-based detection. Run `npm rebuild @tensorflow/tfjs-node --build-addon-from-source` to enable hardware-accelerated vision.");
|
|
3301
|
+
return false;
|
|
3302
|
+
}
|
|
3303
|
+
}
|
|
3258
3304
|
|
|
3259
3305
|
class VisionModels {
|
|
3260
3306
|
objectDetectionModel = null;
|
|
3261
3307
|
poseDetector = null;
|
|
3262
3308
|
initialized = false;
|
|
3309
|
+
tfAvailable = false;
|
|
3263
3310
|
async initialize(config) {
|
|
3264
3311
|
if (this.initialized) {
|
|
3265
3312
|
return;
|
|
3266
3313
|
}
|
|
3267
3314
|
logger12.info("[VisionModels] Initializing vision models...");
|
|
3315
|
+
this.tfAvailable = await loadTfModules();
|
|
3316
|
+
if (!this.tfAvailable || !tf2 || !cocoSsd || !poseDetection) {
|
|
3317
|
+
this.initialized = true;
|
|
3318
|
+
logger12.info("[VisionModels] Initialized without TensorFlow (fallback mode)");
|
|
3319
|
+
return;
|
|
3320
|
+
}
|
|
3268
3321
|
try {
|
|
3269
3322
|
await tf2.ready();
|
|
3270
3323
|
logger12.info("[VisionModels] TensorFlow.js backend ready");
|
|
@@ -3308,7 +3361,7 @@ class VisionModels {
|
|
|
3308
3361
|
return this.poseDetector !== null;
|
|
3309
3362
|
}
|
|
3310
3363
|
async detectObjects(imageData, _width, _height, description) {
|
|
3311
|
-
if (!this.objectDetectionModel) {
|
|
3364
|
+
if (!this.objectDetectionModel || !tf2) {
|
|
3312
3365
|
logger12.warn("[VisionModels] Object detection model not loaded");
|
|
3313
3366
|
return this.enhancedObjectDetection(description);
|
|
3314
3367
|
}
|
|
@@ -3394,7 +3447,7 @@ class VisionModels {
|
|
|
3394
3447
|
};
|
|
3395
3448
|
}
|
|
3396
3449
|
async detectPoses(imageData, width, height, description) {
|
|
3397
|
-
if (!this.poseDetector) {
|
|
3450
|
+
if (!this.poseDetector || !tf2) {
|
|
3398
3451
|
logger12.warn("[VisionModels] Pose detection model not loaded");
|
|
3399
3452
|
return this.enhancedPoseDetection(description);
|
|
3400
3453
|
}
|
|
@@ -3601,7 +3654,7 @@ import * as path4 from "node:path";
|
|
|
3601
3654
|
import { TextDecoder } from "node:util";
|
|
3602
3655
|
import { Worker } from "node:worker_threads";
|
|
3603
3656
|
import { logger as logger13 } from "@elizaos/core";
|
|
3604
|
-
var __dirname = "/Users/shawwalters/eliza-
|
|
3657
|
+
var __dirname = "/Users/shawwalters/eliza-workspace/plugins/plugin-vision/typescript/src";
|
|
3605
3658
|
|
|
3606
3659
|
class VisionWorkerManager {
|
|
3607
3660
|
config;
|
|
@@ -5131,7 +5184,7 @@ class VisionService extends Service {
|
|
|
5131
5184
|
}
|
|
5132
5185
|
|
|
5133
5186
|
// src/tests/e2e/screen-vision.ts
|
|
5134
|
-
import { createUniqueUuid as
|
|
5187
|
+
import { createUniqueUuid as createUniqueUuid3 } from "@elizaos/core";
|
|
5135
5188
|
class ScreenVisionE2ETestSuite {
|
|
5136
5189
|
name = "plugin-vision-screen-e2e";
|
|
5137
5190
|
description = "E2E tests for screen vision functionality including Florence-2 and OCR";
|
|
@@ -5250,11 +5303,11 @@ class ScreenVisionE2ETestSuite {
|
|
|
5250
5303
|
}
|
|
5251
5304
|
console.log(" Testing SET_VISION_MODE action...");
|
|
5252
5305
|
const message = {
|
|
5253
|
-
id:
|
|
5306
|
+
id: createUniqueUuid3(runtime, "test-msg"),
|
|
5254
5307
|
entityId: runtime.agentId,
|
|
5255
5308
|
content: { text: "set vision mode to both" },
|
|
5256
5309
|
agentId: runtime.agentId,
|
|
5257
|
-
roomId:
|
|
5310
|
+
roomId: createUniqueUuid3(runtime, "test-room"),
|
|
5258
5311
|
createdAt: Date.now()
|
|
5259
5312
|
};
|
|
5260
5313
|
let callbackCalled = false;
|
|
@@ -5303,11 +5356,11 @@ class ScreenVisionE2ETestSuite {
|
|
|
5303
5356
|
}
|
|
5304
5357
|
}
|
|
5305
5358
|
const state = await runtime.composeState({
|
|
5306
|
-
id:
|
|
5359
|
+
id: createUniqueUuid3(runtime, "test-msg"),
|
|
5307
5360
|
entityId: runtime.agentId,
|
|
5308
5361
|
content: { text: "test" },
|
|
5309
5362
|
agentId: runtime.agentId,
|
|
5310
|
-
roomId:
|
|
5363
|
+
roomId: createUniqueUuid3(runtime, "test-room"),
|
|
5311
5364
|
createdAt: Date.now()
|
|
5312
5365
|
});
|
|
5313
5366
|
if (state.text.includes("Vision mode: BOTH")) {
|
|
@@ -5343,7 +5396,7 @@ class ScreenVisionE2ETestSuite {
|
|
|
5343
5396
|
}
|
|
5344
5397
|
var screen_vision_default = new ScreenVisionE2ETestSuite;
|
|
5345
5398
|
// src/tests/e2e/vision-autonomy.ts
|
|
5346
|
-
import { createUniqueUuid as
|
|
5399
|
+
import { createUniqueUuid as createUniqueUuid4 } from "@elizaos/core";
|
|
5347
5400
|
class VisionAutonomyE2ETestSuite {
|
|
5348
5401
|
name = "plugin-vision-autonomy-e2e";
|
|
5349
5402
|
description = "Tests for vision plugin integration with autonomy plugin";
|
|
@@ -5352,9 +5405,9 @@ class VisionAutonomyE2ETestSuite {
|
|
|
5352
5405
|
name: "Should stop autonomous loop with kill command",
|
|
5353
5406
|
fn: async (runtime) => {
|
|
5354
5407
|
console.log("Testing kill autonomous action...");
|
|
5355
|
-
const roomId =
|
|
5408
|
+
const roomId = createUniqueUuid4(runtime, "test-room");
|
|
5356
5409
|
const message = {
|
|
5357
|
-
id:
|
|
5410
|
+
id: createUniqueUuid4(runtime, "test-msg-kill"),
|
|
5358
5411
|
entityId: runtime.agentId,
|
|
5359
5412
|
content: { text: "kill the autonomous loop" },
|
|
5360
5413
|
agentId: runtime.agentId,
|
|
@@ -5445,9 +5498,9 @@ class VisionAutonomyE2ETestSuite {
|
|
|
5445
5498
|
if (!visionService) {
|
|
5446
5499
|
throw new Error("Vision service not available");
|
|
5447
5500
|
}
|
|
5448
|
-
const roomId =
|
|
5501
|
+
const roomId = createUniqueUuid4(runtime, "test-room");
|
|
5449
5502
|
const firstMessage = {
|
|
5450
|
-
id:
|
|
5503
|
+
id: createUniqueUuid4(runtime, "test-msg-1"),
|
|
5451
5504
|
entityId: runtime.agentId,
|
|
5452
5505
|
content: { text: "what do you see?" },
|
|
5453
5506
|
agentId: runtime.agentId,
|
|
@@ -5456,7 +5509,7 @@ class VisionAutonomyE2ETestSuite {
|
|
|
5456
5509
|
};
|
|
5457
5510
|
await runtime.createMemory(firstMessage, "messages");
|
|
5458
5511
|
const firstResponse = {
|
|
5459
|
-
id:
|
|
5512
|
+
id: createUniqueUuid4(runtime, "test-response-1"),
|
|
5460
5513
|
entityId: runtime.agentId,
|
|
5461
5514
|
content: {
|
|
5462
5515
|
text: "I see a test scene",
|
|
@@ -5469,7 +5522,7 @@ class VisionAutonomyE2ETestSuite {
|
|
|
5469
5522
|
await runtime.createMemory(firstResponse, "messages");
|
|
5470
5523
|
await new Promise((resolve) => setTimeout(resolve, 500));
|
|
5471
5524
|
const secondMessage = {
|
|
5472
|
-
id:
|
|
5525
|
+
id: createUniqueUuid4(runtime, "test-msg-2"),
|
|
5473
5526
|
entityId: runtime.agentId,
|
|
5474
5527
|
content: { text: "what did you see before?" },
|
|
5475
5528
|
agentId: runtime.agentId,
|
|
@@ -5478,7 +5531,7 @@ class VisionAutonomyE2ETestSuite {
|
|
|
5478
5531
|
};
|
|
5479
5532
|
await runtime.createMemory(secondMessage, "messages");
|
|
5480
5533
|
const secondResponse = {
|
|
5481
|
-
id:
|
|
5534
|
+
id: createUniqueUuid4(runtime, "test-response-2"),
|
|
5482
5535
|
entityId: runtime.agentId,
|
|
5483
5536
|
content: {
|
|
5484
5537
|
text: "Previously, I saw a test scene",
|
|
@@ -5506,7 +5559,7 @@ class VisionAutonomyE2ETestSuite {
|
|
|
5506
5559
|
name: "Should integrate vision data with agent decision making",
|
|
5507
5560
|
fn: async (runtime) => {
|
|
5508
5561
|
console.log("Testing vision-based decision making...");
|
|
5509
|
-
const roomId =
|
|
5562
|
+
const roomId = createUniqueUuid4(runtime, "test-room");
|
|
5510
5563
|
const scenarios = [
|
|
5511
5564
|
{ text: "Is anyone in the room?", expectedContext: "people" },
|
|
5512
5565
|
{ text: "Should I turn on the lights?", expectedContext: "scene" },
|
|
@@ -5515,7 +5568,7 @@ class VisionAutonomyE2ETestSuite {
|
|
|
5515
5568
|
let scenariosWithVision = 0;
|
|
5516
5569
|
for (const scenario of scenarios) {
|
|
5517
5570
|
const message = {
|
|
5518
|
-
id:
|
|
5571
|
+
id: createUniqueUuid4(runtime, `test-msg-${Date.now()}`),
|
|
5519
5572
|
entityId: runtime.agentId,
|
|
5520
5573
|
content: { text: scenario.text },
|
|
5521
5574
|
agentId: runtime.agentId,
|
|
@@ -5550,11 +5603,11 @@ class VisionAutonomyE2ETestSuite {
|
|
|
5550
5603
|
const isActive = visionService.isActive();
|
|
5551
5604
|
console.log(` Vision service active: ${isActive}`);
|
|
5552
5605
|
const message = {
|
|
5553
|
-
id:
|
|
5606
|
+
id: createUniqueUuid4(runtime, "test-msg-no-vision"),
|
|
5554
5607
|
entityId: runtime.agentId,
|
|
5555
5608
|
content: { text: "test without vision" },
|
|
5556
5609
|
agentId: runtime.agentId,
|
|
5557
|
-
roomId:
|
|
5610
|
+
roomId: createUniqueUuid4(runtime, "test-room"),
|
|
5558
5611
|
createdAt: Date.now()
|
|
5559
5612
|
};
|
|
5560
5613
|
const state = await runtime.composeState(message);
|
|
@@ -5583,7 +5636,7 @@ class VisionAutonomyE2ETestSuite {
|
|
|
5583
5636
|
}
|
|
5584
5637
|
var vision_autonomy_default = new VisionAutonomyE2ETestSuite;
|
|
5585
5638
|
// src/tests/e2e/vision-basic.ts
|
|
5586
|
-
import { createUniqueUuid as
|
|
5639
|
+
import { createUniqueUuid as createUniqueUuid5 } from "@elizaos/core";
|
|
5587
5640
|
class VisionBasicE2ETestSuite {
|
|
5588
5641
|
name = "plugin-vision-basic-e2e";
|
|
5589
5642
|
description = "Basic end-to-end tests for vision plugin functionality";
|
|
@@ -5611,9 +5664,9 @@ class VisionBasicE2ETestSuite {
|
|
|
5611
5664
|
name: "Should describe scene when requested",
|
|
5612
5665
|
fn: async (runtime) => {
|
|
5613
5666
|
console.log("Testing scene description action...");
|
|
5614
|
-
const roomId =
|
|
5667
|
+
const roomId = createUniqueUuid5(runtime, "test-room");
|
|
5615
5668
|
const message = {
|
|
5616
|
-
id:
|
|
5669
|
+
id: createUniqueUuid5(runtime, "test-msg-describe"),
|
|
5617
5670
|
entityId: runtime.agentId,
|
|
5618
5671
|
content: { text: "what do you see?" },
|
|
5619
5672
|
agentId: runtime.agentId,
|
|
@@ -5677,9 +5730,9 @@ class VisionBasicE2ETestSuite {
|
|
|
5677
5730
|
name: "Should capture image when requested",
|
|
5678
5731
|
fn: async (runtime) => {
|
|
5679
5732
|
console.log("Testing image capture action...");
|
|
5680
|
-
const roomId =
|
|
5733
|
+
const roomId = createUniqueUuid5(runtime, "test-room");
|
|
5681
5734
|
const message = {
|
|
5682
|
-
id:
|
|
5735
|
+
id: createUniqueUuid5(runtime, "test-msg-capture"),
|
|
5683
5736
|
entityId: runtime.agentId,
|
|
5684
5737
|
content: { text: "take a photo" },
|
|
5685
5738
|
agentId: runtime.agentId,
|
|
@@ -5749,11 +5802,11 @@ class VisionBasicE2ETestSuite {
|
|
|
5749
5802
|
fn: async (runtime) => {
|
|
5750
5803
|
console.log("Testing vision provider...");
|
|
5751
5804
|
const message = {
|
|
5752
|
-
id:
|
|
5805
|
+
id: createUniqueUuid5(runtime, "test-msg-provider"),
|
|
5753
5806
|
entityId: runtime.agentId,
|
|
5754
5807
|
content: { text: "test provider" },
|
|
5755
5808
|
agentId: runtime.agentId,
|
|
5756
|
-
roomId:
|
|
5809
|
+
roomId: createUniqueUuid5(runtime, "test-room"),
|
|
5757
5810
|
createdAt: Date.now()
|
|
5758
5811
|
};
|
|
5759
5812
|
const state = await runtime.composeState(message);
|
|
@@ -6707,4 +6760,4 @@ export {
|
|
|
6707
6760
|
src_default as default
|
|
6708
6761
|
};
|
|
6709
6762
|
|
|
6710
|
-
//# debugId=
|
|
6763
|
+
//# debugId=8F996A8EAA2C894364756E2164756E21
|