@elizaos/plugin-vision 2.0.0-alpha.4 → 2.0.0-alpha.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,4 +1,20 @@
1
1
  import { createRequire } from "node:module";
2
+ var __create = Object.create;
3
+ var __getProtoOf = Object.getPrototypeOf;
4
+ var __defProp = Object.defineProperty;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
7
+ var __toESM = (mod, isNodeMode, target) => {
8
+ target = mod != null ? __create(__getProtoOf(mod)) : {};
9
+ const to = isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target;
10
+ for (let key of __getOwnPropNames(mod))
11
+ if (!__hasOwnProp.call(to, key))
12
+ __defProp(to, key, {
13
+ get: () => mod[key],
14
+ enumerable: true
15
+ });
16
+ return to;
17
+ };
2
18
  var __require = /* @__PURE__ */ createRequire(import.meta.url);
3
19
 
4
20
  // src/action.ts
@@ -1836,7 +1852,7 @@ class StreamingAudioCaptureService extends EventEmitter {
1836
1852
  }
1837
1853
 
1838
1854
  // src/entity-tracker.ts
1839
- import { logger as logger5 } from "@elizaos/core";
1855
+ import { createUniqueUuid as createUniqueUuid2, logger as logger5 } from "@elizaos/core";
1840
1856
 
1841
1857
  class EntityTracker {
1842
1858
  worldState;
@@ -2014,8 +2030,9 @@ class EntityTracker {
2014
2030
  async syncWithRuntime(runtime, frameEntities) {
2015
2031
  try {
2016
2032
  for (const entity of frameEntities) {
2033
+ const entityUuid = createUniqueUuid2(runtime, entity.id);
2017
2034
  const elizaEntity = {
2018
- id: entity.id,
2035
+ id: entityUuid,
2019
2036
  names: [entity.attributes.name || entity.id],
2020
2037
  metadata: {
2021
2038
  type: entity.entityType,
@@ -2027,16 +2044,16 @@ class EntityTracker {
2027
2044
  agentId: runtime.agentId
2028
2045
  };
2029
2046
  try {
2030
- const existing = await runtime.getEntityById(entity.id);
2047
+ const existing = await runtime.getEntityById(entityUuid);
2031
2048
  if (!existing) {
2032
2049
  await runtime.createEntity(elizaEntity);
2033
- logger5.debug(`[EntityTracker] Created entity ${entity.id} in runtime`);
2050
+ logger5.debug(`[EntityTracker] Created entity ${entity.id} (${entityUuid}) in runtime`);
2034
2051
  } else {
2035
2052
  await runtime.updateEntity({
2036
2053
  ...elizaEntity,
2037
- id: entity.id
2054
+ id: entityUuid
2038
2055
  });
2039
- logger5.debug(`[EntityTracker] Updated entity ${entity.id} in runtime`);
2056
+ logger5.debug(`[EntityTracker] Updated entity ${entity.id} (${entityUuid}) in runtime`);
2040
2057
  }
2041
2058
  } catch (err) {
2042
2059
  logger5.debug(`[EntityTracker] Could not sync entity ${entity.id}: ${err instanceof Error ? err.message : String(err)}`);
@@ -2264,8 +2281,19 @@ import { logger as logger8 } from "@elizaos/core";
2264
2281
 
2265
2282
  // src/florence2-local.ts
2266
2283
  import { logger as logger7 } from "@elizaos/core";
2267
- import * as tf from "@tensorflow/tfjs-node";
2268
2284
  import sharp from "sharp";
2285
+ var tf = null;
2286
+ async function ensureTf() {
2287
+ if (tf)
2288
+ return true;
2289
+ try {
2290
+ tf = await import("@tensorflow/tfjs-node");
2291
+ return true;
2292
+ } catch {
2293
+ logger7.warn("[Florence2Local] TensorFlow.js native addon not available — using heuristic analysis.");
2294
+ return false;
2295
+ }
2296
+ }
2269
2297
 
2270
2298
  class Florence2Local {
2271
2299
  model = null;
@@ -2281,6 +2309,11 @@ class Florence2Local {
2281
2309
  if (this.initialized) {
2282
2310
  return;
2283
2311
  }
2312
+ const tfAvailable = await ensureTf();
2313
+ if (!tfAvailable || !tf) {
2314
+ this.initialized = true;
2315
+ return;
2316
+ }
2284
2317
  try {
2285
2318
  logger7.info("[VisionModel] Initializing MobileNet model for image analysis...");
2286
2319
  this.model = await tf.loadGraphModel(this.config.modelUrl);
@@ -2295,22 +2328,22 @@ class Florence2Local {
2295
2328
  if (!this.initialized) {
2296
2329
  await this.initialize();
2297
2330
  }
2331
+ if (!tf || !this.model) {
2332
+ return await this.enhancedFallback(imageBuffer);
2333
+ }
2298
2334
  try {
2299
2335
  const preprocessed = await this.preprocessImage(imageBuffer);
2300
- if (this.model) {
2301
- const predictions = await this.runInference(preprocessed);
2302
- preprocessed.dispose();
2303
- return this.parseModelOutput(predictions);
2304
- } else {
2305
- preprocessed.dispose();
2306
- return await this.enhancedFallback(imageBuffer);
2307
- }
2336
+ const predictions = await this.runInference(preprocessed);
2337
+ preprocessed.dispose?.();
2338
+ return this.parseModelOutput(predictions);
2308
2339
  } catch (error) {
2309
2340
  logger7.error("[VisionModel] Analysis failed:", error);
2310
2341
  return await this.enhancedFallback(imageBuffer);
2311
2342
  }
2312
2343
  }
2313
2344
  async preprocessImage(imageBuffer) {
2345
+ if (!tf)
2346
+ throw new Error("TensorFlow.js not available");
2314
2347
  const resized = await sharp(imageBuffer).resize(224, 224).raw().toBuffer();
2315
2348
  const tensor = tf.node.decodeImage(resized, 3);
2316
2349
  const normalized = tf.div(tensor, 255);
@@ -3252,19 +3285,39 @@ class ScreenCaptureService {
3252
3285
 
3253
3286
  // src/vision-models.ts
3254
3287
  import { logger as logger12 } from "@elizaos/core";
3255
- import * as tf2 from "@tensorflow/tfjs-node";
3256
- import * as cocoSsd from "@tensorflow-models/coco-ssd";
3257
- import * as poseDetection from "@tensorflow-models/pose-detection";
3288
+ var tf2 = null;
3289
+ var cocoSsd = null;
3290
+ var poseDetection = null;
3291
+ async function loadTfModules() {
3292
+ if (tf2)
3293
+ return true;
3294
+ try {
3295
+ tf2 = await import("@tensorflow/tfjs-node");
3296
+ cocoSsd = await import("@tensorflow-models/coco-ssd");
3297
+ poseDetection = await import("@tensorflow-models/pose-detection");
3298
+ return true;
3299
+ } catch (err) {
3300
+ logger12.warn("[VisionModels] TensorFlow.js native addon not available — " + "falling back to description-based detection. Run `npm rebuild @tensorflow/tfjs-node --build-addon-from-source` to enable hardware-accelerated vision.");
3301
+ return false;
3302
+ }
3303
+ }
3258
3304
 
3259
3305
  class VisionModels {
3260
3306
  objectDetectionModel = null;
3261
3307
  poseDetector = null;
3262
3308
  initialized = false;
3309
+ tfAvailable = false;
3263
3310
  async initialize(config) {
3264
3311
  if (this.initialized) {
3265
3312
  return;
3266
3313
  }
3267
3314
  logger12.info("[VisionModels] Initializing vision models...");
3315
+ this.tfAvailable = await loadTfModules();
3316
+ if (!this.tfAvailable || !tf2 || !cocoSsd || !poseDetection) {
3317
+ this.initialized = true;
3318
+ logger12.info("[VisionModels] Initialized without TensorFlow (fallback mode)");
3319
+ return;
3320
+ }
3268
3321
  try {
3269
3322
  await tf2.ready();
3270
3323
  logger12.info("[VisionModels] TensorFlow.js backend ready");
@@ -3308,7 +3361,7 @@ class VisionModels {
3308
3361
  return this.poseDetector !== null;
3309
3362
  }
3310
3363
  async detectObjects(imageData, _width, _height, description) {
3311
- if (!this.objectDetectionModel) {
3364
+ if (!this.objectDetectionModel || !tf2) {
3312
3365
  logger12.warn("[VisionModels] Object detection model not loaded");
3313
3366
  return this.enhancedObjectDetection(description);
3314
3367
  }
@@ -3394,7 +3447,7 @@ class VisionModels {
3394
3447
  };
3395
3448
  }
3396
3449
  async detectPoses(imageData, width, height, description) {
3397
- if (!this.poseDetector) {
3450
+ if (!this.poseDetector || !tf2) {
3398
3451
  logger12.warn("[VisionModels] Pose detection model not loaded");
3399
3452
  return this.enhancedPoseDetection(description);
3400
3453
  }
@@ -3601,7 +3654,7 @@ import * as path4 from "node:path";
3601
3654
  import { TextDecoder } from "node:util";
3602
3655
  import { Worker } from "node:worker_threads";
3603
3656
  import { logger as logger13 } from "@elizaos/core";
3604
- var __dirname = "/Users/shawwalters/eliza-ok/plugins/plugin-vision/typescript/src";
3657
+ var __dirname = "/Users/shawwalters/eliza-workspace/plugins/plugin-vision/typescript/src";
3605
3658
 
3606
3659
  class VisionWorkerManager {
3607
3660
  config;
@@ -5131,7 +5184,7 @@ class VisionService extends Service {
5131
5184
  }
5132
5185
 
5133
5186
  // src/tests/e2e/screen-vision.ts
5134
- import { createUniqueUuid as createUniqueUuid2 } from "@elizaos/core";
5187
+ import { createUniqueUuid as createUniqueUuid3 } from "@elizaos/core";
5135
5188
  class ScreenVisionE2ETestSuite {
5136
5189
  name = "plugin-vision-screen-e2e";
5137
5190
  description = "E2E tests for screen vision functionality including Florence-2 and OCR";
@@ -5250,11 +5303,11 @@ class ScreenVisionE2ETestSuite {
5250
5303
  }
5251
5304
  console.log(" Testing SET_VISION_MODE action...");
5252
5305
  const message = {
5253
- id: createUniqueUuid2(runtime, "test-msg"),
5306
+ id: createUniqueUuid3(runtime, "test-msg"),
5254
5307
  entityId: runtime.agentId,
5255
5308
  content: { text: "set vision mode to both" },
5256
5309
  agentId: runtime.agentId,
5257
- roomId: createUniqueUuid2(runtime, "test-room"),
5310
+ roomId: createUniqueUuid3(runtime, "test-room"),
5258
5311
  createdAt: Date.now()
5259
5312
  };
5260
5313
  let callbackCalled = false;
@@ -5303,11 +5356,11 @@ class ScreenVisionE2ETestSuite {
5303
5356
  }
5304
5357
  }
5305
5358
  const state = await runtime.composeState({
5306
- id: createUniqueUuid2(runtime, "test-msg"),
5359
+ id: createUniqueUuid3(runtime, "test-msg"),
5307
5360
  entityId: runtime.agentId,
5308
5361
  content: { text: "test" },
5309
5362
  agentId: runtime.agentId,
5310
- roomId: createUniqueUuid2(runtime, "test-room"),
5363
+ roomId: createUniqueUuid3(runtime, "test-room"),
5311
5364
  createdAt: Date.now()
5312
5365
  });
5313
5366
  if (state.text.includes("Vision mode: BOTH")) {
@@ -5343,7 +5396,7 @@ class ScreenVisionE2ETestSuite {
5343
5396
  }
5344
5397
  var screen_vision_default = new ScreenVisionE2ETestSuite;
5345
5398
  // src/tests/e2e/vision-autonomy.ts
5346
- import { createUniqueUuid as createUniqueUuid3 } from "@elizaos/core";
5399
+ import { createUniqueUuid as createUniqueUuid4 } from "@elizaos/core";
5347
5400
  class VisionAutonomyE2ETestSuite {
5348
5401
  name = "plugin-vision-autonomy-e2e";
5349
5402
  description = "Tests for vision plugin integration with autonomy plugin";
@@ -5352,9 +5405,9 @@ class VisionAutonomyE2ETestSuite {
5352
5405
  name: "Should stop autonomous loop with kill command",
5353
5406
  fn: async (runtime) => {
5354
5407
  console.log("Testing kill autonomous action...");
5355
- const roomId = createUniqueUuid3(runtime, "test-room");
5408
+ const roomId = createUniqueUuid4(runtime, "test-room");
5356
5409
  const message = {
5357
- id: createUniqueUuid3(runtime, "test-msg-kill"),
5410
+ id: createUniqueUuid4(runtime, "test-msg-kill"),
5358
5411
  entityId: runtime.agentId,
5359
5412
  content: { text: "kill the autonomous loop" },
5360
5413
  agentId: runtime.agentId,
@@ -5445,9 +5498,9 @@ class VisionAutonomyE2ETestSuite {
5445
5498
  if (!visionService) {
5446
5499
  throw new Error("Vision service not available");
5447
5500
  }
5448
- const roomId = createUniqueUuid3(runtime, "test-room");
5501
+ const roomId = createUniqueUuid4(runtime, "test-room");
5449
5502
  const firstMessage = {
5450
- id: createUniqueUuid3(runtime, "test-msg-1"),
5503
+ id: createUniqueUuid4(runtime, "test-msg-1"),
5451
5504
  entityId: runtime.agentId,
5452
5505
  content: { text: "what do you see?" },
5453
5506
  agentId: runtime.agentId,
@@ -5456,7 +5509,7 @@ class VisionAutonomyE2ETestSuite {
5456
5509
  };
5457
5510
  await runtime.createMemory(firstMessage, "messages");
5458
5511
  const firstResponse = {
5459
- id: createUniqueUuid3(runtime, "test-response-1"),
5512
+ id: createUniqueUuid4(runtime, "test-response-1"),
5460
5513
  entityId: runtime.agentId,
5461
5514
  content: {
5462
5515
  text: "I see a test scene",
@@ -5469,7 +5522,7 @@ class VisionAutonomyE2ETestSuite {
5469
5522
  await runtime.createMemory(firstResponse, "messages");
5470
5523
  await new Promise((resolve) => setTimeout(resolve, 500));
5471
5524
  const secondMessage = {
5472
- id: createUniqueUuid3(runtime, "test-msg-2"),
5525
+ id: createUniqueUuid4(runtime, "test-msg-2"),
5473
5526
  entityId: runtime.agentId,
5474
5527
  content: { text: "what did you see before?" },
5475
5528
  agentId: runtime.agentId,
@@ -5478,7 +5531,7 @@ class VisionAutonomyE2ETestSuite {
5478
5531
  };
5479
5532
  await runtime.createMemory(secondMessage, "messages");
5480
5533
  const secondResponse = {
5481
- id: createUniqueUuid3(runtime, "test-response-2"),
5534
+ id: createUniqueUuid4(runtime, "test-response-2"),
5482
5535
  entityId: runtime.agentId,
5483
5536
  content: {
5484
5537
  text: "Previously, I saw a test scene",
@@ -5506,7 +5559,7 @@ class VisionAutonomyE2ETestSuite {
5506
5559
  name: "Should integrate vision data with agent decision making",
5507
5560
  fn: async (runtime) => {
5508
5561
  console.log("Testing vision-based decision making...");
5509
- const roomId = createUniqueUuid3(runtime, "test-room");
5562
+ const roomId = createUniqueUuid4(runtime, "test-room");
5510
5563
  const scenarios = [
5511
5564
  { text: "Is anyone in the room?", expectedContext: "people" },
5512
5565
  { text: "Should I turn on the lights?", expectedContext: "scene" },
@@ -5515,7 +5568,7 @@ class VisionAutonomyE2ETestSuite {
5515
5568
  let scenariosWithVision = 0;
5516
5569
  for (const scenario of scenarios) {
5517
5570
  const message = {
5518
- id: createUniqueUuid3(runtime, `test-msg-${Date.now()}`),
5571
+ id: createUniqueUuid4(runtime, `test-msg-${Date.now()}`),
5519
5572
  entityId: runtime.agentId,
5520
5573
  content: { text: scenario.text },
5521
5574
  agentId: runtime.agentId,
@@ -5550,11 +5603,11 @@ class VisionAutonomyE2ETestSuite {
5550
5603
  const isActive = visionService.isActive();
5551
5604
  console.log(` Vision service active: ${isActive}`);
5552
5605
  const message = {
5553
- id: createUniqueUuid3(runtime, "test-msg-no-vision"),
5606
+ id: createUniqueUuid4(runtime, "test-msg-no-vision"),
5554
5607
  entityId: runtime.agentId,
5555
5608
  content: { text: "test without vision" },
5556
5609
  agentId: runtime.agentId,
5557
- roomId: createUniqueUuid3(runtime, "test-room"),
5610
+ roomId: createUniqueUuid4(runtime, "test-room"),
5558
5611
  createdAt: Date.now()
5559
5612
  };
5560
5613
  const state = await runtime.composeState(message);
@@ -5583,7 +5636,7 @@ class VisionAutonomyE2ETestSuite {
5583
5636
  }
5584
5637
  var vision_autonomy_default = new VisionAutonomyE2ETestSuite;
5585
5638
  // src/tests/e2e/vision-basic.ts
5586
- import { createUniqueUuid as createUniqueUuid4 } from "@elizaos/core";
5639
+ import { createUniqueUuid as createUniqueUuid5 } from "@elizaos/core";
5587
5640
  class VisionBasicE2ETestSuite {
5588
5641
  name = "plugin-vision-basic-e2e";
5589
5642
  description = "Basic end-to-end tests for vision plugin functionality";
@@ -5611,9 +5664,9 @@ class VisionBasicE2ETestSuite {
5611
5664
  name: "Should describe scene when requested",
5612
5665
  fn: async (runtime) => {
5613
5666
  console.log("Testing scene description action...");
5614
- const roomId = createUniqueUuid4(runtime, "test-room");
5667
+ const roomId = createUniqueUuid5(runtime, "test-room");
5615
5668
  const message = {
5616
- id: createUniqueUuid4(runtime, "test-msg-describe"),
5669
+ id: createUniqueUuid5(runtime, "test-msg-describe"),
5617
5670
  entityId: runtime.agentId,
5618
5671
  content: { text: "what do you see?" },
5619
5672
  agentId: runtime.agentId,
@@ -5677,9 +5730,9 @@ class VisionBasicE2ETestSuite {
5677
5730
  name: "Should capture image when requested",
5678
5731
  fn: async (runtime) => {
5679
5732
  console.log("Testing image capture action...");
5680
- const roomId = createUniqueUuid4(runtime, "test-room");
5733
+ const roomId = createUniqueUuid5(runtime, "test-room");
5681
5734
  const message = {
5682
- id: createUniqueUuid4(runtime, "test-msg-capture"),
5735
+ id: createUniqueUuid5(runtime, "test-msg-capture"),
5683
5736
  entityId: runtime.agentId,
5684
5737
  content: { text: "take a photo" },
5685
5738
  agentId: runtime.agentId,
@@ -5749,11 +5802,11 @@ class VisionBasicE2ETestSuite {
5749
5802
  fn: async (runtime) => {
5750
5803
  console.log("Testing vision provider...");
5751
5804
  const message = {
5752
- id: createUniqueUuid4(runtime, "test-msg-provider"),
5805
+ id: createUniqueUuid5(runtime, "test-msg-provider"),
5753
5806
  entityId: runtime.agentId,
5754
5807
  content: { text: "test provider" },
5755
5808
  agentId: runtime.agentId,
5756
- roomId: createUniqueUuid4(runtime, "test-room"),
5809
+ roomId: createUniqueUuid5(runtime, "test-room"),
5757
5810
  createdAt: Date.now()
5758
5811
  };
5759
5812
  const state = await runtime.composeState(message);
@@ -6707,4 +6760,4 @@ export {
6707
6760
  src_default as default
6708
6761
  };
6709
6762
 
6710
- //# debugId=F534BCEA2CE2323664756E2164756E21
6763
+ //# debugId=8F996A8EAA2C894364756E2164756E21