@elizaos/plugin-vision 2.0.0-alpha.8 → 2.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,22 +1,9 @@
1
1
  import { createRequire } from "node:module";
2
- var __create = Object.create;
3
- var __getProtoOf = Object.getPrototypeOf;
4
- var __defProp = Object.defineProperty;
5
- var __getOwnPropNames = Object.getOwnPropertyNames;
6
- var __hasOwnProp = Object.prototype.hasOwnProperty;
7
- var __toESM = (mod, isNodeMode, target) => {
8
- target = mod != null ? __create(__getProtoOf(mod)) : {};
9
- const to = isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target;
10
- for (let key of __getOwnPropNames(mod))
11
- if (!__hasOwnProp.call(to, key))
12
- __defProp(to, key, {
13
- get: () => mod[key],
14
- enumerable: true
15
- });
16
- return to;
17
- };
18
2
  var __require = /* @__PURE__ */ createRequire(import.meta.url);
19
3
 
4
+ // src/index.ts
5
+ import { promoteSubactionsToActions } from "@elizaos/core";
6
+
20
7
  // src/action.ts
21
8
  import {
22
9
  ContentType,
@@ -30,6 +17,200 @@ var VisionServiceType = {
30
17
  };
31
18
 
32
19
  // src/action.ts
20
// Upper bound, in milliseconds, that a single vision-service call may take
// before the action gives up (consumed by withVisionTimeout).
var VISION_ACTION_TIMEOUT_MS = 1e4;
// Longest description text (in UTF-16 code units) returned to the caller.
var MAX_VISION_TEXT_LENGTH = 4000;
// Cap on how many people / objects of a scene are summarized.
var MAX_VISION_ENTITIES = 25;
// Canonical sub-operation names accepted by the VISION action.
var VISION_OPS = [
  "describe",
  "capture",
  "set_mode",
  "name_entity",
  "identify_person",
  "track_entity"
];
// NOTE(review): presumably the routing contexts in which the VISION action is
// offered; the consumer is outside this section, so confirm against it.
var ALL_VISION_CONTEXTS = [
  "media",
  "screen_time",
  "automation",
  "memory",
  "settings"
];
// Keyword lists below are matched as lowercase substrings of the user message.
// Entries are grouped by language; a word shared by several languages is
// listed once because duplicates add nothing to a substring search.
var DESCRIBE_KEYWORDS = [
  // English
  "describe",
  "scene",
  "see",
  "look",
  "camera",
  "screen",
  "object",
  "person",
  // Spanish (both accent-less and accented spellings, since matching is exact)
  "escena",
  "ver",
  "camara",
  "cámara",
  // French
  "décrire",
  "scène",
  "voir",
  // German
  "beschreiben",
  "szene",
  "sehen",
  // Italian
  "descrivi",
  "scena",
  "vedi",
  // Japanese
  "説明",
  "見える",
  // Chinese
  "场景",
  "描述",
  "看见",
  // Korean
  "장면",
  "설명",
  "보여"
];
var CAPTURE_KEYWORDS = [
  // English ("photo" also covers French)
  "capture",
  "image",
  "photo",
  "picture",
  "snapshot",
  "screenshot",
  "camera",
  // Spanish ("foto" also covers German)
  "captura",
  "foto",
  "imagen",
  // French
  "capturer",
  // German
  "bild",
  // Italian
  "capturare",
  // Japanese
  "写真",
  "画像",
  "スクリーンショット",
  // Chinese
  "拍照",
  "截图",
  // Korean
  "이미지",
  "사진",
  "스크린샷"
];
var SET_MODE_KEYWORDS = [
  // English
  "vision",
  "mode",
  "camera",
  "screen",
  "both",
  "disable",
  "enable",
  "off",
  // Spanish (both accent-less and accented spellings, since matching is exact)
  "visión",
  "camara",
  "cámara",
  "pantalla",
  // French
  "écran",
  // German
  "kamera",
  "bildschirm",
  // Italian
  "schermo",
  // Japanese
  "ビジョン",
  "カメラ",
  "画面",
  // Chinese
  "视觉",
  "相机",
  "屏幕",
  // Korean
  "비전",
  "카메라",
  "화면"
];
var NAME_ENTITY_KEYWORDS = [
  // English ("name" and "person" also cover German)
  "name",
  "named",
  "call",
  "person",
  "entity",
  "remember",
  "object",
  // Spanish ("persona" also covers Italian)
  "nombre",
  "llama",
  "persona",
  // French
  "nom",
  "appelle",
  "personne",
  // German
  "nenne",
  // Italian
  "nome",
  "chiama",
  // Japanese ("人" also covers Chinese)
  "名前",
  "呼ぶ",
  "人",
  // Chinese
  "命名",
  "叫",
  // Korean
  "이름",
  "불러",
  "사람"
];
var IDENTIFY_PERSON_KEYWORDS = [
  // English ("person" also covers German)
  "identify",
  "recognize",
  "who is",
  "person",
  "face",
  "seen before",
  // Spanish ("persona" also covers Italian)
  "identificar",
  "reconoces",
  "persona",
  // French
  "visage",
  "reconnais",
  "personne",
  // German
  "erkennen",
  "gesicht",
  // Italian
  "riconosci",
  // Japanese
  "識別",
  "誰",
  "顔",
  // Chinese
  "识别",
  "是谁",
  "人",
  // Korean
  "식별",
  "누구",
  "얼굴"
];
var TRACK_ENTITY_KEYWORDS = [
  // English ("person" also covers German)
  "track",
  "follow",
  "watch",
  "keep an eye",
  "entity",
  "person",
  "object",
  // Spanish ("persona" also covers Italian)
  "rastrear",
  "seguir",
  "vigilar",
  "persona",
  // French
  "suivre",
  "surveiller",
  "personne",
  // German
  "verfolgen",
  "beobachten",
  // Italian
  "traccia",
  "segui",
  // Japanese ("人" also covers Chinese)
  "追跡",
  "見張",
  "人",
  // Chinese
  "跟踪",
  "关注",
  // Korean
  "추적",
  "지켜봐",
  "사람"
];
208
/**
 * Race a pending vision-service call against a timeout.
 *
 * @param {Promise<*>} promise - The operation to wait for.
 * @param {string} label - Prefix of the timeout error message ("<label> timed out").
 * @param {number} [timeoutMs=VISION_ACTION_TIMEOUT_MS] - Milliseconds to wait
 *   before rejecting.
 * @returns {Promise<*>} Settles exactly like `promise`, or rejects with an
 *   `Error` when the timeout elapses first.
 */
function withVisionTimeout(promise, label, timeoutMs = VISION_ACTION_TIMEOUT_MS) {
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error(`${label} timed out`)), timeoutMs);
  });
  // Cancel the timer as soon as the race settles. Without this every call
  // leaves a live timer behind for the full timeout, which keeps the event
  // loop busy and accumulates under frequent use.
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
}
33
214
  async function saveExecutionRecord(runtime, messageContext, thought, text, actions, attachments) {
34
215
  const memory = {
35
216
  id: createUniqueUuid(runtime, `vision-record-${Date.now()}`),
@@ -47,1108 +228,1004 @@ async function saveExecutionRecord(runtime, messageContext, thought, text, actio
47
228
  };
48
229
  await runtime.createMemory(memory, "messages");
49
230
  }
50
- var describeSceneAction = {
51
- name: "DESCRIBE_SCENE",
52
- similes: ["ANALYZE_SCENE", "WHAT_DO_YOU_SEE", "VISION_CHECK", "LOOK_AROUND"],
53
- description: "Analyzes the current visual scene and provides a detailed description of what the agent sees through the camera. Returns scene analysis data including people count, objects, and camera info for action chaining.",
54
- validate: async (runtime, message, state, options) => {
55
- const __avTextRaw = typeof message?.content?.text === "string" ? message.content.text : "";
56
- const __avText = __avTextRaw.toLowerCase();
57
- const __avVisionService = runtime?.getService?.("VISION");
58
- const __avLegacyContextOk = Boolean(__avVisionService && typeof __avVisionService.isActive === "function" && __avVisionService.isActive());
59
- const __avKeywords = ["describe", "scene"];
60
- const __avKeywordOk = __avKeywords.length > 0 && (__avKeywords.some((kw) => kw.length > 0 && __avText.includes(kw)) || __avLegacyContextOk);
61
- const __avRegex = new RegExp("\\b(?:describe|scene)\\b", "i");
62
- const __avRegexOk = __avRegex.test(__avText) || __avLegacyContextOk;
63
- const __avSource = String(message?.content?.source ?? message?.source ?? "");
64
- const __avExpectedSource = "";
65
- const __avSourceOk = __avExpectedSource ? __avSource === __avExpectedSource : Boolean(__avSource || state || runtime?.agentId || runtime?.getService);
66
- const __avOptions = options && typeof options === "object" ? options : {};
67
- const __avInputOk = __avText.trim().length > 0 || Object.keys(__avOptions).length > 0 || Boolean(message?.content && typeof message.content === "object");
68
- if (!(__avKeywordOk && __avRegexOk && __avSourceOk && __avInputOk)) {
69
- return false;
70
- }
71
- const __avLegacyValidate = async (runtime2, _message, _state) => {
72
- const visionService = runtime2.getService("VISION");
73
- return !!visionService && visionService.isActive();
74
- };
75
- try {
76
- return Boolean(await __avLegacyValidate(runtime, message, state, options));
77
- } catch {
78
- return false;
231
/**
 * Flatten handler options into one parameter bag.
 *
 * Keys of `options.parameters` are merged over the top-level keys of
 * `options`, so a nested value wins when both define the same key. The
 * `parameters` key itself is kept in the result.
 *
 * @param {*} options - Handler options; anything that is not a non-array
 *   object is treated as empty.
 * @returns {Object} A new object; `options` is never mutated.
 */
function readActionParams(options) {
  // Arrays are objects too, but spreading one would yield "0", "1", ... keys,
  // so they are rejected along with null and primitives.
  const isRecord = (value) => value !== null && typeof value === "object" && !Array.isArray(value);
  const direct = isRecord(options) ? options : {};
  const parameters = isRecord(direct.parameters) ? direct.parameters : {};
  return { ...direct, ...parameters };
}
236
+ function selectedContextMatches(state, contexts) {
237
+ const selected = new Set;
238
+ const collect = (value) => {
239
+ if (!Array.isArray(value))
240
+ return;
241
+ for (const item of value) {
242
+ if (typeof item === "string")
243
+ selected.add(item);
79
244
  }
80
- },
81
- handler: async (runtime, message, _state, _options, callback, _responses) => {
82
- const visionService = runtime.getService("VISION");
83
- if (!visionService || !visionService.isActive()) {
84
- const thought = "Vision service is not available or no camera is connected.";
85
- const text = "I cannot see anything right now. No camera is available.";
86
- await saveExecutionRecord(runtime, message, thought, text, ["DESCRIBE_SCENE"]);
245
+ };
246
+ collect(state?.values?.selectedContexts);
247
+ collect(state?.data?.selectedContexts);
248
+ const contextObject = state?.data?.contextObject;
249
+ collect(contextObject?.trajectoryPrefix?.selectedContexts);
250
+ collect(contextObject?.metadata?.selectedContexts);
251
+ return contexts.some((context) => selected.has(context));
252
+ }
253
/**
 * Report whether the runtime has a "VISION" service that says it is active.
 *
 * @param {{ getService: function(string): * }} runtime - Agent runtime.
 * @returns {boolean} `true` only when the service exists, exposes an
 *   `isActive` function, and that function returns a truthy value.
 */
function visionServiceIsActive(runtime) {
  const visionService = runtime.getService("VISION");
  // A registered service without an isActive method counts as inactive
  // rather than raising a TypeError in the caller.
  if (typeof visionService?.isActive !== "function") {
    return false;
  }
  return Boolean(visionService.isActive());
}
257
// Shorthand and legacy spellings mapped to their canonical VISION op. A Map
// is used so inherited Object.prototype keys such as "constructor" or
// "__proto__" can never be mistaken for an alias.
const VISION_OP_ALIASES = new Map([
  ["describe_scene", "describe"],
  ["scene", "describe"],
  ["capture_image", "capture"],
  ["image", "capture"],
  ["photo", "capture"],
  ["snapshot", "capture"],
  ["screenshot", "capture"],
  ["set_vision_mode", "set_mode"],
  ["mode", "set_mode"],
  ["vision_mode", "set_mode"],
  ["name", "name_entity"],
  ["identify", "identify_person"],
  ["recognize", "identify_person"],
  ["track", "track_entity"],
  ["follow", "track_entity"]
]);
/**
 * Convert a caller-supplied op name into a canonical VISION op.
 *
 * The value is trimmed, lowercased, and runs of whitespace or hyphens are
 * replaced by a single underscore before aliases and VISION_OPS are consulted.
 *
 * @param {*} value - Raw op name.
 * @returns {string|null} The canonical op, or `null` when `value` is not a
 *   string, is blank, or names no known op.
 */
function normalizeOp(value) {
  if (typeof value !== "string") {
    return null;
  }
  const normalized = value.trim().toLowerCase().replace(/[\s-]+/g, "_");
  if (!normalized) {
    return null;
  }
  const alias = VISION_OP_ALIASES.get(normalized);
  if (alias !== undefined) {
    return alias;
  }
  return VISION_OPS.includes(normalized) ? normalized : null;
}
284
/**
 * Guess which VISION op a free-text message is asking for.
 *
 * Rules are tried in order and the first match wins. The first four rules
 * need both a keyword hit (case-insensitive substring) and an intent word
 * matched by the rule's regex; "capture" and "describe" need only a keyword
 * hit and act as fallbacks.
 *
 * @param {*} text - Message text.
 * @returns {string|null} One of the VISION ops, or `null` when `text` is not
 *   a string or nothing matches.
 */
function inferOpFromMessage(text) {
  // Messages without text content must not crash the caller.
  if (typeof text !== "string") {
    return null;
  }
  const lower = text.toLowerCase();
  const mentionsAny = (keywords) => keywords.some((k) => lower.includes(k.toLowerCase()));
  // Order matters: specific entity ops come before the broad capture and
  // describe fallbacks, whose keyword lists overlap with them.
  const rules = [
    { op: "name_entity", keywords: NAME_ENTITY_KEYWORDS, intent: /\b(call|name|named)\b/ },
    { op: "identify_person", keywords: IDENTIFY_PERSON_KEYWORDS, intent: /\b(who|identify|recognize)\b/ },
    { op: "track_entity", keywords: TRACK_ENTITY_KEYWORDS, intent: /\b(track|follow|watch|keep an eye)\b/ },
    { op: "set_mode", keywords: SET_MODE_KEYWORDS, intent: /\b(mode|enable|disable|turn off|turn on)\b/ },
    { op: "capture", keywords: CAPTURE_KEYWORDS, intent: null },
    { op: "describe", keywords: DESCRIBE_KEYWORDS, intent: null }
  ];
  for (const { op, keywords, intent } of rules) {
    if (mentionsAny(keywords) && (intent === null || intent.test(lower))) {
      return op;
    }
  }
  return null;
}
306
+ async function runDescribe(runtime, message, options, callback) {
307
+ const visionService = runtime.getService("VISION");
308
+ if (!visionService || !visionService.isActive()) {
309
+ const thought = "Vision service is not available or no camera is connected.";
310
+ const text = "I cannot see anything right now. No camera is available.";
311
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
312
+ if (callback) {
313
+ await callback({ thought, text, actions: ["VISION"] });
314
+ }
315
+ return {
316
+ success: false,
317
+ text: "Vision service unavailable - cannot analyze scene",
318
+ values: {
319
+ success: false,
320
+ visionAvailable: false,
321
+ error: "Vision service not available"
322
+ },
323
+ data: {
324
+ actionName: "VISION",
325
+ op: "describe",
326
+ error: "Vision service not available or no camera connected"
327
+ }
328
+ };
329
+ }
330
+ try {
331
+ const scene = await withVisionTimeout(visionService.getSceneDescription(), "vision scene description");
332
+ const cameraInfo = visionService.getCameraInfo();
333
+ if (!scene) {
334
+ const thought2 = "Camera is connected but no scene has been analyzed yet.";
335
+ const text2 = `Camera "${cameraInfo?.name}" is connected, but I haven't analyzed any scenes yet. Please wait a moment.`;
336
+ await saveExecutionRecord(runtime, message, thought2, text2, ["VISION"]);
87
337
  if (callback) {
88
- await callback({
89
- thought,
90
- text,
91
- actions: ["DESCRIBE_SCENE"]
92
- });
338
+ await callback({ thought: thought2, text: text2, actions: ["VISION"] });
93
339
  }
94
340
  return {
95
341
  success: false,
96
- text: "Vision service unavailable - cannot analyze scene",
342
+ text: "Camera connected but no scene analyzed yet",
97
343
  values: {
98
344
  success: false,
99
- visionAvailable: false,
100
- error: "Vision service not available"
345
+ visionAvailable: true,
346
+ sceneAnalyzed: false,
347
+ cameraName: cameraInfo?.name || undefined
101
348
  },
102
349
  data: {
103
- actionName: "DESCRIBE_SCENE",
104
- error: "Vision service not available or no camera connected"
350
+ actionName: "VISION",
351
+ op: "describe",
352
+ cameraInfo: cameraInfo ? {
353
+ id: cameraInfo.id,
354
+ name: cameraInfo.name,
355
+ connected: cameraInfo.connected
356
+ } : undefined,
357
+ sceneStatus: "not_analyzed"
105
358
  }
106
359
  };
107
360
  }
108
- try {
109
- const scene = await visionService.getSceneDescription();
110
- const cameraInfo = visionService.getCameraInfo();
111
- if (!scene) {
112
- const thought2 = "Camera is connected but no scene has been analyzed yet.";
113
- const text2 = `Camera "${cameraInfo?.name}" is connected, but I haven't analyzed any scenes yet. Please wait a moment.`;
114
- await saveExecutionRecord(runtime, message, thought2, text2, ["DESCRIBE_SCENE"]);
115
- if (callback) {
116
- await callback({
117
- thought: thought2,
118
- text: text2,
119
- actions: ["DESCRIBE_SCENE"]
120
- });
121
- }
122
- return {
123
- success: false,
124
- text: "Camera connected but no scene analyzed yet",
125
- values: {
126
- success: false,
127
- visionAvailable: true,
128
- sceneAnalyzed: false,
129
- cameraName: cameraInfo?.name || undefined
130
- },
131
- data: {
132
- actionName: "DESCRIBE_SCENE",
133
- cameraInfo: cameraInfo ? {
134
- id: cameraInfo.id,
135
- name: cameraInfo.name,
136
- connected: cameraInfo.connected
137
- } : undefined,
138
- sceneStatus: "not_analyzed"
139
- }
140
- };
141
- }
142
- const peopleCount = scene.people.length;
143
- const objectCount = scene.objects.length;
144
- const timestamp = new Date(scene.timestamp).toLocaleString();
145
- let description = `Looking through ${cameraInfo?.name || "the camera"}, `;
146
- description += scene.description;
147
- if (peopleCount > 0) {
148
- description += `
361
+ const peopleCount = scene.people.length;
362
+ const objectCount = scene.objects.length;
363
+ const people = scene.people.slice(0, MAX_VISION_ENTITIES);
364
+ const objects = scene.objects.slice(0, MAX_VISION_ENTITIES);
365
+ const timestamp = new Date(scene.timestamp).toLocaleString();
366
+ const detailLevel = options.detailLevel === "summary" ? "summary" : "detailed";
367
+ let description = `Looking through ${cameraInfo?.name || "the camera"}, `;
368
+ description += scene.description;
369
+ if (detailLevel === "detailed" && peopleCount > 0) {
370
+ description += `
149
371
 
150
372
  I can see ${peopleCount} ${peopleCount === 1 ? "person" : "people"}`;
151
- const facingData = scene.people.reduce((acc, person) => {
152
- if (person.facing && person.facing !== "unknown") {
153
- acc[person.facing] = (acc[person.facing] || 0) + 1;
154
- }
155
- return acc;
156
- }, {});
157
- if (Object.keys(facingData).length > 0) {
158
- const facingDescriptions = Object.entries(facingData).map(([direction, count]) => `${count} facing ${direction}`);
159
- description += ` (${facingDescriptions.join(", ")})`;
160
- }
161
- description += ".";
162
- }
163
- if (objectCount > 0) {
164
- const objectTypes = scene.objects.reduce((acc, obj) => {
165
- acc[obj.type] = (acc[obj.type] || 0) + 1;
166
- return acc;
167
- }, {});
168
- const objectDescriptions = Object.entries(objectTypes).map(([type, count]) => `${count} ${type}${count > 1 ? "s" : ""}`);
169
- description += `
373
+ const facingData = people.reduce((acc, person) => {
374
+ if (person.facing && person.facing !== "unknown") {
375
+ acc[person.facing] = (acc[person.facing] || 0) + 1;
376
+ }
377
+ return acc;
378
+ }, {});
379
+ if (Object.keys(facingData).length > 0) {
380
+ const facingDescriptions = Object.entries(facingData).map(([direction, count]) => `${count} facing ${direction}`);
381
+ description += ` (${facingDescriptions.join(", ")})`;
382
+ }
383
+ description += ".";
384
+ }
385
+ if (detailLevel === "detailed" && objectCount > 0) {
386
+ const objectTypes = objects.reduce((acc, obj) => {
387
+ acc[obj.type] = (acc[obj.type] || 0) + 1;
388
+ return acc;
389
+ }, {});
390
+ const objectDescriptions = Object.entries(objectTypes).map(([type, count]) => `${count} ${type}${count > 1 ? "s" : ""}`);
391
+ description += `
170
392
 
171
393
  Objects detected: ${objectDescriptions.join(", ")}.`;
172
- }
173
- if (scene.sceneChanged && scene.changePercentage) {
174
- description += `
394
+ }
395
+ if (detailLevel === "detailed" && scene.sceneChanged && scene.changePercentage) {
396
+ description += `
175
397
 
176
398
  (Scene changed by ${scene.changePercentage.toFixed(1)}% since last analysis)`;
399
+ }
400
+ const thought = `Analyzed the visual scene at ${timestamp}.`;
401
+ const text = description.slice(0, MAX_VISION_TEXT_LENGTH);
402
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
403
+ if (callback) {
404
+ await callback({ thought, text, actions: ["VISION"] });
405
+ }
406
+ return {
407
+ success: true,
408
+ text,
409
+ values: {
410
+ success: true,
411
+ visionAvailable: true,
412
+ sceneAnalyzed: true,
413
+ peopleCount,
414
+ objectCount,
415
+ cameraName: cameraInfo?.name || undefined,
416
+ sceneChanged: scene.sceneChanged,
417
+ changePercentage: scene.changePercentage,
418
+ detailLevel
419
+ },
420
+ data: {
421
+ actionName: "VISION",
422
+ op: "describe",
423
+ sceneTimestamp: scene.timestamp,
424
+ sceneDescription: scene.description.slice(0, MAX_VISION_TEXT_LENGTH),
425
+ sceneChanged: scene.sceneChanged,
426
+ changePercentage: scene.changePercentage,
427
+ audioTranscription: scene.audioTranscription || undefined,
428
+ objectCount: objects.length,
429
+ peopleCount: people.length,
430
+ cameraInfo: cameraInfo ? {
431
+ id: cameraInfo.id,
432
+ name: cameraInfo.name,
433
+ connected: cameraInfo.connected
434
+ } : undefined,
435
+ timestamp,
436
+ description: text
437
+ }
438
+ };
439
+ } catch (error) {
440
+ logger.error("[VISION/describe] Error analyzing scene:", error instanceof Error ? error.message : String(error));
441
+ const thought = "An error occurred while trying to analyze the visual scene.";
442
+ const text = `Error analyzing scene: ${error instanceof Error ? error.message : String(error)}`;
443
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
444
+ if (callback) {
445
+ await callback({ thought, text, actions: ["VISION"] });
446
+ }
447
+ const errorMessage = error instanceof Error ? error.message : String(error);
448
+ return {
449
+ success: false,
450
+ text: "Error analyzing scene",
451
+ values: {
452
+ success: false,
453
+ visionAvailable: true,
454
+ error: true,
455
+ errorMessage
456
+ },
457
+ data: {
458
+ actionName: "VISION",
459
+ op: "describe",
460
+ error: errorMessage,
461
+ errorType: "analysis_error"
462
+ }
463
+ };
464
+ }
465
+ }
466
+ async function runCapture(runtime, message, callback) {
467
+ const visionService = runtime.getService("VISION");
468
+ if (!visionService || !visionService.isActive()) {
469
+ const thought = "Vision service is not available or no camera is connected.";
470
+ const text = "I cannot capture an image right now. No camera is available.";
471
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
472
+ if (callback) {
473
+ await callback({ thought, text, actions: ["VISION"] });
474
+ }
475
+ return {
476
+ success: false,
477
+ text: "Vision service unavailable - cannot capture image",
478
+ values: {
479
+ success: false,
480
+ visionAvailable: false,
481
+ error: "Vision service not available"
482
+ },
483
+ data: {
484
+ actionName: "VISION",
485
+ op: "capture",
486
+ error: "Vision service not available or no camera connected"
177
487
  }
178
- const thought = `Analyzed the visual scene at ${timestamp}.`;
179
- const text = description;
180
- await saveExecutionRecord(runtime, message, thought, text, ["DESCRIBE_SCENE"]);
488
+ };
489
+ }
490
+ try {
491
+ const imageBuffer = await Promise.race([
492
+ visionService.captureImage(),
493
+ new Promise((_, reject) => setTimeout(() => reject(new Error("vision capture timed out")), VISION_ACTION_TIMEOUT_MS))
494
+ ]);
495
+ const cameraInfo = visionService.getCameraInfo();
496
+ if (!imageBuffer) {
497
+ const thought2 = "Failed to capture image from camera.";
498
+ const text2 = "I could not capture an image from the camera. Please try again.";
499
+ await saveExecutionRecord(runtime, message, thought2, text2, ["VISION"]);
181
500
  if (callback) {
182
- await callback({
183
- thought,
184
- text,
185
- actions: ["DESCRIBE_SCENE"]
186
- });
501
+ await callback({ thought: thought2, text: text2, actions: ["VISION"] });
187
502
  }
188
503
  return {
189
- success: true,
190
- text: description,
504
+ success: false,
505
+ text: "Failed to capture image from camera",
191
506
  values: {
192
- success: true,
507
+ success: false,
193
508
  visionAvailable: true,
194
- sceneAnalyzed: true,
195
- peopleCount,
196
- objectCount,
197
- cameraName: cameraInfo?.name || undefined,
198
- sceneChanged: scene.sceneChanged,
199
- changePercentage: scene.changePercentage
509
+ captureSuccess: false
200
510
  },
201
511
  data: {
202
- actionName: "DESCRIBE_SCENE",
203
- sceneTimestamp: scene.timestamp,
204
- sceneDescription: scene.description,
205
- sceneChanged: scene.sceneChanged,
206
- changePercentage: scene.changePercentage,
207
- audioTranscription: scene.audioTranscription || undefined,
208
- objectCount: scene.objects.length,
209
- peopleCount: scene.people.length,
512
+ actionName: "VISION",
513
+ op: "capture",
514
+ error: "Camera capture failed",
210
515
  cameraInfo: cameraInfo ? {
211
516
  id: cameraInfo.id,
212
517
  name: cameraInfo.name,
213
518
  connected: cameraInfo.connected
214
- } : undefined,
215
- timestamp,
216
- description
519
+ } : undefined
217
520
  }
218
521
  };
219
- } catch (error) {
220
- logger.error("[describeSceneAction] Error analyzing scene:", error instanceof Error ? error.message : String(error));
221
- const thought = "An error occurred while trying to analyze the visual scene.";
222
- const text = `Error analyzing scene: ${error instanceof Error ? error.message : String(error)}`;
223
- await saveExecutionRecord(runtime, message, thought, text, ["DESCRIBE_SCENE"]);
522
+ }
523
+ const attachmentId = createUniqueUuid(runtime, `capture-${Date.now()}`);
524
+ const timestamp = new Date().toISOString();
525
+ const imageAttachment = {
526
+ id: attachmentId,
527
+ title: `Camera Capture - ${timestamp}`,
528
+ contentType: ContentType.IMAGE,
529
+ source: `camera:${cameraInfo?.name || "unknown"}`,
530
+ url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`
531
+ };
532
+ const thought = `Captured an image from camera "${cameraInfo?.name}".`;
533
+ const text = `I've captured an image from the camera at ${timestamp}.`;
534
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"], [imageAttachment]);
535
+ if (callback) {
536
+ await callback({
537
+ thought,
538
+ text,
539
+ actions: ["VISION"],
540
+ attachments: [imageAttachment]
541
+ });
542
+ }
543
+ return {
544
+ success: true,
545
+ text: `I've captured an image from the camera at ${timestamp}.`,
546
+ values: {
547
+ success: true,
548
+ visionAvailable: true,
549
+ captureSuccess: true,
550
+ cameraName: cameraInfo?.name || undefined,
551
+ timestamp
552
+ },
553
+ data: {
554
+ actionName: "VISION",
555
+ op: "capture",
556
+ imageAttachment: {
557
+ id: imageAttachment.id,
558
+ title: imageAttachment.title,
559
+ contentType: imageAttachment.contentType,
560
+ source: imageAttachment.source,
561
+ url: imageAttachment.url
562
+ },
563
+ cameraInfo: cameraInfo ? {
564
+ id: cameraInfo.id,
565
+ name: cameraInfo.name,
566
+ connected: cameraInfo.connected
567
+ } : undefined,
568
+ timestamp
569
+ }
570
+ };
571
+ } catch (error) {
572
+ logger.error("[VISION/capture] Error capturing image:", error);
573
+ const thought = "An error occurred while trying to capture an image.";
574
+ const errorMessage = error instanceof Error ? error.message : String(error);
575
+ const text = `Error capturing image: ${errorMessage}`;
576
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
577
+ if (callback) {
578
+ await callback({ thought, text, actions: ["VISION"] });
579
+ }
580
+ return {
581
+ success: false,
582
+ text: "Error capturing image",
583
+ values: {
584
+ success: false,
585
+ visionAvailable: true,
586
+ error: true,
587
+ errorMessage
588
+ },
589
+ data: {
590
+ actionName: "VISION",
591
+ op: "capture",
592
+ error: errorMessage,
593
+ errorType: "capture_error"
594
+ }
595
+ };
596
+ }
597
+ }
598
+ async function runSetMode(runtime, message, options, callback) {
599
+ const visionService = runtime.getService("VISION");
600
+ if (!visionService) {
601
+ const thought = "Vision service is not available.";
602
+ const text = "I cannot change vision mode because the vision service is not available.";
603
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
604
+ if (callback) {
605
+ await callback({ thought, text, actions: ["VISION"] });
606
+ }
607
+ return {
608
+ success: false,
609
+ text,
610
+ data: { actionName: "VISION", op: "set_mode" }
611
+ };
612
+ }
613
+ try {
614
+ const explicitMode = typeof options.mode === "string" ? options.mode.toLowerCase() : "";
615
+ const messageText = explicitMode || message.content.text?.toLowerCase() || "";
616
+ let newMode = null;
617
+ if (messageText.includes("off") || messageText.includes("disable")) {
618
+ newMode = "OFF" /* OFF */;
619
+ } else if (messageText.includes("both")) {
620
+ newMode = "BOTH" /* BOTH */;
621
+ } else if (messageText.includes("screen")) {
622
+ newMode = "SCREEN" /* SCREEN */;
623
+ } else if (messageText.includes("camera")) {
624
+ newMode = "CAMERA" /* CAMERA */;
625
+ }
626
+ if (!newMode) {
627
+ const thought2 = "Could not determine the desired vision mode from the message.";
628
+ const text2 = "Please specify the vision mode: OFF, CAMERA, SCREEN, or BOTH.";
629
+ await saveExecutionRecord(runtime, message, thought2, text2, ["VISION"]);
224
630
  if (callback) {
225
- await callback({
226
- thought,
227
- text,
228
- actions: ["DESCRIBE_SCENE"]
229
- });
631
+ await callback({ thought: thought2, text: text2, actions: ["VISION"] });
230
632
  }
231
- const errorMessage = error instanceof Error ? error.message : String(error);
232
633
  return {
233
634
  success: false,
234
- text: "Error analyzing scene",
235
- values: {
236
- success: false,
237
- visionAvailable: true,
238
- error: true,
239
- errorMessage
240
- },
241
- data: {
242
- actionName: "DESCRIBE_SCENE",
243
- error: errorMessage,
244
- errorType: "analysis_error"
245
- }
635
+ text: text2,
636
+ data: { actionName: "VISION", op: "set_mode" }
246
637
  };
247
638
  }
248
- },
249
- examples: [
250
- [
251
- { name: "{{user}}", content: { text: "what do you see?" } },
252
- {
253
- name: "{{agent}}",
254
- content: {
255
- actions: ["DESCRIBE_SCENE"],
256
- thought: "The user wants to know what I can see through my camera.",
257
- text: "I see a room with a desk and computer setup. There are 2 people, one is sitting and one is standing."
258
- }
259
- }
260
- ],
261
- [
262
- { name: "{{user}}", content: { text: "describe the scene and then take a photo" } },
263
- {
264
- name: "{{agent}}",
265
- content: {
266
- actions: ["DESCRIBE_SCENE", "CAPTURE_IMAGE"],
267
- thought: "I should first analyze the scene, then capture an image for the user.",
268
- text: "I can see 3 people in an office setting. Let me capture this scene for you."
269
- }
270
- }
271
- ]
272
- ]
273
- };
274
- var captureImageAction = {
275
- name: "CAPTURE_IMAGE",
276
- similes: ["TAKE_PHOTO", "SCREENSHOT", "CAPTURE_FRAME", "TAKE_PICTURE"],
277
- description: "Captures the current frame from the camera and saves it as an image attachment. Returns image data with camera info and timestamp for action chaining. Can be combined with DESCRIBE_SCENE for analysis or NAME_ENTITY for identification workflows.",
278
- validate: async (runtime, message, state, options) => {
279
- const __avTextRaw = typeof message?.content?.text === "string" ? message.content.text : "";
280
- const __avText = __avTextRaw.toLowerCase();
281
- const __avVisionService = runtime?.getService?.("VISION");
282
- const __avLegacyContextOk = Boolean(__avVisionService && typeof __avVisionService.isActive === "function" && __avVisionService.isActive());
283
- const __avKeywords = ["capture", "image"];
284
- const __avKeywordOk = __avKeywords.length > 0 && (__avKeywords.some((kw) => kw.length > 0 && __avText.includes(kw)) || __avLegacyContextOk);
285
- const __avRegex = new RegExp("\\b(?:capture|image)\\b", "i");
286
- const __avRegexOk = __avRegex.test(__avText) || __avLegacyContextOk;
287
- const __avSource = String(message?.content?.source ?? message?.source ?? "");
288
- const __avExpectedSource = "";
289
- const __avSourceOk = __avExpectedSource ? __avSource === __avExpectedSource : Boolean(__avSource || state || runtime?.agentId || runtime?.getService);
290
- const __avOptions = options && typeof options === "object" ? options : {};
291
- const __avInputOk = __avText.trim().length > 0 || Object.keys(__avOptions).length > 0 || Boolean(message?.content && typeof message.content === "object");
292
- if (!(__avKeywordOk && __avRegexOk && __avSourceOk && __avInputOk)) {
293
- return false;
294
- }
295
- const __avLegacyValidate = async (runtime2, _message, _state) => {
296
- const visionService = runtime2.getService("VISION");
297
- return !!visionService && visionService.isActive();
639
+ const currentMode = visionService.getVisionMode();
640
+ await Promise.race([
641
+ visionService.setVisionMode(newMode),
642
+ new Promise((_, reject) => setTimeout(() => reject(new Error("vision mode change timed out")), VISION_ACTION_TIMEOUT_MS))
643
+ ]);
644
+ const thought = `Changed vision mode from ${currentMode} to ${newMode}.`;
645
+ let text = "";
646
+ switch (newMode) {
647
+ case "OFF" /* OFF */:
648
+ text = "Vision has been disabled. I will no longer process visual input.";
649
+ break;
650
+ case "CAMERA" /* CAMERA */:
651
+ text = "Vision mode set to CAMERA only. I will process input from the camera.";
652
+ break;
653
+ case "SCREEN" /* SCREEN */:
654
+ text = "Vision mode set to SCREEN only. I will analyze what's on your screen.";
655
+ break;
656
+ case "BOTH" /* BOTH */:
657
+ text = "Vision mode set to BOTH. I will process input from both camera and screen.";
658
+ break;
659
+ }
660
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
661
+ if (callback) {
662
+ await callback({ thought, text, actions: ["VISION"] });
663
+ }
664
+ return {
665
+ success: true,
666
+ text,
667
+ values: { visionMode: newMode },
668
+ data: { actionName: "VISION", op: "set_mode", visionMode: newMode }
298
669
  };
299
- try {
300
- return Boolean(await __avLegacyValidate(runtime, message, state, options));
301
- } catch {
302
- return false;
670
+ } catch (error) {
671
+ logger.error("[VISION/set_mode] Error changing vision mode:", error);
672
+ const errorMessage = error instanceof Error ? error.message : String(error);
673
+ const thought = "An error occurred while trying to change the vision mode.";
674
+ const text = `Error changing vision mode: ${errorMessage}`;
675
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
676
+ if (callback) {
677
+ await callback({ thought, text, actions: ["VISION"] });
303
678
  }
304
- },
305
- handler: async (runtime, message, _state, _options, callback, _responses) => {
679
+ return {
680
+ success: false,
681
+ text,
682
+ error: errorMessage,
683
+ data: { actionName: "VISION", op: "set_mode" }
684
+ };
685
+ }
686
+ }
687
+ async function runNameEntity(runtime, message, options, callback) {
688
+ try {
306
689
  const visionService = runtime.getService("VISION");
307
- if (!visionService || !visionService.isActive()) {
308
- const thought = "Vision service is not available or no camera is connected.";
309
- const text = "I cannot capture an image right now. No camera is available.";
310
- await saveExecutionRecord(runtime, message, thought, text, ["CAPTURE_IMAGE"]);
690
+ if (!visionService) {
691
+ const thought = "Vision service is not available.";
692
+ const text = "I cannot name entities because the vision service is not available.";
693
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
311
694
  if (callback) {
312
- await callback({
313
- thought,
314
- text,
315
- actions: ["CAPTURE_IMAGE"]
316
- });
695
+ await callback({ thought, text, actions: ["VISION"] });
317
696
  }
318
697
  return {
319
698
  success: false,
320
- text: "Vision service unavailable - cannot capture image",
321
- values: {
322
- success: false,
323
- visionAvailable: false,
324
- error: "Vision service not available"
325
- },
326
- data: {
327
- actionName: "CAPTURE_IMAGE",
328
- error: "Vision service not available or no camera connected"
329
- }
699
+ text,
700
+ data: { actionName: "VISION", op: "name_entity" }
330
701
  };
331
702
  }
332
- try {
333
- const imageBuffer = await visionService.captureImage();
334
- const cameraInfo = visionService.getCameraInfo();
335
- if (!imageBuffer) {
336
- const thought2 = "Failed to capture image from camera.";
337
- const text2 = "I could not capture an image from the camera. Please try again.";
338
- await saveExecutionRecord(runtime, message, thought2, text2, ["CAPTURE_IMAGE"]);
339
- if (callback) {
340
- await callback({
341
- thought: thought2,
342
- text: text2,
343
- actions: ["CAPTURE_IMAGE"]
344
- });
345
- }
346
- return {
347
- success: false,
348
- text: "Failed to capture image from camera",
349
- values: {
350
- success: false,
351
- visionAvailable: true,
352
- captureSuccess: false
353
- },
354
- data: {
355
- actionName: "CAPTURE_IMAGE",
356
- error: "Camera capture failed",
357
- cameraInfo: cameraInfo ? {
358
- id: cameraInfo.id,
359
- name: cameraInfo.name,
360
- connected: cameraInfo.connected
361
- } : undefined
362
- }
363
- };
364
- }
365
- const attachmentId = createUniqueUuid(runtime, `capture-${Date.now()}`);
366
- const timestamp = new Date().toISOString();
367
- const imageAttachment = {
368
- id: attachmentId,
369
- title: `Camera Capture - ${timestamp}`,
370
- contentType: ContentType.IMAGE,
371
- source: `camera:${cameraInfo?.name || "unknown"}`,
372
- url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`
373
- };
374
- const thought = `Captured an image from camera "${cameraInfo?.name}".`;
375
- const text = `I've captured an image from the camera at ${timestamp}.`;
376
- await saveExecutionRecord(runtime, message, thought, text, ["CAPTURE_IMAGE"], [imageAttachment]);
703
+ const scene = await withVisionTimeout(visionService.getSceneDescription(), "vision scene description");
704
+ if (!scene || scene.people.length === 0) {
705
+ const thought = "No people visible to name.";
706
+ const text = "I don't see any people in the current scene to name.";
707
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
377
708
  if (callback) {
378
- await callback({
379
- thought,
380
- text,
381
- actions: ["CAPTURE_IMAGE"],
382
- attachments: [imageAttachment]
383
- });
709
+ await callback({ thought, text, actions: ["VISION"] });
384
710
  }
385
711
  return {
386
- success: true,
387
- text: `I've captured an image from the camera at ${timestamp}.`,
388
- values: {
389
- success: true,
390
- visionAvailable: true,
391
- captureSuccess: true,
392
- cameraName: cameraInfo?.name || undefined,
393
- timestamp
394
- },
395
- data: {
396
- actionName: "CAPTURE_IMAGE",
397
- imageAttachment: {
398
- id: imageAttachment.id,
399
- title: imageAttachment.title,
400
- contentType: imageAttachment.contentType,
401
- source: imageAttachment.source,
402
- url: imageAttachment.url
403
- },
404
- cameraInfo: cameraInfo ? {
405
- id: cameraInfo.id,
406
- name: cameraInfo.name,
407
- connected: cameraInfo.connected
408
- } : undefined,
409
- timestamp
410
- }
712
+ success: false,
713
+ text,
714
+ data: { actionName: "VISION", op: "name_entity" }
411
715
  };
412
- } catch (error) {
413
- logger.error("[captureImageAction] Error capturing image:", error);
414
- const thought = "An error occurred while trying to capture an image.";
415
- const errorMessage = error instanceof Error ? error.message : String(error);
416
- const text = `Error capturing image: ${errorMessage}`;
417
- await saveExecutionRecord(runtime, message, thought, text, ["CAPTURE_IMAGE"]);
716
+ }
717
+ const messageText = message.content.text?.toLowerCase() || "";
718
+ const explicitName = typeof options.name === "string" ? options.name.trim() : "";
719
+ const nameMatch = explicitName ? [explicitName, explicitName] : messageText.match(/(?:named?|call(?:ed)?|is)\s+(\w+)/i);
720
+ if (!nameMatch) {
721
+ const thought = "Could not extract name from message.";
722
+ const text = `I couldn't understand what name to assign. Please say something like "The person is named Alice".`;
723
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
418
724
  if (callback) {
419
- await callback({
420
- thought,
421
- text,
422
- actions: ["CAPTURE_IMAGE"]
423
- });
725
+ await callback({ thought, text, actions: ["VISION"] });
424
726
  }
425
727
  return {
426
728
  success: false,
427
- text: "Error capturing image",
428
- values: {
429
- success: false,
430
- visionAvailable: true,
431
- error: true,
432
- errorMessage
433
- },
434
- data: {
435
- actionName: "CAPTURE_IMAGE",
436
- error: errorMessage,
437
- errorType: "capture_error"
438
- }
729
+ text,
730
+ data: { actionName: "VISION", op: "name_entity" }
439
731
  };
440
732
  }
441
- },
442
- examples: [
443
- [
444
- { name: "{{user}}", content: { text: "describe what you see and take a photo" } },
445
- {
446
- name: "{{agent}}",
447
- content: {
448
- actions: ["DESCRIBE_SCENE", "CAPTURE_IMAGE"],
449
- thought: "User wants scene analysis followed by image capture.",
450
- text: "I can see 3 people in an office setting. Let me capture this scene for you."
451
- }
452
- }
453
- ],
454
- [
455
- { name: "{{user}}", content: { text: "take a photo" } },
456
- {
457
- name: "{{agent}}",
458
- content: {
459
- actions: ["CAPTURE_IMAGE"],
460
- thought: "The user wants me to capture an image from the camera.",
461
- text: "I've captured an image from the camera."
462
- }
463
- }
464
- ],
465
- [
466
- { name: "{{user}}", content: { text: "capture the current scene" } },
467
- {
468
- name: "{{agent}}",
469
- content: {
470
- actions: ["CAPTURE_IMAGE"]
471
- }
472
- }
473
- ]
474
- ]
475
- };
476
- var setVisionModeAction = {
477
- name: "SET_VISION_MODE",
478
- description: "Set the vision mode to OFF, CAMERA, SCREEN, or BOTH",
479
- similes: [
480
- "change vision to {mode}",
481
- "set vision mode {mode}",
482
- "switch to {mode} vision",
483
- "turn vision {mode}",
484
- "use {mode} vision",
485
- "enable {mode} vision",
486
- "disable vision"
487
- ],
488
- validate: async (runtime, message, state, options) => {
489
- const __avTextRaw = typeof message?.content?.text === "string" ? message.content.text : "";
490
- const __avText = __avTextRaw.toLowerCase();
491
- const __avLegacyContextOk = Boolean(runtime?.getService?.("VISION"));
492
- const __avKeywords = ["set", "vision", "mode"];
493
- const __avKeywordOk = __avKeywords.length > 0 && (__avKeywords.some((kw) => kw.length > 0 && __avText.includes(kw)) || __avLegacyContextOk);
494
- const __avRegex = new RegExp("\\b(?:set|vision|mode)\\b", "i");
495
- const __avRegexOk = __avRegex.test(__avText) || __avLegacyContextOk;
496
- const __avSource = String(message?.content?.source ?? message?.source ?? "");
497
- const __avExpectedSource = "";
498
- const __avSourceOk = __avExpectedSource ? __avSource === __avExpectedSource : Boolean(__avSource || state || runtime?.agentId || runtime?.getService);
499
- const __avOptions = options && typeof options === "object" ? options : {};
500
- const __avInputOk = __avText.trim().length > 0 || Object.keys(__avOptions).length > 0 || Boolean(message?.content && typeof message.content === "object");
501
- if (!(__avKeywordOk && __avRegexOk && __avSourceOk && __avInputOk)) {
502
- return false;
503
- }
504
- const __avLegacyValidate = async (runtime2, _message, _state) => {
505
- const visionService = runtime2.getService("VISION");
506
- return visionService !== null;
507
- };
508
- try {
509
- return Boolean(await __avLegacyValidate(runtime, message, state, options));
510
- } catch {
511
- return false;
512
- }
513
- },
514
- handler: async (runtime, message, _state, _options, callback, _responses) => {
515
- const visionService = runtime.getService("VISION");
516
- if (!visionService) {
517
- const thought = "Vision service is not available.";
518
- const text = "I cannot change vision mode because the vision service is not available.";
519
- await saveExecutionRecord(runtime, message, thought, text, ["SET_VISION_MODE"]);
733
+ const name = nameMatch[1];
734
+ const entityTracker = visionService.getEntityTracker();
735
+ await entityTracker.updateEntities(scene.objects.slice(0, MAX_VISION_ENTITIES), scene.people.slice(0, MAX_VISION_ENTITIES), undefined, runtime);
736
+ const activeEntities = entityTracker.getActiveEntities();
737
+ const people = activeEntities.filter((e) => e.entityType === "person");
738
+ if (people.length === 0) {
739
+ const thought = "No tracked people found.";
740
+ const text = "I can see someone but haven't established tracking yet. Please try again in a moment.";
741
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
520
742
  if (callback) {
521
- await callback({
522
- thought,
523
- text,
524
- actions: ["SET_VISION_MODE"]
525
- });
743
+ await callback({ thought, text, actions: ["VISION"] });
526
744
  }
527
745
  return {
528
746
  success: false,
529
- text
747
+ text,
748
+ data: { actionName: "VISION", op: "name_entity" }
530
749
  };
531
750
  }
532
- try {
533
- const messageText = message.content.text?.toLowerCase() || "";
534
- let newMode = null;
535
- if (messageText.includes("off") || messageText.includes("disable")) {
536
- newMode = "OFF" /* OFF */;
537
- } else if (messageText.includes("both")) {
538
- newMode = "BOTH" /* BOTH */;
539
- } else if (messageText.includes("screen")) {
540
- newMode = "SCREEN" /* SCREEN */;
541
- } else if (messageText.includes("camera")) {
542
- newMode = "CAMERA" /* CAMERA */;
543
- }
544
- if (!newMode) {
545
- const thought2 = "Could not determine the desired vision mode from the message.";
546
- const text2 = "Please specify the vision mode: OFF, CAMERA, SCREEN, or BOTH.";
547
- await saveExecutionRecord(runtime, message, thought2, text2, ["SET_VISION_MODE"]);
548
- if (callback) {
549
- await callback({
550
- thought: thought2,
551
- text: text2,
552
- actions: ["SET_VISION_MODE"]
553
- });
554
- }
555
- return {
556
- success: false,
557
- text: text2
558
- };
559
- }
560
- const currentMode = visionService.getVisionMode();
561
- await visionService.setVisionMode(newMode);
562
- const thought = `Changed vision mode from ${currentMode} to ${newMode}.`;
563
- let text = "";
564
- switch (newMode) {
565
- case "OFF" /* OFF */:
566
- text = "Vision has been disabled. I will no longer process visual input.";
567
- break;
568
- case "CAMERA" /* CAMERA */:
569
- text = "Vision mode set to CAMERA only. I will process input from the camera.";
570
- break;
571
- case "SCREEN" /* SCREEN */:
572
- text = "Vision mode set to SCREEN only. I will analyze what's on your screen.";
573
- break;
574
- case "BOTH" /* BOTH */:
575
- text = "Vision mode set to BOTH. I will process input from both camera and screen.";
576
- break;
577
- }
578
- await saveExecutionRecord(runtime, message, thought, text, ["SET_VISION_MODE"]);
751
+ let targetPerson = people[0];
752
+ if (people.length > 1) {
753
+ targetPerson = people.reduce((prev, curr) => {
754
+ const prevArea = prev.lastPosition.width * prev.lastPosition.height;
755
+ const currArea = curr.lastPosition.width * curr.lastPosition.height;
756
+ return currArea > prevArea ? curr : prev;
757
+ });
758
+ }
759
+ const success = entityTracker.assignNameToEntity(targetPerson.id, name);
760
+ if (success) {
761
+ const thought = `Named entity "${name}" and associated with person in scene.`;
762
+ const text = `I've identified the person as ${name}. I'll remember them for future interactions.`;
763
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"], undefined);
579
764
  if (callback) {
580
765
  await callback({
581
766
  thought,
582
767
  text,
583
- actions: ["SET_VISION_MODE"]
768
+ actions: ["VISION"],
769
+ data: { entityId: targetPerson.id, name }
584
770
  });
585
771
  }
772
+ logger.info(`[VISION/name_entity] Assigned name "${name}" to entity ${targetPerson.id}`);
586
773
  return {
587
774
  success: true,
588
775
  text,
589
- values: {
590
- visionMode: newMode
776
+ values: { entityId: targetPerson.id, name },
777
+ data: {
778
+ actionName: "VISION",
779
+ op: "name_entity",
780
+ entityId: targetPerson.id,
781
+ name
591
782
  }
592
783
  };
593
- } catch (error) {
594
- logger.error("[setVisionModeAction] Error changing vision mode:", error);
595
- const thought = "An error occurred while trying to change the vision mode.";
596
- const errorMessage = error instanceof Error ? error.message : String(error);
597
- const text = `Error changing vision mode: ${errorMessage}`;
598
- await saveExecutionRecord(runtime, message, thought, text, ["SET_VISION_MODE"]);
784
+ } else {
785
+ const thought = "Failed to assign name to entity.";
786
+ const text = "There was an error assigning the name. Please try again.";
787
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
599
788
  if (callback) {
600
- await callback({
601
- thought,
602
- text,
603
- actions: ["SET_VISION_MODE"]
604
- });
789
+ await callback({ thought, text, actions: ["VISION"] });
605
790
  }
606
791
  return {
607
792
  success: false,
608
793
  text,
609
- error: errorMessage
794
+ data: { actionName: "VISION", op: "name_entity" }
610
795
  };
611
796
  }
612
- },
613
- examples: [
614
- [
615
- { name: "user", content: { text: "set vision mode to screen" } },
616
- {
617
- name: "agent",
618
- content: {
619
- actions: ["SET_VISION_MODE"],
620
- thought: "The user wants to switch to screen vision mode.",
621
- text: "Vision mode set to SCREEN only. I will analyze what's on your screen."
622
- }
623
- }
624
- ],
625
- [
626
- { name: "user", content: { text: "enable both camera and screen vision" } },
627
- {
628
- name: "agent",
629
- content: {
630
- actions: ["SET_VISION_MODE"],
631
- thought: "The user wants to enable both vision inputs.",
632
- text: "Vision mode set to BOTH. I will process input from both camera and screen."
633
- }
634
- }
635
- ],
636
- [
637
- { name: "user", content: { text: "turn off vision" } },
638
- {
639
- name: "agent",
640
- content: {
641
- actions: ["SET_VISION_MODE"],
642
- thought: "The user wants to disable vision.",
643
- text: "Vision has been disabled. I will no longer process visual input."
644
- }
645
- }
646
- ]
647
- ]
648
- };
649
- var nameEntityAction = {
650
- name: "NAME_ENTITY",
651
- description: "Assign a name to a person or object currently visible in the camera view",
652
- similes: [
653
- "call the person {name}",
654
- "the person in front is {name}",
655
- "name the person {name}",
656
- "that person is {name}",
657
- "the object is a {name}",
658
- "call that {name}"
659
- ],
660
- examples: [
661
- [
662
- {
663
- name: "user",
664
- content: {
665
- text: "The person wearing the blue shirt is named Alice"
666
- }
667
- },
668
- {
669
- name: "agent",
670
- content: {
671
- text: "I've identified the person in the blue shirt as Alice. I'll remember them for future interactions.",
672
- actions: ["NAME_ENTITY"]
673
- }
674
- }
675
- ],
676
- [
677
- {
678
- name: "user",
679
- content: {
680
- text: "Call the person on the left Bob"
681
- }
682
- },
683
- {
684
- name: "agent",
685
- content: {
686
- text: "I've named the person on the left as Bob. Their face profile has been updated.",
687
- actions: ["NAME_ENTITY"]
688
- }
689
- }
690
- ]
691
- ],
692
- validate: async (runtime, message, state, options) => {
693
- const __avTextRaw = typeof message?.content?.text === "string" ? message.content.text : "";
694
- const __avText = __avTextRaw.toLowerCase();
695
- const __avVisionService = runtime?.getService?.("VISION");
696
- const __avLegacyContextOk = Boolean(__avVisionService && typeof __avVisionService.isActive === "function" && __avVisionService.isActive());
697
- const __avKeywords = ["name", "entity"];
698
- const __avKeywordOk = __avKeywords.length > 0 && (__avKeywords.some((kw) => kw.length > 0 && __avText.includes(kw)) || __avLegacyContextOk);
699
- const __avRegex = new RegExp("\\b(?:name|entity)\\b", "i");
700
- const __avRegexOk = __avRegex.test(__avText) || __avLegacyContextOk;
701
- const __avSource = String(message?.content?.source ?? message?.source ?? "");
702
- const __avExpectedSource = "";
703
- const __avSourceOk = __avExpectedSource ? __avSource === __avExpectedSource : Boolean(__avSource || state || runtime?.agentId || runtime?.getService);
704
- const __avOptions = options && typeof options === "object" ? options : {};
705
- const __avInputOk = __avText.trim().length > 0 || Object.keys(__avOptions).length > 0 || Boolean(message?.content && typeof message.content === "object");
706
- if (!(__avKeywordOk && __avRegexOk && __avSourceOk && __avInputOk)) {
707
- return false;
708
- }
709
- const __avLegacyValidate = async (runtime2, _message, _state) => {
710
- const visionService = runtime2.getService("VISION");
711
- return visionService?.isActive() || false;
712
- };
713
- try {
714
- return Boolean(await __avLegacyValidate(runtime, message, state, options));
715
- } catch {
716
- return false;
797
+ } catch (error) {
798
+ logger.error("[VISION/name_entity] Error:", error);
799
+ const thought = "Failed to name entity.";
800
+ const text = `Sorry, I couldn't name the entity: ${error instanceof Error ? error.message : "Unknown error"}`;
801
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
802
+ if (callback) {
803
+ await callback({ thought, text, actions: ["VISION"] });
717
804
  }
718
- },
719
- handler: async (runtime, message, _state, _options, callback) => {
720
- try {
721
- const visionService = runtime.getService("VISION");
722
- if (!visionService) {
723
- const thought = "Vision service is not available.";
724
- const text2 = "I cannot name entities because the vision service is not available.";
725
- await saveExecutionRecord(runtime, message, thought, text2, ["NAME_ENTITY"]);
726
- if (callback) {
727
- await callback({ thought, text: text2, actions: ["NAME_ENTITY"] });
728
- }
729
- return {
730
- success: false,
731
- text: text2
732
- };
733
- }
734
- const scene = await visionService.getSceneDescription();
735
- if (!scene || scene.people.length === 0) {
736
- const thought = "No people visible to name.";
737
- const text2 = "I don't see any people in the current scene to name.";
738
- await saveExecutionRecord(runtime, message, thought, text2, ["NAME_ENTITY"]);
739
- if (callback) {
740
- await callback({ thought, text: text2, actions: ["NAME_ENTITY"] });
741
- }
742
- return {
743
- success: false,
744
- text: text2
745
- };
746
- }
747
- const text = message.content.text?.toLowerCase() || "";
748
- const nameMatch = text.match(/(?:named?|call(?:ed)?|is)\s+(\w+)/i);
749
- if (!nameMatch) {
750
- const thought = "Could not extract name from message.";
751
- const text2 = `I couldn't understand what name to assign. Please say something like "The person is named Alice".`;
752
- await saveExecutionRecord(runtime, message, thought, text2, ["NAME_ENTITY"]);
753
- if (callback) {
754
- await callback({ thought, text: text2, actions: ["NAME_ENTITY"] });
755
- }
756
- return {
757
- success: false,
758
- text: text2
759
- };
760
- }
761
- const name = nameMatch[1];
762
- const _worldId = message.worldId || "default-world";
763
- const entityTracker = visionService.getEntityTracker();
764
- await entityTracker.updateEntities(scene.objects, scene.people, undefined, runtime);
765
- const activeEntities = entityTracker.getActiveEntities();
766
- const people = activeEntities.filter((e) => e.entityType === "person");
767
- if (people.length === 0) {
768
- const thought = "No tracked people found.";
769
- const text2 = "I can see someone but haven't established tracking yet. Please try again in a moment.";
770
- await saveExecutionRecord(runtime, message, thought, text2, ["NAME_ENTITY"]);
771
- if (callback) {
772
- await callback({ thought, text: text2, actions: ["NAME_ENTITY"] });
773
- }
774
- return {
775
- success: false,
776
- text: text2
777
- };
778
- }
779
- let targetPerson = people[0];
780
- if (people.length > 1) {
781
- targetPerson = people.reduce((prev, curr) => {
782
- const prevArea = prev.lastPosition.width * prev.lastPosition.height;
783
- const currArea = curr.lastPosition.width * curr.lastPosition.height;
784
- return currArea > prevArea ? curr : prev;
785
- });
805
+ return {
806
+ success: false,
807
+ text,
808
+ error: error instanceof Error ? error.message : String(error),
809
+ data: { actionName: "VISION", op: "name_entity" }
810
+ };
811
+ }
812
+ }
813
+ async function runIdentifyPerson(runtime, message, callback) {
814
+ try {
815
+ const visionService = runtime.getService("VISION");
816
+ if (!visionService) {
817
+ const thought2 = "Vision service is not available.";
818
+ const text2 = "I cannot identify people because the vision service is not available.";
819
+ await saveExecutionRecord(runtime, message, thought2, text2, ["VISION"]);
820
+ if (callback) {
821
+ await callback({ thought: thought2, text: text2, actions: ["VISION"] });
786
822
  }
787
- const success = entityTracker.assignNameToEntity(targetPerson.id, name);
788
- if (success) {
789
- const thought = `Named entity "${name}" and associated with person in scene.`;
790
- const text2 = `I've identified the person as ${name}. I'll remember them for future interactions.`;
791
- await saveExecutionRecord(runtime, message, thought, text2, ["NAME_ENTITY"], undefined);
792
- if (callback) {
793
- await callback({
794
- thought,
795
- text: text2,
796
- actions: ["NAME_ENTITY"],
797
- data: { entityId: targetPerson.id, name }
798
- });
799
- }
800
- logger.info(`[NameEntityAction] Assigned name "${name}" to entity ${targetPerson.id}`);
801
- return {
802
- success: true,
803
- text: text2,
804
- values: {
805
- entityId: targetPerson.id,
806
- name
807
- }
808
- };
809
- } else {
810
- const thought = "Failed to assign name to entity.";
811
- const text2 = "There was an error assigning the name. Please try again.";
812
- await saveExecutionRecord(runtime, message, thought, text2, ["NAME_ENTITY"]);
813
- if (callback) {
814
- await callback({ thought, text: text2, actions: ["NAME_ENTITY"] });
815
- }
816
- return {
817
- success: false,
818
- text: text2
819
- };
823
+ return {
824
+ success: false,
825
+ text: text2,
826
+ data: { actionName: "VISION", op: "identify_person" }
827
+ };
828
+ }
829
+ const scene = await withVisionTimeout(visionService.getSceneDescription(), "vision scene description");
830
+ if (!scene || scene.people.length === 0) {
831
+ const thought2 = "No people visible to identify.";
832
+ const text2 = "I don't see any people in the current scene.";
833
+ await saveExecutionRecord(runtime, message, thought2, text2, ["VISION"]);
834
+ if (callback) {
835
+ await callback({ thought: thought2, text: text2, actions: ["VISION"] });
820
836
  }
821
- } catch (error) {
822
- logger.error("[NameEntityAction] Error:", error);
823
- const thought = "Failed to name entity.";
824
- const text = `Sorry, I couldn't name the entity: ${error instanceof Error ? error.message : "Unknown error"}`;
825
- await saveExecutionRecord(runtime, message, thought, text, ["NAME_ENTITY"]);
837
+ return {
838
+ success: false,
839
+ text: text2,
840
+ data: { actionName: "VISION", op: "identify_person" }
841
+ };
842
+ }
843
+ const entityTracker = visionService.getEntityTracker();
844
+ await entityTracker.updateEntities(scene.objects.slice(0, MAX_VISION_ENTITIES), scene.people.slice(0, MAX_VISION_ENTITIES), undefined, runtime);
845
+ const activeEntities = entityTracker.getActiveEntities();
846
+ const people = activeEntities.filter((e) => e.entityType === "person");
847
+ if (people.length === 0) {
848
+ const thought2 = "No tracked people found.";
849
+ const text2 = "I can see someone but I'm still processing their identity.";
850
+ await saveExecutionRecord(runtime, message, thought2, text2, ["VISION"]);
826
851
  if (callback) {
827
- await callback({ thought, text, actions: ["NAME_ENTITY"] });
852
+ await callback({ thought: thought2, text: text2, actions: ["VISION"] });
828
853
  }
829
854
  return {
830
855
  success: false,
831
- text,
832
- error: error instanceof Error ? error.message : String(error)
856
+ text: text2,
857
+ data: { actionName: "VISION", op: "identify_person" }
833
858
  };
834
859
  }
835
- }
836
- };
837
- var identifyPersonAction = {
838
- name: "IDENTIFY_PERSON",
839
- description: "Identify a person in view if they have been seen before",
840
- similes: [
841
- "who is that",
842
- "who is the person",
843
- "identify the person",
844
- "do you recognize them",
845
- "have you seen them before"
846
- ],
847
- examples: [
848
- [
849
- {
850
- name: "user",
851
- content: {
852
- text: "Who is the person in front of you?"
860
+ let recognizedCount = 0;
861
+ let unknownCount = 0;
862
+ const identifications = [];
863
+ for (const person of people) {
864
+ const name = person.attributes.name;
865
+ const duration = Date.now() - person.firstSeen;
866
+ const durationStr = duration < 60000 ? `${Math.round(duration / 1000)} seconds` : `${Math.round(duration / 60000)} minutes`;
867
+ if (name) {
868
+ recognizedCount++;
869
+ const personInfo = `I can see ${name}. They've been here for ${durationStr}.`;
870
+ identifications.push(personInfo);
871
+ if (person.appearances.length > 5) {
872
+ identifications.push("I've been tracking them consistently.");
853
873
  }
854
- },
855
- {
856
- name: "agent",
857
- content: {
858
- text: "That's Alice. I last saw her about 5 minutes ago. She's been here for the past 20 minutes.",
859
- actions: ["IDENTIFY_PERSON"]
874
+ } else {
875
+ unknownCount++;
876
+ const personInfo = `I see an unidentified person who has been here for ${durationStr}.`;
877
+ identifications.push(personInfo);
878
+ if (person.attributes.faceId) {
879
+ identifications.push("I've captured their face profile but they haven't been named yet.");
860
880
  }
861
881
  }
862
- ]
863
- ],
864
- validate: async (runtime, message, state, options) => {
865
- const __avTextRaw = typeof message?.content?.text === "string" ? message.content.text : "";
866
- const __avText = __avTextRaw.toLowerCase();
867
- const __avVisionService = runtime?.getService?.("VISION");
868
- const __avLegacyContextOk = Boolean(__avVisionService && typeof __avVisionService.isActive === "function" && __avVisionService.isActive());
869
- const __avKeywords = ["identify", "person"];
870
- const __avKeywordOk = __avKeywords.length > 0 && (__avKeywords.some((kw) => kw.length > 0 && __avText.includes(kw)) || __avLegacyContextOk);
871
- const __avRegex = new RegExp("\\b(?:identify|person)\\b", "i");
872
- const __avRegexOk = __avRegex.test(__avText) || __avLegacyContextOk;
873
- const __avSource = String(message?.content?.source ?? message?.source ?? "");
874
- const __avExpectedSource = "";
875
- const __avSourceOk = __avExpectedSource ? __avSource === __avExpectedSource : Boolean(__avSource || state || runtime?.agentId || runtime?.getService);
876
- const __avOptions = options && typeof options === "object" ? options : {};
877
- const __avInputOk = __avText.trim().length > 0 || Object.keys(__avOptions).length > 0 || Boolean(message?.content && typeof message.content === "object");
878
- if (!(__avKeywordOk && __avRegexOk && __avSourceOk && __avInputOk)) {
879
- return false;
880
- }
881
- const __avLegacyValidate = async (runtime2, _message, _state) => {
882
- const visionService = runtime2.getService("VISION");
883
- return visionService?.isActive() || false;
884
- };
885
- try {
886
- return Boolean(await __avLegacyValidate(runtime, message, state, options));
887
- } catch {
888
- return false;
889
882
  }
890
- },
891
- handler: async (runtime, message, _state, _options, callback) => {
892
- try {
893
- const visionService = runtime.getService("VISION");
894
- if (!visionService) {
895
- const thought2 = "Vision service is not available.";
896
- const text2 = "I cannot identify people because the vision service is not available.";
897
- await saveExecutionRecord(runtime, message, thought2, text2, ["IDENTIFY_PERSON"]);
898
- if (callback) {
899
- await callback({ thought: thought2, text: text2, actions: ["IDENTIFY_PERSON"] });
900
- }
901
- return {
902
- success: false,
903
- text: text2
904
- };
905
- }
906
- const scene = await visionService.getSceneDescription();
907
- if (!scene || scene.people.length === 0) {
908
- const thought2 = "No people visible to identify.";
909
- const text2 = "I don't see any people in the current scene.";
910
- await saveExecutionRecord(runtime, message, thought2, text2, ["IDENTIFY_PERSON"]);
911
- if (callback) {
912
- await callback({ thought: thought2, text: text2, actions: ["IDENTIFY_PERSON"] });
913
- }
914
- return {
915
- success: false,
916
- text: text2
917
- };
918
- }
919
- const _worldId = message.worldId || "default-world";
920
- const entityTracker = visionService.getEntityTracker();
921
- await entityTracker.updateEntities(scene.objects, scene.people, undefined, runtime);
922
- const activeEntities = entityTracker.getActiveEntities();
923
- const people = activeEntities.filter((e) => e.entityType === "person");
924
- if (people.length === 0) {
925
- const thought2 = "No tracked people found.";
926
- const text2 = "I can see someone but I'm still processing their identity.";
927
- await saveExecutionRecord(runtime, message, thought2, text2, ["IDENTIFY_PERSON"]);
928
- if (callback) {
929
- await callback({ thought: thought2, text: text2, actions: ["IDENTIFY_PERSON"] });
930
- }
931
- return {
932
- success: false,
933
- text: text2
934
- };
935
- }
936
- const _responseText = "";
937
- let recognizedCount = 0;
938
- let unknownCount = 0;
939
- const identifications = [];
940
- for (const person of people) {
941
- const name = person.attributes.name;
942
- const duration = Date.now() - person.firstSeen;
943
- const durationStr = duration < 60000 ? `${Math.round(duration / 1000)} seconds` : `${Math.round(duration / 60000)} minutes`;
944
- if (name) {
945
- recognizedCount++;
946
- const personInfo = `I can see ${name}. They've been here for ${durationStr}.`;
947
- identifications.push(personInfo);
948
- if (person.appearances.length > 5) {
949
- identifications.push("I've been tracking them consistently.");
950
- }
951
- } else {
952
- unknownCount++;
953
- const personInfo = `I see an unidentified person who has been here for ${durationStr}.`;
954
- identifications.push(personInfo);
955
- if (person.attributes.faceId) {
956
- identifications.push("I've captured their face profile but they haven't been named yet.");
957
- }
883
+ const recentlyLeft = entityTracker.getRecentlyLeft();
884
+ if (recentlyLeft.length > 0) {
885
+ identifications.push(`
886
+ Recently departed:`);
887
+ for (const { entity, leftAt } of recentlyLeft) {
888
+ if (entity.entityType === "person" && entity.attributes.name) {
889
+ const timeAgo = Date.now() - leftAt;
890
+ const timeStr = timeAgo < 60000 ? `${Math.round(timeAgo / 1000)} seconds ago` : `${Math.round(timeAgo / 60000)} minutes ago`;
891
+ identifications.push(`${entity.attributes.name} left ${timeStr}.`);
958
892
  }
959
893
  }
960
- const recentlyLeft = entityTracker.getRecentlyLeft();
961
- if (recentlyLeft.length > 0) {
962
- identifications.push(`
963
- Recently departed:`);
964
- for (const { entity, leftAt } of recentlyLeft) {
965
- if (entity.entityType === "person" && entity.attributes.name) {
966
- const timeAgo = Date.now() - leftAt;
967
- const timeStr = timeAgo < 60000 ? `${Math.round(timeAgo / 1000)} seconds ago` : `${Math.round(timeAgo / 60000)} minutes ago`;
968
- identifications.push(`${entity.attributes.name} left ${timeStr}.`);
969
- }
894
+ }
895
+ const thought = `Identified ${recognizedCount} known people and ${unknownCount} unknown people.`;
896
+ const text = identifications.join(" ");
897
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
898
+ if (callback) {
899
+ await callback({
900
+ thought,
901
+ text,
902
+ actions: ["VISION"],
903
+ data: {
904
+ identifications: people.slice(0, MAX_VISION_ENTITIES).map((p) => ({
905
+ id: p.id,
906
+ entityType: p.entityType,
907
+ name: p.attributes.name || undefined
908
+ }))
970
909
  }
910
+ });
911
+ }
912
+ return {
913
+ success: true,
914
+ text,
915
+ values: { recognizedCount, unknownCount },
916
+ data: {
917
+ actionName: "VISION",
918
+ op: "identify_person",
919
+ recognizedCount,
920
+ unknownCount
971
921
  }
972
- const thought = `Identified ${recognizedCount} known people and ${unknownCount} unknown people.`;
973
- const text = identifications.join(" ");
974
- await saveExecutionRecord(runtime, message, thought, text, ["IDENTIFY_PERSON"]);
975
- if (callback) {
976
- await callback({
977
- thought,
978
- text,
979
- actions: ["IDENTIFY_PERSON"],
980
- data: {
981
- identifications: people.map((p) => ({
982
- id: p.id,
983
- entityType: p.entityType,
984
- name: p.attributes.name || undefined
985
- }))
986
- }
987
- });
922
+ };
923
+ } catch (error) {
924
+ logger.error("[VISION/identify_person] Error:", error);
925
+ const thought = "Failed to identify people.";
926
+ const text = `Sorry, I couldn't identify people: ${error instanceof Error ? error.message : "Unknown error"}`;
927
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
928
+ if (callback) {
929
+ await callback({ thought, text, actions: ["VISION"] });
930
+ }
931
+ return {
932
+ success: false,
933
+ text,
934
+ error: error instanceof Error ? error.message : String(error),
935
+ data: { actionName: "VISION", op: "identify_person" }
936
+ };
937
+ }
938
+ }
939
+ async function runTrackEntity(runtime, message, callback) {
940
+ try {
941
+ const visionService = runtime.getService("VISION");
942
+ if (!visionService) {
943
+ const thought2 = "Vision service is not available.";
944
+ const text = "I cannot track entities because the vision service is not available.";
945
+ await saveExecutionRecord(runtime, message, thought2, text, ["VISION"]);
946
+ if (callback) {
947
+ await callback({ thought: thought2, text, actions: ["VISION"] });
988
948
  }
989
949
  return {
990
- success: true,
950
+ success: false,
991
951
  text,
992
- values: {
993
- recognizedCount,
994
- unknownCount
995
- }
952
+ data: { actionName: "VISION", op: "track_entity" }
996
953
  };
997
- } catch (error) {
998
- logger.error("[identifyPersonAction] Error:", error);
999
- const thought = "Failed to identify people.";
1000
- const text = `Sorry, I couldn't identify people: ${error instanceof Error ? error.message : "Unknown error"}`;
1001
- await saveExecutionRecord(runtime, message, thought, text, ["IDENTIFY_PERSON"]);
954
+ }
955
+ const scene = await withVisionTimeout(visionService.getSceneDescription(), "vision scene description");
956
+ if (!scene) {
957
+ const thought2 = "No scene available for tracking.";
958
+ const text = "I need a moment to process the visual scene before I can track entities.";
959
+ await saveExecutionRecord(runtime, message, thought2, text, ["VISION"]);
1002
960
  if (callback) {
1003
- await callback({ thought, text, actions: ["IDENTIFY_PERSON"] });
961
+ await callback({ thought: thought2, text, actions: ["VISION"] });
1004
962
  }
1005
963
  return {
1006
964
  success: false,
1007
965
  text,
1008
- error: error instanceof Error ? error.message : String(error)
966
+ data: { actionName: "VISION", op: "track_entity" }
1009
967
  };
1010
968
  }
969
+ const entityTracker = visionService.getEntityTracker();
970
+ await entityTracker.updateEntities(scene.objects.slice(0, MAX_VISION_ENTITIES), scene.people.slice(0, MAX_VISION_ENTITIES), undefined, runtime);
971
+ const stats = entityTracker.getStatistics();
972
+ const thought = `Tracking ${stats.activeEntities} entities in the scene.`;
973
+ const summary = [
974
+ `I'm now tracking ${stats.activeEntities} entities in the scene`,
975
+ `(${stats.people} people, ${stats.objects} objects).`,
976
+ "The visual tracking system will maintain persistent IDs for all entities",
977
+ "and notify you of significant changes."
978
+ ];
979
+ const responseText = summary.join(" ");
980
+ await saveExecutionRecord(runtime, message, thought, responseText, [
981
+ "VISION"
982
+ ]);
983
+ if (callback) {
984
+ await callback({
985
+ thought,
986
+ text: responseText,
987
+ actions: ["VISION"],
988
+ data: { entities: stats.activeEntities }
989
+ });
990
+ }
991
+ logger.info(`[VISION/track_entity] Tracking ${stats.activeEntities} entities`);
992
+ return {
993
+ success: true,
994
+ text: responseText,
995
+ values: {
996
+ activeEntities: stats.activeEntities,
997
+ people: stats.people,
998
+ objects: stats.objects
999
+ },
1000
+ data: {
1001
+ actionName: "VISION",
1002
+ op: "track_entity",
1003
+ activeEntities: stats.activeEntities,
1004
+ people: stats.people,
1005
+ objects: stats.objects
1006
+ }
1007
+ };
1008
+ } catch (error) {
1009
+ logger.error("[VISION/track_entity] Error:", error);
1010
+ const thought = "Failed to track entities.";
1011
+ const text = `Sorry, I couldn't track entities: ${error instanceof Error ? error.message : "Unknown error"}`;
1012
+ await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
1013
+ if (callback) {
1014
+ await callback({ thought, text, actions: ["VISION"] });
1015
+ }
1016
+ return {
1017
+ success: false,
1018
+ text,
1019
+ error: error instanceof Error ? error.message : String(error),
1020
+ data: { actionName: "VISION", op: "track_entity" }
1021
+ };
1011
1022
  }
1012
- };
1013
- var trackEntityAction = {
1014
- name: "TRACK_ENTITY",
1015
- description: "Start tracking a specific person or object in view",
1023
+ }
1024
+ var visionAction = {
1025
+ name: "VISION",
1026
+ contexts: [...ALL_VISION_CONTEXTS],
1027
+ contextGate: { anyOf: [...ALL_VISION_CONTEXTS] },
1028
+ roleGate: { minRole: "USER" },
1016
1029
  similes: [
1017
- "track the {description}",
1018
- "follow the {description}",
1019
- "keep an eye on the {description}",
1020
- "watch the {description}"
1030
+ "DESCRIBE_SCENE",
1031
+ "CAPTURE_IMAGE",
1032
+ "SET_VISION_MODE",
1033
+ "NAME_ENTITY",
1034
+ "IDENTIFY_PERSON",
1035
+ "TRACK_ENTITY",
1036
+ "ANALYZE_SCENE",
1037
+ "WHAT_DO_YOU_SEE",
1038
+ "VISION_CHECK",
1039
+ "LOOK_AROUND",
1040
+ "TAKE_PHOTO",
1041
+ "SCREENSHOT",
1042
+ "CAPTURE_FRAME",
1043
+ "TAKE_PICTURE"
1044
+ ],
1045
+ description: "Camera and screen vision: describe the current scene, capture an image, switch vision mode (off/camera/screen/both), name a visible entity, identify a person, or start tracking an entity. The op is inferred from the message text when not explicitly provided.",
1046
+ descriptionCompressed: "Vision: describe / capture / set_mode / name_entity / identify_person / track_entity.",
1047
+ parameters: [
1048
+ {
1049
+ name: "subaction",
1050
+ description: "Operation to perform: describe, capture, set_mode, name_entity, identify_person, or track_entity. Inferred from message text when omitted.",
1051
+ required: false,
1052
+ schema: { type: "string", enum: [...VISION_OPS] }
1053
+ },
1054
+ {
1055
+ name: "detailLevel",
1056
+ description: "For op=describe: 'summary' to omit object/person breakdowns, 'detailed' for the full breakdown.",
1057
+ required: false,
1058
+ schema: {
1059
+ type: "string",
1060
+ enum: ["summary", "detailed"],
1061
+ default: "detailed"
1062
+ }
1063
+ },
1064
+ {
1065
+ name: "mode",
1066
+ description: "For op=set_mode: vision mode to set: off, camera, screen, or both.",
1067
+ required: false,
1068
+ schema: { type: "string", enum: ["off", "camera", "screen", "both"] }
1069
+ },
1070
+ {
1071
+ name: "name",
1072
+ description: "For op=name_entity: the name to assign to the most relevant visible person or object.",
1073
+ required: false,
1074
+ schema: { type: "string" }
1075
+ },
1076
+ {
1077
+ name: "targetHint",
1078
+ description: "For op=name_entity or op=identify_person: optional phrase describing which visible entity to focus on.",
1079
+ required: false,
1080
+ schema: { type: "string" }
1081
+ },
1082
+ {
1083
+ name: "description",
1084
+ description: "For op=track_entity: optional description of the visible entity to prioritize for tracking.",
1085
+ required: false,
1086
+ schema: { type: "string" }
1087
+ },
1088
+ {
1089
+ name: "includeUnknown",
1090
+ description: "For op=identify_person: whether to mention unidentified people in the response.",
1091
+ required: false,
1092
+ schema: { type: "boolean", default: true }
1093
+ }
1021
1094
  ],
1095
+ validate: async (runtime, _message, state, options) => {
1096
+ if (!visionServiceIsActive(runtime)) {
1097
+ const visionService = runtime.getService("VISION");
1098
+ if (!visionService)
1099
+ return false;
1100
+ }
1101
+ const params = readActionParams(options);
1102
+ return selectedContextMatches(state, ALL_VISION_CONTEXTS) || typeof params.op === "string";
1103
+ },
1104
+ handler: async (runtime, message, _state, _options, callback, _responses) => {
1105
+ const params = readActionParams(_options);
1106
+ const explicitOp = normalizeOp(params.op ?? params.subaction);
1107
+ const inferredOp = explicitOp ?? inferOpFromMessage(message.content?.text ?? "");
1108
+ if (!inferredOp) {
1109
+ const text = `VISION could not determine the operation. Specify one of: ${VISION_OPS.join(", ")}.`;
1110
+ if (callback) {
1111
+ await callback({ text, actions: ["VISION"] });
1112
+ }
1113
+ return {
1114
+ success: false,
1115
+ text,
1116
+ values: { error: "MISSING" },
1117
+ data: {
1118
+ actionName: "VISION",
1119
+ availableOps: VISION_OPS
1120
+ }
1121
+ };
1122
+ }
1123
+ switch (inferredOp) {
1124
+ case "describe":
1125
+ return runDescribe(runtime, message, params, callback);
1126
+ case "capture":
1127
+ return runCapture(runtime, message, callback);
1128
+ case "set_mode":
1129
+ return runSetMode(runtime, message, params, callback);
1130
+ case "name_entity":
1131
+ return runNameEntity(runtime, message, params, callback);
1132
+ case "identify_person":
1133
+ return runIdentifyPerson(runtime, message, callback);
1134
+ case "track_entity":
1135
+ return runTrackEntity(runtime, message, callback);
1136
+ }
1137
+ },
1022
1138
  examples: [
1023
1139
  [
1140
+ { name: "{{user}}", content: { text: "what do you see?" } },
1024
1141
  {
1025
- name: "user",
1142
+ name: "{{agent}}",
1026
1143
  content: {
1027
- text: "Track the person wearing the red shirt"
1144
+ actions: ["VISION"],
1145
+ thought: "The user wants to know what I can see through my camera.",
1146
+ text: "I see a room with a desk and computer setup. There are 2 people, one is sitting and one is standing."
1028
1147
  }
1029
- },
1148
+ }
1149
+ ],
1150
+ [
1151
+ { name: "{{user}}", content: { text: "take a photo" } },
1030
1152
  {
1031
- name: "agent",
1153
+ name: "{{agent}}",
1032
1154
  content: {
1033
- text: "I'm now tracking the person in the red shirt. I'll notify you of any significant movements or if they leave the scene.",
1034
- actions: ["TRACK_ENTITY"]
1155
+ actions: ["VISION"],
1156
+ thought: "The user wants me to capture an image from the camera.",
1157
+ text: "I've captured an image from the camera."
1035
1158
  }
1036
1159
  }
1037
- ]
1038
- ],
1039
- validate: async (runtime, message, state, options) => {
1040
- const __avTextRaw = typeof message?.content?.text === "string" ? message.content.text : "";
1041
- const __avText = __avTextRaw.toLowerCase();
1042
- const __avVisionService = runtime?.getService?.("VISION");
1043
- const __avLegacyContextOk = Boolean(__avVisionService && typeof __avVisionService.isActive === "function" && __avVisionService.isActive());
1044
- const __avKeywords = ["track", "entity"];
1045
- const __avKeywordOk = __avKeywords.length > 0 && (__avKeywords.some((kw) => kw.length > 0 && __avText.includes(kw)) || __avLegacyContextOk);
1046
- const __avRegex = new RegExp("\\b(?:track|entity)\\b", "i");
1047
- const __avRegexOk = __avRegex.test(__avText) || __avLegacyContextOk;
1048
- const __avSource = String(message?.content?.source ?? message?.source ?? "");
1049
- const __avExpectedSource = "";
1050
- const __avSourceOk = __avExpectedSource ? __avSource === __avExpectedSource : Boolean(__avSource || state || runtime?.agentId || runtime?.getService);
1051
- const __avOptions = options && typeof options === "object" ? options : {};
1052
- const __avInputOk = __avText.trim().length > 0 || Object.keys(__avOptions).length > 0 || Boolean(message?.content && typeof message.content === "object");
1053
- if (!(__avKeywordOk && __avRegexOk && __avSourceOk && __avInputOk)) {
1054
- return false;
1055
- }
1056
- const __avLegacyValidate = async (runtime2, _message, _state) => {
1057
- const visionService = runtime2.getService("VISION");
1058
- return visionService?.isActive() || false;
1059
- };
1060
- try {
1061
- return Boolean(await __avLegacyValidate(runtime, message, state, options));
1062
- } catch {
1063
- return false;
1064
- }
1065
- },
1066
- handler: async (runtime, message, _state, _options, callback) => {
1067
- try {
1068
- const visionService = runtime.getService("VISION");
1069
- if (!visionService) {
1070
- const thought2 = "Vision service is not available.";
1071
- const text = "I cannot track entities because the vision service is not available.";
1072
- await saveExecutionRecord(runtime, message, thought2, text, ["TRACK_ENTITY"]);
1073
- if (callback) {
1074
- await callback({ thought: thought2, text, actions: ["TRACK_ENTITY"] });
1075
- }
1076
- return {
1077
- success: false,
1078
- text
1079
- };
1160
+ ],
1161
+ [
1162
+ { name: "{{user}}", content: { text: "set vision mode to screen" } },
1163
+ {
1164
+ name: "{{agent}}",
1165
+ content: {
1166
+ actions: ["VISION"],
1167
+ thought: "The user wants to switch to screen vision mode.",
1168
+ text: "Vision mode set to SCREEN only. I will analyze what's on your screen."
1169
+ }
1080
1170
  }
1081
- const scene = await visionService.getSceneDescription();
1082
- if (!scene) {
1083
- const thought2 = "No scene available for tracking.";
1084
- const text = "I need a moment to process the visual scene before I can track entities.";
1085
- await saveExecutionRecord(runtime, message, thought2, text, ["TRACK_ENTITY"]);
1086
- if (callback) {
1087
- await callback({ thought: thought2, text, actions: ["TRACK_ENTITY"] });
1171
+ ],
1172
+ [
1173
+ {
1174
+ name: "{{user}}",
1175
+ content: { text: "the person wearing the blue shirt is named Alice" }
1176
+ },
1177
+ {
1178
+ name: "{{agent}}",
1179
+ content: {
1180
+ actions: ["VISION"],
1181
+ text: "I've identified the person in the blue shirt as Alice. I'll remember them for future interactions."
1088
1182
  }
1089
- return {
1090
- success: false,
1091
- text
1092
- };
1093
1183
  }
1094
- const _text = message.content.text?.toLowerCase() || "";
1095
- const _worldId = message.worldId || "default-world";
1096
- const entityTracker = visionService.getEntityTracker();
1097
- await entityTracker.updateEntities(scene.objects, scene.people, undefined, runtime);
1098
- const stats = entityTracker.getStatistics();
1099
- const thought = `Tracking ${stats.activeEntities} entities in the scene.`;
1100
- const summary = [
1101
- `I'm now tracking ${stats.activeEntities} entities in the scene`,
1102
- `(${stats.people} people, ${stats.objects} objects).`,
1103
- "The visual tracking system will maintain persistent IDs for all entities",
1104
- "and notify you of significant changes."
1105
- ];
1106
- const responseText = summary.join(" ");
1107
- await saveExecutionRecord(runtime, message, thought, responseText, ["TRACK_ENTITY"]);
1108
- if (callback) {
1109
- await callback({
1110
- thought,
1111
- text: responseText,
1112
- actions: ["TRACK_ENTITY"],
1113
- data: { entities: stats.activeEntities }
1114
- });
1184
+ ],
1185
+ [
1186
+ {
1187
+ name: "{{user}}",
1188
+ content: { text: "who is the person in front of you?" }
1189
+ },
1190
+ {
1191
+ name: "{{agent}}",
1192
+ content: {
1193
+ actions: ["VISION"],
1194
+ text: "That's Alice. I last saw her about 5 minutes ago."
1195
+ }
1115
1196
  }
1116
- logger.info(`[TrackEntityAction] Tracking ${stats.activeEntities} entities`);
1117
- return {
1118
- success: true,
1119
- text: responseText,
1120
- values: {
1121
- activeEntities: stats.activeEntities,
1122
- people: stats.people,
1123
- objects: stats.objects
1197
+ ],
1198
+ [
1199
+ {
1200
+ name: "{{user}}",
1201
+ content: { text: "track the person wearing the red shirt" }
1202
+ },
1203
+ {
1204
+ name: "{{agent}}",
1205
+ content: {
1206
+ actions: ["VISION"],
1207
+ text: "I'm now tracking the person in the red shirt."
1124
1208
  }
1125
- };
1126
- } catch (error) {
1127
- logger.error("[trackEntityAction] Error:", error);
1128
- const thought = "Failed to track entities.";
1129
- const text = `Sorry, I couldn't track entities: ${error instanceof Error ? error.message : "Unknown error"}`;
1130
- await saveExecutionRecord(runtime, message, thought, text, ["TRACK_ENTITY"]);
1131
- if (callback) {
1132
- await callback({ thought, text, actions: ["TRACK_ENTITY"] });
1133
1209
  }
1134
- return {
1135
- success: false,
1136
- text,
1137
- error: error instanceof Error ? error.message : String(error)
1138
- };
1139
- }
1140
- }
1210
+ ]
1211
+ ]
1141
1212
  };
1142
1213
 
1143
1214
  // src/provider.ts
1144
1215
  import {
1145
- addHeader,
1146
1216
  logger as logger2
1147
1217
  } from "@elizaos/core";
1218
+ var MAX_VISION_OBJECTS_IN_STATE = 50;
1219
+ var MAX_VISION_PEOPLE_IN_STATE = 25;
1220
+ var MAX_TRACKED_ENTITIES_IN_STATE = 25;
1148
1221
  var visionProvider = {
1149
1222
  name: "VISION_PERCEPTION",
1150
1223
  description: "Provides current visual perception data including scene description, detected objects, people, and entity tracking. This provider is always active and provides real-time visual awareness.",
1151
1224
  position: 99,
1225
+ contexts: ["media", "browser"],
1226
+ contextGate: { anyOf: ["media", "browser"] },
1227
+ cacheStable: false,
1228
+ cacheScope: "turn",
1152
1229
  dynamic: false,
1153
1230
  get: async (runtime, message, _state) => {
1154
1231
  const visionService = runtime.getService("VISION");
@@ -1160,201 +1237,231 @@ var visionProvider = {
1160
1237
  sceneDescription: "Vision service is not available.",
1161
1238
  cameraStatus: "No camera connected"
1162
1239
  },
1163
- text: addHeader("# Visual Perception", "Vision service is not available."),
1240
+ text: JSON.stringify({
1241
+ visual_perception: {
1242
+ visionAvailable: false,
1243
+ sceneDescription: "Vision service is not available.",
1244
+ cameraStatus: "No camera connected"
1245
+ }
1246
+ }, null, 2),
1164
1247
  data: { hasVision: false }
1165
1248
  };
1166
1249
  }
1167
- const sceneDescription = await visionService.getEnhancedSceneDescription() || await visionService.getSceneDescription();
1168
- const cameraInfo = visionService.getCameraInfo();
1169
- const isActive = visionService.isActive();
1170
- const visionMode = visionService.getVisionMode();
1171
- const screenCapture = await visionService.getScreenCapture();
1172
- const _worldId = message.worldId || "default-world";
1173
- const entityTracker = visionService.getEntityTracker();
1174
- let entityData = null;
1175
- if (sceneDescription && entityTracker) {
1176
- await entityTracker.updateEntities(sceneDescription.objects, sceneDescription.people, undefined, runtime);
1177
- const activeEntities = entityTracker.getActiveEntities();
1178
- const recentlyLeft = entityTracker.getRecentlyLeft();
1179
- const stats = entityTracker.getStatistics();
1180
- entityData = {
1181
- activeEntities: activeEntities.map((e) => ({
1182
- id: e.id,
1183
- type: e.entityType,
1184
- name: e.attributes.name,
1185
- firstSeen: e.firstSeen,
1186
- duration: Date.now() - e.firstSeen,
1187
- position: e.lastPosition,
1188
- attributes: e.attributes
1189
- })),
1190
- recentlyLeft: recentlyLeft.map(({ entity, leftAt }) => ({
1191
- id: entity.id,
1192
- name: entity.attributes.name,
1193
- leftAt,
1194
- timeAgo: Date.now() - leftAt
1195
- })),
1196
- statistics: stats
1197
- };
1198
- }
1199
- let perceptionText = "";
1200
- let values = {};
1201
- let data = {};
1202
- if (!isActive) {
1203
- perceptionText = `Vision mode: ${visionMode}
1250
+ try {
1251
+ const sceneDescription = await visionService.getEnhancedSceneDescription() || await visionService.getSceneDescription();
1252
+ const cameraInfo = visionService.getCameraInfo();
1253
+ const isActive = visionService.isActive();
1254
+ const visionMode = visionService.getVisionMode();
1255
+ const screenCapture = await visionService.getScreenCapture();
1256
+ const _worldId = message.worldId || "default-world";
1257
+ const entityTracker = visionService.getEntityTracker();
1258
+ let entityData = null;
1259
+ if (sceneDescription && entityTracker) {
1260
+ await entityTracker.updateEntities(sceneDescription.objects, sceneDescription.people, undefined, runtime);
1261
+ const activeEntities = entityTracker.getActiveEntities();
1262
+ const recentlyLeft = entityTracker.getRecentlyLeft();
1263
+ const stats = entityTracker.getStatistics();
1264
+ entityData = {
1265
+ activeEntities: activeEntities.slice(0, MAX_TRACKED_ENTITIES_IN_STATE).map((e) => ({
1266
+ id: e.id,
1267
+ type: e.entityType,
1268
+ name: e.attributes.name,
1269
+ firstSeen: e.firstSeen,
1270
+ duration: Date.now() - e.firstSeen,
1271
+ position: e.lastPosition,
1272
+ attributes: e.attributes
1273
+ })),
1274
+ recentlyLeft: recentlyLeft.slice(0, MAX_TRACKED_ENTITIES_IN_STATE).map(({ entity, leftAt }) => ({
1275
+ id: entity.id,
1276
+ name: entity.attributes.name,
1277
+ leftAt,
1278
+ timeAgo: Date.now() - leftAt
1279
+ })),
1280
+ statistics: stats
1281
+ };
1282
+ }
1283
+ let perceptionText = "";
1284
+ let values = {};
1285
+ let data = {};
1286
+ if (!isActive) {
1287
+ perceptionText = `Vision mode: ${visionMode}
1204
1288
  `;
1205
- if (visionMode === "OFF") {
1206
- perceptionText += "Vision is disabled.";
1289
+ if (visionMode === "OFF") {
1290
+ perceptionText += "Vision is disabled.";
1291
+ } else {
1292
+ perceptionText += "Vision service is initializing...";
1293
+ }
1294
+ values = {
1295
+ visionAvailable: false,
1296
+ visionMode,
1297
+ sceneDescription: "Vision not active",
1298
+ cameraStatus: cameraInfo ? `Camera "${cameraInfo.name}" detected but not active` : "No camera"
1299
+ };
1207
1300
  } else {
1208
- perceptionText += "Vision service is initializing...";
1209
- }
1210
- values = {
1211
- visionAvailable: false,
1212
- visionMode,
1213
- sceneDescription: "Vision not active",
1214
- cameraStatus: cameraInfo ? `Camera "${cameraInfo.name}" detected but not active` : "No camera"
1215
- };
1216
- } else {
1217
- perceptionText = `Vision mode: ${visionMode}
1301
+ perceptionText = `Vision mode: ${visionMode}
1218
1302
 
1219
1303
  `;
1220
- if ((visionMode === "CAMERA" || visionMode === "BOTH") && sceneDescription) {
1221
- const ageInSeconds = (Date.now() - sceneDescription.timestamp) / 1000;
1222
- const secondsAgo = Math.round(ageInSeconds);
1223
- perceptionText += `Camera view (${secondsAgo}s ago):
1304
+ if ((visionMode === "CAMERA" || visionMode === "BOTH") && sceneDescription) {
1305
+ const ageInSeconds = (Date.now() - sceneDescription.timestamp) / 1000;
1306
+ const secondsAgo = Math.round(ageInSeconds);
1307
+ perceptionText += `Camera view (${secondsAgo}s ago):
1224
1308
  ${sceneDescription.description}`;
1225
- if (sceneDescription.people.length > 0) {
1226
- perceptionText += `
1309
+ if (sceneDescription.people.length > 0) {
1310
+ perceptionText += `
1227
1311
 
1228
1312
  People detected: ${sceneDescription.people.length}`;
1229
- const poses = sceneDescription.people.map((p) => p.pose).filter((p) => p !== "unknown");
1230
- const facings = sceneDescription.people.map((p) => p.facing).filter((f) => f !== "unknown");
1231
- if (poses.length > 0) {
1232
- const poseCounts = poses.reduce((acc, pose) => {
1233
- acc[pose] = (acc[pose] || 0) + 1;
1234
- return acc;
1235
- }, {});
1236
- perceptionText += `
1313
+ const poses = sceneDescription.people.map((p) => p.pose).filter((p) => p !== "unknown");
1314
+ const facings = sceneDescription.people.map((p) => p.facing).filter((f) => f !== "unknown");
1315
+ if (poses.length > 0) {
1316
+ const poseCounts = poses.reduce((acc, pose) => {
1317
+ acc[pose] = (acc[pose] || 0) + 1;
1318
+ return acc;
1319
+ }, {});
1320
+ perceptionText += `
1237
1321
  Poses: ${Object.entries(poseCounts).map(([pose, count]) => `${count} ${pose}`).join(", ")}`;
1238
- }
1239
- if (facings.length > 0) {
1240
- const facingCounts = facings.reduce((acc, facing) => {
1241
- acc[facing] = (acc[facing] || 0) + 1;
1242
- return acc;
1243
- }, {});
1244
- perceptionText += `
1322
+ }
1323
+ if (facings.length > 0) {
1324
+ const facingCounts = facings.reduce((acc, facing) => {
1325
+ acc[facing] = (acc[facing] || 0) + 1;
1326
+ return acc;
1327
+ }, {});
1328
+ perceptionText += `
1245
1329
  Facing: ${Object.entries(facingCounts).map(([facing, count]) => `${count} facing ${facing}`).join(", ")}`;
1330
+ }
1246
1331
  }
1247
- }
1248
- if (sceneDescription.objects.length > 0) {
1249
- const objectTypes = sceneDescription.objects.map((o) => o.type);
1250
- const uniqueObjects = [...new Set(objectTypes)];
1251
- perceptionText += `
1332
+ if (sceneDescription.objects.length > 0) {
1333
+ const objectTypes = sceneDescription.objects.slice(0, MAX_VISION_OBJECTS_IN_STATE).map((o) => o.type);
1334
+ const uniqueObjects = [...new Set(objectTypes)];
1335
+ perceptionText += `
1252
1336
 
1253
1337
  Objects detected: ${uniqueObjects.join(", ")}`;
1254
- }
1255
- if (sceneDescription.sceneChanged) {
1256
- perceptionText += `
1338
+ }
1339
+ if (sceneDescription.sceneChanged) {
1340
+ perceptionText += `
1257
1341
 
1258
1342
  Scene change: ${sceneDescription.changePercentage.toFixed(1)}% of pixels changed`;
1259
- }
1260
- if (entityData) {
1261
- if (entityData.activeEntities.length > 0) {
1262
- perceptionText += `
1343
+ }
1344
+ if (entityData) {
1345
+ if (entityData.activeEntities.length > 0) {
1346
+ perceptionText += `
1263
1347
 
1264
1348
  Currently tracking:`;
1265
- for (const entity of entityData.activeEntities) {
1266
- const name = entity.name || `Unknown ${entity.type}`;
1267
- const duration = entity.duration < 60000 ? `${Math.round(entity.duration / 1000)}s` : `${Math.round(entity.duration / 60000)}m`;
1268
- perceptionText += `
1349
+ for (const entity of entityData.activeEntities) {
1350
+ const name = entity.name || `Unknown ${entity.type}`;
1351
+ const duration = entity.duration < 60000 ? `${Math.round(entity.duration / 1000)}s` : `${Math.round(entity.duration / 60000)}m`;
1352
+ perceptionText += `
1269
1353
  - ${name} (present for ${duration})`;
1354
+ }
1270
1355
  }
1271
- }
1272
- if (entityData.recentlyLeft.length > 0) {
1273
- perceptionText += `
1356
+ if (entityData.recentlyLeft.length > 0) {
1357
+ perceptionText += `
1274
1358
 
1275
1359
  Recently left:`;
1276
- for (const departed of entityData.recentlyLeft) {
1277
- const name = departed.name || "Unknown person";
1278
- const timeStr = departed.timeAgo < 60000 ? `${Math.round(departed.timeAgo / 1000)}s ago` : `${Math.round(departed.timeAgo / 60000)}m ago`;
1279
- perceptionText += `
1360
+ for (const departed of entityData.recentlyLeft) {
1361
+ const name = departed.name || "Unknown person";
1362
+ const timeStr = departed.timeAgo < 60000 ? `${Math.round(departed.timeAgo / 1000)}s ago` : `${Math.round(departed.timeAgo / 60000)}m ago`;
1363
+ perceptionText += `
1280
1364
  - ${name} left ${timeStr}`;
1365
+ }
1281
1366
  }
1282
1367
  }
1283
1368
  }
1284
- }
1285
- if ((visionMode === "SCREEN" || visionMode === "BOTH") && screenCapture) {
1286
- const screenAge = (Date.now() - screenCapture.timestamp) / 1000;
1287
- const screenSecondsAgo = Math.round(screenAge);
1288
- if (visionMode === "BOTH") {
1289
- perceptionText += `
1369
+ if ((visionMode === "SCREEN" || visionMode === "BOTH") && screenCapture) {
1370
+ const screenAge = (Date.now() - screenCapture.timestamp) / 1000;
1371
+ const screenSecondsAgo = Math.round(screenAge);
1372
+ if (visionMode === "BOTH") {
1373
+ perceptionText += `
1290
1374
 
1291
1375
  ---
1292
1376
 
1293
1377
  `;
1294
- }
1295
- perceptionText += `Screen capture (${screenSecondsAgo}s ago):
1378
+ }
1379
+ perceptionText += `Screen capture (${screenSecondsAgo}s ago):
1296
1380
  `;
1297
- perceptionText += `Resolution: ${screenCapture.width}x${screenCapture.height}
1381
+ perceptionText += `Resolution: ${screenCapture.width}x${screenCapture.height}
1298
1382
  `;
1299
- const enhanced = sceneDescription;
1300
- if (enhanced?.screenAnalysis) {
1301
- const tileAnalysis = enhanced.screenAnalysis.activeTile;
1302
- if (tileAnalysis) {
1303
- if (tileAnalysis.summary) {
1304
- perceptionText += `
1383
+ const enhanced = sceneDescription;
1384
+ if (enhanced?.screenAnalysis) {
1385
+ const tileAnalysis = enhanced.screenAnalysis.activeTile;
1386
+ if (tileAnalysis) {
1387
+ if (tileAnalysis.summary) {
1388
+ perceptionText += `
1305
1389
  Active area: ${tileAnalysis.summary}`;
1306
- }
1307
- if (tileAnalysis.text) {
1308
- perceptionText += `
1390
+ }
1391
+ if (tileAnalysis.text) {
1392
+ perceptionText += `
1309
1393
 
1310
1394
  Visible text:
1311
1395
  "${tileAnalysis.text.substring(0, 200)}${tileAnalysis.text.length > 200 ? "..." : ""}"`;
1312
- }
1313
- if (tileAnalysis.objects && tileAnalysis.objects.length > 0) {
1314
- const uiElements = tileAnalysis.objects.map((o) => o.type || "unknown");
1315
- const uniqueElements = [...new Set(uiElements)];
1316
- perceptionText += `
1396
+ }
1397
+ if (tileAnalysis.objects && tileAnalysis.objects.length > 0) {
1398
+ const uiElements = tileAnalysis.objects.map((o) => o.type || "unknown");
1399
+ const uniqueElements = [...new Set(uiElements)];
1400
+ perceptionText += `
1317
1401
 
1318
1402
  UI elements: ${uniqueElements.join(", ")}`;
1403
+ }
1319
1404
  }
1320
- }
1321
- if (enhanced.screenAnalysis.focusedApp) {
1322
- perceptionText += `
1405
+ if (enhanced.screenAnalysis.focusedApp) {
1406
+ perceptionText += `
1323
1407
 
1324
1408
  Active application: ${enhanced.screenAnalysis.focusedApp}`;
1409
+ }
1325
1410
  }
1326
1411
  }
1412
+ values = {
1413
+ visionAvailable: true,
1414
+ visionMode,
1415
+ sceneDescription: sceneDescription?.description || "Processing...",
1416
+ cameraStatus: cameraInfo ? `Connected to ${cameraInfo.name}` : "No camera",
1417
+ cameraId: cameraInfo?.id,
1418
+ peopleCount: sceneDescription?.people.length || 0,
1419
+ objectCount: sceneDescription?.objects.length || 0,
1420
+ sceneAge: sceneDescription ? Math.round((Date.now() - sceneDescription.timestamp) / 1000) : null,
1421
+ lastChange: sceneDescription?.sceneChanged ? sceneDescription.changePercentage : 0,
1422
+ hasScreenCapture: !!screenCapture,
1423
+ screenResolution: screenCapture ? `${screenCapture.width}x${screenCapture.height}` : null,
1424
+ activeEntities: entityData?.activeEntities || [],
1425
+ recentlyLeft: entityData?.recentlyLeft || [],
1426
+ entityStatistics: entityData?.statistics || null
1427
+ };
1428
+ data = {
1429
+ objects: sceneDescription?.objects.slice(0, MAX_VISION_OBJECTS_IN_STATE) || [],
1430
+ people: sceneDescription?.people.slice(0, MAX_VISION_PEOPLE_IN_STATE) || [],
1431
+ screenCapture: screenCapture || null,
1432
+ enhancedData: sceneDescription?.screenAnalysis || null,
1433
+ trackedEntities: entityData?.activeEntities || [],
1434
+ worldState: entityData || null
1435
+ };
1327
1436
  }
1328
- values = {
1329
- visionAvailable: true,
1330
- visionMode,
1331
- sceneDescription: sceneDescription?.description || "Processing...",
1332
- cameraStatus: cameraInfo ? `Connected to ${cameraInfo.name}` : "No camera",
1333
- cameraId: cameraInfo?.id,
1334
- peopleCount: sceneDescription?.people.length || 0,
1335
- objectCount: sceneDescription?.objects.length || 0,
1336
- sceneAge: sceneDescription ? Math.round((Date.now() - sceneDescription.timestamp) / 1000) : null,
1337
- lastChange: sceneDescription?.sceneChanged ? sceneDescription.changePercentage : 0,
1338
- hasScreenCapture: !!screenCapture,
1339
- screenResolution: screenCapture ? `${screenCapture.width}x${screenCapture.height}` : null,
1340
- activeEntities: entityData?.activeEntities || [],
1341
- recentlyLeft: entityData?.recentlyLeft || [],
1342
- entityStatistics: entityData?.statistics || null
1437
+ return {
1438
+ values,
1439
+ text: JSON.stringify({
1440
+ visual_perception: {
1441
+ summary: perceptionText,
1442
+ ...values
1443
+ }
1444
+ }, null, 2),
1445
+ data
1343
1446
  };
1344
- data = {
1345
- objects: sceneDescription?.objects || [],
1346
- people: sceneDescription?.people || [],
1347
- screenCapture: screenCapture || null,
1348
- enhancedData: sceneDescription?.screenAnalysis || null,
1349
- trackedEntities: entityData?.activeEntities || [],
1350
- worldState: entityData || null
1447
+ } catch (error) {
1448
+ return {
1449
+ values: {
1450
+ visionAvailable: false,
1451
+ error: error instanceof Error ? error.message : String(error)
1452
+ },
1453
+ text: JSON.stringify({
1454
+ visual_perception: {
1455
+ visionAvailable: false,
1456
+ error: error instanceof Error ? error.message : String(error)
1457
+ }
1458
+ }, null, 2),
1459
+ data: {
1460
+ hasVision: false,
1461
+ error: error instanceof Error ? error.message : String(error)
1462
+ }
1351
1463
  };
1352
1464
  }
1353
- return {
1354
- values,
1355
- text: addHeader("# Visual Perception", perceptionText),
1356
- data
1357
- };
1358
1465
  }
1359
1466
  };
1360
1467
 
@@ -1364,9 +1471,11 @@ import * as fs3 from "node:fs/promises";
1364
1471
  import * as path5 from "node:path";
1365
1472
  import { promisify as promisify3 } from "node:util";
1366
1473
  import {
1367
- logger as logger14,
1474
+ logger as logger15,
1368
1475
  ModelType as ModelType3,
1369
- Service
1476
+ recordLlmCall,
1477
+ Service,
1478
+ withStandaloneTrajectory as withStandaloneTrajectory3
1370
1479
  } from "@elizaos/core";
1371
1480
  import sharp4 from "sharp";
1372
1481
 
@@ -1375,7 +1484,11 @@ import { exec } from "node:child_process";
1375
1484
  import * as fs from "node:fs/promises";
1376
1485
  import * as path from "node:path";
1377
1486
  import { promisify } from "node:util";
1378
- import { logger as logger3, ModelType } from "@elizaos/core";
1487
+ import {
1488
+ logger as logger3,
1489
+ ModelType,
1490
+ withStandaloneTrajectory
1491
+ } from "@elizaos/core";
1379
1492
  var execAsync = promisify(exec);
1380
1493
 
1381
1494
  class AudioCaptureService {
@@ -1426,7 +1539,11 @@ class AudioCaptureService {
1426
1539
  await execAsync("where ffmpeg");
1427
1540
  return { available: true, tool: "ffmpeg" };
1428
1541
  }
1429
- return { available: false, tool: "none", message: "Unsupported platform" };
1542
+ return {
1543
+ available: false,
1544
+ tool: "none",
1545
+ message: "Unsupported platform"
1546
+ };
1430
1547
  } catch (_error) {
1431
1548
  const toolName = platform === "darwin" ? "sox" : platform === "linux" ? "arecord" : "ffmpeg";
1432
1549
  const installCmd = platform === "darwin" ? "brew install sox" : platform === "linux" ? "sudo apt-get install alsa-utils" : "Download ffmpeg from ffmpeg.org";
@@ -1460,7 +1577,13 @@ class AudioCaptureService {
1460
1577
  await this.recordAudio(audioFile, this.config.transcriptionInterval / 1000);
1461
1578
  logger3.debug("[AudioCapture] Recording complete, transcribing...");
1462
1579
  const audioBuffer = await fs.readFile(audioFile);
1463
- const transcription = await this.runtime.useModel(ModelType.TRANSCRIPTION, audioBuffer);
1580
+ const transcription = await withStandaloneTrajectory(this.runtime, {
1581
+ source: "plugin-vision:audio-transcription",
1582
+ metadata: {
1583
+ modelType: ModelType.TRANSCRIPTION,
1584
+ audioBytes: audioBuffer.byteLength
1585
+ }
1586
+ }, () => this.runtime.useModel(ModelType.TRANSCRIPTION, audioBuffer));
1464
1587
  await fs.unlink(audioFile).catch(() => {});
1465
1588
  if (transcription && typeof transcription === "string" && transcription.trim()) {
1466
1589
  logger3.info(`[AudioCapture] Transcribed: "${transcription}"`);
@@ -1583,7 +1706,11 @@ class AudioCaptureService {
1583
1706
  // src/audio-capture-stream.ts
1584
1707
  import { spawn } from "node:child_process";
1585
1708
  import { EventEmitter } from "node:events";
1586
- import { logger as logger4, ModelType as ModelType2 } from "@elizaos/core";
1709
+ import {
1710
+ logger as logger4,
1711
+ ModelType as ModelType2,
1712
+ withStandaloneTrajectory as withStandaloneTrajectory2
1713
+ } from "@elizaos/core";
1587
1714
 
1588
1715
  class StreamingAudioCaptureService extends EventEmitter {
1589
1716
  runtime;
@@ -1814,7 +1941,13 @@ class StreamingAudioCaptureService extends EventEmitter {
1814
1941
  async transcribeAudio(audioData) {
1815
1942
  try {
1816
1943
  const wavBuffer = this.rawToWav(audioData);
1817
- const result = await this.runtime.useModel(ModelType2.TRANSCRIPTION, wavBuffer);
1944
+ const result = await withStandaloneTrajectory2(this.runtime, {
1945
+ source: "plugin-vision:streaming-audio-transcription",
1946
+ metadata: {
1947
+ modelType: ModelType2.TRANSCRIPTION,
1948
+ audioBytes: wavBuffer.byteLength
1949
+ }
1950
+ }, () => this.runtime.useModel(ModelType2.TRANSCRIPTION, wavBuffer));
1818
1951
  return result;
1819
1952
  } catch (error) {
1820
1953
  logger4.error("[StreamingAudio] Transcription failed:", error);
@@ -1904,7 +2037,10 @@ class StreamingAudioCaptureService extends EventEmitter {
1904
2037
  }
1905
2038
 
1906
2039
  // src/entity-tracker.ts
1907
- import { createUniqueUuid as createUniqueUuid2, logger as logger5 } from "@elizaos/core";
2040
+ import {
2041
+ createUniqueUuid as createUniqueUuid2,
2042
+ logger as logger5
2043
+ } from "@elizaos/core";
1908
2044
 
1909
2045
  class EntityTracker {
1910
2046
  worldState;
@@ -3112,9 +3248,27 @@ class OCRService {
3112
3248
  logger10.debug("[OCR] Using fallback OCR implementation");
3113
3249
  const blocks = [];
3114
3250
  const mockTexts = [
3115
- { text: "File Edit View Window Help", x: 10, y: 5, width: 300, height: 20 },
3116
- { text: "Welcome to the application", x: 100, y: 100, width: 400, height: 40 },
3117
- { text: "Click here to continue", x: 200, y: 300, width: 200, height: 30 }
3251
+ {
3252
+ text: "File Edit View Window Help",
3253
+ x: 10,
3254
+ y: 5,
3255
+ width: 300,
3256
+ height: 20
3257
+ },
3258
+ {
3259
+ text: "Welcome to the application",
3260
+ x: 100,
3261
+ y: 100,
3262
+ width: 400,
3263
+ height: 40
3264
+ },
3265
+ {
3266
+ text: "Click here to continue",
3267
+ x: 200,
3268
+ y: 300,
3269
+ width: 200,
3270
+ height: 30
3271
+ }
3118
3272
  ];
3119
3273
  for (const mock of mockTexts) {
3120
3274
  blocks.push({
@@ -3335,8 +3489,46 @@ class ScreenCaptureService {
3335
3489
  }
3336
3490
  }
3337
3491
 
3338
- // src/vision-models.ts
3492
+ // src/test-input.ts
3493
+ import { existsSync, readFileSync } from "node:fs";
3494
+ import { resolve } from "node:path";
3339
3495
  import { logger as logger12 } from "@elizaos/core";
3496
+ function getTestInputMode() {
3497
+ const raw = (process.env.ELIZA_VISION_TEST_INPUT ?? "").trim().toLowerCase();
3498
+ if (raw === "image")
3499
+ return "image";
3500
+ if (raw === "camera")
3501
+ return "camera";
3502
+ if (raw === "screen")
3503
+ return "screen";
3504
+ return "unset";
3505
+ }
3506
+ var FIXTURE_DEFAULT_REL = "test/fixtures/sample-scene.png";
3507
+ function resolveFixturePath() {
3508
+ const fromEnv = process.env.ELIZA_VISION_TEST_FIXTURE;
3509
+ if (fromEnv)
3510
+ return resolve(fromEnv);
3511
+ return resolve(process.cwd(), FIXTURE_DEFAULT_REL);
3512
+ }
3513
+ var cached = null;
3514
+ var cachedKey = null;
3515
+ function getTestImage() {
3516
+ if (getTestInputMode() !== "image")
3517
+ return null;
3518
+ const path4 = resolveFixturePath();
3519
+ if (cachedKey === path4 && cached)
3520
+ return cached;
3521
+ if (!existsSync(path4)) {
3522
+ logger12.warn(`[plugin-vision] ELIZA_VISION_TEST_INPUT=image set but fixture not found at ${path4}.`);
3523
+ return null;
3524
+ }
3525
+ cached = readFileSync(path4);
3526
+ cachedKey = path4;
3527
+ return cached;
3528
+ }
3529
+
3530
+ // src/vision-models.ts
3531
+ import { logger as logger13 } from "@elizaos/core";
3340
3532
  var tf2 = null;
3341
3533
  var cocoSsd = null;
3342
3534
  var poseDetection = null;
@@ -3349,7 +3541,7 @@ async function loadTfModules() {
3349
3541
  poseDetection = await import("@tensorflow-models/pose-detection");
3350
3542
  return true;
3351
3543
  } catch (err) {
3352
- logger12.warn("[VisionModels] TensorFlow.js native addon not available — " + "falling back to description-based detection. Run `npm rebuild @tensorflow/tfjs-node --build-addon-from-source` to enable hardware-accelerated vision.");
3544
+ logger13.warn("[VisionModels] TensorFlow.js native addon not available — " + "falling back to description-based detection. Run `npm rebuild @tensorflow/tfjs-node --build-addon-from-source` to enable hardware-accelerated vision.");
3353
3545
  return false;
3354
3546
  }
3355
3547
  }
@@ -3363,30 +3555,30 @@ class VisionModels {
3363
3555
  if (this.initialized) {
3364
3556
  return;
3365
3557
  }
3366
- logger12.info("[VisionModels] Initializing vision models...");
3558
+ logger13.info("[VisionModels] Initializing vision models...");
3367
3559
  this.tfAvailable = await loadTfModules();
3368
3560
  if (!this.tfAvailable || !tf2 || !cocoSsd || !poseDetection) {
3369
3561
  this.initialized = true;
3370
- logger12.info("[VisionModels] Initialized without TensorFlow (fallback mode)");
3562
+ logger13.info("[VisionModels] Initialized without TensorFlow (fallback mode)");
3371
3563
  return;
3372
3564
  }
3373
3565
  try {
3374
3566
  await tf2.ready();
3375
- logger12.info("[VisionModels] TensorFlow.js backend ready");
3567
+ logger13.info("[VisionModels] TensorFlow.js backend ready");
3376
3568
  if (config.enableObjectDetection) {
3377
3569
  try {
3378
- logger12.info("[VisionModels] Loading COCO-SSD model...");
3570
+ logger13.info("[VisionModels] Loading COCO-SSD model...");
3379
3571
  this.objectDetectionModel = await cocoSsd.load({
3380
3572
  base: "mobilenet_v2"
3381
3573
  });
3382
- logger12.info("[VisionModels] COCO-SSD model loaded");
3574
+ logger13.info("[VisionModels] COCO-SSD model loaded");
3383
3575
  } catch (error) {
3384
- logger12.error("[VisionModels] Failed to load COCO-SSD model:", error);
3576
+ logger13.error("[VisionModels] Failed to load COCO-SSD model:", error);
3385
3577
  }
3386
3578
  }
3387
3579
  if (config.enablePoseDetection) {
3388
3580
  try {
3389
- logger12.info("[VisionModels] Loading PoseNet model...");
3581
+ logger13.info("[VisionModels] Loading PoseNet model...");
3390
3582
  const detectorConfig = {
3391
3583
  architecture: "MobileNetV1",
3392
3584
  outputStride: 16,
@@ -3394,15 +3586,15 @@ class VisionModels {
3394
3586
  multiplier: 0.75
3395
3587
  };
3396
3588
  this.poseDetector = await poseDetection.createDetector(poseDetection.SupportedModels.PoseNet, detectorConfig);
3397
- logger12.info("[VisionModels] PoseNet model loaded");
3589
+ logger13.info("[VisionModels] PoseNet model loaded");
3398
3590
  } catch (error) {
3399
- logger12.error("[VisionModels] Failed to load PoseNet model:", error);
3591
+ logger13.error("[VisionModels] Failed to load PoseNet model:", error);
3400
3592
  }
3401
3593
  }
3402
3594
  this.initialized = true;
3403
- logger12.info("[VisionModels] Vision models initialized");
3595
+ logger13.info("[VisionModels] Vision models initialized");
3404
3596
  } catch (error) {
3405
- logger12.error("[VisionModels] Initialization failed:", error);
3597
+ logger13.error("[VisionModels] Initialization failed:", error);
3406
3598
  throw error;
3407
3599
  }
3408
3600
  }
@@ -3414,7 +3606,7 @@ class VisionModels {
3414
3606
  }
3415
3607
  async detectObjects(imageData, _width, _height, description) {
3416
3608
  if (!this.objectDetectionModel || !tf2) {
3417
- logger12.warn("[VisionModels] Object detection model not loaded");
3609
+ logger13.warn("[VisionModels] Object detection model not loaded");
3418
3610
  return this.enhancedObjectDetection(description);
3419
3611
  }
3420
3612
  try {
@@ -3436,10 +3628,10 @@ class VisionModels {
3436
3628
  height: pred.bbox[3]
3437
3629
  }
3438
3630
  }));
3439
- logger12.debug(`[VisionModels] Detected ${objects.length} objects`);
3631
+ logger13.debug(`[VisionModels] Detected ${objects.length} objects`);
3440
3632
  return objects;
3441
3633
  } catch (error) {
3442
- logger12.error("[VisionModels] Object detection failed:", error);
3634
+ logger13.error("[VisionModels] Object detection failed:", error);
3443
3635
  return this.enhancedObjectDetection(description);
3444
3636
  }
3445
3637
  }
@@ -3449,8 +3641,14 @@ class VisionModels {
3449
3641
  }
3450
3642
  const objects = [];
3451
3643
  const objectPatterns = [
3452
- { pattern: /(\d+)?\s*(person|people|man|men|woman|women|child|children)/gi, type: "person" },
3453
- { pattern: /(\d+)?\s*(laptop|computer|monitor|screen|display)/gi, type: "laptop" },
3644
+ {
3645
+ pattern: /(\d+)?\s*(person|people|man|men|woman|women|child|children)/gi,
3646
+ type: "person"
3647
+ },
3648
+ {
3649
+ pattern: /(\d+)?\s*(laptop|computer|monitor|screen|display)/gi,
3650
+ type: "laptop"
3651
+ },
3454
3652
  { pattern: /(\d+)?\s*(phone|smartphone|mobile)/gi, type: "cell phone" },
3455
3653
  { pattern: /(\d+)?\s*(book|notebook|journal)/gi, type: "book" },
3456
3654
  { pattern: /(\d+)?\s*(cup|mug|glass|bottle)/gi, type: "cup" },
@@ -3500,7 +3698,7 @@ class VisionModels {
3500
3698
  }
3501
3699
  async detectPoses(imageData, width, height, description) {
3502
3700
  if (!this.poseDetector || !tf2) {
3503
- logger12.warn("[VisionModels] Pose detection model not loaded");
3701
+ logger13.warn("[VisionModels] Pose detection model not loaded");
3504
3702
  return this.enhancedPoseDetection(description);
3505
3703
  }
3506
3704
  try {
@@ -3513,7 +3711,7 @@ class VisionModels {
3513
3711
  imageTensor.dispose();
3514
3712
  return this.convertPosesToPersonInfo(poses);
3515
3713
  } catch (error) {
3516
- logger12.error("[VisionModels] Pose detection failed:", error);
3714
+ logger13.error("[VisionModels] Pose detection failed:", error);
3517
3715
  return this.enhancedPoseDetection(description);
3518
3716
  }
3519
3717
  }
@@ -3535,7 +3733,12 @@ class VisionModels {
3535
3733
  lying: ["lying", "laying", "reclined"]
3536
3734
  };
3537
3735
  const facingKeywords = {
3538
- camera: ["facing camera", "looking at camera", "facing forward", "front view"],
3736
+ camera: [
3737
+ "facing camera",
3738
+ "looking at camera",
3739
+ "facing forward",
3740
+ "front view"
3741
+ ],
3539
3742
  away: ["back to camera", "facing away", "back view"],
3540
3743
  left: ["facing left", "profile left", "left side"],
3541
3744
  right: ["facing right", "profile right", "right side"]
@@ -3697,7 +3900,7 @@ class VisionModels {
3697
3900
  this.poseDetector = null;
3698
3901
  }
3699
3902
  this.initialized = false;
3700
- logger12.info("[VisionModels] Models disposed");
3903
+ logger13.info("[VisionModels] Models disposed");
3701
3904
  }
3702
3905
  }
3703
3906
 
@@ -3705,8 +3908,8 @@ class VisionModels {
3705
3908
  import * as path4 from "node:path";
3706
3909
  import { TextDecoder } from "node:util";
3707
3910
  import { Worker } from "node:worker_threads";
3708
- import { logger as logger13 } from "@elizaos/core";
3709
- var __dirname = "/Users/shawwalters/eliza-workspace/milady/plugins/plugin-vision/typescript/src";
3911
+ import { logger as logger14 } from "@elizaos/core";
3912
+ var __dirname = "/Users/shawwalters/eliza-workspace/milady/eliza/plugins/plugin-vision/src";
3710
3913
 
3711
3914
  class VisionWorkerManager {
3712
3915
  config;
@@ -3744,7 +3947,7 @@ class VisionWorkerManager {
3744
3947
  this.ocrResultsView = new DataView(this.ocrResultsBuffer);
3745
3948
  }
3746
3949
  async initialize() {
3747
- logger13.info("[VisionWorkerManager] Initializing worker threads...");
3950
+ logger14.info("[VisionWorkerManager] Initializing worker threads...");
3748
3951
  try {
3749
3952
  await this.startScreenCaptureWorker();
3750
3953
  if (this.config.florence2Enabled) {
@@ -3753,9 +3956,9 @@ class VisionWorkerManager {
3753
3956
  if (this.config.ocrEnabled) {
3754
3957
  await this.startOCRWorker();
3755
3958
  }
3756
- logger13.info("[VisionWorkerManager] All workers initialized");
3959
+ logger14.info("[VisionWorkerManager] All workers initialized");
3757
3960
  } catch (error) {
3758
- logger13.error("[VisionWorkerManager] Failed to initialize workers:", error);
3961
+ logger14.error("[VisionWorkerManager] Failed to initialize workers:", error);
3759
3962
  throw error;
3760
3963
  }
3761
3964
  }
@@ -3780,18 +3983,18 @@ class VisionWorkerManager {
3780
3983
  lastUpdate: Date.now()
3781
3984
  });
3782
3985
  } else if (msg.type === "error") {
3783
- logger13.error("[ScreenCaptureWorker] Error:", msg.error);
3986
+ logger14.error("[ScreenCaptureWorker] Error:", msg.error);
3784
3987
  } else if (msg.type === "log") {
3785
3988
  this.handleWorkerLog("ScreenCaptureWorker", msg);
3786
3989
  }
3787
3990
  });
3788
3991
  this.screenCaptureWorker.on("error", (error) => {
3789
- logger13.error("[ScreenCaptureWorker] Worker error:", error instanceof Error ? error.message : String(error));
3992
+ logger14.error("[ScreenCaptureWorker] Worker error:", error instanceof Error ? error.message : String(error));
3790
3993
  setTimeout(() => this.restartScreenCaptureWorker(), 1000);
3791
3994
  });
3792
3995
  this.screenCaptureWorker.on("exit", (code) => {
3793
3996
  if (code !== 0) {
3794
- logger13.error(`[ScreenCaptureWorker] Worker stopped with exit code ${code}`);
3997
+ logger14.error(`[ScreenCaptureWorker] Worker stopped with exit code ${code}`);
3795
3998
  setTimeout(() => this.restartScreenCaptureWorker(), 1000);
3796
3999
  }
3797
4000
  });
@@ -3830,18 +4033,18 @@ class VisionWorkerManager {
3830
4033
  } else if (msg.type === "tile_analyzed") {
3831
4034
  this.updateFlorence2Cache(msg);
3832
4035
  } else if (msg.type === "error") {
3833
- logger13.error("[Florence2Worker] Error:", msg.error);
4036
+ logger14.error("[Florence2Worker] Error:", msg.error);
3834
4037
  } else if (msg.type === "log") {
3835
4038
  this.handleWorkerLog("Florence2Worker", msg);
3836
4039
  }
3837
4040
  });
3838
4041
  this.florence2Worker.on("error", (error) => {
3839
- logger13.error("[Florence2Worker] Worker error:", error instanceof Error ? error.message : String(error));
4042
+ logger14.error("[Florence2Worker] Worker error:", error instanceof Error ? error.message : String(error));
3840
4043
  setTimeout(() => this.restartFlorence2Worker(), 1000);
3841
4044
  });
3842
4045
  this.florence2Worker.on("exit", (code) => {
3843
4046
  if (code !== 0) {
3844
- logger13.error(`[Florence2Worker] Worker stopped with exit code ${code}`);
4047
+ logger14.error(`[Florence2Worker] Worker stopped with exit code ${code}`);
3845
4048
  setTimeout(() => this.restartFlorence2Worker(), 1000);
3846
4049
  }
3847
4050
  });
@@ -3869,18 +4072,18 @@ class VisionWorkerManager {
3869
4072
  } else if (msg.type === "ocr_complete") {
3870
4073
  this.updateOCRCache(msg);
3871
4074
  } else if (msg.type === "error") {
3872
- logger13.error("[OCRWorker] Error:", msg.error);
4075
+ logger14.error("[OCRWorker] Error:", msg.error);
3873
4076
  } else if (msg.type === "log") {
3874
4077
  this.handleWorkerLog("OCRWorker", msg);
3875
4078
  }
3876
4079
  });
3877
4080
  this.ocrWorker.on("error", (error) => {
3878
- logger13.error("[OCRWorker] Worker error:", error instanceof Error ? error.message : String(error));
4081
+ logger14.error("[OCRWorker] Worker error:", error instanceof Error ? error.message : String(error));
3879
4082
  setTimeout(() => this.restartOCRWorker(), 1000);
3880
4083
  });
3881
4084
  this.ocrWorker.on("exit", (code) => {
3882
4085
  if (code !== 0) {
3883
- logger13.error(`[OCRWorker] Worker stopped with exit code ${code}`);
4086
+ logger14.error(`[OCRWorker] Worker stopped with exit code ${code}`);
3884
4087
  setTimeout(() => this.restartOCRWorker(), 1000);
3885
4088
  }
3886
4089
  });
@@ -3893,7 +4096,7 @@ class VisionWorkerManager {
3893
4096
  this.latestFlorence2Results.set(tileId, result);
3894
4097
  }
3895
4098
  } catch (error) {
3896
- logger13.error("[VisionWorkerManager] Failed to update Florence2 cache:", error);
4099
+ logger14.error("[VisionWorkerManager] Failed to update Florence2 cache:", error);
3897
4100
  }
3898
4101
  }
3899
4102
  updateOCRCache(_msg) {
@@ -3903,7 +4106,7 @@ class VisionWorkerManager {
3903
4106
  this.latestOCRResult = result;
3904
4107
  }
3905
4108
  } catch (error) {
3906
- logger13.error("[VisionWorkerManager] Failed to update OCR cache:", error);
4109
+ logger14.error("[VisionWorkerManager] Failed to update OCR cache:", error);
3907
4110
  }
3908
4111
  }
3909
4112
  readFlorence2Result(tileId) {
@@ -3929,7 +4132,7 @@ class VisionWorkerManager {
3929
4132
  const json = new TextDecoder().decode(bytes);
3930
4133
  return JSON.parse(json);
3931
4134
  } catch (error) {
3932
- logger13.error("[VisionWorkerManager] Failed to read Florence2 result:", error);
4135
+ logger14.error("[VisionWorkerManager] Failed to read Florence2 result:", error);
3933
4136
  return null;
3934
4137
  }
3935
4138
  }
@@ -3951,7 +4154,7 @@ class VisionWorkerManager {
3951
4154
  const json = new TextDecoder().decode(bytes);
3952
4155
  return JSON.parse(json);
3953
4156
  } catch (error) {
3954
- logger13.error("[VisionWorkerManager] Failed to read OCR result:", error);
4157
+ logger14.error("[VisionWorkerManager] Failed to read OCR result:", error);
3955
4158
  return null;
3956
4159
  }
3957
4160
  }
@@ -3974,7 +4177,7 @@ class VisionWorkerManager {
3974
4177
  };
3975
4178
  this.lastProcessedFrameId = frameId;
3976
4179
  } catch (error) {
3977
- logger13.error("[VisionWorkerManager] Failed to read screen capture:", error);
4180
+ logger14.error("[VisionWorkerManager] Failed to read screen capture:", error);
3978
4181
  }
3979
4182
  return this.latestScreenCapture;
3980
4183
  }
@@ -4067,28 +4270,28 @@ class VisionWorkerManager {
4067
4270
  }
4068
4271
  }
4069
4272
  async stop() {
4070
- logger13.info("[VisionWorkerManager] Stopping all workers...");
4273
+ logger14.info("[VisionWorkerManager] Stopping all workers...");
4071
4274
  const stopPromises = [];
4072
4275
  if (this.screenCaptureWorker) {
4073
- stopPromises.push(new Promise((resolve) => {
4074
- this.screenCaptureWorker?.once("exit", () => resolve());
4276
+ stopPromises.push(new Promise((resolve2) => {
4277
+ this.screenCaptureWorker?.once("exit", () => resolve2());
4075
4278
  this.screenCaptureWorker?.postMessage({ type: "stop" });
4076
4279
  }));
4077
4280
  }
4078
4281
  if (this.florence2Worker) {
4079
- stopPromises.push(new Promise((resolve) => {
4080
- this.florence2Worker?.once("exit", () => resolve());
4282
+ stopPromises.push(new Promise((resolve2) => {
4283
+ this.florence2Worker?.once("exit", () => resolve2());
4081
4284
  this.florence2Worker?.postMessage({ type: "stop" });
4082
4285
  }));
4083
4286
  }
4084
4287
  if (this.ocrWorker) {
4085
- stopPromises.push(new Promise((resolve) => {
4086
- this.ocrWorker?.once("exit", () => resolve());
4288
+ stopPromises.push(new Promise((resolve2) => {
4289
+ this.ocrWorker?.once("exit", () => resolve2());
4087
4290
  this.ocrWorker?.postMessage({ type: "stop" });
4088
4291
  }));
4089
4292
  }
4090
4293
  await Promise.all(stopPromises);
4091
- logger13.info("[VisionWorkerManager] All workers stopped");
4294
+ logger14.info("[VisionWorkerManager] All workers stopped");
4092
4295
  }
4093
4296
  handleWorkerLog(workerName, msg) {
4094
4297
  const { level, message, args } = msg;
@@ -4096,27 +4299,27 @@ class VisionWorkerManager {
4096
4299
  const stringArgs = args.map((arg) => String(arg));
4097
4300
  switch (level) {
4098
4301
  case "info":
4099
- logger13.info(formattedMessage, ...stringArgs);
4302
+ logger14.info(formattedMessage, ...stringArgs);
4100
4303
  break;
4101
4304
  case "warn":
4102
- logger13.warn(formattedMessage, ...stringArgs);
4305
+ logger14.warn(formattedMessage, ...stringArgs);
4103
4306
  break;
4104
4307
  case "error":
4105
- logger13.error(formattedMessage, ...stringArgs);
4308
+ logger14.error(formattedMessage, ...stringArgs);
4106
4309
  break;
4107
4310
  case "debug":
4108
- logger13.debug(formattedMessage, ...stringArgs);
4311
+ logger14.debug(formattedMessage, ...stringArgs);
4109
4312
  break;
4110
4313
  }
4111
4314
  }
4112
4315
  async restartScreenCaptureWorker() {
4113
4316
  const attempts = this.restartAttempts.get("screenCapture") || 0;
4114
4317
  if (attempts >= this.MAX_RESTART_ATTEMPTS) {
4115
- logger13.error("[VisionWorkerManager] Max restart attempts reached for screen capture worker");
4318
+ logger14.error("[VisionWorkerManager] Max restart attempts reached for screen capture worker");
4116
4319
  return;
4117
4320
  }
4118
4321
  this.restartAttempts.set("screenCapture", attempts + 1);
4119
- logger13.info(`[VisionWorkerManager] Restarting screen capture worker (attempt ${attempts + 1})`);
4322
+ logger14.info(`[VisionWorkerManager] Restarting screen capture worker (attempt ${attempts + 1})`);
4120
4323
  try {
4121
4324
  if (this.screenCaptureWorker) {
4122
4325
  this.screenCaptureWorker.removeAllListeners();
@@ -4125,17 +4328,17 @@ class VisionWorkerManager {
4125
4328
  await this.startScreenCaptureWorker();
4126
4329
  this.restartAttempts.set("screenCapture", 0);
4127
4330
  } catch (error) {
4128
- logger13.error("[VisionWorkerManager] Failed to restart screen capture worker:", error);
4331
+ logger14.error("[VisionWorkerManager] Failed to restart screen capture worker:", error);
4129
4332
  }
4130
4333
  }
4131
4334
  async restartFlorence2Worker() {
4132
4335
  const attempts = this.restartAttempts.get("florence2") || 0;
4133
4336
  if (attempts >= this.MAX_RESTART_ATTEMPTS) {
4134
- logger13.error("[VisionWorkerManager] Max restart attempts reached for Florence2 worker");
4337
+ logger14.error("[VisionWorkerManager] Max restart attempts reached for Florence2 worker");
4135
4338
  return;
4136
4339
  }
4137
4340
  this.restartAttempts.set("florence2", attempts + 1);
4138
- logger13.info(`[VisionWorkerManager] Restarting Florence2 worker (attempt ${attempts + 1})`);
4341
+ logger14.info(`[VisionWorkerManager] Restarting Florence2 worker (attempt ${attempts + 1})`);
4139
4342
  try {
4140
4343
  if (this.florence2Worker) {
4141
4344
  this.florence2Worker.removeAllListeners();
@@ -4144,17 +4347,17 @@ class VisionWorkerManager {
4144
4347
  await this.startFlorence2Worker();
4145
4348
  this.restartAttempts.set("florence2", 0);
4146
4349
  } catch (error) {
4147
- logger13.error("[VisionWorkerManager] Failed to restart Florence2 worker:", error);
4350
+ logger14.error("[VisionWorkerManager] Failed to restart Florence2 worker:", error);
4148
4351
  }
4149
4352
  }
4150
4353
  async restartOCRWorker() {
4151
4354
  const attempts = this.restartAttempts.get("ocr") || 0;
4152
4355
  if (attempts >= this.MAX_RESTART_ATTEMPTS) {
4153
- logger13.error("[VisionWorkerManager] Max restart attempts reached for OCR worker");
4356
+ logger14.error("[VisionWorkerManager] Max restart attempts reached for OCR worker");
4154
4357
  return;
4155
4358
  }
4156
4359
  this.restartAttempts.set("ocr", attempts + 1);
4157
- logger13.info(`[VisionWorkerManager] Restarting OCR worker (attempt ${attempts + 1})`);
4360
+ logger14.info(`[VisionWorkerManager] Restarting OCR worker (attempt ${attempts + 1})`);
4158
4361
  try {
4159
4362
  if (this.ocrWorker) {
4160
4363
  this.ocrWorker.removeAllListeners();
@@ -4163,13 +4366,20 @@ class VisionWorkerManager {
4163
4366
  await this.startOCRWorker();
4164
4367
  this.restartAttempts.set("ocr", 0);
4165
4368
  } catch (error) {
4166
- logger13.error("[VisionWorkerManager] Failed to restart OCR worker:", error);
4369
+ logger14.error("[VisionWorkerManager] Failed to restart OCR worker:", error);
4167
4370
  }
4168
4371
  }
4169
4372
  }
4170
4373
 
4171
4374
  // src/service.ts
4172
4375
  var execAsync3 = promisify3(exec3);
4376
+ var SCENE_DESCRIPTION_PROMPT = JSON.stringify({
4377
+ task: "describe_visual_scene",
4378
+ instructions: [
4379
+ "Describe visible people, objects, UI, text, and notable scene changes.",
4380
+ "Keep the answer concise and factual."
4381
+ ]
4382
+ }, null, 2);
4173
4383
 
4174
4384
  class VisionService extends Service {
4175
4385
  static serviceType = VisionServiceType.VISION;
@@ -4223,7 +4433,7 @@ class VisionService extends Service {
4223
4433
  this.screenCapture = new ScreenCaptureService(this.visionConfig);
4224
4434
  this.florence2 = new Florence2Model;
4225
4435
  this.ocrService = new OCRService;
4226
- logger14.info("[VisionService] Constructed with config:", JSON.stringify(this.visionConfig));
4436
+ logger15.info("[VisionService] Constructed with config:", JSON.stringify(this.visionConfig));
4227
4437
  }
4228
4438
  parseConfig(runtime) {
4229
4439
  const getSettingString = (key) => {
@@ -4278,14 +4488,14 @@ class VisionService extends Service {
4278
4488
  enableObjectDetection: this.visionConfig.enableObjectDetection || false,
4279
4489
  enablePoseDetection: this.visionConfig.enablePoseDetection || false
4280
4490
  });
4281
- logger14.info("[VisionService] Using TensorFlow.js models for advanced detection");
4491
+ logger15.info("[VisionService] Using TensorFlow.js models for advanced detection");
4282
4492
  } catch (_tfError) {
4283
- logger14.warn("[VisionService] TensorFlow.js not available, falling back to enhanced heuristics");
4493
+ logger15.warn("[VisionService] TensorFlow.js not available, falling back to enhanced heuristics");
4284
4494
  await this.visionModels.initialize({
4285
4495
  enableObjectDetection: this.visionConfig.enableObjectDetection || false,
4286
4496
  enablePoseDetection: this.visionConfig.enablePoseDetection || false
4287
4497
  });
4288
- logger14.info("[VisionService] Using enhanced heuristics for detection");
4498
+ logger15.info("[VisionService] Using enhanced heuristics for detection");
4289
4499
  }
4290
4500
  }
4291
4501
  if (this.visionConfig.visionMode === "SCREEN" /* SCREEN */ || this.visionConfig.visionMode === "BOTH" /* BOTH */) {
@@ -4297,18 +4507,18 @@ class VisionService extends Service {
4297
4507
  await this.initializeAudioCapture();
4298
4508
  this.startProcessing();
4299
4509
  } catch (error) {
4300
- logger14.error("[VisionService] Failed to initialize:", error);
4510
+ logger15.error("[VisionService] Failed to initialize:", error);
4301
4511
  }
4302
4512
  }
4303
4513
  async initializeScreenVision() {
4304
4514
  try {
4305
- logger14.info("[VisionService] Initializing screen vision...");
4515
+ logger15.info("[VisionService] Initializing screen vision...");
4306
4516
  const useWorkers = this.visionConfig.targetScreenFPS && this.visionConfig.targetScreenFPS > 10;
4307
4517
  if (useWorkers) {
4308
- logger14.info("[VisionService] Initializing worker threads for high-FPS processing...");
4518
+ logger15.info("[VisionService] Initializing worker threads for high-FPS processing...");
4309
4519
  this.workerManager = new VisionWorkerManager(this.visionConfig);
4310
4520
  await this.workerManager.initialize();
4311
- logger14.info("[VisionService] Worker threads initialized");
4521
+ logger15.info("[VisionService] Worker threads initialized");
4312
4522
  } else {
4313
4523
  if (this.visionConfig.florence2Enabled) {
4314
4524
  await this.florence2.initialize();
@@ -4319,11 +4529,11 @@ class VisionService extends Service {
4319
4529
  }
4320
4530
  const screenInfo = await this.screenCapture.getScreenInfo();
4321
4531
  if (screenInfo) {
4322
- logger14.info(`[VisionService] Screen resolution: ${screenInfo.width}x${screenInfo.height}`);
4532
+ logger15.info(`[VisionService] Screen resolution: ${screenInfo.width}x${screenInfo.height}`);
4323
4533
  }
4324
- logger14.info("[VisionService] Screen vision initialized");
4534
+ logger15.info("[VisionService] Screen vision initialized");
4325
4535
  } catch (error) {
4326
- logger14.error("[VisionService] Failed to initialize screen vision:", error);
4536
+ logger15.error("[VisionService] Failed to initialize screen vision:", error);
4327
4537
  }
4328
4538
  }
4329
4539
  async initializeCameraVision() {
@@ -4331,18 +4541,18 @@ class VisionService extends Service {
4331
4541
  if (!toolCheck.available) {
4332
4542
  const platform = process.platform;
4333
4543
  const toolName = platform === "darwin" ? "imagesnap" : platform === "linux" ? "fswebcam" : "ffmpeg";
4334
- logger14.warn(`[VisionService] Camera capture tool '${toolName}' not found. Install it to enable camera functionality.`);
4335
- logger14.warn("[VisionService] For macOS: brew install imagesnap");
4336
- logger14.warn("[VisionService] For Linux: sudo apt-get install fswebcam");
4337
- logger14.warn("[VisionService] For Windows: Install ffmpeg and add to PATH");
4544
+ logger15.warn(`[VisionService] Camera capture tool '${toolName}' not found. Install it to enable camera functionality.`);
4545
+ logger15.warn("[VisionService] For macOS: brew install imagesnap");
4546
+ logger15.warn("[VisionService] For Linux: sudo apt-get install fswebcam");
4547
+ logger15.warn("[VisionService] For Windows: Install ffmpeg and add to PATH");
4338
4548
  return;
4339
4549
  }
4340
4550
  const camera = await this.findCamera();
4341
4551
  if (camera) {
4342
4552
  this.camera = camera;
4343
- logger14.info(`[VisionService] Connected to camera: ${camera.name}`);
4553
+ logger15.info(`[VisionService] Connected to camera: ${camera.name}`);
4344
4554
  } else {
4345
- logger14.warn("[VisionService] No suitable camera found");
4555
+ logger15.warn("[VisionService] No suitable camera found");
4346
4556
  }
4347
4557
  }
4348
4558
  async initializeAudioCapture() {
@@ -4353,7 +4563,7 @@ class VisionService extends Service {
4353
4563
  const enableMicrophone = getSettingString("ENABLE_MICROPHONE") === "true";
4354
4564
  const useStreamingAudio = getSettingString("USE_STREAMING_AUDIO") === "true";
4355
4565
  if (!enableMicrophone) {
4356
- logger14.info("[VisionService] Microphone capture disabled");
4566
+ logger15.info("[VisionService] Microphone capture disabled");
4357
4567
  return;
4358
4568
  }
4359
4569
  try {
@@ -4378,20 +4588,20 @@ class VisionService extends Service {
4378
4588
  };
4379
4589
  this.streamingAudioCapture = new StreamingAudioCaptureService(this.runtime, streamingConfig);
4380
4590
  this.streamingAudioCapture.on("speechStart", () => {
4381
- logger14.info("[VisionService] User started speaking");
4591
+ logger15.info("[VisionService] User started speaking");
4382
4592
  });
4383
4593
  this.streamingAudioCapture.on("speechEnd", () => {
4384
- logger14.info("[VisionService] User stopped speaking");
4594
+ logger15.info("[VisionService] User stopped speaking");
4385
4595
  });
4386
4596
  this.streamingAudioCapture.on("transcription", (data) => {
4387
- logger14.info(`[VisionService] Transcription (${data.isFinal ? "final" : "partial"}): ${data.text}`);
4597
+ logger15.info(`[VisionService] Transcription (${data.isFinal ? "final" : "partial"}): ${data.text}`);
4388
4598
  });
4389
4599
  this.streamingAudioCapture.on("utteranceComplete", async (text) => {
4390
- logger14.info("[VisionService] Processing complete utterance:", text);
4600
+ logger15.info("[VisionService] Processing complete utterance:", text);
4391
4601
  await this.storeAudioTranscription(text);
4392
4602
  });
4393
4603
  await this.streamingAudioCapture.initialize();
4394
- logger14.info("[VisionService] Streaming audio capture initialized with VAD");
4604
+ logger15.info("[VisionService] Streaming audio capture initialized with VAD");
4395
4605
  } else {
4396
4606
  const getSettingNumber = (key, defaultValue) => {
4397
4607
  const value = this.runtime.getSetting(key);
@@ -4409,10 +4619,10 @@ class VisionService extends Service {
4409
4619
  };
4410
4620
  this.audioCapture = new AudioCaptureService(this.runtime, audioConfig);
4411
4621
  await this.audioCapture.initialize();
4412
- logger14.info("[VisionService] Batch audio capture initialized");
4622
+ logger15.info("[VisionService] Batch audio capture initialized");
4413
4623
  }
4414
4624
  } catch (error) {
4415
- logger14.error("[VisionService] Failed to initialize audio capture:", error);
4625
+ logger15.error("[VisionService] Failed to initialize audio capture:", error);
4416
4626
  }
4417
4627
  }
4418
4628
  async storeAudioTranscription(text) {
@@ -4420,9 +4630,9 @@ class VisionService extends Service {
4420
4630
  if (this.lastSceneDescription) {
4421
4631
  this.lastSceneDescription.audioTranscription = text;
4422
4632
  }
4423
- logger14.debug("[VisionService] Stored audio transcription in scene context");
4633
+ logger15.debug("[VisionService] Stored audio transcription in scene context");
4424
4634
  } catch (error) {
4425
- logger14.error("[VisionService] Failed to store audio transcription:", error);
4635
+ logger15.error("[VisionService] Failed to store audio transcription:", error);
4426
4636
  }
4427
4637
  }
4428
4638
  startProcessing() {
@@ -4443,12 +4653,12 @@ class VisionService extends Service {
4443
4653
  try {
4444
4654
  await this.captureAndProcessFrame();
4445
4655
  } catch (error) {
4446
- logger14.error("[VisionService] Frame processing error:", error);
4656
+ logger15.error("[VisionService] Frame processing error:", error);
4447
4657
  }
4448
4658
  this.isProcessing = false;
4449
4659
  }
4450
4660
  }, this.visionConfig.updateInterval || 100);
4451
- logger14.debug("[VisionService] Started frame processing loop");
4661
+ logger15.debug("[VisionService] Started frame processing loop");
4452
4662
  }
4453
4663
  async captureAndProcessFrame() {
4454
4664
  if (!this.camera) {
@@ -4457,19 +4667,19 @@ class VisionService extends Service {
4457
4667
  try {
4458
4668
  const frameData = await this.camera.capture();
4459
4669
  if (!frameData || frameData.length === 0) {
4460
- logger14.debug("[VisionService] Camera returned empty frame, skipping");
4670
+ logger15.debug("[VisionService] Camera returned empty frame, skipping");
4461
4671
  return;
4462
4672
  }
4463
4673
  const frame = await this.processFrameData(frameData);
4464
4674
  if (!frame || frame.width === 0 || frame.height === 0) {
4465
- logger14.warn("[VisionService] Invalid frame dimensions, skipping");
4675
+ logger15.warn("[VisionService] Invalid frame dimensions, skipping");
4466
4676
  return;
4467
4677
  }
4468
4678
  const changePercentage = this.lastFrame ? await this.calculatePixelChange(this.lastFrame, frame) : 100;
4469
4679
  await this.updateSceneDescription(frame, changePercentage);
4470
4680
  this.lastFrame = frame;
4471
4681
  } catch (error) {
4472
- logger14.error("[VisionService] Error capturing frame:", error);
4682
+ logger15.error("[VisionService] Error capturing frame:", error);
4473
4683
  }
4474
4684
  }
4475
4685
  async processFrameData(data) {
@@ -4516,7 +4726,8 @@ class VisionService extends Service {
4516
4726
  async updateSceneDescription(frame, changePercentage) {
4517
4727
  try {
4518
4728
  const currentTime = Date.now();
4519
- const jpegBuffer = await sharp4(frame.data, {
4729
+ const testImage = getTestImage();
4730
+ const jpegBuffer = testImage ? await sharp4(testImage).jpeg().toBuffer() : await sharp4(frame.data, {
4520
4731
  raw: {
4521
4732
  width: frame.width,
4522
4733
  height: frame.height,
@@ -4534,7 +4745,7 @@ class VisionService extends Service {
4534
4745
  description = await this.describeSceneWithVLM(imageUrl);
4535
4746
  this.lastVlmUpdateTime = currentTime;
4536
4747
  this.lastTfDescription = description;
4537
- logger14.debug(`[VisionService] VLM updated: ${timeSinceVlmUpdate}ms since last update, ${changePercentage.toFixed(1)}% change`);
4748
+ logger15.debug(`[VisionService] VLM updated: ${timeSinceVlmUpdate}ms since last update, ${changePercentage.toFixed(1)}% change`);
4538
4749
  }
4539
4750
  const timeSinceTfUpdate = currentTime - this.lastTfUpdateTime;
4540
4751
  const tfUpdateInterval = this.visionConfig.tfUpdateInterval ?? 1000;
@@ -4544,18 +4755,18 @@ class VisionService extends Service {
4544
4755
  let people = [];
4545
4756
  if (shouldUpdateTf && (this.visionConfig.enableObjectDetection || this.visionConfig.enablePoseDetection)) {
4546
4757
  this.lastTfUpdateTime = currentTime;
4547
- logger14.debug(`[VisionService] TF updating: ${timeSinceTfUpdate}ms since last update, ${changePercentage.toFixed(1)}% change`);
4758
+ logger15.debug(`[VisionService] TF updating: ${timeSinceTfUpdate}ms since last update, ${changePercentage.toFixed(1)}% change`);
4548
4759
  if (this.visionConfig.enableObjectDetection) {
4549
4760
  if (this.visionModels.hasObjectDetection()) {
4550
4761
  detectedObjects = await this.visionModels.detectObjects(frame.data, frame.width, frame.height);
4551
- logger14.debug(`[VisionService] VisionModels detected ${detectedObjects.length} objects`);
4762
+ logger15.debug(`[VisionService] VisionModels detected ${detectedObjects.length} objects`);
4552
4763
  }
4553
4764
  }
4554
4765
  if (this.visionConfig.enablePoseDetection) {
4555
4766
  if (this.visionModels.hasPoseDetection()) {
4556
4767
  const poses = await this.visionModels.detectPoses(frame.data, frame.width, frame.height);
4557
4768
  people = poses;
4558
- logger14.debug(`[VisionService] VisionModels detected ${people.length} people with poses`);
4769
+ logger15.debug(`[VisionService] VisionModels detected ${people.length} people with poses`);
4559
4770
  }
4560
4771
  }
4561
4772
  if (people.length === 0 && detectedObjects.length > 0) {
@@ -4583,8 +4794,8 @@ class VisionService extends Service {
4583
4794
  const enableFaceRecognition = getSettingString("ENABLE_FACE_RECOGNITION") === "true";
4584
4795
  if (enableFaceRecognition && people.length > 0 && frame.width > 0 && frame.height > 0) {
4585
4796
  try {
4586
- if (!frame.data || frame.data.length === 0) {
4587
- logger14.warn("[VisionService] Invalid frame data for face recognition");
4797
+ if (frame.data.length === 0) {
4798
+ logger15.warn("[VisionService] Invalid frame data for face recognition");
4588
4799
  return;
4589
4800
  }
4590
4801
  const faces = await this.faceRecognition.detectFaces(frame.data, frame.width, frame.height);
@@ -4602,7 +4813,7 @@ class VisionService extends Service {
4602
4813
  let profileId;
4603
4814
  if (match) {
4604
4815
  profileId = match.profileId;
4605
- logger14.debug(`[VisionService] Recognized face: ${profileId} (distance: ${match.distance})`);
4816
+ logger15.debug(`[VisionService] Recognized face: ${profileId} (distance: ${match.distance})`);
4606
4817
  } else {
4607
4818
  profileId = await this.faceRecognition.addOrUpdateFace(face.descriptor, {
4608
4819
  attributes: {
@@ -4611,7 +4822,7 @@ class VisionService extends Service {
4611
4822
  emotion: face.expressions ? this.getDominantExpression(face.expressions) : undefined
4612
4823
  }
4613
4824
  });
4614
- logger14.info(`[VisionService] New face registered: ${profileId}`);
4825
+ logger15.info(`[VisionService] New face registered: ${profileId}`);
4615
4826
  }
4616
4827
  faceProfiles.set(person.id, profileId);
4617
4828
  break;
@@ -4619,7 +4830,7 @@ class VisionService extends Service {
4619
4830
  }
4620
4831
  }
4621
4832
  } catch (faceError) {
4622
- logger14.error("[VisionService] Face recognition error:", faceError);
4833
+ logger15.error("[VisionService] Face recognition error:", faceError);
4623
4834
  }
4624
4835
  }
4625
4836
  const _trackedEntities = await this.entityTracker.updateEntities(detectedObjects, people, faceProfiles, this.runtime);
@@ -4632,51 +4843,75 @@ class VisionService extends Service {
4632
4843
  changePercentage
4633
4844
  };
4634
4845
  if (shouldUpdateVlm || shouldUpdateTf) {
4635
- logger14.info("[VisionService] Scene Analysis Complete:");
4636
- logger14.info(` VLM Description: ${description.substring(0, 100)}...`);
4637
- logger14.info(` Change: ${changePercentage.toFixed(1)}%`);
4638
- logger14.info(` Updates: ${shouldUpdateVlm ? "VLM" : ""}${shouldUpdateVlm && shouldUpdateTf ? " + " : ""}${shouldUpdateTf ? "TF" : ""}`);
4639
- logger14.info(` Detection Mode: ${this.visionConfig.enableObjectDetection ? "Advanced CV" : "Motion-based"}`);
4846
+ logger15.info("[VisionService] Scene Analysis Complete:");
4847
+ logger15.info(` VLM Description: ${description.substring(0, 100)}...`);
4848
+ logger15.info(` Change: ${changePercentage.toFixed(1)}%`);
4849
+ logger15.info(` Updates: ${shouldUpdateVlm ? "VLM" : ""}${shouldUpdateVlm && shouldUpdateTf ? " + " : ""}${shouldUpdateTf ? "TF" : ""}`);
4850
+ logger15.info(` Detection Mode: ${this.visionConfig.enableObjectDetection ? "Advanced CV" : "Motion-based"}`);
4640
4851
  if (detectedObjects.length > 0) {
4641
- logger14.info(` Objects: ${detectedObjects.length} detected`);
4852
+ logger15.info(` Objects: ${detectedObjects.length} detected`);
4642
4853
  const objectSummary = detectedObjects.reduce((acc, obj) => {
4643
4854
  acc[obj.type] = (acc[obj.type] || 0) + 1;
4644
4855
  return acc;
4645
4856
  }, {});
4646
4857
  for (const [type, count] of Object.entries(objectSummary)) {
4647
- logger14.info(` - ${count} ${type}(s)`);
4858
+ logger15.info(` - ${count} ${type}(s)`);
4648
4859
  }
4649
4860
  }
4650
4861
  if (people.length > 0) {
4651
- logger14.info(` People: ${people.length} detected`);
4862
+ logger15.info(` People: ${people.length} detected`);
4652
4863
  for (const person of people) {
4653
- logger14.info(` - Person: ${person.pose} pose, facing ${person.facing}, confidence: ${person.confidence.toFixed(2)}`);
4864
+ logger15.info(` - Person: ${person.pose} pose, facing ${person.facing}, confidence: ${person.confidence.toFixed(2)}`);
4654
4865
  }
4655
4866
  }
4656
4867
  }
4657
4868
  } catch (error) {
4658
- logger14.error("[VisionService] Failed to update scene description:", error);
4869
+ logger15.error("[VisionService] Failed to update scene description:", error);
4659
4870
  }
4660
4871
  }
4661
4872
  async describeSceneWithVLM(imageUrl) {
4873
+ return withStandaloneTrajectory3(this.runtime, {
4874
+ source: "plugin-vision:scene-description",
4875
+ metadata: { modelType: ModelType3.IMAGE_DESCRIPTION }
4876
+ }, () => this.describeSceneWithVLMInTrajectory(imageUrl));
4877
+ }
4878
+ async describeSceneWithVLMInTrajectory(imageUrl) {
4662
4879
  try {
4663
4880
  if (imageUrl.startsWith("data:image/")) {
4664
4881
  const base64Data = imageUrl.split(",")[1];
4665
4882
  const imageBuffer = Buffer.from(base64Data, "base64");
4666
4883
  if (this.florence2.isInitialized()) {
4667
4884
  try {
4668
- const result = await this.florence2.analyzeImage(imageBuffer);
4885
+ const result = await recordLlmCall(this.runtime, {
4886
+ model: "florence2-local",
4887
+ systemPrompt: "",
4888
+ userPrompt: JSON.stringify({
4889
+ task: "describe_visual_scene",
4890
+ image: {
4891
+ source: "camera_frame",
4892
+ mimeType: "image/jpeg",
4893
+ bytes: imageBuffer.byteLength
4894
+ }
4895
+ }, null, 2),
4896
+ temperature: 0,
4897
+ maxTokens: 0,
4898
+ purpose: "background",
4899
+ actionType: "florence2.analyzeImage"
4900
+ }, () => this.florence2.analyzeImage(imageBuffer));
4669
4901
  if (result.caption) {
4670
- logger14.debug("[VisionService] Florence-2 description:", result.caption);
4902
+ logger15.debug("[VisionService] Florence-2 description:", result.caption);
4671
4903
  return result.caption;
4672
4904
  }
4673
4905
  } catch (florenceError) {
4674
- logger14.warn("[VisionService] Florence-2 analysis failed, falling back:", florenceError);
4906
+ logger15.warn("[VisionService] Florence-2 analysis failed, falling back:", florenceError);
4675
4907
  }
4676
4908
  }
4677
4909
  }
4678
4910
  try {
4679
- const result = await this.runtime.useModel(ModelType3.IMAGE_DESCRIPTION, imageUrl);
4911
+ const result = await this.runtime.useModel(ModelType3.IMAGE_DESCRIPTION, {
4912
+ imageUrl,
4913
+ prompt: SCENE_DESCRIPTION_PROMPT
4914
+ });
4680
4915
  if (result && typeof result === "object" && "description" in result) {
4681
4916
  const description = result.description;
4682
4917
  if (!description.includes("I'm unable to analyze images") && !description.includes("I can't analyze images")) {
@@ -4689,7 +4924,7 @@ class VisionService extends Service {
4689
4924
  }
4690
4925
  }
4691
4926
  } catch (modelError) {
4692
- logger14.warn("[VisionService] Runtime IMAGE_DESCRIPTION model failed:", modelError);
4927
+ logger15.warn("[VisionService] Runtime IMAGE_DESCRIPTION model failed:", modelError);
4693
4928
  }
4694
4929
  if (this.lastSceneDescription) {
4695
4930
  const { objects, people } = this.lastSceneDescription;
@@ -4715,7 +4950,7 @@ class VisionService extends Service {
4715
4950
  }
4716
4951
  return "Visual scene captured";
4717
4952
  } catch (error) {
4718
- logger14.error("[VisionService] VLM description failed:", error);
4953
+ logger15.error("[VisionService] VLM description failed:", error);
4719
4954
  return "Unable to describe scene";
4720
4955
  }
4721
4956
  }
@@ -4879,12 +5114,12 @@ class VisionService extends Service {
4879
5114
  try {
4880
5115
  await this.captureAndProcessScreen();
4881
5116
  } catch (error) {
4882
- logger14.error("[VisionService] Screen processing error:", error);
5117
+ logger15.error("[VisionService] Screen processing error:", error);
4883
5118
  }
4884
5119
  this.isProcessingScreen = false;
4885
5120
  }
4886
5121
  }, this.visionConfig.screenCaptureInterval || 2000);
4887
- logger14.debug("[VisionService] Started screen processing loop");
5122
+ logger15.debug("[VisionService] Started screen processing loop");
4888
5123
  }
4889
5124
  async captureAndProcessScreen() {
4890
5125
  try {
@@ -4897,7 +5132,7 @@ class VisionService extends Service {
4897
5132
  }
4898
5133
  await this.updateEnhancedSceneDescription();
4899
5134
  } catch (error) {
4900
- logger14.error("[VisionService] Error capturing screen:", error);
5135
+ logger15.error("[VisionService] Error capturing screen:", error);
4901
5136
  }
4902
5137
  }
4903
5138
  async analyzeTile(tile) {
@@ -4922,7 +5157,7 @@ class VisionService extends Service {
4922
5157
  }));
4923
5158
  }
4924
5159
  } catch (error) {
4925
- logger14.error("[VisionService] Error analyzing tile:", error);
5160
+ logger15.error("[VisionService] Error analyzing tile:", error);
4926
5161
  }
4927
5162
  return analysis;
4928
5163
  }
@@ -4984,11 +5219,11 @@ class VisionService extends Service {
4984
5219
  return this.visionConfig.visionMode || "CAMERA" /* CAMERA */;
4985
5220
  }
4986
5221
  async setVisionMode(mode) {
4987
- logger14.info(`[VisionService] Changing vision mode from ${this.visionConfig.visionMode} to ${mode}`);
5222
+ logger15.info(`[VisionService] Changing vision mode from ${this.visionConfig.visionMode} to ${mode}`);
4988
5223
  this.stopProcessing();
4989
5224
  this.visionConfig.visionMode = mode;
4990
5225
  if (mode === "OFF" /* OFF */) {
4991
- logger14.info("[VisionService] Vision disabled");
5226
+ logger15.info("[VisionService] Vision disabled");
4992
5227
  return;
4993
5228
  }
4994
5229
  if ((mode === "CAMERA" /* CAMERA */ || mode === "BOTH" /* BOTH */) && !this.camera) {
@@ -5063,7 +5298,7 @@ class VisionService extends Service {
5063
5298
  return this.faceRecognition;
5064
5299
  }
5065
5300
  async stop() {
5066
- logger14.info("[VisionService] Stopping vision service...");
5301
+ logger15.info("[VisionService] Stopping vision service...");
5067
5302
  this.stopProcessing();
5068
5303
  if (this.audioCapture) {
5069
5304
  await this.audioCapture.stop();
@@ -5089,13 +5324,13 @@ class VisionService extends Service {
5089
5324
  this.isProcessingScreen = false;
5090
5325
  await this.florence2.dispose();
5091
5326
  await this.ocrService.dispose();
5092
- logger14.info("[VisionService] Stopped.");
5327
+ logger15.info("[VisionService] Stopped.");
5093
5328
  }
5094
5329
  async findCamera() {
5095
5330
  try {
5096
5331
  const cameras = await this.listCameras();
5097
5332
  if (cameras.length === 0) {
5098
- logger14.warn("[VisionService] No cameras detected");
5333
+ logger15.warn("[VisionService] No cameras detected");
5099
5334
  return null;
5100
5335
  }
5101
5336
  if (this.visionConfig.cameraName) {
@@ -5104,11 +5339,11 @@ class VisionService extends Service {
5104
5339
  if (matchedCamera) {
5105
5340
  return this.createCameraDevice(matchedCamera);
5106
5341
  }
5107
- logger14.warn(`[VisionService] Camera "${this.visionConfig.cameraName}" not found, using default`);
5342
+ logger15.warn(`[VisionService] Camera "${this.visionConfig.cameraName}" not found, using default`);
5108
5343
  }
5109
5344
  return this.createCameraDevice(cameras[0]);
5110
5345
  } catch (error) {
5111
- logger14.error("[VisionService] Error finding camera:", error);
5346
+ logger15.error("[VisionService] Error finding camera:", error);
5112
5347
  return null;
5113
5348
  }
5114
5349
  }
@@ -5166,7 +5401,7 @@ class VisionService extends Service {
5166
5401
  }
5167
5402
  return [];
5168
5403
  } catch (error) {
5169
- logger14.error("[VisionService] Error listing cameras:", error);
5404
+ logger15.error("[VisionService] Error listing cameras:", error);
5170
5405
  return [];
5171
5406
  }
5172
5407
  }
@@ -5223,13 +5458,13 @@ class VisionService extends Service {
5223
5458
  }
5224
5459
  async captureImage() {
5225
5460
  if (!this.camera) {
5226
- logger14.warn("[VisionService] No camera available for capture");
5461
+ logger15.warn("[VisionService] No camera available for capture");
5227
5462
  return null;
5228
5463
  }
5229
5464
  try {
5230
5465
  return await this.camera.capture();
5231
5466
  } catch (error) {
5232
- logger14.error("[VisionService] Failed to capture image:", error);
5467
+ logger15.error("[VisionService] Failed to capture image:", error);
5233
5468
  return null;
5234
5469
  }
5235
5470
  }
@@ -5250,7 +5485,7 @@ class ScreenVisionE2ETestSuite {
5250
5485
  throw new Error("Vision service not available");
5251
5486
  }
5252
5487
  await visionService.setVisionMode("SCREEN" /* SCREEN */);
5253
- await new Promise((resolve) => setTimeout(resolve, 2000));
5488
+ await new Promise((resolve2) => setTimeout(resolve2, 2000));
5254
5489
  const mode = visionService.getVisionMode();
5255
5490
  if (mode !== "SCREEN" /* SCREEN */) {
5256
5491
  throw new Error(`Expected vision mode SCREEN but got ${mode}`);
@@ -5273,7 +5508,7 @@ class ScreenVisionE2ETestSuite {
5273
5508
  throw new Error("Vision service not available");
5274
5509
  }
5275
5510
  await visionService.setVisionMode("SCREEN" /* SCREEN */);
5276
- await new Promise((resolve) => setTimeout(resolve, 3000));
5511
+ await new Promise((resolve2) => setTimeout(resolve2, 3000));
5277
5512
  const screenCapture = await visionService.getScreenCapture();
5278
5513
  if (!screenCapture) {
5279
5514
  console.warn("⚠️ No screen capture available - screen capture may not be supported in this environment");
@@ -5303,7 +5538,7 @@ class ScreenVisionE2ETestSuite {
5303
5538
  throw new Error("Vision service not available");
5304
5539
  }
5305
5540
  await visionService.setVisionMode("SCREEN" /* SCREEN */);
5306
- await new Promise((resolve) => setTimeout(resolve, 5000));
5541
+ await new Promise((resolve2) => setTimeout(resolve2, 5000));
5307
5542
  const enhancedScene = await visionService.getEnhancedSceneDescription();
5308
5543
  if (!enhancedScene || !enhancedScene.screenAnalysis) {
5309
5544
  console.warn("⚠️ No enhanced scene analysis available yet");
@@ -5342,18 +5577,23 @@ class ScreenVisionE2ETestSuite {
5342
5577
  if (!visionService) {
5343
5578
  throw new Error("Vision service not available");
5344
5579
  }
5345
- const modes = ["CAMERA" /* CAMERA */, "SCREEN" /* SCREEN */, "BOTH" /* BOTH */, "OFF" /* OFF */];
5580
+ const modes = [
5581
+ "CAMERA" /* CAMERA */,
5582
+ "SCREEN" /* SCREEN */,
5583
+ "BOTH" /* BOTH */,
5584
+ "OFF" /* OFF */
5585
+ ];
5346
5586
  for (const mode of modes) {
5347
5587
  console.log(` Switching to ${mode} mode...`);
5348
5588
  await visionService.setVisionMode(mode);
5349
- await new Promise((resolve) => setTimeout(resolve, 1000));
5589
+ await new Promise((resolve2) => setTimeout(resolve2, 1000));
5350
5590
  const currentMode = visionService.getVisionMode();
5351
5591
  if (currentMode !== mode) {
5352
5592
  throw new Error(`Failed to switch to ${mode} mode, current mode is ${currentMode}`);
5353
5593
  }
5354
5594
  console.log(` ✓ Successfully switched to ${mode} mode`);
5355
5595
  }
5356
- console.log(" Testing SET_VISION_MODE action...");
5596
+ console.log(" Testing VISION action with op=set_mode...");
5357
5597
  const message = {
5358
5598
  id: createUniqueUuid3(runtime, "test-msg"),
5359
5599
  entityId: runtime.agentId,
@@ -5363,13 +5603,13 @@ class ScreenVisionE2ETestSuite {
5363
5603
  createdAt: Date.now()
5364
5604
  };
5365
5605
  let callbackCalled = false;
5366
- await setVisionModeAction.handler(runtime, message, { values: {}, data: {}, text: "" }, {}, async (response) => {
5606
+ await visionAction.handler(runtime, message, { values: {}, data: {}, text: "" }, { parameters: { op: "set_mode" } }, async (response) => {
5367
5607
  callbackCalled = true;
5368
5608
  console.log(` Action response: ${response.text}`);
5369
5609
  return [];
5370
5610
  });
5371
5611
  if (!callbackCalled) {
5372
- throw new Error("SET_VISION_MODE action did not call callback");
5612
+ throw new Error("VISION set_mode op did not call callback");
5373
5613
  }
5374
5614
  const finalMode = visionService.getVisionMode();
5375
5615
  if (finalMode !== "BOTH" /* BOTH */) {
@@ -5387,7 +5627,7 @@ class ScreenVisionE2ETestSuite {
5387
5627
  throw new Error("Vision service not available");
5388
5628
  }
5389
5629
  await visionService.setVisionMode("BOTH" /* BOTH */);
5390
- await new Promise((resolve) => setTimeout(resolve, 5000));
5630
+ await new Promise((resolve2) => setTimeout(resolve2, 5000));
5391
5631
  const enhancedScene = await visionService.getEnhancedSceneDescription();
5392
5632
  const hasCamera = visionService.getCameraInfo() !== null;
5393
5633
  const hasScreen = await visionService.getScreenCapture() !== null;
@@ -5428,19 +5668,22 @@ class ScreenVisionE2ETestSuite {
5428
5668
  if (!visionService) {
5429
5669
  throw new Error("Vision service not available");
5430
5670
  }
5431
- const testableService = visionService;
5432
- const originalConfig = testableService.visionConfig;
5433
- testableService.visionConfig.screenRegion = {
5434
- x: -100,
5435
- y: -100,
5436
- width: 50000,
5437
- height: 50000
5671
+ const originalConfig = Reflect.get(visionService, "visionConfig");
5672
+ const invalidConfig = {
5673
+ ...originalConfig,
5674
+ screenRegion: {
5675
+ x: -100,
5676
+ y: -100,
5677
+ width: 50000,
5678
+ height: 50000
5679
+ }
5438
5680
  };
5681
+ Reflect.set(visionService, "visionConfig", invalidConfig);
5439
5682
  await visionService.setVisionMode("SCREEN" /* SCREEN */);
5440
- await new Promise((resolve) => setTimeout(resolve, 2000));
5683
+ await new Promise((resolve2) => setTimeout(resolve2, 2000));
5441
5684
  const isActive = visionService.isActive();
5442
5685
  console.log(` Service active after invalid config: ${isActive}`);
5443
- testableService.visionConfig = originalConfig;
5686
+ Reflect.set(visionService, "visionConfig", originalConfig);
5444
5687
  console.log("✓ Error handling works correctly");
5445
5688
  }
5446
5689
  }
@@ -5489,13 +5732,13 @@ class VisionBasicE2ETestSuite {
5489
5732
  let callbackResponse = null;
5490
5733
  const state = { values: {}, data: {}, text: "" };
5491
5734
  const visionService = runtime.getService("VISION");
5492
- const isValid = await describeSceneAction.validate(runtime, message, state);
5735
+ const isValid = await visionAction.validate(runtime, message, state);
5493
5736
  if (!visionService || !visionService.isActive()) {
5494
5737
  if (isValid) {
5495
5738
  throw new Error("Action validation should return false when vision service is not active");
5496
5739
  }
5497
5740
  console.log(" Action validation correctly returned false (vision not active)");
5498
- await describeSceneAction.handler(runtime, message, state, {}, async (response) => {
5741
+ await visionAction.handler(runtime, message, state, { parameters: { op: "describe" } }, async (response) => {
5499
5742
  callbackCalled = true;
5500
5743
  callbackResponse = response;
5501
5744
  return [];
@@ -5513,10 +5756,10 @@ class VisionBasicE2ETestSuite {
5513
5756
  }
5514
5757
  } else {
5515
5758
  if (!isValid) {
5516
- throw new Error("describeSceneAction validation failed despite active vision");
5759
+ throw new Error("visionAction validation failed despite active vision");
5517
5760
  }
5518
5761
  console.log(" Action validation: passed");
5519
- await describeSceneAction.handler(runtime, message, state, {}, async (response) => {
5762
+ await visionAction.handler(runtime, message, state, { parameters: { op: "describe" } }, async (response) => {
5520
5763
  callbackCalled = true;
5521
5764
  callbackResponse = response;
5522
5765
  return [];
@@ -5533,8 +5776,8 @@ class VisionBasicE2ETestSuite {
5533
5776
  console.log(` Thought: ${callbackResponse.thought}`);
5534
5777
  }
5535
5778
  }
5536
- if (!callbackResponse.actions || !callbackResponse.actions.includes("DESCRIBE_SCENE")) {
5537
- throw new Error("Response does not include DESCRIBE_SCENE action");
5779
+ if (!callbackResponse.actions || !callbackResponse.actions.includes("VISION")) {
5780
+ throw new Error("Response does not include VISION action");
5538
5781
  }
5539
5782
  }
5540
5783
  },
@@ -5555,13 +5798,13 @@ class VisionBasicE2ETestSuite {
5555
5798
  let callbackResponse = null;
5556
5799
  const state = { values: {}, data: {}, text: "" };
5557
5800
  const visionService = runtime.getService("VISION");
5558
- const isValid = await captureImageAction.validate(runtime, message, state);
5801
+ const isValid = await visionAction.validate(runtime, message, state);
5559
5802
  if (!visionService || !visionService.isActive()) {
5560
5803
  if (isValid) {
5561
5804
  throw new Error("Action validation should return false when vision service is not active");
5562
5805
  }
5563
5806
  console.log(" Action validation correctly returned false (vision not active)");
5564
- await captureImageAction.handler(runtime, message, state, {}, async (response) => {
5807
+ await visionAction.handler(runtime, message, state, { parameters: { op: "capture" } }, async (response) => {
5565
5808
  callbackCalled = true;
5566
5809
  callbackResponse = response;
5567
5810
  return [];
@@ -5579,10 +5822,10 @@ class VisionBasicE2ETestSuite {
5579
5822
  }
5580
5823
  } else {
5581
5824
  if (!isValid) {
5582
- throw new Error("captureImageAction validation failed despite active vision");
5825
+ throw new Error("visionAction validation failed despite active vision");
5583
5826
  }
5584
5827
  console.log(" Action validation: passed");
5585
- await captureImageAction.handler(runtime, message, state, {}, async (response) => {
5828
+ await visionAction.handler(runtime, message, state, { parameters: { op: "capture" } }, async (response) => {
5586
5829
  callbackCalled = true;
5587
5830
  callbackResponse = response;
5588
5831
  return [];
@@ -5604,8 +5847,8 @@ class VisionBasicE2ETestSuite {
5604
5847
  }
5605
5848
  console.log(` ✓ Image attachment valid: ${attachment.title}`);
5606
5849
  }
5607
- if (!callbackResponse.actions || !callbackResponse.actions.includes("CAPTURE_IMAGE")) {
5608
- throw new Error("Response does not include CAPTURE_IMAGE action");
5850
+ if (!callbackResponse.actions || !callbackResponse.actions.includes("VISION")) {
5851
+ throw new Error("Response does not include VISION action");
5609
5852
  }
5610
5853
  }
5611
5854
  },
@@ -5652,7 +5895,7 @@ class VisionBasicE2ETestSuite {
5652
5895
  }
5653
5896
  const initialScene = await visionService.getSceneDescription();
5654
5897
  console.log(` Initial scene: ${initialScene ? "Available" : "Pending..."}`);
5655
- await new Promise((resolve) => setTimeout(resolve, 2000));
5898
+ await new Promise((resolve2) => setTimeout(resolve2, 2000));
5656
5899
  const updatedScene = await visionService.getSceneDescription();
5657
5900
  if (!updatedScene) {
5658
5901
  throw new Error("No scene description available after 2 seconds");
@@ -5679,7 +5922,7 @@ class VisionBasicE2ETestSuite {
5679
5922
  return;
5680
5923
  }
5681
5924
  console.log(" Waiting for scene analysis...");
5682
- await new Promise((resolve) => setTimeout(resolve, 3000));
5925
+ await new Promise((resolve2) => setTimeout(resolve2, 3000));
5683
5926
  const scene = await visionService.getSceneDescription();
5684
5927
  if (!scene) {
5685
5928
  throw new Error("No scene description available after 3 seconds");
@@ -5869,7 +6112,7 @@ class VisionCaptureLogTestSuite {
5869
6112
  const captureDuration = captureEndTime - captureStartTime;
5870
6113
  const waitTime = Math.max(0, captureInterval - captureDuration);
5871
6114
  if (waitTime > 0) {
5872
- await new Promise((resolve) => setTimeout(resolve, waitTime));
6115
+ await new Promise((resolve2) => setTimeout(resolve2, waitTime));
5873
6116
  }
5874
6117
  }
5875
6118
  captureData.endTime = new Date().toISOString();
@@ -5942,7 +6185,7 @@ ${captureData.captures.filter((c) => c.scene?.description).slice(0, 5).map((c, _
5942
6185
  }
5943
6186
  var vision_capture_log_default = new VisionCaptureLogTestSuite;
5944
6187
  // src/tests/e2e/vision-runtime.ts
5945
- import { logger as logger15 } from "@elizaos/core";
6188
+ import { logger as logger16 } from "@elizaos/core";
5946
6189
  class VisionRuntimeTestSuite {
5947
6190
  name = "vision-runtime-tests";
5948
6191
  description = "Real runtime tests for vision plugin functionality";
@@ -5950,7 +6193,7 @@ class VisionRuntimeTestSuite {
5950
6193
  {
5951
6194
  name: "Vision service initialization",
5952
6195
  fn: async (runtime) => {
5953
- logger15.info("[Test] Testing vision service initialization...");
6196
+ logger16.info("[Test] Testing vision service initialization...");
5954
6197
  const visionService = runtime.getService(VisionServiceType.VISION);
5955
6198
  if (!visionService) {
5956
6199
  throw new Error("Vision service not found in runtime");
@@ -5959,24 +6202,24 @@ class VisionRuntimeTestSuite {
5959
6202
  throw new Error("Vision service missing isActive method");
5960
6203
  }
5961
6204
  const isActive = visionService.isActive();
5962
- logger15.info(`[Test] Vision service active: ${isActive}`);
6205
+ logger16.info(`[Test] Vision service active: ${isActive}`);
5963
6206
  if (!isActive && runtime.getSetting("VISION_MODE") !== "OFF" /* OFF */) {
5964
6207
  throw new Error("Vision service should be active but is not");
5965
6208
  }
5966
- logger15.info("[Test] ✅ Vision service initialization test passed");
6209
+ logger16.info("[Test] ✅ Vision service initialization test passed");
5967
6210
  }
5968
6211
  },
5969
6212
  {
5970
6213
  name: "Scene description functionality",
5971
6214
  fn: async (runtime) => {
5972
- logger15.info("[Test] Testing scene description...");
6215
+ logger16.info("[Test] Testing scene description...");
5973
6216
  const visionService = runtime.getService(VisionServiceType.VISION);
5974
6217
  if (!visionService) {
5975
6218
  throw new Error("Vision service not found");
5976
6219
  }
5977
6220
  const scene = await visionService.getSceneDescription();
5978
6221
  if (!scene) {
5979
- logger15.warn("[Test] No scene description available (camera might not be connected)");
6222
+ logger16.warn("[Test] No scene description available (camera might not be connected)");
5980
6223
  return;
5981
6224
  }
5982
6225
  if (typeof scene.timestamp !== "number") {
@@ -5991,21 +6234,21 @@ class VisionRuntimeTestSuite {
5991
6234
  if (!Array.isArray(scene.people)) {
5992
6235
  throw new Error("Scene description missing people array");
5993
6236
  }
5994
- logger15.info(`[Test] Scene: ${scene.description.substring(0, 100)}...`);
5995
- logger15.info(`[Test] Objects: ${scene.objects.length}, People: ${scene.people.length}`);
5996
- logger15.info("[Test] ✅ Scene description test passed");
6237
+ logger16.info(`[Test] Scene: ${scene.description.substring(0, 100)}...`);
6238
+ logger16.info(`[Test] Objects: ${scene.objects.length}, People: ${scene.people.length}`);
6239
+ logger16.info("[Test] ✅ Scene description test passed");
5997
6240
  }
5998
6241
  },
5999
6242
  {
6000
6243
  name: "Vision mode switching",
6001
6244
  fn: async (runtime) => {
6002
- logger15.info("[Test] Testing vision mode switching...");
6245
+ logger16.info("[Test] Testing vision mode switching...");
6003
6246
  const visionService = runtime.getService(VisionServiceType.VISION);
6004
6247
  if (!visionService) {
6005
6248
  throw new Error("Vision service not found");
6006
6249
  }
6007
6250
  const originalMode = visionService.getVisionMode();
6008
- logger15.info(`[Test] Original mode: ${originalMode}`);
6251
+ logger16.info(`[Test] Original mode: ${originalMode}`);
6009
6252
  const testModes = [
6010
6253
  "CAMERA" /* CAMERA */,
6011
6254
  "SCREEN" /* SCREEN */,
@@ -6013,7 +6256,7 @@ class VisionRuntimeTestSuite {
6013
6256
  "OFF" /* OFF */
6014
6257
  ];
6015
6258
  for (const mode of testModes) {
6016
- logger15.info(`[Test] Switching to mode: ${mode}`);
6259
+ logger16.info(`[Test] Switching to mode: ${mode}`);
6017
6260
  await visionService.setVisionMode(mode);
6018
6261
  const currentMode = visionService.getVisionMode();
6019
6262
  if (currentMode !== mode) {
@@ -6021,16 +6264,16 @@ class VisionRuntimeTestSuite {
6021
6264
  }
6022
6265
  }
6023
6266
  await visionService.setVisionMode(originalMode);
6024
- logger15.info("[Test] ✅ Vision mode switching test passed");
6267
+ logger16.info("[Test] ✅ Vision mode switching test passed");
6025
6268
  }
6026
6269
  },
6027
6270
  {
6028
- name: "DESCRIBE_SCENE action execution",
6271
+ name: "VISION action describe op execution",
6029
6272
  fn: async (runtime) => {
6030
- logger15.info("[Test] Testing DESCRIBE_SCENE action...");
6031
- const action = runtime.actions.find((a) => a.name === "DESCRIBE_SCENE");
6273
+ logger16.info("[Test] Testing VISION action with op=describe...");
6274
+ const action = runtime.actions.find((a) => a.name === "VISION");
6032
6275
  if (!action) {
6033
- throw new Error("DESCRIBE_SCENE action not found");
6276
+ throw new Error("VISION action not found");
6034
6277
  }
6035
6278
  const message = {
6036
6279
  id: `test-msg-${Date.now()}`,
@@ -6044,27 +6287,27 @@ class VisionRuntimeTestSuite {
6044
6287
  };
6045
6288
  const isValid = await action.validate(runtime, message);
6046
6289
  if (!isValid) {
6047
- throw new Error("DESCRIBE_SCENE action validation failed");
6290
+ throw new Error("VISION action validation failed");
6048
6291
  }
6049
6292
  let responseReceived = false;
6050
6293
  const callback = async (response) => {
6051
6294
  if (response.text && response.text.length > 0) {
6052
6295
  responseReceived = true;
6053
- logger15.info(`[Test] Action response: ${response.text.substring(0, 100)}...`);
6296
+ logger16.info(`[Test] Action response: ${response.text.substring(0, 100)}...`);
6054
6297
  }
6055
6298
  return [];
6056
6299
  };
6057
- await action.handler(runtime, message, { values: {}, data: {}, text: "" }, {}, callback);
6300
+ await action.handler(runtime, message, { values: {}, data: {}, text: "" }, { parameters: { op: "describe" } }, callback);
6058
6301
  if (!responseReceived) {
6059
- throw new Error("DESCRIBE_SCENE action did not produce a response");
6302
+ throw new Error("VISION action with op=describe did not produce a response");
6060
6303
  }
6061
- logger15.info("[Test] ✅ DESCRIBE_SCENE action test passed");
6304
+ logger16.info("[Test] ✅ VISION action describe op test passed");
6062
6305
  }
6063
6306
  },
6064
6307
  {
6065
6308
  name: "Vision provider integration",
6066
6309
  fn: async (runtime) => {
6067
- logger15.info("[Test] Testing vision provider...");
6310
+ logger16.info("[Test] Testing vision provider...");
6068
6311
  const provider = runtime.providers.find((p) => p.name === "visionProvider");
6069
6312
  if (!provider) {
6070
6313
  throw new Error("Vision provider not found");
@@ -6086,47 +6329,47 @@ class VisionRuntimeTestSuite {
6086
6329
  throw new Error("Vision provider returned invalid result");
6087
6330
  }
6088
6331
  if (result.text?.includes("I can see")) {
6089
- logger15.info(`[Test] Provider text: ${result.text.substring(0, 100)}...`);
6332
+ logger16.info(`[Test] Provider text: ${result.text.substring(0, 100)}...`);
6090
6333
  }
6091
- logger15.info("[Test] ✅ Vision provider test passed");
6334
+ logger16.info("[Test] ✅ Vision provider test passed");
6092
6335
  }
6093
6336
  },
6094
6337
  {
6095
6338
  name: "Florence-2 model initialization",
6096
6339
  fn: async (runtime) => {
6097
- logger15.info("[Test] Testing Florence-2 model...");
6340
+ logger16.info("[Test] Testing Florence-2 model...");
6098
6341
  const visionService = runtime.getService(VisionServiceType.VISION);
6099
6342
  if (!visionService) {
6100
6343
  throw new Error("Vision service not found");
6101
6344
  }
6102
6345
  const florence2Enabled = runtime.getSetting("FLORENCE2_ENABLED") === "true" || runtime.getSetting("VISION_FLORENCE2_ENABLED") === "true";
6103
6346
  if (!florence2Enabled) {
6104
- logger15.info("[Test] Florence-2 is disabled, skipping test");
6347
+ logger16.info("[Test] Florence-2 is disabled, skipping test");
6105
6348
  return;
6106
6349
  }
6107
6350
  const mode = visionService.getVisionMode();
6108
6351
  if (mode === "SCREEN" /* SCREEN */ || mode === "BOTH" /* BOTH */) {
6109
6352
  const screenCapture = await visionService.getScreenCapture();
6110
6353
  if (screenCapture) {
6111
- logger15.info("[Test] Screen capture available");
6112
- logger15.info(`[Test] Screen size: ${screenCapture.width}x${screenCapture.height}`);
6113
- logger15.info(`[Test] Tiles: ${screenCapture.tiles.length}`);
6354
+ logger16.info("[Test] Screen capture available");
6355
+ logger16.info(`[Test] Screen size: ${screenCapture.width}x${screenCapture.height}`);
6356
+ logger16.info(`[Test] Tiles: ${screenCapture.tiles.length}`);
6114
6357
  }
6115
6358
  }
6116
- logger15.info("[Test] ✅ Florence-2 model test passed");
6359
+ logger16.info("[Test] ✅ Florence-2 model test passed");
6117
6360
  }
6118
6361
  },
6119
6362
  {
6120
6363
  name: "OCR service functionality",
6121
6364
  fn: async (runtime) => {
6122
- logger15.info("[Test] Testing OCR service...");
6365
+ logger16.info("[Test] Testing OCR service...");
6123
6366
  const visionService = runtime.getService(VisionServiceType.VISION);
6124
6367
  if (!visionService) {
6125
6368
  throw new Error("Vision service not found");
6126
6369
  }
6127
6370
  const ocrEnabled = runtime.getSetting("OCR_ENABLED") === "true" || runtime.getSetting("VISION_OCR_ENABLED") === "true";
6128
6371
  if (!ocrEnabled) {
6129
- logger15.info("[Test] OCR is disabled, skipping test");
6372
+ logger16.info("[Test] OCR is disabled, skipping test");
6130
6373
  return;
6131
6374
  }
6132
6375
  const mode = visionService.getVisionMode();
@@ -6135,18 +6378,18 @@ class VisionRuntimeTestSuite {
6135
6378
  if (enhancedScene?.screenAnalysis) {
6136
6379
  const ocrText = enhancedScene.screenAnalysis.fullScreenOCR;
6137
6380
  if (ocrText) {
6138
- logger15.info(`[Test] OCR extracted ${ocrText.length} characters`);
6139
- logger15.info(`[Test] OCR sample: ${ocrText.substring(0, 100)}...`);
6381
+ logger16.info(`[Test] OCR extracted ${ocrText.length} characters`);
6382
+ logger16.info(`[Test] OCR sample: ${ocrText.substring(0, 100)}...`);
6140
6383
  }
6141
6384
  }
6142
6385
  }
6143
- logger15.info("[Test] ✅ OCR service test passed");
6386
+ logger16.info("[Test] ✅ OCR service test passed");
6144
6387
  }
6145
6388
  },
6146
6389
  {
6147
6390
  name: "Entity tracking system",
6148
6391
  fn: async (runtime) => {
6149
- logger15.info("[Test] Testing entity tracking...");
6392
+ logger16.info("[Test] Testing entity tracking...");
6150
6393
  const visionService = runtime.getService(VisionServiceType.VISION);
6151
6394
  if (!visionService) {
6152
6395
  throw new Error("Vision service not found");
@@ -6156,14 +6399,14 @@ class VisionRuntimeTestSuite {
6156
6399
  throw new Error("Entity tracker not found");
6157
6400
  }
6158
6401
  const entities = entityTracker.getActiveEntities();
6159
- logger15.info(`[Test] Active entities: ${entities.length}`);
6402
+ logger16.info(`[Test] Active entities: ${entities.length}`);
6160
6403
  for (const entity of entities) {
6161
6404
  if (!entity.id || !entity.entityType || !entity.lastSeen) {
6162
6405
  throw new Error("Entity missing required fields");
6163
6406
  }
6164
- logger15.info(`[Test] Entity ${entity.id}: type=${entity.entityType}, tracked=${entity.lastSeen - entity.firstSeen}ms`);
6407
+ logger16.info(`[Test] Entity ${entity.id}: type=${entity.entityType}, tracked=${entity.lastSeen - entity.firstSeen}ms`);
6165
6408
  }
6166
- logger15.info("[Test] ✅ Entity tracking test passed");
6409
+ logger16.info("[Test] ✅ Entity tracking test passed");
6167
6410
  }
6168
6411
  }
6169
6412
  ];
@@ -6176,7 +6419,7 @@ import { promisify as promisify4 } from "node:util";
6176
6419
  // src/tests/test-pattern-generator.ts
6177
6420
  import * as fs5 from "node:fs/promises";
6178
6421
  import * as path7 from "node:path";
6179
- import { logger as logger16 } from "@elizaos/core";
6422
+ import { logger as logger17 } from "@elizaos/core";
6180
6423
  import sharp5 from "sharp";
6181
6424
  function generateGrid(width, height, spacing = 100) {
6182
6425
  const lines = [];
@@ -6286,7 +6529,7 @@ async function savePattern(buffer, filename) {
6286
6529
  await fs5.mkdir(outputDir, { recursive: true });
6287
6530
  const filepath = path7.join(outputDir, filename);
6288
6531
  await fs5.writeFile(filepath, buffer);
6289
- logger16.info(`[TestPatternGenerator] Saved test pattern to ${filepath}`);
6532
+ logger17.info(`[TestPatternGenerator] Saved test pattern to ${filepath}`);
6290
6533
  return filepath;
6291
6534
  }
6292
6535
  function verifyQuadrantNumbers(ocrText) {
@@ -6356,7 +6599,7 @@ class VisionWorkerE2ETestSuite {
6356
6599
  frameCount++;
6357
6600
  lastTimestamp = scene.timestamp;
6358
6601
  }
6359
- await new Promise((resolve) => setImmediate(resolve));
6602
+ await new Promise((resolve2) => setImmediate(resolve2));
6360
6603
  }
6361
6604
  const totalTime = (Date.now() - startTime) / 1000;
6362
6605
  const avgFPS = frameCount / totalTime;
@@ -6385,7 +6628,7 @@ class VisionWorkerE2ETestSuite {
6385
6628
  await displayTestPattern(patternPath);
6386
6629
  try {
6387
6630
  console.log("Waiting for OCR processing...");
6388
- await new Promise((resolve) => setTimeout(resolve, 3000));
6631
+ await new Promise((resolve2) => setTimeout(resolve2, 3000));
6389
6632
  const scene = await visionService.getEnhancedSceneDescription();
6390
6633
  const ocrText = scene?.screenAnalysis?.fullScreenOCR || "";
6391
6634
  console.log(`OCR detected text: "${ocrText.substring(0, 100)}..."`);
@@ -6424,7 +6667,7 @@ class VisionWorkerE2ETestSuite {
6424
6667
  if (scene?.screenCapture) {
6425
6668
  console.log(` Screen: ${scene.screenCapture.width}x${scene.screenCapture.height}`);
6426
6669
  }
6427
- await new Promise((resolve) => setTimeout(resolve, 500));
6670
+ await new Promise((resolve2) => setTimeout(resolve2, 500));
6428
6671
  }
6429
6672
  console.log(`✓ Monitored ${displayCount} displays`);
6430
6673
  }
@@ -6469,7 +6712,7 @@ Current stats:`);
6469
6712
  console.log(` OCR detections: ${stats.ocrDetections}`);
6470
6713
  console.log(` Florence-2 detections: ${stats.florence2Detections}`);
6471
6714
  }
6472
- await new Promise((resolve) => setTimeout(resolve, 100));
6715
+ await new Promise((resolve2) => setTimeout(resolve2, 100));
6473
6716
  }
6474
6717
  const totalTime = (Date.now() - startTime) / 1000;
6475
6718
  console.log(`
@@ -6520,7 +6763,7 @@ async function displayTestPattern(imagePath) {
6520
6763
  } else if (platform === "win32") {
6521
6764
  await execAsync4(`start "" "${imagePath}"`);
6522
6765
  }
6523
- await new Promise((resolve) => setTimeout(resolve, 1000));
6766
+ await new Promise((resolve2) => setTimeout(resolve2, 1000));
6524
6767
  } catch (error) {
6525
6768
  console.warn("Could not display test pattern:", error);
6526
6769
  }
@@ -6553,15 +6796,19 @@ var visionPlugin = {
6553
6796
  description: "Provides visual perception through camera integration and scene analysis",
6554
6797
  services: [VisionService],
6555
6798
  providers: [visionProvider],
6556
- actions: [
6557
- describeSceneAction,
6558
- captureImageAction,
6559
- setVisionModeAction,
6560
- nameEntityAction,
6561
- identifyPersonAction,
6562
- trackEntityAction
6563
- ],
6799
+ actions: [...promoteSubactionsToActions(visionAction)],
6564
6800
  tests: testSuites,
6801
+ autoEnable: {
6802
+ shouldEnable: (_env, config) => {
6803
+ const f = config?.features?.vision;
6804
+ const featureOn = f === true || typeof f === "object" && f !== null && f.enabled !== false;
6805
+ if (featureOn)
6806
+ return true;
6807
+ const media = config?.media;
6808
+ const visionMedia = media?.vision;
6809
+ return Boolean(visionMedia && visionMedia.enabled !== false && typeof visionMedia.provider === "string" && visionMedia.provider.length > 0);
6810
+ }
6811
+ },
6565
6812
  init: async (_config, _runtime) => {}
6566
6813
  };
6567
6814
  var src_default = visionPlugin;
@@ -6570,4 +6817,4 @@ export {
6570
6817
  src_default as default
6571
6818
  };
6572
6819
 
6573
- //# debugId=08C3A6DFE8F9170A64756E2164756E21
6820
+ //# debugId=177A2977D034662964756E2164756E21