@elizaos/plugin-vision 2.0.0-alpha.9 → 2.0.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +340 -0
- package/auto-enable.ts +29 -0
- package/build.config.ts +20 -1
- package/dist/index.js +1609 -1362
- package/dist/index.js.map +27 -26
- package/dist/workers/florence2-worker.js +83 -114154
- package/dist/workers/florence2-worker.js.map +6 -85
- package/dist/workers/ocr-worker.js +743 -114796
- package/dist/workers/ocr-worker.js.map +6 -85
- package/dist/workers/screen-capture-worker.js +16 -11
- package/dist/workers/screen-capture-worker.js.map +3 -3
- package/package.json +17 -9
package/dist/index.js
CHANGED
|
@@ -1,22 +1,9 @@
|
|
|
1
1
|
import { createRequire } from "node:module";
|
|
2
|
-
var __create = Object.create;
|
|
3
|
-
var __getProtoOf = Object.getPrototypeOf;
|
|
4
|
-
var __defProp = Object.defineProperty;
|
|
5
|
-
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
-
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
7
|
-
var __toESM = (mod, isNodeMode, target) => {
|
|
8
|
-
target = mod != null ? __create(__getProtoOf(mod)) : {};
|
|
9
|
-
const to = isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target;
|
|
10
|
-
for (let key of __getOwnPropNames(mod))
|
|
11
|
-
if (!__hasOwnProp.call(to, key))
|
|
12
|
-
__defProp(to, key, {
|
|
13
|
-
get: () => mod[key],
|
|
14
|
-
enumerable: true
|
|
15
|
-
});
|
|
16
|
-
return to;
|
|
17
|
-
};
|
|
18
2
|
var __require = /* @__PURE__ */ createRequire(import.meta.url);
|
|
19
3
|
|
|
4
|
+
// src/index.ts
|
|
5
|
+
import { promoteSubactionsToActions } from "@elizaos/core";
|
|
6
|
+
|
|
20
7
|
// src/action.ts
|
|
21
8
|
import {
|
|
22
9
|
ContentType,
|
|
@@ -30,6 +17,200 @@ var VisionServiceType = {
|
|
|
30
17
|
};
|
|
31
18
|
|
|
32
19
|
// src/action.ts
|
|
20
|
+
// NOTE: `var` is kept on purpose; this is bundler output and the rest of the
// file declares its module-level bindings the same way.

// Upper bound, in milliseconds, that an action waits on a vision-service call.
var VISION_ACTION_TIMEOUT_MS = 1e4;
// Maximum number of characters of scene text returned to the caller.
var MAX_VISION_TEXT_LENGTH = 4000;
// Maximum number of people/objects inspected when building a description.
var MAX_VISION_ENTITIES = 25;
// Canonical operation names accepted by normalizeOp().
var VISION_OPS = [
  "describe",
  "capture",
  "set_mode",
  "name_entity",
  "identify_person",
  "track_entity"
];
// Context names associated with the vision action.
// NOTE(review): the consumer of this list is outside the visible chunk — confirm usage.
var ALL_VISION_CONTEXTS = [
  "media",
  "screen_time",
  "automation",
  "memory",
  "settings"
];
// The keyword tables below are matched as lowercase substrings of the user
// message. Entries are unique within each table; both "camara" and the
// correctly accented "cámara" are listed so either spelling matches.
var DESCRIBE_KEYWORDS = [
  "describe",
  "scene",
  "see",
  "look",
  "camera",
  "screen",
  "object",
  "person",
  "escena",
  "ver",
  "camara",
  "cámara",
  "décrire",
  "scène",
  "voir",
  "beschreiben",
  "szene",
  "sehen",
  "descrivi",
  "scena",
  "vedi",
  "説明",
  "見える",
  "场景",
  "描述",
  "看见",
  "장면",
  "설명",
  "보여"
];
var CAPTURE_KEYWORDS = [
  "capture",
  "image",
  "photo",
  "picture",
  "snapshot",
  "screenshot",
  "camera",
  "captura",
  "foto",
  "imagen",
  "capturer",
  "bild",
  "capturare",
  "写真",
  "画像",
  "スクリーンショット",
  "拍照",
  "截图",
  "이미지",
  "사진",
  "스크린샷"
];
var SET_MODE_KEYWORDS = [
  "vision",
  "mode",
  "camera",
  "screen",
  "both",
  "disable",
  "enable",
  "off",
  "visión",
  "camara",
  "cámara",
  "pantalla",
  "écran",
  "kamera",
  "bildschirm",
  "schermo",
  "ビジョン",
  "カメラ",
  "画面",
  "视觉",
  "相机",
  "屏幕",
  "비전",
  "카메라",
  "화면"
];
var NAME_ENTITY_KEYWORDS = [
  "name",
  "named",
  "call",
  "person",
  "entity",
  "remember",
  "object",
  "nombre",
  "llama",
  "persona",
  "nom",
  "appelle",
  "personne",
  "nenne",
  "nome",
  "chiama",
  "名前",
  "呼ぶ",
  "人",
  "命名",
  "叫",
  "이름",
  "불러",
  "사람"
];
var IDENTIFY_PERSON_KEYWORDS = [
  "identify",
  "recognize",
  "who is",
  "person",
  "face",
  "seen before",
  "identificar",
  "reconoces",
  "persona",
  "visage",
  "reconnais",
  "personne",
  "erkennen",
  "gesicht",
  "riconosci",
  "識別",
  "誰",
  "顔",
  "识别",
  "是谁",
  "人",
  "식별",
  "누구",
  "얼굴"
];
var TRACK_ENTITY_KEYWORDS = [
  "track",
  "follow",
  "watch",
  "keep an eye",
  "entity",
  "person",
  "object",
  "rastrear",
  "seguir",
  "vigilar",
  "persona",
  "suivre",
  "surveiller",
  "personne",
  "verfolgen",
  "beobachten",
  "traccia",
  "segui",
  "追跡",
  "見張",
  "人",
  "跟踪",
  "关注",
  "추적",
  "지켜봐",
  "사람"
];
|
|
208
|
+
/**
 * Race `promise` against a timeout.
 *
 * @param {Promise<*>} promise - The operation to wait for.
 * @param {string} label - Prefix for the timeout error message (`"<label> timed out"`).
 * @param {number} [timeoutMs=VISION_ACTION_TIMEOUT_MS] - Time limit in milliseconds.
 * @returns {Promise<*>} Settles like `promise`, or rejects with an Error once
 *   the time limit elapses first.
 */
function withVisionTimeout(promise, label, timeoutMs = VISION_ACTION_TIMEOUT_MS) {
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error(`${label} timed out`)), timeoutMs);
  });
  // Always clear the timer: otherwise every call leaves a pending timer behind
  // after the wrapped promise has already settled.
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
}
|
|
33
214
|
async function saveExecutionRecord(runtime, messageContext, thought, text, actions, attachments) {
|
|
34
215
|
const memory = {
|
|
35
216
|
id: createUniqueUuid(runtime, `vision-record-${Date.now()}`),
|
|
@@ -47,1108 +228,1004 @@ async function saveExecutionRecord(runtime, messageContext, thought, text, actio
|
|
|
47
228
|
};
|
|
48
229
|
await runtime.createMemory(memory, "messages");
|
|
49
230
|
}
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
const __avSource = String(message?.content?.source ?? message?.source ?? "");
|
|
64
|
-
const __avExpectedSource = "";
|
|
65
|
-
const __avSourceOk = __avExpectedSource ? __avSource === __avExpectedSource : Boolean(__avSource || state || runtime?.agentId || runtime?.getService);
|
|
66
|
-
const __avOptions = options && typeof options === "object" ? options : {};
|
|
67
|
-
const __avInputOk = __avText.trim().length > 0 || Object.keys(__avOptions).length > 0 || Boolean(message?.content && typeof message.content === "object");
|
|
68
|
-
if (!(__avKeywordOk && __avRegexOk && __avSourceOk && __avInputOk)) {
|
|
69
|
-
return false;
|
|
70
|
-
}
|
|
71
|
-
const __avLegacyValidate = async (runtime2, _message, _state) => {
|
|
72
|
-
const visionService = runtime2.getService("VISION");
|
|
73
|
-
return !!visionService && visionService.isActive();
|
|
74
|
-
};
|
|
75
|
-
try {
|
|
76
|
-
return Boolean(await __avLegacyValidate(runtime, message, state, options));
|
|
77
|
-
} catch {
|
|
78
|
-
return false;
|
|
231
|
+
/**
 * Flatten handler options into a single parameter object.
 *
 * Top-level option keys are merged with the keys of `options.parameters`;
 * on a clash the nested `parameters` value wins.
 *
 * @param {*} options - Raw handler options; anything that is not a plain
 *   (non-array) object is treated as empty.
 * @returns {object} A new object; the input is not mutated.
 */
function readActionParams(options) {
  // Arrays are objects too, but spreading one would produce index keys.
  const isRecord = (value) => value !== null && typeof value === "object" && !Array.isArray(value);
  const direct = isRecord(options) ? options : {};
  const parameters = isRecord(direct.parameters) ? direct.parameters : {};
  return { ...direct, ...parameters };
}
|
|
236
|
+
/**
 * Report whether any of `contexts` appears among the contexts selected in `state`.
 *
 * Selected contexts are gathered from `state.values.selectedContexts`,
 * `state.data.selectedContexts`, and the `trajectoryPrefix` / `metadata`
 * entries of `state.data.contextObject`. Only array sources and string items
 * are considered.
 *
 * @param {object} [state] - Runtime state; may be missing or partial.
 * @param {string[]} contexts - Context names to look for.
 * @returns {boolean} True when at least one name is selected; false for a
 *   missing or non-array `contexts`.
 */
function selectedContextMatches(state, contexts) {
  if (!Array.isArray(contexts) || contexts.length === 0) {
    return false;
  }
  const contextObject = state?.data?.contextObject;
  const sources = [
    state?.values?.selectedContexts,
    state?.data?.selectedContexts,
    contextObject?.trajectoryPrefix?.selectedContexts,
    contextObject?.metadata?.selectedContexts
  ];
  const selected = new Set(
    sources
      .filter(Array.isArray)
      .flat()
      .filter((item) => typeof item === "string")
  );
  return contexts.some((context) => selected.has(context));
}
|
|
253
|
+
/**
 * Check whether the runtime exposes an active "VISION" service.
 *
 * @param {object} runtime - Agent runtime; expected to provide `getService(name)`.
 * @returns {boolean} True only when the service exists, exposes `isActive()`,
 *   and that call returns a truthy value. A missing runtime, a missing service
 *   or a service without `isActive` yields false instead of throwing.
 */
function visionServiceIsActive(runtime) {
  const visionService = runtime?.getService?.("VISION");
  return typeof visionService?.isActive === "function" && Boolean(visionService.isActive());
}
|
|
257
|
+
/**
 * Map a free-form operation name onto one of the canonical VISION_OPS.
 *
 * The input is trimmed, lowercased, and runs of whitespace/hyphens become "_"
 * before alias and canonical-name lookup.
 *
 * @param {*} value - Candidate operation name.
 * @returns {string|null} The canonical op, or null when `value` is not a
 *   string, is blank, or is not a known op/alias.
 */
function normalizeOp(value) {
  if (typeof value !== "string") {
    return null;
  }
  const normalized = value.trim().toLowerCase().replace(/[\s-]+/g, "_");
  if (!normalized) {
    return null;
  }
  // A Map (not an object literal) so inherited keys such as "constructor" or
  // "__proto__" can never be mistaken for an alias.
  const aliases = new Map([
    ["describe_scene", "describe"],
    ["scene", "describe"],
    ["capture_image", "capture"],
    ["image", "capture"],
    ["photo", "capture"],
    ["snapshot", "capture"],
    ["screenshot", "capture"],
    ["set_vision_mode", "set_mode"],
    ["mode", "set_mode"],
    ["vision_mode", "set_mode"],
    ["name", "name_entity"],
    ["identify", "identify_person"],
    ["recognize", "identify_person"],
    ["track", "track_entity"],
    ["follow", "track_entity"]
  ]);
  const alias = aliases.get(normalized);
  if (alias) {
    return alias;
  }
  return VISION_OPS.includes(normalized) ? normalized : null;
}
|
|
284
|
+
/**
 * Guess which vision operation a message is asking for.
 *
 * Rules are tried in priority order. A rule fires when the lowercased text
 * contains any of the rule's keywords (substring match) and, if the rule has
 * a gate, the gate regex also matches. The gated rules only fire on English
 * trigger words; "capture" and "describe" fire on keywords alone.
 *
 * @param {*} text - Message text.
 * @returns {string|null} One of "name_entity", "identify_person",
 *   "track_entity", "set_mode", "capture", "describe", or null when nothing
 *   matches or `text` is not a string.
 */
function inferOpFromMessage(text) {
  if (typeof text !== "string") {
    return null;
  }
  const lower = text.toLowerCase();
  const mentionsAny = (keywords) => keywords.some((k) => lower.includes(k.toLowerCase()));
  // [op, keyword table, optional gate]; order is the match priority.
  const rules = [
    ["name_entity", NAME_ENTITY_KEYWORDS, /\b(call|name|named)\b/],
    ["identify_person", IDENTIFY_PERSON_KEYWORDS, /\b(who|identify|recognize)\b/],
    ["track_entity", TRACK_ENTITY_KEYWORDS, /\b(track|follow|watch|keep an eye)\b/],
    ["set_mode", SET_MODE_KEYWORDS, /\b(mode|enable|disable|turn off|turn on)\b/],
    ["capture", CAPTURE_KEYWORDS, null],
    ["describe", DESCRIBE_KEYWORDS, null]
  ];
  for (const [op, keywords, gate] of rules) {
    if (mentionsAny(keywords) && (gate === null || gate.test(lower))) {
      return op;
    }
  }
  return null;
}
|
|
306
|
+
/**
 * Handle the VISION "describe" op: report what the vision service currently sees.
 *
 * Every outcome is persisted through saveExecutionRecord() and, when a
 * callback is supplied, forwarded to it before the result is returned.
 *
 * @param {object} runtime - Agent runtime; `getService("VISION")` is queried.
 * @param {object} message - Triggering message, passed through to saveExecutionRecord().
 * @param {object} [options] - `detailLevel: "summary"` suppresses the
 *   people/object/change breakdown; anything else means "detailed".
 * @param {Function} [callback] - Receives `{ thought, text, actions }`.
 * @returns {Promise<object>} Action result `{ success, text, values, data }`.
 *   Failures are reported in the result rather than thrown, provided the
 *   error-path record/callback themselves succeed.
 */
async function runDescribe(runtime, message, options, callback) {
  // Persist the outcome and forward it to the caller's callback, if any.
  const report = async (thought, text) => {
    await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
    if (callback) {
      await callback({ thought, text, actions: ["VISION"] });
    }
  };
  // Reduce the camera info to the fields exposed in results.
  const summarizeCamera = (info) => info ? {
    id: info.id,
    name: info.name,
    connected: info.connected
  } : undefined;

  const visionService = runtime.getService("VISION");
  if (!visionService || !visionService.isActive()) {
    await report(
      "Vision service is not available or no camera is connected.",
      "I cannot see anything right now. No camera is available."
    );
    return {
      success: false,
      text: "Vision service unavailable - cannot analyze scene",
      values: {
        success: false,
        visionAvailable: false,
        error: "Vision service not available"
      },
      data: {
        actionName: "VISION",
        op: "describe",
        error: "Vision service not available or no camera connected"
      }
    };
  }
  try {
    const scene = await withVisionTimeout(visionService.getSceneDescription(), "vision scene description");
    const cameraInfo = visionService.getCameraInfo();
    if (!scene) {
      await report(
        "Camera is connected but no scene has been analyzed yet.",
        `Camera "${cameraInfo?.name || "unknown"}" is connected, but I haven't analyzed any scenes yet. Please wait a moment.`
      );
      return {
        success: false,
        text: "Camera connected but no scene analyzed yet",
        values: {
          success: false,
          visionAvailable: true,
          sceneAnalyzed: false,
          cameraName: cameraInfo?.name || undefined
        },
        data: {
          actionName: "VISION",
          op: "describe",
          cameraInfo: summarizeCamera(cameraInfo),
          sceneStatus: "not_analyzed"
        }
      };
    }
    // Normalize everything read from the scene up front so nothing can throw
    // after the success callback has already been delivered.
    const allPeople = Array.isArray(scene.people) ? scene.people : [];
    const allObjects = Array.isArray(scene.objects) ? scene.objects : [];
    const sceneDescription = typeof scene.description === "string" ? scene.description : "";
    // Counts always describe the whole scene; only the per-entity breakdown
    // below is limited to the first MAX_VISION_ENTITIES entries.
    const peopleCount = allPeople.length;
    const objectCount = allObjects.length;
    const people = allPeople.slice(0, MAX_VISION_ENTITIES);
    const objects = allObjects.slice(0, MAX_VISION_ENTITIES);
    const timestamp = new Date(scene.timestamp).toLocaleString();
    const detailLevel = options?.detailLevel === "summary" ? "summary" : "detailed";
    const isDetailed = detailLevel === "detailed";

    let description = `Looking through ${cameraInfo?.name || "the camera"}, ${sceneDescription}`;
    if (isDetailed && peopleCount > 0) {
      description += `\n\nI can see ${peopleCount} ${peopleCount === 1 ? "person" : "people"}`;
      const facingCounts = {};
      for (const person of people) {
        if (person.facing && person.facing !== "unknown") {
          facingCounts[person.facing] = (facingCounts[person.facing] || 0) + 1;
        }
      }
      const facingParts = Object.entries(facingCounts).map(([direction, count]) => `${count} facing ${direction}`);
      if (facingParts.length > 0) {
        description += ` (${facingParts.join(", ")})`;
      }
      description += ".";
    }
    if (isDetailed && objectCount > 0) {
      const typeCounts = {};
      for (const obj of objects) {
        typeCounts[obj.type] = (typeCounts[obj.type] || 0) + 1;
      }
      const objectParts = Object.entries(typeCounts).map(([type, count]) => `${count} ${type}${count > 1 ? "s" : ""}`);
      description += `\n\nObjects detected: ${objectParts.join(", ")}.`;
    }
    if (isDetailed && scene.sceneChanged && typeof scene.changePercentage === "number" && scene.changePercentage) {
      description += `\n\n(Scene changed by ${scene.changePercentage.toFixed(1)}% since last analysis)`;
    }

    const text = description.slice(0, MAX_VISION_TEXT_LENGTH);
    await report(`Analyzed the visual scene at ${timestamp}.`, text);
    return {
      success: true,
      text,
      values: {
        success: true,
        visionAvailable: true,
        sceneAnalyzed: true,
        peopleCount,
        objectCount,
        cameraName: cameraInfo?.name || undefined,
        sceneChanged: scene.sceneChanged,
        changePercentage: scene.changePercentage,
        detailLevel
      },
      data: {
        actionName: "VISION",
        op: "describe",
        sceneTimestamp: scene.timestamp,
        sceneDescription: sceneDescription.slice(0, MAX_VISION_TEXT_LENGTH),
        sceneChanged: scene.sceneChanged,
        changePercentage: scene.changePercentage,
        audioTranscription: scene.audioTranscription || undefined,
        objectCount,
        peopleCount,
        cameraInfo: summarizeCamera(cameraInfo),
        timestamp,
        description: text
      }
    };
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    logger.error("[VISION/describe] Error analyzing scene:", errorMessage);
    await report(
      "An error occurred while trying to analyze the visual scene.",
      `Error analyzing scene: ${errorMessage}`
    );
    return {
      success: false,
      text: "Error analyzing scene",
      values: {
        success: false,
        visionAvailable: true,
        error: true,
        errorMessage
      },
      data: {
        actionName: "VISION",
        op: "describe",
        error: errorMessage,
        errorType: "analysis_error"
      }
    };
  }
}
|
|
466
|
+
/**
 * Handle the VISION "capture" op: grab a frame and return it as an image attachment.
 *
 * Every outcome is persisted through saveExecutionRecord() and, when a
 * callback is supplied, forwarded to it before the result is returned.
 *
 * @param {object} runtime - Agent runtime; `getService("VISION")` is queried.
 * @param {object} message - Triggering message, passed through to saveExecutionRecord().
 * @param {Function} [callback] - Receives `{ thought, text, actions }`, plus
 *   `attachments` on success.
 * @returns {Promise<object>} Action result `{ success, text, values, data }`;
 *   on success `data.imageAttachment.url` is a base64 `data:image/jpeg` URL.
 */
async function runCapture(runtime, message, callback) {
  // Persist the outcome and forward it to the caller's callback, if any.
  const report = async (thought, text, attachments) => {
    await saveExecutionRecord(runtime, message, thought, text, ["VISION"], attachments);
    if (callback) {
      const payload = { thought, text, actions: ["VISION"] };
      if (attachments) {
        payload.attachments = attachments;
      }
      await callback(payload);
    }
  };
  // Reduce the camera info to the fields exposed in results.
  const summarizeCamera = (info) => info ? {
    id: info.id,
    name: info.name,
    connected: info.connected
  } : undefined;

  const visionService = runtime.getService("VISION");
  if (!visionService || !visionService.isActive()) {
    await report(
      "Vision service is not available or no camera is connected.",
      "I cannot capture an image right now. No camera is available."
    );
    return {
      success: false,
      text: "Vision service unavailable - cannot capture image",
      values: {
        success: false,
        visionAvailable: false,
        error: "Vision service not available"
      },
      data: {
        actionName: "VISION",
        op: "capture",
        error: "Vision service not available or no camera connected"
      }
    };
  }
  try {
    // Shared helper instead of a hand-rolled race; the resulting error message
    // ("vision capture timed out") is unchanged.
    const imageBuffer = await withVisionTimeout(visionService.captureImage(), "vision capture");
    const cameraInfo = visionService.getCameraInfo();
    if (!imageBuffer) {
      await report(
        "Failed to capture image from camera.",
        "I could not capture an image from the camera. Please try again."
      );
      return {
        success: false,
        text: "Failed to capture image from camera",
        values: {
          success: false,
          visionAvailable: true,
          captureSuccess: false
        },
        data: {
          actionName: "VISION",
          op: "capture",
          error: "Camera capture failed",
          cameraInfo: summarizeCamera(cameraInfo)
        }
      };
    }
    const cameraName = cameraInfo?.name || "unknown";
    const timestamp = new Date().toISOString();
    const imageAttachment = {
      id: createUniqueUuid(runtime, `capture-${Date.now()}`),
      title: `Camera Capture - ${timestamp}`,
      contentType: ContentType.IMAGE,
      source: `camera:${cameraName}`,
      // assumes captureImage() yields a Node Buffer of JPEG data — TODO confirm
      url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`
    };
    const text = `I've captured an image from the camera at ${timestamp}.`;
    await report(`Captured an image from camera "${cameraName}".`, text, [imageAttachment]);
    return {
      success: true,
      text,
      values: {
        success: true,
        visionAvailable: true,
        captureSuccess: true,
        cameraName: cameraInfo?.name || undefined,
        timestamp
      },
      data: {
        actionName: "VISION",
        op: "capture",
        imageAttachment: { ...imageAttachment },
        cameraInfo: summarizeCamera(cameraInfo),
        timestamp
      }
    };
  } catch (error) {
    logger.error("[VISION/capture] Error capturing image:", error);
    const errorMessage = error instanceof Error ? error.message : String(error);
    await report(
      "An error occurred while trying to capture an image.",
      `Error capturing image: ${errorMessage}`
    );
    return {
      success: false,
      text: "Error capturing image",
      values: {
        success: false,
        visionAvailable: true,
        error: true,
        errorMessage
      },
      data: {
        actionName: "VISION",
        op: "capture",
        error: errorMessage,
        errorType: "capture_error"
      }
    };
  }
}
|
|
598
|
+
/**
 * Handle the VISION "set_mode" op: switch the vision service between
 * OFF, CAMERA, SCREEN and BOTH.
 *
 * The target mode is read from `options.mode` when that is a non-empty
 * string, otherwise from the message text. Matching is on whole words so that
 * e.g. "offer" or "office" is not read as OFF. When several modes are
 * mentioned the priority is OFF, BOTH, SCREEN, CAMERA.
 *
 * @param {object} runtime - Agent runtime; `getService("VISION")` is queried.
 * @param {object} message - Triggering message; `content.text` is the fallback source.
 * @param {object} [options] - May carry an explicit `mode` string.
 * @param {Function} [callback] - Receives `{ thought, text, actions }`.
 * @returns {Promise<object>} Action result; `values.visionMode` holds the new
 *   mode on success, `error` holds the message on failure.
 */
async function runSetMode(runtime, message, options, callback) {
  // Persist the outcome and forward it to the caller's callback, if any.
  const report = async (thought, text) => {
    await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
    if (callback) {
      await callback({ thought, text, actions: ["VISION"] });
    }
  };
  // [mode, whole-word pattern]; order is the match priority.
  const modePatterns = [
    ["OFF", /\b(?:off|disabl\w*)\b/],
    ["BOTH", /\bboth\b/],
    ["SCREEN", /\bscreens?\b/],
    ["CAMERA", /\bcameras?\b/]
  ];
  const modeMessages = {
    OFF: "Vision has been disabled. I will no longer process visual input.",
    CAMERA: "Vision mode set to CAMERA only. I will process input from the camera.",
    SCREEN: "Vision mode set to SCREEN only. I will analyze what's on your screen.",
    BOTH: "Vision mode set to BOTH. I will process input from both camera and screen."
  };

  const visionService = runtime.getService("VISION");
  if (!visionService) {
    const text = "I cannot change vision mode because the vision service is not available.";
    await report("Vision service is not available.", text);
    return {
      success: false,
      text,
      data: { actionName: "VISION", op: "set_mode" }
    };
  }
  try {
    const explicitMode = typeof options?.mode === "string" ? options.mode.toLowerCase() : "";
    const rawText = message?.content?.text;
    // An explicit mode, when given, is the only text inspected.
    const requested = explicitMode || (typeof rawText === "string" ? rawText.toLowerCase() : "");
    const match = modePatterns.find(([, pattern]) => pattern.test(requested));
    const newMode = match ? match[0] : null;
    if (!newMode) {
      const text = "Please specify the vision mode: OFF, CAMERA, SCREEN, or BOTH.";
      await report("Could not determine the desired vision mode from the message.", text);
      return {
        success: false,
        text,
        data: { actionName: "VISION", op: "set_mode" }
      };
    }
    const currentMode = visionService.getVisionMode();
    // Shared helper instead of a hand-rolled race; the resulting error message
    // ("vision mode change timed out") is unchanged.
    await withVisionTimeout(visionService.setVisionMode(newMode), "vision mode change");
    const text = modeMessages[newMode];
    await report(`Changed vision mode from ${currentMode} to ${newMode}.`, text);
    return {
      success: true,
      text,
      values: { visionMode: newMode },
      data: { actionName: "VISION", op: "set_mode", visionMode: newMode }
    };
  } catch (error) {
    logger.error("[VISION/set_mode] Error changing vision mode:", error);
    const errorMessage = error instanceof Error ? error.message : String(error);
    const text = `Error changing vision mode: ${errorMessage}`;
    await report("An error occurred while trying to change the vision mode.", text);
    return {
      success: false,
      text,
      error: errorMessage,
      data: { actionName: "VISION", op: "set_mode" }
    };
  }
}
|
|
687
|
+
async function runNameEntity(runtime, message, options, callback) {
|
|
688
|
+
try {
|
|
306
689
|
const visionService = runtime.getService("VISION");
|
|
307
|
-
if (!visionService
|
|
308
|
-
const thought = "Vision service is not available
|
|
309
|
-
const text = "I cannot
|
|
310
|
-
await saveExecutionRecord(runtime, message, thought, text, ["
|
|
690
|
+
if (!visionService) {
|
|
691
|
+
const thought = "Vision service is not available.";
|
|
692
|
+
const text = "I cannot name entities because the vision service is not available.";
|
|
693
|
+
await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
|
|
311
694
|
if (callback) {
|
|
312
|
-
await callback({
|
|
313
|
-
thought,
|
|
314
|
-
text,
|
|
315
|
-
actions: ["CAPTURE_IMAGE"]
|
|
316
|
-
});
|
|
695
|
+
await callback({ thought, text, actions: ["VISION"] });
|
|
317
696
|
}
|
|
318
697
|
return {
|
|
319
698
|
success: false,
|
|
320
|
-
text
|
|
321
|
-
|
|
322
|
-
success: false,
|
|
323
|
-
visionAvailable: false,
|
|
324
|
-
error: "Vision service not available"
|
|
325
|
-
},
|
|
326
|
-
data: {
|
|
327
|
-
actionName: "CAPTURE_IMAGE",
|
|
328
|
-
error: "Vision service not available or no camera connected"
|
|
329
|
-
}
|
|
699
|
+
text,
|
|
700
|
+
data: { actionName: "VISION", op: "name_entity" }
|
|
330
701
|
};
|
|
331
702
|
}
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
const
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
const text2 = "I could not capture an image from the camera. Please try again.";
|
|
338
|
-
await saveExecutionRecord(runtime, message, thought2, text2, ["CAPTURE_IMAGE"]);
|
|
339
|
-
if (callback) {
|
|
340
|
-
await callback({
|
|
341
|
-
thought: thought2,
|
|
342
|
-
text: text2,
|
|
343
|
-
actions: ["CAPTURE_IMAGE"]
|
|
344
|
-
});
|
|
345
|
-
}
|
|
346
|
-
return {
|
|
347
|
-
success: false,
|
|
348
|
-
text: "Failed to capture image from camera",
|
|
349
|
-
values: {
|
|
350
|
-
success: false,
|
|
351
|
-
visionAvailable: true,
|
|
352
|
-
captureSuccess: false
|
|
353
|
-
},
|
|
354
|
-
data: {
|
|
355
|
-
actionName: "CAPTURE_IMAGE",
|
|
356
|
-
error: "Camera capture failed",
|
|
357
|
-
cameraInfo: cameraInfo ? {
|
|
358
|
-
id: cameraInfo.id,
|
|
359
|
-
name: cameraInfo.name,
|
|
360
|
-
connected: cameraInfo.connected
|
|
361
|
-
} : undefined
|
|
362
|
-
}
|
|
363
|
-
};
|
|
364
|
-
}
|
|
365
|
-
const attachmentId = createUniqueUuid(runtime, `capture-${Date.now()}`);
|
|
366
|
-
const timestamp = new Date().toISOString();
|
|
367
|
-
const imageAttachment = {
|
|
368
|
-
id: attachmentId,
|
|
369
|
-
title: `Camera Capture - ${timestamp}`,
|
|
370
|
-
contentType: ContentType.IMAGE,
|
|
371
|
-
source: `camera:${cameraInfo?.name || "unknown"}`,
|
|
372
|
-
url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`
|
|
373
|
-
};
|
|
374
|
-
const thought = `Captured an image from camera "${cameraInfo?.name}".`;
|
|
375
|
-
const text = `I've captured an image from the camera at ${timestamp}.`;
|
|
376
|
-
await saveExecutionRecord(runtime, message, thought, text, ["CAPTURE_IMAGE"], [imageAttachment]);
|
|
703
|
+
const scene = await withVisionTimeout(visionService.getSceneDescription(), "vision scene description");
|
|
704
|
+
if (!scene || scene.people.length === 0) {
|
|
705
|
+
const thought = "No people visible to name.";
|
|
706
|
+
const text = "I don't see any people in the current scene to name.";
|
|
707
|
+
await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
|
|
377
708
|
if (callback) {
|
|
378
|
-
await callback({
|
|
379
|
-
thought,
|
|
380
|
-
text,
|
|
381
|
-
actions: ["CAPTURE_IMAGE"],
|
|
382
|
-
attachments: [imageAttachment]
|
|
383
|
-
});
|
|
709
|
+
await callback({ thought, text, actions: ["VISION"] });
|
|
384
710
|
}
|
|
385
711
|
return {
|
|
386
|
-
success:
|
|
387
|
-
text
|
|
388
|
-
|
|
389
|
-
success: true,
|
|
390
|
-
visionAvailable: true,
|
|
391
|
-
captureSuccess: true,
|
|
392
|
-
cameraName: cameraInfo?.name || undefined,
|
|
393
|
-
timestamp
|
|
394
|
-
},
|
|
395
|
-
data: {
|
|
396
|
-
actionName: "CAPTURE_IMAGE",
|
|
397
|
-
imageAttachment: {
|
|
398
|
-
id: imageAttachment.id,
|
|
399
|
-
title: imageAttachment.title,
|
|
400
|
-
contentType: imageAttachment.contentType,
|
|
401
|
-
source: imageAttachment.source,
|
|
402
|
-
url: imageAttachment.url
|
|
403
|
-
},
|
|
404
|
-
cameraInfo: cameraInfo ? {
|
|
405
|
-
id: cameraInfo.id,
|
|
406
|
-
name: cameraInfo.name,
|
|
407
|
-
connected: cameraInfo.connected
|
|
408
|
-
} : undefined,
|
|
409
|
-
timestamp
|
|
410
|
-
}
|
|
712
|
+
success: false,
|
|
713
|
+
text,
|
|
714
|
+
data: { actionName: "VISION", op: "name_entity" }
|
|
411
715
|
};
|
|
412
|
-
}
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
716
|
+
}
|
|
717
|
+
const messageText = message.content.text?.toLowerCase() || "";
|
|
718
|
+
const explicitName = typeof options.name === "string" ? options.name.trim() : "";
|
|
719
|
+
const nameMatch = explicitName ? [explicitName, explicitName] : messageText.match(/(?:named?|call(?:ed)?|is)\s+(\w+)/i);
|
|
720
|
+
if (!nameMatch) {
|
|
721
|
+
const thought = "Could not extract name from message.";
|
|
722
|
+
const text = `I couldn't understand what name to assign. Please say something like "The person is named Alice".`;
|
|
723
|
+
await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
|
|
418
724
|
if (callback) {
|
|
419
|
-
await callback({
|
|
420
|
-
thought,
|
|
421
|
-
text,
|
|
422
|
-
actions: ["CAPTURE_IMAGE"]
|
|
423
|
-
});
|
|
725
|
+
await callback({ thought, text, actions: ["VISION"] });
|
|
424
726
|
}
|
|
425
727
|
return {
|
|
426
728
|
success: false,
|
|
427
|
-
text
|
|
428
|
-
|
|
429
|
-
success: false,
|
|
430
|
-
visionAvailable: true,
|
|
431
|
-
error: true,
|
|
432
|
-
errorMessage
|
|
433
|
-
},
|
|
434
|
-
data: {
|
|
435
|
-
actionName: "CAPTURE_IMAGE",
|
|
436
|
-
error: errorMessage,
|
|
437
|
-
errorType: "capture_error"
|
|
438
|
-
}
|
|
729
|
+
text,
|
|
730
|
+
data: { actionName: "VISION", op: "name_entity" }
|
|
439
731
|
};
|
|
440
732
|
}
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
text: "I can see 3 people in an office setting. Let me capture this scene for you."
|
|
451
|
-
}
|
|
452
|
-
}
|
|
453
|
-
],
|
|
454
|
-
[
|
|
455
|
-
{ name: "{{user}}", content: { text: "take a photo" } },
|
|
456
|
-
{
|
|
457
|
-
name: "{{agent}}",
|
|
458
|
-
content: {
|
|
459
|
-
actions: ["CAPTURE_IMAGE"],
|
|
460
|
-
thought: "The user wants me to capture an image from the camera.",
|
|
461
|
-
text: "I've captured an image from the camera."
|
|
462
|
-
}
|
|
463
|
-
}
|
|
464
|
-
],
|
|
465
|
-
[
|
|
466
|
-
{ name: "{{user}}", content: { text: "capture the current scene" } },
|
|
467
|
-
{
|
|
468
|
-
name: "{{agent}}",
|
|
469
|
-
content: {
|
|
470
|
-
actions: ["CAPTURE_IMAGE"]
|
|
471
|
-
}
|
|
472
|
-
}
|
|
473
|
-
]
|
|
474
|
-
]
|
|
475
|
-
};
|
|
476
|
-
var setVisionModeAction = {
|
|
477
|
-
name: "SET_VISION_MODE",
|
|
478
|
-
description: "Set the vision mode to OFF, CAMERA, SCREEN, or BOTH",
|
|
479
|
-
similes: [
|
|
480
|
-
"change vision to {mode}",
|
|
481
|
-
"set vision mode {mode}",
|
|
482
|
-
"switch to {mode} vision",
|
|
483
|
-
"turn vision {mode}",
|
|
484
|
-
"use {mode} vision",
|
|
485
|
-
"enable {mode} vision",
|
|
486
|
-
"disable vision"
|
|
487
|
-
],
|
|
488
|
-
validate: async (runtime, message, state, options) => {
|
|
489
|
-
const __avTextRaw = typeof message?.content?.text === "string" ? message.content.text : "";
|
|
490
|
-
const __avText = __avTextRaw.toLowerCase();
|
|
491
|
-
const __avLegacyContextOk = Boolean(runtime?.getService?.("VISION"));
|
|
492
|
-
const __avKeywords = ["set", "vision", "mode"];
|
|
493
|
-
const __avKeywordOk = __avKeywords.length > 0 && (__avKeywords.some((kw) => kw.length > 0 && __avText.includes(kw)) || __avLegacyContextOk);
|
|
494
|
-
const __avRegex = new RegExp("\\b(?:set|vision|mode)\\b", "i");
|
|
495
|
-
const __avRegexOk = __avRegex.test(__avText) || __avLegacyContextOk;
|
|
496
|
-
const __avSource = String(message?.content?.source ?? message?.source ?? "");
|
|
497
|
-
const __avExpectedSource = "";
|
|
498
|
-
const __avSourceOk = __avExpectedSource ? __avSource === __avExpectedSource : Boolean(__avSource || state || runtime?.agentId || runtime?.getService);
|
|
499
|
-
const __avOptions = options && typeof options === "object" ? options : {};
|
|
500
|
-
const __avInputOk = __avText.trim().length > 0 || Object.keys(__avOptions).length > 0 || Boolean(message?.content && typeof message.content === "object");
|
|
501
|
-
if (!(__avKeywordOk && __avRegexOk && __avSourceOk && __avInputOk)) {
|
|
502
|
-
return false;
|
|
503
|
-
}
|
|
504
|
-
const __avLegacyValidate = async (runtime2, _message, _state) => {
|
|
505
|
-
const visionService = runtime2.getService("VISION");
|
|
506
|
-
return visionService !== null;
|
|
507
|
-
};
|
|
508
|
-
try {
|
|
509
|
-
return Boolean(await __avLegacyValidate(runtime, message, state, options));
|
|
510
|
-
} catch {
|
|
511
|
-
return false;
|
|
512
|
-
}
|
|
513
|
-
},
|
|
514
|
-
handler: async (runtime, message, _state, _options, callback, _responses) => {
|
|
515
|
-
const visionService = runtime.getService("VISION");
|
|
516
|
-
if (!visionService) {
|
|
517
|
-
const thought = "Vision service is not available.";
|
|
518
|
-
const text = "I cannot change vision mode because the vision service is not available.";
|
|
519
|
-
await saveExecutionRecord(runtime, message, thought, text, ["SET_VISION_MODE"]);
|
|
733
|
+
const name = nameMatch[1];
|
|
734
|
+
const entityTracker = visionService.getEntityTracker();
|
|
735
|
+
await entityTracker.updateEntities(scene.objects.slice(0, MAX_VISION_ENTITIES), scene.people.slice(0, MAX_VISION_ENTITIES), undefined, runtime);
|
|
736
|
+
const activeEntities = entityTracker.getActiveEntities();
|
|
737
|
+
const people = activeEntities.filter((e) => e.entityType === "person");
|
|
738
|
+
if (people.length === 0) {
|
|
739
|
+
const thought = "No tracked people found.";
|
|
740
|
+
const text = "I can see someone but haven't established tracking yet. Please try again in a moment.";
|
|
741
|
+
await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
|
|
520
742
|
if (callback) {
|
|
521
|
-
await callback({
|
|
522
|
-
thought,
|
|
523
|
-
text,
|
|
524
|
-
actions: ["SET_VISION_MODE"]
|
|
525
|
-
});
|
|
743
|
+
await callback({ thought, text, actions: ["VISION"] });
|
|
526
744
|
}
|
|
527
745
|
return {
|
|
528
746
|
success: false,
|
|
529
|
-
text
|
|
747
|
+
text,
|
|
748
|
+
data: { actionName: "VISION", op: "name_entity" }
|
|
530
749
|
};
|
|
531
750
|
}
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
}
|
|
544
|
-
|
|
545
|
-
const thought2 = "Could not determine the desired vision mode from the message.";
|
|
546
|
-
const text2 = "Please specify the vision mode: OFF, CAMERA, SCREEN, or BOTH.";
|
|
547
|
-
await saveExecutionRecord(runtime, message, thought2, text2, ["SET_VISION_MODE"]);
|
|
548
|
-
if (callback) {
|
|
549
|
-
await callback({
|
|
550
|
-
thought: thought2,
|
|
551
|
-
text: text2,
|
|
552
|
-
actions: ["SET_VISION_MODE"]
|
|
553
|
-
});
|
|
554
|
-
}
|
|
555
|
-
return {
|
|
556
|
-
success: false,
|
|
557
|
-
text: text2
|
|
558
|
-
};
|
|
559
|
-
}
|
|
560
|
-
const currentMode = visionService.getVisionMode();
|
|
561
|
-
await visionService.setVisionMode(newMode);
|
|
562
|
-
const thought = `Changed vision mode from ${currentMode} to ${newMode}.`;
|
|
563
|
-
let text = "";
|
|
564
|
-
switch (newMode) {
|
|
565
|
-
case "OFF" /* OFF */:
|
|
566
|
-
text = "Vision has been disabled. I will no longer process visual input.";
|
|
567
|
-
break;
|
|
568
|
-
case "CAMERA" /* CAMERA */:
|
|
569
|
-
text = "Vision mode set to CAMERA only. I will process input from the camera.";
|
|
570
|
-
break;
|
|
571
|
-
case "SCREEN" /* SCREEN */:
|
|
572
|
-
text = "Vision mode set to SCREEN only. I will analyze what's on your screen.";
|
|
573
|
-
break;
|
|
574
|
-
case "BOTH" /* BOTH */:
|
|
575
|
-
text = "Vision mode set to BOTH. I will process input from both camera and screen.";
|
|
576
|
-
break;
|
|
577
|
-
}
|
|
578
|
-
await saveExecutionRecord(runtime, message, thought, text, ["SET_VISION_MODE"]);
|
|
751
|
+
let targetPerson = people[0];
|
|
752
|
+
if (people.length > 1) {
|
|
753
|
+
targetPerson = people.reduce((prev, curr) => {
|
|
754
|
+
const prevArea = prev.lastPosition.width * prev.lastPosition.height;
|
|
755
|
+
const currArea = curr.lastPosition.width * curr.lastPosition.height;
|
|
756
|
+
return currArea > prevArea ? curr : prev;
|
|
757
|
+
});
|
|
758
|
+
}
|
|
759
|
+
const success = entityTracker.assignNameToEntity(targetPerson.id, name);
|
|
760
|
+
if (success) {
|
|
761
|
+
const thought = `Named entity "${name}" and associated with person in scene.`;
|
|
762
|
+
const text = `I've identified the person as ${name}. I'll remember them for future interactions.`;
|
|
763
|
+
await saveExecutionRecord(runtime, message, thought, text, ["VISION"], undefined);
|
|
579
764
|
if (callback) {
|
|
580
765
|
await callback({
|
|
581
766
|
thought,
|
|
582
767
|
text,
|
|
583
|
-
actions: ["
|
|
768
|
+
actions: ["VISION"],
|
|
769
|
+
data: { entityId: targetPerson.id, name }
|
|
584
770
|
});
|
|
585
771
|
}
|
|
772
|
+
logger.info(`[VISION/name_entity] Assigned name "${name}" to entity ${targetPerson.id}`);
|
|
586
773
|
return {
|
|
587
774
|
success: true,
|
|
588
775
|
text,
|
|
589
|
-
values: {
|
|
590
|
-
|
|
776
|
+
values: { entityId: targetPerson.id, name },
|
|
777
|
+
data: {
|
|
778
|
+
actionName: "VISION",
|
|
779
|
+
op: "name_entity",
|
|
780
|
+
entityId: targetPerson.id,
|
|
781
|
+
name
|
|
591
782
|
}
|
|
592
783
|
};
|
|
593
|
-
}
|
|
594
|
-
|
|
595
|
-
const
|
|
596
|
-
|
|
597
|
-
const text = `Error changing vision mode: ${errorMessage}`;
|
|
598
|
-
await saveExecutionRecord(runtime, message, thought, text, ["SET_VISION_MODE"]);
|
|
784
|
+
} else {
|
|
785
|
+
const thought = "Failed to assign name to entity.";
|
|
786
|
+
const text = "There was an error assigning the name. Please try again.";
|
|
787
|
+
await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
|
|
599
788
|
if (callback) {
|
|
600
|
-
await callback({
|
|
601
|
-
thought,
|
|
602
|
-
text,
|
|
603
|
-
actions: ["SET_VISION_MODE"]
|
|
604
|
-
});
|
|
789
|
+
await callback({ thought, text, actions: ["VISION"] });
|
|
605
790
|
}
|
|
606
791
|
return {
|
|
607
792
|
success: false,
|
|
608
793
|
text,
|
|
609
|
-
|
|
794
|
+
data: { actionName: "VISION", op: "name_entity" }
|
|
610
795
|
};
|
|
611
796
|
}
|
|
612
|
-
}
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
actions: ["SET_VISION_MODE"],
|
|
620
|
-
thought: "The user wants to switch to screen vision mode.",
|
|
621
|
-
text: "Vision mode set to SCREEN only. I will analyze what's on your screen."
|
|
622
|
-
}
|
|
623
|
-
}
|
|
624
|
-
],
|
|
625
|
-
[
|
|
626
|
-
{ name: "user", content: { text: "enable both camera and screen vision" } },
|
|
627
|
-
{
|
|
628
|
-
name: "agent",
|
|
629
|
-
content: {
|
|
630
|
-
actions: ["SET_VISION_MODE"],
|
|
631
|
-
thought: "The user wants to enable both vision inputs.",
|
|
632
|
-
text: "Vision mode set to BOTH. I will process input from both camera and screen."
|
|
633
|
-
}
|
|
634
|
-
}
|
|
635
|
-
],
|
|
636
|
-
[
|
|
637
|
-
{ name: "user", content: { text: "turn off vision" } },
|
|
638
|
-
{
|
|
639
|
-
name: "agent",
|
|
640
|
-
content: {
|
|
641
|
-
actions: ["SET_VISION_MODE"],
|
|
642
|
-
thought: "The user wants to disable vision.",
|
|
643
|
-
text: "Vision has been disabled. I will no longer process visual input."
|
|
644
|
-
}
|
|
645
|
-
}
|
|
646
|
-
]
|
|
647
|
-
]
|
|
648
|
-
};
|
|
649
|
-
var nameEntityAction = {
|
|
650
|
-
name: "NAME_ENTITY",
|
|
651
|
-
description: "Assign a name to a person or object currently visible in the camera view",
|
|
652
|
-
similes: [
|
|
653
|
-
"call the person {name}",
|
|
654
|
-
"the person in front is {name}",
|
|
655
|
-
"name the person {name}",
|
|
656
|
-
"that person is {name}",
|
|
657
|
-
"the object is a {name}",
|
|
658
|
-
"call that {name}"
|
|
659
|
-
],
|
|
660
|
-
examples: [
|
|
661
|
-
[
|
|
662
|
-
{
|
|
663
|
-
name: "user",
|
|
664
|
-
content: {
|
|
665
|
-
text: "The person wearing the blue shirt is named Alice"
|
|
666
|
-
}
|
|
667
|
-
},
|
|
668
|
-
{
|
|
669
|
-
name: "agent",
|
|
670
|
-
content: {
|
|
671
|
-
text: "I've identified the person in the blue shirt as Alice. I'll remember them for future interactions.",
|
|
672
|
-
actions: ["NAME_ENTITY"]
|
|
673
|
-
}
|
|
674
|
-
}
|
|
675
|
-
],
|
|
676
|
-
[
|
|
677
|
-
{
|
|
678
|
-
name: "user",
|
|
679
|
-
content: {
|
|
680
|
-
text: "Call the person on the left Bob"
|
|
681
|
-
}
|
|
682
|
-
},
|
|
683
|
-
{
|
|
684
|
-
name: "agent",
|
|
685
|
-
content: {
|
|
686
|
-
text: "I've named the person on the left as Bob. Their face profile has been updated.",
|
|
687
|
-
actions: ["NAME_ENTITY"]
|
|
688
|
-
}
|
|
689
|
-
}
|
|
690
|
-
]
|
|
691
|
-
],
|
|
692
|
-
validate: async (runtime, message, state, options) => {
|
|
693
|
-
const __avTextRaw = typeof message?.content?.text === "string" ? message.content.text : "";
|
|
694
|
-
const __avText = __avTextRaw.toLowerCase();
|
|
695
|
-
const __avVisionService = runtime?.getService?.("VISION");
|
|
696
|
-
const __avLegacyContextOk = Boolean(__avVisionService && typeof __avVisionService.isActive === "function" && __avVisionService.isActive());
|
|
697
|
-
const __avKeywords = ["name", "entity"];
|
|
698
|
-
const __avKeywordOk = __avKeywords.length > 0 && (__avKeywords.some((kw) => kw.length > 0 && __avText.includes(kw)) || __avLegacyContextOk);
|
|
699
|
-
const __avRegex = new RegExp("\\b(?:name|entity)\\b", "i");
|
|
700
|
-
const __avRegexOk = __avRegex.test(__avText) || __avLegacyContextOk;
|
|
701
|
-
const __avSource = String(message?.content?.source ?? message?.source ?? "");
|
|
702
|
-
const __avExpectedSource = "";
|
|
703
|
-
const __avSourceOk = __avExpectedSource ? __avSource === __avExpectedSource : Boolean(__avSource || state || runtime?.agentId || runtime?.getService);
|
|
704
|
-
const __avOptions = options && typeof options === "object" ? options : {};
|
|
705
|
-
const __avInputOk = __avText.trim().length > 0 || Object.keys(__avOptions).length > 0 || Boolean(message?.content && typeof message.content === "object");
|
|
706
|
-
if (!(__avKeywordOk && __avRegexOk && __avSourceOk && __avInputOk)) {
|
|
707
|
-
return false;
|
|
708
|
-
}
|
|
709
|
-
const __avLegacyValidate = async (runtime2, _message, _state) => {
|
|
710
|
-
const visionService = runtime2.getService("VISION");
|
|
711
|
-
return visionService?.isActive() || false;
|
|
712
|
-
};
|
|
713
|
-
try {
|
|
714
|
-
return Boolean(await __avLegacyValidate(runtime, message, state, options));
|
|
715
|
-
} catch {
|
|
716
|
-
return false;
|
|
797
|
+
} catch (error) {
|
|
798
|
+
logger.error("[VISION/name_entity] Error:", error);
|
|
799
|
+
const thought = "Failed to name entity.";
|
|
800
|
+
const text = `Sorry, I couldn't name the entity: ${error instanceof Error ? error.message : "Unknown error"}`;
|
|
801
|
+
await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
|
|
802
|
+
if (callback) {
|
|
803
|
+
await callback({ thought, text, actions: ["VISION"] });
|
|
717
804
|
}
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
if (!scene || scene.people.length === 0) {
|
|
736
|
-
const thought = "No people visible to name.";
|
|
737
|
-
const text2 = "I don't see any people in the current scene to name.";
|
|
738
|
-
await saveExecutionRecord(runtime, message, thought, text2, ["NAME_ENTITY"]);
|
|
739
|
-
if (callback) {
|
|
740
|
-
await callback({ thought, text: text2, actions: ["NAME_ENTITY"] });
|
|
741
|
-
}
|
|
742
|
-
return {
|
|
743
|
-
success: false,
|
|
744
|
-
text: text2
|
|
745
|
-
};
|
|
746
|
-
}
|
|
747
|
-
const text = message.content.text?.toLowerCase() || "";
|
|
748
|
-
const nameMatch = text.match(/(?:named?|call(?:ed)?|is)\s+(\w+)/i);
|
|
749
|
-
if (!nameMatch) {
|
|
750
|
-
const thought = "Could not extract name from message.";
|
|
751
|
-
const text2 = `I couldn't understand what name to assign. Please say something like "The person is named Alice".`;
|
|
752
|
-
await saveExecutionRecord(runtime, message, thought, text2, ["NAME_ENTITY"]);
|
|
753
|
-
if (callback) {
|
|
754
|
-
await callback({ thought, text: text2, actions: ["NAME_ENTITY"] });
|
|
755
|
-
}
|
|
756
|
-
return {
|
|
757
|
-
success: false,
|
|
758
|
-
text: text2
|
|
759
|
-
};
|
|
760
|
-
}
|
|
761
|
-
const name = nameMatch[1];
|
|
762
|
-
const _worldId = message.worldId || "default-world";
|
|
763
|
-
const entityTracker = visionService.getEntityTracker();
|
|
764
|
-
await entityTracker.updateEntities(scene.objects, scene.people, undefined, runtime);
|
|
765
|
-
const activeEntities = entityTracker.getActiveEntities();
|
|
766
|
-
const people = activeEntities.filter((e) => e.entityType === "person");
|
|
767
|
-
if (people.length === 0) {
|
|
768
|
-
const thought = "No tracked people found.";
|
|
769
|
-
const text2 = "I can see someone but haven't established tracking yet. Please try again in a moment.";
|
|
770
|
-
await saveExecutionRecord(runtime, message, thought, text2, ["NAME_ENTITY"]);
|
|
771
|
-
if (callback) {
|
|
772
|
-
await callback({ thought, text: text2, actions: ["NAME_ENTITY"] });
|
|
773
|
-
}
|
|
774
|
-
return {
|
|
775
|
-
success: false,
|
|
776
|
-
text: text2
|
|
777
|
-
};
|
|
778
|
-
}
|
|
779
|
-
let targetPerson = people[0];
|
|
780
|
-
if (people.length > 1) {
|
|
781
|
-
targetPerson = people.reduce((prev, curr) => {
|
|
782
|
-
const prevArea = prev.lastPosition.width * prev.lastPosition.height;
|
|
783
|
-
const currArea = curr.lastPosition.width * curr.lastPosition.height;
|
|
784
|
-
return currArea > prevArea ? curr : prev;
|
|
785
|
-
});
|
|
805
|
+
return {
|
|
806
|
+
success: false,
|
|
807
|
+
text,
|
|
808
|
+
error: error instanceof Error ? error.message : String(error),
|
|
809
|
+
data: { actionName: "VISION", op: "name_entity" }
|
|
810
|
+
};
|
|
811
|
+
}
|
|
812
|
+
}
|
|
813
|
+
async function runIdentifyPerson(runtime, message, callback) {
|
|
814
|
+
try {
|
|
815
|
+
const visionService = runtime.getService("VISION");
|
|
816
|
+
if (!visionService) {
|
|
817
|
+
const thought2 = "Vision service is not available.";
|
|
818
|
+
const text2 = "I cannot identify people because the vision service is not available.";
|
|
819
|
+
await saveExecutionRecord(runtime, message, thought2, text2, ["VISION"]);
|
|
820
|
+
if (callback) {
|
|
821
|
+
await callback({ thought: thought2, text: text2, actions: ["VISION"] });
|
|
786
822
|
}
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
}
|
|
800
|
-
logger.info(`[NameEntityAction] Assigned name "${name}" to entity ${targetPerson.id}`);
|
|
801
|
-
return {
|
|
802
|
-
success: true,
|
|
803
|
-
text: text2,
|
|
804
|
-
values: {
|
|
805
|
-
entityId: targetPerson.id,
|
|
806
|
-
name
|
|
807
|
-
}
|
|
808
|
-
};
|
|
809
|
-
} else {
|
|
810
|
-
const thought = "Failed to assign name to entity.";
|
|
811
|
-
const text2 = "There was an error assigning the name. Please try again.";
|
|
812
|
-
await saveExecutionRecord(runtime, message, thought, text2, ["NAME_ENTITY"]);
|
|
813
|
-
if (callback) {
|
|
814
|
-
await callback({ thought, text: text2, actions: ["NAME_ENTITY"] });
|
|
815
|
-
}
|
|
816
|
-
return {
|
|
817
|
-
success: false,
|
|
818
|
-
text: text2
|
|
819
|
-
};
|
|
823
|
+
return {
|
|
824
|
+
success: false,
|
|
825
|
+
text: text2,
|
|
826
|
+
data: { actionName: "VISION", op: "identify_person" }
|
|
827
|
+
};
|
|
828
|
+
}
|
|
829
|
+
const scene = await withVisionTimeout(visionService.getSceneDescription(), "vision scene description");
|
|
830
|
+
if (!scene || scene.people.length === 0) {
|
|
831
|
+
const thought2 = "No people visible to identify.";
|
|
832
|
+
const text2 = "I don't see any people in the current scene.";
|
|
833
|
+
await saveExecutionRecord(runtime, message, thought2, text2, ["VISION"]);
|
|
834
|
+
if (callback) {
|
|
835
|
+
await callback({ thought: thought2, text: text2, actions: ["VISION"] });
|
|
820
836
|
}
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
837
|
+
return {
|
|
838
|
+
success: false,
|
|
839
|
+
text: text2,
|
|
840
|
+
data: { actionName: "VISION", op: "identify_person" }
|
|
841
|
+
};
|
|
842
|
+
}
|
|
843
|
+
const entityTracker = visionService.getEntityTracker();
|
|
844
|
+
await entityTracker.updateEntities(scene.objects.slice(0, MAX_VISION_ENTITIES), scene.people.slice(0, MAX_VISION_ENTITIES), undefined, runtime);
|
|
845
|
+
const activeEntities = entityTracker.getActiveEntities();
|
|
846
|
+
const people = activeEntities.filter((e) => e.entityType === "person");
|
|
847
|
+
if (people.length === 0) {
|
|
848
|
+
const thought2 = "No tracked people found.";
|
|
849
|
+
const text2 = "I can see someone but I'm still processing their identity.";
|
|
850
|
+
await saveExecutionRecord(runtime, message, thought2, text2, ["VISION"]);
|
|
826
851
|
if (callback) {
|
|
827
|
-
await callback({ thought, text, actions: ["
|
|
852
|
+
await callback({ thought: thought2, text: text2, actions: ["VISION"] });
|
|
828
853
|
}
|
|
829
854
|
return {
|
|
830
855
|
success: false,
|
|
831
|
-
text,
|
|
832
|
-
|
|
856
|
+
text: text2,
|
|
857
|
+
data: { actionName: "VISION", op: "identify_person" }
|
|
833
858
|
};
|
|
834
859
|
}
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
[
|
|
849
|
-
{
|
|
850
|
-
name: "user",
|
|
851
|
-
content: {
|
|
852
|
-
text: "Who is the person in front of you?"
|
|
860
|
+
let recognizedCount = 0;
|
|
861
|
+
let unknownCount = 0;
|
|
862
|
+
const identifications = [];
|
|
863
|
+
for (const person of people) {
|
|
864
|
+
const name = person.attributes.name;
|
|
865
|
+
const duration = Date.now() - person.firstSeen;
|
|
866
|
+
const durationStr = duration < 60000 ? `${Math.round(duration / 1000)} seconds` : `${Math.round(duration / 60000)} minutes`;
|
|
867
|
+
if (name) {
|
|
868
|
+
recognizedCount++;
|
|
869
|
+
const personInfo = `I can see ${name}. They've been here for ${durationStr}.`;
|
|
870
|
+
identifications.push(personInfo);
|
|
871
|
+
if (person.appearances.length > 5) {
|
|
872
|
+
identifications.push("I've been tracking them consistently.");
|
|
853
873
|
}
|
|
854
|
-
}
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
874
|
+
} else {
|
|
875
|
+
unknownCount++;
|
|
876
|
+
const personInfo = `I see an unidentified person who has been here for ${durationStr}.`;
|
|
877
|
+
identifications.push(personInfo);
|
|
878
|
+
if (person.attributes.faceId) {
|
|
879
|
+
identifications.push("I've captured their face profile but they haven't been named yet.");
|
|
860
880
|
}
|
|
861
881
|
}
|
|
862
|
-
]
|
|
863
|
-
],
|
|
864
|
-
validate: async (runtime, message, state, options) => {
|
|
865
|
-
const __avTextRaw = typeof message?.content?.text === "string" ? message.content.text : "";
|
|
866
|
-
const __avText = __avTextRaw.toLowerCase();
|
|
867
|
-
const __avVisionService = runtime?.getService?.("VISION");
|
|
868
|
-
const __avLegacyContextOk = Boolean(__avVisionService && typeof __avVisionService.isActive === "function" && __avVisionService.isActive());
|
|
869
|
-
const __avKeywords = ["identify", "person"];
|
|
870
|
-
const __avKeywordOk = __avKeywords.length > 0 && (__avKeywords.some((kw) => kw.length > 0 && __avText.includes(kw)) || __avLegacyContextOk);
|
|
871
|
-
const __avRegex = new RegExp("\\b(?:identify|person)\\b", "i");
|
|
872
|
-
const __avRegexOk = __avRegex.test(__avText) || __avLegacyContextOk;
|
|
873
|
-
const __avSource = String(message?.content?.source ?? message?.source ?? "");
|
|
874
|
-
const __avExpectedSource = "";
|
|
875
|
-
const __avSourceOk = __avExpectedSource ? __avSource === __avExpectedSource : Boolean(__avSource || state || runtime?.agentId || runtime?.getService);
|
|
876
|
-
const __avOptions = options && typeof options === "object" ? options : {};
|
|
877
|
-
const __avInputOk = __avText.trim().length > 0 || Object.keys(__avOptions).length > 0 || Boolean(message?.content && typeof message.content === "object");
|
|
878
|
-
if (!(__avKeywordOk && __avRegexOk && __avSourceOk && __avInputOk)) {
|
|
879
|
-
return false;
|
|
880
|
-
}
|
|
881
|
-
const __avLegacyValidate = async (runtime2, _message, _state) => {
|
|
882
|
-
const visionService = runtime2.getService("VISION");
|
|
883
|
-
return visionService?.isActive() || false;
|
|
884
|
-
};
|
|
885
|
-
try {
|
|
886
|
-
return Boolean(await __avLegacyValidate(runtime, message, state, options));
|
|
887
|
-
} catch {
|
|
888
|
-
return false;
|
|
889
882
|
}
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
await callback({ thought: thought2, text: text2, actions: ["IDENTIFY_PERSON"] });
|
|
900
|
-
}
|
|
901
|
-
return {
|
|
902
|
-
success: false,
|
|
903
|
-
text: text2
|
|
904
|
-
};
|
|
905
|
-
}
|
|
906
|
-
const scene = await visionService.getSceneDescription();
|
|
907
|
-
if (!scene || scene.people.length === 0) {
|
|
908
|
-
const thought2 = "No people visible to identify.";
|
|
909
|
-
const text2 = "I don't see any people in the current scene.";
|
|
910
|
-
await saveExecutionRecord(runtime, message, thought2, text2, ["IDENTIFY_PERSON"]);
|
|
911
|
-
if (callback) {
|
|
912
|
-
await callback({ thought: thought2, text: text2, actions: ["IDENTIFY_PERSON"] });
|
|
913
|
-
}
|
|
914
|
-
return {
|
|
915
|
-
success: false,
|
|
916
|
-
text: text2
|
|
917
|
-
};
|
|
918
|
-
}
|
|
919
|
-
const _worldId = message.worldId || "default-world";
|
|
920
|
-
const entityTracker = visionService.getEntityTracker();
|
|
921
|
-
await entityTracker.updateEntities(scene.objects, scene.people, undefined, runtime);
|
|
922
|
-
const activeEntities = entityTracker.getActiveEntities();
|
|
923
|
-
const people = activeEntities.filter((e) => e.entityType === "person");
|
|
924
|
-
if (people.length === 0) {
|
|
925
|
-
const thought2 = "No tracked people found.";
|
|
926
|
-
const text2 = "I can see someone but I'm still processing their identity.";
|
|
927
|
-
await saveExecutionRecord(runtime, message, thought2, text2, ["IDENTIFY_PERSON"]);
|
|
928
|
-
if (callback) {
|
|
929
|
-
await callback({ thought: thought2, text: text2, actions: ["IDENTIFY_PERSON"] });
|
|
930
|
-
}
|
|
931
|
-
return {
|
|
932
|
-
success: false,
|
|
933
|
-
text: text2
|
|
934
|
-
};
|
|
935
|
-
}
|
|
936
|
-
const _responseText = "";
|
|
937
|
-
let recognizedCount = 0;
|
|
938
|
-
let unknownCount = 0;
|
|
939
|
-
const identifications = [];
|
|
940
|
-
for (const person of people) {
|
|
941
|
-
const name = person.attributes.name;
|
|
942
|
-
const duration = Date.now() - person.firstSeen;
|
|
943
|
-
const durationStr = duration < 60000 ? `${Math.round(duration / 1000)} seconds` : `${Math.round(duration / 60000)} minutes`;
|
|
944
|
-
if (name) {
|
|
945
|
-
recognizedCount++;
|
|
946
|
-
const personInfo = `I can see ${name}. They've been here for ${durationStr}.`;
|
|
947
|
-
identifications.push(personInfo);
|
|
948
|
-
if (person.appearances.length > 5) {
|
|
949
|
-
identifications.push("I've been tracking them consistently.");
|
|
950
|
-
}
|
|
951
|
-
} else {
|
|
952
|
-
unknownCount++;
|
|
953
|
-
const personInfo = `I see an unidentified person who has been here for ${durationStr}.`;
|
|
954
|
-
identifications.push(personInfo);
|
|
955
|
-
if (person.attributes.faceId) {
|
|
956
|
-
identifications.push("I've captured their face profile but they haven't been named yet.");
|
|
957
|
-
}
|
|
883
|
+
const recentlyLeft = entityTracker.getRecentlyLeft();
|
|
884
|
+
if (recentlyLeft.length > 0) {
|
|
885
|
+
identifications.push(`
|
|
886
|
+
Recently departed:`);
|
|
887
|
+
for (const { entity, leftAt } of recentlyLeft) {
|
|
888
|
+
if (entity.entityType === "person" && entity.attributes.name) {
|
|
889
|
+
const timeAgo = Date.now() - leftAt;
|
|
890
|
+
const timeStr = timeAgo < 60000 ? `${Math.round(timeAgo / 1000)} seconds ago` : `${Math.round(timeAgo / 60000)} minutes ago`;
|
|
891
|
+
identifications.push(`${entity.attributes.name} left ${timeStr}.`);
|
|
958
892
|
}
|
|
959
893
|
}
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
894
|
+
}
|
|
895
|
+
const thought = `Identified ${recognizedCount} known people and ${unknownCount} unknown people.`;
|
|
896
|
+
const text = identifications.join(" ");
|
|
897
|
+
await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
|
|
898
|
+
if (callback) {
|
|
899
|
+
await callback({
|
|
900
|
+
thought,
|
|
901
|
+
text,
|
|
902
|
+
actions: ["VISION"],
|
|
903
|
+
data: {
|
|
904
|
+
identifications: people.slice(0, MAX_VISION_ENTITIES).map((p) => ({
|
|
905
|
+
id: p.id,
|
|
906
|
+
entityType: p.entityType,
|
|
907
|
+
name: p.attributes.name || undefined
|
|
908
|
+
}))
|
|
970
909
|
}
|
|
910
|
+
});
|
|
911
|
+
}
|
|
912
|
+
return {
|
|
913
|
+
success: true,
|
|
914
|
+
text,
|
|
915
|
+
values: { recognizedCount, unknownCount },
|
|
916
|
+
data: {
|
|
917
|
+
actionName: "VISION",
|
|
918
|
+
op: "identify_person",
|
|
919
|
+
recognizedCount,
|
|
920
|
+
unknownCount
|
|
971
921
|
}
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
922
|
+
};
|
|
923
|
+
} catch (error) {
|
|
924
|
+
logger.error("[VISION/identify_person] Error:", error);
|
|
925
|
+
const thought = "Failed to identify people.";
|
|
926
|
+
const text = `Sorry, I couldn't identify people: ${error instanceof Error ? error.message : "Unknown error"}`;
|
|
927
|
+
await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
|
|
928
|
+
if (callback) {
|
|
929
|
+
await callback({ thought, text, actions: ["VISION"] });
|
|
930
|
+
}
|
|
931
|
+
return {
|
|
932
|
+
success: false,
|
|
933
|
+
text,
|
|
934
|
+
error: error instanceof Error ? error.message : String(error),
|
|
935
|
+
data: { actionName: "VISION", op: "identify_person" }
|
|
936
|
+
};
|
|
937
|
+
}
|
|
938
|
+
}
|
|
939
|
+
async function runTrackEntity(runtime, message, callback) {
|
|
940
|
+
try {
|
|
941
|
+
const visionService = runtime.getService("VISION");
|
|
942
|
+
if (!visionService) {
|
|
943
|
+
const thought2 = "Vision service is not available.";
|
|
944
|
+
const text = "I cannot track entities because the vision service is not available.";
|
|
945
|
+
await saveExecutionRecord(runtime, message, thought2, text, ["VISION"]);
|
|
946
|
+
if (callback) {
|
|
947
|
+
await callback({ thought: thought2, text, actions: ["VISION"] });
|
|
988
948
|
}
|
|
989
949
|
return {
|
|
990
|
-
success:
|
|
950
|
+
success: false,
|
|
991
951
|
text,
|
|
992
|
-
|
|
993
|
-
recognizedCount,
|
|
994
|
-
unknownCount
|
|
995
|
-
}
|
|
952
|
+
data: { actionName: "VISION", op: "track_entity" }
|
|
996
953
|
};
|
|
997
|
-
}
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
const
|
|
1001
|
-
|
|
954
|
+
}
|
|
955
|
+
const scene = await withVisionTimeout(visionService.getSceneDescription(), "vision scene description");
|
|
956
|
+
if (!scene) {
|
|
957
|
+
const thought2 = "No scene available for tracking.";
|
|
958
|
+
const text = "I need a moment to process the visual scene before I can track entities.";
|
|
959
|
+
await saveExecutionRecord(runtime, message, thought2, text, ["VISION"]);
|
|
1002
960
|
if (callback) {
|
|
1003
|
-
await callback({ thought, text, actions: ["
|
|
961
|
+
await callback({ thought: thought2, text, actions: ["VISION"] });
|
|
1004
962
|
}
|
|
1005
963
|
return {
|
|
1006
964
|
success: false,
|
|
1007
965
|
text,
|
|
1008
|
-
|
|
966
|
+
data: { actionName: "VISION", op: "track_entity" }
|
|
1009
967
|
};
|
|
1010
968
|
}
|
|
969
|
+
const entityTracker = visionService.getEntityTracker();
|
|
970
|
+
await entityTracker.updateEntities(scene.objects.slice(0, MAX_VISION_ENTITIES), scene.people.slice(0, MAX_VISION_ENTITIES), undefined, runtime);
|
|
971
|
+
const stats = entityTracker.getStatistics();
|
|
972
|
+
const thought = `Tracking ${stats.activeEntities} entities in the scene.`;
|
|
973
|
+
const summary = [
|
|
974
|
+
`I'm now tracking ${stats.activeEntities} entities in the scene`,
|
|
975
|
+
`(${stats.people} people, ${stats.objects} objects).`,
|
|
976
|
+
"The visual tracking system will maintain persistent IDs for all entities",
|
|
977
|
+
"and notify you of significant changes."
|
|
978
|
+
];
|
|
979
|
+
const responseText = summary.join(" ");
|
|
980
|
+
await saveExecutionRecord(runtime, message, thought, responseText, [
|
|
981
|
+
"VISION"
|
|
982
|
+
]);
|
|
983
|
+
if (callback) {
|
|
984
|
+
await callback({
|
|
985
|
+
thought,
|
|
986
|
+
text: responseText,
|
|
987
|
+
actions: ["VISION"],
|
|
988
|
+
data: { entities: stats.activeEntities }
|
|
989
|
+
});
|
|
990
|
+
}
|
|
991
|
+
logger.info(`[VISION/track_entity] Tracking ${stats.activeEntities} entities`);
|
|
992
|
+
return {
|
|
993
|
+
success: true,
|
|
994
|
+
text: responseText,
|
|
995
|
+
values: {
|
|
996
|
+
activeEntities: stats.activeEntities,
|
|
997
|
+
people: stats.people,
|
|
998
|
+
objects: stats.objects
|
|
999
|
+
},
|
|
1000
|
+
data: {
|
|
1001
|
+
actionName: "VISION",
|
|
1002
|
+
op: "track_entity",
|
|
1003
|
+
activeEntities: stats.activeEntities,
|
|
1004
|
+
people: stats.people,
|
|
1005
|
+
objects: stats.objects
|
|
1006
|
+
}
|
|
1007
|
+
};
|
|
1008
|
+
} catch (error) {
|
|
1009
|
+
logger.error("[VISION/track_entity] Error:", error);
|
|
1010
|
+
const thought = "Failed to track entities.";
|
|
1011
|
+
const text = `Sorry, I couldn't track entities: ${error instanceof Error ? error.message : "Unknown error"}`;
|
|
1012
|
+
await saveExecutionRecord(runtime, message, thought, text, ["VISION"]);
|
|
1013
|
+
if (callback) {
|
|
1014
|
+
await callback({ thought, text, actions: ["VISION"] });
|
|
1015
|
+
}
|
|
1016
|
+
return {
|
|
1017
|
+
success: false,
|
|
1018
|
+
text,
|
|
1019
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1020
|
+
data: { actionName: "VISION", op: "track_entity" }
|
|
1021
|
+
};
|
|
1011
1022
|
}
|
|
1012
|
-
}
|
|
1013
|
-
var
|
|
1014
|
-
name: "
|
|
1015
|
-
|
|
1023
|
+
}
|
|
1024
|
+
var visionAction = {
|
|
1025
|
+
name: "VISION",
|
|
1026
|
+
contexts: [...ALL_VISION_CONTEXTS],
|
|
1027
|
+
contextGate: { anyOf: [...ALL_VISION_CONTEXTS] },
|
|
1028
|
+
roleGate: { minRole: "USER" },
|
|
1016
1029
|
similes: [
|
|
1017
|
-
"
|
|
1018
|
-
"
|
|
1019
|
-
"
|
|
1020
|
-
"
|
|
1030
|
+
"DESCRIBE_SCENE",
|
|
1031
|
+
"CAPTURE_IMAGE",
|
|
1032
|
+
"SET_VISION_MODE",
|
|
1033
|
+
"NAME_ENTITY",
|
|
1034
|
+
"IDENTIFY_PERSON",
|
|
1035
|
+
"TRACK_ENTITY",
|
|
1036
|
+
"ANALYZE_SCENE",
|
|
1037
|
+
"WHAT_DO_YOU_SEE",
|
|
1038
|
+
"VISION_CHECK",
|
|
1039
|
+
"LOOK_AROUND",
|
|
1040
|
+
"TAKE_PHOTO",
|
|
1041
|
+
"SCREENSHOT",
|
|
1042
|
+
"CAPTURE_FRAME",
|
|
1043
|
+
"TAKE_PICTURE"
|
|
1044
|
+
],
|
|
1045
|
+
description: "Camera and screen vision: describe the current scene, capture an image, switch vision mode (off/camera/screen/both), name a visible entity, identify a person, or start tracking an entity. The op is inferred from the message text when not explicitly provided.",
|
|
1046
|
+
descriptionCompressed: "Vision: describe / capture / set_mode / name_entity / identify_person / track_entity.",
|
|
1047
|
+
parameters: [
|
|
1048
|
+
{
|
|
1049
|
+
name: "subaction",
|
|
1050
|
+
description: "Operation to perform: describe, capture, set_mode, name_entity, identify_person, or track_entity. Inferred from message text when omitted.",
|
|
1051
|
+
required: false,
|
|
1052
|
+
schema: { type: "string", enum: [...VISION_OPS] }
|
|
1053
|
+
},
|
|
1054
|
+
{
|
|
1055
|
+
name: "detailLevel",
|
|
1056
|
+
description: "For op=describe: 'summary' to omit object/person breakdowns, 'detailed' for the full breakdown.",
|
|
1057
|
+
required: false,
|
|
1058
|
+
schema: {
|
|
1059
|
+
type: "string",
|
|
1060
|
+
enum: ["summary", "detailed"],
|
|
1061
|
+
default: "detailed"
|
|
1062
|
+
}
|
|
1063
|
+
},
|
|
1064
|
+
{
|
|
1065
|
+
name: "mode",
|
|
1066
|
+
description: "For op=set_mode: vision mode to set: off, camera, screen, or both.",
|
|
1067
|
+
required: false,
|
|
1068
|
+
schema: { type: "string", enum: ["off", "camera", "screen", "both"] }
|
|
1069
|
+
},
|
|
1070
|
+
{
|
|
1071
|
+
name: "name",
|
|
1072
|
+
description: "For op=name_entity: the name to assign to the most relevant visible person or object.",
|
|
1073
|
+
required: false,
|
|
1074
|
+
schema: { type: "string" }
|
|
1075
|
+
},
|
|
1076
|
+
{
|
|
1077
|
+
name: "targetHint",
|
|
1078
|
+
description: "For op=name_entity or op=identify_person: optional phrase describing which visible entity to focus on.",
|
|
1079
|
+
required: false,
|
|
1080
|
+
schema: { type: "string" }
|
|
1081
|
+
},
|
|
1082
|
+
{
|
|
1083
|
+
name: "description",
|
|
1084
|
+
description: "For op=track_entity: optional description of the visible entity to prioritize for tracking.",
|
|
1085
|
+
required: false,
|
|
1086
|
+
schema: { type: "string" }
|
|
1087
|
+
},
|
|
1088
|
+
{
|
|
1089
|
+
name: "includeUnknown",
|
|
1090
|
+
description: "For op=identify_person: whether to mention unidentified people in the response.",
|
|
1091
|
+
required: false,
|
|
1092
|
+
schema: { type: "boolean", default: true }
|
|
1093
|
+
}
|
|
1021
1094
|
],
|
|
1095
|
+
validate: async (runtime, _message, state, options) => {
|
|
1096
|
+
if (!visionServiceIsActive(runtime)) {
|
|
1097
|
+
const visionService = runtime.getService("VISION");
|
|
1098
|
+
if (!visionService)
|
|
1099
|
+
return false;
|
|
1100
|
+
}
|
|
1101
|
+
const params = readActionParams(options);
|
|
1102
|
+
return selectedContextMatches(state, ALL_VISION_CONTEXTS) || typeof params.op === "string";
|
|
1103
|
+
},
|
|
1104
|
+
handler: async (runtime, message, _state, _options, callback, _responses) => {
|
|
1105
|
+
const params = readActionParams(_options);
|
|
1106
|
+
const explicitOp = normalizeOp(params.op ?? params.subaction);
|
|
1107
|
+
const inferredOp = explicitOp ?? inferOpFromMessage(message.content?.text ?? "");
|
|
1108
|
+
if (!inferredOp) {
|
|
1109
|
+
const text = `VISION could not determine the operation. Specify one of: ${VISION_OPS.join(", ")}.`;
|
|
1110
|
+
if (callback) {
|
|
1111
|
+
await callback({ text, actions: ["VISION"] });
|
|
1112
|
+
}
|
|
1113
|
+
return {
|
|
1114
|
+
success: false,
|
|
1115
|
+
text,
|
|
1116
|
+
values: { error: "MISSING" },
|
|
1117
|
+
data: {
|
|
1118
|
+
actionName: "VISION",
|
|
1119
|
+
availableOps: VISION_OPS
|
|
1120
|
+
}
|
|
1121
|
+
};
|
|
1122
|
+
}
|
|
1123
|
+
switch (inferredOp) {
|
|
1124
|
+
case "describe":
|
|
1125
|
+
return runDescribe(runtime, message, params, callback);
|
|
1126
|
+
case "capture":
|
|
1127
|
+
return runCapture(runtime, message, callback);
|
|
1128
|
+
case "set_mode":
|
|
1129
|
+
return runSetMode(runtime, message, params, callback);
|
|
1130
|
+
case "name_entity":
|
|
1131
|
+
return runNameEntity(runtime, message, params, callback);
|
|
1132
|
+
case "identify_person":
|
|
1133
|
+
return runIdentifyPerson(runtime, message, callback);
|
|
1134
|
+
case "track_entity":
|
|
1135
|
+
return runTrackEntity(runtime, message, callback);
|
|
1136
|
+
}
|
|
1137
|
+
},
|
|
1022
1138
|
examples: [
|
|
1023
1139
|
[
|
|
1140
|
+
{ name: "{{user}}", content: { text: "what do you see?" } },
|
|
1024
1141
|
{
|
|
1025
|
-
name: "
|
|
1142
|
+
name: "{{agent}}",
|
|
1026
1143
|
content: {
|
|
1027
|
-
|
|
1144
|
+
actions: ["VISION"],
|
|
1145
|
+
thought: "The user wants to know what I can see through my camera.",
|
|
1146
|
+
text: "I see a room with a desk and computer setup. There are 2 people, one is sitting and one is standing."
|
|
1028
1147
|
}
|
|
1029
|
-
}
|
|
1148
|
+
}
|
|
1149
|
+
],
|
|
1150
|
+
[
|
|
1151
|
+
{ name: "{{user}}", content: { text: "take a photo" } },
|
|
1030
1152
|
{
|
|
1031
|
-
name: "agent",
|
|
1153
|
+
name: "{{agent}}",
|
|
1032
1154
|
content: {
|
|
1033
|
-
|
|
1034
|
-
|
|
1155
|
+
actions: ["VISION"],
|
|
1156
|
+
thought: "The user wants me to capture an image from the camera.",
|
|
1157
|
+
text: "I've captured an image from the camera."
|
|
1035
1158
|
}
|
|
1036
1159
|
}
|
|
1037
|
-
]
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
const __avRegexOk = __avRegex.test(__avText) || __avLegacyContextOk;
|
|
1048
|
-
const __avSource = String(message?.content?.source ?? message?.source ?? "");
|
|
1049
|
-
const __avExpectedSource = "";
|
|
1050
|
-
const __avSourceOk = __avExpectedSource ? __avSource === __avExpectedSource : Boolean(__avSource || state || runtime?.agentId || runtime?.getService);
|
|
1051
|
-
const __avOptions = options && typeof options === "object" ? options : {};
|
|
1052
|
-
const __avInputOk = __avText.trim().length > 0 || Object.keys(__avOptions).length > 0 || Boolean(message?.content && typeof message.content === "object");
|
|
1053
|
-
if (!(__avKeywordOk && __avRegexOk && __avSourceOk && __avInputOk)) {
|
|
1054
|
-
return false;
|
|
1055
|
-
}
|
|
1056
|
-
const __avLegacyValidate = async (runtime2, _message, _state) => {
|
|
1057
|
-
const visionService = runtime2.getService("VISION");
|
|
1058
|
-
return visionService?.isActive() || false;
|
|
1059
|
-
};
|
|
1060
|
-
try {
|
|
1061
|
-
return Boolean(await __avLegacyValidate(runtime, message, state, options));
|
|
1062
|
-
} catch {
|
|
1063
|
-
return false;
|
|
1064
|
-
}
|
|
1065
|
-
},
|
|
1066
|
-
handler: async (runtime, message, _state, _options, callback) => {
|
|
1067
|
-
try {
|
|
1068
|
-
const visionService = runtime.getService("VISION");
|
|
1069
|
-
if (!visionService) {
|
|
1070
|
-
const thought2 = "Vision service is not available.";
|
|
1071
|
-
const text = "I cannot track entities because the vision service is not available.";
|
|
1072
|
-
await saveExecutionRecord(runtime, message, thought2, text, ["TRACK_ENTITY"]);
|
|
1073
|
-
if (callback) {
|
|
1074
|
-
await callback({ thought: thought2, text, actions: ["TRACK_ENTITY"] });
|
|
1075
|
-
}
|
|
1076
|
-
return {
|
|
1077
|
-
success: false,
|
|
1078
|
-
text
|
|
1079
|
-
};
|
|
1160
|
+
],
|
|
1161
|
+
[
|
|
1162
|
+
{ name: "{{user}}", content: { text: "set vision mode to screen" } },
|
|
1163
|
+
{
|
|
1164
|
+
name: "{{agent}}",
|
|
1165
|
+
content: {
|
|
1166
|
+
actions: ["VISION"],
|
|
1167
|
+
thought: "The user wants to switch to screen vision mode.",
|
|
1168
|
+
text: "Vision mode set to SCREEN only. I will analyze what's on your screen."
|
|
1169
|
+
}
|
|
1080
1170
|
}
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1171
|
+
],
|
|
1172
|
+
[
|
|
1173
|
+
{
|
|
1174
|
+
name: "{{user}}",
|
|
1175
|
+
content: { text: "the person wearing the blue shirt is named Alice" }
|
|
1176
|
+
},
|
|
1177
|
+
{
|
|
1178
|
+
name: "{{agent}}",
|
|
1179
|
+
content: {
|
|
1180
|
+
actions: ["VISION"],
|
|
1181
|
+
text: "I've identified the person in the blue shirt as Alice. I'll remember them for future interactions."
|
|
1088
1182
|
}
|
|
1089
|
-
return {
|
|
1090
|
-
success: false,
|
|
1091
|
-
text
|
|
1092
|
-
};
|
|
1093
1183
|
}
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
const responseText = summary.join(" ");
|
|
1107
|
-
await saveExecutionRecord(runtime, message, thought, responseText, ["TRACK_ENTITY"]);
|
|
1108
|
-
if (callback) {
|
|
1109
|
-
await callback({
|
|
1110
|
-
thought,
|
|
1111
|
-
text: responseText,
|
|
1112
|
-
actions: ["TRACK_ENTITY"],
|
|
1113
|
-
data: { entities: stats.activeEntities }
|
|
1114
|
-
});
|
|
1184
|
+
],
|
|
1185
|
+
[
|
|
1186
|
+
{
|
|
1187
|
+
name: "{{user}}",
|
|
1188
|
+
content: { text: "who is the person in front of you?" }
|
|
1189
|
+
},
|
|
1190
|
+
{
|
|
1191
|
+
name: "{{agent}}",
|
|
1192
|
+
content: {
|
|
1193
|
+
actions: ["VISION"],
|
|
1194
|
+
text: "That's Alice. I last saw her about 5 minutes ago."
|
|
1195
|
+
}
|
|
1115
1196
|
}
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1197
|
+
],
|
|
1198
|
+
[
|
|
1199
|
+
{
|
|
1200
|
+
name: "{{user}}",
|
|
1201
|
+
content: { text: "track the person wearing the red shirt" }
|
|
1202
|
+
},
|
|
1203
|
+
{
|
|
1204
|
+
name: "{{agent}}",
|
|
1205
|
+
content: {
|
|
1206
|
+
actions: ["VISION"],
|
|
1207
|
+
text: "I'm now tracking the person in the red shirt."
|
|
1124
1208
|
}
|
|
1125
|
-
};
|
|
1126
|
-
} catch (error) {
|
|
1127
|
-
logger.error("[trackEntityAction] Error:", error);
|
|
1128
|
-
const thought = "Failed to track entities.";
|
|
1129
|
-
const text = `Sorry, I couldn't track entities: ${error instanceof Error ? error.message : "Unknown error"}`;
|
|
1130
|
-
await saveExecutionRecord(runtime, message, thought, text, ["TRACK_ENTITY"]);
|
|
1131
|
-
if (callback) {
|
|
1132
|
-
await callback({ thought, text, actions: ["TRACK_ENTITY"] });
|
|
1133
1209
|
}
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
text,
|
|
1137
|
-
error: error instanceof Error ? error.message : String(error)
|
|
1138
|
-
};
|
|
1139
|
-
}
|
|
1140
|
-
}
|
|
1210
|
+
]
|
|
1211
|
+
]
|
|
1141
1212
|
};
|
|
1142
1213
|
|
|
1143
1214
|
// src/provider.ts
|
|
1144
1215
|
import {
|
|
1145
|
-
addHeader,
|
|
1146
1216
|
logger as logger2
|
|
1147
1217
|
} from "@elizaos/core";
|
|
1218
|
+
var MAX_VISION_OBJECTS_IN_STATE = 50;
|
|
1219
|
+
var MAX_VISION_PEOPLE_IN_STATE = 25;
|
|
1220
|
+
var MAX_TRACKED_ENTITIES_IN_STATE = 25;
|
|
1148
1221
|
var visionProvider = {
|
|
1149
1222
|
name: "VISION_PERCEPTION",
|
|
1150
1223
|
description: "Provides current visual perception data including scene description, detected objects, people, and entity tracking. This provider is always active and provides real-time visual awareness.",
|
|
1151
1224
|
position: 99,
|
|
1225
|
+
contexts: ["media", "browser"],
|
|
1226
|
+
contextGate: { anyOf: ["media", "browser"] },
|
|
1227
|
+
cacheStable: false,
|
|
1228
|
+
cacheScope: "turn",
|
|
1152
1229
|
dynamic: false,
|
|
1153
1230
|
get: async (runtime, message, _state) => {
|
|
1154
1231
|
const visionService = runtime.getService("VISION");
|
|
@@ -1160,201 +1237,231 @@ var visionProvider = {
|
|
|
1160
1237
|
sceneDescription: "Vision service is not available.",
|
|
1161
1238
|
cameraStatus: "No camera connected"
|
|
1162
1239
|
},
|
|
1163
|
-
text:
|
|
1240
|
+
text: JSON.stringify({
|
|
1241
|
+
visual_perception: {
|
|
1242
|
+
visionAvailable: false,
|
|
1243
|
+
sceneDescription: "Vision service is not available.",
|
|
1244
|
+
cameraStatus: "No camera connected"
|
|
1245
|
+
}
|
|
1246
|
+
}, null, 2),
|
|
1164
1247
|
data: { hasVision: false }
|
|
1165
1248
|
};
|
|
1166
1249
|
}
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1250
|
+
try {
|
|
1251
|
+
const sceneDescription = await visionService.getEnhancedSceneDescription() || await visionService.getSceneDescription();
|
|
1252
|
+
const cameraInfo = visionService.getCameraInfo();
|
|
1253
|
+
const isActive = visionService.isActive();
|
|
1254
|
+
const visionMode = visionService.getVisionMode();
|
|
1255
|
+
const screenCapture = await visionService.getScreenCapture();
|
|
1256
|
+
const _worldId = message.worldId || "default-world";
|
|
1257
|
+
const entityTracker = visionService.getEntityTracker();
|
|
1258
|
+
let entityData = null;
|
|
1259
|
+
if (sceneDescription && entityTracker) {
|
|
1260
|
+
await entityTracker.updateEntities(sceneDescription.objects, sceneDescription.people, undefined, runtime);
|
|
1261
|
+
const activeEntities = entityTracker.getActiveEntities();
|
|
1262
|
+
const recentlyLeft = entityTracker.getRecentlyLeft();
|
|
1263
|
+
const stats = entityTracker.getStatistics();
|
|
1264
|
+
entityData = {
|
|
1265
|
+
activeEntities: activeEntities.slice(0, MAX_TRACKED_ENTITIES_IN_STATE).map((e) => ({
|
|
1266
|
+
id: e.id,
|
|
1267
|
+
type: e.entityType,
|
|
1268
|
+
name: e.attributes.name,
|
|
1269
|
+
firstSeen: e.firstSeen,
|
|
1270
|
+
duration: Date.now() - e.firstSeen,
|
|
1271
|
+
position: e.lastPosition,
|
|
1272
|
+
attributes: e.attributes
|
|
1273
|
+
})),
|
|
1274
|
+
recentlyLeft: recentlyLeft.slice(0, MAX_TRACKED_ENTITIES_IN_STATE).map(({ entity, leftAt }) => ({
|
|
1275
|
+
id: entity.id,
|
|
1276
|
+
name: entity.attributes.name,
|
|
1277
|
+
leftAt,
|
|
1278
|
+
timeAgo: Date.now() - leftAt
|
|
1279
|
+
})),
|
|
1280
|
+
statistics: stats
|
|
1281
|
+
};
|
|
1282
|
+
}
|
|
1283
|
+
let perceptionText = "";
|
|
1284
|
+
let values = {};
|
|
1285
|
+
let data = {};
|
|
1286
|
+
if (!isActive) {
|
|
1287
|
+
perceptionText = `Vision mode: ${visionMode}
|
|
1204
1288
|
`;
|
|
1205
|
-
|
|
1206
|
-
|
|
1289
|
+
if (visionMode === "OFF") {
|
|
1290
|
+
perceptionText += "Vision is disabled.";
|
|
1291
|
+
} else {
|
|
1292
|
+
perceptionText += "Vision service is initializing...";
|
|
1293
|
+
}
|
|
1294
|
+
values = {
|
|
1295
|
+
visionAvailable: false,
|
|
1296
|
+
visionMode,
|
|
1297
|
+
sceneDescription: "Vision not active",
|
|
1298
|
+
cameraStatus: cameraInfo ? `Camera "${cameraInfo.name}" detected but not active` : "No camera"
|
|
1299
|
+
};
|
|
1207
1300
|
} else {
|
|
1208
|
-
perceptionText
|
|
1209
|
-
}
|
|
1210
|
-
values = {
|
|
1211
|
-
visionAvailable: false,
|
|
1212
|
-
visionMode,
|
|
1213
|
-
sceneDescription: "Vision not active",
|
|
1214
|
-
cameraStatus: cameraInfo ? `Camera "${cameraInfo.name}" detected but not active` : "No camera"
|
|
1215
|
-
};
|
|
1216
|
-
} else {
|
|
1217
|
-
perceptionText = `Vision mode: ${visionMode}
|
|
1301
|
+
perceptionText = `Vision mode: ${visionMode}
|
|
1218
1302
|
|
|
1219
1303
|
`;
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1304
|
+
if ((visionMode === "CAMERA" || visionMode === "BOTH") && sceneDescription) {
|
|
1305
|
+
const ageInSeconds = (Date.now() - sceneDescription.timestamp) / 1000;
|
|
1306
|
+
const secondsAgo = Math.round(ageInSeconds);
|
|
1307
|
+
perceptionText += `Camera view (${secondsAgo}s ago):
|
|
1224
1308
|
${sceneDescription.description}`;
|
|
1225
|
-
|
|
1226
|
-
|
|
1309
|
+
if (sceneDescription.people.length > 0) {
|
|
1310
|
+
perceptionText += `
|
|
1227
1311
|
|
|
1228
1312
|
People detected: ${sceneDescription.people.length}`;
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1313
|
+
const poses = sceneDescription.people.map((p) => p.pose).filter((p) => p !== "unknown");
|
|
1314
|
+
const facings = sceneDescription.people.map((p) => p.facing).filter((f) => f !== "unknown");
|
|
1315
|
+
if (poses.length > 0) {
|
|
1316
|
+
const poseCounts = poses.reduce((acc, pose) => {
|
|
1317
|
+
acc[pose] = (acc[pose] || 0) + 1;
|
|
1318
|
+
return acc;
|
|
1319
|
+
}, {});
|
|
1320
|
+
perceptionText += `
|
|
1237
1321
|
Poses: ${Object.entries(poseCounts).map(([pose, count]) => `${count} ${pose}`).join(", ")}`;
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1322
|
+
}
|
|
1323
|
+
if (facings.length > 0) {
|
|
1324
|
+
const facingCounts = facings.reduce((acc, facing) => {
|
|
1325
|
+
acc[facing] = (acc[facing] || 0) + 1;
|
|
1326
|
+
return acc;
|
|
1327
|
+
}, {});
|
|
1328
|
+
perceptionText += `
|
|
1245
1329
|
Facing: ${Object.entries(facingCounts).map(([facing, count]) => `${count} facing ${facing}`).join(", ")}`;
|
|
1330
|
+
}
|
|
1246
1331
|
}
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
perceptionText += `
|
|
1332
|
+
if (sceneDescription.objects.length > 0) {
|
|
1333
|
+
const objectTypes = sceneDescription.objects.slice(0, MAX_VISION_OBJECTS_IN_STATE).map((o) => o.type);
|
|
1334
|
+
const uniqueObjects = [...new Set(objectTypes)];
|
|
1335
|
+
perceptionText += `
|
|
1252
1336
|
|
|
1253
1337
|
Objects detected: ${uniqueObjects.join(", ")}`;
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1338
|
+
}
|
|
1339
|
+
if (sceneDescription.sceneChanged) {
|
|
1340
|
+
perceptionText += `
|
|
1257
1341
|
|
|
1258
1342
|
Scene change: ${sceneDescription.changePercentage.toFixed(1)}% of pixels changed`;
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1343
|
+
}
|
|
1344
|
+
if (entityData) {
|
|
1345
|
+
if (entityData.activeEntities.length > 0) {
|
|
1346
|
+
perceptionText += `
|
|
1263
1347
|
|
|
1264
1348
|
Currently tracking:`;
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1349
|
+
for (const entity of entityData.activeEntities) {
|
|
1350
|
+
const name = entity.name || `Unknown ${entity.type}`;
|
|
1351
|
+
const duration = entity.duration < 60000 ? `${Math.round(entity.duration / 1000)}s` : `${Math.round(entity.duration / 60000)}m`;
|
|
1352
|
+
perceptionText += `
|
|
1269
1353
|
- ${name} (present for ${duration})`;
|
|
1354
|
+
}
|
|
1270
1355
|
}
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
perceptionText += `
|
|
1356
|
+
if (entityData.recentlyLeft.length > 0) {
|
|
1357
|
+
perceptionText += `
|
|
1274
1358
|
|
|
1275
1359
|
Recently left:`;
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1360
|
+
for (const departed of entityData.recentlyLeft) {
|
|
1361
|
+
const name = departed.name || "Unknown person";
|
|
1362
|
+
const timeStr = departed.timeAgo < 60000 ? `${Math.round(departed.timeAgo / 1000)}s ago` : `${Math.round(departed.timeAgo / 60000)}m ago`;
|
|
1363
|
+
perceptionText += `
|
|
1280
1364
|
- ${name} left ${timeStr}`;
|
|
1365
|
+
}
|
|
1281
1366
|
}
|
|
1282
1367
|
}
|
|
1283
1368
|
}
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
perceptionText += `
|
|
1369
|
+
if ((visionMode === "SCREEN" || visionMode === "BOTH") && screenCapture) {
|
|
1370
|
+
const screenAge = (Date.now() - screenCapture.timestamp) / 1000;
|
|
1371
|
+
const screenSecondsAgo = Math.round(screenAge);
|
|
1372
|
+
if (visionMode === "BOTH") {
|
|
1373
|
+
perceptionText += `
|
|
1290
1374
|
|
|
1291
1375
|
---
|
|
1292
1376
|
|
|
1293
1377
|
`;
|
|
1294
|
-
|
|
1295
|
-
|
|
1378
|
+
}
|
|
1379
|
+
perceptionText += `Screen capture (${screenSecondsAgo}s ago):
|
|
1296
1380
|
`;
|
|
1297
|
-
|
|
1381
|
+
perceptionText += `Resolution: ${screenCapture.width}x${screenCapture.height}
|
|
1298
1382
|
`;
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1383
|
+
const enhanced = sceneDescription;
|
|
1384
|
+
if (enhanced?.screenAnalysis) {
|
|
1385
|
+
const tileAnalysis = enhanced.screenAnalysis.activeTile;
|
|
1386
|
+
if (tileAnalysis) {
|
|
1387
|
+
if (tileAnalysis.summary) {
|
|
1388
|
+
perceptionText += `
|
|
1305
1389
|
Active area: ${tileAnalysis.summary}`;
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1390
|
+
}
|
|
1391
|
+
if (tileAnalysis.text) {
|
|
1392
|
+
perceptionText += `
|
|
1309
1393
|
|
|
1310
1394
|
Visible text:
|
|
1311
1395
|
"${tileAnalysis.text.substring(0, 200)}${tileAnalysis.text.length > 200 ? "..." : ""}"`;
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1396
|
+
}
|
|
1397
|
+
if (tileAnalysis.objects && tileAnalysis.objects.length > 0) {
|
|
1398
|
+
const uiElements = tileAnalysis.objects.map((o) => o.type || "unknown");
|
|
1399
|
+
const uniqueElements = [...new Set(uiElements)];
|
|
1400
|
+
perceptionText += `
|
|
1317
1401
|
|
|
1318
1402
|
UI elements: ${uniqueElements.join(", ")}`;
|
|
1403
|
+
}
|
|
1319
1404
|
}
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
perceptionText += `
|
|
1405
|
+
if (enhanced.screenAnalysis.focusedApp) {
|
|
1406
|
+
perceptionText += `
|
|
1323
1407
|
|
|
1324
1408
|
Active application: ${enhanced.screenAnalysis.focusedApp}`;
|
|
1409
|
+
}
|
|
1325
1410
|
}
|
|
1326
1411
|
}
|
|
1412
|
+
values = {
|
|
1413
|
+
visionAvailable: true,
|
|
1414
|
+
visionMode,
|
|
1415
|
+
sceneDescription: sceneDescription?.description || "Processing...",
|
|
1416
|
+
cameraStatus: cameraInfo ? `Connected to ${cameraInfo.name}` : "No camera",
|
|
1417
|
+
cameraId: cameraInfo?.id,
|
|
1418
|
+
peopleCount: sceneDescription?.people.length || 0,
|
|
1419
|
+
objectCount: sceneDescription?.objects.length || 0,
|
|
1420
|
+
sceneAge: sceneDescription ? Math.round((Date.now() - sceneDescription.timestamp) / 1000) : null,
|
|
1421
|
+
lastChange: sceneDescription?.sceneChanged ? sceneDescription.changePercentage : 0,
|
|
1422
|
+
hasScreenCapture: !!screenCapture,
|
|
1423
|
+
screenResolution: screenCapture ? `${screenCapture.width}x${screenCapture.height}` : null,
|
|
1424
|
+
activeEntities: entityData?.activeEntities || [],
|
|
1425
|
+
recentlyLeft: entityData?.recentlyLeft || [],
|
|
1426
|
+
entityStatistics: entityData?.statistics || null
|
|
1427
|
+
};
|
|
1428
|
+
data = {
|
|
1429
|
+
objects: sceneDescription?.objects.slice(0, MAX_VISION_OBJECTS_IN_STATE) || [],
|
|
1430
|
+
people: sceneDescription?.people.slice(0, MAX_VISION_PEOPLE_IN_STATE) || [],
|
|
1431
|
+
screenCapture: screenCapture || null,
|
|
1432
|
+
enhancedData: sceneDescription?.screenAnalysis || null,
|
|
1433
|
+
trackedEntities: entityData?.activeEntities || [],
|
|
1434
|
+
worldState: entityData || null
|
|
1435
|
+
};
|
|
1327
1436
|
}
|
|
1328
|
-
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
lastChange: sceneDescription?.sceneChanged ? sceneDescription.changePercentage : 0,
|
|
1338
|
-
hasScreenCapture: !!screenCapture,
|
|
1339
|
-
screenResolution: screenCapture ? `${screenCapture.width}x${screenCapture.height}` : null,
|
|
1340
|
-
activeEntities: entityData?.activeEntities || [],
|
|
1341
|
-
recentlyLeft: entityData?.recentlyLeft || [],
|
|
1342
|
-
entityStatistics: entityData?.statistics || null
|
|
1437
|
+
return {
|
|
1438
|
+
values,
|
|
1439
|
+
text: JSON.stringify({
|
|
1440
|
+
visual_perception: {
|
|
1441
|
+
summary: perceptionText,
|
|
1442
|
+
...values
|
|
1443
|
+
}
|
|
1444
|
+
}, null, 2),
|
|
1445
|
+
data
|
|
1343
1446
|
};
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1447
|
+
} catch (error) {
|
|
1448
|
+
return {
|
|
1449
|
+
values: {
|
|
1450
|
+
visionAvailable: false,
|
|
1451
|
+
error: error instanceof Error ? error.message : String(error)
|
|
1452
|
+
},
|
|
1453
|
+
text: JSON.stringify({
|
|
1454
|
+
visual_perception: {
|
|
1455
|
+
visionAvailable: false,
|
|
1456
|
+
error: error instanceof Error ? error.message : String(error)
|
|
1457
|
+
}
|
|
1458
|
+
}, null, 2),
|
|
1459
|
+
data: {
|
|
1460
|
+
hasVision: false,
|
|
1461
|
+
error: error instanceof Error ? error.message : String(error)
|
|
1462
|
+
}
|
|
1351
1463
|
};
|
|
1352
1464
|
}
|
|
1353
|
-
return {
|
|
1354
|
-
values,
|
|
1355
|
-
text: addHeader("# Visual Perception", perceptionText),
|
|
1356
|
-
data
|
|
1357
|
-
};
|
|
1358
1465
|
}
|
|
1359
1466
|
};
|
|
1360
1467
|
|
|
@@ -1364,9 +1471,11 @@ import * as fs3 from "node:fs/promises";
|
|
|
1364
1471
|
import * as path5 from "node:path";
|
|
1365
1472
|
import { promisify as promisify3 } from "node:util";
|
|
1366
1473
|
import {
|
|
1367
|
-
logger as
|
|
1474
|
+
logger as logger15,
|
|
1368
1475
|
ModelType as ModelType3,
|
|
1369
|
-
|
|
1476
|
+
recordLlmCall,
|
|
1477
|
+
Service,
|
|
1478
|
+
withStandaloneTrajectory as withStandaloneTrajectory3
|
|
1370
1479
|
} from "@elizaos/core";
|
|
1371
1480
|
import sharp4 from "sharp";
|
|
1372
1481
|
|
|
@@ -1375,7 +1484,11 @@ import { exec } from "node:child_process";
|
|
|
1375
1484
|
import * as fs from "node:fs/promises";
|
|
1376
1485
|
import * as path from "node:path";
|
|
1377
1486
|
import { promisify } from "node:util";
|
|
1378
|
-
import {
|
|
1487
|
+
import {
|
|
1488
|
+
logger as logger3,
|
|
1489
|
+
ModelType,
|
|
1490
|
+
withStandaloneTrajectory
|
|
1491
|
+
} from "@elizaos/core";
|
|
1379
1492
|
var execAsync = promisify(exec);
|
|
1380
1493
|
|
|
1381
1494
|
class AudioCaptureService {
|
|
@@ -1426,7 +1539,11 @@ class AudioCaptureService {
|
|
|
1426
1539
|
await execAsync("where ffmpeg");
|
|
1427
1540
|
return { available: true, tool: "ffmpeg" };
|
|
1428
1541
|
}
|
|
1429
|
-
return {
|
|
1542
|
+
return {
|
|
1543
|
+
available: false,
|
|
1544
|
+
tool: "none",
|
|
1545
|
+
message: "Unsupported platform"
|
|
1546
|
+
};
|
|
1430
1547
|
} catch (_error) {
|
|
1431
1548
|
const toolName = platform === "darwin" ? "sox" : platform === "linux" ? "arecord" : "ffmpeg";
|
|
1432
1549
|
const installCmd = platform === "darwin" ? "brew install sox" : platform === "linux" ? "sudo apt-get install alsa-utils" : "Download ffmpeg from ffmpeg.org";
|
|
@@ -1460,7 +1577,13 @@ class AudioCaptureService {
|
|
|
1460
1577
|
await this.recordAudio(audioFile, this.config.transcriptionInterval / 1000);
|
|
1461
1578
|
logger3.debug("[AudioCapture] Recording complete, transcribing...");
|
|
1462
1579
|
const audioBuffer = await fs.readFile(audioFile);
|
|
1463
|
-
const transcription = await this.runtime
|
|
1580
|
+
const transcription = await withStandaloneTrajectory(this.runtime, {
|
|
1581
|
+
source: "plugin-vision:audio-transcription",
|
|
1582
|
+
metadata: {
|
|
1583
|
+
modelType: ModelType.TRANSCRIPTION,
|
|
1584
|
+
audioBytes: audioBuffer.byteLength
|
|
1585
|
+
}
|
|
1586
|
+
}, () => this.runtime.useModel(ModelType.TRANSCRIPTION, audioBuffer));
|
|
1464
1587
|
await fs.unlink(audioFile).catch(() => {});
|
|
1465
1588
|
if (transcription && typeof transcription === "string" && transcription.trim()) {
|
|
1466
1589
|
logger3.info(`[AudioCapture] Transcribed: "${transcription}"`);
|
|
@@ -1583,7 +1706,11 @@ class AudioCaptureService {
|
|
|
1583
1706
|
// src/audio-capture-stream.ts
|
|
1584
1707
|
import { spawn } from "node:child_process";
|
|
1585
1708
|
import { EventEmitter } from "node:events";
|
|
1586
|
-
import {
|
|
1709
|
+
import {
|
|
1710
|
+
logger as logger4,
|
|
1711
|
+
ModelType as ModelType2,
|
|
1712
|
+
withStandaloneTrajectory as withStandaloneTrajectory2
|
|
1713
|
+
} from "@elizaos/core";
|
|
1587
1714
|
|
|
1588
1715
|
class StreamingAudioCaptureService extends EventEmitter {
|
|
1589
1716
|
runtime;
|
|
@@ -1814,7 +1941,13 @@ class StreamingAudioCaptureService extends EventEmitter {
|
|
|
1814
1941
|
async transcribeAudio(audioData) {
|
|
1815
1942
|
try {
|
|
1816
1943
|
const wavBuffer = this.rawToWav(audioData);
|
|
1817
|
-
const result = await this.runtime
|
|
1944
|
+
const result = await withStandaloneTrajectory2(this.runtime, {
|
|
1945
|
+
source: "plugin-vision:streaming-audio-transcription",
|
|
1946
|
+
metadata: {
|
|
1947
|
+
modelType: ModelType2.TRANSCRIPTION,
|
|
1948
|
+
audioBytes: wavBuffer.byteLength
|
|
1949
|
+
}
|
|
1950
|
+
}, () => this.runtime.useModel(ModelType2.TRANSCRIPTION, wavBuffer));
|
|
1818
1951
|
return result;
|
|
1819
1952
|
} catch (error) {
|
|
1820
1953
|
logger4.error("[StreamingAudio] Transcription failed:", error);
|
|
@@ -1904,7 +2037,10 @@ class StreamingAudioCaptureService extends EventEmitter {
|
|
|
1904
2037
|
}
|
|
1905
2038
|
|
|
1906
2039
|
// src/entity-tracker.ts
|
|
1907
|
-
import {
|
|
2040
|
+
import {
|
|
2041
|
+
createUniqueUuid as createUniqueUuid2,
|
|
2042
|
+
logger as logger5
|
|
2043
|
+
} from "@elizaos/core";
|
|
1908
2044
|
|
|
1909
2045
|
class EntityTracker {
|
|
1910
2046
|
worldState;
|
|
@@ -3112,9 +3248,27 @@ class OCRService {
|
|
|
3112
3248
|
logger10.debug("[OCR] Using fallback OCR implementation");
|
|
3113
3249
|
const blocks = [];
|
|
3114
3250
|
const mockTexts = [
|
|
3115
|
-
{
|
|
3116
|
-
|
|
3117
|
-
|
|
3251
|
+
{
|
|
3252
|
+
text: "File Edit View Window Help",
|
|
3253
|
+
x: 10,
|
|
3254
|
+
y: 5,
|
|
3255
|
+
width: 300,
|
|
3256
|
+
height: 20
|
|
3257
|
+
},
|
|
3258
|
+
{
|
|
3259
|
+
text: "Welcome to the application",
|
|
3260
|
+
x: 100,
|
|
3261
|
+
y: 100,
|
|
3262
|
+
width: 400,
|
|
3263
|
+
height: 40
|
|
3264
|
+
},
|
|
3265
|
+
{
|
|
3266
|
+
text: "Click here to continue",
|
|
3267
|
+
x: 200,
|
|
3268
|
+
y: 300,
|
|
3269
|
+
width: 200,
|
|
3270
|
+
height: 30
|
|
3271
|
+
}
|
|
3118
3272
|
];
|
|
3119
3273
|
for (const mock of mockTexts) {
|
|
3120
3274
|
blocks.push({
|
|
@@ -3335,8 +3489,46 @@ class ScreenCaptureService {
|
|
|
3335
3489
|
}
|
|
3336
3490
|
}
|
|
3337
3491
|
|
|
3338
|
-
// src/
|
|
3492
|
+
// src/test-input.ts
|
|
3493
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
3494
|
+
import { resolve } from "node:path";
|
|
3339
3495
|
import { logger as logger12 } from "@elizaos/core";
|
|
3496
|
+
function getTestInputMode() {
|
|
3497
|
+
const raw = (process.env.ELIZA_VISION_TEST_INPUT ?? "").trim().toLowerCase();
|
|
3498
|
+
if (raw === "image")
|
|
3499
|
+
return "image";
|
|
3500
|
+
if (raw === "camera")
|
|
3501
|
+
return "camera";
|
|
3502
|
+
if (raw === "screen")
|
|
3503
|
+
return "screen";
|
|
3504
|
+
return "unset";
|
|
3505
|
+
}
|
|
3506
|
+
var FIXTURE_DEFAULT_REL = "test/fixtures/sample-scene.png";
|
|
3507
|
+
function resolveFixturePath() {
|
|
3508
|
+
const fromEnv = process.env.ELIZA_VISION_TEST_FIXTURE;
|
|
3509
|
+
if (fromEnv)
|
|
3510
|
+
return resolve(fromEnv);
|
|
3511
|
+
return resolve(process.cwd(), FIXTURE_DEFAULT_REL);
|
|
3512
|
+
}
|
|
3513
|
+
var cached = null;
|
|
3514
|
+
var cachedKey = null;
|
|
3515
|
+
function getTestImage() {
|
|
3516
|
+
if (getTestInputMode() !== "image")
|
|
3517
|
+
return null;
|
|
3518
|
+
const path4 = resolveFixturePath();
|
|
3519
|
+
if (cachedKey === path4 && cached)
|
|
3520
|
+
return cached;
|
|
3521
|
+
if (!existsSync(path4)) {
|
|
3522
|
+
logger12.warn(`[plugin-vision] ELIZA_VISION_TEST_INPUT=image set but fixture not found at ${path4}.`);
|
|
3523
|
+
return null;
|
|
3524
|
+
}
|
|
3525
|
+
cached = readFileSync(path4);
|
|
3526
|
+
cachedKey = path4;
|
|
3527
|
+
return cached;
|
|
3528
|
+
}
|
|
3529
|
+
|
|
3530
|
+
// src/vision-models.ts
|
|
3531
|
+
import { logger as logger13 } from "@elizaos/core";
|
|
3340
3532
|
var tf2 = null;
|
|
3341
3533
|
var cocoSsd = null;
|
|
3342
3534
|
var poseDetection = null;
|
|
@@ -3349,7 +3541,7 @@ async function loadTfModules() {
|
|
|
3349
3541
|
poseDetection = await import("@tensorflow-models/pose-detection");
|
|
3350
3542
|
return true;
|
|
3351
3543
|
} catch (err) {
|
|
3352
|
-
|
|
3544
|
+
logger13.warn("[VisionModels] TensorFlow.js native addon not available — " + "falling back to description-based detection. Run `npm rebuild @tensorflow/tfjs-node --build-addon-from-source` to enable hardware-accelerated vision.");
|
|
3353
3545
|
return false;
|
|
3354
3546
|
}
|
|
3355
3547
|
}
|
|
@@ -3363,30 +3555,30 @@ class VisionModels {
|
|
|
3363
3555
|
if (this.initialized) {
|
|
3364
3556
|
return;
|
|
3365
3557
|
}
|
|
3366
|
-
|
|
3558
|
+
logger13.info("[VisionModels] Initializing vision models...");
|
|
3367
3559
|
this.tfAvailable = await loadTfModules();
|
|
3368
3560
|
if (!this.tfAvailable || !tf2 || !cocoSsd || !poseDetection) {
|
|
3369
3561
|
this.initialized = true;
|
|
3370
|
-
|
|
3562
|
+
logger13.info("[VisionModels] Initialized without TensorFlow (fallback mode)");
|
|
3371
3563
|
return;
|
|
3372
3564
|
}
|
|
3373
3565
|
try {
|
|
3374
3566
|
await tf2.ready();
|
|
3375
|
-
|
|
3567
|
+
logger13.info("[VisionModels] TensorFlow.js backend ready");
|
|
3376
3568
|
if (config.enableObjectDetection) {
|
|
3377
3569
|
try {
|
|
3378
|
-
|
|
3570
|
+
logger13.info("[VisionModels] Loading COCO-SSD model...");
|
|
3379
3571
|
this.objectDetectionModel = await cocoSsd.load({
|
|
3380
3572
|
base: "mobilenet_v2"
|
|
3381
3573
|
});
|
|
3382
|
-
|
|
3574
|
+
logger13.info("[VisionModels] COCO-SSD model loaded");
|
|
3383
3575
|
} catch (error) {
|
|
3384
|
-
|
|
3576
|
+
logger13.error("[VisionModels] Failed to load COCO-SSD model:", error);
|
|
3385
3577
|
}
|
|
3386
3578
|
}
|
|
3387
3579
|
if (config.enablePoseDetection) {
|
|
3388
3580
|
try {
|
|
3389
|
-
|
|
3581
|
+
logger13.info("[VisionModels] Loading PoseNet model...");
|
|
3390
3582
|
const detectorConfig = {
|
|
3391
3583
|
architecture: "MobileNetV1",
|
|
3392
3584
|
outputStride: 16,
|
|
@@ -3394,15 +3586,15 @@ class VisionModels {
|
|
|
3394
3586
|
multiplier: 0.75
|
|
3395
3587
|
};
|
|
3396
3588
|
this.poseDetector = await poseDetection.createDetector(poseDetection.SupportedModels.PoseNet, detectorConfig);
|
|
3397
|
-
|
|
3589
|
+
logger13.info("[VisionModels] PoseNet model loaded");
|
|
3398
3590
|
} catch (error) {
|
|
3399
|
-
|
|
3591
|
+
logger13.error("[VisionModels] Failed to load PoseNet model:", error);
|
|
3400
3592
|
}
|
|
3401
3593
|
}
|
|
3402
3594
|
this.initialized = true;
|
|
3403
|
-
|
|
3595
|
+
logger13.info("[VisionModels] Vision models initialized");
|
|
3404
3596
|
} catch (error) {
|
|
3405
|
-
|
|
3597
|
+
logger13.error("[VisionModels] Initialization failed:", error);
|
|
3406
3598
|
throw error;
|
|
3407
3599
|
}
|
|
3408
3600
|
}
|
|
@@ -3414,7 +3606,7 @@ class VisionModels {
|
|
|
3414
3606
|
}
|
|
3415
3607
|
async detectObjects(imageData, _width, _height, description) {
|
|
3416
3608
|
if (!this.objectDetectionModel || !tf2) {
|
|
3417
|
-
|
|
3609
|
+
logger13.warn("[VisionModels] Object detection model not loaded");
|
|
3418
3610
|
return this.enhancedObjectDetection(description);
|
|
3419
3611
|
}
|
|
3420
3612
|
try {
|
|
@@ -3436,10 +3628,10 @@ class VisionModels {
|
|
|
3436
3628
|
height: pred.bbox[3]
|
|
3437
3629
|
}
|
|
3438
3630
|
}));
|
|
3439
|
-
|
|
3631
|
+
logger13.debug(`[VisionModels] Detected ${objects.length} objects`);
|
|
3440
3632
|
return objects;
|
|
3441
3633
|
} catch (error) {
|
|
3442
|
-
|
|
3634
|
+
logger13.error("[VisionModels] Object detection failed:", error);
|
|
3443
3635
|
return this.enhancedObjectDetection(description);
|
|
3444
3636
|
}
|
|
3445
3637
|
}
|
|
@@ -3449,8 +3641,14 @@ class VisionModels {
|
|
|
3449
3641
|
}
|
|
3450
3642
|
const objects = [];
|
|
3451
3643
|
const objectPatterns = [
|
|
3452
|
-
{
|
|
3453
|
-
|
|
3644
|
+
{
|
|
3645
|
+
pattern: /(\d+)?\s*(person|people|man|men|woman|women|child|children)/gi,
|
|
3646
|
+
type: "person"
|
|
3647
|
+
},
|
|
3648
|
+
{
|
|
3649
|
+
pattern: /(\d+)?\s*(laptop|computer|monitor|screen|display)/gi,
|
|
3650
|
+
type: "laptop"
|
|
3651
|
+
},
|
|
3454
3652
|
{ pattern: /(\d+)?\s*(phone|smartphone|mobile)/gi, type: "cell phone" },
|
|
3455
3653
|
{ pattern: /(\d+)?\s*(book|notebook|journal)/gi, type: "book" },
|
|
3456
3654
|
{ pattern: /(\d+)?\s*(cup|mug|glass|bottle)/gi, type: "cup" },
|
|
@@ -3500,7 +3698,7 @@ class VisionModels {
|
|
|
3500
3698
|
}
|
|
3501
3699
|
async detectPoses(imageData, width, height, description) {
|
|
3502
3700
|
if (!this.poseDetector || !tf2) {
|
|
3503
|
-
|
|
3701
|
+
logger13.warn("[VisionModels] Pose detection model not loaded");
|
|
3504
3702
|
return this.enhancedPoseDetection(description);
|
|
3505
3703
|
}
|
|
3506
3704
|
try {
|
|
@@ -3513,7 +3711,7 @@ class VisionModels {
|
|
|
3513
3711
|
imageTensor.dispose();
|
|
3514
3712
|
return this.convertPosesToPersonInfo(poses);
|
|
3515
3713
|
} catch (error) {
|
|
3516
|
-
|
|
3714
|
+
logger13.error("[VisionModels] Pose detection failed:", error);
|
|
3517
3715
|
return this.enhancedPoseDetection(description);
|
|
3518
3716
|
}
|
|
3519
3717
|
}
|
|
@@ -3535,7 +3733,12 @@ class VisionModels {
|
|
|
3535
3733
|
lying: ["lying", "laying", "reclined"]
|
|
3536
3734
|
};
|
|
3537
3735
|
const facingKeywords = {
|
|
3538
|
-
camera: [
|
|
3736
|
+
camera: [
|
|
3737
|
+
"facing camera",
|
|
3738
|
+
"looking at camera",
|
|
3739
|
+
"facing forward",
|
|
3740
|
+
"front view"
|
|
3741
|
+
],
|
|
3539
3742
|
away: ["back to camera", "facing away", "back view"],
|
|
3540
3743
|
left: ["facing left", "profile left", "left side"],
|
|
3541
3744
|
right: ["facing right", "profile right", "right side"]
|
|
@@ -3697,7 +3900,7 @@ class VisionModels {
|
|
|
3697
3900
|
this.poseDetector = null;
|
|
3698
3901
|
}
|
|
3699
3902
|
this.initialized = false;
|
|
3700
|
-
|
|
3903
|
+
logger13.info("[VisionModels] Models disposed");
|
|
3701
3904
|
}
|
|
3702
3905
|
}
|
|
3703
3906
|
|
|
@@ -3705,8 +3908,8 @@ class VisionModels {
|
|
|
3705
3908
|
import * as path4 from "node:path";
|
|
3706
3909
|
import { TextDecoder } from "node:util";
|
|
3707
3910
|
import { Worker } from "node:worker_threads";
|
|
3708
|
-
import { logger as
|
|
3709
|
-
var __dirname = "/Users/shawwalters/eliza-workspace/milady/plugins/plugin-vision/
|
|
3911
|
+
import { logger as logger14 } from "@elizaos/core";
|
|
3912
|
+
var __dirname = "/Users/shawwalters/eliza-workspace/milady/eliza/plugins/plugin-vision/src";
|
|
3710
3913
|
|
|
3711
3914
|
class VisionWorkerManager {
|
|
3712
3915
|
config;
|
|
@@ -3744,7 +3947,7 @@ class VisionWorkerManager {
|
|
|
3744
3947
|
this.ocrResultsView = new DataView(this.ocrResultsBuffer);
|
|
3745
3948
|
}
|
|
3746
3949
|
async initialize() {
|
|
3747
|
-
|
|
3950
|
+
logger14.info("[VisionWorkerManager] Initializing worker threads...");
|
|
3748
3951
|
try {
|
|
3749
3952
|
await this.startScreenCaptureWorker();
|
|
3750
3953
|
if (this.config.florence2Enabled) {
|
|
@@ -3753,9 +3956,9 @@ class VisionWorkerManager {
|
|
|
3753
3956
|
if (this.config.ocrEnabled) {
|
|
3754
3957
|
await this.startOCRWorker();
|
|
3755
3958
|
}
|
|
3756
|
-
|
|
3959
|
+
logger14.info("[VisionWorkerManager] All workers initialized");
|
|
3757
3960
|
} catch (error) {
|
|
3758
|
-
|
|
3961
|
+
logger14.error("[VisionWorkerManager] Failed to initialize workers:", error);
|
|
3759
3962
|
throw error;
|
|
3760
3963
|
}
|
|
3761
3964
|
}
|
|
@@ -3780,18 +3983,18 @@ class VisionWorkerManager {
|
|
|
3780
3983
|
lastUpdate: Date.now()
|
|
3781
3984
|
});
|
|
3782
3985
|
} else if (msg.type === "error") {
|
|
3783
|
-
|
|
3986
|
+
logger14.error("[ScreenCaptureWorker] Error:", msg.error);
|
|
3784
3987
|
} else if (msg.type === "log") {
|
|
3785
3988
|
this.handleWorkerLog("ScreenCaptureWorker", msg);
|
|
3786
3989
|
}
|
|
3787
3990
|
});
|
|
3788
3991
|
this.screenCaptureWorker.on("error", (error) => {
|
|
3789
|
-
|
|
3992
|
+
logger14.error("[ScreenCaptureWorker] Worker error:", error instanceof Error ? error.message : String(error));
|
|
3790
3993
|
setTimeout(() => this.restartScreenCaptureWorker(), 1000);
|
|
3791
3994
|
});
|
|
3792
3995
|
this.screenCaptureWorker.on("exit", (code) => {
|
|
3793
3996
|
if (code !== 0) {
|
|
3794
|
-
|
|
3997
|
+
logger14.error(`[ScreenCaptureWorker] Worker stopped with exit code ${code}`);
|
|
3795
3998
|
setTimeout(() => this.restartScreenCaptureWorker(), 1000);
|
|
3796
3999
|
}
|
|
3797
4000
|
});
|
|
@@ -3830,18 +4033,18 @@ class VisionWorkerManager {
|
|
|
3830
4033
|
} else if (msg.type === "tile_analyzed") {
|
|
3831
4034
|
this.updateFlorence2Cache(msg);
|
|
3832
4035
|
} else if (msg.type === "error") {
|
|
3833
|
-
|
|
4036
|
+
logger14.error("[Florence2Worker] Error:", msg.error);
|
|
3834
4037
|
} else if (msg.type === "log") {
|
|
3835
4038
|
this.handleWorkerLog("Florence2Worker", msg);
|
|
3836
4039
|
}
|
|
3837
4040
|
});
|
|
3838
4041
|
this.florence2Worker.on("error", (error) => {
|
|
3839
|
-
|
|
4042
|
+
logger14.error("[Florence2Worker] Worker error:", error instanceof Error ? error.message : String(error));
|
|
3840
4043
|
setTimeout(() => this.restartFlorence2Worker(), 1000);
|
|
3841
4044
|
});
|
|
3842
4045
|
this.florence2Worker.on("exit", (code) => {
|
|
3843
4046
|
if (code !== 0) {
|
|
3844
|
-
|
|
4047
|
+
logger14.error(`[Florence2Worker] Worker stopped with exit code ${code}`);
|
|
3845
4048
|
setTimeout(() => this.restartFlorence2Worker(), 1000);
|
|
3846
4049
|
}
|
|
3847
4050
|
});
|
|
@@ -3869,18 +4072,18 @@ class VisionWorkerManager {
|
|
|
3869
4072
|
} else if (msg.type === "ocr_complete") {
|
|
3870
4073
|
this.updateOCRCache(msg);
|
|
3871
4074
|
} else if (msg.type === "error") {
|
|
3872
|
-
|
|
4075
|
+
logger14.error("[OCRWorker] Error:", msg.error);
|
|
3873
4076
|
} else if (msg.type === "log") {
|
|
3874
4077
|
this.handleWorkerLog("OCRWorker", msg);
|
|
3875
4078
|
}
|
|
3876
4079
|
});
|
|
3877
4080
|
this.ocrWorker.on("error", (error) => {
|
|
3878
|
-
|
|
4081
|
+
logger14.error("[OCRWorker] Worker error:", error instanceof Error ? error.message : String(error));
|
|
3879
4082
|
setTimeout(() => this.restartOCRWorker(), 1000);
|
|
3880
4083
|
});
|
|
3881
4084
|
this.ocrWorker.on("exit", (code) => {
|
|
3882
4085
|
if (code !== 0) {
|
|
3883
|
-
|
|
4086
|
+
logger14.error(`[OCRWorker] Worker stopped with exit code ${code}`);
|
|
3884
4087
|
setTimeout(() => this.restartOCRWorker(), 1000);
|
|
3885
4088
|
}
|
|
3886
4089
|
});
|
|
@@ -3893,7 +4096,7 @@ class VisionWorkerManager {
|
|
|
3893
4096
|
this.latestFlorence2Results.set(tileId, result);
|
|
3894
4097
|
}
|
|
3895
4098
|
} catch (error) {
|
|
3896
|
-
|
|
4099
|
+
logger14.error("[VisionWorkerManager] Failed to update Florence2 cache:", error);
|
|
3897
4100
|
}
|
|
3898
4101
|
}
|
|
3899
4102
|
updateOCRCache(_msg) {
|
|
@@ -3903,7 +4106,7 @@ class VisionWorkerManager {
|
|
|
3903
4106
|
this.latestOCRResult = result;
|
|
3904
4107
|
}
|
|
3905
4108
|
} catch (error) {
|
|
3906
|
-
|
|
4109
|
+
logger14.error("[VisionWorkerManager] Failed to update OCR cache:", error);
|
|
3907
4110
|
}
|
|
3908
4111
|
}
|
|
3909
4112
|
readFlorence2Result(tileId) {
|
|
@@ -3929,7 +4132,7 @@ class VisionWorkerManager {
|
|
|
3929
4132
|
const json = new TextDecoder().decode(bytes);
|
|
3930
4133
|
return JSON.parse(json);
|
|
3931
4134
|
} catch (error) {
|
|
3932
|
-
|
|
4135
|
+
logger14.error("[VisionWorkerManager] Failed to read Florence2 result:", error);
|
|
3933
4136
|
return null;
|
|
3934
4137
|
}
|
|
3935
4138
|
}
|
|
@@ -3951,7 +4154,7 @@ class VisionWorkerManager {
|
|
|
3951
4154
|
const json = new TextDecoder().decode(bytes);
|
|
3952
4155
|
return JSON.parse(json);
|
|
3953
4156
|
} catch (error) {
|
|
3954
|
-
|
|
4157
|
+
logger14.error("[VisionWorkerManager] Failed to read OCR result:", error);
|
|
3955
4158
|
return null;
|
|
3956
4159
|
}
|
|
3957
4160
|
}
|
|
@@ -3974,7 +4177,7 @@ class VisionWorkerManager {
|
|
|
3974
4177
|
};
|
|
3975
4178
|
this.lastProcessedFrameId = frameId;
|
|
3976
4179
|
} catch (error) {
|
|
3977
|
-
|
|
4180
|
+
logger14.error("[VisionWorkerManager] Failed to read screen capture:", error);
|
|
3978
4181
|
}
|
|
3979
4182
|
return this.latestScreenCapture;
|
|
3980
4183
|
}
|
|
@@ -4067,28 +4270,28 @@ class VisionWorkerManager {
|
|
|
4067
4270
|
}
|
|
4068
4271
|
}
|
|
4069
4272
|
async stop() {
|
|
4070
|
-
|
|
4273
|
+
logger14.info("[VisionWorkerManager] Stopping all workers...");
|
|
4071
4274
|
const stopPromises = [];
|
|
4072
4275
|
if (this.screenCaptureWorker) {
|
|
4073
|
-
stopPromises.push(new Promise((
|
|
4074
|
-
this.screenCaptureWorker?.once("exit", () =>
|
|
4276
|
+
stopPromises.push(new Promise((resolve2) => {
|
|
4277
|
+
this.screenCaptureWorker?.once("exit", () => resolve2());
|
|
4075
4278
|
this.screenCaptureWorker?.postMessage({ type: "stop" });
|
|
4076
4279
|
}));
|
|
4077
4280
|
}
|
|
4078
4281
|
if (this.florence2Worker) {
|
|
4079
|
-
stopPromises.push(new Promise((
|
|
4080
|
-
this.florence2Worker?.once("exit", () =>
|
|
4282
|
+
stopPromises.push(new Promise((resolve2) => {
|
|
4283
|
+
this.florence2Worker?.once("exit", () => resolve2());
|
|
4081
4284
|
this.florence2Worker?.postMessage({ type: "stop" });
|
|
4082
4285
|
}));
|
|
4083
4286
|
}
|
|
4084
4287
|
if (this.ocrWorker) {
|
|
4085
|
-
stopPromises.push(new Promise((
|
|
4086
|
-
this.ocrWorker?.once("exit", () =>
|
|
4288
|
+
stopPromises.push(new Promise((resolve2) => {
|
|
4289
|
+
this.ocrWorker?.once("exit", () => resolve2());
|
|
4087
4290
|
this.ocrWorker?.postMessage({ type: "stop" });
|
|
4088
4291
|
}));
|
|
4089
4292
|
}
|
|
4090
4293
|
await Promise.all(stopPromises);
|
|
4091
|
-
|
|
4294
|
+
logger14.info("[VisionWorkerManager] All workers stopped");
|
|
4092
4295
|
}
|
|
4093
4296
|
handleWorkerLog(workerName, msg) {
|
|
4094
4297
|
const { level, message, args } = msg;
|
|
@@ -4096,27 +4299,27 @@ class VisionWorkerManager {
|
|
|
4096
4299
|
const stringArgs = args.map((arg) => String(arg));
|
|
4097
4300
|
switch (level) {
|
|
4098
4301
|
case "info":
|
|
4099
|
-
|
|
4302
|
+
logger14.info(formattedMessage, ...stringArgs);
|
|
4100
4303
|
break;
|
|
4101
4304
|
case "warn":
|
|
4102
|
-
|
|
4305
|
+
logger14.warn(formattedMessage, ...stringArgs);
|
|
4103
4306
|
break;
|
|
4104
4307
|
case "error":
|
|
4105
|
-
|
|
4308
|
+
logger14.error(formattedMessage, ...stringArgs);
|
|
4106
4309
|
break;
|
|
4107
4310
|
case "debug":
|
|
4108
|
-
|
|
4311
|
+
logger14.debug(formattedMessage, ...stringArgs);
|
|
4109
4312
|
break;
|
|
4110
4313
|
}
|
|
4111
4314
|
}
|
|
4112
4315
|
async restartScreenCaptureWorker() {
|
|
4113
4316
|
const attempts = this.restartAttempts.get("screenCapture") || 0;
|
|
4114
4317
|
if (attempts >= this.MAX_RESTART_ATTEMPTS) {
|
|
4115
|
-
|
|
4318
|
+
logger14.error("[VisionWorkerManager] Max restart attempts reached for screen capture worker");
|
|
4116
4319
|
return;
|
|
4117
4320
|
}
|
|
4118
4321
|
this.restartAttempts.set("screenCapture", attempts + 1);
|
|
4119
|
-
|
|
4322
|
+
logger14.info(`[VisionWorkerManager] Restarting screen capture worker (attempt ${attempts + 1})`);
|
|
4120
4323
|
try {
|
|
4121
4324
|
if (this.screenCaptureWorker) {
|
|
4122
4325
|
this.screenCaptureWorker.removeAllListeners();
|
|
@@ -4125,17 +4328,17 @@ class VisionWorkerManager {
|
|
|
4125
4328
|
await this.startScreenCaptureWorker();
|
|
4126
4329
|
this.restartAttempts.set("screenCapture", 0);
|
|
4127
4330
|
} catch (error) {
|
|
4128
|
-
|
|
4331
|
+
logger14.error("[VisionWorkerManager] Failed to restart screen capture worker:", error);
|
|
4129
4332
|
}
|
|
4130
4333
|
}
|
|
4131
4334
|
async restartFlorence2Worker() {
|
|
4132
4335
|
const attempts = this.restartAttempts.get("florence2") || 0;
|
|
4133
4336
|
if (attempts >= this.MAX_RESTART_ATTEMPTS) {
|
|
4134
|
-
|
|
4337
|
+
logger14.error("[VisionWorkerManager] Max restart attempts reached for Florence2 worker");
|
|
4135
4338
|
return;
|
|
4136
4339
|
}
|
|
4137
4340
|
this.restartAttempts.set("florence2", attempts + 1);
|
|
4138
|
-
|
|
4341
|
+
logger14.info(`[VisionWorkerManager] Restarting Florence2 worker (attempt ${attempts + 1})`);
|
|
4139
4342
|
try {
|
|
4140
4343
|
if (this.florence2Worker) {
|
|
4141
4344
|
this.florence2Worker.removeAllListeners();
|
|
@@ -4144,17 +4347,17 @@ class VisionWorkerManager {
|
|
|
4144
4347
|
await this.startFlorence2Worker();
|
|
4145
4348
|
this.restartAttempts.set("florence2", 0);
|
|
4146
4349
|
} catch (error) {
|
|
4147
|
-
|
|
4350
|
+
logger14.error("[VisionWorkerManager] Failed to restart Florence2 worker:", error);
|
|
4148
4351
|
}
|
|
4149
4352
|
}
|
|
4150
4353
|
async restartOCRWorker() {
|
|
4151
4354
|
const attempts = this.restartAttempts.get("ocr") || 0;
|
|
4152
4355
|
if (attempts >= this.MAX_RESTART_ATTEMPTS) {
|
|
4153
|
-
|
|
4356
|
+
logger14.error("[VisionWorkerManager] Max restart attempts reached for OCR worker");
|
|
4154
4357
|
return;
|
|
4155
4358
|
}
|
|
4156
4359
|
this.restartAttempts.set("ocr", attempts + 1);
|
|
4157
|
-
|
|
4360
|
+
logger14.info(`[VisionWorkerManager] Restarting OCR worker (attempt ${attempts + 1})`);
|
|
4158
4361
|
try {
|
|
4159
4362
|
if (this.ocrWorker) {
|
|
4160
4363
|
this.ocrWorker.removeAllListeners();
|
|
@@ -4163,13 +4366,20 @@ class VisionWorkerManager {
|
|
|
4163
4366
|
await this.startOCRWorker();
|
|
4164
4367
|
this.restartAttempts.set("ocr", 0);
|
|
4165
4368
|
} catch (error) {
|
|
4166
|
-
|
|
4369
|
+
logger14.error("[VisionWorkerManager] Failed to restart OCR worker:", error);
|
|
4167
4370
|
}
|
|
4168
4371
|
}
|
|
4169
4372
|
}
|
|
4170
4373
|
|
|
4171
4374
|
// src/service.ts
|
|
4172
4375
|
var execAsync3 = promisify3(exec3);
|
|
4376
|
+
var SCENE_DESCRIPTION_PROMPT = JSON.stringify({
|
|
4377
|
+
task: "describe_visual_scene",
|
|
4378
|
+
instructions: [
|
|
4379
|
+
"Describe visible people, objects, UI, text, and notable scene changes.",
|
|
4380
|
+
"Keep the answer concise and factual."
|
|
4381
|
+
]
|
|
4382
|
+
}, null, 2);
|
|
4173
4383
|
|
|
4174
4384
|
class VisionService extends Service {
|
|
4175
4385
|
static serviceType = VisionServiceType.VISION;
|
|
@@ -4223,7 +4433,7 @@ class VisionService extends Service {
|
|
|
4223
4433
|
this.screenCapture = new ScreenCaptureService(this.visionConfig);
|
|
4224
4434
|
this.florence2 = new Florence2Model;
|
|
4225
4435
|
this.ocrService = new OCRService;
|
|
4226
|
-
|
|
4436
|
+
logger15.info("[VisionService] Constructed with config:", JSON.stringify(this.visionConfig));
|
|
4227
4437
|
}
|
|
4228
4438
|
parseConfig(runtime) {
|
|
4229
4439
|
const getSettingString = (key) => {
|
|
@@ -4278,14 +4488,14 @@ class VisionService extends Service {
|
|
|
4278
4488
|
enableObjectDetection: this.visionConfig.enableObjectDetection || false,
|
|
4279
4489
|
enablePoseDetection: this.visionConfig.enablePoseDetection || false
|
|
4280
4490
|
});
|
|
4281
|
-
|
|
4491
|
+
logger15.info("[VisionService] Using TensorFlow.js models for advanced detection");
|
|
4282
4492
|
} catch (_tfError) {
|
|
4283
|
-
|
|
4493
|
+
logger15.warn("[VisionService] TensorFlow.js not available, falling back to enhanced heuristics");
|
|
4284
4494
|
await this.visionModels.initialize({
|
|
4285
4495
|
enableObjectDetection: this.visionConfig.enableObjectDetection || false,
|
|
4286
4496
|
enablePoseDetection: this.visionConfig.enablePoseDetection || false
|
|
4287
4497
|
});
|
|
4288
|
-
|
|
4498
|
+
logger15.info("[VisionService] Using enhanced heuristics for detection");
|
|
4289
4499
|
}
|
|
4290
4500
|
}
|
|
4291
4501
|
if (this.visionConfig.visionMode === "SCREEN" /* SCREEN */ || this.visionConfig.visionMode === "BOTH" /* BOTH */) {
|
|
@@ -4297,18 +4507,18 @@ class VisionService extends Service {
|
|
|
4297
4507
|
await this.initializeAudioCapture();
|
|
4298
4508
|
this.startProcessing();
|
|
4299
4509
|
} catch (error) {
|
|
4300
|
-
|
|
4510
|
+
logger15.error("[VisionService] Failed to initialize:", error);
|
|
4301
4511
|
}
|
|
4302
4512
|
}
|
|
4303
4513
|
async initializeScreenVision() {
|
|
4304
4514
|
try {
|
|
4305
|
-
|
|
4515
|
+
logger15.info("[VisionService] Initializing screen vision...");
|
|
4306
4516
|
const useWorkers = this.visionConfig.targetScreenFPS && this.visionConfig.targetScreenFPS > 10;
|
|
4307
4517
|
if (useWorkers) {
|
|
4308
|
-
|
|
4518
|
+
logger15.info("[VisionService] Initializing worker threads for high-FPS processing...");
|
|
4309
4519
|
this.workerManager = new VisionWorkerManager(this.visionConfig);
|
|
4310
4520
|
await this.workerManager.initialize();
|
|
4311
|
-
|
|
4521
|
+
logger15.info("[VisionService] Worker threads initialized");
|
|
4312
4522
|
} else {
|
|
4313
4523
|
if (this.visionConfig.florence2Enabled) {
|
|
4314
4524
|
await this.florence2.initialize();
|
|
@@ -4319,11 +4529,11 @@ class VisionService extends Service {
|
|
|
4319
4529
|
}
|
|
4320
4530
|
const screenInfo = await this.screenCapture.getScreenInfo();
|
|
4321
4531
|
if (screenInfo) {
|
|
4322
|
-
|
|
4532
|
+
logger15.info(`[VisionService] Screen resolution: ${screenInfo.width}x${screenInfo.height}`);
|
|
4323
4533
|
}
|
|
4324
|
-
|
|
4534
|
+
logger15.info("[VisionService] Screen vision initialized");
|
|
4325
4535
|
} catch (error) {
|
|
4326
|
-
|
|
4536
|
+
logger15.error("[VisionService] Failed to initialize screen vision:", error);
|
|
4327
4537
|
}
|
|
4328
4538
|
}
|
|
4329
4539
|
async initializeCameraVision() {
|
|
@@ -4331,18 +4541,18 @@ class VisionService extends Service {
|
|
|
4331
4541
|
if (!toolCheck.available) {
|
|
4332
4542
|
const platform = process.platform;
|
|
4333
4543
|
const toolName = platform === "darwin" ? "imagesnap" : platform === "linux" ? "fswebcam" : "ffmpeg";
|
|
4334
|
-
|
|
4335
|
-
|
|
4336
|
-
|
|
4337
|
-
|
|
4544
|
+
logger15.warn(`[VisionService] Camera capture tool '${toolName}' not found. Install it to enable camera functionality.`);
|
|
4545
|
+
logger15.warn("[VisionService] For macOS: brew install imagesnap");
|
|
4546
|
+
logger15.warn("[VisionService] For Linux: sudo apt-get install fswebcam");
|
|
4547
|
+
logger15.warn("[VisionService] For Windows: Install ffmpeg and add to PATH");
|
|
4338
4548
|
return;
|
|
4339
4549
|
}
|
|
4340
4550
|
const camera = await this.findCamera();
|
|
4341
4551
|
if (camera) {
|
|
4342
4552
|
this.camera = camera;
|
|
4343
|
-
|
|
4553
|
+
logger15.info(`[VisionService] Connected to camera: ${camera.name}`);
|
|
4344
4554
|
} else {
|
|
4345
|
-
|
|
4555
|
+
logger15.warn("[VisionService] No suitable camera found");
|
|
4346
4556
|
}
|
|
4347
4557
|
}
|
|
4348
4558
|
async initializeAudioCapture() {
|
|
@@ -4353,7 +4563,7 @@ class VisionService extends Service {
|
|
|
4353
4563
|
const enableMicrophone = getSettingString("ENABLE_MICROPHONE") === "true";
|
|
4354
4564
|
const useStreamingAudio = getSettingString("USE_STREAMING_AUDIO") === "true";
|
|
4355
4565
|
if (!enableMicrophone) {
|
|
4356
|
-
|
|
4566
|
+
logger15.info("[VisionService] Microphone capture disabled");
|
|
4357
4567
|
return;
|
|
4358
4568
|
}
|
|
4359
4569
|
try {
|
|
@@ -4378,20 +4588,20 @@ class VisionService extends Service {
|
|
|
4378
4588
|
};
|
|
4379
4589
|
this.streamingAudioCapture = new StreamingAudioCaptureService(this.runtime, streamingConfig);
|
|
4380
4590
|
this.streamingAudioCapture.on("speechStart", () => {
|
|
4381
|
-
|
|
4591
|
+
logger15.info("[VisionService] User started speaking");
|
|
4382
4592
|
});
|
|
4383
4593
|
this.streamingAudioCapture.on("speechEnd", () => {
|
|
4384
|
-
|
|
4594
|
+
logger15.info("[VisionService] User stopped speaking");
|
|
4385
4595
|
});
|
|
4386
4596
|
this.streamingAudioCapture.on("transcription", (data) => {
|
|
4387
|
-
|
|
4597
|
+
logger15.info(`[VisionService] Transcription (${data.isFinal ? "final" : "partial"}): ${data.text}`);
|
|
4388
4598
|
});
|
|
4389
4599
|
this.streamingAudioCapture.on("utteranceComplete", async (text) => {
|
|
4390
|
-
|
|
4600
|
+
logger15.info("[VisionService] Processing complete utterance:", text);
|
|
4391
4601
|
await this.storeAudioTranscription(text);
|
|
4392
4602
|
});
|
|
4393
4603
|
await this.streamingAudioCapture.initialize();
|
|
4394
|
-
|
|
4604
|
+
logger15.info("[VisionService] Streaming audio capture initialized with VAD");
|
|
4395
4605
|
} else {
|
|
4396
4606
|
const getSettingNumber = (key, defaultValue) => {
|
|
4397
4607
|
const value = this.runtime.getSetting(key);
|
|
@@ -4409,10 +4619,10 @@ class VisionService extends Service {
|
|
|
4409
4619
|
};
|
|
4410
4620
|
this.audioCapture = new AudioCaptureService(this.runtime, audioConfig);
|
|
4411
4621
|
await this.audioCapture.initialize();
|
|
4412
|
-
|
|
4622
|
+
logger15.info("[VisionService] Batch audio capture initialized");
|
|
4413
4623
|
}
|
|
4414
4624
|
} catch (error) {
|
|
4415
|
-
|
|
4625
|
+
logger15.error("[VisionService] Failed to initialize audio capture:", error);
|
|
4416
4626
|
}
|
|
4417
4627
|
}
|
|
4418
4628
|
async storeAudioTranscription(text) {
|
|
@@ -4420,9 +4630,9 @@ class VisionService extends Service {
|
|
|
4420
4630
|
if (this.lastSceneDescription) {
|
|
4421
4631
|
this.lastSceneDescription.audioTranscription = text;
|
|
4422
4632
|
}
|
|
4423
|
-
|
|
4633
|
+
logger15.debug("[VisionService] Stored audio transcription in scene context");
|
|
4424
4634
|
} catch (error) {
|
|
4425
|
-
|
|
4635
|
+
logger15.error("[VisionService] Failed to store audio transcription:", error);
|
|
4426
4636
|
}
|
|
4427
4637
|
}
|
|
4428
4638
|
startProcessing() {
|
|
@@ -4443,12 +4653,12 @@ class VisionService extends Service {
|
|
|
4443
4653
|
try {
|
|
4444
4654
|
await this.captureAndProcessFrame();
|
|
4445
4655
|
} catch (error) {
|
|
4446
|
-
|
|
4656
|
+
logger15.error("[VisionService] Frame processing error:", error);
|
|
4447
4657
|
}
|
|
4448
4658
|
this.isProcessing = false;
|
|
4449
4659
|
}
|
|
4450
4660
|
}, this.visionConfig.updateInterval || 100);
|
|
4451
|
-
|
|
4661
|
+
logger15.debug("[VisionService] Started frame processing loop");
|
|
4452
4662
|
}
|
|
4453
4663
|
async captureAndProcessFrame() {
|
|
4454
4664
|
if (!this.camera) {
|
|
@@ -4457,19 +4667,19 @@ class VisionService extends Service {
|
|
|
4457
4667
|
try {
|
|
4458
4668
|
const frameData = await this.camera.capture();
|
|
4459
4669
|
if (!frameData || frameData.length === 0) {
|
|
4460
|
-
|
|
4670
|
+
logger15.debug("[VisionService] Camera returned empty frame, skipping");
|
|
4461
4671
|
return;
|
|
4462
4672
|
}
|
|
4463
4673
|
const frame = await this.processFrameData(frameData);
|
|
4464
4674
|
if (!frame || frame.width === 0 || frame.height === 0) {
|
|
4465
|
-
|
|
4675
|
+
logger15.warn("[VisionService] Invalid frame dimensions, skipping");
|
|
4466
4676
|
return;
|
|
4467
4677
|
}
|
|
4468
4678
|
const changePercentage = this.lastFrame ? await this.calculatePixelChange(this.lastFrame, frame) : 100;
|
|
4469
4679
|
await this.updateSceneDescription(frame, changePercentage);
|
|
4470
4680
|
this.lastFrame = frame;
|
|
4471
4681
|
} catch (error) {
|
|
4472
|
-
|
|
4682
|
+
logger15.error("[VisionService] Error capturing frame:", error);
|
|
4473
4683
|
}
|
|
4474
4684
|
}
|
|
4475
4685
|
async processFrameData(data) {
|
|
@@ -4516,7 +4726,8 @@ class VisionService extends Service {
|
|
|
4516
4726
|
async updateSceneDescription(frame, changePercentage) {
|
|
4517
4727
|
try {
|
|
4518
4728
|
const currentTime = Date.now();
|
|
4519
|
-
const
|
|
4729
|
+
const testImage = getTestImage();
|
|
4730
|
+
const jpegBuffer = testImage ? await sharp4(testImage).jpeg().toBuffer() : await sharp4(frame.data, {
|
|
4520
4731
|
raw: {
|
|
4521
4732
|
width: frame.width,
|
|
4522
4733
|
height: frame.height,
|
|
@@ -4534,7 +4745,7 @@ class VisionService extends Service {
|
|
|
4534
4745
|
description = await this.describeSceneWithVLM(imageUrl);
|
|
4535
4746
|
this.lastVlmUpdateTime = currentTime;
|
|
4536
4747
|
this.lastTfDescription = description;
|
|
4537
|
-
|
|
4748
|
+
logger15.debug(`[VisionService] VLM updated: ${timeSinceVlmUpdate}ms since last update, ${changePercentage.toFixed(1)}% change`);
|
|
4538
4749
|
}
|
|
4539
4750
|
const timeSinceTfUpdate = currentTime - this.lastTfUpdateTime;
|
|
4540
4751
|
const tfUpdateInterval = this.visionConfig.tfUpdateInterval ?? 1000;
|
|
@@ -4544,18 +4755,18 @@ class VisionService extends Service {
|
|
|
4544
4755
|
let people = [];
|
|
4545
4756
|
if (shouldUpdateTf && (this.visionConfig.enableObjectDetection || this.visionConfig.enablePoseDetection)) {
|
|
4546
4757
|
this.lastTfUpdateTime = currentTime;
|
|
4547
|
-
|
|
4758
|
+
logger15.debug(`[VisionService] TF updating: ${timeSinceTfUpdate}ms since last update, ${changePercentage.toFixed(1)}% change`);
|
|
4548
4759
|
if (this.visionConfig.enableObjectDetection) {
|
|
4549
4760
|
if (this.visionModels.hasObjectDetection()) {
|
|
4550
4761
|
detectedObjects = await this.visionModels.detectObjects(frame.data, frame.width, frame.height);
|
|
4551
|
-
|
|
4762
|
+
logger15.debug(`[VisionService] VisionModels detected ${detectedObjects.length} objects`);
|
|
4552
4763
|
}
|
|
4553
4764
|
}
|
|
4554
4765
|
if (this.visionConfig.enablePoseDetection) {
|
|
4555
4766
|
if (this.visionModels.hasPoseDetection()) {
|
|
4556
4767
|
const poses = await this.visionModels.detectPoses(frame.data, frame.width, frame.height);
|
|
4557
4768
|
people = poses;
|
|
4558
|
-
|
|
4769
|
+
logger15.debug(`[VisionService] VisionModels detected ${people.length} people with poses`);
|
|
4559
4770
|
}
|
|
4560
4771
|
}
|
|
4561
4772
|
if (people.length === 0 && detectedObjects.length > 0) {
|
|
@@ -4583,8 +4794,8 @@ class VisionService extends Service {
|
|
|
4583
4794
|
const enableFaceRecognition = getSettingString("ENABLE_FACE_RECOGNITION") === "true";
|
|
4584
4795
|
if (enableFaceRecognition && people.length > 0 && frame.width > 0 && frame.height > 0) {
|
|
4585
4796
|
try {
|
|
4586
|
-
if (
|
|
4587
|
-
|
|
4797
|
+
if (frame.data.length === 0) {
|
|
4798
|
+
logger15.warn("[VisionService] Invalid frame data for face recognition");
|
|
4588
4799
|
return;
|
|
4589
4800
|
}
|
|
4590
4801
|
const faces = await this.faceRecognition.detectFaces(frame.data, frame.width, frame.height);
|
|
@@ -4602,7 +4813,7 @@ class VisionService extends Service {
|
|
|
4602
4813
|
let profileId;
|
|
4603
4814
|
if (match) {
|
|
4604
4815
|
profileId = match.profileId;
|
|
4605
|
-
|
|
4816
|
+
logger15.debug(`[VisionService] Recognized face: ${profileId} (distance: ${match.distance})`);
|
|
4606
4817
|
} else {
|
|
4607
4818
|
profileId = await this.faceRecognition.addOrUpdateFace(face.descriptor, {
|
|
4608
4819
|
attributes: {
|
|
@@ -4611,7 +4822,7 @@ class VisionService extends Service {
|
|
|
4611
4822
|
emotion: face.expressions ? this.getDominantExpression(face.expressions) : undefined
|
|
4612
4823
|
}
|
|
4613
4824
|
});
|
|
4614
|
-
|
|
4825
|
+
logger15.info(`[VisionService] New face registered: ${profileId}`);
|
|
4615
4826
|
}
|
|
4616
4827
|
faceProfiles.set(person.id, profileId);
|
|
4617
4828
|
break;
|
|
@@ -4619,7 +4830,7 @@ class VisionService extends Service {
|
|
|
4619
4830
|
}
|
|
4620
4831
|
}
|
|
4621
4832
|
} catch (faceError) {
|
|
4622
|
-
|
|
4833
|
+
logger15.error("[VisionService] Face recognition error:", faceError);
|
|
4623
4834
|
}
|
|
4624
4835
|
}
|
|
4625
4836
|
const _trackedEntities = await this.entityTracker.updateEntities(detectedObjects, people, faceProfiles, this.runtime);
|
|
@@ -4632,51 +4843,75 @@ class VisionService extends Service {
|
|
|
4632
4843
|
changePercentage
|
|
4633
4844
|
};
|
|
4634
4845
|
if (shouldUpdateVlm || shouldUpdateTf) {
|
|
4635
|
-
|
|
4636
|
-
|
|
4637
|
-
|
|
4638
|
-
|
|
4639
|
-
|
|
4846
|
+
logger15.info("[VisionService] Scene Analysis Complete:");
|
|
4847
|
+
logger15.info(` VLM Description: ${description.substring(0, 100)}...`);
|
|
4848
|
+
logger15.info(` Change: ${changePercentage.toFixed(1)}%`);
|
|
4849
|
+
logger15.info(` Updates: ${shouldUpdateVlm ? "VLM" : ""}${shouldUpdateVlm && shouldUpdateTf ? " + " : ""}${shouldUpdateTf ? "TF" : ""}`);
|
|
4850
|
+
logger15.info(` Detection Mode: ${this.visionConfig.enableObjectDetection ? "Advanced CV" : "Motion-based"}`);
|
|
4640
4851
|
if (detectedObjects.length > 0) {
|
|
4641
|
-
|
|
4852
|
+
logger15.info(` Objects: ${detectedObjects.length} detected`);
|
|
4642
4853
|
const objectSummary = detectedObjects.reduce((acc, obj) => {
|
|
4643
4854
|
acc[obj.type] = (acc[obj.type] || 0) + 1;
|
|
4644
4855
|
return acc;
|
|
4645
4856
|
}, {});
|
|
4646
4857
|
for (const [type, count] of Object.entries(objectSummary)) {
|
|
4647
|
-
|
|
4858
|
+
logger15.info(` - ${count} ${type}(s)`);
|
|
4648
4859
|
}
|
|
4649
4860
|
}
|
|
4650
4861
|
if (people.length > 0) {
|
|
4651
|
-
|
|
4862
|
+
logger15.info(` People: ${people.length} detected`);
|
|
4652
4863
|
for (const person of people) {
|
|
4653
|
-
|
|
4864
|
+
logger15.info(` - Person: ${person.pose} pose, facing ${person.facing}, confidence: ${person.confidence.toFixed(2)}`);
|
|
4654
4865
|
}
|
|
4655
4866
|
}
|
|
4656
4867
|
}
|
|
4657
4868
|
} catch (error) {
|
|
4658
|
-
|
|
4869
|
+
logger15.error("[VisionService] Failed to update scene description:", error);
|
|
4659
4870
|
}
|
|
4660
4871
|
}
|
|
4661
4872
|
async describeSceneWithVLM(imageUrl) {
|
|
4873
|
+
return withStandaloneTrajectory3(this.runtime, {
|
|
4874
|
+
source: "plugin-vision:scene-description",
|
|
4875
|
+
metadata: { modelType: ModelType3.IMAGE_DESCRIPTION }
|
|
4876
|
+
}, () => this.describeSceneWithVLMInTrajectory(imageUrl));
|
|
4877
|
+
}
|
|
4878
|
+
async describeSceneWithVLMInTrajectory(imageUrl) {
|
|
4662
4879
|
try {
|
|
4663
4880
|
if (imageUrl.startsWith("data:image/")) {
|
|
4664
4881
|
const base64Data = imageUrl.split(",")[1];
|
|
4665
4882
|
const imageBuffer = Buffer.from(base64Data, "base64");
|
|
4666
4883
|
if (this.florence2.isInitialized()) {
|
|
4667
4884
|
try {
|
|
4668
|
-
const result = await this.
|
|
4885
|
+
const result = await recordLlmCall(this.runtime, {
|
|
4886
|
+
model: "florence2-local",
|
|
4887
|
+
systemPrompt: "",
|
|
4888
|
+
userPrompt: JSON.stringify({
|
|
4889
|
+
task: "describe_visual_scene",
|
|
4890
|
+
image: {
|
|
4891
|
+
source: "camera_frame",
|
|
4892
|
+
mimeType: "image/jpeg",
|
|
4893
|
+
bytes: imageBuffer.byteLength
|
|
4894
|
+
}
|
|
4895
|
+
}, null, 2),
|
|
4896
|
+
temperature: 0,
|
|
4897
|
+
maxTokens: 0,
|
|
4898
|
+
purpose: "background",
|
|
4899
|
+
actionType: "florence2.analyzeImage"
|
|
4900
|
+
}, () => this.florence2.analyzeImage(imageBuffer));
|
|
4669
4901
|
if (result.caption) {
|
|
4670
|
-
|
|
4902
|
+
logger15.debug("[VisionService] Florence-2 description:", result.caption);
|
|
4671
4903
|
return result.caption;
|
|
4672
4904
|
}
|
|
4673
4905
|
} catch (florenceError) {
|
|
4674
|
-
|
|
4906
|
+
logger15.warn("[VisionService] Florence-2 analysis failed, falling back:", florenceError);
|
|
4675
4907
|
}
|
|
4676
4908
|
}
|
|
4677
4909
|
}
|
|
4678
4910
|
try {
|
|
4679
|
-
const result = await this.runtime.useModel(ModelType3.IMAGE_DESCRIPTION,
|
|
4911
|
+
const result = await this.runtime.useModel(ModelType3.IMAGE_DESCRIPTION, {
|
|
4912
|
+
imageUrl,
|
|
4913
|
+
prompt: SCENE_DESCRIPTION_PROMPT
|
|
4914
|
+
});
|
|
4680
4915
|
if (result && typeof result === "object" && "description" in result) {
|
|
4681
4916
|
const description = result.description;
|
|
4682
4917
|
if (!description.includes("I'm unable to analyze images") && !description.includes("I can't analyze images")) {
|
|
@@ -4689,7 +4924,7 @@ class VisionService extends Service {
|
|
|
4689
4924
|
}
|
|
4690
4925
|
}
|
|
4691
4926
|
} catch (modelError) {
|
|
4692
|
-
|
|
4927
|
+
logger15.warn("[VisionService] Runtime IMAGE_DESCRIPTION model failed:", modelError);
|
|
4693
4928
|
}
|
|
4694
4929
|
if (this.lastSceneDescription) {
|
|
4695
4930
|
const { objects, people } = this.lastSceneDescription;
|
|
@@ -4715,7 +4950,7 @@ class VisionService extends Service {
|
|
|
4715
4950
|
}
|
|
4716
4951
|
return "Visual scene captured";
|
|
4717
4952
|
} catch (error) {
|
|
4718
|
-
|
|
4953
|
+
logger15.error("[VisionService] VLM description failed:", error);
|
|
4719
4954
|
return "Unable to describe scene";
|
|
4720
4955
|
}
|
|
4721
4956
|
}
|
|
@@ -4879,12 +5114,12 @@ class VisionService extends Service {
|
|
|
4879
5114
|
try {
|
|
4880
5115
|
await this.captureAndProcessScreen();
|
|
4881
5116
|
} catch (error) {
|
|
4882
|
-
|
|
5117
|
+
logger15.error("[VisionService] Screen processing error:", error);
|
|
4883
5118
|
}
|
|
4884
5119
|
this.isProcessingScreen = false;
|
|
4885
5120
|
}
|
|
4886
5121
|
}, this.visionConfig.screenCaptureInterval || 2000);
|
|
4887
|
-
|
|
5122
|
+
logger15.debug("[VisionService] Started screen processing loop");
|
|
4888
5123
|
}
|
|
4889
5124
|
async captureAndProcessScreen() {
|
|
4890
5125
|
try {
|
|
@@ -4897,7 +5132,7 @@ class VisionService extends Service {
|
|
|
4897
5132
|
}
|
|
4898
5133
|
await this.updateEnhancedSceneDescription();
|
|
4899
5134
|
} catch (error) {
|
|
4900
|
-
|
|
5135
|
+
logger15.error("[VisionService] Error capturing screen:", error);
|
|
4901
5136
|
}
|
|
4902
5137
|
}
|
|
4903
5138
|
async analyzeTile(tile) {
|
|
@@ -4922,7 +5157,7 @@ class VisionService extends Service {
|
|
|
4922
5157
|
}));
|
|
4923
5158
|
}
|
|
4924
5159
|
} catch (error) {
|
|
4925
|
-
|
|
5160
|
+
logger15.error("[VisionService] Error analyzing tile:", error);
|
|
4926
5161
|
}
|
|
4927
5162
|
return analysis;
|
|
4928
5163
|
}
|
|
@@ -4984,11 +5219,11 @@ class VisionService extends Service {
|
|
|
4984
5219
|
return this.visionConfig.visionMode || "CAMERA" /* CAMERA */;
|
|
4985
5220
|
}
|
|
4986
5221
|
async setVisionMode(mode) {
|
|
4987
|
-
|
|
5222
|
+
logger15.info(`[VisionService] Changing vision mode from ${this.visionConfig.visionMode} to ${mode}`);
|
|
4988
5223
|
this.stopProcessing();
|
|
4989
5224
|
this.visionConfig.visionMode = mode;
|
|
4990
5225
|
if (mode === "OFF" /* OFF */) {
|
|
4991
|
-
|
|
5226
|
+
logger15.info("[VisionService] Vision disabled");
|
|
4992
5227
|
return;
|
|
4993
5228
|
}
|
|
4994
5229
|
if ((mode === "CAMERA" /* CAMERA */ || mode === "BOTH" /* BOTH */) && !this.camera) {
|
|
@@ -5063,7 +5298,7 @@ class VisionService extends Service {
|
|
|
5063
5298
|
return this.faceRecognition;
|
|
5064
5299
|
}
|
|
5065
5300
|
async stop() {
|
|
5066
|
-
|
|
5301
|
+
logger15.info("[VisionService] Stopping vision service...");
|
|
5067
5302
|
this.stopProcessing();
|
|
5068
5303
|
if (this.audioCapture) {
|
|
5069
5304
|
await this.audioCapture.stop();
|
|
@@ -5089,13 +5324,13 @@ class VisionService extends Service {
|
|
|
5089
5324
|
this.isProcessingScreen = false;
|
|
5090
5325
|
await this.florence2.dispose();
|
|
5091
5326
|
await this.ocrService.dispose();
|
|
5092
|
-
|
|
5327
|
+
logger15.info("[VisionService] Stopped.");
|
|
5093
5328
|
}
|
|
5094
5329
|
async findCamera() {
|
|
5095
5330
|
try {
|
|
5096
5331
|
const cameras = await this.listCameras();
|
|
5097
5332
|
if (cameras.length === 0) {
|
|
5098
|
-
|
|
5333
|
+
logger15.warn("[VisionService] No cameras detected");
|
|
5099
5334
|
return null;
|
|
5100
5335
|
}
|
|
5101
5336
|
if (this.visionConfig.cameraName) {
|
|
@@ -5104,11 +5339,11 @@ class VisionService extends Service {
|
|
|
5104
5339
|
if (matchedCamera) {
|
|
5105
5340
|
return this.createCameraDevice(matchedCamera);
|
|
5106
5341
|
}
|
|
5107
|
-
|
|
5342
|
+
logger15.warn(`[VisionService] Camera "${this.visionConfig.cameraName}" not found, using default`);
|
|
5108
5343
|
}
|
|
5109
5344
|
return this.createCameraDevice(cameras[0]);
|
|
5110
5345
|
} catch (error) {
|
|
5111
|
-
|
|
5346
|
+
logger15.error("[VisionService] Error finding camera:", error);
|
|
5112
5347
|
return null;
|
|
5113
5348
|
}
|
|
5114
5349
|
}
|
|
@@ -5166,7 +5401,7 @@ class VisionService extends Service {
|
|
|
5166
5401
|
}
|
|
5167
5402
|
return [];
|
|
5168
5403
|
} catch (error) {
|
|
5169
|
-
|
|
5404
|
+
logger15.error("[VisionService] Error listing cameras:", error);
|
|
5170
5405
|
return [];
|
|
5171
5406
|
}
|
|
5172
5407
|
}
|
|
@@ -5223,13 +5458,13 @@ class VisionService extends Service {
|
|
|
5223
5458
|
}
|
|
5224
5459
|
async captureImage() {
|
|
5225
5460
|
if (!this.camera) {
|
|
5226
|
-
|
|
5461
|
+
logger15.warn("[VisionService] No camera available for capture");
|
|
5227
5462
|
return null;
|
|
5228
5463
|
}
|
|
5229
5464
|
try {
|
|
5230
5465
|
return await this.camera.capture();
|
|
5231
5466
|
} catch (error) {
|
|
5232
|
-
|
|
5467
|
+
logger15.error("[VisionService] Failed to capture image:", error);
|
|
5233
5468
|
return null;
|
|
5234
5469
|
}
|
|
5235
5470
|
}
|
|
@@ -5250,7 +5485,7 @@ class ScreenVisionE2ETestSuite {
|
|
|
5250
5485
|
throw new Error("Vision service not available");
|
|
5251
5486
|
}
|
|
5252
5487
|
await visionService.setVisionMode("SCREEN" /* SCREEN */);
|
|
5253
|
-
await new Promise((
|
|
5488
|
+
await new Promise((resolve2) => setTimeout(resolve2, 2000));
|
|
5254
5489
|
const mode = visionService.getVisionMode();
|
|
5255
5490
|
if (mode !== "SCREEN" /* SCREEN */) {
|
|
5256
5491
|
throw new Error(`Expected vision mode SCREEN but got ${mode}`);
|
|
@@ -5273,7 +5508,7 @@ class ScreenVisionE2ETestSuite {
|
|
|
5273
5508
|
throw new Error("Vision service not available");
|
|
5274
5509
|
}
|
|
5275
5510
|
await visionService.setVisionMode("SCREEN" /* SCREEN */);
|
|
5276
|
-
await new Promise((
|
|
5511
|
+
await new Promise((resolve2) => setTimeout(resolve2, 3000));
|
|
5277
5512
|
const screenCapture = await visionService.getScreenCapture();
|
|
5278
5513
|
if (!screenCapture) {
|
|
5279
5514
|
console.warn("⚠️ No screen capture available - screen capture may not be supported in this environment");
|
|
@@ -5303,7 +5538,7 @@ class ScreenVisionE2ETestSuite {
|
|
|
5303
5538
|
throw new Error("Vision service not available");
|
|
5304
5539
|
}
|
|
5305
5540
|
await visionService.setVisionMode("SCREEN" /* SCREEN */);
|
|
5306
|
-
await new Promise((
|
|
5541
|
+
await new Promise((resolve2) => setTimeout(resolve2, 5000));
|
|
5307
5542
|
const enhancedScene = await visionService.getEnhancedSceneDescription();
|
|
5308
5543
|
if (!enhancedScene || !enhancedScene.screenAnalysis) {
|
|
5309
5544
|
console.warn("⚠️ No enhanced scene analysis available yet");
|
|
@@ -5342,18 +5577,23 @@ class ScreenVisionE2ETestSuite {
|
|
|
5342
5577
|
if (!visionService) {
|
|
5343
5578
|
throw new Error("Vision service not available");
|
|
5344
5579
|
}
|
|
5345
|
-
const modes = [
|
|
5580
|
+
const modes = [
|
|
5581
|
+
"CAMERA" /* CAMERA */,
|
|
5582
|
+
"SCREEN" /* SCREEN */,
|
|
5583
|
+
"BOTH" /* BOTH */,
|
|
5584
|
+
"OFF" /* OFF */
|
|
5585
|
+
];
|
|
5346
5586
|
for (const mode of modes) {
|
|
5347
5587
|
console.log(` Switching to ${mode} mode...`);
|
|
5348
5588
|
await visionService.setVisionMode(mode);
|
|
5349
|
-
await new Promise((
|
|
5589
|
+
await new Promise((resolve2) => setTimeout(resolve2, 1000));
|
|
5350
5590
|
const currentMode = visionService.getVisionMode();
|
|
5351
5591
|
if (currentMode !== mode) {
|
|
5352
5592
|
throw new Error(`Failed to switch to ${mode} mode, current mode is ${currentMode}`);
|
|
5353
5593
|
}
|
|
5354
5594
|
console.log(` ✓ Successfully switched to ${mode} mode`);
|
|
5355
5595
|
}
|
|
5356
|
-
console.log(" Testing
|
|
5596
|
+
console.log(" Testing VISION action with op=set_mode...");
|
|
5357
5597
|
const message = {
|
|
5358
5598
|
id: createUniqueUuid3(runtime, "test-msg"),
|
|
5359
5599
|
entityId: runtime.agentId,
|
|
@@ -5363,13 +5603,13 @@ class ScreenVisionE2ETestSuite {
|
|
|
5363
5603
|
createdAt: Date.now()
|
|
5364
5604
|
};
|
|
5365
5605
|
let callbackCalled = false;
|
|
5366
|
-
await
|
|
5606
|
+
await visionAction.handler(runtime, message, { values: {}, data: {}, text: "" }, { parameters: { op: "set_mode" } }, async (response) => {
|
|
5367
5607
|
callbackCalled = true;
|
|
5368
5608
|
console.log(` Action response: ${response.text}`);
|
|
5369
5609
|
return [];
|
|
5370
5610
|
});
|
|
5371
5611
|
if (!callbackCalled) {
|
|
5372
|
-
throw new Error("
|
|
5612
|
+
throw new Error("VISION set_mode op did not call callback");
|
|
5373
5613
|
}
|
|
5374
5614
|
const finalMode = visionService.getVisionMode();
|
|
5375
5615
|
if (finalMode !== "BOTH" /* BOTH */) {
|
|
@@ -5387,7 +5627,7 @@ class ScreenVisionE2ETestSuite {
|
|
|
5387
5627
|
throw new Error("Vision service not available");
|
|
5388
5628
|
}
|
|
5389
5629
|
await visionService.setVisionMode("BOTH" /* BOTH */);
|
|
5390
|
-
await new Promise((
|
|
5630
|
+
await new Promise((resolve2) => setTimeout(resolve2, 5000));
|
|
5391
5631
|
const enhancedScene = await visionService.getEnhancedSceneDescription();
|
|
5392
5632
|
const hasCamera = visionService.getCameraInfo() !== null;
|
|
5393
5633
|
const hasScreen = await visionService.getScreenCapture() !== null;
|
|
@@ -5428,19 +5668,22 @@ class ScreenVisionE2ETestSuite {
|
|
|
5428
5668
|
if (!visionService) {
|
|
5429
5669
|
throw new Error("Vision service not available");
|
|
5430
5670
|
}
|
|
5431
|
-
const
|
|
5432
|
-
const
|
|
5433
|
-
|
|
5434
|
-
|
|
5435
|
-
|
|
5436
|
-
|
|
5437
|
-
|
|
5671
|
+
const originalConfig = Reflect.get(visionService, "visionConfig");
|
|
5672
|
+
const invalidConfig = {
|
|
5673
|
+
...originalConfig,
|
|
5674
|
+
screenRegion: {
|
|
5675
|
+
x: -100,
|
|
5676
|
+
y: -100,
|
|
5677
|
+
width: 50000,
|
|
5678
|
+
height: 50000
|
|
5679
|
+
}
|
|
5438
5680
|
};
|
|
5681
|
+
Reflect.set(visionService, "visionConfig", invalidConfig);
|
|
5439
5682
|
await visionService.setVisionMode("SCREEN" /* SCREEN */);
|
|
5440
|
-
await new Promise((
|
|
5683
|
+
await new Promise((resolve2) => setTimeout(resolve2, 2000));
|
|
5441
5684
|
const isActive = visionService.isActive();
|
|
5442
5685
|
console.log(` Service active after invalid config: ${isActive}`);
|
|
5443
|
-
|
|
5686
|
+
Reflect.set(visionService, "visionConfig", originalConfig);
|
|
5444
5687
|
console.log("✓ Error handling works correctly");
|
|
5445
5688
|
}
|
|
5446
5689
|
}
|
|
@@ -5489,13 +5732,13 @@ class VisionBasicE2ETestSuite {
|
|
|
5489
5732
|
let callbackResponse = null;
|
|
5490
5733
|
const state = { values: {}, data: {}, text: "" };
|
|
5491
5734
|
const visionService = runtime.getService("VISION");
|
|
5492
|
-
const isValid = await
|
|
5735
|
+
const isValid = await visionAction.validate(runtime, message, state);
|
|
5493
5736
|
if (!visionService || !visionService.isActive()) {
|
|
5494
5737
|
if (isValid) {
|
|
5495
5738
|
throw new Error("Action validation should return false when vision service is not active");
|
|
5496
5739
|
}
|
|
5497
5740
|
console.log(" Action validation correctly returned false (vision not active)");
|
|
5498
|
-
await
|
|
5741
|
+
await visionAction.handler(runtime, message, state, { parameters: { op: "describe" } }, async (response) => {
|
|
5499
5742
|
callbackCalled = true;
|
|
5500
5743
|
callbackResponse = response;
|
|
5501
5744
|
return [];
|
|
@@ -5513,10 +5756,10 @@ class VisionBasicE2ETestSuite {
|
|
|
5513
5756
|
}
|
|
5514
5757
|
} else {
|
|
5515
5758
|
if (!isValid) {
|
|
5516
|
-
throw new Error("
|
|
5759
|
+
throw new Error("visionAction validation failed despite active vision");
|
|
5517
5760
|
}
|
|
5518
5761
|
console.log(" Action validation: passed");
|
|
5519
|
-
await
|
|
5762
|
+
await visionAction.handler(runtime, message, state, { parameters: { op: "describe" } }, async (response) => {
|
|
5520
5763
|
callbackCalled = true;
|
|
5521
5764
|
callbackResponse = response;
|
|
5522
5765
|
return [];
|
|
@@ -5533,8 +5776,8 @@ class VisionBasicE2ETestSuite {
|
|
|
5533
5776
|
console.log(` Thought: ${callbackResponse.thought}`);
|
|
5534
5777
|
}
|
|
5535
5778
|
}
|
|
5536
|
-
if (!callbackResponse.actions || !callbackResponse.actions.includes("
|
|
5537
|
-
throw new Error("Response does not include
|
|
5779
|
+
if (!callbackResponse.actions || !callbackResponse.actions.includes("VISION")) {
|
|
5780
|
+
throw new Error("Response does not include VISION action");
|
|
5538
5781
|
}
|
|
5539
5782
|
}
|
|
5540
5783
|
},
|
|
@@ -5555,13 +5798,13 @@ class VisionBasicE2ETestSuite {
|
|
|
5555
5798
|
let callbackResponse = null;
|
|
5556
5799
|
const state = { values: {}, data: {}, text: "" };
|
|
5557
5800
|
const visionService = runtime.getService("VISION");
|
|
5558
|
-
const isValid = await
|
|
5801
|
+
const isValid = await visionAction.validate(runtime, message, state);
|
|
5559
5802
|
if (!visionService || !visionService.isActive()) {
|
|
5560
5803
|
if (isValid) {
|
|
5561
5804
|
throw new Error("Action validation should return false when vision service is not active");
|
|
5562
5805
|
}
|
|
5563
5806
|
console.log(" Action validation correctly returned false (vision not active)");
|
|
5564
|
-
await
|
|
5807
|
+
await visionAction.handler(runtime, message, state, { parameters: { op: "capture" } }, async (response) => {
|
|
5565
5808
|
callbackCalled = true;
|
|
5566
5809
|
callbackResponse = response;
|
|
5567
5810
|
return [];
|
|
@@ -5579,10 +5822,10 @@ class VisionBasicE2ETestSuite {
|
|
|
5579
5822
|
}
|
|
5580
5823
|
} else {
|
|
5581
5824
|
if (!isValid) {
|
|
5582
|
-
throw new Error("
|
|
5825
|
+
throw new Error("visionAction validation failed despite active vision");
|
|
5583
5826
|
}
|
|
5584
5827
|
console.log(" Action validation: passed");
|
|
5585
|
-
await
|
|
5828
|
+
await visionAction.handler(runtime, message, state, { parameters: { op: "capture" } }, async (response) => {
|
|
5586
5829
|
callbackCalled = true;
|
|
5587
5830
|
callbackResponse = response;
|
|
5588
5831
|
return [];
|
|
@@ -5604,8 +5847,8 @@ class VisionBasicE2ETestSuite {
|
|
|
5604
5847
|
}
|
|
5605
5848
|
console.log(` ✓ Image attachment valid: ${attachment.title}`);
|
|
5606
5849
|
}
|
|
5607
|
-
if (!callbackResponse.actions || !callbackResponse.actions.includes("
|
|
5608
|
-
throw new Error("Response does not include
|
|
5850
|
+
if (!callbackResponse.actions || !callbackResponse.actions.includes("VISION")) {
|
|
5851
|
+
throw new Error("Response does not include VISION action");
|
|
5609
5852
|
}
|
|
5610
5853
|
}
|
|
5611
5854
|
},
|
|
@@ -5652,7 +5895,7 @@ class VisionBasicE2ETestSuite {
|
|
|
5652
5895
|
}
|
|
5653
5896
|
const initialScene = await visionService.getSceneDescription();
|
|
5654
5897
|
console.log(` Initial scene: ${initialScene ? "Available" : "Pending..."}`);
|
|
5655
|
-
await new Promise((
|
|
5898
|
+
await new Promise((resolve2) => setTimeout(resolve2, 2000));
|
|
5656
5899
|
const updatedScene = await visionService.getSceneDescription();
|
|
5657
5900
|
if (!updatedScene) {
|
|
5658
5901
|
throw new Error("No scene description available after 2 seconds");
|
|
@@ -5679,7 +5922,7 @@ class VisionBasicE2ETestSuite {
|
|
|
5679
5922
|
return;
|
|
5680
5923
|
}
|
|
5681
5924
|
console.log(" Waiting for scene analysis...");
|
|
5682
|
-
await new Promise((
|
|
5925
|
+
await new Promise((resolve2) => setTimeout(resolve2, 3000));
|
|
5683
5926
|
const scene = await visionService.getSceneDescription();
|
|
5684
5927
|
if (!scene) {
|
|
5685
5928
|
throw new Error("No scene description available after 3 seconds");
|
|
@@ -5869,7 +6112,7 @@ class VisionCaptureLogTestSuite {
|
|
|
5869
6112
|
const captureDuration = captureEndTime - captureStartTime;
|
|
5870
6113
|
const waitTime = Math.max(0, captureInterval - captureDuration);
|
|
5871
6114
|
if (waitTime > 0) {
|
|
5872
|
-
await new Promise((
|
|
6115
|
+
await new Promise((resolve2) => setTimeout(resolve2, waitTime));
|
|
5873
6116
|
}
|
|
5874
6117
|
}
|
|
5875
6118
|
captureData.endTime = new Date().toISOString();
|
|
@@ -5942,7 +6185,7 @@ ${captureData.captures.filter((c) => c.scene?.description).slice(0, 5).map((c, _
|
|
|
5942
6185
|
}
|
|
5943
6186
|
var vision_capture_log_default = new VisionCaptureLogTestSuite;
|
|
5944
6187
|
// src/tests/e2e/vision-runtime.ts
|
|
5945
|
-
import { logger as
|
|
6188
|
+
import { logger as logger16 } from "@elizaos/core";
|
|
5946
6189
|
class VisionRuntimeTestSuite {
|
|
5947
6190
|
name = "vision-runtime-tests";
|
|
5948
6191
|
description = "Real runtime tests for vision plugin functionality";
|
|
@@ -5950,7 +6193,7 @@ class VisionRuntimeTestSuite {
|
|
|
5950
6193
|
{
|
|
5951
6194
|
name: "Vision service initialization",
|
|
5952
6195
|
fn: async (runtime) => {
|
|
5953
|
-
|
|
6196
|
+
logger16.info("[Test] Testing vision service initialization...");
|
|
5954
6197
|
const visionService = runtime.getService(VisionServiceType.VISION);
|
|
5955
6198
|
if (!visionService) {
|
|
5956
6199
|
throw new Error("Vision service not found in runtime");
|
|
@@ -5959,24 +6202,24 @@ class VisionRuntimeTestSuite {
|
|
|
5959
6202
|
throw new Error("Vision service missing isActive method");
|
|
5960
6203
|
}
|
|
5961
6204
|
const isActive = visionService.isActive();
|
|
5962
|
-
|
|
6205
|
+
logger16.info(`[Test] Vision service active: ${isActive}`);
|
|
5963
6206
|
if (!isActive && runtime.getSetting("VISION_MODE") !== "OFF" /* OFF */) {
|
|
5964
6207
|
throw new Error("Vision service should be active but is not");
|
|
5965
6208
|
}
|
|
5966
|
-
|
|
6209
|
+
logger16.info("[Test] ✅ Vision service initialization test passed");
|
|
5967
6210
|
}
|
|
5968
6211
|
},
|
|
5969
6212
|
{
|
|
5970
6213
|
name: "Scene description functionality",
|
|
5971
6214
|
fn: async (runtime) => {
|
|
5972
|
-
|
|
6215
|
+
logger16.info("[Test] Testing scene description...");
|
|
5973
6216
|
const visionService = runtime.getService(VisionServiceType.VISION);
|
|
5974
6217
|
if (!visionService) {
|
|
5975
6218
|
throw new Error("Vision service not found");
|
|
5976
6219
|
}
|
|
5977
6220
|
const scene = await visionService.getSceneDescription();
|
|
5978
6221
|
if (!scene) {
|
|
5979
|
-
|
|
6222
|
+
logger16.warn("[Test] No scene description available (camera might not be connected)");
|
|
5980
6223
|
return;
|
|
5981
6224
|
}
|
|
5982
6225
|
if (typeof scene.timestamp !== "number") {
|
|
@@ -5991,21 +6234,21 @@ class VisionRuntimeTestSuite {
|
|
|
5991
6234
|
if (!Array.isArray(scene.people)) {
|
|
5992
6235
|
throw new Error("Scene description missing people array");
|
|
5993
6236
|
}
|
|
5994
|
-
|
|
5995
|
-
|
|
5996
|
-
|
|
6237
|
+
logger16.info(`[Test] Scene: ${scene.description.substring(0, 100)}...`);
|
|
6238
|
+
logger16.info(`[Test] Objects: ${scene.objects.length}, People: ${scene.people.length}`);
|
|
6239
|
+
logger16.info("[Test] ✅ Scene description test passed");
|
|
5997
6240
|
}
|
|
5998
6241
|
},
|
|
5999
6242
|
{
|
|
6000
6243
|
name: "Vision mode switching",
|
|
6001
6244
|
fn: async (runtime) => {
|
|
6002
|
-
|
|
6245
|
+
logger16.info("[Test] Testing vision mode switching...");
|
|
6003
6246
|
const visionService = runtime.getService(VisionServiceType.VISION);
|
|
6004
6247
|
if (!visionService) {
|
|
6005
6248
|
throw new Error("Vision service not found");
|
|
6006
6249
|
}
|
|
6007
6250
|
const originalMode = visionService.getVisionMode();
|
|
6008
|
-
|
|
6251
|
+
logger16.info(`[Test] Original mode: ${originalMode}`);
|
|
6009
6252
|
const testModes = [
|
|
6010
6253
|
"CAMERA" /* CAMERA */,
|
|
6011
6254
|
"SCREEN" /* SCREEN */,
|
|
@@ -6013,7 +6256,7 @@ class VisionRuntimeTestSuite {
|
|
|
6013
6256
|
"OFF" /* OFF */
|
|
6014
6257
|
];
|
|
6015
6258
|
for (const mode of testModes) {
|
|
6016
|
-
|
|
6259
|
+
logger16.info(`[Test] Switching to mode: ${mode}`);
|
|
6017
6260
|
await visionService.setVisionMode(mode);
|
|
6018
6261
|
const currentMode = visionService.getVisionMode();
|
|
6019
6262
|
if (currentMode !== mode) {
|
|
@@ -6021,16 +6264,16 @@ class VisionRuntimeTestSuite {
|
|
|
6021
6264
|
}
|
|
6022
6265
|
}
|
|
6023
6266
|
await visionService.setVisionMode(originalMode);
|
|
6024
|
-
|
|
6267
|
+
logger16.info("[Test] ✅ Vision mode switching test passed");
|
|
6025
6268
|
}
|
|
6026
6269
|
},
|
|
6027
6270
|
{
|
|
6028
|
-
name: "
|
|
6271
|
+
name: "VISION action describe op execution",
|
|
6029
6272
|
fn: async (runtime) => {
|
|
6030
|
-
|
|
6031
|
-
const action = runtime.actions.find((a) => a.name === "
|
|
6273
|
+
logger16.info("[Test] Testing VISION action with op=describe...");
|
|
6274
|
+
const action = runtime.actions.find((a) => a.name === "VISION");
|
|
6032
6275
|
if (!action) {
|
|
6033
|
-
throw new Error("
|
|
6276
|
+
throw new Error("VISION action not found");
|
|
6034
6277
|
}
|
|
6035
6278
|
const message = {
|
|
6036
6279
|
id: `test-msg-${Date.now()}`,
|
|
@@ -6044,27 +6287,27 @@ class VisionRuntimeTestSuite {
|
|
|
6044
6287
|
};
|
|
6045
6288
|
const isValid = await action.validate(runtime, message);
|
|
6046
6289
|
if (!isValid) {
|
|
6047
|
-
throw new Error("
|
|
6290
|
+
throw new Error("VISION action validation failed");
|
|
6048
6291
|
}
|
|
6049
6292
|
let responseReceived = false;
|
|
6050
6293
|
const callback = async (response) => {
|
|
6051
6294
|
if (response.text && response.text.length > 0) {
|
|
6052
6295
|
responseReceived = true;
|
|
6053
|
-
|
|
6296
|
+
logger16.info(`[Test] Action response: ${response.text.substring(0, 100)}...`);
|
|
6054
6297
|
}
|
|
6055
6298
|
return [];
|
|
6056
6299
|
};
|
|
6057
|
-
await action.handler(runtime, message, { values: {}, data: {}, text: "" }, {}, callback);
|
|
6300
|
+
await action.handler(runtime, message, { values: {}, data: {}, text: "" }, { parameters: { op: "describe" } }, callback);
|
|
6058
6301
|
if (!responseReceived) {
|
|
6059
|
-
throw new Error("
|
|
6302
|
+
throw new Error("VISION action with op=describe did not produce a response");
|
|
6060
6303
|
}
|
|
6061
|
-
|
|
6304
|
+
logger16.info("[Test] ✅ VISION action describe op test passed");
|
|
6062
6305
|
}
|
|
6063
6306
|
},
|
|
6064
6307
|
{
|
|
6065
6308
|
name: "Vision provider integration",
|
|
6066
6309
|
fn: async (runtime) => {
|
|
6067
|
-
|
|
6310
|
+
logger16.info("[Test] Testing vision provider...");
|
|
6068
6311
|
const provider = runtime.providers.find((p) => p.name === "visionProvider");
|
|
6069
6312
|
if (!provider) {
|
|
6070
6313
|
throw new Error("Vision provider not found");
|
|
@@ -6086,47 +6329,47 @@ class VisionRuntimeTestSuite {
|
|
|
6086
6329
|
throw new Error("Vision provider returned invalid result");
|
|
6087
6330
|
}
|
|
6088
6331
|
if (result.text?.includes("I can see")) {
|
|
6089
|
-
|
|
6332
|
+
logger16.info(`[Test] Provider text: ${result.text.substring(0, 100)}...`);
|
|
6090
6333
|
}
|
|
6091
|
-
|
|
6334
|
+
logger16.info("[Test] ✅ Vision provider test passed");
|
|
6092
6335
|
}
|
|
6093
6336
|
},
|
|
6094
6337
|
{
|
|
6095
6338
|
name: "Florence-2 model initialization",
|
|
6096
6339
|
fn: async (runtime) => {
|
|
6097
|
-
|
|
6340
|
+
logger16.info("[Test] Testing Florence-2 model...");
|
|
6098
6341
|
const visionService = runtime.getService(VisionServiceType.VISION);
|
|
6099
6342
|
if (!visionService) {
|
|
6100
6343
|
throw new Error("Vision service not found");
|
|
6101
6344
|
}
|
|
6102
6345
|
const florence2Enabled = runtime.getSetting("FLORENCE2_ENABLED") === "true" || runtime.getSetting("VISION_FLORENCE2_ENABLED") === "true";
|
|
6103
6346
|
if (!florence2Enabled) {
|
|
6104
|
-
|
|
6347
|
+
logger16.info("[Test] Florence-2 is disabled, skipping test");
|
|
6105
6348
|
return;
|
|
6106
6349
|
}
|
|
6107
6350
|
const mode = visionService.getVisionMode();
|
|
6108
6351
|
if (mode === "SCREEN" /* SCREEN */ || mode === "BOTH" /* BOTH */) {
|
|
6109
6352
|
const screenCapture = await visionService.getScreenCapture();
|
|
6110
6353
|
if (screenCapture) {
|
|
6111
|
-
|
|
6112
|
-
|
|
6113
|
-
|
|
6354
|
+
logger16.info("[Test] Screen capture available");
|
|
6355
|
+
logger16.info(`[Test] Screen size: ${screenCapture.width}x${screenCapture.height}`);
|
|
6356
|
+
logger16.info(`[Test] Tiles: ${screenCapture.tiles.length}`);
|
|
6114
6357
|
}
|
|
6115
6358
|
}
|
|
6116
|
-
|
|
6359
|
+
logger16.info("[Test] ✅ Florence-2 model test passed");
|
|
6117
6360
|
}
|
|
6118
6361
|
},
|
|
6119
6362
|
{
|
|
6120
6363
|
name: "OCR service functionality",
|
|
6121
6364
|
fn: async (runtime) => {
|
|
6122
|
-
|
|
6365
|
+
logger16.info("[Test] Testing OCR service...");
|
|
6123
6366
|
const visionService = runtime.getService(VisionServiceType.VISION);
|
|
6124
6367
|
if (!visionService) {
|
|
6125
6368
|
throw new Error("Vision service not found");
|
|
6126
6369
|
}
|
|
6127
6370
|
const ocrEnabled = runtime.getSetting("OCR_ENABLED") === "true" || runtime.getSetting("VISION_OCR_ENABLED") === "true";
|
|
6128
6371
|
if (!ocrEnabled) {
|
|
6129
|
-
|
|
6372
|
+
logger16.info("[Test] OCR is disabled, skipping test");
|
|
6130
6373
|
return;
|
|
6131
6374
|
}
|
|
6132
6375
|
const mode = visionService.getVisionMode();
|
|
@@ -6135,18 +6378,18 @@ class VisionRuntimeTestSuite {
|
|
|
6135
6378
|
if (enhancedScene?.screenAnalysis) {
|
|
6136
6379
|
const ocrText = enhancedScene.screenAnalysis.fullScreenOCR;
|
|
6137
6380
|
if (ocrText) {
|
|
6138
|
-
|
|
6139
|
-
|
|
6381
|
+
logger16.info(`[Test] OCR extracted ${ocrText.length} characters`);
|
|
6382
|
+
logger16.info(`[Test] OCR sample: ${ocrText.substring(0, 100)}...`);
|
|
6140
6383
|
}
|
|
6141
6384
|
}
|
|
6142
6385
|
}
|
|
6143
|
-
|
|
6386
|
+
logger16.info("[Test] ✅ OCR service test passed");
|
|
6144
6387
|
}
|
|
6145
6388
|
},
|
|
6146
6389
|
{
|
|
6147
6390
|
name: "Entity tracking system",
|
|
6148
6391
|
fn: async (runtime) => {
|
|
6149
|
-
|
|
6392
|
+
logger16.info("[Test] Testing entity tracking...");
|
|
6150
6393
|
const visionService = runtime.getService(VisionServiceType.VISION);
|
|
6151
6394
|
if (!visionService) {
|
|
6152
6395
|
throw new Error("Vision service not found");
|
|
@@ -6156,14 +6399,14 @@ class VisionRuntimeTestSuite {
|
|
|
6156
6399
|
throw new Error("Entity tracker not found");
|
|
6157
6400
|
}
|
|
6158
6401
|
const entities = entityTracker.getActiveEntities();
|
|
6159
|
-
|
|
6402
|
+
logger16.info(`[Test] Active entities: ${entities.length}`);
|
|
6160
6403
|
for (const entity of entities) {
|
|
6161
6404
|
if (!entity.id || !entity.entityType || !entity.lastSeen) {
|
|
6162
6405
|
throw new Error("Entity missing required fields");
|
|
6163
6406
|
}
|
|
6164
|
-
|
|
6407
|
+
logger16.info(`[Test] Entity ${entity.id}: type=${entity.entityType}, tracked=${entity.lastSeen - entity.firstSeen}ms`);
|
|
6165
6408
|
}
|
|
6166
|
-
|
|
6409
|
+
logger16.info("[Test] ✅ Entity tracking test passed");
|
|
6167
6410
|
}
|
|
6168
6411
|
}
|
|
6169
6412
|
];
|
|
@@ -6176,7 +6419,7 @@ import { promisify as promisify4 } from "node:util";
|
|
|
6176
6419
|
// src/tests/test-pattern-generator.ts
|
|
6177
6420
|
import * as fs5 from "node:fs/promises";
|
|
6178
6421
|
import * as path7 from "node:path";
|
|
6179
|
-
import { logger as
|
|
6422
|
+
import { logger as logger17 } from "@elizaos/core";
|
|
6180
6423
|
import sharp5 from "sharp";
|
|
6181
6424
|
function generateGrid(width, height, spacing = 100) {
|
|
6182
6425
|
const lines = [];
|
|
@@ -6286,7 +6529,7 @@ async function savePattern(buffer, filename) {
|
|
|
6286
6529
|
await fs5.mkdir(outputDir, { recursive: true });
|
|
6287
6530
|
const filepath = path7.join(outputDir, filename);
|
|
6288
6531
|
await fs5.writeFile(filepath, buffer);
|
|
6289
|
-
|
|
6532
|
+
logger17.info(`[TestPatternGenerator] Saved test pattern to ${filepath}`);
|
|
6290
6533
|
return filepath;
|
|
6291
6534
|
}
|
|
6292
6535
|
function verifyQuadrantNumbers(ocrText) {
|
|
@@ -6356,7 +6599,7 @@ class VisionWorkerE2ETestSuite {
|
|
|
6356
6599
|
frameCount++;
|
|
6357
6600
|
lastTimestamp = scene.timestamp;
|
|
6358
6601
|
}
|
|
6359
|
-
await new Promise((
|
|
6602
|
+
await new Promise((resolve2) => setImmediate(resolve2));
|
|
6360
6603
|
}
|
|
6361
6604
|
const totalTime = (Date.now() - startTime) / 1000;
|
|
6362
6605
|
const avgFPS = frameCount / totalTime;
|
|
@@ -6385,7 +6628,7 @@ class VisionWorkerE2ETestSuite {
|
|
|
6385
6628
|
await displayTestPattern(patternPath);
|
|
6386
6629
|
try {
|
|
6387
6630
|
console.log("Waiting for OCR processing...");
|
|
6388
|
-
await new Promise((
|
|
6631
|
+
await new Promise((resolve2) => setTimeout(resolve2, 3000));
|
|
6389
6632
|
const scene = await visionService.getEnhancedSceneDescription();
|
|
6390
6633
|
const ocrText = scene?.screenAnalysis?.fullScreenOCR || "";
|
|
6391
6634
|
console.log(`OCR detected text: "${ocrText.substring(0, 100)}..."`);
|
|
@@ -6424,7 +6667,7 @@ class VisionWorkerE2ETestSuite {
|
|
|
6424
6667
|
if (scene?.screenCapture) {
|
|
6425
6668
|
console.log(` Screen: ${scene.screenCapture.width}x${scene.screenCapture.height}`);
|
|
6426
6669
|
}
|
|
6427
|
-
await new Promise((
|
|
6670
|
+
await new Promise((resolve2) => setTimeout(resolve2, 500));
|
|
6428
6671
|
}
|
|
6429
6672
|
console.log(`✓ Monitored ${displayCount} displays`);
|
|
6430
6673
|
}
|
|
@@ -6469,7 +6712,7 @@ Current stats:`);
|
|
|
6469
6712
|
console.log(` OCR detections: ${stats.ocrDetections}`);
|
|
6470
6713
|
console.log(` Florence-2 detections: ${stats.florence2Detections}`);
|
|
6471
6714
|
}
|
|
6472
|
-
await new Promise((
|
|
6715
|
+
await new Promise((resolve2) => setTimeout(resolve2, 100));
|
|
6473
6716
|
}
|
|
6474
6717
|
const totalTime = (Date.now() - startTime) / 1000;
|
|
6475
6718
|
console.log(`
|
|
@@ -6520,7 +6763,7 @@ async function displayTestPattern(imagePath) {
|
|
|
6520
6763
|
} else if (platform === "win32") {
|
|
6521
6764
|
await execAsync4(`start "" "${imagePath}"`);
|
|
6522
6765
|
}
|
|
6523
|
-
await new Promise((
|
|
6766
|
+
await new Promise((resolve2) => setTimeout(resolve2, 1000));
|
|
6524
6767
|
} catch (error) {
|
|
6525
6768
|
console.warn("Could not display test pattern:", error);
|
|
6526
6769
|
}
|
|
@@ -6553,15 +6796,19 @@ var visionPlugin = {
|
|
|
6553
6796
|
description: "Provides visual perception through camera integration and scene analysis",
|
|
6554
6797
|
services: [VisionService],
|
|
6555
6798
|
providers: [visionProvider],
|
|
6556
|
-
actions: [
|
|
6557
|
-
describeSceneAction,
|
|
6558
|
-
captureImageAction,
|
|
6559
|
-
setVisionModeAction,
|
|
6560
|
-
nameEntityAction,
|
|
6561
|
-
identifyPersonAction,
|
|
6562
|
-
trackEntityAction
|
|
6563
|
-
],
|
|
6799
|
+
actions: [...promoteSubactionsToActions(visionAction)],
|
|
6564
6800
|
tests: testSuites,
|
|
6801
|
+
autoEnable: {
|
|
6802
|
+
shouldEnable: (_env, config) => {
|
|
6803
|
+
const f = config?.features?.vision;
|
|
6804
|
+
const featureOn = f === true || typeof f === "object" && f !== null && f.enabled !== false;
|
|
6805
|
+
if (featureOn)
|
|
6806
|
+
return true;
|
|
6807
|
+
const media = config?.media;
|
|
6808
|
+
const visionMedia = media?.vision;
|
|
6809
|
+
return Boolean(visionMedia && visionMedia.enabled !== false && typeof visionMedia.provider === "string" && visionMedia.provider.length > 0);
|
|
6810
|
+
}
|
|
6811
|
+
},
|
|
6565
6812
|
init: async (_config, _runtime) => {}
|
|
6566
6813
|
};
|
|
6567
6814
|
var src_default = visionPlugin;
|
|
@@ -6570,4 +6817,4 @@ export {
|
|
|
6570
6817
|
src_default as default
|
|
6571
6818
|
};
|
|
6572
6819
|
|
|
6573
|
-
//# debugId=
|
|
6820
|
+
//# debugId=177A2977D034662964756E2164756E21
|