@elizaos/plugin-vision 2.0.0-alpha.8 → 2.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,340 @@
1
+ # ElizaOS Vision Plugin
2
+
3
+ A powerful visual perception plugin for ElizaOS that provides agents with
4
+ real-time camera integration and scene analysis capabilities. This plugin
5
+ enables agents to "see" their environment, describe scenes, detect people and
6
+ objects, and make decisions based on visual input.
7
+
8
+ ## Features
9
+
10
+ ### Phase 1 (Implemented)
11
+
12
+ - ✅ Camera detection and connection (platform-specific)
13
+ - ✅ Real-time frame capture and processing
14
+ - ✅ Scene description using Vision Language Models (VLM)
15
+ - ✅ Motion-based object detection
16
+ - ✅ Basic person detection with pose estimation
17
+ - ✅ Configurable pixel change threshold
18
+ - ✅ Image capture action with base64 attachments
19
+ - ✅ Non-dynamic vision provider (always active)
20
+ - ✅ Integration with autonomy plugin (kill switch)
21
+
22
+ ### Phase 2 (Implemented)
23
+
24
+ - ✅ Enhanced object detection with COCO-like classification
25
+ - ✅ Advanced pose detection with keypoint estimation
26
+ - ✅ Improved person detection and tracking
27
+ - ✅ Object classification (person, monitor, chair, keyboard, furniture, etc.)
28
+ - ✅ Configurable computer vision models
29
+ - ✅ Fallback to motion detection when CV is disabled
30
+
31
+ ### Phase 3 (Implemented)
32
+
33
+ - ✅ Real-time object tracking with IDs
34
+ - ✅ Face detection and recognition
35
+ - ✅ Screen capture and OCR integration
36
+ - ✅ Entity tracking with persistent IDs
37
+ - ✅ Multi-display support
38
+ - ✅ Circuit breaker pattern for error resilience
39
+ - ✅ Florence2 model integration for advanced scene understanding
40
+ - ✅ Worker-based processing for high-FPS operations
41
+
42
+ ### Phase 4 (Planned)
43
+
44
+ - 🔄 WebAssembly (WASM) integration for browser compatibility
45
+ - 🔄 Gesture recognition
46
+ - 🔄 Emotion detection
47
+ - 🔄 Advanced scene understanding and spatial relationships
48
+
49
+ ## Installation
50
+
51
+ ### TypeScript (Primary)
52
+
53
+ ```bash
54
+ npm install @elizaos/plugin-vision
55
+ # or build from a local source checkout
56
+ cd plugins/plugin-vision
57
+ bun install
58
+ bun run build
59
+ ```
60
+ ### Camera Tools Required
61
+
62
+ The plugin requires platform-specific camera tools:
63
+
64
+ - **macOS**: `brew install imagesnap`
65
+ - **Linux**: `sudo apt-get install fswebcam`
66
+ - **Windows**: Install ffmpeg and add to PATH
67
+
68
+ ## Configuration
69
+
70
+ ### Environment Variables
71
+
72
+ ```env
73
+ # Camera selection (partial name match, case-insensitive)
74
+ CAMERA_NAME=obsbot
75
+
76
+ # Pixel change threshold (percentage, default: 50)
77
+ PIXEL_CHANGE_THRESHOLD=30
78
+
79
+ # Enable advanced computer vision features (default: false)
80
+ ENABLE_OBJECT_DETECTION=true
81
+ ENABLE_POSE_DETECTION=true
82
+ ENABLE_FACE_RECOGNITION=false
83
+
84
+ # Vision mode: OFF, CAMERA, SCREEN, BOTH
85
+ VISION_MODE=CAMERA
86
+
87
+ # Update intervals (milliseconds)
88
+ TF_UPDATE_INTERVAL=1000
89
+ VLM_UPDATE_INTERVAL=10000
90
+
91
+ # Screen capture settings
92
+ SCREEN_CAPTURE_INTERVAL=2000
93
+ OCR_ENABLED=true
94
+ ```
95
+
96
+ ### Character Configuration
97
+
98
+ ```json
99
+ {
100
+ "name": "VisionAgent",
101
+ "plugins": ["@elizaos/plugin-vision"],
102
+ "settings": {
103
+ "CAMERA_NAME": "obsbot",
104
+ "PIXEL_CHANGE_THRESHOLD": "30",
105
+ "ENABLE_OBJECT_DETECTION": "true",
106
+ "ENABLE_POSE_DETECTION": "true"
107
+ }
108
+ }
109
+ ```
110
+
111
+ ## Actions
112
+
113
+ ### DESCRIBE_SCENE
114
+
115
+ Analyzes the current visual scene and provides a detailed description.
116
+
117
+ **Similes**: `ANALYZE_SCENE`, `WHAT_DO_YOU_SEE`, `VISION_CHECK`, `LOOK_AROUND`
118
+
119
+ **Example**:
120
+
121
+ ```
122
+ User: "What do you see?"
123
+ Agent: "Looking through the camera, I see a home office setup with a person sitting at a desk. There are 2 monitors, a keyboard, and various desk accessories. I detected 5 objects total: 1 person, 2 monitors, 1 keyboard, and 1 chair."
124
+ ```
125
+
126
+ ### CAPTURE_IMAGE
127
+
128
+ Captures the current frame and returns it as a base64 image attachment.
129
+
130
+ **Similes**: `TAKE_PHOTO`, `SCREENSHOT`, `CAPTURE_FRAME`, `TAKE_PICTURE`
131
+
132
+ **Example**:
133
+
134
+ ```
135
+ User: "Take a photo"
136
+ Agent: "I've captured an image from the camera." [Image attached]
137
+ ```
138
+
139
+ ### SET_VISION_MODE
140
+
141
+ Changes the vision mode (OFF, CAMERA, SCREEN, or BOTH).
142
+
143
+ **Similes**: `CHANGE_VISION_MODE`, `SET_VISION`, `TOGGLE_VISION`
144
+
145
+ ### NAME_ENTITY
146
+
147
+ Assigns a name to a detected entity for tracking.
148
+
149
+ **Similes**: `LABEL_ENTITY`, `NAME_OBJECT`, `IDENTIFY_ENTITY`
150
+
151
+ ### IDENTIFY_PERSON
152
+
153
+ Identifies a person using face recognition (requires `ENABLE_FACE_RECOGNITION=true`).
154
+
155
+ **Similes**: `RECOGNIZE_PERSON`, `IDENTIFY_FACE`
156
+
157
+ ### TRACK_ENTITY
158
+
159
+ Starts tracking an entity with a persistent ID.
160
+
161
+ **Similes**: `START_TRACKING`, `FOLLOW_ENTITY`
162
+
163
+ ### KILL_AUTONOMOUS
164
+
165
+ Stops the autonomous agent loop (useful for debugging with the autonomy plugin).
166
+
167
+ **Similes**: `STOP_AUTONOMOUS`, `HALT_AUTONOMOUS`, `KILL_AUTO_LOOP`
168
+
169
+ ## Vision Provider
170
+
171
+ The vision provider is **non-dynamic** (always active) and provides:
172
+
173
+ - Current scene description
174
+ - Camera connection status
175
+ - Detected objects count and types
176
+ - Detected people count with poses
177
+ - Scene change percentage
178
+ - Time since last update
179
+
180
+ ### Provider Data Structure
181
+
182
+ ```typescript
183
+ {
184
+ visionAvailable: boolean,
185
+ sceneDescription: string,
186
+ cameraStatus: string,
187
+ cameraId?: string,
188
+ peopleCount?: number,
189
+ objectCount?: number,
190
+ sceneAge?: number,
191
+ lastChange?: number
192
+ }
193
+ ```
194
+
195
+ ## Detection Modes
196
+
197
+ ### Motion-Based Detection (Default)
198
+
199
+ - Lightweight and fast
200
+ - Detects movement between frames
201
+ - Groups motion blocks into objects
202
+ - Basic size-based classification
203
+
204
+ ### Advanced Computer Vision (Optional)
205
+
206
+ Enable with `ENABLE_OBJECT_DETECTION=true` and/or `ENABLE_POSE_DETECTION=true`
207
+
208
+ - **Object Detection**: Enhanced object recognition with COCO-like classes
209
+ - **Pose Detection**: 17-keypoint pose estimation
210
+ - **Better Classification**: Distinguishes between person, monitor, chair,
211
+ keyboard, etc.
212
+ - **Higher Accuracy**: Edge detection and color variance analysis
213
+
214
+ ## Integration with Autonomy
215
+
216
+ - Continuous environmental monitoring
217
+ - Autonomous responses to visual changes
218
+ - Visual memory persistence
219
+ - Scene-based decision making
220
+
221
+ Example autonomous behavior:
222
+
223
+ ```typescript
224
+ // Agent autonomously monitors environment
225
+ "I notice someone just entered the room.";
226
+ "The lighting has changed significantly.";
227
+ "A new object has appeared on the desk.";
228
+ ```
229
+
230
+ ## Performance Considerations
231
+
232
+ - Frame processing runs every 100ms by default
233
+ - VLM is only called when pixel change exceeds threshold
234
+ - Motion detection uses 64x64 pixel blocks with 50% overlap
235
+ - Advanced CV models add ~50-100ms processing time per frame
236
+ - Memory usage increases with resolution (1280x720 recommended)
237
+
238
+ ## Security & Privacy
239
+
240
+ - Camera access requires system permissions
241
+ - No images are stored permanently by default
242
+ - All processing happens locally
243
+ - Base64 images in messages are ephemeral
244
+ - Consider privacy implications in your implementation
245
+
246
+ ## Architecture
247
+
248
+ ```
249
+ plugin-vision/
250
+ ├── README.md # This file
251
+ ├── package.json # TypeScript package config
252
+ ├── src/ # TypeScript implementation (primary)
253
+ │ ├── index.ts # Plugin entry point
254
+ │ ├── service.ts # Vision service
255
+ │ ├── provider.ts # Vision provider
256
+ │ ├── action.ts # All actions
257
+ │ ├── entity-tracker.ts # Entity tracking
258
+ │ ├── screen-capture.ts # Screen capture
259
+ │ ├── ocr-service.ts # OCR service
260
+ │ ├── face-recognition.ts # Face recognition
261
+ │ ├── florence2-model.ts # Florence2 model integration
262
+ │ ├── vision-worker-manager.ts # Worker management
263
+ │ └── tests/ # E2E tests
264
+ ```
265
+
266
+ ## Development
267
+
268
+ ### Running Tests
269
+
270
+ ```bash
271
+ # Run E2E tests
272
+ cd plugins/plugin-vision
273
+ npx vitest
274
+
275
+ # Run local E2E tests
276
+ bun run test:e2e:local
277
+ ```
278
+
279
+ ### Test Coverage
280
+
281
+ - Service initialization
282
+ - Camera detection and connection
283
+ - Scene description generation
284
+ - Object and person detection
285
+ - Image capture
286
+ - Provider integration
287
+ - Autonomy integration
288
+
289
+ ## Troubleshooting
290
+
291
+ ### No Camera Detected
292
+
293
+ 1. Ensure camera tools are installed (imagesnap/fswebcam/ffmpeg)
294
+ 2. Check camera permissions in system settings
295
+ 3. Try unsetting `CAMERA_NAME` so the default camera is used
296
+ 4. Verify camera is not in use by another application
297
+
298
+ ### Poor Object Detection
299
+
300
+ 1. Ensure good lighting conditions
301
+ 2. Adjust PIXEL_CHANGE_THRESHOLD (lower = more sensitive)
302
+ 3. Enable advanced CV with ENABLE_OBJECT_DETECTION=true
303
+ 4. Check camera resolution (higher is better for detection)
304
+
305
+ ### High CPU Usage
306
+
307
+ 1. Increase frame processing interval in code
308
+ 2. Disable advanced CV features if not needed
309
+ 3. Reduce camera resolution
310
+ 4. Increase pixel change threshold
311
+
312
+ ## Future Roadmap
313
+
314
+ ### Phase 4: WebAssembly Integration
315
+
316
+ - TensorFlow.js WASM backend
317
+ - Browser-compatible vision processing
318
+ - Real-time object tracking
319
+ - Face detection and recognition
320
+
321
+ ### Phase 4: Advanced Features
322
+
323
+ - Gesture recognition
324
+ - Emotion detection
325
+ - Scene understanding
326
+ - Spatial relationship mapping
327
+ - Multi-camera support
328
+
329
+ ## Contributing
330
+
331
+ Contributions are welcome! Please see the main ElizaOS repository for
332
+ contribution guidelines.
333
+
334
+ ## License
335
+
336
+ MIT
337
+
338
+ ## Support
339
+
340
+ For issues and feature requests, please use the GitHub issue tracker.
package/auto-enable.ts ADDED
@@ -0,0 +1,29 @@
1
+ // Auto-enable check for @elizaos/plugin-vision.
2
+ //
3
+ // Plugin manifest entry-point — referenced by package.json's
4
+ // `elizaos.plugin.autoEnableModule`. Keep this module light: env reads only,
5
+ // no service init, no transitive imports of the full plugin runtime. The
6
+ // auto-enable engine loads dozens of these per boot.
7
+ import type { PluginAutoEnableContext } from "@elizaos/core";
8
+
9
+ function isFeatureEnabled( // Reports whether `config.features[key]` switches a feature on.
10
+ config: PluginAutoEnableContext["config"],
11
+ key: string,
12
+ ): boolean {
13
+ const f = (config?.features as Record<string, unknown> | undefined)?.[key];
14
+ if (f === true) return true; // boolean form: only a literal `true` enables
15
+ if (f && typeof f === "object" && f !== null) { // object form (the leading `f &&` already rules out null)
16
+ return (f as Record<string, unknown>).enabled !== false; // on unless `enabled` is explicitly `false`
17
+ }
18
+ return false; // missing, `false`, or any other primitive leaves the feature off
19
+ }
20
+
21
+ /**
22
+ * Enable when `config.features.vision` is `true` or an object whose `enabled` is not `false`, or when the user has
23
+ * explicitly chosen a vision provider (any non-empty string) via `config.media.vision.provider`.
24
+ */
25
+ export function shouldEnable(ctx: PluginAutoEnableContext): boolean {
26
+ if (isFeatureEnabled(ctx.config, "vision")) return true; // feature flag is checked first; provider is then not consulted
27
+ const visionProvider = ctx.config?.media?.vision?.provider;
28
+ return typeof visionProvider === "string" && visionProvider.length > 0; // empty or non-string provider does not enable
29
+ }
package/build.config.ts CHANGED
@@ -34,7 +34,6 @@ export const buildConfig: BuildConfig = {
34
34
  "@tensorflow-models/posenet",
35
35
  "@tensorflow/tfjs-node",
36
36
  "@tensorflow-models/mobilenet",
37
- "axios",
38
37
  "canvas",
39
38
  "face-api.js",
40
39
  "sharp",
@@ -56,6 +55,26 @@ export const workersConfig: BuildConfig = {
56
55
  splitting: false,
57
56
  sourcemap: true,
58
57
  external: [
58
+ "fs",
59
+ "path",
60
+ "http",
61
+ "https",
62
+ "crypto",
63
+ "node:fs",
64
+ "node:path",
65
+ "node:http",
66
+ "node:https",
67
+ "node:crypto",
68
+ "node:stream",
69
+ "node:buffer",
70
+ "node:util",
71
+ "node:events",
72
+ "node:url",
73
+ "vitest",
74
+ "dotenv",
75
+ "zod",
76
+ "@elizaos/core",
77
+ "@elizaos/plugin-message-handling",
59
78
  "sharp",
60
79
  "@tensorflow/tfjs-node",
61
80
  "@tensorflow-models/mobilenet",