@elizaos/plugin-vision 1.2.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/.npmignore +5 -0
  2. package/README.md +270 -0
  3. package/build.config.ts +70 -0
  4. package/dist/action.d.ts +8 -0
  5. package/dist/action.js +1212 -0
  6. package/dist/action.js.map +1 -0
  7. package/dist/audio-capture-stream.d.ts +42 -0
  8. package/dist/audio-capture-stream.js +516 -0
  9. package/dist/audio-capture-stream.js.map +1 -0
  10. package/dist/audio-capture.d.ts +25 -0
  11. package/dist/audio-capture.js +412 -0
  12. package/dist/audio-capture.js.map +1 -0
  13. package/dist/basic.test.d.ts +1 -0
  14. package/dist/basic.test.js +97 -0
  15. package/dist/basic.test.js.map +1 -0
  16. package/dist/config.d.ts +73 -0
  17. package/dist/config.js +254 -0
  18. package/dist/config.js.map +1 -0
  19. package/dist/entity-tracker.d.ts +32 -0
  20. package/dist/entity-tracker.js +361 -0
  21. package/dist/entity-tracker.js.map +1 -0
  22. package/dist/errors.d.ts +67 -0
  23. package/dist/errors.js +395 -0
  24. package/dist/errors.js.map +1 -0
  25. package/dist/face-recognition.d.ts +31 -0
  26. package/dist/face-recognition.js +332 -0
  27. package/dist/face-recognition.js.map +1 -0
  28. package/dist/florence2-local.d.ts +25 -0
  29. package/dist/florence2-local.js +280 -0
  30. package/dist/florence2-local.js.map +1 -0
  31. package/dist/florence2-model.d.ts +36 -0
  32. package/dist/florence2-model.js +503 -0
  33. package/dist/florence2-model.js.map +1 -0
  34. package/dist/index.d.ts +3 -0
  35. package/dist/index.js +73 -0
  36. package/dist/index.js.map +1 -0
  37. package/dist/ocr-service-real.d.ts +32 -0
  38. package/dist/ocr-service-real.js +396 -0
  39. package/dist/ocr-service-real.js.map +1 -0
  40. package/dist/ocr-service.d.ts +28 -0
  41. package/dist/ocr-service.js +216 -0
  42. package/dist/ocr-service.js.map +1 -0
  43. package/dist/provider.d.ts +2 -0
  44. package/dist/provider.js +285 -0
  45. package/dist/provider.js.map +1 -0
  46. package/dist/screen-capture.d.ts +16 -0
  47. package/dist/screen-capture.js +302 -0
  48. package/dist/screen-capture.js.map +1 -0
  49. package/dist/service.d.ts +73 -0
  50. package/dist/service.js +1662 -0
  51. package/dist/service.js.map +1 -0
  52. package/dist/tests/e2e/index.d.ts +8 -0
  53. package/dist/tests/e2e/index.js +33 -0
  54. package/dist/tests/e2e/index.js.map +1 -0
  55. package/dist/tests/e2e/run-local.d.ts +2 -0
  56. package/dist/tests/e2e/run-local.js +166 -0
  57. package/dist/tests/e2e/run-local.js.map +1 -0
  58. package/dist/tests/e2e/screen-vision.d.ts +11 -0
  59. package/dist/tests/e2e/screen-vision.js +384 -0
  60. package/dist/tests/e2e/screen-vision.js.map +1 -0
  61. package/dist/tests/e2e/vision-autonomy.d.ts +11 -0
  62. package/dist/tests/e2e/vision-autonomy.js +375 -0
  63. package/dist/tests/e2e/vision-autonomy.js.map +1 -0
  64. package/dist/tests/e2e/vision-basic.d.ts +11 -0
  65. package/dist/tests/e2e/vision-basic.js +434 -0
  66. package/dist/tests/e2e/vision-basic.js.map +1 -0
  67. package/dist/tests/e2e/vision-capture-log.d.ts +11 -0
  68. package/dist/tests/e2e/vision-capture-log.js +302 -0
  69. package/dist/tests/e2e/vision-capture-log.js.map +1 -0
  70. package/dist/tests/e2e/vision-runtime.d.ts +11 -0
  71. package/dist/tests/e2e/vision-runtime.js +357 -0
  72. package/dist/tests/e2e/vision-runtime.js.map +1 -0
  73. package/dist/tests/e2e/vision-worker-tests.d.ts +11 -0
  74. package/dist/tests/e2e/vision-worker-tests.js +466 -0
  75. package/dist/tests/e2e/vision-worker-tests.js.map +1 -0
  76. package/dist/tests/test-pattern-generator.d.ts +40 -0
  77. package/dist/tests/test-pattern-generator.js +191 -0
  78. package/dist/tests/test-pattern-generator.js.map +1 -0
  79. package/dist/tests.d.ts +3 -0
  80. package/dist/tests.js +11 -0
  81. package/dist/tests.js.map +1 -0
  82. package/dist/types.d.ts +222 -0
  83. package/dist/types.js +16 -0
  84. package/dist/types.js.map +1 -0
  85. package/dist/vision-models.d.ts +47 -0
  86. package/dist/vision-models.js +501 -0
  87. package/dist/vision-models.js.map +1 -0
  88. package/dist/vision-worker-manager.d.ts +61 -0
  89. package/dist/vision-worker-manager.js +668 -0
  90. package/dist/vision-worker-manager.js.map +1 -0
  91. package/dist/workers/florence2-worker-simple.d.ts +13 -0
  92. package/dist/workers/florence2-worker-simple.js +121 -0
  93. package/dist/workers/florence2-worker-simple.js.map +1 -0
  94. package/dist/workers/florence2-worker.d.ts +1 -0
  95. package/dist/workers/florence2-worker.js +328 -0
  96. package/dist/workers/florence2-worker.js.map +1 -0
  97. package/dist/workers/ocr-worker.d.ts +1 -0
  98. package/dist/workers/ocr-worker.js +354 -0
  99. package/dist/workers/ocr-worker.js.map +1 -0
  100. package/dist/workers/screen-capture-worker.d.ts +1 -0
  101. package/dist/workers/screen-capture-worker.js +427 -0
  102. package/dist/workers/screen-capture-worker.js.map +1 -0
  103. package/dist/workers/worker-logger.d.ts +9 -0
  104. package/dist/workers/worker-logger.js +95 -0
  105. package/dist/workers/worker-logger.js.map +1 -0
  106. package/package.json +100 -0
package/.npmignore ADDED
@@ -0,0 +1,5 @@
1
+ .turbo
2
+ node_modules
3
+ .env
4
+ *.env
5
+ .env.local
package/README.md ADDED
@@ -0,0 +1,270 @@
1
+ # ElizaOS Vision Plugin
2
+
3
+ A powerful visual perception plugin for ElizaOS that provides agents with
4
+ real-time camera integration and scene analysis capabilities. This plugin
5
+ enables agents to "see" their environment, describe scenes, detect people and
6
+ objects, and make decisions based on visual input.
7
+
8
+ ## Features
9
+
10
+ ### Phase 1 (Implemented)
11
+
12
+ - ✅ Camera detection and connection (platform-specific)
13
+ - ✅ Real-time frame capture and processing
14
+ - ✅ Scene description using Vision Language Models (VLM)
15
+ - ✅ Motion-based object detection
16
+ - ✅ Basic person detection with pose estimation
17
+ - ✅ Configurable pixel change threshold
18
+ - ✅ Image capture action with base64 attachments
19
+ - ✅ Non-dynamic vision provider (always active)
20
+ - ✅ Integration with autonomy plugin (kill switch)
21
+
22
+ ### Phase 2 (Implemented)
23
+
24
+ - ✅ Enhanced object detection with COCO-like classification
25
+ - ✅ Advanced pose detection with keypoint estimation
26
+ - ✅ Improved person detection and tracking
27
+ - ✅ Object classification (person, monitor, chair, keyboard, furniture, etc.)
28
+ - ✅ Configurable computer vision models
29
+ - ✅ Fallback to motion detection when CV is disabled
30
+
31
+ ### Phase 3 (Planned)
32
+
33
+ - 🔄 WebAssembly (WASM) integration for browser compatibility
34
+ - 🔄 Real-time object tracking with IDs
35
+ - 🔄 Face detection and recognition
36
+ - 🔄 Gesture recognition
37
+ - 🔄 Scene understanding and spatial relationships
38
+
39
+ ## Installation
40
+
41
+ ```bash
42
+ npm install @elizaos/plugin-vision
43
+ ```
44
+
45
+ ### Camera Tools Required
46
+
47
+ The plugin requires platform-specific camera tools:
48
+
49
+ - **macOS**: `brew install imagesnap`
50
+ - **Linux**: `sudo apt-get install fswebcam`
51
+ - **Windows**: Install ffmpeg and add to PATH
52
+
53
+ ## Configuration
54
+
55
+ ### Environment Variables
56
+
57
+ ```env
58
+ # Camera selection (partial name match, case-insensitive)
59
+ CAMERA_NAME=obsbot
60
+
61
+ # Pixel change threshold (percentage, default: 50)
62
+ PIXEL_CHANGE_THRESHOLD=30
63
+
64
+ # Enable advanced computer vision features (default: false)
65
+ ENABLE_OBJECT_DETECTION=true
66
+ ENABLE_POSE_DETECTION=true
67
+ ```
68
+
69
+ ### Character Configuration
70
+
71
+ ```json
72
+ {
73
+ "name": "VisionAgent",
74
+ "plugins": ["@elizaos/plugin-vision"],
75
+ "settings": {
76
+ "CAMERA_NAME": "obsbot",
77
+ "PIXEL_CHANGE_THRESHOLD": "30",
78
+ "ENABLE_OBJECT_DETECTION": "true",
79
+ "ENABLE_POSE_DETECTION": "true"
80
+ }
81
+ }
82
+ ```
83
+
84
+ ## Actions
85
+
86
+ ### DESCRIBE_SCENE
87
+
88
+ Analyzes the current visual scene and provides a detailed description.
89
+
90
+ **Similes**: `ANALYZE_SCENE`, `WHAT_DO_YOU_SEE`, `VISION_CHECK`, `LOOK_AROUND`
91
+
92
+ **Example**:
93
+
94
+ ```
95
+ User: "What do you see?"
96
+ Agent: "Looking through the camera, I see a home office setup with a person sitting at a desk. There are 2 monitors, a keyboard, and various desk accessories. I detected 5 objects total: 1 person, 2 monitors, 1 keyboard, and 1 chair."
97
+ ```
98
+
99
+ ### CAPTURE_IMAGE
100
+
101
+ Captures the current frame and returns it as a base64 image attachment.
102
+
103
+ **Similes**: `TAKE_PHOTO`, `SCREENSHOT`, `CAPTURE_FRAME`, `TAKE_PICTURE`
104
+
105
+ **Example**:
106
+
107
+ ```
108
+ User: "Take a photo"
109
+ Agent: "I've captured an image from the camera." [Image attached]
110
+ ```
111
+
112
+ ### KILL_AUTONOMOUS
113
+
114
+ Stops the autonomous agent loop (useful for debugging with autonomy plugin).
115
+
116
+ **Similes**: `STOP_AUTONOMOUS`, `HALT_AUTONOMOUS`, `KILL_AUTO_LOOP`
117
+
118
+ ## Vision Provider
119
+
120
+ The vision provider is **non-dynamic** (always active) and provides:
121
+
122
+ - Current scene description
123
+ - Camera connection status
124
+ - Detected objects count and types
125
+ - Detected people count with poses
126
+ - Scene change percentage
127
+ - Time since last update
128
+
129
+ ### Provider Data Structure
130
+
131
+ ```typescript
132
+ {
133
+ visionAvailable: boolean,
134
+ sceneDescription: string,
135
+ cameraStatus: string,
136
+ cameraId?: string,
137
+ peopleCount?: number,
138
+ objectCount?: number,
139
+ sceneAge?: number,
140
+ lastChange?: number
141
+ }
142
+ ```
143
+
144
+ ## Detection Modes
145
+
146
+ ### Motion-Based Detection (Default)
147
+
148
+ - Lightweight and fast
149
+ - Detects movement between frames
150
+ - Groups motion blocks into objects
151
+ - Basic size-based classification
152
+
153
+ ### Advanced Computer Vision (Optional)
154
+
155
+ Enable with `ENABLE_OBJECT_DETECTION=true` and/or `ENABLE_POSE_DETECTION=true`
156
+
157
+ - **Object Detection**: Enhanced object recognition with COCO-like classes
158
+ - **Pose Detection**: 17-keypoint pose estimation
159
+ - **Better Classification**: Distinguishes between person, monitor, chair,
160
+ keyboard, etc.
161
+ - **Higher Accuracy**: Edge detection and color variance analysis
162
+
163
+ ## Integration with Autonomy
164
+
165
+ When used with `@elizaos/plugin-autonomy`, the vision plugin enables:
166
+
167
+ - Continuous environmental monitoring
168
+ - Autonomous responses to visual changes
169
+ - Visual memory persistence
170
+ - Scene-based decision making
171
+
172
+ Example autonomous behavior:
173
+
174
+ ```typescript
175
+ // Agent autonomously monitors environment
176
+ 'I notice someone just entered the room.';
177
+ 'The lighting has changed significantly.';
178
+ 'A new object has appeared on the desk.';
179
+ ```
180
+
181
+ ## Performance Considerations
182
+
183
+ - Frame processing runs every 100ms by default
184
+ - VLM is only called when pixel change exceeds threshold
185
+ - Motion detection uses 64x64 pixel blocks with 50% overlap
186
+ - Advanced CV models add ~50-100ms processing time per frame
187
+ - Memory usage increases with resolution (1280x720 recommended)
188
+
189
+ ## Security & Privacy
190
+
191
+ - Camera access requires system permissions
192
+ - No images are stored permanently by default
193
+ - All processing happens locally
194
+ - Base64 images in messages are ephemeral
195
+ - Consider privacy implications in your implementation
196
+
197
+ ## Development
198
+
199
+ ### Running Tests
200
+
201
+ ```bash
202
+ # Run E2E tests
203
+ npm test
204
+
205
+ # Run local E2E tests
206
+ npm run test:e2e:local
207
+ ```
208
+
209
+ ### Test Coverage
210
+
211
+ - Service initialization
212
+ - Camera detection and connection
213
+ - Scene description generation
214
+ - Object and person detection
215
+ - Image capture
216
+ - Provider integration
217
+ - Autonomy integration
218
+
219
+ ## Troubleshooting
220
+
221
+ ### No Camera Detected
222
+
223
+ 1. Ensure camera tools are installed (imagesnap/fswebcam/ffmpeg)
224
+ 2. Check camera permissions in system settings
225
+ 3. Try without CAMERA_NAME to use default camera
226
+ 4. Verify camera is not in use by another application
227
+
228
+ ### Poor Object Detection
229
+
230
+ 1. Ensure good lighting conditions
231
+ 2. Adjust PIXEL_CHANGE_THRESHOLD (lower = more sensitive)
232
+ 3. Enable advanced CV with ENABLE_OBJECT_DETECTION=true
233
+ 4. Check camera resolution (higher is better for detection)
234
+
235
+ ### High CPU Usage
236
+
237
+ 1. Increase frame processing interval in code
238
+ 2. Disable advanced CV features if not needed
239
+ 3. Reduce camera resolution
240
+ 4. Increase pixel change threshold
241
+
242
+ ## Future Roadmap
243
+
244
+ ### Phase 3: WebAssembly Integration
245
+
246
+ - TensorFlow.js WASM backend
247
+ - Browser-compatible vision processing
248
+ - Real-time object tracking
249
+ - Face detection and recognition
250
+
251
+ ### Phase 4: Advanced Features
252
+
253
+ - Gesture recognition
254
+ - Emotion detection
255
+ - Scene understanding
256
+ - Spatial relationship mapping
257
+ - Multi-camera support
258
+
259
+ ## Contributing
260
+
261
+ Contributions are welcome! Please see the main ElizaOS repository for
262
+ contribution guidelines.
263
+
264
+ ## License
265
+
266
+ MIT
267
+
268
+ ## Support
269
+
270
+ For issues and feature requests, please use the GitHub issue tracker.
package/build.config.ts ADDED
@@ -0,0 +1,70 @@
1
+ import type { BuildConfig } from 'bun';
2
+
3
+ // Main build configuration
4
+ export const buildConfig: BuildConfig = {
5
+ entrypoints: ['./src/index.ts'],
6
+ outdir: './dist',
7
+ target: 'node',
8
+ format: 'esm',
9
+ splitting: false,
10
+ sourcemap: 'external',
11
+ external: [
12
+ 'fs',
13
+ 'path',
14
+ 'http',
15
+ 'https',
16
+ 'crypto',
17
+ 'node:fs',
18
+ 'node:path',
19
+ 'node:http',
20
+ 'node:https',
21
+ 'node:crypto',
22
+ 'node:stream',
23
+ 'node:buffer',
24
+ 'node:util',
25
+ 'node:events',
26
+ 'node:url',
27
+ 'bun:test',
28
+ 'dotenv',
29
+ 'zod',
30
+ '@elizaos/core',
31
+ '@elizaos/plugin-message-handling',
32
+ '@tensorflow-models/coco-ssd',
33
+ '@tensorflow-models/pose-detection',
34
+ '@tensorflow-models/posenet',
35
+ '@tensorflow/tfjs-node',
36
+ '@tensorflow-models/mobilenet',
37
+ 'axios',
38
+ 'canvas',
39
+ 'face-api.js',
40
+ 'sharp',
41
+ 'tesseract.js',
42
+ ],
43
+ naming: '[dir]/[name].[ext]',
44
+ };
45
+
46
+ // Workers build configuration
47
+ export const workersConfig: BuildConfig = {
48
+ entrypoints: [
49
+ './src/workers/screen-capture-worker.ts',
50
+ './src/workers/florence2-worker.ts',
51
+ './src/workers/ocr-worker.ts',
52
+ ],
53
+ outdir: './dist/workers',
54
+ target: 'node',
55
+ format: 'cjs', // Workers need CommonJS format
56
+ splitting: false,
57
+ sourcemap: true,
58
+ external: [
59
+ 'sharp',
60
+ '@tensorflow/tfjs-node',
61
+ '@tensorflow-models/mobilenet',
62
+ '@mapbox/node-pre-gyp',
63
+ 'mock-aws-s3',
64
+ 'aws-sdk',
65
+ 'nock',
66
+ 'canvas',
67
+ 'face-api.js',
68
+ ],
69
+ naming: '[name].[ext]',
70
+ };
package/dist/action.d.ts ADDED
@@ -0,0 +1,8 @@
1
+ import { type Action } from '@elizaos/core';
2
+ export declare const describeSceneAction: Action;
3
+ export declare const captureImageAction: Action;
4
+ export declare const killAutonomousAction: Action;
5
+ export declare const setVisionModeAction: Action;
6
+ export declare const nameEntityAction: Action;
7
+ export declare const identifyPersonAction: Action;
8
+ export declare const trackEntityAction: Action;