@elizaos/plugin-vision 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.npmignore +5 -0
- package/README.md +270 -0
- package/build.config.ts +70 -0
- package/dist/action.d.ts +8 -0
- package/dist/action.js +1212 -0
- package/dist/action.js.map +1 -0
- package/dist/audio-capture-stream.d.ts +42 -0
- package/dist/audio-capture-stream.js +516 -0
- package/dist/audio-capture-stream.js.map +1 -0
- package/dist/audio-capture.d.ts +25 -0
- package/dist/audio-capture.js +412 -0
- package/dist/audio-capture.js.map +1 -0
- package/dist/basic.test.d.ts +1 -0
- package/dist/basic.test.js +97 -0
- package/dist/basic.test.js.map +1 -0
- package/dist/config.d.ts +73 -0
- package/dist/config.js +254 -0
- package/dist/config.js.map +1 -0
- package/dist/entity-tracker.d.ts +32 -0
- package/dist/entity-tracker.js +361 -0
- package/dist/entity-tracker.js.map +1 -0
- package/dist/errors.d.ts +67 -0
- package/dist/errors.js +395 -0
- package/dist/errors.js.map +1 -0
- package/dist/face-recognition.d.ts +31 -0
- package/dist/face-recognition.js +332 -0
- package/dist/face-recognition.js.map +1 -0
- package/dist/florence2-local.d.ts +25 -0
- package/dist/florence2-local.js +280 -0
- package/dist/florence2-local.js.map +1 -0
- package/dist/florence2-model.d.ts +36 -0
- package/dist/florence2-model.js +503 -0
- package/dist/florence2-model.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +73 -0
- package/dist/index.js.map +1 -0
- package/dist/ocr-service-real.d.ts +32 -0
- package/dist/ocr-service-real.js +396 -0
- package/dist/ocr-service-real.js.map +1 -0
- package/dist/ocr-service.d.ts +28 -0
- package/dist/ocr-service.js +216 -0
- package/dist/ocr-service.js.map +1 -0
- package/dist/provider.d.ts +2 -0
- package/dist/provider.js +285 -0
- package/dist/provider.js.map +1 -0
- package/dist/screen-capture.d.ts +16 -0
- package/dist/screen-capture.js +302 -0
- package/dist/screen-capture.js.map +1 -0
- package/dist/service.d.ts +73 -0
- package/dist/service.js +1662 -0
- package/dist/service.js.map +1 -0
- package/dist/tests/e2e/index.d.ts +8 -0
- package/dist/tests/e2e/index.js +33 -0
- package/dist/tests/e2e/index.js.map +1 -0
- package/dist/tests/e2e/run-local.d.ts +2 -0
- package/dist/tests/e2e/run-local.js +166 -0
- package/dist/tests/e2e/run-local.js.map +1 -0
- package/dist/tests/e2e/screen-vision.d.ts +11 -0
- package/dist/tests/e2e/screen-vision.js +384 -0
- package/dist/tests/e2e/screen-vision.js.map +1 -0
- package/dist/tests/e2e/vision-autonomy.d.ts +11 -0
- package/dist/tests/e2e/vision-autonomy.js +375 -0
- package/dist/tests/e2e/vision-autonomy.js.map +1 -0
- package/dist/tests/e2e/vision-basic.d.ts +11 -0
- package/dist/tests/e2e/vision-basic.js +434 -0
- package/dist/tests/e2e/vision-basic.js.map +1 -0
- package/dist/tests/e2e/vision-capture-log.d.ts +11 -0
- package/dist/tests/e2e/vision-capture-log.js +302 -0
- package/dist/tests/e2e/vision-capture-log.js.map +1 -0
- package/dist/tests/e2e/vision-runtime.d.ts +11 -0
- package/dist/tests/e2e/vision-runtime.js +357 -0
- package/dist/tests/e2e/vision-runtime.js.map +1 -0
- package/dist/tests/e2e/vision-worker-tests.d.ts +11 -0
- package/dist/tests/e2e/vision-worker-tests.js +466 -0
- package/dist/tests/e2e/vision-worker-tests.js.map +1 -0
- package/dist/tests/test-pattern-generator.d.ts +40 -0
- package/dist/tests/test-pattern-generator.js +191 -0
- package/dist/tests/test-pattern-generator.js.map +1 -0
- package/dist/tests.d.ts +3 -0
- package/dist/tests.js +11 -0
- package/dist/tests.js.map +1 -0
- package/dist/types.d.ts +222 -0
- package/dist/types.js +16 -0
- package/dist/types.js.map +1 -0
- package/dist/vision-models.d.ts +47 -0
- package/dist/vision-models.js +501 -0
- package/dist/vision-models.js.map +1 -0
- package/dist/vision-worker-manager.d.ts +61 -0
- package/dist/vision-worker-manager.js +668 -0
- package/dist/vision-worker-manager.js.map +1 -0
- package/dist/workers/florence2-worker-simple.d.ts +13 -0
- package/dist/workers/florence2-worker-simple.js +121 -0
- package/dist/workers/florence2-worker-simple.js.map +1 -0
- package/dist/workers/florence2-worker.d.ts +1 -0
- package/dist/workers/florence2-worker.js +328 -0
- package/dist/workers/florence2-worker.js.map +1 -0
- package/dist/workers/ocr-worker.d.ts +1 -0
- package/dist/workers/ocr-worker.js +354 -0
- package/dist/workers/ocr-worker.js.map +1 -0
- package/dist/workers/screen-capture-worker.d.ts +1 -0
- package/dist/workers/screen-capture-worker.js +427 -0
- package/dist/workers/screen-capture-worker.js.map +1 -0
- package/dist/workers/worker-logger.d.ts +9 -0
- package/dist/workers/worker-logger.js +95 -0
- package/dist/workers/worker-logger.js.map +1 -0
- package/package.json +100 -0
package/.npmignore
ADDED
package/README.md
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
# ElizaOS Vision Plugin
|
|
2
|
+
|
|
3
|
+
A powerful visual perception plugin for ElizaOS that provides agents with
|
|
4
|
+
real-time camera integration and scene analysis capabilities. This plugin
|
|
5
|
+
enables agents to "see" their environment, describe scenes, detect people and
|
|
6
|
+
objects, and make decisions based on visual input.
|
|
7
|
+
|
|
8
|
+
## Features
|
|
9
|
+
|
|
10
|
+
### Phase 1 (Implemented)
|
|
11
|
+
|
|
12
|
+
- ✅ Camera detection and connection (platform-specific)
|
|
13
|
+
- ✅ Real-time frame capture and processing
|
|
14
|
+
- ✅ Scene description using Vision Language Models (VLM)
|
|
15
|
+
- ✅ Motion-based object detection
|
|
16
|
+
- ✅ Basic person detection with pose estimation
|
|
17
|
+
- ✅ Configurable pixel change threshold
|
|
18
|
+
- ✅ Image capture action with base64 attachments
|
|
19
|
+
- ✅ Non-dynamic vision provider (always active)
|
|
20
|
+
- ✅ Integration with autonomy plugin (kill switch)
|
|
21
|
+
|
|
22
|
+
### Phase 2 (Implemented)
|
|
23
|
+
|
|
24
|
+
- ✅ Enhanced object detection with COCO-like classification
|
|
25
|
+
- ✅ Advanced pose detection with keypoint estimation
|
|
26
|
+
- ✅ Improved person detection and tracking
|
|
27
|
+
- ✅ Object classification (person, monitor, chair, keyboard, furniture, etc.)
|
|
28
|
+
- ✅ Configurable computer vision models
|
|
29
|
+
- ✅ Fallback to motion detection when CV is disabled
|
|
30
|
+
|
|
31
|
+
### Phase 3 (Planned)
|
|
32
|
+
|
|
33
|
+
- 🔄 WebAssembly (WASM) integration for browser compatibility
|
|
34
|
+
- 🔄 Real-time object tracking with IDs
|
|
35
|
+
- 🔄 Face detection and recognition
|
|
36
|
+
- 🔄 Gesture recognition
|
|
37
|
+
- 🔄 Scene understanding and spatial relationships
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
npm install @elizaos/plugin-vision
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Camera Tools Required
|
|
46
|
+
|
|
47
|
+
The plugin requires platform-specific camera tools:
|
|
48
|
+
|
|
49
|
+
- **macOS**: `brew install imagesnap`
|
|
50
|
+
- **Linux**: `sudo apt-get install fswebcam`
|
|
51
|
+
- **Windows**: Install ffmpeg and add it to PATH
|
|
52
|
+
|
|
53
|
+
## Configuration
|
|
54
|
+
|
|
55
|
+
### Environment Variables
|
|
56
|
+
|
|
57
|
+
```env
|
|
58
|
+
# Camera selection (partial name match, case-insensitive)
|
|
59
|
+
CAMERA_NAME=obsbot
|
|
60
|
+
|
|
61
|
+
# Pixel change threshold (percentage, default: 50)
|
|
62
|
+
PIXEL_CHANGE_THRESHOLD=30
|
|
63
|
+
|
|
64
|
+
# Enable advanced computer vision features (default: false)
|
|
65
|
+
ENABLE_OBJECT_DETECTION=true
|
|
66
|
+
ENABLE_POSE_DETECTION=true
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Character Configuration
|
|
70
|
+
|
|
71
|
+
```json
|
|
72
|
+
{
|
|
73
|
+
"name": "VisionAgent",
|
|
74
|
+
"plugins": ["@elizaos/plugin-vision"],
|
|
75
|
+
"settings": {
|
|
76
|
+
"CAMERA_NAME": "obsbot",
|
|
77
|
+
"PIXEL_CHANGE_THRESHOLD": "30",
|
|
78
|
+
"ENABLE_OBJECT_DETECTION": "true",
|
|
79
|
+
"ENABLE_POSE_DETECTION": "true"
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Actions
|
|
85
|
+
|
|
86
|
+
### DESCRIBE_SCENE
|
|
87
|
+
|
|
88
|
+
Analyzes the current visual scene and provides a detailed description.
|
|
89
|
+
|
|
90
|
+
**Similes**: `ANALYZE_SCENE`, `WHAT_DO_YOU_SEE`, `VISION_CHECK`, `LOOK_AROUND`
|
|
91
|
+
|
|
92
|
+
**Example**:
|
|
93
|
+
|
|
94
|
+
```
|
|
95
|
+
User: "What do you see?"
|
|
96
|
+
Agent: "Looking through the camera, I see a home office setup with a person sitting at a desk. There are 2 monitors, a keyboard, and various desk accessories. I detected 5 objects total: 1 person, 2 monitors, 1 keyboard, and 1 chair."
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### CAPTURE_IMAGE
|
|
100
|
+
|
|
101
|
+
Captures the current frame and returns it as a base64 image attachment.
|
|
102
|
+
|
|
103
|
+
**Similes**: `TAKE_PHOTO`, `SCREENSHOT`, `CAPTURE_FRAME`, `TAKE_PICTURE`
|
|
104
|
+
|
|
105
|
+
**Example**:
|
|
106
|
+
|
|
107
|
+
```
|
|
108
|
+
User: "Take a photo"
|
|
109
|
+
Agent: "I've captured an image from the camera." [Image attached]
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### KILL_AUTONOMOUS
|
|
113
|
+
|
|
114
|
+
Stops the autonomous agent loop (useful for debugging with the autonomy plugin).
|
|
115
|
+
|
|
116
|
+
**Similes**: `STOP_AUTONOMOUS`, `HALT_AUTONOMOUS`, `KILL_AUTO_LOOP`
|
|
117
|
+
|
|
118
|
+
## Vision Provider
|
|
119
|
+
|
|
120
|
+
The vision provider is **non-dynamic** (always active) and provides:
|
|
121
|
+
|
|
122
|
+
- Current scene description
|
|
123
|
+
- Camera connection status
|
|
124
|
+
- Detected objects count and types
|
|
125
|
+
- Detected people count with poses
|
|
126
|
+
- Scene change percentage
|
|
127
|
+
- Time since last update
|
|
128
|
+
|
|
129
|
+
### Provider Data Structure
|
|
130
|
+
|
|
131
|
+
```typescript
|
|
132
|
+
{
|
|
133
|
+
visionAvailable: boolean,
|
|
134
|
+
sceneDescription: string,
|
|
135
|
+
cameraStatus: string,
|
|
136
|
+
cameraId?: string,
|
|
137
|
+
peopleCount?: number,
|
|
138
|
+
objectCount?: number,
|
|
139
|
+
sceneAge?: number,
|
|
140
|
+
lastChange?: number
|
|
141
|
+
}
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Detection Modes
|
|
145
|
+
|
|
146
|
+
### Motion-Based Detection (Default)
|
|
147
|
+
|
|
148
|
+
- Lightweight and fast
|
|
149
|
+
- Detects movement between frames
|
|
150
|
+
- Groups motion blocks into objects
|
|
151
|
+
- Basic size-based classification
|
|
152
|
+
|
|
153
|
+
### Advanced Computer Vision (Optional)
|
|
154
|
+
|
|
155
|
+
Enable with `ENABLE_OBJECT_DETECTION=true` and/or `ENABLE_POSE_DETECTION=true`:
|
|
156
|
+
|
|
157
|
+
- **Object Detection**: Enhanced object recognition with COCO-like classes
|
|
158
|
+
- **Pose Detection**: 17-keypoint pose estimation
|
|
159
|
+
- **Better Classification**: Distinguishes between person, monitor, chair,
|
|
160
|
+
keyboard, etc.
|
|
161
|
+
- **Higher Accuracy**: Edge detection and color variance analysis
|
|
162
|
+
|
|
163
|
+
## Integration with Autonomy
|
|
164
|
+
|
|
165
|
+
When used with `@elizaos/plugin-autonomy`, the vision plugin enables:
|
|
166
|
+
|
|
167
|
+
- Continuous environmental monitoring
|
|
168
|
+
- Autonomous responses to visual changes
|
|
169
|
+
- Visual memory persistence
|
|
170
|
+
- Scene-based decision making
|
|
171
|
+
|
|
172
|
+
Example autonomous behavior:
|
|
173
|
+
|
|
174
|
+
```typescript
|
|
175
|
+
// Agent autonomously monitors environment
|
|
176
|
+
'I notice someone just entered the room.';
|
|
177
|
+
'The lighting has changed significantly.';
|
|
178
|
+
'A new object has appeared on the desk.';
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## Performance Considerations
|
|
182
|
+
|
|
183
|
+
- Frame processing runs every 100ms by default
|
|
184
|
+
- VLM is only called when pixel change exceeds threshold
|
|
185
|
+
- Motion detection uses 64x64 pixel blocks with 50% overlap
|
|
186
|
+
- Advanced CV models add ~50-100ms processing time per frame
|
|
187
|
+
- Memory usage increases with resolution (1280x720 recommended)
|
|
188
|
+
|
|
189
|
+
## Security & Privacy
|
|
190
|
+
|
|
191
|
+
- Camera access requires system permissions
|
|
192
|
+
- No images are stored permanently by default
|
|
193
|
+
- All processing happens locally
|
|
194
|
+
- Base64 images in messages are ephemeral
|
|
195
|
+
- Consider privacy implications in your implementation
|
|
196
|
+
|
|
197
|
+
## Development
|
|
198
|
+
|
|
199
|
+
### Running Tests
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
# Run E2E tests
|
|
203
|
+
npm test
|
|
204
|
+
|
|
205
|
+
# Run local E2E tests
|
|
206
|
+
npm run test:e2e:local
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Test Coverage
|
|
210
|
+
|
|
211
|
+
- Service initialization
|
|
212
|
+
- Camera detection and connection
|
|
213
|
+
- Scene description generation
|
|
214
|
+
- Object and person detection
|
|
215
|
+
- Image capture
|
|
216
|
+
- Provider integration
|
|
217
|
+
- Autonomy integration
|
|
218
|
+
|
|
219
|
+
## Troubleshooting
|
|
220
|
+
|
|
221
|
+
### No Camera Detected
|
|
222
|
+
|
|
223
|
+
1. Ensure camera tools are installed (imagesnap/fswebcam/ffmpeg)
|
|
224
|
+
2. Check camera permissions in system settings
|
|
225
|
+
3. Try unsetting CAMERA_NAME to fall back to the default camera
|
|
226
|
+
4. Verify camera is not in use by another application
|
|
227
|
+
|
|
228
|
+
### Poor Object Detection
|
|
229
|
+
|
|
230
|
+
1. Ensure good lighting conditions
|
|
231
|
+
2. Adjust PIXEL_CHANGE_THRESHOLD (lower = more sensitive)
|
|
232
|
+
3. Enable advanced CV with ENABLE_OBJECT_DETECTION=true
|
|
233
|
+
4. Check camera resolution (higher is better for detection)
|
|
234
|
+
|
|
235
|
+
### High CPU Usage
|
|
236
|
+
|
|
237
|
+
1. Increase frame processing interval in code
|
|
238
|
+
2. Disable advanced CV features if not needed
|
|
239
|
+
3. Reduce camera resolution
|
|
240
|
+
4. Increase pixel change threshold
|
|
241
|
+
|
|
242
|
+
## Future Roadmap
|
|
243
|
+
|
|
244
|
+
### Phase 3: WebAssembly Integration
|
|
245
|
+
|
|
246
|
+
- TensorFlow.js WASM backend
|
|
247
|
+
- Browser-compatible vision processing
|
|
248
|
+
- Real-time object tracking
|
|
249
|
+
- Face detection and recognition
|
|
250
|
+
|
|
251
|
+
### Phase 4: Advanced Features
|
|
252
|
+
|
|
253
|
+
- Gesture recognition
|
|
254
|
+
- Emotion detection
|
|
255
|
+
- Scene understanding
|
|
256
|
+
- Spatial relationship mapping
|
|
257
|
+
- Multi-camera support
|
|
258
|
+
|
|
259
|
+
## Contributing
|
|
260
|
+
|
|
261
|
+
Contributions are welcome! Please see the main ElizaOS repository for
|
|
262
|
+
contribution guidelines.
|
|
263
|
+
|
|
264
|
+
## License
|
|
265
|
+
|
|
266
|
+
MIT
|
|
267
|
+
|
|
268
|
+
## Support
|
|
269
|
+
|
|
270
|
+
For issues and feature requests, please use the GitHub issue tracker.
|
package/build.config.ts
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import type { BuildConfig } from 'bun';
|
|
2
|
+
|
|
3
|
+
// Main build configuration
|
|
4
|
+
export const buildConfig: BuildConfig = {
|
|
5
|
+
entrypoints: ['./src/index.ts'],
|
|
6
|
+
outdir: './dist',
|
|
7
|
+
target: 'node',
|
|
8
|
+
format: 'esm',
|
|
9
|
+
splitting: false,
|
|
10
|
+
sourcemap: 'external',
|
|
11
|
+
external: [
|
|
12
|
+
'fs',
|
|
13
|
+
'path',
|
|
14
|
+
'http',
|
|
15
|
+
'https',
|
|
16
|
+
'crypto',
|
|
17
|
+
'node:fs',
|
|
18
|
+
'node:path',
|
|
19
|
+
'node:http',
|
|
20
|
+
'node:https',
|
|
21
|
+
'node:crypto',
|
|
22
|
+
'node:stream',
|
|
23
|
+
'node:buffer',
|
|
24
|
+
'node:util',
|
|
25
|
+
'node:events',
|
|
26
|
+
'node:url',
|
|
27
|
+
'bun:test',
|
|
28
|
+
'dotenv',
|
|
29
|
+
'zod',
|
|
30
|
+
'@elizaos/core',
|
|
31
|
+
'@elizaos/plugin-message-handling',
|
|
32
|
+
'@tensorflow-models/coco-ssd',
|
|
33
|
+
'@tensorflow-models/pose-detection',
|
|
34
|
+
'@tensorflow-models/posenet',
|
|
35
|
+
'@tensorflow/tfjs-node',
|
|
36
|
+
'@tensorflow-models/mobilenet',
|
|
37
|
+
'axios',
|
|
38
|
+
'canvas',
|
|
39
|
+
'face-api.js',
|
|
40
|
+
'sharp',
|
|
41
|
+
'tesseract.js',
|
|
42
|
+
],
|
|
43
|
+
naming: '[dir]/[name].[ext]',
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
// Workers build configuration
|
|
47
|
+
export const workersConfig: BuildConfig = {
|
|
48
|
+
entrypoints: [
|
|
49
|
+
'./src/workers/screen-capture-worker.ts',
|
|
50
|
+
'./src/workers/florence2-worker.ts',
|
|
51
|
+
'./src/workers/ocr-worker.ts',
|
|
52
|
+
],
|
|
53
|
+
outdir: './dist/workers',
|
|
54
|
+
target: 'node',
|
|
55
|
+
format: 'cjs', // Workers need CommonJS format
|
|
56
|
+
splitting: false,
|
|
57
|
+
sourcemap: true,
|
|
58
|
+
external: [
|
|
59
|
+
'sharp',
|
|
60
|
+
'@tensorflow/tfjs-node',
|
|
61
|
+
'@tensorflow-models/mobilenet',
|
|
62
|
+
'@mapbox/node-pre-gyp',
|
|
63
|
+
'mock-aws-s3',
|
|
64
|
+
'aws-sdk',
|
|
65
|
+
'nock',
|
|
66
|
+
'canvas',
|
|
67
|
+
'face-api.js',
|
|
68
|
+
],
|
|
69
|
+
naming: '[name].[ext]',
|
|
70
|
+
};
|
package/dist/action.d.ts
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { type Action } from '@elizaos/core';
|
|
2
|
+
export declare const describeSceneAction: Action;
|
|
3
|
+
export declare const captureImageAction: Action;
|
|
4
|
+
export declare const killAutonomousAction: Action;
|
|
5
|
+
export declare const setVisionModeAction: Action;
|
|
6
|
+
export declare const nameEntityAction: Action;
|
|
7
|
+
export declare const identifyPersonAction: Action;
|
|
8
|
+
export declare const trackEntityAction: Action;
|