@elizaos/plugin-vision 2.0.0-beta.1 → 2.0.3-beta.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. package/LICENSE +21 -0
  2. package/README.md +73 -301
  3. package/dist/action.d.ts +3 -0
  4. package/dist/action.d.ts.map +1 -0
  5. package/dist/audio-capture-stream.d.ts +42 -0
  6. package/dist/audio-capture-stream.d.ts.map +1 -0
  7. package/dist/audio-capture.d.ts +25 -0
  8. package/dist/audio-capture.d.ts.map +1 -0
  9. package/dist/computeruse-ocr-bridge.d.ts +50 -0
  10. package/dist/computeruse-ocr-bridge.d.ts.map +1 -0
  11. package/dist/config.d.ts +68 -0
  12. package/dist/config.d.ts.map +1 -0
  13. package/dist/describe-backpressure.d.ts +90 -0
  14. package/dist/describe-backpressure.d.ts.map +1 -0
  15. package/dist/dirty-tile-describer.d.ts +102 -0
  16. package/dist/dirty-tile-describer.d.ts.map +1 -0
  17. package/dist/dirty-tile-scene.d.ts +56 -0
  18. package/dist/dirty-tile-scene.d.ts.map +1 -0
  19. package/dist/entity-tracker.d.ts +33 -0
  20. package/dist/entity-tracker.d.ts.map +1 -0
  21. package/dist/face-detector-ggml.d.ts +60 -0
  22. package/dist/face-detector-ggml.d.ts.map +1 -0
  23. package/dist/face-detector-mediapipe.d.ts +25 -0
  24. package/dist/face-detector-mediapipe.d.ts.map +1 -0
  25. package/dist/face-recognition-ggml.d.ts +94 -0
  26. package/dist/face-recognition-ggml.d.ts.map +1 -0
  27. package/dist/get-screen-elements.d.ts +90 -0
  28. package/dist/get-screen-elements.d.ts.map +1 -0
  29. package/dist/get-screen.d.ts +60 -0
  30. package/dist/get-screen.d.ts.map +1 -0
  31. package/dist/image/sharp-compat.d.ts +89 -0
  32. package/dist/image/sharp-compat.d.ts.map +1 -0
  33. package/dist/image-input.d.ts +15 -0
  34. package/dist/image-input.d.ts.map +1 -0
  35. package/dist/index.d.ts +4 -0
  36. package/dist/index.d.ts.map +1 -0
  37. package/dist/index.js +7957 -6238
  38. package/dist/index.js.map +41 -26
  39. package/dist/lifecycle.d.ts +94 -0
  40. package/dist/lifecycle.d.ts.map +1 -0
  41. package/dist/mobile/capacitor-camera.d.ts +85 -0
  42. package/dist/mobile/capacitor-camera.d.ts.map +1 -0
  43. package/dist/native/doctr-ffi.d.ts +40 -0
  44. package/dist/native/doctr-ffi.d.ts.map +1 -0
  45. package/dist/native/yolo-ffi.d.ts +21 -0
  46. package/dist/native/yolo-ffi.d.ts.map +1 -0
  47. package/dist/ocr-host-windows.d.ts +34 -0
  48. package/dist/ocr-host-windows.d.ts.map +1 -0
  49. package/dist/ocr-service-apple-vision-macos.d.ts +51 -0
  50. package/dist/ocr-service-apple-vision-macos.d.ts.map +1 -0
  51. package/dist/ocr-service-doctr.d.ts +61 -0
  52. package/dist/ocr-service-doctr.d.ts.map +1 -0
  53. package/dist/ocr-service-linux-tesseract.d.ts +85 -0
  54. package/dist/ocr-service-linux-tesseract.d.ts.map +1 -0
  55. package/dist/ocr-service-paddleocr.d.ts +59 -0
  56. package/dist/ocr-service-paddleocr.d.ts.map +1 -0
  57. package/dist/ocr-service-windows.d.ts +41 -0
  58. package/dist/ocr-service-windows.d.ts.map +1 -0
  59. package/dist/ocr-service.d.ts +91 -0
  60. package/dist/ocr-service.d.ts.map +1 -0
  61. package/dist/ocr-with-coords.d.ts +103 -0
  62. package/dist/ocr-with-coords.d.ts.map +1 -0
  63. package/dist/person-detector.d.ts +17 -0
  64. package/dist/person-detector.d.ts.map +1 -0
  65. package/dist/provider.d.ts +3 -0
  66. package/dist/provider.d.ts.map +1 -0
  67. package/dist/routes.d.ts +7 -0
  68. package/dist/routes.d.ts.map +1 -0
  69. package/dist/screen-capture-bridge.d.ts +51 -0
  70. package/dist/screen-capture-bridge.d.ts.map +1 -0
  71. package/dist/screen-capture.d.ts +17 -0
  72. package/dist/screen-capture.d.ts.map +1 -0
  73. package/dist/screen-tiler.d.ts +75 -0
  74. package/dist/screen-tiler.d.ts.map +1 -0
  75. package/dist/service.d.ts +176 -0
  76. package/dist/service.d.ts.map +1 -0
  77. package/dist/set-of-marks-provider.d.ts +64 -0
  78. package/dist/set-of-marks-provider.d.ts.map +1 -0
  79. package/dist/som.d.ts +135 -0
  80. package/dist/som.d.ts.map +1 -0
  81. package/dist/som.js +184 -0
  82. package/dist/som.js.map +11 -0
  83. package/dist/test-input.d.ts +25 -0
  84. package/dist/test-input.d.ts.map +1 -0
  85. package/dist/types.d.ts +241 -0
  86. package/dist/types.d.ts.map +1 -0
  87. package/dist/vision-context-augmenter.d.ts +93 -0
  88. package/dist/vision-context-augmenter.d.ts.map +1 -0
  89. package/dist/vision-worker-manager.d.ts +51 -0
  90. package/dist/vision-worker-manager.d.ts.map +1 -0
  91. package/dist/workers/ocr-worker.d.ts +2 -0
  92. package/dist/workers/ocr-worker.d.ts.map +1 -0
  93. package/dist/workers/ocr-worker.js +1075 -7821
  94. package/dist/workers/ocr-worker.js.map +10 -51
  95. package/dist/workers/screen-capture-worker.d.ts +2 -0
  96. package/dist/workers/screen-capture-worker.d.ts.map +1 -0
  97. package/dist/workers/screen-capture-worker.js +364 -6
  98. package/dist/workers/screen-capture-worker.js.map +5 -4
  99. package/dist/workers/worker-logger.d.ts +10 -0
  100. package/dist/workers/worker-logger.d.ts.map +1 -0
  101. package/dist/yolo-detector.d.ts +37 -0
  102. package/dist/yolo-detector.d.ts.map +1 -0
  103. package/native/doctr.cpp/CMakeLists.txt +58 -0
  104. package/native/doctr.cpp/README.md +62 -0
  105. package/native/doctr.cpp/include/doctr.h +91 -0
  106. package/native/doctr.cpp/scripts/convert.py +98 -0
  107. package/native/doctr.cpp/src/doctr_det.cpp +112 -0
  108. package/native/doctr.cpp/src/doctr_rec.cpp +103 -0
  109. package/native/macos-vision-ocr.swift +113 -0
  110. package/native/mobilefacenet.cpp/README.md +13 -0
  111. package/native/movenet.cpp/README.md +10 -0
  112. package/native/retinaface.cpp/README.md +12 -0
  113. package/native/yolo.cpp/CMakeLists.txt +57 -0
  114. package/native/yolo.cpp/README.md +64 -0
  115. package/native/yolo.cpp/build.mjs +76 -0
  116. package/native/yolo.cpp/include/yolo.h +62 -0
  117. package/native/yolo.cpp/scripts/convert.py +248 -0
  118. package/native/yolo.cpp/src/yolo.cpp +425 -0
  119. package/native/yolo.cpp/verify/compare.py +99 -0
  120. package/native/yolo.cpp/verify/make_ref.py +75 -0
  121. package/native/yolo.cpp/verify/run_ggml.mjs +78 -0
  122. package/native/yolo.cpp/verify/run_ts.mjs +26 -0
  123. package/package.json +39 -21
  124. package/registry-entry.json +43 -0
  125. package/scripts/vendor-tesseract-linux.mjs +177 -0
  126. package/build.config.ts +0 -89
  127. package/dist/workers/florence2-worker.js +0 -779
  128. package/dist/workers/florence2-worker.js.map +0 -13
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Shaw Walters and elizaOS Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md CHANGED
@@ -1,340 +1,112 @@
1
- # ElizaOS Vision Plugin
1
+ # @elizaos/plugin-vision
2
2
 
3
- A powerful visual perception plugin for ElizaOS that provides agents with
4
- real-time camera integration and scene analysis capabilities. This plugin
5
- enables agents to "see" their environment, describe scenes, detect people and
6
- objects, and make decisions based on visual input.
3
+ Visual perception plugin for elizaOS gives Eliza agents real-time awareness of their camera feed and/or screen through scene analysis, object/person detection, OCR, face recognition, and entity tracking.
7
4
 
8
- ## Features
5
+ ## What it does
9
6
 
10
- ### Phase 1 (Implemented)
11
-
12
- - Camera detection and connection (platform-specific)
13
- - Real-time frame capture and processing
14
- - Scene description using Vision Language Models (VLM)
15
- - ✅ Motion-based object detection
16
- - ✅ Basic person detection with pose estimation
17
- - ✅ Configurable pixel change threshold
18
- - ✅ Image capture action with base64 attachments
19
- - ✅ Non-dynamic vision provider (always active)
20
- - ✅ Integration with autonomy plugin (kill switch)
21
-
22
- ### Phase 2 (Implemented)
23
-
24
- - ✅ Enhanced object detection with COCO-like classification
25
- - ✅ Advanced pose detection with keypoint estimation
26
- - ✅ Improved person detection and tracking
27
- - ✅ Object classification (person, monitor, chair, keyboard, furniture, etc.)
28
- - ✅ Configurable computer vision models
29
- - ✅ Fallback to motion detection when CV is disabled
30
-
31
- ### Phase 3 (Implemented)
32
-
33
- - ✅ Real-time object tracking with IDs
34
- - ✅ Face detection and recognition
35
- - ✅ Screen capture and OCR integration
36
- - ✅ Entity tracking with persistent IDs
37
- - ✅ Multi-display support
38
- - ✅ Circuit breaker pattern for error resilience
39
- - ✅ Florence2 model integration for advanced scene understanding
40
- - ✅ Worker-based processing for high-FPS operations
41
-
42
- ### Phase 4 (Planned)
43
-
44
- - 🔄 WebAssembly (WASM) integration for browser compatibility
45
- - 🔄 Gesture recognition
46
- - 🔄 Emotion detection
47
- - 🔄 Advanced scene understanding and spatial relationships
7
+ - Captures frames from a connected camera (macOS/Linux/Windows) or the host screen.
8
+ - Describes scenes by routing images through `runtime.useModel(IMAGE_DESCRIPTION)` — compatible with any registered VLM (local or cloud).
9
+ - Detects and tracks people, objects, and faces across frames with persistent entity IDs.
10
+ - Reads text on screen through the generic Apple Vision/doCTR OCR service and the coordinate-aware OCR registry used by computeruse: Windows.Media.Ocr on Windows, Tesseract on Linux when available, and the RapidOCR adapter as the portable fallback.
11
+ - Exposes all capabilities through a single `VISION` action and a `VISION_PERCEPTION` context provider.
48
12
 
49
13
  ## Installation
50
14
 
51
- ### TypeScript (Primary)
52
-
53
15
  ```bash
54
16
  npm install @elizaos/plugin-vision
55
- # or
56
- cd plugins/plugin-vision
57
- bun install
58
- bun run build
59
17
  ```
60
- ### Camera Tools Required
61
-
62
- The plugin requires platform-specific camera tools:
63
-
64
- - **macOS**: `brew install imagesnap`
65
- - **Linux**: `sudo apt-get install fswebcam`
66
- - **Windows**: Install ffmpeg and add to PATH
67
-
68
- ## Configuration
69
-
70
- ### Environment Variables
71
-
72
- ```env
73
- # Camera selection (partial name match, case-insensitive)
74
- CAMERA_NAME=obsbot
75
18
 
76
- # Pixel change threshold (percentage, default: 50)
77
- PIXEL_CHANGE_THRESHOLD=30
19
+ ### Platform camera tools (required for camera mode)
78
20
 
79
- # Enable advanced computer vision features (default: false)
80
- ENABLE_OBJECT_DETECTION=true
81
- ENABLE_POSE_DETECTION=true
82
- ENABLE_FACE_RECOGNITION=false
21
+ | Platform | Tool |
22
+ |----------|------|
23
+ | macOS | `brew install imagesnap` |
24
+ | Linux | `sudo apt-get install fswebcam` |
25
+ | Windows | Install ffmpeg and add to PATH |
83
26
 
84
- # Vision mode: OFF, CAMERA, SCREEN, BOTH
85
- VISION_MODE=CAMERA
27
+ Screen capture and OCR work without these tools.
86
28
 
87
- # Update intervals (milliseconds)
88
- TF_UPDATE_INTERVAL=1000
89
- VLM_UPDATE_INTERVAL=10000
29
+ ## Enabling the plugin
90
30
 
91
- # Screen capture settings
92
- SCREEN_CAPTURE_INTERVAL=2000
93
- OCR_ENABLED=true
94
- ```
95
-
96
- ### Character Configuration
31
+ Add it to your character's plugin list:
97
32
 
98
33
  ```json
99
34
  {
100
- "name": "VisionAgent",
35
+ "name": "MyAgent",
101
36
  "plugins": ["@elizaos/plugin-vision"],
102
37
  "settings": {
103
38
  "CAMERA_NAME": "obsbot",
104
- "PIXEL_CHANGE_THRESHOLD": "30",
105
- "ENABLE_OBJECT_DETECTION": "true",
106
- "ENABLE_POSE_DETECTION": "true"
39
+ "VISION_MODE": "CAMERA"
107
40
  }
108
41
  }
109
42
  ```
110
43
 
111
- ## Actions
112
-
113
- ### DESCRIBE_SCENE
114
-
115
- Analyzes the current visual scene and provides a detailed description.
116
-
117
- **Similes**: `ANALYZE_SCENE`, `WHAT_DO_YOU_SEE`, `VISION_CHECK`, `LOOK_AROUND`
118
-
119
- **Example**:
120
-
121
- ```
122
- User: "What do you see?"
123
- Agent: "Looking through the camera, I see a home office setup with a person sitting at a desk. There are 2 monitors, a keyboard, and various desk accessories. I detected 5 objects total: 1 person, 2 monitors, 1 keyboard, and 1 chair."
124
- ```
125
-
126
- ### CAPTURE_IMAGE
127
-
128
- Captures the current frame and returns it as a base64 image attachment.
129
-
130
- **Similes**: `TAKE_PHOTO`, `SCREENSHOT`, `CAPTURE_FRAME`, `TAKE_PICTURE`
131
-
132
- **Example**:
133
-
134
- ```
135
- User: "Take a photo"
136
- Agent: "I've captured an image from the camera." [Image attached]
137
- ```
138
-
139
- ### SET_VISION_MODE
140
-
141
- Changes the vision mode (OFF, CAMERA, SCREEN, or BOTH).
44
+ The plugin auto-enables when `config.features.vision` is truthy or `config.media.vision.provider` is set.
142
45
 
143
- **Similes**: `CHANGE_VISION_MODE`, `SET_VISION`, `TOGGLE_VISION`
144
-
145
- ### NAME_ENTITY
146
-
147
- Assigns a name to a detected entity for tracking.
148
-
149
- **Similes**: `LABEL_ENTITY`, `NAME_OBJECT`, `IDENTIFY_ENTITY`
150
-
151
- ### IDENTIFY_PERSON
152
-
153
- Identifies a person using face recognition (requires face recognition to be enabled).
154
-
155
- **Similes**: `RECOGNIZE_PERSON`, `IDENTIFY_FACE`
156
-
157
- ### TRACK_ENTITY
158
-
159
- Starts tracking an entity with a persistent ID.
46
+ ## Configuration
160
47
 
161
- **Similes**: `START_TRACKING`, `FOLLOW_ENTITY`
48
+ | Setting | Default | Description |
49
+ |---------|---------|-------------|
50
+ | `CAMERA_NAME` | auto | Partial name match for camera device selection (case-insensitive) |
51
+ | `VISION_MODE` | `CAMERA` | `OFF` / `CAMERA` / `SCREEN` / `BOTH` |
52
+ | `PIXEL_CHANGE_THRESHOLD` | `50` | % pixel change required before triggering a VLM scene update |
53
+ | `VLM_UPDATE_INTERVAL` | `10000` | ms between VLM scene-describe calls |
54
+ | `SCREEN_CAPTURE_INTERVAL` | `2000` | ms between screen captures |
55
+ | `OCR_ENABLED` | `true` | Enable OCR on screen tiles |
56
+ | `ENABLE_OBJECT_DETECTION` | `false` | ggml YOLOv8n object detection (`native/yolo.cpp`) |
57
+ | `ENABLE_POSE_DETECTION` | `false` | Heuristic person detection (ggml pose pending) |
58
+ | `ENABLE_FACE_RECOGNITION` | `false` | Native ggml face recognition (BlazeFace + 128-d embed via `native/face-cpp`) |
59
+ | `ENTITY_TIMEOUT` | `30000` | ms before an inactive entity is evicted from tracking |
60
+
61
+ All settings can also be prefixed with `VISION_` (e.g. `VISION_CAMERA_NAME`).
162
62
 
163
- ### KILL_AUTONOMOUS
63
+ ## Actions
164
64
 
165
- Stops the autonomous agent loop (useful for debugging with autonomy plugin).
65
+ The plugin registers a single `VISION` action that routes to one of these sub-operations based on explicit `action` parameter or natural-language inference:
166
66
 
167
- **Similes**: `STOP_AUTONOMOUS`, `HALT_AUTONOMOUS`, `KILL_AUTO_LOOP`
67
+ | Sub-operation | Trigger examples | What it does |
68
+ |--------------|-----------------|-------------|
69
+ | `describe` | "what do you see?", "describe the scene" | Returns the current VLM scene description |
70
+ | `capture` | "take a photo", "screenshot" | Captures a frame and returns it as a base64 image attachment |
71
+ | `set_mode` | "set vision mode to screen" | Switches between `OFF`, `CAMERA`, `SCREEN`, `BOTH` |
72
+ | `enable_camera` / `disable_camera` | "turn on the camera" | Toggles camera input |
73
+ | `enable_screen` / `disable_screen` | "enable screen capture" | Toggles screen input |
74
+ | `name_entity` | "the person is named Alice" | Assigns a display name to the most prominent tracked entity |
75
+ | `identify_person` | "who is that?" | Lists tracked people with names and presence duration |
76
+ | `track_entity` | "track the person in the red shirt" | Refreshes entity tracking and reports statistics |
168
77
 
169
78
  ## Vision Provider
170
79
 
171
- The vision provider is **non-dynamic** (always active) and provides:
172
-
173
- - Current scene description
174
- - Camera connection status
175
- - Detected objects count and types
176
- - Detected people count with poses
177
- - Scene change percentage
178
- - Time since last update
179
-
180
- ### Provider Data Structure
181
-
182
- ```typescript
183
- {
184
- visionAvailable: boolean,
185
- sceneDescription: string,
186
- cameraStatus: string,
187
- cameraId?: string,
188
- peopleCount?: number,
189
- objectCount?: number,
190
- sceneAge?: number,
191
- lastChange?: number
192
- }
193
- ```
194
-
195
- ## Detection Modes
196
-
197
- ### Motion-Based Detection (Default)
198
-
199
- - Lightweight and fast
200
- - Detects movement between frames
201
- - Groups motion blocks into objects
202
- - Basic size-based classification
203
-
204
- ### Advanced Computer Vision (Optional)
205
-
206
- Enable with `ENABLE_OBJECT_DETECTION=true` and/or `ENABLE_POSE_DETECTION=true`
207
-
208
- - **Object Detection**: Enhanced object recognition with COCO-like classes
209
- - **Pose Detection**: 17-keypoint pose estimation
210
- - **Better Classification**: Distinguishes between person, monitor, chair,
211
- keyboard, etc.
212
- - **Higher Accuracy**: Edge detection and color variance analysis
213
-
214
- ## Integration with Autonomy
215
-
216
- - Continuous environmental monitoring
217
- - Autonomous responses to visual changes
218
- - Visual memory persistence
219
- - Scene-based decision making
220
-
221
- Example autonomous behavior:
222
-
223
- ```typescript
224
- // Agent autonomously monitors environment
225
- "I notice someone just entered the room.";
226
- "The lighting has changed significantly.";
227
- "A new object has appeared on the desk.";
228
- ```
229
-
230
- ## Performance Considerations
231
-
232
- - Frame processing runs every 100ms by default
233
- - VLM is only called when pixel change exceeds threshold
234
- - Motion detection uses 64x64 pixel blocks with 50% overlap
235
- - Advanced CV models add ~50-100ms processing time per frame
236
- - Memory usage increases with resolution (1280x720 recommended)
237
-
238
- ## Security & Privacy
239
-
240
- - Camera access requires system permissions
241
- - No images are stored permanently by default
242
- - All processing happens locally
243
- - Base64 images in messages are ephemeral
244
- - Consider privacy implications in your implementation
245
-
246
- ## Architecture
247
-
248
- ```
249
- plugin-vision/
250
- ├── README.md # This file
251
- ├── package.json # TypeScript package config
252
- ├── src/ # TypeScript implementation (primary)
253
- │ ├── index.ts # Plugin entry point
254
- │ ├── service.ts # Vision service
255
- │ ├── provider.ts # Vision provider
256
- │ ├── action.ts # All actions
257
- │ ├── entity-tracker.ts # Entity tracking
258
- │ ├── screen-capture.ts # Screen capture
259
- │ ├── ocr-service.ts # OCR service
260
- │ ├── face-recognition.ts # Face recognition
261
- │ ├── florence2-model.ts # Florence2 model integration
262
- │ ├── vision-worker-manager.ts # Worker management
263
- │ └── tests/ # E2E tests
264
- ```
265
-
266
- ## Development
267
-
268
- ### Running Tests
269
-
270
- ```bash
271
- # Run E2E tests
272
- cd plugins/plugin-vision
273
- npx vitest
274
-
275
- # Run local E2E tests
276
- bun run test:e2e:local
277
- ```
278
-
279
- ### Test Coverage
280
-
281
- - Service initialization
282
- - Camera detection and connection
283
- - Scene description generation
284
- - Object and person detection
285
- - Image capture
286
- - Provider integration
287
- - Autonomy integration
288
-
289
- ## Troubleshooting
290
-
291
- ### No Camera Detected
292
-
293
- 1. Ensure camera tools are installed (imagesnap/fswebcam/ffmpeg)
294
- 2. Check camera permissions in system settings
295
- 3. Try without CAMERA_NAME to use default camera
296
- 4. Verify camera is not in use by another application
297
-
298
- ### Poor Object Detection
299
-
300
- 1. Ensure good lighting conditions
301
- 2. Adjust PIXEL_CHANGE_THRESHOLD (lower = more sensitive)
302
- 3. Enable advanced CV with ENABLE_OBJECT_DETECTION=true
303
- 4. Check camera resolution (higher is better for detection)
304
-
305
- ### High CPU Usage
306
-
307
- 1. Increase frame processing interval in code
308
- 2. Disable advanced CV features if not needed
309
- 3. Reduce camera resolution
310
- 4. Increase pixel change threshold
311
-
312
- ## Future Roadmap
313
-
314
- ### Phase 3: WebAssembly Integration
315
-
316
- - TensorFlow.js WASM backend
317
- - Browser-compatible vision processing
318
- - Real-time object tracking
319
- - Face detection and recognition
320
-
321
- ### Phase 4: Advanced Features
80
+ `VISION_PERCEPTION` is injected into agent context during turns in the `media` and `browser` contexts. It provides:
322
81
 
323
- - Gesture recognition
324
- - Emotion detection
325
- - Scene understanding
326
- - Spatial relationship mapping
327
- - Multi-camera support
82
+ - Current scene description text
83
+ - Camera / screen connection status and mode
84
+ - Detected people (count, poses, facings)
85
+ - Detected objects (types)
86
+ - Active tracked entities with duration
87
+ - Recently-departed entities
88
+ - Screen tile OCR text and UI element list (when screen mode is active)
328
89
 
329
- ## Contributing
90
+ ## Detection backends
330
91
 
331
- Contributions are welcome! Please see the main ElizaOS repository for
332
- contribution guidelines.
92
+ | Capability | Default backend | Optional / alternative |
93
+ |-----------|-----------------|----------------------|
94
+ | Scene description | VLM via `runtime.useModel(IMAGE_DESCRIPTION)` | Any registered IMAGE_DESCRIPTION provider |
95
+ | Object detection | YOLOv8n ggml via `native/yolo.cpp` (`src/yolo-detector.ts`); build with `bun run build:native` + `bun run build:weights`. Service degrades to motion/heuristic + VLM when the lib/GGUF are absent. | — (TensorFlow.js path removed) |
96
+ | Pose detection | Heuristic person detection (motion-derived) | Planned ggml MoveNet port |
97
+ | OCR | Generic OCR uses Apple Vision (darwin, when a provider is registered) → doCTR ggml (`native/doctr.cpp`). Coordinate OCR for computeruse prefers Windows.Media.Ocr (Windows) → Tesseract CLI or vendored bundle (Linux) → RapidOCR adapter. | Native/mobile bridges can register platform OCR providers; no ONNX OCR path. |
98
+ | Set-of-Marks grounding | `src/som.ts` fuses GGUF YOLO icon boxes + OCR text boxes into a deduplicated, 1-indexed numbered set (icon-over-text suppression + NMS) and renders a numbered-overlay PNG via `sharp`. `src/set-of-marks-provider.ts` registers it into plugin-computeruse's `detect_elements` seam at boot (best-effort; degrades to text-only marks when the GGUF detector is absent). | trycua/cua OmniParser parity (#9170 M9) |
99
+ | Face recognition | Native ggml BlazeFace + 128-d embed (`face-detector-ggml.ts`, `face-recognition-ggml.ts`, `native/face-cpp`); disabled until the lib/GGUF artifacts land. No tfjs/face-api.js path. | MediaPipe BlazeFace migration shim is deprecated. |
333
100
 
334
- ## License
101
+ ## Platform notes
335
102
 
336
- MIT
103
+ - **Node.js only.** Mobile (iOS, Android) registers a `MobileCameraSource` (`src/mobile/capacitor-camera.ts`) bridged by plugin-ios / plugin-aosp.
104
+ - **Camera tools** (`imagesnap` / `fswebcam` / `ffmpeg`) are required for camera mode; screen capture and OCR work without them.
105
+ - **Native detectors and OCR** (`native/yolo.cpp`, `native/doctr.cpp`, and the coordinate-OCR providers) run through the available host backend. YOLO/doCTR require compiled libraries and GGUF artifacts; Tesseract requires a binary plus traineddata resolved from the vendored bundle or PATH.
337
106
 
338
- ## Support
107
+ ## Privacy
339
108
 
340
- For issues and feature requests, please use the GitHub issue tracker.
109
+ - Camera access requires OS-level permissions.
110
+ - No frames are written to disk by default.
111
+ - All inference runs locally unless a remote IMAGE_DESCRIPTION provider is registered.
112
+ - Consider access implications before enabling in shared or sensitive environments.
package/dist/action.d.ts ADDED
@@ -0,0 +1,3 @@
1
+ import { type Action } from "@elizaos/core";
2
+ export declare const visionAction: Action;
3
+ //# sourceMappingURL=action.d.ts.map
package/dist/action.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"action.d.ts","sourceRoot":"","sources":["../src/action.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,KAAK,MAAM,EAYZ,MAAM,eAAe,CAAC;AA29CvB,eAAO,MAAM,YAAY,EAAE,MAmP1B,CAAC"}
package/dist/audio-capture-stream.d.ts ADDED
@@ -0,0 +1,42 @@
1
+ import { EventEmitter } from "node:events";
2
+ import { type IAgentRuntime } from "@elizaos/core";
3
+ export interface StreamingAudioConfig {
4
+ enabled: boolean;
5
+ device?: string;
6
+ sampleRate?: number;
7
+ channels?: number;
8
+ vadThreshold?: number;
9
+ silenceTimeout?: number;
10
+ responseDelay?: number;
11
+ chunkSize?: number;
12
+ }
13
+ export declare class StreamingAudioCaptureService extends EventEmitter {
14
+ private runtime;
15
+ private config;
16
+ private captureProcess;
17
+ private isCapturing;
18
+ private audioBuffer;
19
+ private isSpeaking;
20
+ private silenceTimer;
21
+ private transcriptionInProgress;
22
+ private currentTranscription;
23
+ private responseTimer;
24
+ constructor(runtime: IAgentRuntime, config: StreamingAudioConfig);
25
+ initialize(): Promise<void>;
26
+ private startContinuousCapture;
27
+ private processAudioChunk;
28
+ private calculateEnergy;
29
+ private startStreamingTranscription;
30
+ private endSpeech;
31
+ private processFinalTranscription;
32
+ private getRecentAudioData;
33
+ private transcribeAudio;
34
+ private rawToWav;
35
+ private generateResponse;
36
+ private createAudioMemory;
37
+ stop(): Promise<void>;
38
+ isActive(): boolean;
39
+ getCurrentTranscription(): string;
40
+ isSpeechActive(): boolean;
41
+ }
42
+ //# sourceMappingURL=audio-capture-stream.d.ts.map
package/dist/audio-capture-stream.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"audio-capture-stream.d.ts","sourceRoot":"","sources":["../src/audio-capture-stream.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EACL,KAAK,aAAa,EAInB,MAAM,eAAe,CAAC;AAEvB,MAAM,WAAW,oBAAoB;IACnC,OAAO,EAAE,OAAO,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAQD,qBAAa,4BAA6B,SAAQ,YAAY;IAC5D,OAAO,CAAC,OAAO,CAAgB;IAC/B,OAAO,CAAC,MAAM,CAAuB;IACrC,OAAO,CAAC,cAAc,CAA6B;IACnD,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,WAAW,CAAoB;IACvC,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,YAAY,CAA+B;IACnD,OAAO,CAAC,uBAAuB,CAAS;IACxC,OAAO,CAAC,oBAAoB,CAAM;IAClC,OAAO,CAAC,aAAa,CAA+B;gBAExC,OAAO,EAAE,aAAa,EAAE,MAAM,EAAE,oBAAoB;IAc1D,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;YAmBnB,sBAAsB;IA2FpC,OAAO,CAAC,iBAAiB;IAuDzB,OAAO,CAAC,eAAe;YAcT,2BAA2B;IAqCzC,OAAO,CAAC,SAAS;YAcH,yBAAyB;IA8BvC,OAAO,CAAC,kBAAkB;YA2BZ,eAAe;IAyB7B,OAAO,CAAC,QAAQ;YAkCF,gBAAgB;YAchB,iBAAiB;IAwBzB,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAyB3B,QAAQ,IAAI,OAAO;IAInB,uBAAuB,IAAI,MAAM;IAIjC,cAAc,IAAI,OAAO;CAG1B"}
package/dist/audio-capture.d.ts ADDED
@@ -0,0 +1,25 @@
1
+ import { type IAgentRuntime } from "@elizaos/core";
2
+ export interface AudioConfig {
3
+ enabled: boolean;
4
+ transcriptionInterval: number;
5
+ device?: string;
6
+ sampleRate?: number;
7
+ channels?: number;
8
+ }
9
+ export declare class AudioCaptureService {
10
+ private runtime;
11
+ private config;
12
+ private isRecording;
13
+ private recordingInterval;
14
+ constructor(runtime: IAgentRuntime, config: AudioConfig);
15
+ initialize(): Promise<void>;
16
+ private checkAudioTools;
17
+ private startTranscriptionLoop;
18
+ recordAndTranscribe(): Promise<string | null>;
19
+ private recordAudio;
20
+ private createAudioMemory;
21
+ listAudioDevices(): Promise<string[]>;
22
+ isActive(): boolean;
23
+ stop(): Promise<void>;
24
+ }
25
+ //# sourceMappingURL=audio-capture.d.ts.map
package/dist/audio-capture.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"audio-capture.d.ts","sourceRoot":"","sources":["../src/audio-capture.ts"],"names":[],"mappings":"AAIA,OAAO,EACL,KAAK,aAAa,EAInB,MAAM,eAAe,CAAC;AAIvB,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,OAAO,CAAC;IACjB,qBAAqB,EAAE,MAAM,CAAC;IAC9B,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,OAAO,CAAgB;IAC/B,OAAO,CAAC,MAAM,CAAc;IAC5B,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,iBAAiB,CAA+B;gBAE5C,OAAO,EAAE,aAAa,EAAE,MAAM,EAAE,WAAW;IASjD,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;YAgCnB,eAAe;IAgD7B,OAAO,CAAC,sBAAsB;IAexB,mBAAmB,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC;YA4DrC,WAAW;YAqCX,iBAAiB;IAuBzB,gBAAgB,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;IA+D3C,QAAQ,IAAI,OAAO;IAIb,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;CAe5B"}
package/dist/computeruse-ocr-bridge.d.ts ADDED
@@ -0,0 +1,50 @@
1
+ /**
2
+ * Bridge plugin-vision's hierarchical OCR into plugin-computeruse's
3
+ * `CoordOcrProvider` registry seam.
4
+ *
5
+ * plugin-vision owns the OCR implementations; plugin-computeruse's
6
+ * scene-builder + GET_SCREEN want coordinate-aware OCR but must NOT take a
7
+ * hard dependency on plugin-vision (that would create a cycle and force the
8
+ * vision OCR stack onto every computeruse consumer). So plugin-vision
9
+ * registers a bridge into computeruse's seam at boot via a best-effort dynamic
10
+ * import (see `index.ts`).
11
+ *
12
+ * The two interfaces are structurally identical — vision's
13
+ * `OcrWithCoordsService.describe(OcrWithCoordsInput) -> OcrWithCoordsResult`
14
+ * and computeruse's `CoordOcrProvider.describe(CoordOcrInput) -> CoordOcrResult`
15
+ * share field shapes (displayId/sourceX/sourceY/pngBytes in; blocks with
16
+ * bbox+words+semantic_position out) — so the bridge is a thin pass-through.
17
+ * The types live in different packages, so we describe computeruse's side
18
+ * structurally here rather than importing it (keeps the no-hard-dep rule).
19
+ *
20
+ * Pure + injectable so the wiring is unit-testable without a real
21
+ * plugin-computeruse present.
22
+ */
23
+ import { type OcrWithCoordsResult, type OcrWithCoordsService } from "./ocr-with-coords.js";
24
+ /** Structural shape of `@elizaos/plugin-computeruse`'s `CoordOcrInput`. */
25
+ export interface CoordOcrInputLike {
26
+ readonly displayId: string;
27
+ readonly sourceX: number;
28
+ readonly sourceY: number;
29
+ readonly pngBytes: Uint8Array;
30
+ }
31
+ /** Structural shape of `@elizaos/plugin-computeruse`'s `CoordOcrProvider`. */
32
+ export interface CoordOcrProviderLike {
33
+ readonly name: string;
34
+ describe(input: CoordOcrInputLike): Promise<OcrWithCoordsResult>;
35
+ }
36
+ export type RegisterCoordOcrProvider = (provider: CoordOcrProviderLike | null) => void;
37
+ export declare const VISION_COORD_OCR_BRIDGE_NAME = "vision-coord-ocr-bridge";
38
+ /**
39
+ * Build a `CoordOcrProvider`-shaped bridge that delegates to whatever vision
40
+ * `OcrWithCoordsService` is currently registered. Resolving the service lazily
41
+ * (per call) means a later `registerOcrWithCoordsService()` (e.g. swapping in a
42
+ * native Windows.Media.Ocr / Apple Vision provider) is picked up automatically.
43
+ */
44
+ export declare function buildVisionCoordOcrBridge(resolve?: () => OcrWithCoordsService | null): CoordOcrProviderLike;
45
+ /**
46
+ * Register the vision OCR bridge into computeruse's CoordOcrProvider seam.
47
+ * Idempotent (the seam is last-call-wins). Returns true once registered.
48
+ */
49
+ export declare function wireComputerUseOcrBridge(register: RegisterCoordOcrProvider, resolve?: () => OcrWithCoordsService | null): boolean;
50
+ //# sourceMappingURL=computeruse-ocr-bridge.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"computeruse-ocr-bridge.d.ts","sourceRoot":"","sources":["../src/computeruse-ocr-bridge.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAEH,OAAO,EAEL,KAAK,mBAAmB,EACxB,KAAK,oBAAoB,EAC1B,MAAM,sBAAsB,CAAC;AAE9B,2EAA2E;AAC3E,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,QAAQ,EAAE,UAAU,CAAC;CAC/B;AAED,8EAA8E;AAC9E,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,KAAK,EAAE,iBAAiB,GAAG,OAAO,CAAC,mBAAmB,CAAC,CAAC;CAClE;AAED,MAAM,MAAM,wBAAwB,GAAG,CACrC,QAAQ,EAAE,oBAAoB,GAAG,IAAI,KAClC,IAAI,CAAC;AAEV,eAAO,MAAM,4BAA4B,4BAA4B,CAAC;AAEtE;;;;;GAKG;AACH,wBAAgB,yBAAyB,CACvC,OAAO,GAAE,MAAM,oBAAoB,GAAG,IAA8B,GACnE,oBAAoB,CAetB;AAED;;;GAGG;AACH,wBAAgB,wBAAwB,CACtC,QAAQ,EAAE,wBAAwB,EAClC,OAAO,CAAC,EAAE,MAAM,oBAAoB,GAAG,IAAI,GAC1C,OAAO,CAGT"}
@@ -0,0 +1,68 @@
1
+ import { z } from "zod";
2
+ import type { VisionConfig } from "./types";
3
+ export declare const defaultVisionConfig: VisionConfig; // typed as VisionConfig from ./types; the actual default values are not visible in this declaration
4
+ export declare const VisionConfigSchema: z.ZodObject<{ // Zod object schema for the vision configuration; default values themselves are not visible in this declaration
5
+ cameraName: z.ZodOptional<z.ZodString>; // the only optional field; every other field below is wrapped in ZodDefault
6
+ enableCamera: z.ZodDefault<z.ZodBoolean>;
7
+ pixelChangeThreshold: z.ZodDefault<z.ZodNumber>;
8
+ updateInterval: z.ZodDefault<z.ZodNumber>;
9
+ enableObjectDetection: z.ZodDefault<z.ZodBoolean>;
10
+ objectConfidenceThreshold: z.ZodDefault<z.ZodNumber>;
11
+ enablePoseDetection: z.ZodDefault<z.ZodBoolean>;
12
+ poseConfidenceThreshold: z.ZodDefault<z.ZodNumber>;
13
+ tfUpdateInterval: z.ZodDefault<z.ZodNumber>;
14
+ vlmUpdateInterval: z.ZodDefault<z.ZodNumber>;
15
+ tfChangeThreshold: z.ZodDefault<z.ZodNumber>;
16
+ vlmChangeThreshold: z.ZodDefault<z.ZodNumber>;
17
+ visionMode: z.ZodDefault<z.ZodEnum<{ // allowed values: OFF, CAMERA, SCREEN, BOTH
18
+ OFF: "OFF";
19
+ CAMERA: "CAMERA";
20
+ SCREEN: "SCREEN";
21
+ BOTH: "BOTH";
22
+ }>>;
23
+ screenCaptureInterval: z.ZodDefault<z.ZodNumber>;
24
+ tileSize: z.ZodDefault<z.ZodNumber>;
25
+ tileProcessingOrder: z.ZodDefault<z.ZodEnum<{ // allowed values: sequential, priority, random
26
+ sequential: "sequential";
27
+ priority: "priority";
28
+ random: "random";
29
+ }>>;
30
+ maxConcurrentTiles: z.ZodDefault<z.ZodNumber>;
31
+ ocrEnabled: z.ZodDefault<z.ZodBoolean>;
32
+ ocrLanguage: z.ZodDefault<z.ZodString>;
33
+ ocrConfidenceThreshold: z.ZodDefault<z.ZodNumber>;
34
+ enableFaceRecognition: z.ZodDefault<z.ZodBoolean>;
35
+ faceMatchThreshold: z.ZodDefault<z.ZodNumber>;
36
+ maxFaceProfiles: z.ZodDefault<z.ZodNumber>;
37
+ entityTimeout: z.ZodDefault<z.ZodNumber>;
38
+ maxTrackedEntities: z.ZodDefault<z.ZodNumber>;
39
+ enableGPUAcceleration: z.ZodDefault<z.ZodBoolean>;
40
+ maxMemoryUsageMB: z.ZodDefault<z.ZodNumber>;
41
+ debugMode: z.ZodDefault<z.ZodBoolean>;
42
+ logLevel: z.ZodDefault<z.ZodEnum<{ // allowed values: info, error, warn, debug
43
+ info: "info";
44
+ error: "error";
45
+ warn: "warn";
46
+ debug: "debug";
47
+ }>>;
48
+ }, z.core.$strip>; // NOTE(review): $strip is Zod's object-mode marker; presumably unknown keys are dropped on parse — confirm against the Zod version in use
49
+ export type VisionConfigInput = z.input<typeof VisionConfigSchema>; // pre-parse shape of the schema (what callers may pass in)
50
+ export type VisionConfigOutput = z.output<typeof VisionConfigSchema>; // post-parse shape of the schema (what parsing produces)
51
+ interface RuntimeWithSettings { // module-private: the minimal runtime surface ConfigurationManager's constructor accepts
52
+ getSetting(key: string): string | undefined; // string-keyed lookup; may return undefined
53
+ }
54
+ export declare class ConfigurationManager { // holds a private config and the private runtime reference it was constructed with
55
+ private config;
56
+ private runtime;
57
+ constructor(runtime: RuntimeWithSettings); // accepts any object exposing getSetting(key)
58
+ private loadConfiguration; // private members: their signatures are erased in the declaration output
59
+ private getSetting;
60
+ private getBooleanSetting;
61
+ private getNumberSetting;
62
+ private getEnumSetting;
63
+ get(): VisionConfigOutput; // returns a config in the schema's post-parse (output) shape
64
+ update(updates: Partial<VisionConfigInput>): void; // accepts a partial config in the schema's pre-parse (input) shape; whether it re-validates is not visible here
65
+ static getPreset(name: string): Partial<VisionConfigInput>; // static; returns a partial config for the given preset name — the available names are not visible here
66
+ }
67
+ export {};
68
+ //# sourceMappingURL=config.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"config.d.ts","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,KAAK,EAAE,YAAY,EAAc,MAAM,SAAS,CAAC;AAExD,eAAO,MAAM,mBAAmB,EAAE,YAcjC,CAAC;AAEF,eAAO,MAAM,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBAgC7B,CAAC;AAEH,MAAM,MAAM,iBAAiB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,kBAAkB,CAAC,CAAC;AACnE,MAAM,MAAM,kBAAkB,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,kBAAkB,CAAC,CAAC;AAErE,UAAU,mBAAmB;IAC3B,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAAC;CAC7C;AAED,qBAAa,oBAAoB;IAC/B,OAAO,CAAC,MAAM,CAAqB;IACnC,OAAO,CAAC,OAAO,CAAsB;gBAEzB,OAAO,EAAE,mBAAmB;IAKxC,OAAO,CAAC,iBAAiB;IAiGzB,OAAO,CAAC,UAAU;IAOlB,OAAO,CAAC,iBAAiB;IAQzB,OAAO,CAAC,gBAAgB;IASxB,OAAO,CAAC,cAAc;IAetB,GAAG,IAAI,kBAAkB;IAIzB,MAAM,CAAC,OAAO,EAAE,OAAO,CAAC,iBAAiB,CAAC,GAAG,IAAI;IAejD,MAAM,CAAC,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,iBAAiB,CAAC;CAmC3D"}