@elizaos/plugin-vision 2.0.0-alpha.9 → 2.0.3-beta.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +112 -0
- package/auto-enable.ts +29 -0
- package/dist/action.d.ts +3 -0
- package/dist/action.d.ts.map +1 -0
- package/dist/audio-capture-stream.d.ts +42 -0
- package/dist/audio-capture-stream.d.ts.map +1 -0
- package/dist/audio-capture.d.ts +25 -0
- package/dist/audio-capture.d.ts.map +1 -0
- package/dist/computeruse-ocr-bridge.d.ts +50 -0
- package/dist/computeruse-ocr-bridge.d.ts.map +1 -0
- package/dist/config.d.ts +68 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/describe-backpressure.d.ts +90 -0
- package/dist/describe-backpressure.d.ts.map +1 -0
- package/dist/dirty-tile-describer.d.ts +102 -0
- package/dist/dirty-tile-describer.d.ts.map +1 -0
- package/dist/dirty-tile-scene.d.ts +56 -0
- package/dist/dirty-tile-scene.d.ts.map +1 -0
- package/dist/entity-tracker.d.ts +33 -0
- package/dist/entity-tracker.d.ts.map +1 -0
- package/dist/face-detector-ggml.d.ts +60 -0
- package/dist/face-detector-ggml.d.ts.map +1 -0
- package/dist/face-detector-mediapipe.d.ts +25 -0
- package/dist/face-detector-mediapipe.d.ts.map +1 -0
- package/dist/face-recognition-ggml.d.ts +94 -0
- package/dist/face-recognition-ggml.d.ts.map +1 -0
- package/dist/get-screen-elements.d.ts +90 -0
- package/dist/get-screen-elements.d.ts.map +1 -0
- package/dist/get-screen.d.ts +60 -0
- package/dist/get-screen.d.ts.map +1 -0
- package/dist/image/sharp-compat.d.ts +89 -0
- package/dist/image/sharp-compat.d.ts.map +1 -0
- package/dist/image-input.d.ts +15 -0
- package/dist/image-input.d.ts.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +7992 -6026
- package/dist/index.js.map +42 -26
- package/dist/lifecycle.d.ts +94 -0
- package/dist/lifecycle.d.ts.map +1 -0
- package/dist/mobile/capacitor-camera.d.ts +85 -0
- package/dist/mobile/capacitor-camera.d.ts.map +1 -0
- package/dist/native/doctr-ffi.d.ts +40 -0
- package/dist/native/doctr-ffi.d.ts.map +1 -0
- package/dist/native/yolo-ffi.d.ts +21 -0
- package/dist/native/yolo-ffi.d.ts.map +1 -0
- package/dist/ocr-host-windows.d.ts +34 -0
- package/dist/ocr-host-windows.d.ts.map +1 -0
- package/dist/ocr-service-apple-vision-macos.d.ts +51 -0
- package/dist/ocr-service-apple-vision-macos.d.ts.map +1 -0
- package/dist/ocr-service-doctr.d.ts +61 -0
- package/dist/ocr-service-doctr.d.ts.map +1 -0
- package/dist/ocr-service-linux-tesseract.d.ts +85 -0
- package/dist/ocr-service-linux-tesseract.d.ts.map +1 -0
- package/dist/ocr-service-paddleocr.d.ts +59 -0
- package/dist/ocr-service-paddleocr.d.ts.map +1 -0
- package/dist/ocr-service-windows.d.ts +41 -0
- package/dist/ocr-service-windows.d.ts.map +1 -0
- package/dist/ocr-service.d.ts +91 -0
- package/dist/ocr-service.d.ts.map +1 -0
- package/dist/ocr-with-coords.d.ts +103 -0
- package/dist/ocr-with-coords.d.ts.map +1 -0
- package/dist/person-detector.d.ts +17 -0
- package/dist/person-detector.d.ts.map +1 -0
- package/dist/provider.d.ts +3 -0
- package/dist/provider.d.ts.map +1 -0
- package/dist/routes.d.ts +7 -0
- package/dist/routes.d.ts.map +1 -0
- package/dist/screen-capture-bridge.d.ts +51 -0
- package/dist/screen-capture-bridge.d.ts.map +1 -0
- package/dist/screen-capture.d.ts +17 -0
- package/dist/screen-capture.d.ts.map +1 -0
- package/dist/screen-tiler.d.ts +75 -0
- package/dist/screen-tiler.d.ts.map +1 -0
- package/dist/service.d.ts +176 -0
- package/dist/service.d.ts.map +1 -0
- package/dist/set-of-marks-provider.d.ts +64 -0
- package/dist/set-of-marks-provider.d.ts.map +1 -0
- package/dist/som.d.ts +135 -0
- package/dist/som.d.ts.map +1 -0
- package/dist/som.js +184 -0
- package/dist/som.js.map +11 -0
- package/dist/test-input.d.ts +25 -0
- package/dist/test-input.d.ts.map +1 -0
- package/dist/types.d.ts +241 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/vision-context-augmenter.d.ts +93 -0
- package/dist/vision-context-augmenter.d.ts.map +1 -0
- package/dist/vision-worker-manager.d.ts +51 -0
- package/dist/vision-worker-manager.d.ts.map +1 -0
- package/dist/workers/ocr-worker.d.ts +2 -0
- package/dist/workers/ocr-worker.d.ts.map +1 -0
- package/dist/workers/ocr-worker.js +1066 -121865
- package/dist/workers/ocr-worker.js.map +10 -130
- package/dist/workers/screen-capture-worker.d.ts +2 -0
- package/dist/workers/screen-capture-worker.d.ts.map +1 -0
- package/dist/workers/screen-capture-worker.js +371 -8
- package/dist/workers/screen-capture-worker.js.map +5 -4
- package/dist/workers/worker-logger.d.ts +10 -0
- package/dist/workers/worker-logger.d.ts.map +1 -0
- package/dist/yolo-detector.d.ts +37 -0
- package/dist/yolo-detector.d.ts.map +1 -0
- package/native/doctr.cpp/CMakeLists.txt +58 -0
- package/native/doctr.cpp/README.md +62 -0
- package/native/doctr.cpp/include/doctr.h +91 -0
- package/native/doctr.cpp/scripts/convert.py +98 -0
- package/native/doctr.cpp/src/doctr_det.cpp +112 -0
- package/native/doctr.cpp/src/doctr_rec.cpp +103 -0
- package/native/macos-vision-ocr.swift +113 -0
- package/native/mobilefacenet.cpp/README.md +13 -0
- package/native/movenet.cpp/README.md +10 -0
- package/native/retinaface.cpp/README.md +12 -0
- package/native/yolo.cpp/CMakeLists.txt +57 -0
- package/native/yolo.cpp/README.md +64 -0
- package/native/yolo.cpp/build.mjs +76 -0
- package/native/yolo.cpp/include/yolo.h +62 -0
- package/native/yolo.cpp/scripts/convert.py +248 -0
- package/native/yolo.cpp/src/yolo.cpp +425 -0
- package/native/yolo.cpp/verify/compare.py +99 -0
- package/native/yolo.cpp/verify/make_ref.py +75 -0
- package/native/yolo.cpp/verify/run_ggml.mjs +78 -0
- package/native/yolo.cpp/verify/run_ts.mjs +26 -0
- package/package.json +50 -24
- package/registry-entry.json +43 -0
- package/scripts/vendor-tesseract-linux.mjs +177 -0
- package/build.config.ts +0 -70
- package/dist/workers/florence2-worker.js +0 -114850
- package/dist/workers/florence2-worker.js.map +0 -92
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Shaw Walters and elizaOS Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# @elizaos/plugin-vision
|
|
2
|
+
|
|
3
|
+
Visual perception plugin for elizaOS — gives Eliza agents real-time awareness of their camera feed and/or screen through scene analysis, object/person detection, OCR, face recognition, and entity tracking.
|
|
4
|
+
|
|
5
|
+
## What it does
|
|
6
|
+
|
|
7
|
+
- Captures frames from a connected camera (macOS/Linux/Windows) or the host screen.
|
|
8
|
+
- Describes scenes by routing images through `runtime.useModel(IMAGE_DESCRIPTION)` — compatible with any registered VLM (local or cloud).
|
|
9
|
+
- Detects and tracks people, objects, and faces across frames with persistent entity IDs.
|
|
10
|
+
- Reads text on screen through the generic Apple Vision/doCTR OCR service and the coordinate-aware OCR registry used by computeruse: Windows.Media.Ocr on Windows, Tesseract on Linux when available, and the RapidOCR adapter as the portable fallback.
|
|
11
|
+
- Exposes all capabilities through a single `VISION` action and a `VISION_PERCEPTION` context provider.
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npm install @elizaos/plugin-vision
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### Platform camera tools (required for camera mode)
|
|
20
|
+
|
|
21
|
+
| Platform | Tool |
|
|
22
|
+
|----------|------|
|
|
23
|
+
| macOS | `brew install imagesnap` |
|
|
24
|
+
| Linux | `sudo apt-get install fswebcam` |
|
|
25
|
+
| Windows | Install ffmpeg and add to PATH |
|
|
26
|
+
|
|
27
|
+
Screen capture and OCR work without these tools.
|
|
28
|
+
|
|
29
|
+
## Enabling the plugin
|
|
30
|
+
|
|
31
|
+
Add it to your character's plugin list:
|
|
32
|
+
|
|
33
|
+
```json
|
|
34
|
+
{
|
|
35
|
+
"name": "MyAgent",
|
|
36
|
+
"plugins": ["@elizaos/plugin-vision"],
|
|
37
|
+
"settings": {
|
|
38
|
+
"CAMERA_NAME": "obsbot",
|
|
39
|
+
"VISION_MODE": "CAMERA"
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
The plugin auto-enables when `config.features.vision` is `true` (or an object that does not set `enabled: false`), or when `config.media.vision.provider` is a non-empty string.
|
|
45
|
+
|
|
46
|
+
## Configuration
|
|
47
|
+
|
|
48
|
+
| Setting | Default | Description |
|
|
49
|
+
|---------|---------|-------------|
|
|
50
|
+
| `CAMERA_NAME` | auto | Partial name match for camera device selection (case-insensitive) |
|
|
51
|
+
| `VISION_MODE` | `CAMERA` | `OFF` / `CAMERA` / `SCREEN` / `BOTH` |
|
|
52
|
+
| `PIXEL_CHANGE_THRESHOLD` | `50` | % pixel change required before triggering a VLM scene update |
|
|
53
|
+
| `VLM_UPDATE_INTERVAL` | `10000` | ms between VLM scene-describe calls |
|
|
54
|
+
| `SCREEN_CAPTURE_INTERVAL` | `2000` | ms between screen captures |
|
|
55
|
+
| `OCR_ENABLED` | `true` | Enable OCR on screen tiles |
|
|
56
|
+
| `ENABLE_OBJECT_DETECTION` | `false` | ggml YOLOv8n object detection (`native/yolo.cpp`) |
|
|
57
|
+
| `ENABLE_POSE_DETECTION` | `false` | Heuristic person detection (ggml pose pending) |
|
|
58
|
+
| `ENABLE_FACE_RECOGNITION` | `false` | Native ggml face recognition (BlazeFace + 128-d embed via `native/face-cpp`) |
|
|
59
|
+
| `ENTITY_TIMEOUT` | `30000` | ms before an inactive entity is evicted from tracking |
|
|
60
|
+
|
|
61
|
+
All settings can also be prefixed with `VISION_` (e.g. `VISION_CAMERA_NAME`).
|
|
62
|
+
|
|
63
|
+
## Actions
|
|
64
|
+
|
|
65
|
+
The plugin registers a single `VISION` action that routes to one of these sub-operations based on an explicit `action` parameter or natural-language inference:
|
|
66
|
+
|
|
67
|
+
| Sub-operation | Trigger examples | What it does |
|
|
68
|
+
|--------------|-----------------|-------------|
|
|
69
|
+
| `describe` | "what do you see?", "describe the scene" | Returns the current VLM scene description |
|
|
70
|
+
| `capture` | "take a photo", "screenshot" | Captures a frame and returns it as a base64 image attachment |
|
|
71
|
+
| `set_mode` | "set vision mode to screen" | Switches between `OFF`, `CAMERA`, `SCREEN`, `BOTH` |
|
|
72
|
+
| `enable_camera` / `disable_camera` | "turn on the camera" | Toggles camera input |
|
|
73
|
+
| `enable_screen` / `disable_screen` | "enable screen capture" | Toggles screen input |
|
|
74
|
+
| `name_entity` | "the person is named Alice" | Assigns a display name to the most prominent tracked entity |
|
|
75
|
+
| `identify_person` | "who is that?" | Lists tracked people with names and presence duration |
|
|
76
|
+
| `track_entity` | "track the person in the red shirt" | Refreshes entity tracking and reports statistics |
|
|
77
|
+
|
|
78
|
+
## Vision Provider
|
|
79
|
+
|
|
80
|
+
`VISION_PERCEPTION` is injected into agent context during turns in the `media` and `browser` contexts. It provides:
|
|
81
|
+
|
|
82
|
+
- Current scene description text
|
|
83
|
+
- Camera / screen connection status and mode
|
|
84
|
+
- Detected people (count, poses, facings)
|
|
85
|
+
- Detected objects (types)
|
|
86
|
+
- Active tracked entities with duration
|
|
87
|
+
- Recently-departed entities
|
|
88
|
+
- Screen tile OCR text and UI element list (when screen mode is active)
|
|
89
|
+
|
|
90
|
+
## Detection backends
|
|
91
|
+
|
|
92
|
+
| Capability | Default backend | Optional / alternative |
|
|
93
|
+
|-----------|-----------------|----------------------|
|
|
94
|
+
| Scene description | VLM via `runtime.useModel(IMAGE_DESCRIPTION)` | Any registered IMAGE_DESCRIPTION provider |
|
|
95
|
+
| Object detection | YOLOv8n ggml via `native/yolo.cpp` (`src/yolo-detector.ts`); build with `bun run build:native` + `bun run build:weights`. Service degrades to motion/heuristic + VLM when the lib/GGUF are absent. | — (TensorFlow.js path removed) |
|
|
96
|
+
| Pose detection | Heuristic person detection (motion-derived) | Planned ggml MoveNet port |
|
|
97
|
+
| OCR | Generic OCR uses Apple Vision (darwin, when a provider is registered) → doCTR ggml (`native/doctr.cpp`). Coordinate OCR for computeruse prefers Windows.Media.Ocr (Windows) → Tesseract CLI or vendored bundle (Linux) → RapidOCR adapter. | Native/mobile bridges can register platform OCR providers; no ONNX OCR path. |
|
|
98
|
+
| Set-of-Marks grounding | `src/som.ts` fuses GGUF YOLO icon boxes + OCR text boxes into a deduplicated, 1-indexed numbered set (icon-over-text suppression + NMS) and renders a numbered-overlay PNG via `sharp`. `src/set-of-marks-provider.ts` registers it into plugin-computeruse's `detect_elements` seam at boot (best-effort; degrades to text-only marks when the GGUF detector is absent). | trycua/cua OmniParser parity (#9170 M9) |
|
|
99
|
+
| Face recognition | Native ggml BlazeFace + 128-d embed (`face-detector-ggml.ts`, `face-recognition-ggml.ts`, `native/face-cpp`); disabled until the lib/GGUF artifacts land. No tfjs/face-api.js path. | MediaPipe BlazeFace migration shim is deprecated. |
|
|
100
|
+
|
|
101
|
+
## Platform notes
|
|
102
|
+
|
|
103
|
+
- **Node.js only.** Mobile (iOS, Android) registers a `MobileCameraSource` (`src/mobile/capacitor-camera.ts`) bridged by plugin-ios / plugin-aosp.
|
|
104
|
+
- **Camera tools** (`imagesnap` / `fswebcam` / `ffmpeg`) are required for camera mode; screen capture and OCR work without them.
|
|
105
|
+
- **Native detectors and OCR** (`native/yolo.cpp`, `native/doctr.cpp`, and the coordinate-OCR providers) run through the available host backend. YOLO/doCTR require compiled libraries and GGUF artifacts; Tesseract requires a binary plus traineddata resolved from the vendored bundle or PATH.
|
|
106
|
+
|
|
107
|
+
## Privacy
|
|
108
|
+
|
|
109
|
+
- Camera access requires OS-level permissions.
|
|
110
|
+
- No frames are written to disk by default.
|
|
111
|
+
- All inference runs locally unless a remote IMAGE_DESCRIPTION provider is registered.
|
|
112
|
+
- Consider access implications before enabling in shared or sensitive environments.
|
package/auto-enable.ts
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
// Auto-enable check for @elizaos/plugin-vision.
|
|
2
|
+
//
|
|
3
|
+
// Plugin manifest entry-point — referenced by package.json's
|
|
4
|
+
// `elizaos.plugin.autoEnableModule`. Keep this module light: env reads only,
|
|
5
|
+
// no service init, no transitive imports of the full plugin runtime. The
|
|
6
|
+
// auto-enable engine loads dozens of these per boot.
|
|
7
|
+
import type { PluginAutoEnableContext } from "@elizaos/core";
|
|
8
|
+
|
|
9
|
+
/**
 * Reports whether the feature flag `key` under `config.features` is switched on.
 *
 * A flag counts as enabled when its value is exactly `true`, or when it is an
 * object whose `enabled` property is anything other than `false` (so `{}` and
 * `{ enabled: true }` both enable it). Every other value — missing, `false`,
 * strings, numbers — yields `false`.
 *
 * NOTE(review): the trailing `f !== null` test is redundant; the leading
 * truthiness check on `f` already rules out `null`.
 *
 * @param config - Auto-enable config; may be nullish (it is read via `?.`).
 * @param key - Name of the entry to look up in `config.features`.
 * @returns `true` when the flag is enabled as described above.
 */
function isFeatureEnabled(
|
|
10
|
+
config: PluginAutoEnableContext["config"],
|
|
11
|
+
key: string,
|
|
12
|
+
): boolean {
|
|
13
|
+
const f = (config?.features as Record<string, unknown> | undefined)?.[key];
|
|
14
|
+
if (f === true) return true;
|
|
15
|
+
if (f && typeof f === "object" && f !== null) {
|
|
16
|
+
return (f as Record<string, unknown>).enabled !== false;
|
|
17
|
+
}
|
|
18
|
+
return false;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
/**
|
|
22
|
+
* Enable when `config.features.vision` is `true` or an object whose `enabled`
|
|
23
|
+
* is not `false`, or when `config.media.vision.provider` is a non-empty string.
|
|
24
|
+
*/
|
|
25
|
+
export function shouldEnable(ctx: PluginAutoEnableContext): boolean {
|
|
26
|
+
if (isFeatureEnabled(ctx.config, "vision")) return true;
|
|
27
|
+
const visionProvider = ctx.config?.media?.vision?.provider;
|
|
28
|
+
return typeof visionProvider === "string" && visionProvider.length > 0;
|
|
29
|
+
}
|
package/dist/action.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"action.d.ts","sourceRoot":"","sources":["../src/action.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,KAAK,MAAM,EAYZ,MAAM,eAAe,CAAC;AA29CvB,eAAO,MAAM,YAAY,EAAE,MAmP1B,CAAC"}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { EventEmitter } from "node:events";
|
|
2
|
+
import { type IAgentRuntime } from "@elizaos/core";
|
|
3
|
+
export interface StreamingAudioConfig {
|
|
4
|
+
enabled: boolean;
|
|
5
|
+
device?: string;
|
|
6
|
+
sampleRate?: number;
|
|
7
|
+
channels?: number;
|
|
8
|
+
vadThreshold?: number;
|
|
9
|
+
silenceTimeout?: number;
|
|
10
|
+
responseDelay?: number;
|
|
11
|
+
chunkSize?: number;
|
|
12
|
+
}
|
|
13
|
+
export declare class StreamingAudioCaptureService extends EventEmitter {
|
|
14
|
+
private runtime;
|
|
15
|
+
private config;
|
|
16
|
+
private captureProcess;
|
|
17
|
+
private isCapturing;
|
|
18
|
+
private audioBuffer;
|
|
19
|
+
private isSpeaking;
|
|
20
|
+
private silenceTimer;
|
|
21
|
+
private transcriptionInProgress;
|
|
22
|
+
private currentTranscription;
|
|
23
|
+
private responseTimer;
|
|
24
|
+
constructor(runtime: IAgentRuntime, config: StreamingAudioConfig);
|
|
25
|
+
initialize(): Promise<void>;
|
|
26
|
+
private startContinuousCapture;
|
|
27
|
+
private processAudioChunk;
|
|
28
|
+
private calculateEnergy;
|
|
29
|
+
private startStreamingTranscription;
|
|
30
|
+
private endSpeech;
|
|
31
|
+
private processFinalTranscription;
|
|
32
|
+
private getRecentAudioData;
|
|
33
|
+
private transcribeAudio;
|
|
34
|
+
private rawToWav;
|
|
35
|
+
private generateResponse;
|
|
36
|
+
private createAudioMemory;
|
|
37
|
+
stop(): Promise<void>;
|
|
38
|
+
isActive(): boolean;
|
|
39
|
+
getCurrentTranscription(): string;
|
|
40
|
+
isSpeechActive(): boolean;
|
|
41
|
+
}
|
|
42
|
+
//# sourceMappingURL=audio-capture-stream.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"audio-capture-stream.d.ts","sourceRoot":"","sources":["../src/audio-capture-stream.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EACL,KAAK,aAAa,EAInB,MAAM,eAAe,CAAC;AAEvB,MAAM,WAAW,oBAAoB;IACnC,OAAO,EAAE,OAAO,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAQD,qBAAa,4BAA6B,SAAQ,YAAY;IAC5D,OAAO,CAAC,OAAO,CAAgB;IAC/B,OAAO,CAAC,MAAM,CAAuB;IACrC,OAAO,CAAC,cAAc,CAA6B;IACnD,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,WAAW,CAAoB;IACvC,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,YAAY,CAA+B;IACnD,OAAO,CAAC,uBAAuB,CAAS;IACxC,OAAO,CAAC,oBAAoB,CAAM;IAClC,OAAO,CAAC,aAAa,CAA+B;gBAExC,OAAO,EAAE,aAAa,EAAE,MAAM,EAAE,oBAAoB;IAc1D,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;YAmBnB,sBAAsB;IA2FpC,OAAO,CAAC,iBAAiB;IAuDzB,OAAO,CAAC,eAAe;YAcT,2BAA2B;IAqCzC,OAAO,CAAC,SAAS;YAcH,yBAAyB;IA8BvC,OAAO,CAAC,kBAAkB;YA2BZ,eAAe;IAyB7B,OAAO,CAAC,QAAQ;YAkCF,gBAAgB;YAchB,iBAAiB;IAwBzB,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAyB3B,QAAQ,IAAI,OAAO;IAInB,uBAAuB,IAAI,MAAM;IAIjC,cAAc,IAAI,OAAO;CAG1B"}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import { type IAgentRuntime } from "@elizaos/core";
|
|
2
|
+
export interface AudioConfig {
|
|
3
|
+
enabled: boolean;
|
|
4
|
+
transcriptionInterval: number;
|
|
5
|
+
device?: string;
|
|
6
|
+
sampleRate?: number;
|
|
7
|
+
channels?: number;
|
|
8
|
+
}
|
|
9
|
+
export declare class AudioCaptureService {
|
|
10
|
+
private runtime;
|
|
11
|
+
private config;
|
|
12
|
+
private isRecording;
|
|
13
|
+
private recordingInterval;
|
|
14
|
+
constructor(runtime: IAgentRuntime, config: AudioConfig);
|
|
15
|
+
initialize(): Promise<void>;
|
|
16
|
+
private checkAudioTools;
|
|
17
|
+
private startTranscriptionLoop;
|
|
18
|
+
recordAndTranscribe(): Promise<string | null>;
|
|
19
|
+
private recordAudio;
|
|
20
|
+
private createAudioMemory;
|
|
21
|
+
listAudioDevices(): Promise<string[]>;
|
|
22
|
+
isActive(): boolean;
|
|
23
|
+
stop(): Promise<void>;
|
|
24
|
+
}
|
|
25
|
+
//# sourceMappingURL=audio-capture.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"audio-capture.d.ts","sourceRoot":"","sources":["../src/audio-capture.ts"],"names":[],"mappings":"AAIA,OAAO,EACL,KAAK,aAAa,EAInB,MAAM,eAAe,CAAC;AAIvB,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,OAAO,CAAC;IACjB,qBAAqB,EAAE,MAAM,CAAC;IAC9B,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,OAAO,CAAgB;IAC/B,OAAO,CAAC,MAAM,CAAc;IAC5B,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,iBAAiB,CAA+B;gBAE5C,OAAO,EAAE,aAAa,EAAE,MAAM,EAAE,WAAW;IASjD,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;YAgCnB,eAAe;IAgD7B,OAAO,CAAC,sBAAsB;IAexB,mBAAmB,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC;YA4DrC,WAAW;YAqCX,iBAAiB;IAuBzB,gBAAgB,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;IA+D3C,QAAQ,IAAI,OAAO;IAIb,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;CAe5B"}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bridge plugin-vision's hierarchical OCR into plugin-computeruse's
|
|
3
|
+
* `CoordOcrProvider` registry seam.
|
|
4
|
+
*
|
|
5
|
+
* plugin-vision owns the OCR implementations; plugin-computeruse's
|
|
6
|
+
* scene-builder + GET_SCREEN want coordinate-aware OCR but must NOT take a
|
|
7
|
+
* hard dependency on plugin-vision (that would create a cycle and force the
|
|
8
|
+
* vision OCR stack onto every computeruse consumer). So plugin-vision
|
|
9
|
+
* registers a bridge into computeruse's seam at boot via a best-effort dynamic
|
|
10
|
+
* import (see `index.ts`).
|
|
11
|
+
*
|
|
12
|
+
* The two interfaces are structurally identical — vision's
|
|
13
|
+
* `OcrWithCoordsService.describe(OcrWithCoordsInput) -> OcrWithCoordsResult`
|
|
14
|
+
* and computeruse's `CoordOcrProvider.describe(CoordOcrInput) -> CoordOcrResult`
|
|
15
|
+
* share field shapes (displayId/sourceX/sourceY/pngBytes in; blocks with
|
|
16
|
+
* bbox+words+semantic_position out) — so the bridge is a thin pass-through.
|
|
17
|
+
* The types live in different packages, so we describe computeruse's side
|
|
18
|
+
* structurally here rather than importing it (keeps the no-hard-dep rule).
|
|
19
|
+
*
|
|
20
|
+
* Pure + injectable so the wiring is unit-testable without a real
|
|
21
|
+
* plugin-computeruse present.
|
|
22
|
+
*/
|
|
23
|
+
import { type OcrWithCoordsResult, type OcrWithCoordsService } from "./ocr-with-coords.js";
|
|
24
|
+
/** Structural shape of `@elizaos/plugin-computeruse`'s `CoordOcrInput`. */
|
|
25
|
+
export interface CoordOcrInputLike {
|
|
26
|
+
readonly displayId: string;
|
|
27
|
+
readonly sourceX: number;
|
|
28
|
+
readonly sourceY: number;
|
|
29
|
+
readonly pngBytes: Uint8Array;
|
|
30
|
+
}
|
|
31
|
+
/** Structural shape of `@elizaos/plugin-computeruse`'s `CoordOcrProvider`. */
|
|
32
|
+
export interface CoordOcrProviderLike {
|
|
33
|
+
readonly name: string;
|
|
34
|
+
describe(input: CoordOcrInputLike): Promise<OcrWithCoordsResult>;
|
|
35
|
+
}
|
|
36
|
+
export type RegisterCoordOcrProvider = (provider: CoordOcrProviderLike | null) => void;
|
|
37
|
+
export declare const VISION_COORD_OCR_BRIDGE_NAME = "vision-coord-ocr-bridge";
|
|
38
|
+
/**
|
|
39
|
+
* Build a `CoordOcrProvider`-shaped bridge that delegates to whatever vision
|
|
40
|
+
* `OcrWithCoordsService` is currently registered. Resolving the service lazily
|
|
41
|
+
* (per call) means a later `registerOcrWithCoordsService()` (e.g. swapping in a
|
|
42
|
+
* native Windows.Media.Ocr / Apple Vision provider) is picked up automatically.
|
|
43
|
+
*/
|
|
44
|
+
export declare function buildVisionCoordOcrBridge(resolve?: () => OcrWithCoordsService | null): CoordOcrProviderLike;
|
|
45
|
+
/**
|
|
46
|
+
* Register the vision OCR bridge into computeruse's CoordOcrProvider seam.
|
|
47
|
+
* Idempotent (the seam is last-call-wins). Returns true once registered.
|
|
48
|
+
*/
|
|
49
|
+
export declare function wireComputerUseOcrBridge(register: RegisterCoordOcrProvider, resolve?: () => OcrWithCoordsService | null): boolean;
|
|
50
|
+
//# sourceMappingURL=computeruse-ocr-bridge.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"computeruse-ocr-bridge.d.ts","sourceRoot":"","sources":["../src/computeruse-ocr-bridge.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAEH,OAAO,EAEL,KAAK,mBAAmB,EACxB,KAAK,oBAAoB,EAC1B,MAAM,sBAAsB,CAAC;AAE9B,2EAA2E;AAC3E,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,QAAQ,EAAE,UAAU,CAAC;CAC/B;AAED,8EAA8E;AAC9E,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,KAAK,EAAE,iBAAiB,GAAG,OAAO,CAAC,mBAAmB,CAAC,CAAC;CAClE;AAED,MAAM,MAAM,wBAAwB,GAAG,CACrC,QAAQ,EAAE,oBAAoB,GAAG,IAAI,KAClC,IAAI,CAAC;AAEV,eAAO,MAAM,4BAA4B,4BAA4B,CAAC;AAEtE;;;;;GAKG;AACH,wBAAgB,yBAAyB,CACvC,OAAO,GAAE,MAAM,oBAAoB,GAAG,IAA8B,GACnE,oBAAoB,CAetB;AAED;;;GAGG;AACH,wBAAgB,wBAAwB,CACtC,QAAQ,EAAE,wBAAwB,EAClC,OAAO,CAAC,EAAE,MAAM,oBAAoB,GAAG,IAAI,GAC1C,OAAO,CAGT"}
|
package/dist/config.d.ts
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import type { VisionConfig } from "./types";
|
|
3
|
+
export declare const defaultVisionConfig: VisionConfig;
|
|
4
|
+
export declare const VisionConfigSchema: z.ZodObject<{
|
|
5
|
+
cameraName: z.ZodOptional<z.ZodString>;
|
|
6
|
+
enableCamera: z.ZodDefault<z.ZodBoolean>;
|
|
7
|
+
pixelChangeThreshold: z.ZodDefault<z.ZodNumber>;
|
|
8
|
+
updateInterval: z.ZodDefault<z.ZodNumber>;
|
|
9
|
+
enableObjectDetection: z.ZodDefault<z.ZodBoolean>;
|
|
10
|
+
objectConfidenceThreshold: z.ZodDefault<z.ZodNumber>;
|
|
11
|
+
enablePoseDetection: z.ZodDefault<z.ZodBoolean>;
|
|
12
|
+
poseConfidenceThreshold: z.ZodDefault<z.ZodNumber>;
|
|
13
|
+
tfUpdateInterval: z.ZodDefault<z.ZodNumber>;
|
|
14
|
+
vlmUpdateInterval: z.ZodDefault<z.ZodNumber>;
|
|
15
|
+
tfChangeThreshold: z.ZodDefault<z.ZodNumber>;
|
|
16
|
+
vlmChangeThreshold: z.ZodDefault<z.ZodNumber>;
|
|
17
|
+
visionMode: z.ZodDefault<z.ZodEnum<{
|
|
18
|
+
OFF: "OFF";
|
|
19
|
+
CAMERA: "CAMERA";
|
|
20
|
+
SCREEN: "SCREEN";
|
|
21
|
+
BOTH: "BOTH";
|
|
22
|
+
}>>;
|
|
23
|
+
screenCaptureInterval: z.ZodDefault<z.ZodNumber>;
|
|
24
|
+
tileSize: z.ZodDefault<z.ZodNumber>;
|
|
25
|
+
tileProcessingOrder: z.ZodDefault<z.ZodEnum<{
|
|
26
|
+
sequential: "sequential";
|
|
27
|
+
priority: "priority";
|
|
28
|
+
random: "random";
|
|
29
|
+
}>>;
|
|
30
|
+
maxConcurrentTiles: z.ZodDefault<z.ZodNumber>;
|
|
31
|
+
ocrEnabled: z.ZodDefault<z.ZodBoolean>;
|
|
32
|
+
ocrLanguage: z.ZodDefault<z.ZodString>;
|
|
33
|
+
ocrConfidenceThreshold: z.ZodDefault<z.ZodNumber>;
|
|
34
|
+
enableFaceRecognition: z.ZodDefault<z.ZodBoolean>;
|
|
35
|
+
faceMatchThreshold: z.ZodDefault<z.ZodNumber>;
|
|
36
|
+
maxFaceProfiles: z.ZodDefault<z.ZodNumber>;
|
|
37
|
+
entityTimeout: z.ZodDefault<z.ZodNumber>;
|
|
38
|
+
maxTrackedEntities: z.ZodDefault<z.ZodNumber>;
|
|
39
|
+
enableGPUAcceleration: z.ZodDefault<z.ZodBoolean>;
|
|
40
|
+
maxMemoryUsageMB: z.ZodDefault<z.ZodNumber>;
|
|
41
|
+
debugMode: z.ZodDefault<z.ZodBoolean>;
|
|
42
|
+
logLevel: z.ZodDefault<z.ZodEnum<{
|
|
43
|
+
info: "info";
|
|
44
|
+
error: "error";
|
|
45
|
+
warn: "warn";
|
|
46
|
+
debug: "debug";
|
|
47
|
+
}>>;
|
|
48
|
+
}, z.core.$strip>;
|
|
49
|
+
export type VisionConfigInput = z.input<typeof VisionConfigSchema>;
|
|
50
|
+
export type VisionConfigOutput = z.output<typeof VisionConfigSchema>;
|
|
51
|
+
interface RuntimeWithSettings {
|
|
52
|
+
getSetting(key: string): string | undefined;
|
|
53
|
+
}
|
|
54
|
+
export declare class ConfigurationManager {
|
|
55
|
+
private config;
|
|
56
|
+
private runtime;
|
|
57
|
+
constructor(runtime: RuntimeWithSettings);
|
|
58
|
+
private loadConfiguration;
|
|
59
|
+
private getSetting;
|
|
60
|
+
private getBooleanSetting;
|
|
61
|
+
private getNumberSetting;
|
|
62
|
+
private getEnumSetting;
|
|
63
|
+
get(): VisionConfigOutput;
|
|
64
|
+
update(updates: Partial<VisionConfigInput>): void;
|
|
65
|
+
static getPreset(name: string): Partial<VisionConfigInput>;
|
|
66
|
+
}
|
|
67
|
+
export {};
|
|
68
|
+
//# sourceMappingURL=config.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.d.ts","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,KAAK,EAAE,YAAY,EAAc,MAAM,SAAS,CAAC;AAExD,eAAO,MAAM,mBAAmB,EAAE,YAcjC,CAAC;AAEF,eAAO,MAAM,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBAgC7B,CAAC;AAEH,MAAM,MAAM,iBAAiB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,kBAAkB,CAAC,CAAC;AACnE,MAAM,MAAM,kBAAkB,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,kBAAkB,CAAC,CAAC;AAErE,UAAU,mBAAmB;IAC3B,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAAC;CAC7C;AAED,qBAAa,oBAAoB;IAC/B,OAAO,CAAC,MAAM,CAAqB;IACnC,OAAO,CAAC,OAAO,CAAsB;gBAEzB,OAAO,EAAE,mBAAmB;IAKxC,OAAO,CAAC,iBAAiB;IAiGzB,OAAO,CAAC,UAAU;IAOlB,OAAO,CAAC,iBAAiB;IAQzB,OAAO,CAAC,gBAAgB;IASxB,OAAO,CAAC,cAAc;IAetB,GAAG,IAAI,kBAAkB;IAIzB,MAAM,CAAC,OAAO,EAAE,OAAO,CAAC,iBAAiB,CAAC,GAAG,IAAI;IAejD,MAAM,CAAC,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,iBAAiB,CAAC;CAmC3D"}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
export type MemoryPressureLevel = "nominal" | "low" | "critical";
|
|
2
|
+
export type DescribePauseReason = "arbiter-pressure" | "memory-cap" | null;
|
|
3
|
+
export interface DescribeBackpressureStats {
|
|
4
|
+
/** True while the describe step is currently being skipped. */
|
|
5
|
+
paused: boolean;
|
|
6
|
+
/** Last arbiter pressure level applied via `setPressure`. */
|
|
7
|
+
pressureLevel: MemoryPressureLevel;
|
|
8
|
+
/** Describe ticks skipped because of backpressure since construction. */
|
|
9
|
+
describesSkipped: number;
|
|
10
|
+
/** Count of paused<->active edges (telemetry / test signal). */
|
|
11
|
+
pauseTransitions: number;
|
|
12
|
+
/** RSS captured on the first describe tick, used as the local cap baseline. */
|
|
13
|
+
memoryBaselineBytes: number | null;
|
|
14
|
+
/** Latest sampled RSS growth over the captured baseline. */
|
|
15
|
+
memoryGrowthBytes: number | null;
|
|
16
|
+
}
|
|
17
|
+
export interface DescribeBackpressureDecision {
|
|
18
|
+
/** Run the expensive describe this tick? */
|
|
19
|
+
describe: boolean;
|
|
20
|
+
/** `"paused"`/`"active"` when this call flipped the state, else `null`. */
|
|
21
|
+
transitionedTo: "paused" | "active" | null;
|
|
22
|
+
/** Why we are paused (only meaningful when `describe === false`). */
|
|
23
|
+
reason: DescribePauseReason;
|
|
24
|
+
/** How long the current continuous pause has lasted, in ms. */
|
|
25
|
+
pausedForMs: number;
|
|
26
|
+
/** True when the caller should emit a throttled long-pause warning. */
|
|
27
|
+
warnPaused: boolean;
|
|
28
|
+
}
|
|
29
|
+
export interface DescribeBackpressureConfig {
|
|
30
|
+
/**
|
|
31
|
+
* RSS growth cap in bytes. The first describe tick captures the process RSS
|
|
32
|
+
* baseline; while sampled RSS exceeds `baseline + memoryCapBytes`, the
|
|
33
|
+
* describe step pauses. `0` or negative disables the local check — only the
|
|
34
|
+
* arbiter signal can pause describing.
|
|
35
|
+
*/
|
|
36
|
+
memoryCapBytes?: number;
|
|
37
|
+
/**
|
|
38
|
+
* RSS sampler; defaults to `process.memoryUsage().rss`. Injected by tests so
|
|
39
|
+
* the cap can be exercised deterministically without allocating memory.
|
|
40
|
+
*/
|
|
41
|
+
sampleRssBytes?: () => number;
|
|
42
|
+
/**
|
|
43
|
+
* How long a single arbiter pressure signal keeps the loop paused, in ms.
|
|
44
|
+
* Because the WS1 bridge delivers pressure but not recovery, the pause
|
|
45
|
+
* auto-clears after this window of silence. Default 15_000.
|
|
46
|
+
*/
|
|
47
|
+
arbiterPauseCooldownMs?: number;
|
|
48
|
+
/** Continuous pause duration before a warning is requested. Default 60s. */
|
|
49
|
+
pauseWarningThresholdMs?: number;
|
|
50
|
+
/** Minimum interval between repeated long-pause warnings. Default 60s. */
|
|
51
|
+
pauseWarningIntervalMs?: number;
|
|
52
|
+
/** Clock, injectable for tests. Defaults to `Date.now`. */
|
|
53
|
+
now?: () => number;
|
|
54
|
+
}
|
|
55
|
+
export declare class DescribeBackpressureController {
|
|
56
|
+
private readonly memoryCapBytes;
|
|
57
|
+
private readonly sampleRssBytes;
|
|
58
|
+
private readonly arbiterPauseCooldownMs;
|
|
59
|
+
private readonly pauseWarningThresholdMs;
|
|
60
|
+
private readonly pauseWarningIntervalMs;
|
|
61
|
+
private readonly now;
|
|
62
|
+
private pressureLevel;
|
|
63
|
+
private pauseUntilMs;
|
|
64
|
+
private paused;
|
|
65
|
+
private describesSkipped;
|
|
66
|
+
private pauseTransitions;
|
|
67
|
+
private memoryBaselineBytes;
|
|
68
|
+
private latestMemoryGrowthBytes;
|
|
69
|
+
private pauseStartedAtMs;
|
|
70
|
+
private lastPauseWarningAtMs;
|
|
71
|
+
constructor(config?: DescribeBackpressureConfig);
|
|
72
|
+
/**
|
|
73
|
+
* Apply an arbiter memory-pressure level. A non-nominal level opens (or
|
|
74
|
+
* extends) the cooldown pause window; `nominal` clears it immediately (only
|
|
75
|
+
* arbiters that actually report recovery do this — the WS1 bridge relies on
|
|
76
|
+
* the cooldown instead).
|
|
77
|
+
*/
|
|
78
|
+
setPressure(level: MemoryPressureLevel): void;
|
|
79
|
+
/**
|
|
80
|
+
* Decide whether the expensive describe step may run this tick. Call ONLY
|
|
81
|
+
* when a describe would otherwise happen (the change/time gate already
|
|
82
|
+
* passed), so the skip counter reflects real avoided work. Has side effects:
|
|
83
|
+
* updates the skip counter and the pause/resume transition state. The
|
|
84
|
+
* arbiter signal takes precedence over the local cap when both are active so
|
|
85
|
+
* the reported `reason` is the more authoritative one.
|
|
86
|
+
*/
|
|
87
|
+
evaluate(): DescribeBackpressureDecision;
|
|
88
|
+
stats(): DescribeBackpressureStats;
|
|
89
|
+
}
|
|
90
|
+
//# sourceMappingURL=describe-backpressure.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"describe-backpressure.d.ts","sourceRoot":"","sources":["../src/describe-backpressure.ts"],"names":[],"mappings":"AAmCA,MAAM,MAAM,mBAAmB,GAAG,SAAS,GAAG,KAAK,GAAG,UAAU,CAAC;AAEjE,MAAM,MAAM,mBAAmB,GAAG,kBAAkB,GAAG,YAAY,GAAG,IAAI,CAAC;AAE3E,MAAM,WAAW,yBAAyB;IACxC,+DAA+D;IAC/D,MAAM,EAAE,OAAO,CAAC;IAChB,6DAA6D;IAC7D,aAAa,EAAE,mBAAmB,CAAC;IACnC,yEAAyE;IACzE,gBAAgB,EAAE,MAAM,CAAC;IACzB,gEAAgE;IAChE,gBAAgB,EAAE,MAAM,CAAC;IACzB,+EAA+E;IAC/E,mBAAmB,EAAE,MAAM,GAAG,IAAI,CAAC;IACnC,4DAA4D;IAC5D,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;CAClC;AAED,MAAM,WAAW,4BAA4B;IAC3C,4CAA4C;IAC5C,QAAQ,EAAE,OAAO,CAAC;IAClB,2EAA2E;IAC3E,cAAc,EAAE,QAAQ,GAAG,QAAQ,GAAG,IAAI,CAAC;IAC3C,qEAAqE;IACrE,MAAM,EAAE,mBAAmB,CAAC;IAC5B,+DAA+D;IAC/D,WAAW,EAAE,MAAM,CAAC;IACpB,uEAAuE;IACvE,UAAU,EAAE,OAAO,CAAC;CACrB;AAED,MAAM,WAAW,0BAA0B;IACzC;;;;;OAKG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,MAAM,CAAC;IAC9B;;;;OAIG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;IAChC,4EAA4E;IAC5E,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC,0EAA0E;IAC1E,sBAAsB,CAAC,EAAE,MAAM,CAAC;IAChC,2DAA2D;IAC3D,GAAG,CAAC,EAAE,MAAM,MAAM,CAAC;CACpB;AAMD,qBAAa,8BAA8B;IACzC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAS;IACxC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAe;IAC9C,OAAO,CAAC,QAAQ,CAAC,sBAAsB,CAAS;IAChD,OAAO,CAAC,QAAQ,CAAC,uBAAuB,CAAS;IACjD,OAAO,CAAC,QAAQ,CAAC,sBAAsB,CAAS;IAChD,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAe;IACnC,OAAO,CAAC,aAAa,CAAkC;IACvD,OAAO,CAAC,YAAY,CAAK;IACzB,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,gBAAgB,CAAK;IAC7B,OAAO,CAAC,gBAAgB,CAAK;IAC7B,OAAO,CAAC,mBAAmB,CAAuB;IAClD,OAAO,CAAC,uBAAuB,CAAuB;IACtD,OAAO,CAAC,gBAAgB,CAAuB;IAC/C,OAAO,CAAC,oBAAoB,CAAK;gBAErB,MAAM,GAAE,0BAA+B;IAyBnD;;;;;OAKG;IACH,WAAW,CAAC,KAAK,EAAE,mBAAmB,GAAG,IAAI;IAS7C;;;;;;;OAOG;IACH,QAAQ,IAAI,4BAA4B;IA2DxC,KAAK,IAAI,yBAAyB;CAUnC"}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DirtyTileDescriber — change-gated, per-tile screen description (#9105 M3).
|
|
3
|
+
*
|
|
4
|
+
* The dominant token cost in a CUA loop is re-describing a whole screen to a
|
|
5
|
+
* VLM every step even when almost nothing moved. The Brain already skips the
|
|
6
|
+
* describe entirely when the *whole frame* dHash is unchanged
|
|
7
|
+
* (`plugin-computeruse` `Brain` frame-dHash cache). This describer is the
|
|
8
|
+
* finer-grained tier: it splits a frame into tiles (via `screen-tiler.ts`),
|
|
9
|
+
* computes a per-tile perceptual hash, and only (re)describes tiles whose hash
|
|
10
|
+
* changed since the last frame — every unchanged tile reuses its cached
|
|
11
|
+
* description. So a single text field flipping characters re-describes one tile,
|
|
12
|
+
* not the entire screen.
|
|
13
|
+
*
|
|
14
|
+
* The describer is pure + injectable: the tile hash (`hashTile`) and the
|
|
15
|
+
* per-tile describe call (`describeTile`) are supplied by the caller. The real
|
|
16
|
+
* boot wiring injects `plugin-computeruse`'s `frameDhash` (the existing
|
|
17
|
+
* `scene/dhash.ts`) and a `runtime.useModel(IMAGE_DESCRIPTION)`-backed describe;
|
|
18
|
+
* tests inject deterministic fakes with no model and no native dHash. The
|
|
19
|
+
* counters (`describeCallsSaved`, `approxTokensSaved`) make the saving
|
|
20
|
+
* measurable so a test can assert it.
|
|
21
|
+
*/
|
|
22
|
+
import type { ScreenTile } from "./screen-tiler.js";
|
|
23
|
+
/** Approx image tokens charged for one tile describe — used only for the saved-tokens estimate. */
|
|
24
|
+
export declare const APPROX_TOKENS_PER_TILE = 256;
|
|
25
|
+
/** A described tile: its source rectangle plus the VLM/OCR text for it. */
|
|
26
|
+
export interface DescribedTile {
|
|
27
|
+
/** Tiler id, e.g. `tile-1-0`. */
|
|
28
|
+
id: string;
|
|
29
|
+
displayId: string;
|
|
30
|
+
/** Top-left of the tile in source display pixel space. */
|
|
31
|
+
sourceX: number;
|
|
32
|
+
sourceY: number;
|
|
33
|
+
sourceW: number;
|
|
34
|
+
sourceH: number;
|
|
35
|
+
/** The description text for this tile. */
|
|
36
|
+
description: string;
|
|
37
|
+
/** True when this tile's description was reused from cache (no describe call). */
|
|
38
|
+
cached: boolean;
|
|
39
|
+
}
|
|
40
|
+
export interface DirtyTileDescription {
|
|
41
|
+
/** One entry per tile, in tiler order. */
|
|
42
|
+
tiles: DescribedTile[];
|
|
43
|
+
/** Composed full-frame description (non-empty tile texts, source-order). */
|
|
44
|
+
vlmScene: string;
|
|
45
|
+
/** Per-tile elements suitable for `Scene.vlm_elements`. */
|
|
46
|
+
elements: DirtyTileElement[];
|
|
47
|
+
}
|
|
48
|
+
/** A described tile projected into the `Scene.vlm_elements` shape. */
|
|
49
|
+
export interface DirtyTileElement {
|
|
50
|
+
id: string;
|
|
51
|
+
kind: string;
|
|
52
|
+
desc: string;
|
|
53
|
+
/** Display-local `[x, y, w, h]` of the tile. */
|
|
54
|
+
bbox: [number, number, number, number];
|
|
55
|
+
displayId: number;
|
|
56
|
+
}
|
|
57
|
+
/** Token-accounting snapshot for a describer. */
|
|
58
|
+
export interface DirtyTileStats {
|
|
59
|
+
/** Tiles actually sent to the describe call. */
|
|
60
|
+
tilesDescribed: number;
|
|
61
|
+
/** Tiles served from the per-tile cache (no describe call). */
|
|
62
|
+
tilesSkipped: number;
|
|
63
|
+
/** Describe calls avoided by the cache (== tilesSkipped). */
|
|
64
|
+
describeCallsSaved: number;
|
|
65
|
+
/** Approx image tokens avoided (tilesSkipped × APPROX_TOKENS_PER_TILE). */
|
|
66
|
+
approxTokensSaved: number;
|
|
67
|
+
}
|
|
68
|
+
export interface DirtyTileDescriberDeps {
|
|
69
|
+
/**
|
|
70
|
+
* Perceptual hash of a tile PNG. Identical pixels MUST hash equal. The boot
|
|
71
|
+
* wiring passes `plugin-computeruse`'s `frameDhash`; `null` means "could not
|
|
72
|
+
* hash" and forces a (re)describe for that tile.
|
|
73
|
+
*/
|
|
74
|
+
hashTile: (png: Buffer) => bigint | null;
|
|
75
|
+
/** Describe one tile's pixels. Only called for changed/new tiles. */
|
|
76
|
+
describeTile: (tile: ScreenTile) => Promise<string>;
|
|
77
|
+
/** Tiling options forwarded to `tileScreenshot`. */
|
|
78
|
+
maxEdge?: number;
|
|
79
|
+
overlapFraction?: number;
|
|
80
|
+
/** Tokens charged per describe, for the saved-tokens estimate. */
|
|
81
|
+
approxTokensPerTile?: number;
|
|
82
|
+
}
|
|
83
|
+
export declare class DirtyTileDescriber {
|
|
84
|
+
private readonly deps;
|
|
85
|
+
/** tileId → { hash, description } from the previous frame. */
|
|
86
|
+
private readonly cache;
|
|
87
|
+
private stats;
|
|
88
|
+
constructor(deps: DirtyTileDescriberDeps);
|
|
89
|
+
getStats(): DirtyTileStats;
|
|
90
|
+
/**
|
|
91
|
+
* Describe a frame, re-describing only tiles whose hash changed since the
|
|
92
|
+
* previous call. Unchanged tiles reuse their cached description.
|
|
93
|
+
*/
|
|
94
|
+
describe(input: {
|
|
95
|
+
displayId: number;
|
|
96
|
+
width: number;
|
|
97
|
+
height: number;
|
|
98
|
+
pngBytes: Buffer;
|
|
99
|
+
}): Promise<DirtyTileDescription>;
|
|
100
|
+
private toDescribed;
|
|
101
|
+
}
|
|
102
|
+
//# sourceMappingURL=dirty-tile-describer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dirty-tile-describer.d.ts","sourceRoot":"","sources":["../src/dirty-tile-describer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAGpD,mGAAmG;AACnG,eAAO,MAAM,sBAAsB,MAAM,CAAC;AAE1C,2EAA2E;AAC3E,MAAM,WAAW,aAAa;IAC5B,iCAAiC;IACjC,EAAE,EAAE,MAAM,CAAC;IACX,SAAS,EAAE,MAAM,CAAC;IAClB,0DAA0D;IAC1D,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,0CAA0C;IAC1C,WAAW,EAAE,MAAM,CAAC;IACpB,kFAAkF;IAClF,MAAM,EAAE,OAAO,CAAC;CACjB;AAED,MAAM,WAAW,oBAAoB;IACnC,0CAA0C;IAC1C,KAAK,EAAE,aAAa,EAAE,CAAC;IACvB,4EAA4E;IAC5E,QAAQ,EAAE,MAAM,CAAC;IACjB,2DAA2D;IAC3D,QAAQ,EAAE,gBAAgB,EAAE,CAAC;CAC9B;AAED,sEAAsE;AACtE,MAAM,WAAW,gBAAgB;IAC/B,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,gDAAgD;IAChD,IAAI,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;IACvC,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,iDAAiD;AACjD,MAAM,WAAW,cAAc;IAC7B,gDAAgD;IAChD,cAAc,EAAE,MAAM,CAAC;IACvB,+DAA+D;IAC/D,YAAY,EAAE,MAAM,CAAC;IACrB,6DAA6D;IAC7D,kBAAkB,EAAE,MAAM,CAAC;IAC3B,2EAA2E;IAC3E,iBAAiB,EAAE,MAAM,CAAC;CAC3B;AAED,MAAM,WAAW,sBAAsB;IACrC;;;;OAIG;IACH,QAAQ,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,MAAM,GAAG,IAAI,CAAC;IACzC,qEAAqE;IACrE,YAAY,EAAE,CAAC,IAAI,EAAE,UAAU,KAAK,OAAO,CAAC,MAAM,CAAC,CAAC;IACpD,oDAAoD;IACpD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,kEAAkE;IAClE,mBAAmB,CAAC,EAAE,MAAM,CAAC;CAC9B;AAED,qBAAa,kBAAkB;IAajB,OAAO,CAAC,QAAQ,CAAC,IAAI;IAZjC,8DAA8D;IAC9D,OAAO,CAAC,QAAQ,CAAC,KAAK,CAGlB;IACJ,OAAO,CAAC,KAAK,CAKX;gBAE2B,IAAI,EAAE,sBAAsB;IAEzD,QAAQ,IAAI,cAAc;IAI1B;;;OAGG;IACG,QAAQ,CAAC,KAAK,EAAE;QACpB,SAAS,EAAE,MAAM,CAAC;QAClB,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;QACf,QAAQ,EAAE,MAAM,CAAC;KAClB,GAAG,OAAO,CAAC,oBAAoB,CAAC;IAqEjC,OAAO,CAAC,WAAW;CAgBpB"}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Wiring seam for the change-gated per-tile scene describe (#9105 efficiency).
|
|
3
|
+
*
|
|
4
|
+
* `DirtyTileDescriber` (dirty-tile-describer.ts) is pure + injectable: it owns
|
|
5
|
+
* the per-tile hash cache and the "only re-describe changed tiles" loop, but it
|
|
6
|
+
* does not know how to hash a tile or how to ask the VLM to describe one. This
|
|
7
|
+
* module supplies those two collaborators from the live runtime so
|
|
8
|
+
* `VisionService` can build a describer that re-describes only the screen
|
|
9
|
+
* regions that actually changed since the previous frame instead of paying for
|
|
10
|
+
* a whole-frame VLM pass every scene tick.
|
|
11
|
+
*
|
|
12
|
+
* Two collaborators:
|
|
13
|
+
* - `hashTile`: a perceptual hash. We reuse plugin-computeruse's `frameDhash`
|
|
14
|
+
* (the same dHash the Brain frame-cache uses), resolved via a best-effort
|
|
15
|
+
* dynamic import so plugin-vision never eagerly pulls computeruse's module
|
|
16
|
+
* graph at boot — exactly the idiom the OCR bridge already uses. When
|
|
17
|
+
* computeruse is absent the resolve returns `null` and the caller degrades
|
|
18
|
+
* to the existing full-frame describe.
|
|
19
|
+
* - `describeTile`: one `runtime.useModel(IMAGE_DESCRIPTION, …)` call per
|
|
20
|
+
* changed tile, built from a caller-supplied prompt + result normalizer so
|
|
21
|
+
* the per-tile path reuses the same prompt plumbing as the full-frame path.
|
|
22
|
+
*/
|
|
23
|
+
import type { ScreenTile } from "./screen-tiler.js";
|
|
24
|
+
/** PNG perceptual hash. Identical pixels MUST hash equal; `null` = undecodable. */
|
|
25
|
+
export type FrameHash = (png: Buffer) => bigint | null;
|
|
26
|
+
/** Per-tile describe call. Returns the model's description text for one tile. */
|
|
27
|
+
export type TileDescribeFn = (tile: ScreenTile) => Promise<string>;
|
|
28
|
+
export interface TileDescribeDeps {
|
|
29
|
+
/**
|
|
30
|
+
* Build the per-tile image URL the VLM is asked to describe. The tile carries
|
|
31
|
+
* PNG bytes (`tile.pngBytes`), so this is a `data:image/png;base64,…` URL.
|
|
32
|
+
*/
|
|
33
|
+
buildTileImageUrl: (tile: ScreenTile) => string;
|
|
34
|
+
/**
|
|
35
|
+
* Build the per-tile prompt. Receives the tile so callers can include bounds.
|
|
36
|
+
* Async because the scene context is pulled from a peer provider per call.
|
|
37
|
+
*/
|
|
38
|
+
buildTilePrompt: (tile: ScreenTile) => Promise<string>;
|
|
39
|
+
/** Invoke the IMAGE_DESCRIPTION model and return its raw result. */
|
|
40
|
+
invokeModel: (imageUrl: string, prompt: string) => Promise<unknown>;
|
|
41
|
+
/**
|
|
42
|
+
* Normalize a model result into a description string, or `null` when the
|
|
43
|
+
* result is unusable (sentinel / empty). A `null` result yields an empty tile
|
|
44
|
+
* description, which the describer treats as "nothing to compose for this
|
|
45
|
+
* tile" while still caching the (empty) result against the tile hash.
|
|
46
|
+
*/
|
|
47
|
+
extractDescription: (result: unknown) => string | null;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Build a `describeTile` function bound to the runtime's IMAGE_DESCRIPTION
|
|
51
|
+
* model. The describer calls this only for tiles whose hash changed.
|
|
52
|
+
*/
|
|
53
|
+
export declare function createTileDescribeFn(deps: TileDescribeDeps): TileDescribeFn;
|
|
54
|
+
/** Encode a tile's PNG bytes into a base64 data URL for the VLM. */
|
|
55
|
+
export declare function tilePngToImageUrl(tile: ScreenTile): string;
|
|
56
|
+
//# sourceMappingURL=dirty-tile-scene.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dirty-tile-scene.d.ts","sourceRoot":"","sources":["../src/dirty-tile-scene.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAEpD,mFAAmF;AACnF,MAAM,MAAM,SAAS,GAAG,CAAC,GAAG,EAAE,MAAM,KAAK,MAAM,GAAG,IAAI,CAAC;AAEvD,iFAAiF;AACjF,MAAM,MAAM,cAAc,GAAG,CAAC,IAAI,EAAE,UAAU,KAAK,OAAO,CAAC,MAAM,CAAC,CAAC;AAEnE,MAAM,WAAW,gBAAgB;IAC/B;;;OAGG;IACH,iBAAiB,EAAE,CAAC,IAAI,EAAE,UAAU,KAAK,MAAM,CAAC;IAChD;;;OAGG;IACH,eAAe,EAAE,CAAC,IAAI,EAAE,UAAU,KAAK,OAAO,CAAC,MAAM,CAAC,CAAC;IACvD,oEAAoE;IACpE,WAAW,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC;IACpE;;;;;OAKG;IACH,kBAAkB,EAAE,CAAC,MAAM,EAAE,OAAO,KAAK,MAAM,GAAG,IAAI,CAAC;CACxD;AAED;;;GAGG;AACH,wBAAgB,oBAAoB,CAAC,IAAI,EAAE,gBAAgB,GAAG,cAAc,CAO3E;AAED,oEAAoE;AACpE,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,UAAU,GAAG,MAAM,CAE1D"}
|