@elizaos/plugin-vision 2.0.0-beta.1 → 2.0.3-beta.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +73 -301
- package/dist/action.d.ts +3 -0
- package/dist/action.d.ts.map +1 -0
- package/dist/audio-capture-stream.d.ts +42 -0
- package/dist/audio-capture-stream.d.ts.map +1 -0
- package/dist/audio-capture.d.ts +25 -0
- package/dist/audio-capture.d.ts.map +1 -0
- package/dist/computeruse-ocr-bridge.d.ts +50 -0
- package/dist/computeruse-ocr-bridge.d.ts.map +1 -0
- package/dist/config.d.ts +68 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/describe-backpressure.d.ts +90 -0
- package/dist/describe-backpressure.d.ts.map +1 -0
- package/dist/dirty-tile-describer.d.ts +102 -0
- package/dist/dirty-tile-describer.d.ts.map +1 -0
- package/dist/dirty-tile-scene.d.ts +56 -0
- package/dist/dirty-tile-scene.d.ts.map +1 -0
- package/dist/entity-tracker.d.ts +33 -0
- package/dist/entity-tracker.d.ts.map +1 -0
- package/dist/face-detector-ggml.d.ts +60 -0
- package/dist/face-detector-ggml.d.ts.map +1 -0
- package/dist/face-detector-mediapipe.d.ts +25 -0
- package/dist/face-detector-mediapipe.d.ts.map +1 -0
- package/dist/face-recognition-ggml.d.ts +94 -0
- package/dist/face-recognition-ggml.d.ts.map +1 -0
- package/dist/get-screen-elements.d.ts +90 -0
- package/dist/get-screen-elements.d.ts.map +1 -0
- package/dist/get-screen.d.ts +60 -0
- package/dist/get-screen.d.ts.map +1 -0
- package/dist/image/sharp-compat.d.ts +89 -0
- package/dist/image/sharp-compat.d.ts.map +1 -0
- package/dist/image-input.d.ts +15 -0
- package/dist/image-input.d.ts.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +7957 -6238
- package/dist/index.js.map +41 -26
- package/dist/lifecycle.d.ts +94 -0
- package/dist/lifecycle.d.ts.map +1 -0
- package/dist/mobile/capacitor-camera.d.ts +85 -0
- package/dist/mobile/capacitor-camera.d.ts.map +1 -0
- package/dist/native/doctr-ffi.d.ts +40 -0
- package/dist/native/doctr-ffi.d.ts.map +1 -0
- package/dist/native/yolo-ffi.d.ts +21 -0
- package/dist/native/yolo-ffi.d.ts.map +1 -0
- package/dist/ocr-host-windows.d.ts +34 -0
- package/dist/ocr-host-windows.d.ts.map +1 -0
- package/dist/ocr-service-apple-vision-macos.d.ts +51 -0
- package/dist/ocr-service-apple-vision-macos.d.ts.map +1 -0
- package/dist/ocr-service-doctr.d.ts +61 -0
- package/dist/ocr-service-doctr.d.ts.map +1 -0
- package/dist/ocr-service-linux-tesseract.d.ts +85 -0
- package/dist/ocr-service-linux-tesseract.d.ts.map +1 -0
- package/dist/ocr-service-paddleocr.d.ts +59 -0
- package/dist/ocr-service-paddleocr.d.ts.map +1 -0
- package/dist/ocr-service-windows.d.ts +41 -0
- package/dist/ocr-service-windows.d.ts.map +1 -0
- package/dist/ocr-service.d.ts +91 -0
- package/dist/ocr-service.d.ts.map +1 -0
- package/dist/ocr-with-coords.d.ts +103 -0
- package/dist/ocr-with-coords.d.ts.map +1 -0
- package/dist/person-detector.d.ts +17 -0
- package/dist/person-detector.d.ts.map +1 -0
- package/dist/provider.d.ts +3 -0
- package/dist/provider.d.ts.map +1 -0
- package/dist/routes.d.ts +7 -0
- package/dist/routes.d.ts.map +1 -0
- package/dist/screen-capture-bridge.d.ts +51 -0
- package/dist/screen-capture-bridge.d.ts.map +1 -0
- package/dist/screen-capture.d.ts +17 -0
- package/dist/screen-capture.d.ts.map +1 -0
- package/dist/screen-tiler.d.ts +75 -0
- package/dist/screen-tiler.d.ts.map +1 -0
- package/dist/service.d.ts +176 -0
- package/dist/service.d.ts.map +1 -0
- package/dist/set-of-marks-provider.d.ts +64 -0
- package/dist/set-of-marks-provider.d.ts.map +1 -0
- package/dist/som.d.ts +135 -0
- package/dist/som.d.ts.map +1 -0
- package/dist/som.js +184 -0
- package/dist/som.js.map +11 -0
- package/dist/test-input.d.ts +25 -0
- package/dist/test-input.d.ts.map +1 -0
- package/dist/types.d.ts +241 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/vision-context-augmenter.d.ts +93 -0
- package/dist/vision-context-augmenter.d.ts.map +1 -0
- package/dist/vision-worker-manager.d.ts +51 -0
- package/dist/vision-worker-manager.d.ts.map +1 -0
- package/dist/workers/ocr-worker.d.ts +2 -0
- package/dist/workers/ocr-worker.d.ts.map +1 -0
- package/dist/workers/ocr-worker.js +1075 -7821
- package/dist/workers/ocr-worker.js.map +10 -51
- package/dist/workers/screen-capture-worker.d.ts +2 -0
- package/dist/workers/screen-capture-worker.d.ts.map +1 -0
- package/dist/workers/screen-capture-worker.js +364 -6
- package/dist/workers/screen-capture-worker.js.map +5 -4
- package/dist/workers/worker-logger.d.ts +10 -0
- package/dist/workers/worker-logger.d.ts.map +1 -0
- package/dist/yolo-detector.d.ts +37 -0
- package/dist/yolo-detector.d.ts.map +1 -0
- package/native/doctr.cpp/CMakeLists.txt +58 -0
- package/native/doctr.cpp/README.md +62 -0
- package/native/doctr.cpp/include/doctr.h +91 -0
- package/native/doctr.cpp/scripts/convert.py +98 -0
- package/native/doctr.cpp/src/doctr_det.cpp +112 -0
- package/native/doctr.cpp/src/doctr_rec.cpp +103 -0
- package/native/macos-vision-ocr.swift +113 -0
- package/native/mobilefacenet.cpp/README.md +13 -0
- package/native/movenet.cpp/README.md +10 -0
- package/native/retinaface.cpp/README.md +12 -0
- package/native/yolo.cpp/CMakeLists.txt +57 -0
- package/native/yolo.cpp/README.md +64 -0
- package/native/yolo.cpp/build.mjs +76 -0
- package/native/yolo.cpp/include/yolo.h +62 -0
- package/native/yolo.cpp/scripts/convert.py +248 -0
- package/native/yolo.cpp/src/yolo.cpp +425 -0
- package/native/yolo.cpp/verify/compare.py +99 -0
- package/native/yolo.cpp/verify/make_ref.py +75 -0
- package/native/yolo.cpp/verify/run_ggml.mjs +78 -0
- package/native/yolo.cpp/verify/run_ts.mjs +26 -0
- package/package.json +39 -21
- package/registry-entry.json +43 -0
- package/scripts/vendor-tesseract-linux.mjs +177 -0
- package/build.config.ts +0 -89
- package/dist/workers/florence2-worker.js +0 -779
- package/dist/workers/florence2-worker.js.map +0 -13
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import type { BoundingBox, OCRResult, ScreenTile } from "./types";
|
|
2
|
+
export type OCRBackendName = "doctr" | "apple-vision";
|
|
3
|
+
export interface OCRServiceConfig {
|
|
4
|
+
/**
|
|
5
|
+
* Force a specific backend. If unset, the chain is:
|
|
6
|
+
* 1. Apple Vision (darwin only, when a provider has been registered)
|
|
7
|
+
* 2. doCTR (ggml-backed CRNN+DBNet via native/doctr.cpp)
|
|
8
|
+
*
|
|
9
|
+
* There is no tesseract / onnx fallback — the migration removed both.
|
|
10
|
+
* If neither backend can initialize, `initialize()` throws.
|
|
11
|
+
*/
|
|
12
|
+
backend?: OCRBackendName;
|
|
13
|
+
}
|
|
14
|
+
/**
 * Structured regions derived from OCR output, grouped by kind. Every entry
 * carries its own `bbox`. This is the return shape of
 * `extractStructuredDataFromOCR` and `OCRService.extractStructuredData`.
 */
export interface StructuredOCRData {
|
|
15
|
+
/** Table regions. `rows` is a list of rows, each a list of strings (presumably one per cell — confirm against the extractor). */
tables: Array<{
|
|
16
|
+
rows: string[][];
|
|
17
|
+
bbox: BoundingBox;
|
|
18
|
+
}>;
|
|
19
|
+
/** Form fields as label/value pairs. */
forms: Array<{
|
|
20
|
+
label: string;
|
|
21
|
+
value: string;
|
|
22
|
+
bbox: BoundingBox;
|
|
23
|
+
}>;
|
|
24
|
+
/** List regions with their item strings. */
lists: Array<{
|
|
25
|
+
items: string[];
|
|
26
|
+
bbox: BoundingBox;
|
|
27
|
+
}>;
|
|
28
|
+
}
|
|
29
|
+
/**
 * Synchronously derives tables, forms and lists (`StructuredOCRData`) from an
 * existing `OCRResult`. The detection rules are not visible in this
 * declaration file.
 */
export declare function extractStructuredDataFromOCR(ocr: OCRResult): StructuredOCRData;
|
|
30
|
+
/**
|
|
31
|
+
* External provider seam for the Apple Vision OCR backend.
|
|
32
|
+
*
|
|
33
|
+
* `plugin-vision` does not take a runtime dep on `@elizaos/plugin-computeruse`
|
|
34
|
+
* — that would invert the layering (computeruse is the higher-level seam).
|
|
35
|
+
* Instead, the runtime registers a provider here on iOS/macOS startup using
|
|
36
|
+
* `createIosVisionOcrProvider(...)` from
|
|
37
|
+
* `@elizaos/plugin-computeruse/mobile/ocr-provider`. Until a provider is
|
|
38
|
+
* registered, `AppleVisionBackend.extractText` throws so the chooser falls
|
|
39
|
+
* through to the doCTR ggml backend.
|
|
40
|
+
*
|
|
41
|
+
* The provider shape is intentionally structural so plugin-vision stays
|
|
42
|
+
* Node-importable on hosts that don't ship Capacitor.
|
|
43
|
+
*/
|
|
44
|
+
export interface AppleVisionOcrProvider {
|
|
45
|
+
/** Stable id used in logs/telemetry. */
|
|
46
|
+
readonly name: string;
|
|
47
|
+
/** True when the underlying bridge is registered and ready. */
|
|
48
|
+
available(): boolean;
|
|
49
|
+
/**
|
|
50
|
+
* Recognize text in the JPEG/PNG bytes. The plugin-computeruse iOS provider
|
|
51
|
+
* returns `OcrResult`; we map to plugin-vision's `OCRResult` shape inline.
|
|
52
|
+
*/
|
|
53
|
+
recognize(input: {
|
|
54
|
+
kind: "bytes";
|
|
55
|
+
data: Uint8Array;
|
|
56
|
+
}): Promise<{
|
|
57
|
+
readonly lines: ReadonlyArray<{
|
|
58
|
+
readonly text: string;
|
|
59
|
+
readonly confidence: number;
|
|
60
|
+
readonly boundingBox: {
|
|
61
|
+
readonly x: number;
|
|
62
|
+
readonly y: number;
|
|
63
|
+
readonly width: number;
|
|
64
|
+
readonly height: number;
|
|
65
|
+
};
|
|
66
|
+
}>;
|
|
67
|
+
readonly fullText: string;
|
|
68
|
+
}>;
|
|
69
|
+
}
|
|
70
|
+
/**
 * Registers the Apple Vision OCR provider (see `AppleVisionOcrProvider`).
 * `null` is accepted — presumably to clear a previous registration; confirm
 * against the implementation.
 */
export declare function registerAppleVisionOcrProvider(provider: AppleVisionOcrProvider | null): void;
|
|
71
|
+
/**
 * Returns the Apple Vision OCR provider, or `null` (presumably when none has
 * been registered — confirm against the implementation).
 */
export declare function getAppleVisionOcrProvider(): AppleVisionOcrProvider | null;
|
|
72
|
+
/**
|
|
73
|
+
* Walk the priority chain and pick the first backend that initializes.
|
|
74
|
+
* Backend instances are cached; per-call we just dispatch to the active one.
|
|
75
|
+
*/
|
|
76
|
+
export declare class OCRService {
|
|
77
|
+
// Private state; types are erased in the declaration output.
// NOTE(review): presumably the cached backend instances — confirm.
private backends;
|
|
78
|
+
// NOTE(review): presumably the backend picked by `initialize()` — confirm.
private chosen;
|
|
79
|
+
private initialized;
|
|
80
|
+
// NOTE(review): presumably the backend forced via `OCRServiceConfig.backend` — confirm.
private readonly forced?;
|
|
81
|
+
/**
 * @param config Optional settings. Per `OCRServiceConfig`, `config.backend`
 *   forces a specific backend instead of walking the priority chain.
 */
constructor(config?: OCRServiceConfig);
|
|
82
|
+
/**
 * Prepares the service for use. Per the `OCRServiceConfig` docs, this throws
 * when no backend can initialize.
 */
initialize(): Promise<void>;
|
|
83
|
+
/**
 * Runs OCR on `imageBuffer` and resolves to an `OCRResult`.
 * NOTE(review): the accepted image encodings are not visible here.
 */
extractText(imageBuffer: Buffer): Promise<OCRResult>;
|
|
84
|
+
/** Runs OCR on a `ScreenTile` and resolves to an `OCRResult`. */
extractFromTile(tile: ScreenTile): Promise<OCRResult>;
|
|
85
|
+
/**
 * Runs OCR on `imageBuffer`.
 * NOTE(review): same signature as `extractText`; whether it is a plain alias
 * cannot be told from the declaration.
 */
extractFromImage(imageBuffer: Buffer): Promise<OCRResult>;
|
|
86
|
+
/**
 * Resolves to the `StructuredOCRData` (tables, forms, lists) for the image —
 * presumably OCR followed by `extractStructuredDataFromOCR`; confirm.
 */
extractStructuredData(imageBuffer: Buffer): Promise<StructuredOCRData>;
|
|
87
|
+
/** Name of the backend in use, or `null` (presumably before one has been chosen — confirm). */
getActiveBackend(): OCRBackendName | null;
|
|
88
|
+
/** Reports whether the service has been initialized. */
isInitialized(): boolean;
|
|
89
|
+
/** Shuts the service down. NOTE(review): what is released is not visible in this declaration. */
dispose(): Promise<void>;
|
|
90
|
+
}
|
|
91
|
+
//# sourceMappingURL=ocr-service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ocr-service.d.ts","sourceRoot":"","sources":["../src/ocr-service.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,WAAW,EAAE,SAAS,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AAElE,MAAM,MAAM,cAAc,GAAG,OAAO,GAAG,cAAc,CAAC;AAEtD,MAAM,WAAW,gBAAgB;IAC/B;;;;;;;OAOG;IACH,OAAO,CAAC,EAAE,cAAc,CAAC;CAC1B;AASD,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,EAAE,EAAE,CAAC;QAAC,IAAI,EAAE,WAAW,CAAA;KAAE,CAAC,CAAC;IACvD,KAAK,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,WAAW,CAAA;KAAE,CAAC,CAAC;IAClE,KAAK,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,MAAM,EAAE,CAAC;QAAC,IAAI,EAAE,WAAW,CAAA;KAAE,CAAC,CAAC;CACtD;AAqBD,wBAAgB,4BAA4B,CAC1C,GAAG,EAAE,SAAS,GACb,iBAAiB,CAiEnB;AAgBD;;;;;;;;;;;;;GAaG;AACH,MAAM,WAAW,sBAAsB;IACrC,wCAAwC;IACxC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,+DAA+D;IAC/D,SAAS,IAAI,OAAO,CAAC;IACrB;;;OAGG;IACH,SAAS,CAAC,KAAK,EAAE;QAAE,IAAI,EAAE,OAAO,CAAC;QAAC,IAAI,EAAE,UAAU,CAAA;KAAE,GAAG,OAAO,CAAC;QAC7D,QAAQ,CAAC,KAAK,EAAE,aAAa,CAAC;YAC5B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;YACtB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;YAC5B,QAAQ,CAAC,WAAW,EAAE;gBACpB,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;gBACnB,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;gBACnB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;gBACvB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;aACzB,CAAC;SACH,CAAC,CAAC;QACH,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;KAC3B,CAAC,CAAC;CACJ;AAID,wBAAgB,8BAA8B,CAC5C,QAAQ,EAAE,sBAAsB,GAAG,IAAI,GACtC,IAAI,CAON;AAED,wBAAgB,yBAAyB,IAAI,sBAAsB,GAAG,IAAI,CAEzE;AAqED;;;GAGG;AACH,qBAAa,UAAU;IACrB,OAAO,CAAC,QAAQ,CAAoB;IACpC,OAAO,CAAC,MAAM,CAA2B;IACzC,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAiB;gBAE7B,MAAM,GAAE,gBAAqB;IAInC,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAkD3B,WAAW,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC;IAiCpD,eAAe,CAAC,IAAI,EAAE,UAAU,GAAG,OAAO,CAAC,SAAS,CAAC;IAOrD,gBAAgB,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC;IAIzD,qBAAqB,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,iBAAiB,CAAC;IAK5E,gBAAgB,IAAI,cAAc,GAAG,IAAI;IAIzC,aAAa,IAAI,OAAO;IAIlB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAW/B"}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OCR-with-coords — hierarchical (block / line / word) OCR with absolute
|
|
3
|
+
* source-display coordinates and a coarse semantic position label per
|
|
4
|
+
* recognized text element.
|
|
5
|
+
*
|
|
6
|
+
* Why this lives in plugin-vision:
|
|
7
|
+
* - plugin-computeruse needs OCR with coordinates so action targets can be
|
|
8
|
+
* computed in display-absolute coordinates without re-running detection.
|
|
9
|
+
* - It cannot take a runtime dep on plugin-vision (which would invert the
|
|
10
|
+
* layering: computeruse is the higher-level seam and plugin-vision must
|
|
11
|
+
* stay Node-importable on hosts that don't ship the action surface).
|
|
12
|
+
* - Mirroring the pattern used by `AppleVisionOcrProvider` in
|
|
13
|
+
* `./ocr-service.ts`, plugin-vision exports a structural interface plus a
|
|
14
|
+
* registry seam (`registerCoordOcrProvider` lives in
|
|
15
|
+
* plugin-computeruse/src/mobile/ocr-provider.ts) that the runtime wires up
|
|
16
|
+
* at boot.
|
|
17
|
+
*
|
|
18
|
+
* This file defines the canonical `OcrWithCoordsService` interface and the
|
|
19
|
+
* in-tree `RapidOcrCoordAdapter` provider. The adapter is backed by the
|
|
20
|
+
* existing `OCRService` and computes `semantic_position`
|
|
21
|
+
* deterministically from the bbox center against tile-relative thirds. Native
|
|
22
|
+
* OCR providers can register the same interface without changing consumers.
|
|
23
|
+
*/
|
|
24
|
+
import { OCRService } from "./ocr-service";
|
|
25
|
+
import type { BoundingBox } from "./types";
|
|
26
|
+
/** Coarse 3x3 location of a text element relative to the source tile. */
|
|
27
|
+
export type SemanticPosition = "upper-left" | "upper-center" | "upper-right" | "middle-left" | "center" | "middle-right" | "lower-left" | "lower-center" | "lower-right";
|
|
28
|
+
export interface OcrWithCoordsWord {
|
|
29
|
+
readonly text: string;
|
|
30
|
+
/** Absolute source-display coordinates. */
|
|
31
|
+
readonly bbox: BoundingBox;
|
|
32
|
+
readonly semantic_position: SemanticPosition;
|
|
33
|
+
}
|
|
34
|
+
export interface OcrWithCoordsBlock {
|
|
35
|
+
readonly text: string;
|
|
36
|
+
/** Absolute source-display coordinates. */
|
|
37
|
+
readonly bbox: BoundingBox;
|
|
38
|
+
readonly words: ReadonlyArray<OcrWithCoordsWord>;
|
|
39
|
+
readonly semantic_position: SemanticPosition;
|
|
40
|
+
}
|
|
41
|
+
/** Top-level result of coordinate-aware OCR: the recognized blocks, each with its words. */
export interface OcrWithCoordsResult {
|
|
42
|
+
readonly blocks: ReadonlyArray<OcrWithCoordsBlock>;
|
|
43
|
+
}
|
|
44
|
+
export interface OcrWithCoordsInput {
|
|
45
|
+
/** Stable identifier of the source display. Echoed in logs only. */
|
|
46
|
+
readonly displayId: string;
|
|
47
|
+
/** Absolute X offset of the tile within the source display. */
|
|
48
|
+
readonly sourceX: number;
|
|
49
|
+
/** Absolute Y offset of the tile within the source display. */
|
|
50
|
+
readonly sourceY: number;
|
|
51
|
+
/** Encoded PNG bytes of the tile. */
|
|
52
|
+
readonly pngBytes: Uint8Array;
|
|
53
|
+
}
|
|
54
|
+
/**
 * Provider contract for coordinate-aware OCR. Implementations are installed
 * with `registerOcrWithCoordsService` and read back with
 * `getOcrWithCoordsService`.
 */
export interface OcrWithCoordsService {
|
|
55
|
+
/** Provider name (e.g. `RapidOcrCoordAdapter` reports "rapid-coord-adapter"). */
readonly name: string;
|
|
56
|
+
/** Runs OCR on the tile described by `input` and resolves to the block/word hierarchy. */
describe(input: OcrWithCoordsInput): Promise<OcrWithCoordsResult>;
|
|
57
|
+
}
|
|
58
|
+
/**
|
|
59
|
+
* Map a bbox center to one of nine semantic positions using strict thirds
|
|
60
|
+
* against the tile dimensions. Pure function — exported for tests so the
|
|
61
|
+
* thirds rule has a single source of truth.
|
|
62
|
+
*
|
|
63
|
+
* Rule:
|
|
64
|
+
* col = floor(centerX / (tileWidth / 3)) clamped to [0, 2]
|
|
65
|
+
* row = floor(centerY / (tileHeight / 3)) clamped to [0, 2]
|
|
66
|
+
* "middle" + "center" collapses to the literal "center".
|
|
67
|
+
*
|
|
68
|
+
* Inputs use tile-relative coordinates so the same function works for words
|
|
69
|
+
* inside their parent block too (callers can pass the parent block bbox as
|
|
70
|
+
* the tile dims for word-relative labeling, but for the canonical
|
|
71
|
+
* implementation here we always label against the source tile).
|
|
72
|
+
*/
|
|
73
|
+
export declare function computeSemanticPosition(args: {
|
|
74
|
+
readonly bbox: BoundingBox;
|
|
75
|
+
readonly tileWidth: number;
|
|
76
|
+
readonly tileHeight: number;
|
|
77
|
+
}): SemanticPosition;
|
|
78
|
+
/**
 * Registers the `OcrWithCoordsService` implementation. `null` is accepted —
 * presumably to clear a previous registration; confirm against the
 * implementation.
 */
export declare function registerOcrWithCoordsService(service: OcrWithCoordsService | null): void;
|
|
79
|
+
/**
 * Returns the `OcrWithCoordsService`, or `null` (presumably when none has been
 * registered — confirm against the implementation).
 */
export declare function getOcrWithCoordsService(): OcrWithCoordsService | null;
|
|
80
|
+
/**
|
|
81
|
+
* Wraps the existing `OCRService` and maps its line-level output to the
|
|
82
|
+
* hierarchical `OcrWithCoordsResult` shape, computing `semantic_position`
|
|
83
|
+
* deterministically against the source tile thirds.
|
|
84
|
+
*/
|
|
85
|
+
export declare class RapidOcrCoordAdapter implements OcrWithCoordsService {
|
|
86
|
+
// OCR engine the adapter delegates to; the constructor only requires `extractText`.
private readonly impl;
|
|
87
|
+
/** Provider name exposed through `OcrWithCoordsService.name`. */
readonly name = "rapid-coord-adapter";
|
|
88
|
+
/**
 * @param impl Optional engine: anything exposing `OCRService.extractText`.
 *   NOTE(review): the default used when omitted is not visible in this
 *   declaration.
 */
constructor(impl?: Pick<OCRService, "extractText">);
|
|
89
|
+
/** Runs OCR on the tile in `input` and resolves to the hierarchical `OcrWithCoordsResult`. */
describe(input: OcrWithCoordsInput): Promise<OcrWithCoordsResult>;
|
|
90
|
+
}
|
|
91
|
+
/**
|
|
92
|
+
* Read width/height from the PNG IHDR chunk without pulling in sharp on the
|
|
93
|
+
* test path. PNG signature is 8 bytes; IHDR begins at offset 8 with a 4-byte
|
|
94
|
+
* length, 4-byte type ("IHDR"), then 4-byte width and 4-byte height (BE).
|
|
95
|
+
*
|
|
96
|
+
* Throws on malformed input so a corrupt tile surfaces immediately rather
|
|
97
|
+
* than silently producing zero-sized semantic-position math.
|
|
98
|
+
*/
|
|
99
|
+
export declare function readPngDimensions(pngBytes: Uint8Array): Promise<{
|
|
100
|
+
width: number;
|
|
101
|
+
height: number;
|
|
102
|
+
}>;
|
|
103
|
+
//# sourceMappingURL=ocr-with-coords.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ocr-with-coords.d.ts","sourceRoot":"","sources":["../src/ocr-with-coords.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAGH,OAAO,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC;AAC3C,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AAE3C,yEAAyE;AACzE,MAAM,MAAM,gBAAgB,GACxB,YAAY,GACZ,cAAc,GACd,aAAa,GACb,aAAa,GACb,QAAQ,GACR,cAAc,GACd,YAAY,GACZ,cAAc,GACd,aAAa,CAAC;AAElB,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,2CAA2C;IAC3C,QAAQ,CAAC,IAAI,EAAE,WAAW,CAAC;IAC3B,QAAQ,CAAC,iBAAiB,EAAE,gBAAgB,CAAC;CAC9C;AAED,MAAM,WAAW,kBAAkB;IACjC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,2CAA2C;IAC3C,QAAQ,CAAC,IAAI,EAAE,WAAW,CAAC;IAC3B,QAAQ,CAAC,KAAK,EAAE,aAAa,CAAC,iBAAiB,CAAC,CAAC;IACjD,QAAQ,CAAC,iBAAiB,EAAE,gBAAgB,CAAC;CAC9C;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,CAAC,MAAM,EAAE,aAAa,CAAC,kBAAkB,CAAC,CAAC;CACpD;AAED,MAAM,WAAW,kBAAkB;IACjC,oEAAoE;IACpE,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,+DAA+D;IAC/D,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,+DAA+D;IAC/D,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,qCAAqC;IACrC,QAAQ,CAAC,QAAQ,EAAE,UAAU,CAAC;CAC/B;AAED,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,KAAK,EAAE,kBAAkB,GAAG,OAAO,CAAC,mBAAmB,CAAC,CAAC;CACnE;AAeD;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,uBAAuB,CAAC,IAAI,EAAE;IAC5C,QAAQ,CAAC,IAAI,EAAE,WAAW,CAAC;IAC3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;CAC7B,GAAG,gBAAgB,CAenB;AAYD,wBAAgB,4BAA4B,CAC1C,OAAO,EAAE,oBAAoB,GAAG,IAAI,GACnC,IAAI,CAON;AAED,wBAAgB,uBAAuB,IAAI,oBAAoB,GAAG,IAAI,CAErE;AAID;;;;GAIG;AACH,qBAAa,oBAAqB,YAAW,oBAAoB;IAI7D,OAAO,CAAC,QAAQ,CAAC,IAAI;IAHvB,QAAQ,CAAC,IAAI,yBAAyB;gBAGnB,IAAI,GAAE,IAAI,CAAC,UAAU,EAAE,aAAa,CAAoB;IAGrE,QAAQ,CAAC,KAAK,EAAE,kBAAkB,GAAG,OAAO,CAAC,mBAAmB,CAAC;CAuDxE;AAMD;;;;;;;GAOG;AACH,wBAAsB,iBAAiB,CACrC,QAAQ,EAAE,UAAU,GACnB,OAAO,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE,CAAC,CA8B5C"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { PersonInfo } from "./types";
|
|
2
|
+
import { type YOLOConfig } from "./yolo-detector";
|
|
3
|
+
/**
 * Settings for `PersonDetector`: every `YOLOConfig` option except
 * `classFilter` (presumably because the detector fixes the class itself —
 * confirm), plus a person-specific score threshold.
 */
export interface PersonDetectorConfig extends Omit<YOLOConfig, "classFilter"> {
|
|
4
|
+
/** Score threshold specifically for person detections (defaults to 0.4). */
|
|
5
|
+
scoreThreshold?: number;
|
|
6
|
+
}
|
|
7
|
+
/**
 * Detects people in an image and reports them as `PersonInfo` entries.
 * NOTE(review): the private `yolo` field and the `YOLOConfig`-based config
 * suggest it wraps the YOLO detector from `./yolo-detector` — confirm.
 */
export declare class PersonDetector {
|
|
8
|
+
private yolo;
|
|
9
|
+
private initialized;
|
|
10
|
+
/** @param config Optional detector settings; see `PersonDetectorConfig`. */
constructor(config?: PersonDetectorConfig);
|
|
11
|
+
/** Resolves to whether person detection can be used. The availability criteria are not visible here. */
static isAvailable(): Promise<boolean>;
|
|
12
|
+
/** Reports whether the detector has been initialized. */
isInitialized(): boolean;
|
|
13
|
+
/** Prepares the detector for use. */
initialize(): Promise<void>;
|
|
14
|
+
/** Resolves to the `PersonInfo` entries found in `imageBuffer`. */
detect(imageBuffer: Buffer): Promise<PersonInfo[]>;
|
|
15
|
+
/** Shuts the detector down. NOTE(review): what is released is not visible in this declaration. */
dispose(): Promise<void>;
|
|
16
|
+
}
|
|
17
|
+
//# sourceMappingURL=person-detector.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"person-detector.d.ts","sourceRoot":"","sources":["../src/person-detector.ts"],"names":[],"mappings":"AAaA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AAC1C,OAAO,EAAE,KAAK,UAAU,EAAgB,MAAM,iBAAiB,CAAC;AAEhE,MAAM,WAAW,oBAAqB,SAAQ,IAAI,CAAC,UAAU,EAAE,aAAa,CAAC;IAC3E,4EAA4E;IAC5E,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,qBAAa,cAAc;IACzB,OAAO,CAAC,IAAI,CAAe;IAC3B,OAAO,CAAC,WAAW,CAAS;gBAEhB,MAAM,GAAE,oBAAyB;IAQ7C,MAAM,CAAC,WAAW,IAAI,OAAO,CAAC,OAAO,CAAC;IAItC,aAAa,IAAI,OAAO;IAIlB,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAO3B,MAAM,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,EAAE,CAAC;IAclD,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAI/B"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"provider.d.ts","sourceRoot":"","sources":["../src/provider.ts"],"names":[],"mappings":"AAAA,OAAO,EAIL,KAAK,QAAQ,EAEd,MAAM,eAAe,CAAC;AAYvB,eAAO,MAAM,cAAc,EAAE,QAiW5B,CAAC"}
|
package/dist/routes.d.ts
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { Route } from "@elizaos/core";
|
|
2
|
+
/** GET — drain the queue of pending capture requests for the renderer poller. */
|
|
3
|
+
export declare const captureRequestsRoute: Route;
|
|
4
|
+
/** POST — accept a captured frame (or a skip) for a queued request. */
|
|
5
|
+
export declare const screenFrameRoute: Route;
|
|
6
|
+
/**
 * Route list exported by this module.
 * NOTE(review): presumably `captureRequestsRoute` and `screenFrameRoute`; the
 * initializer is not visible in this declaration.
 */
export declare const visionRoutes: Route[];
|
|
7
|
+
//# sourceMappingURL=routes.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"routes.d.ts","sourceRoot":"","sources":["../src/routes.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,eAAe,CAAC;AAqC3C,iFAAiF;AACjF,eAAO,MAAM,oBAAoB,EAAE,KAWlC,CAAC;AAEF,uEAAuE;AACvE,eAAO,MAAM,gBAAgB,EAAE,KA2C9B,CAAC;AAEF,eAAO,MAAM,YAAY,EAAE,KAAK,EAA6C,CAAC"}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import { type IAgentRuntime, Service } from "@elizaos/core";
|
|
2
|
+
/** Service type used to resolve the bridge off the runtime. */
|
|
3
|
+
export declare const SCREEN_CAPTURE_BRIDGE_SERVICE_TYPE = "vision-screen-capture-bridge";
|
|
4
|
+
/** A single enqueued capture request, drained by the GET poll. */
|
|
5
|
+
export interface ScreenCaptureRequest {
|
|
6
|
+
requestId: string;
|
|
7
|
+
createdAt: number;
|
|
8
|
+
displayId?: number;
|
|
9
|
+
}
|
|
10
|
+
/** Result of a completed capture, returned to `requestFrame` callers. */
|
|
11
|
+
export interface ScreenCaptureFrame {
|
|
12
|
+
pngBytes: Uint8Array;
|
|
13
|
+
displayId: number;
|
|
14
|
+
capturedAt: number;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Renderer-pulled screen-capture bridge service.
|
|
18
|
+
*
|
|
19
|
+
* The agent calls `requestFrame()`; the renderer drains the queue via
|
|
20
|
+
* `takeRequests()` and delivers frames via `submitFrame()`.
|
|
21
|
+
*/
|
|
22
|
+
export declare class ScreenCaptureBridgeService extends Service {
|
|
23
|
+
static serviceType: string;
|
|
24
|
+
capabilityDescription: string;
|
|
25
|
+
private readonly queue;
|
|
26
|
+
private readonly pending;
|
|
27
|
+
private readonly timeoutMs;
|
|
28
|
+
constructor(runtime?: IAgentRuntime, timeoutMs?: number);
|
|
29
|
+
static start(runtime: IAgentRuntime): Promise<ScreenCaptureBridgeService>;
|
|
30
|
+
/**
|
|
31
|
+
* Enqueue a capture request and wait for the renderer to deliver a frame.
|
|
32
|
+
* Resolves `null` if no frame arrives within the timeout (never hangs).
|
|
33
|
+
*/
|
|
34
|
+
requestFrame(displayId?: number): Promise<ScreenCaptureFrame | null>;
|
|
35
|
+
/** Drain and return all queued requests (for the GET poll). */
|
|
36
|
+
takeRequests(): ScreenCaptureRequest[];
|
|
37
|
+
/**
|
|
38
|
+
* Deliver a captured frame for a queued request. Returns false if the
|
|
39
|
+
* requestId is unknown or already expired/resolved.
|
|
40
|
+
*/
|
|
41
|
+
submitFrame(requestId: string, base64: string, _format: string, _width: number, _height: number): boolean;
|
|
42
|
+
/**
|
|
43
|
+
* Resolve a queued request as a skip/failure so the agent's pending promise
|
|
44
|
+
* settles promptly (as `null`) instead of waiting the full timeout. The
|
|
45
|
+
* renderer calls this when a capture throws or is unavailable. Returns false
|
|
46
|
+
* for unknown/expired requestIds.
|
|
47
|
+
*/
|
|
48
|
+
failFrame(requestId: string, reason: string): boolean;
|
|
49
|
+
stop(): Promise<void>;
|
|
50
|
+
}
|
|
51
|
+
//# sourceMappingURL=screen-capture-bridge.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"screen-capture-bridge.d.ts","sourceRoot":"","sources":["../src/screen-capture-bridge.ts"],"names":[],"mappings":"AAWA,OAAO,EAAE,KAAK,aAAa,EAAU,OAAO,EAAE,MAAM,eAAe,CAAC;AAEpE,+DAA+D;AAC/D,eAAO,MAAM,kCAAkC,iCACf,CAAC;AASjC,kEAAkE;AAClE,MAAM,WAAW,oBAAoB;IACnC,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,yEAAyE;AACzE,MAAM,WAAW,kBAAkB;IACjC,QAAQ,EAAE,UAAU,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;CACpB;AAQD;;;;;GAKG;AACH,qBAAa,0BAA2B,SAAQ,OAAO;IACrD,OAAgB,WAAW,EAAE,MAAM,CAAsC;IAChE,qBAAqB,SACyC;IAEvE,OAAO,CAAC,QAAQ,CAAC,KAAK,CAA8B;IACpD,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAqC;IAC7D,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;gBAGjC,OAAO,CAAC,EAAE,aAAa,EACvB,SAAS,GAAE,MAAiC;WAMjC,KAAK,CAChB,OAAO,EAAE,aAAa,GACrB,OAAO,CAAC,0BAA0B,CAAC;IAItC;;;OAGG;IACH,YAAY,CAAC,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,GAAG,IAAI,CAAC;IA2BpE,+DAA+D;IAC/D,YAAY,IAAI,oBAAoB,EAAE;IAItC;;;OAGG;IACH,WAAW,CACT,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,MAAM,GACd,OAAO;IAcV;;;;;OAKG;IACH,SAAS,CAAC,SAAS,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,OAAO;IAY/C,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;CAQ5B"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { ScreenCapture, ScreenTile, VisionConfig } from "./types";
|
|
2
|
+
/**
 * Captures the screen and exposes the resulting tiles.
 * NOTE(review): only the public surface is visible here; capture mechanics and
 * tiling rules live in the implementation.
 */
export declare class ScreenCaptureService {
|
|
3
|
+
private config;
|
|
4
|
+
private activeTileIndex;
|
|
5
|
+
private lastCapture;
|
|
6
|
+
/** @param config Vision configuration (required). */
constructor(config: VisionConfig);
|
|
7
|
+
/** Resolves to the screen's width/height, or `null` (presumably when it cannot be determined — confirm). */
getScreenInfo(): Promise<{
|
|
8
|
+
width: number;
|
|
9
|
+
height: number;
|
|
10
|
+
} | null>;
|
|
11
|
+
/** Captures the screen and resolves to a `ScreenCapture`. */
captureScreen(): Promise<ScreenCapture>;
|
|
12
|
+
// NOTE(review): name suggests the capture goes through a file; signature is erased here.
private captureScreenToFile;
|
|
13
|
+
/** Returns the active tile, or `null` (presumably when there is no capture yet — confirm). */
getActiveTile(): ScreenTile | null;
|
|
14
|
+
/** Returns all tiles. NOTE(review): presumably those of the last capture — confirm. */
getAllTiles(): ScreenTile[];
|
|
15
|
+
/** Returns the processed tiles. NOTE(review): what marks a tile as processed is not visible here. */
getProcessedTiles(): ScreenTile[];
|
|
16
|
+
}
|
|
17
|
+
//# sourceMappingURL=screen-capture.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"screen-capture.d.ts","sourceRoot":"","sources":["../src/screen-capture.ts"],"names":[],"mappings":"AAYA,OAAO,KAAK,EAAE,aAAa,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AAoEvE,qBAAa,oBAAoB;IAC/B,OAAO,CAAC,MAAM,CAAe;IAC7B,OAAO,CAAC,eAAe,CAAK;IAC5B,OAAO,CAAC,WAAW,CAA8B;gBAErC,MAAM,EAAE,YAAY;IAI1B,aAAa,IAAI,OAAO,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,GAAG,IAAI,CAAC;IA0DlE,aAAa,IAAI,OAAO,CAAC,aAAa,CAAC;YAkF/B,mBAAmB;IA+CjC,aAAa,IAAI,UAAU,GAAG,IAAI;IAOlC,WAAW,IAAI,UAAU,EAAE;IAI3B,iBAAiB,IAAI,UAAU,EAAE;CAGlC"}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* One output tile from `tileScreenshot`.
|
|
3
|
+
*
|
|
4
|
+
* - `sourceX/sourceY` are absolute pixel coords in the *source display's*
|
|
5
|
+
* native space (i.e. the same space `displayId` is reported in by
|
|
6
|
+
* `plugin-computeruse/src/platform/displays.ts`).
|
|
7
|
+
* - `tileW/tileH` are the actual rendered dimensions of `pngBytes` and always
|
|
8
|
+
* equal `sourceW/sourceH` (no resize) — the tiler does not downscale; it
|
|
9
|
+
* only crops. Resizing is the model preprocessor's job.
|
|
10
|
+
*/
|
|
11
|
+
export interface ScreenTile {
|
|
12
|
+
/** Stable id of the form `tile-<row>-<col>`. */
|
|
13
|
+
id: string;
|
|
14
|
+
/** Display this tile was sourced from. Stringified to keep types narrow. */
|
|
15
|
+
displayId: string;
|
|
16
|
+
/** Top-left X of the tile in the source display's pixel space. */
|
|
17
|
+
sourceX: number;
|
|
18
|
+
/** Top-left Y of the tile in the source display's pixel space. */
|
|
19
|
+
sourceY: number;
|
|
20
|
+
/** Width of the cropped region in source pixels. */
|
|
21
|
+
sourceW: number;
|
|
22
|
+
/** Height of the cropped region in source pixels. */
|
|
23
|
+
sourceH: number;
|
|
24
|
+
/** Pixel width of `pngBytes`. Equal to `sourceW` (no resize). */
|
|
25
|
+
tileW: number;
|
|
26
|
+
/** Pixel height of `pngBytes`. Equal to `sourceH` (no resize). */
|
|
27
|
+
tileH: number;
|
|
28
|
+
/** PNG-encoded crop. */
|
|
29
|
+
pngBytes: Buffer;
|
|
30
|
+
}
|
|
31
|
+
/** Input to `tileScreenshot`: one captured screenshot to be split into tiles. */
export interface TileScreenshotInput {
|
|
32
|
+
/** Identifier of the display the screenshot came from. */
displayId: string;
|
|
33
|
+
/** Screenshot width. NOTE(review): presumably the pixel width of `pngBytes` — confirm. */
width: number;
|
|
34
|
+
/** Screenshot height. NOTE(review): presumably the pixel height of `pngBytes` — confirm. */
height: number;
|
|
35
|
+
/** PNG-encoded screenshot. */
pngBytes: Buffer;
|
|
36
|
+
}
|
|
37
|
+
export interface TileScreenshotOptions {
|
|
38
|
+
/** Maximum tile edge in pixels. Tiles never exceed this in either dim. */
|
|
39
|
+
maxEdge: number;
|
|
40
|
+
/**
|
|
41
|
+
* Fraction of `tileSize` that adjacent tiles overlap. 0.12 (default) is
|
|
42
|
+
* large enough to keep multi-glyph tokens intact across seams, small enough
|
|
43
|
+
* to keep tile count near minimum.
|
|
44
|
+
*/
|
|
45
|
+
overlapFraction: number;
|
|
46
|
+
}
|
|
47
|
+
/** Default local-VLM tile budget for Gemma vision. */
|
|
48
|
+
export declare const DEFAULT_MAX_EDGE = 1280;
|
|
49
|
+
/** Default seam overlap (12%). */
|
|
50
|
+
export declare const DEFAULT_OVERLAP_FRACTION = 0.12;
|
|
51
|
+
/**
|
|
52
|
+
* Tile a captured screenshot into local-VLM-sized PNG patches with
|
|
53
|
+
* pixel-overlap between neighbours.
|
|
54
|
+
*
|
|
55
|
+
* Single-tile fast path: when both dims fit within `maxEdge`, the input is
|
|
56
|
+
* returned as a single `ScreenTile` whose pngBytes is the unmodified input.
|
|
57
|
+
*
|
|
58
|
+
* Grid path: chooses the smallest grid (cols, rows) such that no individual
|
|
59
|
+
* tile exceeds `maxEdge`, then computes a per-axis stride that yields
|
|
60
|
+
* `overlapFraction * tileSize` of overlap between adjacent tiles. The last
|
|
61
|
+
* column/row is anchored to the source's right/bottom edge so we never
|
|
62
|
+
* extend past the screen.
|
|
63
|
+
*/
|
|
64
|
+
export declare function tileScreenshot(input: TileScreenshotInput, opts?: TileScreenshotOptions): Promise<ScreenTile[]>;
|
|
65
|
+
/**
|
|
66
|
+
* Map a (localX, localY) inside a tile back to the source display's
|
|
67
|
+
* absolute pixel coordinates. Use this to translate "model said click at
|
|
68
|
+
* (x, y) inside tile-0-1" into a coordinate the input driver can act on.
|
|
69
|
+
*/
|
|
70
|
+
export declare function reconstructAbsoluteCoords(tile: ScreenTile, localX: number, localY: number): {
|
|
71
|
+
displayId: string;
|
|
72
|
+
absoluteX: number;
|
|
73
|
+
absoluteY: number;
|
|
74
|
+
};
|
|
75
|
+
//# sourceMappingURL=screen-tiler.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"screen-tiler.d.ts","sourceRoot":"","sources":["../src/screen-tiler.ts"],"names":[],"mappings":"AAoBA;;;;;;;;;GASG;AACH,MAAM,WAAW,UAAU;IACzB,gDAAgD;IAChD,EAAE,EAAE,MAAM,CAAC;IACX,4EAA4E;IAC5E,SAAS,EAAE,MAAM,CAAC;IAClB,kEAAkE;IAClE,OAAO,EAAE,MAAM,CAAC;IAChB,kEAAkE;IAClE,OAAO,EAAE,MAAM,CAAC;IAChB,oDAAoD;IACpD,OAAO,EAAE,MAAM,CAAC;IAChB,qDAAqD;IACrD,OAAO,EAAE,MAAM,CAAC;IAChB,iEAAiE;IACjE,KAAK,EAAE,MAAM,CAAC;IACd,kEAAkE;IAClE,KAAK,EAAE,MAAM,CAAC;IACd,wBAAwB;IACxB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,mBAAmB;IAClC,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,qBAAqB;IACpC,0EAA0E;IAC1E,OAAO,EAAE,MAAM,CAAC;IAChB;;;;OAIG;IACH,eAAe,EAAE,MAAM,CAAC;CACzB;AAED,sDAAsD;AACtD,eAAO,MAAM,gBAAgB,OAAO,CAAC;AACrC,kCAAkC;AAClC,eAAO,MAAM,wBAAwB,OAAO,CAAC;AAE7C;;;;;;;;;;;;GAYG;AACH,wBAAsB,cAAc,CAClC,KAAK,EAAE,mBAAmB,EAC1B,IAAI,GAAE,qBAGL,GACA,OAAO,CAAC,UAAU,EAAE,CAAC,CAuEvB;AAED;;;;GAIG;AACH,wBAAgB,yBAAyB,CACvC,IAAI,EAAE,UAAU,EAChB,MAAM,EAAE,MAAM,EACd,MAAM,EAAE,MAAM,GACb;IAAE,SAAS,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,CAgB7D"}
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
import { type IAgentRuntime, Service, type ServiceTypeName } from "@elizaos/core";
|
|
2
|
+
import { EntityTracker } from "./entity-tracker";
|
|
3
|
+
import type { FaceRecognition } from "./face-recognition-ggml";
|
|
4
|
+
import { type BoundingBox, type CameraInfo, type DetectedObject, type EnhancedSceneDescription, type SceneDescription, type ScreenCapture, type VisionFrame, VisionMode } from "./types";
|
|
5
|
+
export interface VisionContextSnapshot {
|
|
6
|
+
openApps: string[];
|
|
7
|
+
focusedWindow: {
|
|
8
|
+
app: string;
|
|
9
|
+
title: string;
|
|
10
|
+
bbox: [number, number, number, number] | null;
|
|
11
|
+
} | null;
|
|
12
|
+
recentActions: Array<{
|
|
13
|
+
action: string;
|
|
14
|
+
ts: number;
|
|
15
|
+
}>;
|
|
16
|
+
currentTaskGoal: string | null;
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* A face that the ggml face-recognition pipeline matched to a known profile,
|
|
20
|
+
* shaped for the VLM prompt. `label` is the profile's display name when set,
|
|
21
|
+
* otherwise the opaque profile id. `bbox` is the detected face region.
|
|
22
|
+
*/
|
|
23
|
+
export interface RecognizedFace {
|
|
24
|
+
label: string;
|
|
25
|
+
bbox: BoundingBox;
|
|
26
|
+
}
|
|
27
|
+
export declare function buildSceneDescriptionPrompt(context: VisionContextSnapshot | null, ocrText?: string | null, detectedObjects?: DetectedObject[] | null, recognizedFaces?: RecognizedFace[] | null): string;
|
|
28
|
+
export declare class VisionService extends Service {
|
|
29
|
+
static serviceType: ServiceTypeName;
|
|
30
|
+
capabilityDescription: string;
|
|
31
|
+
private visionConfig;
|
|
32
|
+
private camera;
|
|
33
|
+
private lastFrame;
|
|
34
|
+
private lastSceneDescription;
|
|
35
|
+
private frameProcessingInterval;
|
|
36
|
+
private screenProcessingInterval;
|
|
37
|
+
private isProcessing;
|
|
38
|
+
private isProcessingScreen;
|
|
39
|
+
private objectDetector;
|
|
40
|
+
private hasObjectDetection;
|
|
41
|
+
private faceRecognition;
|
|
42
|
+
private entityTracker;
|
|
43
|
+
private audioCapture;
|
|
44
|
+
private streamingAudioCapture;
|
|
45
|
+
private screenCapture;
|
|
46
|
+
private ocrService;
|
|
47
|
+
private lastScreenCapture;
|
|
48
|
+
private lastEnhancedScene;
|
|
49
|
+
private workerManager;
|
|
50
|
+
private lastTfUpdateTime;
|
|
51
|
+
private lastVlmUpdateTime;
|
|
52
|
+
private lastTfDescription;
|
|
53
|
+
private readonly describeBackpressure;
|
|
54
|
+
private arbiterUnsubscribe;
|
|
55
|
+
private dirtyTileDescriber;
|
|
56
|
+
private dirtyTileDescriberInit;
|
|
57
|
+
private dirtyTileDescribeContext;
|
|
58
|
+
private readonly DEFAULT_CONFIG;
|
|
59
|
+
constructor(runtime?: IAgentRuntime);
|
|
60
|
+
private parseConfig;
|
|
61
|
+
static start(runtime: IAgentRuntime): Promise<VisionService>;
|
|
62
|
+
private checkCameraTools;
|
|
63
|
+
private initialize;
|
|
64
|
+
private initializeScreenVision;
|
|
65
|
+
private initializeCameraVision;
|
|
66
|
+
private initializeAudioCapture;
|
|
67
|
+
private storeAudioTranscription;
|
|
68
|
+
private startProcessing;
|
|
69
|
+
/**
|
|
70
|
+
* Subscribe the describe-backpressure controller to WS1 memory-pressure
|
|
71
|
+
* events. Resolves the arbiter dynamically (no hard dependency on
|
|
72
|
+
* `@elizaos/plugin-local-inference`); when none is registered the controller
|
|
73
|
+
* still pauses on the local RSS cap. Idempotent — a prior subscription is
|
|
74
|
+
* released first so a restart doesn't double-subscribe.
|
|
75
|
+
*/
|
|
76
|
+
private attachMemoryArbiter;
|
|
77
|
+
/**
|
|
78
|
+
* Clear the arbiter-driven pause. Called by the WS1 bridge consumer when
|
|
79
|
+
* pressure returns to nominal; also exposed so an embedder can resume the
|
|
80
|
+
* describe loop explicitly.
|
|
81
|
+
*/
|
|
82
|
+
resumeDescribeLoop(): void;
|
|
83
|
+
/** Current describe-backpressure stats (telemetry / tests). */
|
|
84
|
+
getBackpressureStats(): import("./describe-backpressure").DescribeBackpressureStats;
|
|
85
|
+
private startFrameProcessing;
|
|
86
|
+
private captureAndProcessFrame;
|
|
87
|
+
private processFrameData;
|
|
88
|
+
private calculatePixelChange;
|
|
89
|
+
private updateSceneDescription;
|
|
90
|
+
/**
|
|
91
|
+
* Normalize the various shapes that `useModel(IMAGE_DESCRIPTION, …)` may
|
|
92
|
+
* return into a non-empty string. Returns `null` when the result is the
|
|
93
|
+
* "I'm unable to analyze images" sentinel or empty.
|
|
94
|
+
*/
|
|
95
|
+
private extractDescriptionFromUseModel;
|
|
96
|
+
/**
|
|
97
|
+
* Pull the latest desktop scene context from plugin-computeruse's
|
|
98
|
+
* VisionContextProvider when registered. Returns `null` when no provider is
|
|
99
|
+
* available (or when the lookup fails) so the VLM still receives a valid
|
|
100
|
+
* prompt — the context block is purely additive.
|
|
101
|
+
*/
|
|
102
|
+
private collectVisionContext;
|
|
103
|
+
private collectCurrentOcrTextForPrompt;
|
|
104
|
+
/**
|
|
105
|
+
* Resolve the change-gated per-tile describer, building it once. Returns
|
|
106
|
+
* `null` (and degrades to full-frame describe) when no perceptual hash is
|
|
107
|
+
* available — i.e. plugin-computeruse's `frameDhash` cannot be imported.
|
|
108
|
+
*
|
|
109
|
+
* The dHash is resolved via a best-effort dynamic import so plugin-vision
|
|
110
|
+
* never eagerly pulls computeruse's module graph at boot (same idiom as the
|
|
111
|
+
* coord-OCR bridge wiring in `index.ts`).
|
|
112
|
+
*/
|
|
113
|
+
private ensureDirtyTileDescriber;
|
|
114
|
+
private resolveFrameHash;
|
|
115
|
+
/**
|
|
116
|
+
* Build the per-tile describe call bound to this service's runtime. Reads the
|
|
117
|
+
* current frame's prompt context (`dirtyTileDescribeContext`) so each tile is
|
|
118
|
+
* described with the same context the full-frame path would use.
|
|
119
|
+
*/
|
|
120
|
+
private buildTileDescribeFn;
|
|
121
|
+
/**
|
|
122
|
+
* Per-tile incremental scene describe. Re-describes only the tiles whose
|
|
123
|
+
* perceptual hash changed since the previous frame; unchanged tiles reuse
|
|
124
|
+
* their cached description. Returns `null` when the per-tile path is
|
|
125
|
+
* unavailable or yields no usable text, so the caller falls back to the
|
|
126
|
+
* full-frame describe.
|
|
127
|
+
*/
|
|
128
|
+
private describeSceneWithDirtyTiles;
|
|
129
|
+
private describeSceneWithVLM;
|
|
130
|
+
private describeSceneWithVLMInTrajectory;
|
|
131
|
+
private detectMotionObjects;
|
|
132
|
+
private mergeAdjacentObjects;
|
|
133
|
+
private classifyObjectBySize;
|
|
134
|
+
private detectPeopleFromMotion;
|
|
135
|
+
private startScreenProcessing;
|
|
136
|
+
private captureAndProcessScreen;
|
|
137
|
+
private analyzeTile;
|
|
138
|
+
private updateEnhancedSceneDescription;
|
|
139
|
+
getCurrentFrame(): Promise<VisionFrame | null>;
|
|
140
|
+
getSceneDescription(): Promise<SceneDescription | null>;
|
|
141
|
+
getEnhancedSceneDescription(): Promise<EnhancedSceneDescription | null>;
|
|
142
|
+
getScreenCapture(): Promise<ScreenCapture | null>;
|
|
143
|
+
getVisionMode(): VisionMode;
|
|
144
|
+
/**
|
|
145
|
+
* Enable the camera input. If screen is already active, switches to BOTH;
|
|
146
|
+
* otherwise to CAMERA.
|
|
147
|
+
*/
|
|
148
|
+
enableCamera(): Promise<void>;
|
|
149
|
+
/**
|
|
150
|
+
* Disable the camera input. Keeps screen capture if active; otherwise OFF.
|
|
151
|
+
*/
|
|
152
|
+
disableCamera(): Promise<void>;
|
|
153
|
+
/**
|
|
154
|
+
* Enable screen capture. If displayIds are passed, the first id wins as
|
|
155
|
+
* the `displayIndex` (multi-display capture is still single-display
|
|
156
|
+
* upstream).
|
|
157
|
+
*/
|
|
158
|
+
enableScreen(displayIds?: number[]): Promise<void>;
|
|
159
|
+
/**
|
|
160
|
+
* Disable screen capture. Keeps camera if active; otherwise OFF.
|
|
161
|
+
*/
|
|
162
|
+
disableScreen(): Promise<void>;
|
|
163
|
+
setVisionMode(mode: VisionMode): Promise<void>;
|
|
164
|
+
private stopProcessing;
|
|
165
|
+
getCameraInfo(): CameraInfo | null;
|
|
166
|
+
isActive(): boolean;
|
|
167
|
+
private calculateBoxOverlap;
|
|
168
|
+
getEntityTracker(): EntityTracker;
|
|
169
|
+
getFaceRecognition(): Promise<FaceRecognition>;
|
|
170
|
+
stop(): Promise<void>;
|
|
171
|
+
private findCamera;
|
|
172
|
+
private listCameras;
|
|
173
|
+
private createCameraDevice;
|
|
174
|
+
captureImage(): Promise<Buffer | null>;
|
|
175
|
+
}
|
|
176
|
+
//# sourceMappingURL=service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"service.d.ts","sourceRoot":"","sources":["../src/service.ts"],"names":[],"mappings":"AAMA,OAAO,EACL,KAAK,aAAa,EAGlB,OAAO,EACP,KAAK,eAAe,EAErB,MAAM,eAAe,CAAC;AAiBvB,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAU/D,OAAO,EACL,KAAK,WAAW,EAChB,KAAK,UAAU,EACf,KAAK,cAAc,EACnB,KAAK,wBAAwB,EAE7B,KAAK,gBAAgB,EACrB,KAAK,aAAa,EAIlB,KAAK,WAAW,EAChB,UAAU,EAEX,MAAM,SAAS,CAAC;AAmBjB,MAAM,WAAW,qBAAqB;IACpC,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,aAAa,EAAE;QACb,GAAG,EAAE,MAAM,CAAC;QACZ,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,GAAG,IAAI,CAAC;KAC/C,GAAG,IAAI,CAAC;IACT,aAAa,EAAE,KAAK,CAAC;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,EAAE,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACrD,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;CAChC;AAWD;;;;GAIG;AACH,MAAM,WAAW,cAAc;IAC7B,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,WAAW,CAAC;CACnB;AAgCD,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,EACvB,eAAe,CAAC,EAAE,cAAc,EAAE,GAAG,IAAI,EACzC,eAAe,CAAC,EAAE,cAAc,EAAE,GAAG,IAAI,GACxC,MAAM,CAaR;AA6DD,qBAAa,aAAc,SAAQ,OAAO;IACxC,OAAgB,WAAW,EAAE,eAAe,CAA4B;IAC/D,qBAAqB,SACgD;IAE9E,OAAO,CAAC,YAAY,CAAe;IACnC,OAAO,CAAC,MAAM,CAA6B;IAC3C,OAAO,CAAC,SAAS,CAA4B;IAC7C,OAAO,CAAC,oBAAoB,CAAiC;IAC7D,OAAO,CAAC,uBAAuB,CAA+B;IAC9D,OAAO,CAAC,wBAAwB,CAA+B;IAC/D,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,kBAAkB,CAAS;IACnC,OAAO,CAAC,cAAc,CAA6B;IACnD,OAAO,CAAC,kBAAkB,CAAS;IAInC,OAAO,CAAC,eAAe,CAAgC;IACvD,OAAO,CAAC,aAAa,CAAgB;IACrC,OAAO,CAAC,YAAY,CAAoC;IACxD,OAAO,CAAC,qBAAqB,CAA6C;IAG1E,OAAO,CAAC,aAAa,CAAuB;IAC5C,OAAO,CAAC,UAAU,CAAa;IAC/B,OAAO,CAAC,iBAAiB,CAA8B;IACvD,OAAO,CAAC,iBAAiB,CAAyC;IAGlE,OAAO,CAAC,aAAa,CAAoC;IAGzD,OAAO,CAAC,gBAAgB,CAAK;IAC7B,OAAO,CAAC,iBAAiB,CAAK;IAC9B,OAAO,CAAC,iBAAiB,CAAM;IAM/B,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAiC;IACtE,OAAO,CAAC,kBAAkB,CAA6B;IAMvD,OAAO,CAAC,kBAAkB,CAAmC;IAC7D,OAAO,CAAC,sBAAsB,CAAS;IAIvC,OAAO,CAAC,wBAAwB,CAGiB;IAGjD,OAAO,CAAC,QAAQ,CAAC,cAAc,CAc7B;gBAEU,OAAO,CAAC,EAAE,aAAa;IAyCnC,OAAO,CAAC,WAAW;WA0EN,KAAK,CAAC,OAAO,E
AAE,aAAa,GAAG,OAAO,CAAC,aAAa,CAAC;YAMpD,gBAAgB;YA2BhB,UAAU;YA0DV,sBAAsB;YAyCtB,sBAAsB;YAgCtB,sBAAsB;YAmHtB,uBAAuB;IAmBrC,OAAO,CAAC,eAAe;IAuBvB;;;;;;OAMG;IACH,OAAO,CAAC,mBAAmB;IAoB3B;;;;OAIG;IACI,kBAAkB,IAAI,IAAI;IAIjC,+DAA+D;IACxD,oBAAoB;IAI3B,OAAO,CAAC,oBAAoB;YAoBd,sBAAsB;YAuCtB,gBAAgB;YA+BhB,oBAAoB;YA+BpB,sBAAsB;IAyWpC;;;;OAIG;IACH,OAAO,CAAC,8BAA8B;IAsBtC;;;;;OAKG;YACW,oBAAoB;IAclC,OAAO,CAAC,8BAA8B;IAatC;;;;;;;;OAQG;YACW,wBAAwB;YAuBxB,gBAAgB;IAoB9B;;;;OAIG;IACH,OAAO,CAAC,mBAAmB;IAsB3B;;;;;;OAMG;YACW,2BAA2B;YAwB3B,oBAAoB;YAoBpB,gCAAgC;YAoFhC,mBAAmB;IA2EjC,OAAO,CAAC,oBAAoB;IA6E5B,OAAO,CAAC,oBAAoB;YAgBd,sBAAsB;IA6CpC,OAAO,CAAC,qBAAqB;YAoBf,uBAAuB;YAoBvB,WAAW;YAkBX,8BAA8B;IAkD/B,eAAe,IAAI,OAAO,CAAC,WAAW,GAAG,IAAI,CAAC;IAI9C,mBAAmB,IAAI,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC;IAIvD,2BAA2B,IAAI,OAAO,CAAC,wBAAwB,GAAG,IAAI,CAAC;IAUvE,gBAAgB,IAAI,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC;IAIvD,aAAa,IAAI,UAAU;IAIlC;;;OAGG;IACU,YAAY,IAAI,OAAO,CAAC,IAAI,CAAC;IAS1C;;OAEG;IACU,aAAa,IAAI,OAAO,CAAC,IAAI,CAAC;IAS3C;;;;OAIG;IACU,YAAY,CAAC,UAAU,CAAC,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAa/D;;OAEG;IACU,aAAa,IAAI,OAAO,CAAC,IAAI,CAAC;IAS9B,aAAa,CAAC,IAAI,EAAE,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAoC3D,OAAO,CAAC,cAAc;IAiBf,aAAa,IAAI,UAAU,GAAG,IAAI;IAYlC,QAAQ,IAAI,OAAO;IAK1B,OAAO,CAAC,mBAAmB;IAmBpB,gBAAgB,IAAI,aAAa;IAI3B,kBAAkB,IAAI,OAAO,CAAC,eAAe,CAAC;IAQrD,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;YA0Cb,UAAU;YAkCV,WAAW;IAyEzB,OAAO,CAAC,kBAAkB;IAkFb,YAAY,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC;CAapD"}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bridge plugin-vision's Set-of-Marks fusion into plugin-computeruse's
|
|
3
|
+
* `SetOfMarksProvider` registry seam (#9170 M9).
|
|
4
|
+
*
|
|
5
|
+
* Mirrors `computeruse-ocr-bridge.ts`: plugin-vision owns the GGUF YOLO icon
|
|
6
|
+
* detector and the OCR engines; plugin-computeruse exposes a registration slot
|
|
7
|
+
* and consumes whatever is registered from `detect_elements`, with NO hard
|
|
8
|
+
* dependency on plugin-vision. The provider is built here and registered at
|
|
9
|
+
* boot via a best-effort dynamic import (see `index.ts`).
|
|
10
|
+
*
|
|
11
|
+
* Pure + injectable: the YOLO detector and OCR resolver are passed in, so the
|
|
12
|
+
* fusion wiring is unit-testable with fakes and degrades gracefully when the
|
|
13
|
+
* GGUF detector or OCR engine is unavailable (icons or text simply absent).
|
|
14
|
+
*/
|
|
15
|
+
import { getOcrWithCoordsService } from "./ocr-with-coords.js";
|
|
16
|
+
import { type DetectedObjectLike, type SetOfMarksOptions, type SomMark } from "./som.js";
|
|
17
|
+
/** Structural shape of computeruse's `SetOfMarksInput`. */
|
|
18
|
+
export interface SetOfMarksInputLike {
|
|
19
|
+
readonly displayId: string;
|
|
20
|
+
readonly sourceX: number;
|
|
21
|
+
readonly sourceY: number;
|
|
22
|
+
readonly pngBytes: Uint8Array;
|
|
23
|
+
readonly renderOverlay?: boolean;
|
|
24
|
+
}
|
|
25
|
+
/** Structural shape of computeruse's `SetOfMarksResult`. */
|
|
26
|
+
export interface SetOfMarksResultLike {
|
|
27
|
+
readonly marks: ReadonlyArray<SomMark>;
|
|
28
|
+
readonly overlayPngBase64?: string;
|
|
29
|
+
}
|
|
30
|
+
/** Structural shape of computeruse's `SetOfMarksProvider`. */
|
|
31
|
+
export interface SetOfMarksProviderLike {
|
|
32
|
+
readonly name: string;
|
|
33
|
+
describe(input: SetOfMarksInputLike): Promise<SetOfMarksResultLike>;
|
|
34
|
+
}
|
|
35
|
+
export type RegisterSetOfMarksProvider = (provider: SetOfMarksProviderLike | null) => void;
|
|
36
|
+
export declare const VISION_SET_OF_MARKS_BRIDGE_NAME = "vision-set-of-marks-bridge";
|
|
37
|
+
export interface SetOfMarksProviderDeps {
|
|
38
|
+
/**
|
|
39
|
+
* Detect icon-ish boxes from PNG bytes (GGUF YOLO). Returns `[]` when the
|
|
40
|
+
* detector is unavailable — Set-of-Marks then falls back to text-only marks.
|
|
41
|
+
*/
|
|
42
|
+
readonly detectIcons?: (pngBytes: Uint8Array) => Promise<DetectedObjectLike[]>;
|
|
43
|
+
/** Resolve the OCR-with-coords service (defaults to the registered one). */
|
|
44
|
+
readonly resolveOcr?: typeof getOcrWithCoordsService;
|
|
45
|
+
/** Fusion tuning forwarded to `buildSetOfMarks`. */
|
|
46
|
+
readonly options?: SetOfMarksOptions;
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Lazily-instantiated default GGUF YOLO icon detector. Best-effort: if the
|
|
50
|
+
* native bindings or GGUF weights are missing, every call resolves to `[]` so
|
|
51
|
+
* Set-of-Marks degrades to OCR-only text marks instead of throwing.
|
|
52
|
+
*/
|
|
53
|
+
export declare function createDefaultIconDetector(): (pngBytes: Uint8Array) => Promise<DetectedObjectLike[]>;
|
|
54
|
+
/**
|
|
55
|
+
* Build a `SetOfMarksProvider`-shaped bridge that fuses GGUF YOLO icon
|
|
56
|
+
* detections + OCR text blocks into a numbered mark set (and optional overlay).
|
|
57
|
+
*/
|
|
58
|
+
export declare function buildVisionSetOfMarksProvider(deps?: SetOfMarksProviderDeps): SetOfMarksProviderLike;
|
|
59
|
+
/**
|
|
60
|
+
* Register the vision Set-of-Marks bridge into computeruse's seam. Idempotent
|
|
61
|
+
* (last-call-wins). Returns true once registered.
|
|
62
|
+
*/
|
|
63
|
+
export declare function wireComputerUseSetOfMarksBridge(register: RegisterSetOfMarksProvider, deps?: SetOfMarksProviderDeps): boolean;
|
|
64
|
+
//# sourceMappingURL=set-of-marks-provider.d.ts.map
|