@elizaos/plugin-vision 2.0.0-beta.1 → 2.0.3-beta.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +73 -301
- package/dist/action.d.ts +3 -0
- package/dist/action.d.ts.map +1 -0
- package/dist/audio-capture-stream.d.ts +42 -0
- package/dist/audio-capture-stream.d.ts.map +1 -0
- package/dist/audio-capture.d.ts +25 -0
- package/dist/audio-capture.d.ts.map +1 -0
- package/dist/computeruse-ocr-bridge.d.ts +50 -0
- package/dist/computeruse-ocr-bridge.d.ts.map +1 -0
- package/dist/config.d.ts +68 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/describe-backpressure.d.ts +90 -0
- package/dist/describe-backpressure.d.ts.map +1 -0
- package/dist/dirty-tile-describer.d.ts +102 -0
- package/dist/dirty-tile-describer.d.ts.map +1 -0
- package/dist/dirty-tile-scene.d.ts +56 -0
- package/dist/dirty-tile-scene.d.ts.map +1 -0
- package/dist/entity-tracker.d.ts +33 -0
- package/dist/entity-tracker.d.ts.map +1 -0
- package/dist/face-detector-ggml.d.ts +60 -0
- package/dist/face-detector-ggml.d.ts.map +1 -0
- package/dist/face-detector-mediapipe.d.ts +25 -0
- package/dist/face-detector-mediapipe.d.ts.map +1 -0
- package/dist/face-recognition-ggml.d.ts +94 -0
- package/dist/face-recognition-ggml.d.ts.map +1 -0
- package/dist/get-screen-elements.d.ts +90 -0
- package/dist/get-screen-elements.d.ts.map +1 -0
- package/dist/get-screen.d.ts +60 -0
- package/dist/get-screen.d.ts.map +1 -0
- package/dist/image/sharp-compat.d.ts +89 -0
- package/dist/image/sharp-compat.d.ts.map +1 -0
- package/dist/image-input.d.ts +15 -0
- package/dist/image-input.d.ts.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +7957 -6238
- package/dist/index.js.map +41 -26
- package/dist/lifecycle.d.ts +94 -0
- package/dist/lifecycle.d.ts.map +1 -0
- package/dist/mobile/capacitor-camera.d.ts +85 -0
- package/dist/mobile/capacitor-camera.d.ts.map +1 -0
- package/dist/native/doctr-ffi.d.ts +40 -0
- package/dist/native/doctr-ffi.d.ts.map +1 -0
- package/dist/native/yolo-ffi.d.ts +21 -0
- package/dist/native/yolo-ffi.d.ts.map +1 -0
- package/dist/ocr-host-windows.d.ts +34 -0
- package/dist/ocr-host-windows.d.ts.map +1 -0
- package/dist/ocr-service-apple-vision-macos.d.ts +51 -0
- package/dist/ocr-service-apple-vision-macos.d.ts.map +1 -0
- package/dist/ocr-service-doctr.d.ts +61 -0
- package/dist/ocr-service-doctr.d.ts.map +1 -0
- package/dist/ocr-service-linux-tesseract.d.ts +85 -0
- package/dist/ocr-service-linux-tesseract.d.ts.map +1 -0
- package/dist/ocr-service-paddleocr.d.ts +59 -0
- package/dist/ocr-service-paddleocr.d.ts.map +1 -0
- package/dist/ocr-service-windows.d.ts +41 -0
- package/dist/ocr-service-windows.d.ts.map +1 -0
- package/dist/ocr-service.d.ts +91 -0
- package/dist/ocr-service.d.ts.map +1 -0
- package/dist/ocr-with-coords.d.ts +103 -0
- package/dist/ocr-with-coords.d.ts.map +1 -0
- package/dist/person-detector.d.ts +17 -0
- package/dist/person-detector.d.ts.map +1 -0
- package/dist/provider.d.ts +3 -0
- package/dist/provider.d.ts.map +1 -0
- package/dist/routes.d.ts +7 -0
- package/dist/routes.d.ts.map +1 -0
- package/dist/screen-capture-bridge.d.ts +51 -0
- package/dist/screen-capture-bridge.d.ts.map +1 -0
- package/dist/screen-capture.d.ts +17 -0
- package/dist/screen-capture.d.ts.map +1 -0
- package/dist/screen-tiler.d.ts +75 -0
- package/dist/screen-tiler.d.ts.map +1 -0
- package/dist/service.d.ts +176 -0
- package/dist/service.d.ts.map +1 -0
- package/dist/set-of-marks-provider.d.ts +64 -0
- package/dist/set-of-marks-provider.d.ts.map +1 -0
- package/dist/som.d.ts +135 -0
- package/dist/som.d.ts.map +1 -0
- package/dist/som.js +184 -0
- package/dist/som.js.map +11 -0
- package/dist/test-input.d.ts +25 -0
- package/dist/test-input.d.ts.map +1 -0
- package/dist/types.d.ts +241 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/vision-context-augmenter.d.ts +93 -0
- package/dist/vision-context-augmenter.d.ts.map +1 -0
- package/dist/vision-worker-manager.d.ts +51 -0
- package/dist/vision-worker-manager.d.ts.map +1 -0
- package/dist/workers/ocr-worker.d.ts +2 -0
- package/dist/workers/ocr-worker.d.ts.map +1 -0
- package/dist/workers/ocr-worker.js +1075 -7821
- package/dist/workers/ocr-worker.js.map +10 -51
- package/dist/workers/screen-capture-worker.d.ts +2 -0
- package/dist/workers/screen-capture-worker.d.ts.map +1 -0
- package/dist/workers/screen-capture-worker.js +364 -6
- package/dist/workers/screen-capture-worker.js.map +5 -4
- package/dist/workers/worker-logger.d.ts +10 -0
- package/dist/workers/worker-logger.d.ts.map +1 -0
- package/dist/yolo-detector.d.ts +37 -0
- package/dist/yolo-detector.d.ts.map +1 -0
- package/native/doctr.cpp/CMakeLists.txt +58 -0
- package/native/doctr.cpp/README.md +62 -0
- package/native/doctr.cpp/include/doctr.h +91 -0
- package/native/doctr.cpp/scripts/convert.py +98 -0
- package/native/doctr.cpp/src/doctr_det.cpp +112 -0
- package/native/doctr.cpp/src/doctr_rec.cpp +103 -0
- package/native/macos-vision-ocr.swift +113 -0
- package/native/mobilefacenet.cpp/README.md +13 -0
- package/native/movenet.cpp/README.md +10 -0
- package/native/retinaface.cpp/README.md +12 -0
- package/native/yolo.cpp/CMakeLists.txt +57 -0
- package/native/yolo.cpp/README.md +64 -0
- package/native/yolo.cpp/build.mjs +76 -0
- package/native/yolo.cpp/include/yolo.h +62 -0
- package/native/yolo.cpp/scripts/convert.py +248 -0
- package/native/yolo.cpp/src/yolo.cpp +425 -0
- package/native/yolo.cpp/verify/compare.py +99 -0
- package/native/yolo.cpp/verify/make_ref.py +75 -0
- package/native/yolo.cpp/verify/run_ggml.mjs +78 -0
- package/native/yolo.cpp/verify/run_ts.mjs +26 -0
- package/package.json +39 -21
- package/registry-entry.json +43 -0
- package/scripts/vendor-tesseract-linux.mjs +177 -0
- package/build.config.ts +0 -89
- package/dist/workers/florence2-worker.js +0 -779
- package/dist/workers/florence2-worker.js.map +0 -13
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
/** Memory-pressure level passed to `DescribeBackpressureController.setPressure`; any level other than `"nominal"` opens (or extends) the describe pause window, while `"nominal"` clears it. */
export type MemoryPressureLevel = "nominal" | "low" | "critical";
|
|
2
|
+
/** Cause of a describe pause, reported as `DescribeBackpressureDecision.reason`: the arbiter pressure signal or the local RSS growth cap. NOTE(review): `null` presumably means "not paused" — confirm against the implementation. */
export type DescribePauseReason = "arbiter-pressure" | "memory-cap" | null;
|
|
3
|
+
/** Snapshot of the controller's counters and pause state, as returned by `DescribeBackpressureController.stats()`. */
export interface DescribeBackpressureStats {
|
|
4
|
+
/** True while the describe step is currently being skipped. */
|
|
5
|
+
paused: boolean;
|
|
6
|
+
/** Last arbiter pressure level applied via `setPressure`. */
|
|
7
|
+
pressureLevel: MemoryPressureLevel;
|
|
8
|
+
/** Describe ticks skipped because of backpressure since construction. */
|
|
9
|
+
describesSkipped: number;
|
|
10
|
+
/** Count of paused<->active edges (telemetry / test signal). */
|
|
11
|
+
pauseTransitions: number;
|
|
12
|
+
/** RSS, in bytes, captured on the first describe tick and used as the local cap baseline. */
|
|
13
|
+
memoryBaselineBytes: number | null;
|
|
14
|
+
/** Latest sampled RSS growth over the captured baseline, in bytes. */
|
|
15
|
+
memoryGrowthBytes: number | null;
|
|
16
|
+
}
|
|
17
|
+
/** Outcome of a single `DescribeBackpressureController.evaluate()` call. */
export interface DescribeBackpressureDecision {
|
|
18
|
+
/** Run the expensive describe this tick? */
|
|
19
|
+
describe: boolean;
|
|
20
|
+
/** `"paused"`/`"active"` when this call flipped the state, else `null`. */
|
|
21
|
+
transitionedTo: "paused" | "active" | null;
|
|
22
|
+
/** Why we are paused (only meaningful when `describe === false`). */
|
|
23
|
+
reason: DescribePauseReason;
|
|
24
|
+
/** How long the current continuous pause has lasted, in ms. */
|
|
25
|
+
pausedForMs: number;
|
|
26
|
+
/** True when the caller should emit a throttled long-pause warning. */
|
|
27
|
+
warnPaused: boolean;
|
|
28
|
+
}
|
|
29
|
+
/** Optional settings accepted by the `DescribeBackpressureController` constructor. */
export interface DescribeBackpressureConfig {
|
|
30
|
+
/**
|
|
31
|
+
* RSS growth cap in bytes. The first describe tick captures the process RSS
|
|
32
|
+
* baseline; while sampled RSS exceeds `baseline + memoryCapBytes`, the
|
|
33
|
+
* describe step pauses. `0` or negative disables the local check — only the
|
|
34
|
+
* arbiter signal can pause describing.
|
|
35
|
+
*/
|
|
36
|
+
memoryCapBytes?: number;
|
|
37
|
+
/**
|
|
38
|
+
* RSS sampler; defaults to `process.memoryUsage().rss`. Injected by tests so
|
|
39
|
+
* the cap can be exercised deterministically without allocating memory.
|
|
40
|
+
*/
|
|
41
|
+
sampleRssBytes?: () => number;
|
|
42
|
+
/**
|
|
43
|
+
* How long a single arbiter pressure signal keeps the loop paused, in ms.
|
|
44
|
+
* Because the WS1 bridge delivers pressure but not recovery, the pause
|
|
45
|
+
* auto-clears after this window of silence. Default 15_000.
|
|
46
|
+
*/
|
|
47
|
+
arbiterPauseCooldownMs?: number;
|
|
48
|
+
/** Continuous pause duration, in ms, before a warning is requested. Default 60_000 (60 s). */
|
|
49
|
+
pauseWarningThresholdMs?: number;
|
|
50
|
+
/** Minimum interval, in ms, between repeated long-pause warnings. Default 60_000 (60 s). */
|
|
51
|
+
pauseWarningIntervalMs?: number;
|
|
52
|
+
/** Clock, injectable for tests. Defaults to `Date.now`. */
|
|
53
|
+
now?: () => number;
|
|
54
|
+
}
|
|
55
|
+
/**
 * Gates the expensive describe step: `evaluate()` reports whether a describe
 * may run this tick, pausing it while an arbiter pressure signal is active
 * (`setPressure`) or while the local RSS growth cap
 * (`DescribeBackpressureConfig.memoryCapBytes`) is exceeded.
 */
export declare class DescribeBackpressureController {
|
|
56
|
+
private readonly memoryCapBytes;
|
|
57
|
+
private readonly sampleRssBytes;
|
|
58
|
+
private readonly arbiterPauseCooldownMs;
|
|
59
|
+
private readonly pauseWarningThresholdMs;
|
|
60
|
+
private readonly pauseWarningIntervalMs;
|
|
61
|
+
private readonly now;
|
|
62
|
+
private pressureLevel;
|
|
63
|
+
private pauseUntilMs;
|
|
64
|
+
private paused;
|
|
65
|
+
private describesSkipped;
|
|
66
|
+
private pauseTransitions;
|
|
67
|
+
private memoryBaselineBytes;
|
|
68
|
+
private latestMemoryGrowthBytes;
|
|
69
|
+
private pauseStartedAtMs;
|
|
70
|
+
private lastPauseWarningAtMs;
|
|
71
|
+
/** @param config Optional overrides; see `DescribeBackpressureConfig` for the fields and their documented defaults. */
constructor(config?: DescribeBackpressureConfig);
|
|
72
|
+
/**
|
|
73
|
+
* Apply an arbiter memory-pressure level. A non-nominal level opens (or
|
|
74
|
+
* extends) the cooldown pause window; `nominal` clears it immediately (only
|
|
75
|
+
* arbiters that actually report recovery do this — the WS1 bridge relies on
|
|
76
|
+
* the cooldown instead).
|
|
77
|
+
*/
|
|
78
|
+
setPressure(level: MemoryPressureLevel): void;
|
|
79
|
+
/**
|
|
80
|
+
* Decide whether the expensive describe step may run this tick. Call ONLY
|
|
81
|
+
* when a describe would otherwise happen (the change/time gate already
|
|
82
|
+
* passed), so the skip counter reflects real avoided work. Has side effects:
|
|
83
|
+
* updates the skip counter and the pause/resume transition state. The
|
|
84
|
+
* arbiter signal takes precedence over the local cap when both are active so
|
|
85
|
+
* the reported `reason` is the more authoritative one.
|
|
86
|
+
*/
|
|
87
|
+
evaluate(): DescribeBackpressureDecision;
|
|
88
|
+
/** Returns the counters and pause state described by `DescribeBackpressureStats`. */
stats(): DescribeBackpressureStats;
|
|
89
|
+
}
|
|
90
|
+
//# sourceMappingURL=describe-backpressure.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"describe-backpressure.d.ts","sourceRoot":"","sources":["../src/describe-backpressure.ts"],"names":[],"mappings":"AAmCA,MAAM,MAAM,mBAAmB,GAAG,SAAS,GAAG,KAAK,GAAG,UAAU,CAAC;AAEjE,MAAM,MAAM,mBAAmB,GAAG,kBAAkB,GAAG,YAAY,GAAG,IAAI,CAAC;AAE3E,MAAM,WAAW,yBAAyB;IACxC,+DAA+D;IAC/D,MAAM,EAAE,OAAO,CAAC;IAChB,6DAA6D;IAC7D,aAAa,EAAE,mBAAmB,CAAC;IACnC,yEAAyE;IACzE,gBAAgB,EAAE,MAAM,CAAC;IACzB,gEAAgE;IAChE,gBAAgB,EAAE,MAAM,CAAC;IACzB,+EAA+E;IAC/E,mBAAmB,EAAE,MAAM,GAAG,IAAI,CAAC;IACnC,4DAA4D;IAC5D,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;CAClC;AAED,MAAM,WAAW,4BAA4B;IAC3C,4CAA4C;IAC5C,QAAQ,EAAE,OAAO,CAAC;IAClB,2EAA2E;IAC3E,cAAc,EAAE,QAAQ,GAAG,QAAQ,GAAG,IAAI,CAAC;IAC3C,qEAAqE;IACrE,MAAM,EAAE,mBAAmB,CAAC;IAC5B,+DAA+D;IAC/D,WAAW,EAAE,MAAM,CAAC;IACpB,uEAAuE;IACvE,UAAU,EAAE,OAAO,CAAC;CACrB;AAED,MAAM,WAAW,0BAA0B;IACzC;;;;;OAKG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,MAAM,CAAC;IAC9B;;;;OAIG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;IAChC,4EAA4E;IAC5E,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC,0EAA0E;IAC1E,sBAAsB,CAAC,EAAE,MAAM,CAAC;IAChC,2DAA2D;IAC3D,GAAG,CAAC,EAAE,MAAM,MAAM,CAAC;CACpB;AAMD,qBAAa,8BAA8B;IACzC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAS;IACxC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAe;IAC9C,OAAO,CAAC,QAAQ,CAAC,sBAAsB,CAAS;IAChD,OAAO,CAAC,QAAQ,CAAC,uBAAuB,CAAS;IACjD,OAAO,CAAC,QAAQ,CAAC,sBAAsB,CAAS;IAChD,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAe;IACnC,OAAO,CAAC,aAAa,CAAkC;IACvD,OAAO,CAAC,YAAY,CAAK;IACzB,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,gBAAgB,CAAK;IAC7B,OAAO,CAAC,gBAAgB,CAAK;IAC7B,OAAO,CAAC,mBAAmB,CAAuB;IAClD,OAAO,CAAC,uBAAuB,CAAuB;IACtD,OAAO,CAAC,gBAAgB,CAAuB;IAC/C,OAAO,CAAC,oBAAoB,CAAK;gBAErB,MAAM,GAAE,0BAA+B;IAyBnD;;;;;OAKG;IACH,WAAW,CAAC,KAAK,EAAE,mBAAmB,GAAG,IAAI;IAS7C;;;;;;;OAOG;IACH,QAAQ,IAAI,4BAA4B;IA2DxC,KAAK,IAAI,yBAAyB;CAUnC"}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DirtyTileDescriber — change-gated, per-tile screen description (#9105 M3).
|
|
3
|
+
*
|
|
4
|
+
* The dominant token cost in a CUA loop is re-describing a whole screen to a
|
|
5
|
+
* VLM every step even when almost nothing moved. The Brain already skips the
|
|
6
|
+
* describe entirely when the *whole frame* dHash is unchanged
|
|
7
|
+
* (`plugin-computeruse` `Brain` frame-dHash cache). This describer is the
|
|
8
|
+
* finer-grained tier: it splits a frame into tiles (via `screen-tiler.ts`),
|
|
9
|
+
* computes a per-tile perceptual hash, and only (re)describes tiles whose hash
|
|
10
|
+
* changed since the last frame — every unchanged tile reuses its cached
|
|
11
|
+
* description. So a single text field flipping characters re-describes one tile,
|
|
12
|
+
* not the entire screen.
|
|
13
|
+
*
|
|
14
|
+
* The describer is pure + injectable: the tile hash (`hashTile`) and the
|
|
15
|
+
* per-tile describe call (`describeTile`) are supplied by the caller. The real
|
|
16
|
+
* boot wiring injects `plugin-computeruse`'s `frameDhash` (the existing
|
|
17
|
+
* `scene/dhash.ts`) and a `runtime.useModel(IMAGE_DESCRIPTION)`-backed describe;
|
|
18
|
+
* tests inject deterministic fakes with no model and no native dHash. The
|
|
19
|
+
* counters (`describeCallsSaved`, `approxTokensSaved`) make the saving
|
|
20
|
+
* measurable so a test can assert it.
|
|
21
|
+
*/
|
|
22
|
+
import type { ScreenTile } from "./screen-tiler.js";
|
|
23
|
+
/** Approximate image tokens charged for one tile describe — used only for the saved-tokens estimate. NOTE(review): `DirtyTileDescriberDeps.approxTokensPerTile` presumably overrides this per describer — confirm. */
|
|
24
|
+
export declare const APPROX_TOKENS_PER_TILE = 256;
|
|
25
|
+
/** A described tile: its source rectangle plus the VLM/OCR text for it. */
|
|
26
|
+
export interface DescribedTile {
|
|
27
|
+
/** Tiler id, e.g. `tile-1-0`. */
|
|
28
|
+
id: string;
|
|
29
|
+
/** NOTE(review): typed as a string here, but `DirtyTileElement.displayId` and the `DirtyTileDescriber.describe()` input both use `number` — confirm the mismatch is intended. */
displayId: string;
|
|
30
|
+
/** Top-left of the tile in source display pixel space. */
|
|
31
|
+
sourceX: number;
|
|
32
|
+
sourceY: number;
|
|
33
|
+
/** Tile width and height; presumably in the same source display pixel space as `sourceX`/`sourceY` — TODO confirm. */
sourceW: number;
|
|
34
|
+
sourceH: number;
|
|
35
|
+
/** The description text for this tile. */
|
|
36
|
+
description: string;
|
|
37
|
+
/** True when this tile's description was reused from cache (no describe call). */
|
|
38
|
+
cached: boolean;
|
|
39
|
+
}
|
|
40
|
+
/** Result of one `DirtyTileDescriber.describe()` call: the per-tile entries plus the composed scene text and element projection. */
export interface DirtyTileDescription {
|
|
41
|
+
/** One entry per tile, in tiler order. */
|
|
42
|
+
tiles: DescribedTile[];
|
|
43
|
+
/** Composed full-frame description (non-empty tile texts, source-order). */
|
|
44
|
+
vlmScene: string;
|
|
45
|
+
/** Per-tile elements suitable for `Scene.vlm_elements`. */
|
|
46
|
+
elements: DirtyTileElement[];
|
|
47
|
+
}
|
|
48
|
+
/** A described tile projected into the `Scene.vlm_elements` shape. */
|
|
49
|
+
export interface DirtyTileElement {
|
|
50
|
+
id: string;
|
|
51
|
+
kind: string;
|
|
52
|
+
/** NOTE(review): presumably the tile's description text (`DescribedTile.description`) — confirm. */
desc: string;
|
|
53
|
+
/** Display-local `[x, y, w, h]` of the tile. */
|
|
54
|
+
bbox: [number, number, number, number];
|
|
55
|
+
/** Numeric display id; note that `DescribedTile.displayId` is typed as a string. */
displayId: number;
|
|
56
|
+
}
|
|
57
|
+
/** Token-accounting snapshot for a describer, as returned by `DirtyTileDescriber.getStats()`. */
|
|
58
|
+
export interface DirtyTileStats {
|
|
59
|
+
/** Tiles actually sent to the describe call. */
|
|
60
|
+
tilesDescribed: number;
|
|
61
|
+
/** Tiles served from the per-tile cache (no describe call). */
|
|
62
|
+
tilesSkipped: number;
|
|
63
|
+
/** Describe calls avoided by the cache (== tilesSkipped). */
|
|
64
|
+
describeCallsSaved: number;
|
|
65
|
+
/** Approximate image tokens avoided (tilesSkipped × APPROX_TOKENS_PER_TILE). NOTE(review): presumably uses `approxTokensPerTile` when the deps supply it — confirm. */
|
|
66
|
+
approxTokensSaved: number;
|
|
67
|
+
}
|
|
68
|
+
/** Collaborators and options passed to the `DirtyTileDescriber` constructor: the tile hash, the per-tile describe call, and optional tiling/accounting settings. */
export interface DirtyTileDescriberDeps {
|
|
69
|
+
/**
|
|
70
|
+
* Perceptual hash of a tile PNG. Identical pixels MUST hash equal. The boot
|
|
71
|
+
* wiring passes `plugin-computeruse`'s `frameDhash`; `null` means "could not
|
|
72
|
+
* hash" and forces a (re)describe for that tile.
|
|
73
|
+
*/
|
|
74
|
+
hashTile: (png: Buffer) => bigint | null;
|
|
75
|
+
/** Describe one tile's pixels. Only called for changed/new tiles. */
|
|
76
|
+
describeTile: (tile: ScreenTile) => Promise<string>;
|
|
77
|
+
/** Tiling options (`maxEdge`, `overlapFraction`) forwarded to `tileScreenshot`. */
|
|
78
|
+
maxEdge?: number;
|
|
79
|
+
overlapFraction?: number;
|
|
80
|
+
/** Tokens charged per describe, for the saved-tokens estimate. */
|
|
81
|
+
approxTokensPerTile?: number;
|
|
82
|
+
}
|
|
83
|
+
/**
 * Change-gated per-tile describer: keeps a per-tile `{ hash, description }`
 * cache from the previous frame and re-describes only the tiles whose hash
 * changed, reusing the cached description for the rest.
 */
export declare class DirtyTileDescriber {
|
|
84
|
+
private readonly deps;
|
|
85
|
+
/** tileId → { hash, description } from the previous frame. */
|
|
86
|
+
private readonly cache;
|
|
87
|
+
private stats;
|
|
88
|
+
/** @param deps Hash/describe collaborators and optional settings; see `DirtyTileDescriberDeps`. */
constructor(deps: DirtyTileDescriberDeps);
|
|
89
|
+
/** Returns the token-accounting counters described by `DirtyTileStats`. */
getStats(): DirtyTileStats;
|
|
90
|
+
/**
|
|
91
|
+
* Describe a frame, re-describing only tiles whose hash changed since the
|
|
92
|
+
* previous call. Unchanged tiles reuse their cached description.
|
|
93
|
+
*
* @param input Frame to describe. NOTE(review): `pngBytes` is presumably the
* full-frame PNG and `width`/`height` its pixel size — confirm.
* @returns Per-tile entries, composed scene text and elements; see `DirtyTileDescription`.
*/
|
|
94
|
+
describe(input: {
|
|
95
|
+
displayId: number;
|
|
96
|
+
width: number;
|
|
97
|
+
height: number;
|
|
98
|
+
pngBytes: Buffer;
|
|
99
|
+
}): Promise<DirtyTileDescription>;
|
|
100
|
+
private toDescribed;
|
|
101
|
+
}
|
|
102
|
+
//# sourceMappingURL=dirty-tile-describer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dirty-tile-describer.d.ts","sourceRoot":"","sources":["../src/dirty-tile-describer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAGpD,mGAAmG;AACnG,eAAO,MAAM,sBAAsB,MAAM,CAAC;AAE1C,2EAA2E;AAC3E,MAAM,WAAW,aAAa;IAC5B,iCAAiC;IACjC,EAAE,EAAE,MAAM,CAAC;IACX,SAAS,EAAE,MAAM,CAAC;IAClB,0DAA0D;IAC1D,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,0CAA0C;IAC1C,WAAW,EAAE,MAAM,CAAC;IACpB,kFAAkF;IAClF,MAAM,EAAE,OAAO,CAAC;CACjB;AAED,MAAM,WAAW,oBAAoB;IACnC,0CAA0C;IAC1C,KAAK,EAAE,aAAa,EAAE,CAAC;IACvB,4EAA4E;IAC5E,QAAQ,EAAE,MAAM,CAAC;IACjB,2DAA2D;IAC3D,QAAQ,EAAE,gBAAgB,EAAE,CAAC;CAC9B;AAED,sEAAsE;AACtE,MAAM,WAAW,gBAAgB;IAC/B,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,gDAAgD;IAChD,IAAI,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;IACvC,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,iDAAiD;AACjD,MAAM,WAAW,cAAc;IAC7B,gDAAgD;IAChD,cAAc,EAAE,MAAM,CAAC;IACvB,+DAA+D;IAC/D,YAAY,EAAE,MAAM,CAAC;IACrB,6DAA6D;IAC7D,kBAAkB,EAAE,MAAM,CAAC;IAC3B,2EAA2E;IAC3E,iBAAiB,EAAE,MAAM,CAAC;CAC3B;AAED,MAAM,WAAW,sBAAsB;IACrC;;;;OAIG;IACH,QAAQ,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,MAAM,GAAG,IAAI,CAAC;IACzC,qEAAqE;IACrE,YAAY,EAAE,CAAC,IAAI,EAAE,UAAU,KAAK,OAAO,CAAC,MAAM,CAAC,CAAC;IACpD,oDAAoD;IACpD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,kEAAkE;IAClE,mBAAmB,CAAC,EAAE,MAAM,CAAC;CAC9B;AAED,qBAAa,kBAAkB;IAajB,OAAO,CAAC,QAAQ,CAAC,IAAI;IAZjC,8DAA8D;IAC9D,OAAO,CAAC,QAAQ,CAAC,KAAK,CAGlB;IACJ,OAAO,CAAC,KAAK,CAKX;gBAE2B,IAAI,EAAE,sBAAsB;IAEzD,QAAQ,IAAI,cAAc;IAI1B;;;OAGG;IACG,QAAQ,CAAC,KAAK,EAAE;QACpB,SAAS,EAAE,MAAM,CAAC;QAClB,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;QACf,QAAQ,EAAE,MAAM,CAAC;KAClB,GAAG,OAAO,CAAC,oBAAoB,CAAC;IAqEjC,OAAO,CAAC,WAAW;CAgBpB"}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Wiring seam for the change-gated per-tile scene describe (#9105 efficiency).
|
|
3
|
+
*
|
|
4
|
+
* `DirtyTileDescriber` (dirty-tile-describer.ts) is pure + injectable: it owns
|
|
5
|
+
* the per-tile hash cache and the "only re-describe changed tiles" loop, but it
|
|
6
|
+
* does not know how to hash a tile or how to ask the VLM to describe one. This
|
|
7
|
+
* module supplies those two collaborators from the live runtime so
|
|
8
|
+
* `VisionService` can build a describer that re-describes only the screen
|
|
9
|
+
* regions that actually changed since the previous frame instead of paying for
|
|
10
|
+
* a whole-frame VLM pass every scene tick.
|
|
11
|
+
*
|
|
12
|
+
* Two collaborators:
|
|
13
|
+
* - `hashTile`: a perceptual hash. We reuse plugin-computeruse's `frameDhash`
|
|
14
|
+
* (the same dHash the Brain frame-cache uses), resolved via a best-effort
|
|
15
|
+
* dynamic import so plugin-vision never eagerly pulls computeruse's module
|
|
16
|
+
* graph at boot — exactly the idiom the OCR bridge already uses. When
|
|
17
|
+
* computeruse is absent the resolve returns `null` and the caller degrades
|
|
18
|
+
* to the existing full-frame describe.
|
|
19
|
+
* - `describeTile`: one `runtime.useModel(IMAGE_DESCRIPTION, …)` call per
|
|
20
|
+
* changed tile, built from a caller-supplied prompt + result normalizer so
|
|
21
|
+
* the per-tile path reuses the same prompt plumbing as the full-frame path.
|
|
22
|
+
*/
|
|
23
|
+
import type { ScreenTile } from "./screen-tiler.js";
|
|
24
|
+
/** Perceptual hash of a PNG buffer. Identical pixels MUST hash equal; returns `null` when the PNG is undecodable. */
|
|
25
|
+
export type FrameHash = (png: Buffer) => bigint | null;
|
|
26
|
+
/** Per-tile describe call, as built by `createTileDescribeFn`. Resolves to the model's description text for one tile. */
|
|
27
|
+
export type TileDescribeFn = (tile: ScreenTile) => Promise<string>;
|
|
28
|
+
/** Collaborators consumed by `createTileDescribeFn`: image-URL builder, prompt builder, model invoker and result normalizer. */
export interface TileDescribeDeps {
|
|
29
|
+
/**
|
|
30
|
+
* Build the per-tile image URL the VLM is asked to describe. The tile carries
|
|
31
|
+
* PNG bytes (`tile.pngBytes`), so this is a `data:image/png;base64,…` URL.
|
|
32
|
+
*/
|
|
33
|
+
buildTileImageUrl: (tile: ScreenTile) => string;
|
|
34
|
+
/**
|
|
35
|
+
* Build the per-tile prompt. Receives the tile so callers can include bounds.
|
|
36
|
+
* Async because the scene context is pulled from a peer provider per call.
|
|
37
|
+
*/
|
|
38
|
+
buildTilePrompt: (tile: ScreenTile) => Promise<string>;
|
|
39
|
+
/** Invoke the IMAGE_DESCRIPTION model and return its raw result. */
|
|
40
|
+
invokeModel: (imageUrl: string, prompt: string) => Promise<unknown>;
|
|
41
|
+
/**
|
|
42
|
+
* Normalize a model result into a description string, or `null` when the
|
|
43
|
+
* result is unusable (sentinel / empty). A `null` result yields an empty tile
|
|
44
|
+
* description, which the describer treats as "nothing to compose for this
|
|
45
|
+
* tile" while still caching the (empty) result against the tile hash.
|
|
46
|
+
*/
|
|
47
|
+
extractDescription: (result: unknown) => string | null;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Build a `describeTile` function bound to the runtime's IMAGE_DESCRIPTION
|
|
51
|
+
* model. The describer calls this only for tiles whose hash changed.
|
|
52
|
+
*
* @param deps Image-URL builder, prompt builder, model invoker and result
* normalizer; see `TileDescribeDeps`.
* @returns A `TileDescribeFn` that resolves to the description text for one tile.
*/
|
|
53
|
+
export declare function createTileDescribeFn(deps: TileDescribeDeps): TileDescribeFn;
|
|
54
|
+
/** Encode a tile's PNG bytes (`tile.pngBytes`) into a base64 `data:image/png;base64,…` URL for the VLM. */
|
|
55
|
+
export declare function tilePngToImageUrl(tile: ScreenTile): string;
|
|
56
|
+
//# sourceMappingURL=dirty-tile-scene.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dirty-tile-scene.d.ts","sourceRoot":"","sources":["../src/dirty-tile-scene.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAEpD,mFAAmF;AACnF,MAAM,MAAM,SAAS,GAAG,CAAC,GAAG,EAAE,MAAM,KAAK,MAAM,GAAG,IAAI,CAAC;AAEvD,iFAAiF;AACjF,MAAM,MAAM,cAAc,GAAG,CAAC,IAAI,EAAE,UAAU,KAAK,OAAO,CAAC,MAAM,CAAC,CAAC;AAEnE,MAAM,WAAW,gBAAgB;IAC/B;;;OAGG;IACH,iBAAiB,EAAE,CAAC,IAAI,EAAE,UAAU,KAAK,MAAM,CAAC;IAChD;;;OAGG;IACH,eAAe,EAAE,CAAC,IAAI,EAAE,UAAU,KAAK,OAAO,CAAC,MAAM,CAAC,CAAC;IACvD,oEAAoE;IACpE,WAAW,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC;IACpE;;;;;OAKG;IACH,kBAAkB,EAAE,CAAC,MAAM,EAAE,OAAO,KAAK,MAAM,GAAG,IAAI,CAAC;CACxD;AAED;;;GAGG;AACH,wBAAgB,oBAAoB,CAAC,IAAI,EAAE,gBAAgB,GAAG,cAAc,CAO3E;AAED,oEAAoE;AACpE,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,UAAU,GAAG,MAAM,CAE1D"}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { type IAgentRuntime } from "@elizaos/core";
|
|
2
|
+
import type { DetectedObject, PersonInfo, TrackedEntity, WorldState } from "./types";
|
|
3
|
+
/**
 * NOTE(review): this declaration ships without documentation; the notes below
 * are read off the signatures only. `updateEntities` takes the detected
 * objects and people for a frame (plus an optional person-id → face-profile-id
 * map and an optional runtime) and resolves to `TrackedEntity[]`; the other
 * public methods read the tracked state, except `assignNameToEntity`. The
 * meaning and units of the private `*_THRESHOLD` fields are not visible here.
 */
export declare class EntityTracker {
|
|
4
|
+
private worldState;
|
|
5
|
+
private readonly POSITION_THRESHOLD;
|
|
6
|
+
private readonly MISSING_THRESHOLD;
|
|
7
|
+
private readonly CLEANUP_THRESHOLD;
|
|
8
|
+
constructor(worldId: string);
|
|
9
|
+
updateEntities(detectedObjects: DetectedObject[], people: PersonInfo[], faceProfiles?: Map<string, string>, // Maps person ID to face profile ID
|
|
10
|
+
runtime?: IAgentRuntime): Promise<TrackedEntity[]>;
|
|
11
|
+
private trackPerson;
|
|
12
|
+
private trackObject;
|
|
13
|
+
private findMatchingEntity;
|
|
14
|
+
private calculateDistance;
|
|
15
|
+
private updateWorldState;
|
|
16
|
+
private syncWithRuntime;
|
|
17
|
+
getWorldState(): WorldState;
|
|
18
|
+
getActiveEntities(): TrackedEntity[];
|
|
19
|
+
/** Returns the entity for `entityId`, or `undefined` (presumably when the id is not tracked — confirm). */
getEntity(entityId: string): TrackedEntity | undefined;
|
|
20
|
+
/** NOTE(review): `leftAt` is presumably a timestamp; its unit/epoch is not visible here — confirm. */
getRecentlyLeft(): Array<{
|
|
21
|
+
entity: TrackedEntity;
|
|
22
|
+
leftAt: number;
|
|
23
|
+
}>;
|
|
24
|
+
/** NOTE(review): the boolean presumably reports whether `entityId` was found and renamed — confirm. */
assignNameToEntity(entityId: string, name: string): boolean;
|
|
25
|
+
getStatistics(): {
|
|
26
|
+
totalEntities: number;
|
|
27
|
+
activeEntities: number;
|
|
28
|
+
recentlyLeft: number;
|
|
29
|
+
people: number;
|
|
30
|
+
objects: number;
|
|
31
|
+
};
|
|
32
|
+
}
|
|
33
|
+
//# sourceMappingURL=entity-tracker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"entity-tracker.d.ts","sourceRoot":"","sources":["../src/entity-tracker.ts"],"names":[],"mappings":"AAAA,OAAO,EAAoB,KAAK,aAAa,EAAU,MAAM,eAAe,CAAC;AAC7E,OAAO,KAAK,EAEV,cAAc,EACd,UAAU,EACV,aAAa,EACb,UAAU,EACX,MAAM,SAAS,CAAC;AAEjB,qBAAa,aAAa;IACxB,OAAO,CAAC,UAAU,CAAa;IAC/B,OAAO,CAAC,QAAQ,CAAC,kBAAkB,CAAO;IAC1C,OAAO,CAAC,QAAQ,CAAC,iBAAiB,CAAQ;IAC1C,OAAO,CAAC,QAAQ,CAAC,iBAAiB,CAAS;gBAE/B,OAAO,EAAE,MAAM;IAUrB,cAAc,CAClB,eAAe,EAAE,cAAc,EAAE,EACjC,MAAM,EAAE,UAAU,EAAE,EACpB,YAAY,CAAC,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,oCAAoC;IACxE,OAAO,CAAC,EAAE,aAAa,GACtB,OAAO,CAAC,aAAa,EAAE,CAAC;YAoCb,WAAW;YA8DX,WAAW;IAsDzB,OAAO,CAAC,kBAAkB;IA2C1B,OAAO,CAAC,iBAAiB;IAezB,OAAO,CAAC,gBAAgB;YAuCV,eAAe;IAqD7B,aAAa,IAAI,UAAU;IAI3B,iBAAiB,IAAI,aAAa,EAAE;IAMpC,SAAS,CAAC,QAAQ,EAAE,MAAM,GAAG,aAAa,GAAG,SAAS;IAItD,eAAe,IAAI,KAAK,CAAC;QAAE,MAAM,EAAE,aAAa,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;IAanE,kBAAkB,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO;IAa3D,aAAa,IAAI;QACf,aAAa,EAAE,MAAM,CAAC;QACtB,cAAc,EAAE,MAAM,CAAC;QACvB,YAAY,EAAE,MAAM,CAAC;QACrB,MAAM,EAAE,MAAM,CAAC;QACf,OAAO,EAAE,MAAM,CAAC;KACjB;CAUF"}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import type { BoundingBox } from "./types";
|
|
2
|
+
/**
|
|
3
|
+
* Same shape the removed ONNX detector exported. Kept identical so callers can
|
|
4
|
+
* select the native backend without reshaping results.
|
|
5
|
+
*
* The same five optional fields are declared again as `MediaPipeFaceConfig` in
* `face-detector-mediapipe.d.ts`. NOTE(review): the meaning and defaults of
* the individual fields are not documented in the declarations — confirm
* against the implementation.
*/
|
|
6
|
+
export interface MediaPipeFaceConfig {
|
|
7
|
+
modelUrl?: string;
|
|
8
|
+
modelSha256?: string | null;
|
|
9
|
+
modelDir?: string;
|
|
10
|
+
scoreThreshold?: number;
|
|
11
|
+
trusted?: boolean;
|
|
12
|
+
}
|
|
13
|
+
/** One detected face, as returned by `BlazeFaceGgmlDetector.detect()`: bounding box, confidence and optional keypoints. */
export interface MediaPipeFaceDetection {
|
|
14
|
+
bbox: BoundingBox;
|
|
15
|
+
confidence: number;
|
|
16
|
+
/**
|
|
17
|
+
* BlazeFace's 6 keypoints in canonical order:
|
|
18
|
+
* 0: left eye 1: right eye 2: nose tip
|
|
19
|
+
* 3: mouth 4: left ear 5: right ear
|
|
20
|
+
* Coordinates are in source-image absolute pixels.
|
|
21
|
+
*/
|
|
22
|
+
keypoints?: Array<{
|
|
23
|
+
x: number;
|
|
24
|
+
y: number;
|
|
25
|
+
}>;
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* ggml-backed BlazeFace face detector. Mirrors the
|
|
29
|
+
* `MediaPipeFaceDetector` compatibility surface — same constructor config,
|
|
30
|
+
* same `MediaPipeFaceDetection` output shape.
|
|
31
|
+
*
|
|
32
|
+
* Currently disabled (`isAvailable()` returns `false`) until the
|
|
33
|
+
* face-cpp model entries gain runtime implementations and a BlazeFace GGUF
|
|
34
|
+
* artifact lands.
|
|
35
|
+
*/
|
|
36
|
+
export declare class BlazeFaceGgmlDetector {
|
|
37
|
+
private readonly cfg;
|
|
38
|
+
private bindings;
|
|
39
|
+
private handle;
|
|
40
|
+
private initialized;
|
|
41
|
+
private initPromise;
|
|
42
|
+
constructor(config?: MediaPipeFaceConfig);
|
|
43
|
+
/**
|
|
44
|
+
* `true` only when both the native library AND the GGUF weights are
|
|
45
|
+
* on disk. Loading them happens lazily in `initialize()`.
|
|
46
|
+
*
* NOTE(review): the class comment states that `isAvailable()` currently
* returns `false`; that and the sentence above cannot both describe the
* shipped behavior — confirm against the implementation which one is stale.
*/
|
|
47
|
+
static isAvailable(): Promise<boolean>;
|
|
48
|
+
isInitialized(): boolean;
|
|
49
|
+
/** Performs the lazy load of the native library and GGUF weights referred to by `isAvailable()`. */
initialize(): Promise<void>;
|
|
50
|
+
private _initialize;
|
|
51
|
+
/**
|
|
52
|
+
* Detect faces in the given image buffer. The buffer can be any
|
|
53
|
+
* sharp-supported format (PNG, JPEG, raw); we resize/letterbox to
|
|
54
|
+
* the BlazeFace 128x128 input via sharp, run the native detector,
|
|
55
|
+
* then return source-pixel bboxes + 6 keypoints.
|
|
56
|
+
*
* @param imageBuffer Encoded or raw image in a sharp-supported format.
* @returns One `MediaPipeFaceDetection` per detected face.
*/
|
|
57
|
+
detect(imageBuffer: Buffer): Promise<MediaPipeFaceDetection[]>;
|
|
58
|
+
/** NOTE(review): presumably releases the native handle/bindings — confirm. */
dispose(): Promise<void>;
|
|
59
|
+
}
|
|
60
|
+
//# sourceMappingURL=face-detector-ggml.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"face-detector-ggml.d.ts","sourceRoot":"","sources":["../src/face-detector-ggml.ts"],"names":[],"mappings":"AAwBA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AAO3C;;;GAGG;AACH,MAAM,WAAW,mBAAmB;IAClC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAED,MAAM,WAAW,sBAAsB;IACrC,IAAI,EAAE,WAAW,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB;;;;;OAKG;IACH,SAAS,CAAC,EAAE,KAAK,CAAC;QAAE,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;CAC7C;AA4MD;;;;;;;;GAQG;AACH,qBAAa,qBAAqB;IAChC,OAAO,CAAC,QAAQ,CAAC,GAAG,CAGlB;IACF,OAAO,CAAC,QAAQ,CAAmC;IACnD,OAAO,CAAC,MAAM,CAAiB;IAC/B,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,WAAW,CAA8B;gBAErC,MAAM,GAAE,mBAAwB;IAQ5C;;;OAGG;WACU,WAAW,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB5C,aAAa,IAAI,OAAO;IAIlB,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;YAOnB,WAAW;IAsBzB;;;;;OAKG;IACG,MAAM,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,sBAAsB,EAAE,CAAC;IAsD9D,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAS/B"}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import type { BoundingBox } from "./types";
|
|
2
|
+
/** Constructor options for `MediaPipeFaceDetector`. Declares the same five optional fields as the `MediaPipeFaceConfig` in `face-detector-ggml.d.ts`. */
export interface MediaPipeFaceConfig {
|
|
3
|
+
modelUrl?: string;
|
|
4
|
+
modelSha256?: string | null;
|
|
5
|
+
modelDir?: string;
|
|
6
|
+
scoreThreshold?: number;
|
|
7
|
+
trusted?: boolean;
|
|
8
|
+
}
|
|
9
|
+
/** One detected face, as returned by `MediaPipeFaceDetector.detect()`. Same fields as the `MediaPipeFaceDetection` in `face-detector-ggml.d.ts`, which documents the keypoint order. */
export interface MediaPipeFaceDetection {
|
|
10
|
+
bbox: BoundingBox;
|
|
11
|
+
confidence: number;
|
|
12
|
+
keypoints?: Array<{
|
|
13
|
+
x: number;
|
|
14
|
+
y: number;
|
|
15
|
+
}>;
|
|
16
|
+
}
|
|
17
|
+
/**
 * Face detector exposing the same public surface as `BlazeFaceGgmlDetector`
 * (`isAvailable`, `isInitialized`, `initialize`, `detect`, `dispose`).
 * NOTE(review): the `_`-prefixed parameter names (`_config`, `_imageBuffer`)
 * suggest the arguments are unused, i.e. this may be a stub kept for
 * compatibility — confirm against the implementation.
 */
export declare class MediaPipeFaceDetector {
|
|
18
|
+
constructor(_config?: MediaPipeFaceConfig);
|
|
19
|
+
static isAvailable(): Promise<boolean>;
|
|
20
|
+
isInitialized(): boolean;
|
|
21
|
+
initialize(): Promise<void>;
|
|
22
|
+
detect(_imageBuffer: Buffer): Promise<MediaPipeFaceDetection[]>;
|
|
23
|
+
dispose(): Promise<void>;
|
|
24
|
+
}
|
|
25
|
+
//# sourceMappingURL=face-detector-mediapipe.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"face-detector-mediapipe.d.ts","sourceRoot":"","sources":["../src/face-detector-mediapipe.ts"],"names":[],"mappings":"AAWA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AAE3C,MAAM,WAAW,mBAAmB;IAClC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAED,MAAM,WAAW,sBAAsB;IACrC,IAAI,EAAE,WAAW,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,KAAK,CAAC;QAAE,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;CAC7C;AAED,qBAAa,qBAAqB;gBAEpB,OAAO,GAAE,mBAAwB;WAIhC,WAAW,IAAI,OAAO,CAAC,OAAO,CAAC;IAI5C,aAAa,IAAI,OAAO;IAIlB,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAM3B,MAAM,CAAC,YAAY,EAAE,MAAM,GAAG,OAAO,CAAC,sBAAsB,EAAE,CAAC;IAM/D,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAG/B"}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { type MediaPipeFaceDetection } from "./face-detector-ggml";
|
|
2
|
+
import type { BoundingBox, FaceProfile } from "./types";
|
|
3
|
+
/**
|
|
4
|
+
* Configuration for the ggml-backed face embedder.
|
|
5
|
+
*/
|
|
6
|
+
export interface FaceEmbedGgmlConfig {
|
|
7
|
+
modelPath?: string;
|
|
8
|
+
modelDir?: string;
|
|
9
|
+
}
|
|
10
|
+
/**
|
|
11
|
+
* Cosine distance between two 128-d unit-norm embeddings. Matches
|
|
12
|
+
* `face_embed_distance` in the C library: 0 for identical, 1 for
|
|
13
|
+
* orthogonal, 2 for antipodal.
|
|
14
|
+
*/
|
|
15
|
+
export declare function cosineDistance(a: Float32Array, b: Float32Array): number;
|
|
16
|
+
/**
|
|
17
|
+
* L2 distance between two 128-d embeddings. For unit-norm inputs this
|
|
18
|
+
* is sqrt(2 - 2*dot(a, b)), in [0, 2].
|
|
19
|
+
*/
|
|
20
|
+
export declare function l2Distance(a: Float32Array, b: Float32Array): number;
|
|
21
|
+
/**
|
|
22
|
+
* ggml-backed 128-d face embedder: a 128-d L2-normalized descriptor per
|
|
23
|
+
* detected face, consumed by the `FaceRecognition` class below.
|
|
24
|
+
*/
|
|
25
|
+
export declare class FaceEmbedGgmlRecognizer {
|
|
26
|
+
private readonly cfg;
|
|
27
|
+
private bindings;
|
|
28
|
+
private handle;
|
|
29
|
+
private initialized;
|
|
30
|
+
private initPromise;
|
|
31
|
+
constructor(config?: FaceEmbedGgmlConfig);
|
|
32
|
+
/**
|
|
33
|
+
* `true` only when both the native library AND the GGUF weights are
|
|
34
|
+
* on disk.
|
|
35
|
+
*/
|
|
36
|
+
static isAvailable(): Promise<boolean>;
|
|
37
|
+
isInitialized(): boolean;
|
|
38
|
+
initialize(): Promise<void>;
|
|
39
|
+
private _initialize;
|
|
40
|
+
/**
|
|
41
|
+
* Compute a 128-d L2-normalized face embedding from an RGB(A) image
|
|
42
|
+
* buffer plus a detection record (bbox + BlazeFace landmarks).
|
|
43
|
+
*
|
|
44
|
+
* The image is decoded via sharp; pass any sharp-supported format
|
|
45
|
+
* (PNG, JPEG, raw). `detection` should come from
|
|
46
|
+
* `BlazeFaceGgmlDetector` so the keypoints already match the
|
|
47
|
+
* BlazeFace order.
|
|
48
|
+
*/
|
|
49
|
+
embed(imageBuffer: Buffer, detection: MediaPipeFaceDetection): Promise<Float32Array>;
|
|
50
|
+
dispose(): Promise<void>;
|
|
51
|
+
}
|
|
52
|
+
/**
|
|
53
|
+
* A detected face: a native BlazeFace detection plus its 128-d ggml
|
|
54
|
+
* embedding. Mirrors the fields `VisionService` reads off each result.
|
|
55
|
+
* The native backend produces no expression / age-gender estimates, so
|
|
56
|
+
* those attributes are left to higher layers.
|
|
57
|
+
*/
|
|
58
|
+
export interface DetectedFace {
|
|
59
|
+
detection: {
|
|
60
|
+
box: BoundingBox;
|
|
61
|
+
};
|
|
62
|
+
descriptor: Float32Array;
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* Native ggml face recognition: BlazeFace detection + 128-d embedding +
|
|
66
|
+
* in-memory matching and persistence. When the native `libface` library
|
|
67
|
+
* or its GGUF weights are not on disk, detection returns an empty list
|
|
68
|
+
* (recognition is disabled, never faked). Matching and storage are pure
|
|
69
|
+
* JS and always available.
|
|
70
|
+
*/
|
|
71
|
+
export declare class FaceRecognition {
|
|
72
|
+
private readonly detector;
|
|
73
|
+
private readonly embedder;
|
|
74
|
+
private detectorAvailable;
|
|
75
|
+
private readonly faceLibrary;
|
|
76
|
+
private readonly FACE_MATCH_THRESHOLD;
|
|
77
|
+
private readonly MIN_FACE_SIZE;
|
|
78
|
+
/**
|
|
79
|
+
* Detect faces in a raw RGBA frame and compute an embedding for each.
|
|
80
|
+
* Returns an empty list when the native face backend is unavailable.
|
|
81
|
+
*/
|
|
82
|
+
detectFaces(imageData: Buffer, width: number, height: number): Promise<DetectedFace[]>;
|
|
83
|
+
recognizeFace(descriptor: Float32Array): Promise<{
|
|
84
|
+
profileId: string;
|
|
85
|
+
distance: number;
|
|
86
|
+
} | null>;
|
|
87
|
+
addOrUpdateFace(descriptor: Float32Array, attributes?: Partial<FaceProfile>): Promise<string>;
|
|
88
|
+
getFaceProfile(profileId: string): FaceProfile | undefined;
|
|
89
|
+
getAllProfiles(): FaceProfile[];
|
|
90
|
+
saveFaceLibrary(filePath: string): Promise<void>;
|
|
91
|
+
loadFaceLibrary(filePath: string): Promise<void>;
|
|
92
|
+
dispose(): Promise<void>;
|
|
93
|
+
}
|
|
94
|
+
//# sourceMappingURL=face-recognition-ggml.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"face-recognition-ggml.d.ts","sourceRoot":"","sources":["../src/face-recognition-ggml.ts"],"names":[],"mappings":"AAyBA,OAAO,EAEL,KAAK,sBAAsB,EAC5B,MAAM,sBAAsB,CAAC;AAE9B,OAAO,KAAK,EAAE,WAAW,EAAe,WAAW,EAAE,MAAM,SAAS,CAAC;AAkMrE;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED;;;;GAIG;AACH,wBAAgB,cAAc,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,GAAG,MAAM,CASvE;AAED;;;GAGG;AACH,wBAAgB,UAAU,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,GAAG,MAAM,CAUnE;AAED;;;GAGG;AACH,qBAAa,uBAAuB;IAClC,OAAO,CAAC,QAAQ,CAAC,GAAG,CAA6C;IACjE,OAAO,CAAC,QAAQ,CAAkC;IAClD,OAAO,CAAC,MAAM,CAAiB;IAC/B,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,WAAW,CAA8B;gBAErC,MAAM,GAAE,mBAAwB;IAO5C;;;OAGG;WACU,WAAW,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB5C,aAAa,IAAI,OAAO;IAIlB,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;YAOnB,WAAW;IAoBzB;;;;;;;;OAQG;IACG,KAAK,CACT,WAAW,EAAE,MAAM,EACnB,SAAS,EAAE,sBAAsB,GAChC,OAAO,CAAC,YAAY,CAAC;IAsBlB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAS/B;AAID;;;;;GAKG;AACH,MAAM,WAAW,YAAY;IAC3B,SAAS,EAAE;QAAE,GAAG,EAAE,WAAW,CAAA;KAAE,CAAC;IAChC,UAAU,EAAE,YAAY,CAAC;CAC1B;AAED;;;;;;GAMG;AACH,qBAAa,eAAe;IAC1B,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAA+B;IACxD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAiC;IAC1D,OAAO,CAAC,iBAAiB,CAAwB;IACjD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAG1B;IAGF,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAO;IAE5C,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAM;IAEpC;;;OAGG;IACG,WAAW,CACf,SAAS,EAAE,MAAM,EACjB,KAAK,EAAE,MAAM,EACb,MAAM,EAAE,MAAM,GACb,OAAO,CAAC,YAAY,EAAE,CAAC;IA+CpB,aAAa,CACjB,UAAU,EAAE,YAAY,GACvB,OAAO,CAAC;QAAE,SAAS,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,GAAG,IAAI,CAAC;IAiBpD,eAAe,CACnB,UAAU,EAAE,YAAY,EACxB,UAAU,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,GAChC,OAAO,CAAC,MAAM,CAAC;IAgDlB,cAAc,CAAC,SAAS,EAAE,MAAM,GAAG,WAAW,GAAG,SAAS;IAI1D,cAAc,IAAI,WAAW,EAAE;IAIzB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAShD,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAahD,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAI/B"}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure element-merge core for the GET_SCREEN action (#9105 Slice 2 / M2).
|
|
3
|
+
*
|
|
4
|
+
* GET_SCREEN returns a cheap, token-frugal list of grounded, clickable screen
|
|
5
|
+
* elements unified from three sources: OCR text boxes, accessibility (AX)
|
|
6
|
+
* clickables, and (optionally) VLM-detected elements. This module is the
|
|
7
|
+
* deterministic heart of that envelope — it collapses the three sources into a
|
|
8
|
+
* single deduplicated, stably-ordered element list, recording each element's
|
|
9
|
+
* `groundingSources` provenance.
|
|
10
|
+
*
|
|
11
|
+
* Like the M1 OCR bridge (`computeruse-ocr-bridge.ts`), this is intentionally
|
|
12
|
+
* dependency-free and pure: the source types live in `@elizaos/plugin-computeruse`
|
|
13
|
+
* (`SceneOcrBox` / `SceneAxNode` / `SceneVlmElement`), but we describe their
|
|
14
|
+
* shapes STRUCTURALLY here rather than importing them, to keep the no-hard-dep
|
|
15
|
+
* rule. That also makes the merge engine fully unit-testable with zero
|
|
16
|
+
* environment, decoupled from the runtime/native/model wiring (Slice 3).
|
|
17
|
+
*/
|
|
18
|
+
/** Display-local bounding box `[x, y, w, h]`. */
|
|
19
|
+
export type Bbox = readonly [number, number, number, number];
|
|
20
|
+
/** Structural shape of computeruse's `SceneOcrBox`. */
|
|
21
|
+
export interface OcrBoxLike {
|
|
22
|
+
readonly id: string;
|
|
23
|
+
readonly text: string;
|
|
24
|
+
readonly bbox: Bbox;
|
|
25
|
+
readonly conf?: number;
|
|
26
|
+
readonly displayId: number;
|
|
27
|
+
}
|
|
28
|
+
/** Structural shape of computeruse's `SceneAxNode`. */
|
|
29
|
+
export interface AxNodeLike {
|
|
30
|
+
readonly id: string;
|
|
31
|
+
readonly role: string;
|
|
32
|
+
readonly label?: string;
|
|
33
|
+
readonly bbox: Bbox;
|
|
34
|
+
readonly actions?: readonly string[];
|
|
35
|
+
readonly displayId: number;
|
|
36
|
+
}
|
|
37
|
+
/** Structural shape of computeruse's `SceneVlmElement`. */
|
|
38
|
+
export interface VlmElementLike {
|
|
39
|
+
readonly id: string;
|
|
40
|
+
readonly kind: string;
|
|
41
|
+
readonly desc: string;
|
|
42
|
+
readonly bbox: Bbox;
|
|
43
|
+
readonly displayId: number;
|
|
44
|
+
}
|
|
45
|
+
export type GroundingSource = "ocr" | "ax" | "vlm";
|
|
46
|
+
/** A single unified, grounded screen element in the GET_SCREEN envelope. */
|
|
47
|
+
export interface GetScreenElement {
|
|
48
|
+
/** Stable id, preferring the AX id, then OCR, then VLM. */
|
|
49
|
+
id: string;
|
|
50
|
+
/** Display-local bbox `[x, y, w, h]` of the representative (highest-priority) source. */
|
|
51
|
+
bbox: [number, number, number, number];
|
|
52
|
+
/** User-facing text/label: AX label, else OCR text, else VLM description. */
|
|
53
|
+
text: string;
|
|
54
|
+
/** Element kind/role when known: AX role, else VLM kind. */
|
|
55
|
+
kind?: string;
|
|
56
|
+
displayId: number;
|
|
57
|
+
/** AX actions when the element is accessibility-grounded. */
|
|
58
|
+
actions?: string[];
|
|
59
|
+
/** Provenance — every source that contributed to this element, in fixed
|
|
60
|
+
* `ocr < ax < vlm` order for stability. Always non-empty. */
|
|
61
|
+
groundingSources: GroundingSource[];
|
|
62
|
+
}
|
|
63
|
+
export interface MergeScreenInput {
|
|
64
|
+
readonly ocr?: readonly OcrBoxLike[];
|
|
65
|
+
readonly ax?: readonly AxNodeLike[];
|
|
66
|
+
readonly vlm?: readonly VlmElementLike[];
|
|
67
|
+
}
|
|
68
|
+
export interface MergeScreenOptions {
|
|
69
|
+
/** Boxes whose IoU exceeds this collapse into one element (default 0.6). */
|
|
70
|
+
readonly iouThreshold?: number;
|
|
71
|
+
}
|
|
72
|
+
/** Intersection-over-union of two `[x, y, w, h]` boxes. 0 when either is empty
|
|
73
|
+
* or they don't overlap. */
|
|
74
|
+
export declare function bboxIou(a: Bbox, b: Bbox): number;
|
|
75
|
+
/**
|
|
76
|
+
* Merge OCR boxes + AX clickables + VLM elements into one deduplicated,
|
|
77
|
+
* deterministically-ordered element list.
|
|
78
|
+
*
|
|
79
|
+
* - Elements from different sources whose bboxes overlap above `iouThreshold`
|
|
80
|
+
* (and share a `displayId`) collapse into one element that records all
|
|
81
|
+
* contributing sources in `groundingSources`.
|
|
82
|
+
* - Field precedence is AX > OCR > VLM (AX wins id/label/role; OCR text fills
|
|
83
|
+
* in when AX has no label; VLM desc is the last resort).
|
|
84
|
+
* - Output order is top-to-bottom, then left-to-right, so the envelope is
|
|
85
|
+
* stable across turns regardless of input ordering.
|
|
86
|
+
* - Degrades gracefully: any source may be absent/empty (e.g. accessibility off)
|
|
87
|
+
* and the function never throws.
|
|
88
|
+
*/
|
|
89
|
+
export declare function mergeScreenElements(input: MergeScreenInput, options?: MergeScreenOptions): GetScreenElement[];
|
|
90
|
+
//# sourceMappingURL=get-screen-elements.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"get-screen-elements.d.ts","sourceRoot":"","sources":["../src/get-screen-elements.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAEH,iDAAiD;AACjD,MAAM,MAAM,IAAI,GAAG,SAAS,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;AAE7D,uDAAuD;AACvD,MAAM,WAAW,UAAU;IACzB,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,IAAI,EAAE,IAAI,CAAC;IACpB,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B;AAED,uDAAuD;AACvD,MAAM,WAAW,UAAU;IACzB,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,IAAI,EAAE,IAAI,CAAC;IACpB,QAAQ,CAAC,OAAO,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IACrC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B;AAED,2DAA2D;AAC3D,MAAM,WAAW,cAAc;IAC7B,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,IAAI,EAAE,IAAI,CAAC;IACpB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B;AAED,MAAM,MAAM,eAAe,GAAG,KAAK,GAAG,IAAI,GAAG,KAAK,CAAC;AAEnD,4EAA4E;AAC5E,MAAM,WAAW,gBAAgB;IAC/B,2DAA2D;IAC3D,EAAE,EAAE,MAAM,CAAC;IACX,yFAAyF;IACzF,IAAI,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;IACvC,6EAA6E;IAC7E,IAAI,EAAE,MAAM,CAAC;IACb,4DAA4D;IAC5D,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,MAAM,CAAC;IAClB,6DAA6D;IAC7D,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB;iEAC6D;IAC7D,gBAAgB,EAAE,eAAe,EAAE,CAAC;CACrC;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,GAAG,CAAC,EAAE,SAAS,UAAU,EAAE,CAAC;IACrC,QAAQ,CAAC,EAAE,CAAC,EAAE,SAAS,UAAU,EAAE,CAAC;IACpC,QAAQ,CAAC,GAAG,CAAC,EAAE,SAAS,cAAc,EAAE,CAAC;CAC1C;AAED,MAAM,WAAW,kBAAkB;IACjC,4EAA4E;IAC5E,QAAQ,CAAC,YAAY,CAAC,EAAE,MAAM,CAAC;CAChC;AAID;4BAC4B;AAC5B,wBAAgB,OAAO,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,IAAI,GAAG,MAAM,CAchD;AAoBD;;;;;;;;;;;;;GAaG;AACH,wBAAgB,mBAAmB,CACjC,KAAK,EAAE,gBAAgB,EACvB,OAAO,GAAE,kBAAuB,GAC/B,gBAAgB,EAAE,CAkEpB"}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GET_SCREEN core (issue #9105 / M2) — token-frugal structured screen readout.
|
|
3
|
+
*
|
|
4
|
+
* Returns OCR text + grounded elements (id/text/bbox/semantic position) from a
|
|
5
|
+
* captured frame using the registered coord-OCR service (native Windows OCR /
|
|
6
|
+
* docTR / Apple Vision — zero LLM tokens). The raw image is OMITTED by default
|
|
7
|
+
* (`includeImage:false`) so a CUA loop can read the screen each tick without
|
|
8
|
+
* spending image tokens; it is only base64-attached when explicitly requested.
|
|
9
|
+
*
|
|
10
|
+
* Pure + injectable (the OCR service can be passed in) so it is unit-testable
|
|
11
|
+
* without a real capture or a registered provider.
|
|
12
|
+
*/
|
|
13
|
+
import { type OcrWithCoordsService } from "./ocr-with-coords.js";
|
|
14
|
+
export interface GetScreenElement {
|
|
15
|
+
/**
|
|
16
|
+
* Monotonic 1-based Set-of-Marks number (reading order). A model can pick
|
|
17
|
+
* `[3]` instead of regressing raw coordinates.
|
|
18
|
+
*/
|
|
19
|
+
index: number;
|
|
20
|
+
/** Stable per-result id. */
|
|
21
|
+
id: string;
|
|
22
|
+
text: string;
|
|
23
|
+
/** Display-absolute [x, y, width, height]. */
|
|
24
|
+
bbox: [number, number, number, number];
|
|
25
|
+
/** Click target — the integer center of `bbox`, the point `index` resolves to. */
|
|
26
|
+
center: {
|
|
27
|
+
x: number;
|
|
28
|
+
y: number;
|
|
29
|
+
};
|
|
30
|
+
semantic_position: string;
|
|
31
|
+
displayId: number;
|
|
32
|
+
}
|
|
33
|
+
export interface GetScreenResult {
|
|
34
|
+
op: "get_screen";
|
|
35
|
+
displayId: number;
|
|
36
|
+
width: number;
|
|
37
|
+
height: number;
|
|
38
|
+
/** When the source frame was captured (ms epoch). */
|
|
39
|
+
lastChangeTime: number;
|
|
40
|
+
/** True when a coord-OCR provider was available and ran. */
|
|
41
|
+
ocrAvailable: boolean;
|
|
42
|
+
ocrText: string;
|
|
43
|
+
elements: GetScreenElement[];
|
|
44
|
+
elementCount: number;
|
|
45
|
+
/** Base64 PNG — only present when `includeImage` was requested. */
|
|
46
|
+
image?: string;
|
|
47
|
+
}
|
|
48
|
+
export interface BuildGetScreenOptions {
|
|
49
|
+
pngBytes: Uint8Array;
|
|
50
|
+
displayId?: number;
|
|
51
|
+
includeImage?: boolean;
|
|
52
|
+
includeOcr?: boolean;
|
|
53
|
+
capturedAt?: number;
|
|
54
|
+
/** Override for tests; defaults to the registered coord-OCR service. */
|
|
55
|
+
ocrService?: OcrWithCoordsService | null;
|
|
56
|
+
}
|
|
57
|
+
export declare function buildGetScreen(opts: BuildGetScreenOptions): Promise<GetScreenResult>;
|
|
58
|
+
/** Human-readable one-line summary for the agent reply. */
|
|
59
|
+
export declare function summarizeGetScreen(r: GetScreenResult): string;
|
|
60
|
+
//# sourceMappingURL=get-screen.d.ts.map
|