@elizaos/plugin-vision 2.0.0-beta.1 → 2.0.3-beta.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +73 -301
- package/dist/action.d.ts +3 -0
- package/dist/action.d.ts.map +1 -0
- package/dist/audio-capture-stream.d.ts +42 -0
- package/dist/audio-capture-stream.d.ts.map +1 -0
- package/dist/audio-capture.d.ts +25 -0
- package/dist/audio-capture.d.ts.map +1 -0
- package/dist/computeruse-ocr-bridge.d.ts +50 -0
- package/dist/computeruse-ocr-bridge.d.ts.map +1 -0
- package/dist/config.d.ts +68 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/describe-backpressure.d.ts +90 -0
- package/dist/describe-backpressure.d.ts.map +1 -0
- package/dist/dirty-tile-describer.d.ts +102 -0
- package/dist/dirty-tile-describer.d.ts.map +1 -0
- package/dist/dirty-tile-scene.d.ts +56 -0
- package/dist/dirty-tile-scene.d.ts.map +1 -0
- package/dist/entity-tracker.d.ts +33 -0
- package/dist/entity-tracker.d.ts.map +1 -0
- package/dist/face-detector-ggml.d.ts +60 -0
- package/dist/face-detector-ggml.d.ts.map +1 -0
- package/dist/face-detector-mediapipe.d.ts +25 -0
- package/dist/face-detector-mediapipe.d.ts.map +1 -0
- package/dist/face-recognition-ggml.d.ts +94 -0
- package/dist/face-recognition-ggml.d.ts.map +1 -0
- package/dist/get-screen-elements.d.ts +90 -0
- package/dist/get-screen-elements.d.ts.map +1 -0
- package/dist/get-screen.d.ts +60 -0
- package/dist/get-screen.d.ts.map +1 -0
- package/dist/image/sharp-compat.d.ts +89 -0
- package/dist/image/sharp-compat.d.ts.map +1 -0
- package/dist/image-input.d.ts +15 -0
- package/dist/image-input.d.ts.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +7957 -6238
- package/dist/index.js.map +41 -26
- package/dist/lifecycle.d.ts +94 -0
- package/dist/lifecycle.d.ts.map +1 -0
- package/dist/mobile/capacitor-camera.d.ts +85 -0
- package/dist/mobile/capacitor-camera.d.ts.map +1 -0
- package/dist/native/doctr-ffi.d.ts +40 -0
- package/dist/native/doctr-ffi.d.ts.map +1 -0
- package/dist/native/yolo-ffi.d.ts +21 -0
- package/dist/native/yolo-ffi.d.ts.map +1 -0
- package/dist/ocr-host-windows.d.ts +34 -0
- package/dist/ocr-host-windows.d.ts.map +1 -0
- package/dist/ocr-service-apple-vision-macos.d.ts +51 -0
- package/dist/ocr-service-apple-vision-macos.d.ts.map +1 -0
- package/dist/ocr-service-doctr.d.ts +61 -0
- package/dist/ocr-service-doctr.d.ts.map +1 -0
- package/dist/ocr-service-linux-tesseract.d.ts +85 -0
- package/dist/ocr-service-linux-tesseract.d.ts.map +1 -0
- package/dist/ocr-service-paddleocr.d.ts +59 -0
- package/dist/ocr-service-paddleocr.d.ts.map +1 -0
- package/dist/ocr-service-windows.d.ts +41 -0
- package/dist/ocr-service-windows.d.ts.map +1 -0
- package/dist/ocr-service.d.ts +91 -0
- package/dist/ocr-service.d.ts.map +1 -0
- package/dist/ocr-with-coords.d.ts +103 -0
- package/dist/ocr-with-coords.d.ts.map +1 -0
- package/dist/person-detector.d.ts +17 -0
- package/dist/person-detector.d.ts.map +1 -0
- package/dist/provider.d.ts +3 -0
- package/dist/provider.d.ts.map +1 -0
- package/dist/routes.d.ts +7 -0
- package/dist/routes.d.ts.map +1 -0
- package/dist/screen-capture-bridge.d.ts +51 -0
- package/dist/screen-capture-bridge.d.ts.map +1 -0
- package/dist/screen-capture.d.ts +17 -0
- package/dist/screen-capture.d.ts.map +1 -0
- package/dist/screen-tiler.d.ts +75 -0
- package/dist/screen-tiler.d.ts.map +1 -0
- package/dist/service.d.ts +176 -0
- package/dist/service.d.ts.map +1 -0
- package/dist/set-of-marks-provider.d.ts +64 -0
- package/dist/set-of-marks-provider.d.ts.map +1 -0
- package/dist/som.d.ts +135 -0
- package/dist/som.d.ts.map +1 -0
- package/dist/som.js +184 -0
- package/dist/som.js.map +11 -0
- package/dist/test-input.d.ts +25 -0
- package/dist/test-input.d.ts.map +1 -0
- package/dist/types.d.ts +241 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/vision-context-augmenter.d.ts +93 -0
- package/dist/vision-context-augmenter.d.ts.map +1 -0
- package/dist/vision-worker-manager.d.ts +51 -0
- package/dist/vision-worker-manager.d.ts.map +1 -0
- package/dist/workers/ocr-worker.d.ts +2 -0
- package/dist/workers/ocr-worker.d.ts.map +1 -0
- package/dist/workers/ocr-worker.js +1075 -7821
- package/dist/workers/ocr-worker.js.map +10 -51
- package/dist/workers/screen-capture-worker.d.ts +2 -0
- package/dist/workers/screen-capture-worker.d.ts.map +1 -0
- package/dist/workers/screen-capture-worker.js +364 -6
- package/dist/workers/screen-capture-worker.js.map +5 -4
- package/dist/workers/worker-logger.d.ts +10 -0
- package/dist/workers/worker-logger.d.ts.map +1 -0
- package/dist/yolo-detector.d.ts +37 -0
- package/dist/yolo-detector.d.ts.map +1 -0
- package/native/doctr.cpp/CMakeLists.txt +58 -0
- package/native/doctr.cpp/README.md +62 -0
- package/native/doctr.cpp/include/doctr.h +91 -0
- package/native/doctr.cpp/scripts/convert.py +98 -0
- package/native/doctr.cpp/src/doctr_det.cpp +112 -0
- package/native/doctr.cpp/src/doctr_rec.cpp +103 -0
- package/native/macos-vision-ocr.swift +113 -0
- package/native/mobilefacenet.cpp/README.md +13 -0
- package/native/movenet.cpp/README.md +10 -0
- package/native/retinaface.cpp/README.md +12 -0
- package/native/yolo.cpp/CMakeLists.txt +57 -0
- package/native/yolo.cpp/README.md +64 -0
- package/native/yolo.cpp/build.mjs +76 -0
- package/native/yolo.cpp/include/yolo.h +62 -0
- package/native/yolo.cpp/scripts/convert.py +248 -0
- package/native/yolo.cpp/src/yolo.cpp +425 -0
- package/native/yolo.cpp/verify/compare.py +99 -0
- package/native/yolo.cpp/verify/make_ref.py +75 -0
- package/native/yolo.cpp/verify/run_ggml.mjs +78 -0
- package/native/yolo.cpp/verify/run_ts.mjs +26 -0
- package/package.json +39 -21
- package/registry-entry.json +43 -0
- package/scripts/vendor-tesseract-linux.mjs +177 -0
- package/build.config.ts +0 -89
- package/dist/workers/florence2-worker.js +0 -779
- package/dist/workers/florence2-worker.js.map +0 -13
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"set-of-marks-provider.d.ts","sourceRoot":"","sources":["../src/set-of-marks-provider.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,uBAAuB,EAAE,MAAM,sBAAsB,CAAC;AAC/D,OAAO,EAEL,KAAK,kBAAkB,EAGvB,KAAK,iBAAiB,EACtB,KAAK,OAAO,EACb,MAAM,UAAU,CAAC;AAElB,2DAA2D;AAC3D,MAAM,WAAW,mBAAmB;IAClC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,QAAQ,EAAE,UAAU,CAAC;IAC9B,QAAQ,CAAC,aAAa,CAAC,EAAE,OAAO,CAAC;CAClC;AAED,4DAA4D;AAC5D,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,KAAK,EAAE,aAAa,CAAC,OAAO,CAAC,CAAC;IACvC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC;CACpC;AAED,8DAA8D;AAC9D,MAAM,WAAW,sBAAsB;IACrC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,KAAK,EAAE,mBAAmB,GAAG,OAAO,CAAC,oBAAoB,CAAC,CAAC;CACrE;AAED,MAAM,MAAM,0BAA0B,GAAG,CACvC,QAAQ,EAAE,sBAAsB,GAAG,IAAI,KACpC,IAAI,CAAC;AAEV,eAAO,MAAM,+BAA+B,+BAA+B,CAAC;AAE5E,MAAM,WAAW,sBAAsB;IACrC;;;OAGG;IACH,QAAQ,CAAC,WAAW,CAAC,EAAE,CACrB,QAAQ,EAAE,UAAU,KACjB,OAAO,CAAC,kBAAkB,EAAE,CAAC,CAAC;IACnC,4EAA4E;IAC5E,QAAQ,CAAC,UAAU,CAAC,EAAE,OAAO,uBAAuB,CAAC;IACrD,oDAAoD;IACpD,QAAQ,CAAC,OAAO,CAAC,EAAE,iBAAiB,CAAC;CACtC;AAED;;;;GAIG;AACH,wBAAgB,yBAAyB,IAAI,CAC3C,QAAQ,EAAE,UAAU,KACjB,OAAO,CAAC,kBAAkB,EAAE,CAAC,CA0BjC;AAED;;;GAGG;AACH,wBAAgB,6BAA6B,CAC3C,IAAI,GAAE,sBAA2B,GAChC,sBAAsB,CAkCxB;AAED;;;GAGG;AACH,wBAAgB,+BAA+B,CAC7C,QAAQ,EAAE,0BAA0B,EACpC,IAAI,CAAC,EAAE,sBAAsB,GAC5B,OAAO,CAGT"}
|
package/dist/som.d.ts
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
/**
 * Set-of-Marks (SoM) grounding — #9170 M9.
 *
 * Set-of-Marks is the grounding technique trycua/cua uses via OmniParser: take
 * the icon detections (GGUF YOLO) and the OCR text boxes (the `CoordOcrProvider`
 * seam plugin-vision already feeds), fuse them into ONE deduplicated set of
 * candidate targets, draw a 1-indexed numbered box over each on the screenshot,
 * and let the VLM pick a *number* instead of raw pixel coordinates. Numeric
 * selection is far more reliable than free-floating coordinate regression.
 *
 * This module is split into:
 * - a PURE core (`buildSetOfMarks`) — icon-over-text suppression + NMS +
 * deterministic reading-order numbering. Dependency-free and structurally
 * typed so it unit-tests with zero environment, mirroring
 * `get-screen-elements.ts` and `computeruse-ocr-bridge.ts`.
 * - a renderer (`renderSetOfMarksOverlay`) — composites a numbered SVG overlay
 * onto the source PNG via `sharp`.
 *
 * The OmniParser fusion rules we reproduce:
 * 1. Icon-over-text suppression — a text box mostly covered by an icon box is
 * dropped; the icon is the interactable, the text is its caption.
 * 2. Non-max suppression — overlapping boxes collapse to the highest-priority
 * one (icons outrank text on ties), so each target is marked once.
 * 3. 1-indexed reading-order numbering — top-to-bottom, then left-to-right,
 * with a row tolerance so a visual row isn't scrambled by sub-pixel y jitter.
 */
|
|
27
|
+
/** Display-local bounding box `[x, y, w, h]`. */
export type SomBbox = readonly [number, number, number, number];
|
|
29
|
+
/** Where a candidate mark came from. Icons outrank text during suppression. */
export type SomSource = "icon" | "text";
|
|
31
|
+
/** A raw candidate box fed into the SoM fusion. */
export interface SomCandidate {
    /** Candidate box as `[x, y, w, h]`. */
    readonly bbox: SomBbox;
    /** Which detector family produced the box (`"icon"` or `"text"`). */
    readonly source: SomSource;
    /** Class name (icon) or recognized text (text). Optional. */
    readonly label?: string;
    /** Detector/OCR confidence in [0, 1]. Missing → treated as 0.5. */
    readonly score?: number;
}
|
|
40
|
+
/** A finalized, numbered mark in the overlay. */
export interface SomMark {
    /** 1-indexed mark number shown in the overlay. */
    readonly index: number;
    /** Mark box as `[x, y, w, h]`. */
    readonly bbox: [number, number, number, number];
    /** Box center `[x, y]` — the click target the VLM's number resolves to. */
    readonly center: [number, number];
    /** Which detector family the surviving candidate came from. */
    readonly source: SomSource;
    /** Label carried over from the candidate, when it had one. */
    readonly label?: string;
    /** Candidate confidence after the missing-score default was applied. */
    readonly score: number;
}
|
|
51
|
+
/** Tuning knobs for the Set-of-Marks fusion; every field is optional. */
export interface SetOfMarksOptions {
    /**
     * A text box is dropped when this fraction of its area is covered by an icon
     * box (icon-over-text suppression). Default 0.7.
     */
    readonly iconOverTextCoverage?: number;
    /** Boxes overlapping above this IoU collapse during NMS. Default 0.5. */
    readonly nmsIouThreshold?: number;
    /**
     * Rows within this many pixels of vertical offset are treated as the same
     * reading row (so left-to-right ordering holds across a row). Default 12.
     */
    readonly rowTolerance?: number;
    /** Drop candidates with a smaller score before fusion. Default 0 (keep all). */
    readonly minScore?: number;
}
|
|
67
|
+
/** Fraction of `inner`'s area covered by `outer` (containment ratio). Pure. */
export declare function coverageRatio(inner: SomBbox, outer: SomBbox): number;
|
|
69
|
+
/**
 * Fuse icon + text candidates into a deduplicated, 1-indexed set of marks.
 *
 * Pure and deterministic: same inputs → identical numbering, regardless of
 * input ordering. Degenerate boxes (non-finite / zero-area) are dropped.
 */
export declare function buildSetOfMarks(candidates: readonly SomCandidate[], options?: SetOfMarksOptions): SomMark[];
|
|
76
|
+
/** Fusion options plus the visual styling of the rendered overlay. */
export interface SomOverlayOptions extends SetOfMarksOptions {
    /** Stroke / badge color for icon marks. Default "#FF8C00" (orange). */
    readonly iconColor?: string;
    /** Stroke / badge color for text marks. Default "#1FA9FF". */
    readonly textColor?: string;
    /** Box stroke width in px. Default 2. */
    readonly strokeWidth?: number;
    /** Badge font size in px. Default 13. */
    readonly fontSize?: number;
}
|
|
86
|
+
/**
 * Build the SVG overlay markup for a set of marks over a `width × height`
 * canvas. Pure (no I/O) — separated from the raster composite so it is
 * unit-testable and reusable by non-sharp consumers (e.g. a browser overlay).
 */
export declare function buildSetOfMarksSvg(marks: readonly SomMark[], width: number, height: number, options?: SomOverlayOptions): string;
|
|
92
|
+
/** Structural shape of a `DetectedObject` (YOLO) — `{x,y,width,height}` box. */
export interface DetectedObjectLike {
    /** Detection box in `{x, y, width, height}` form. */
    readonly boundingBox: {
        readonly x: number;
        readonly y: number;
        readonly width: number;
        readonly height: number;
    };
    /** Detected class name; becomes the candidate label when present. */
    readonly type?: string;
    /** Detector confidence; becomes the candidate score when present. */
    readonly confidence?: number;
}
|
|
103
|
+
/** Structural shape of an OCR block (`OcrWithCoordsBlock`). */
export interface OcrBlockLike {
    /** Recognized text; becomes the candidate label when present. */
    readonly text?: string;
    /** OCR confidence; becomes the candidate score when present. */
    readonly confidence?: number;
    /** Text box in `{x, y, width, height}` form. */
    readonly bbox: {
        readonly x: number;
        readonly y: number;
        readonly width: number;
        readonly height: number;
    };
}
|
|
114
|
+
/** Adapt GGUF YOLO detections into icon-source SoM candidates. */
export declare function somCandidatesFromDetections(objects: readonly DetectedObjectLike[]): SomCandidate[];
|
|
116
|
+
/** Adapt `CoordOcrProvider` text blocks into text-source SoM candidates. */
export declare function somCandidatesFromOcr(blocks: readonly OcrBlockLike[]): SomCandidate[];
|
|
118
|
+
/**
 * Convenience: fuse YOLO detections + OCR blocks straight into a numbered mark
 * set. This is the seam that `detect_elements`/grounding calls — pass whatever
 * icon and text boxes the scene already has.
 */
export declare function buildSceneSetOfMarks(args: {
    readonly detections?: readonly DetectedObjectLike[];
    readonly ocrBlocks?: readonly OcrBlockLike[];
}, options?: SetOfMarksOptions): SomMark[];
|
|
127
|
+
/**
 * Composite a numbered Set-of-Marks overlay onto a source PNG.
 *
 * Returns PNG bytes the same size as the input. `sharp` is loaded lazily so the
 * pure core (`buildSetOfMarks`) carries no native dependency for consumers that
 * only need coordinates.
 */
export declare function renderSetOfMarksOverlay(pngBytes: Uint8Array, marks: readonly SomMark[], options?: SomOverlayOptions): Promise<Buffer>;
|
|
135
|
+
//# sourceMappingURL=som.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"som.d.ts","sourceRoot":"","sources":["../src/som.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAIH,iDAAiD;AACjD,MAAM,MAAM,OAAO,GAAG,SAAS,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;AAEhE,+EAA+E;AAC/E,MAAM,MAAM,SAAS,GAAG,MAAM,GAAG,MAAM,CAAC;AAExC,mDAAmD;AACnD,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC;IACvB,QAAQ,CAAC,MAAM,EAAE,SAAS,CAAC;IAC3B,6DAA6D;IAC7D,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IACxB,mEAAmE;IACnE,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,iDAAiD;AACjD,MAAM,WAAW,OAAO;IACtB,kDAAkD;IAClD,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,IAAI,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;IAChD,2EAA2E;IAC3E,QAAQ,CAAC,MAAM,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAClC,QAAQ,CAAC,MAAM,EAAE,SAAS,CAAC;IAC3B,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;CACxB;AAED,MAAM,WAAW,iBAAiB;IAChC;;;OAGG;IACH,QAAQ,CAAC,oBAAoB,CAAC,EAAE,MAAM,CAAC;IACvC,yEAAyE;IACzE,QAAQ,CAAC,eAAe,CAAC,EAAE,MAAM,CAAC;IAClC;;;OAGG;IACH,QAAQ,CAAC,YAAY,CAAC,EAAE,MAAM,CAAC;IAC/B,gFAAgF;IAChF,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;CAC5B;AAuBD,+EAA+E;AAC/E,wBAAgB,aAAa,CAAC,KAAK,EAAE,OAAO,EAAE,KAAK,EAAE,OAAO,GAAG,MAAM,CAIpE;AAaD;;;;;GAKG;AACH,wBAAgB,eAAe,CAC7B,UAAU,EAAE,SAAS,YAAY,EAAE,EACnC,OAAO,GAAE,iBAAsB,GAC9B,OAAO,EAAE,CAiEX;AAYD,MAAM,WAAW,iBAAkB,SAAQ,iBAAiB;IAC1D,uEAAuE;IACvE,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC;IAC5B,8DAA8D;IAC9D,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC;IAC5B,yCAAyC;IACzC,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM,CAAC;IAC9B,yCAAyC;IACzC,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED;;;;GAIG;AACH,wBAAgB,kBAAkB,CAChC,KAAK,EAAE,SAAS,OAAO,EAAE,EACzB,KAAK,EAAE,MAAM,EACb,MAAM,EAAE,MAAM,EACd,OAAO,GAAE,iBAAsB,GAC9B,MAAM,CA+BR;AASD,gFAAgF;AAChF,MAAM,WAAW,kBAAkB;IACjC,QAAQ,CAAC,WAAW,EAAE;QACpB,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;QACnB,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;QACnB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;QACvB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;KACzB,CAAC;IACF,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,UAAU,CAAC,EAAE,MAAM,CAAC;CAC9B;AAED,+DAA
+D;AAC/D,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,UAAU,CAAC,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,IAAI,EAAE;QACb,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;QACnB,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;QACnB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;QACvB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;KACzB,CAAC;CACH;AAED,kEAAkE;AAClE,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,SAAS,kBAAkB,EAAE,GACrC,YAAY,EAAE,CAehB;AAED,4EAA4E;AAC5E,wBAAgB,oBAAoB,CAClC,MAAM,EAAE,SAAS,YAAY,EAAE,GAC9B,YAAY,EAAE,CAUhB;AAED;;;;GAIG;AACH,wBAAgB,oBAAoB,CAClC,IAAI,EAAE;IACJ,QAAQ,CAAC,UAAU,CAAC,EAAE,SAAS,kBAAkB,EAAE,CAAC;IACpD,QAAQ,CAAC,SAAS,CAAC,EAAE,SAAS,YAAY,EAAE,CAAC;CAC9C,EACD,OAAO,CAAC,EAAE,iBAAiB,GAC1B,OAAO,EAAE,CAQX;AAED;;;;;;GAMG;AACH,wBAAsB,uBAAuB,CAC3C,QAAQ,EAAE,UAAU,EACpB,KAAK,EAAE,SAAS,OAAO,EAAE,EACzB,OAAO,GAAE,iBAAsB,GAC9B,OAAO,CAAC,MAAM,CAAC,CAgBjB"}
|
package/dist/som.js
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
import { createRequire } from "node:module";
|
|
2
|
+
// Bundler-emitted runtime helpers; they are not part of the module's own logic.

/** Local alias for `Object.defineProperty`. */
var __defProp = Object.defineProperty;
/** Identity function; bound to a value it yields a constant getter. */
var __returnValue = (v) => v;
/**
 * Setter installed by `__export`. `this` is the getter table: the entry stored
 * under `name` is replaced by a function that returns `newValue`.
 */
function __exportSetter(name, newValue) {
  this[name] = __returnValue.bind(null, newValue);
}
/**
 * Define every key of `all` on `target` as an enumerable, configurable accessor
 * whose getter is the function currently stored in `all` under that key.
 */
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, {
      get: all[name],
      enumerable: true,
      configurable: true,
      set: __exportSetter.bind(all, name)
    });
};
/**
 * Wrap an initializer so it runs at most once: the first call invokes `fn`
 * (clearing the reference in the same expression) and caches the result; later
 * calls return the cached value.
 */
var __esm = (fn, res) => () => (fn && (res = fn(fn = 0)), res);
|
|
17
|
+
// Bundler-emitted: a `require` function created from this module's own URL.
var __require = /* @__PURE__ */ createRequire(import.meta.url);
|
|
18
|
+
|
|
19
|
+
// src/get-screen-elements.ts
|
|
20
|
+
/**
 * Intersection-over-union of two `[x, y, w, h]` boxes.
 *
 * @param a First box.
 * @param b Second box.
 * @returns The IoU ratio, or 0 when either box has a non-positive side or the
 *   boxes do not overlap.
 */
function bboxIou(a, b) {
  const [ax, ay, aw, ah] = a;
  const [bx, by, bw, bh] = b;
  // A box with a non-positive side has no area to share.
  if (aw <= 0 || ah <= 0 || bw <= 0 || bh <= 0) return 0;

  const left = Math.max(ax, bx);
  const top = Math.max(ay, by);
  const right = Math.min(ax + aw, bx + bw);
  const bottom = Math.min(ay + ah, by + bh);
  const inter = Math.max(0, right - left) * Math.max(0, bottom - top);
  if (inter <= 0) return 0;

  const union = aw * ah + bw * bh - inter;
  return union > 0 ? inter / union : 0;
}
|
|
37
|
+
|
|
38
|
+
// src/som.ts
|
|
39
|
+
/** Fallback for `iconOverTextCoverage`: coverage at which an icon suppresses a text box. */
var DEFAULT_ICON_OVER_TEXT_COVERAGE = 0.7;
/** Fallback for `nmsIouThreshold`: IoU at which a box is suppressed by a kept one. */
var DEFAULT_NMS_IOU = 0.5;
/** Fallback for `rowTolerance`: vertical offset (px) still counted as the same row. */
var DEFAULT_ROW_TOLERANCE = 12;
/** Score assigned to a candidate that does not supply one. */
var DEFAULT_SCORE = 0.5;
|
|
43
|
+
/**
 * Area of the overlap between two `[x, y, w, h]` boxes.
 *
 * @param a First box.
 * @param b Second box.
 * @returns The overlap area; 0 when the boxes only touch or are disjoint.
 */
function intersectionArea(a, b) {
  const left = Math.max(a[0], b[0]);
  const top = Math.max(a[1], b[1]);
  const right = Math.min(a[0] + a[2], b[0] + b[2]);
  const bottom = Math.min(a[1] + a[3], b[1] + b[3]);
  // Clamp each extent so disjoint boxes yield 0, not a negative product.
  return Math.max(0, right - left) * Math.max(0, bottom - top);
}
|
|
50
|
+
/**
 * Fraction of `inner`'s area that lies inside `outer` (containment ratio).
 *
 * @param inner Box whose coverage is measured, as `[x, y, w, h]`.
 * @param outer Covering box, as `[x, y, w, h]`.
 * @returns Overlap area divided by `inner`'s area; 0 when `inner` has no
 *   positive area.
 */
function coverageRatio(inner, outer) {
  const area = inner[2] * inner[3];
  // Avoid dividing by a zero or negative area.
  if (area <= 0) return 0;
  return intersectionArea(inner, outer) / area;
}
|
|
56
|
+
/**
 * Whether `b` is a usable `[x, y, w, h]` box: all four components finite and
 * both width and height strictly positive.
 *
 * @param b Candidate box.
 * @returns `true` when the box can take part in fusion.
 */
function isValidBox(b) {
  const allFinite =
    Number.isFinite(b[0]) &&
    Number.isFinite(b[1]) &&
    Number.isFinite(b[2]) &&
    Number.isFinite(b[3]);
  return allFinite && b[2] > 0 && b[3] > 0;
}
|
|
59
|
+
/**
 * Fuse icon and text candidates into a deduplicated, 1-indexed list of marks.
 *
 * Steps:
 *  1. Drop invalid boxes, default missing scores, drop scores below `minScore`.
 *  2. Icon-over-text suppression: drop a text box whose coverage by any icon
 *     reaches `iconOverTextCoverage`.
 *  3. Greedy non-max suppression in priority order (icons first, then higher
 *     score, then position); a box is dropped when its IoU with an already
 *     kept box reaches `nmsIouThreshold`.
 *  4. Number the survivors in reading order: rows top-to-bottom, each row
 *     left-to-right.
 *
 * @param candidates Raw candidate boxes.
 * @param options Optional thresholds; missing fields fall back to the
 *   module-level `DEFAULT_*` constants (and 0 for `minScore`).
 * @returns Marks with `index` starting at 1, a rounded `center`, and `label`
 *   only when the candidate supplied one.
 */
function buildSetOfMarks(candidates, options = {}) {
  const iconOverText = options.iconOverTextCoverage ?? DEFAULT_ICON_OVER_TEXT_COVERAGE;
  const nmsIou = options.nmsIouThreshold ?? DEFAULT_NMS_IOU;
  const rowTolerance = options.rowTolerance ?? DEFAULT_ROW_TOLERANCE;
  const minScore = options.minScore ?? 0;

  // Copy each box so the returned marks never alias caller-owned tuples.
  const norm = candidates
    .filter((c) => isValidBox(c.bbox))
    .map((c) => ({
      bbox: [c.bbox[0], c.bbox[1], c.bbox[2], c.bbox[3]],
      source: c.source,
      label: c.label,
      score: c.score ?? DEFAULT_SCORE
    }))
    .filter((c) => c.score >= minScore);

  const icons = norm.filter((c) => c.source === "icon");
  const texts = norm.filter((c) => c.source === "text");
  const keptTexts = texts.filter(
    (t) => !icons.some((icon) => coverageRatio(t.bbox, icon.bbox) >= iconOverText)
  );

  // NMS priority. The trailing width/height tie-break keeps the winner between
  // two boxes sharing source, score and origin independent of input order.
  const pool = [...icons, ...keptTexts].sort((a, b) => {
    if (a.source !== b.source) return a.source === "icon" ? -1 : 1;
    if (b.score !== a.score) return b.score - a.score;
    return (
      a.bbox[1] - b.bbox[1] ||
      a.bbox[0] - b.bbox[0] ||
      a.bbox[2] - b.bbox[2] ||
      a.bbox[3] - b.bbox[3]
    );
  });

  const kept = [];
  for (const cand of pool) {
    const overlaps = kept.some((k) => bboxIou(cand.bbox, k.bbox) >= nmsIou);
    if (!overlaps) kept.push(cand);
  }

  // Reading order. A pairwise "same row if |dy| <= tolerance" comparator is not
  // transitive (y = 0, 10, 20 with tolerance 12), and sort() gives an
  // implementation-defined order for such comparators. Instead: sort by top
  // edge, sweep into rows anchored on each row's topmost box, then order every
  // row left-to-right.
  const byTop = [...kept].sort(
    (a, b) => a.bbox[1] - b.bbox[1] || a.bbox[0] - b.bbox[0]
  );
  const ordered = [];
  let row = [];
  let rowTop = 0;
  const flushRow = () => {
    row.sort((a, b) => a.bbox[0] - b.bbox[0] || a.bbox[1] - b.bbox[1]);
    ordered.push(...row);
    row = [];
  };
  for (const cand of byTop) {
    // Negated `<=` so a NaN tolerance never merges rows.
    if (row.length > 0 && !(cand.bbox[1] - rowTop <= rowTolerance)) flushRow();
    if (row.length === 0) rowTop = cand.bbox[1];
    row.push(cand);
  }
  flushRow();

  return ordered.map((c, i) => {
    const [x, y, w, h] = c.bbox;
    const mark = {
      index: i + 1,
      bbox: [x, y, w, h],
      center: [Math.round(x + w / 2), Math.round(y + h / 2)],
      source: c.source,
      score: c.score
    };
    // Only attach `label` when present so the key is absent, not undefined.
    if (c.label !== undefined) mark.label = c.label;
    return mark;
  });
}
|
|
106
|
+
/**
 * Escape the five XML special characters so `s` is safe inside SVG text content
 * or a quoted attribute value.
 *
 * `&` is replaced first so the ampersands introduced by the other replacements
 * are not escaped a second time.
 *
 * @param s Raw text.
 * @returns `s` with `&`, `<`, `>`, `"` and `'` replaced by their entities.
 */
function escapeXml(s) {
  return s
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}
|
|
109
|
+
/**
 * Build the SVG overlay markup for `marks` on a `width` × `height` canvas.
 *
 * Each mark contributes an outlined rectangle for its box, a filled badge
 * rectangle, and the mark number centered in the badge. No I/O is performed.
 *
 * @param marks Marks to draw.
 * @param width Canvas width, written to the `<svg>` element.
 * @param height Canvas height, written to the `<svg>` element.
 * @param options Optional colors, stroke width and font size.
 * @returns A complete `<svg>` document as a string.
 */
function buildSetOfMarksSvg(marks, width, height, options = {}) {
  const iconColor = options.iconColor ?? "#FF8C00";
  const textColor = options.textColor ?? "#1FA9FF";
  const strokeWidth = options.strokeWidth ?? 2;
  const fontSize = options.fontSize ?? 13;
  // Badge dimensions scale with the font size.
  const badgeW = Math.round(fontSize * 1.4);
  const badgeH = Math.round(fontSize * 1.3);

  const parts = [];
  for (const mark of marks) {
    const [x, y, w, h] = mark.bbox;
    const color = mark.source === "icon" ? iconColor : textColor;
    const label = String(mark.index);
    // Badge sits at the box's top-left corner, clamped into the canvas.
    const bx = Math.max(0, Math.min(x, width - badgeW));
    const by = Math.max(0, Math.min(y, height - badgeH));
    parts.push(
      `<rect x="${x}" y="${y}" width="${w}" height="${h}" fill="none" ` +
        `stroke="${color}" stroke-width="${strokeWidth}" />`,
      `<rect x="${bx}" y="${by}" width="${badgeW}" height="${badgeH}" ` +
        `fill="${color}" />`,
      `<text x="${bx + badgeW / 2}" y="${by + badgeH / 2}" ` +
        `font-family="monospace" font-size="${fontSize}" font-weight="bold" ` +
        `fill="#000000" text-anchor="middle" dominant-baseline="central">` +
        `${escapeXml(label)}</text>`
    );
  }
  return (
    `<svg xmlns="http://www.w3.org/2000/svg" width="${width}" ` +
    `height="${height}">${parts.join("")}</svg>`
  );
}
|
|
127
|
+
/**
 * Adapt object detections into icon-source SoM candidates.
 *
 * @param objects Detections carrying a `{x, y, width, height}` `boundingBox`
 *   and optional `type` / `confidence`.
 * @returns One candidate per detection: the box as `[x, y, w, h]`, source
 *   `"icon"`, score from `confidence` (or `DEFAULT_SCORE`), and `label` from
 *   `type` when it is defined.
 */
function somCandidatesFromDetections(objects) {
  return objects.map((o) => {
    const c = {
      bbox: [
        o.boundingBox.x,
        o.boundingBox.y,
        o.boundingBox.width,
        o.boundingBox.height
      ],
      source: "icon",
      score: o.confidence ?? DEFAULT_SCORE
    };
    // Leave `label` off entirely when the detection has no class name.
    if (o.type !== undefined) c.label = o.type;
    return c;
  });
}
|
|
144
|
+
/**
 * Adapt OCR text blocks into text-source SoM candidates.
 *
 * @param blocks OCR blocks carrying a `{x, y, width, height}` `bbox` and
 *   optional `text` / `confidence`.
 * @returns One candidate per block: the box as `[x, y, w, h]`, source
 *   `"text"`, score from `confidence` (or `DEFAULT_SCORE`), and `label` from
 *   `text` when it is defined.
 */
function somCandidatesFromOcr(blocks) {
  return blocks.map((b) => {
    const c = {
      bbox: [b.bbox.x, b.bbox.y, b.bbox.width, b.bbox.height],
      source: "text",
      score: b.confidence ?? DEFAULT_SCORE
    };
    // Leave `label` off entirely when the block has no recognized text.
    if (b.text !== undefined) c.label = b.text;
    return c;
  });
}
|
|
156
|
+
/**
 * Convenience wrapper: convert detections and OCR blocks to candidates and
 * fuse them into a numbered mark set in one call.
 *
 * @param args `detections` and/or `ocrBlocks`; a missing list is treated as
 *   empty.
 * @param options Fusion options forwarded unchanged to `buildSetOfMarks`.
 * @returns The marks produced by `buildSetOfMarks`.
 */
function buildSceneSetOfMarks(args, options) {
  return buildSetOfMarks(
    [
      ...somCandidatesFromDetections(args.detections ?? []),
      ...somCandidatesFromOcr(args.ocrBlocks ?? [])
    ],
    options
  );
}
|
|
162
|
+
/**
 * Composite a numbered Set-of-Marks overlay onto a source image.
 *
 * `sharp` is imported inside the function, so merely loading this module does
 * not load it.
 *
 * @param pngBytes Encoded source image bytes.
 * @param marks Marks to draw.
 * @param options Overlay styling forwarded to `buildSetOfMarksSvg`.
 * @returns PNG-encoded bytes of the source image with the overlay composited
 *   at the top-left corner.
 * @throws Error when the image metadata reports no positive width or height.
 */
async function renderSetOfMarksOverlay(pngBytes, marks, options = {}) {
  const { default: sharp } = await import("sharp");
  const base = sharp(Buffer.from(pngBytes));
  const meta = await base.metadata();
  const width = meta.width ?? 0;
  const height = meta.height ?? 0;
  if (width <= 0 || height <= 0) {
    throw new Error(`[vision/som] cannot overlay marks: source image has no dimensions`);
  }
  // The SVG is sized to the image, so compositing at (0, 0) aligns the boxes.
  const svg = buildSetOfMarksSvg(marks, width, height, options);
  return base
    .composite([{ input: Buffer.from(svg), top: 0, left: 0 }])
    .png()
    .toBuffer();
}
|
|
174
|
+
export {
|
|
175
|
+
somCandidatesFromOcr,
|
|
176
|
+
somCandidatesFromDetections,
|
|
177
|
+
renderSetOfMarksOverlay,
|
|
178
|
+
coverageRatio,
|
|
179
|
+
buildSetOfMarksSvg,
|
|
180
|
+
buildSetOfMarks,
|
|
181
|
+
buildSceneSetOfMarks
|
|
182
|
+
};
|
|
183
|
+
|
|
184
|
+
//# debugId=55C971527120024164756E2164756E21
|
package/dist/som.js.map
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 3,
|
|
3
|
+
"sources": ["../src/get-screen-elements.ts", "../src/som.ts"],
|
|
4
|
+
"sourcesContent": [
|
|
5
|
+
"/**\n * Pure element-merge core for the GET_SCREEN action (#9105 Slice 2 / M2).\n *\n * GET_SCREEN returns a cheap, token-frugal list of grounded, clickable screen\n * elements unified from three sources: OCR text boxes, accessibility (AX)\n * clickables, and (optionally) VLM-detected elements. This module is the\n * deterministic heart of that envelope — it collapses the three sources into a\n * single deduplicated, stably-ordered element list, recording each element's\n * `groundingSources` provenance.\n *\n * Like the M1 OCR bridge (`computeruse-ocr-bridge.ts`), this is intentionally\n * dependency-free and pure: the source types live in `@elizaos/plugin-computeruse`\n * (`SceneOcrBox` / `SceneAxNode` / `SceneVlmElement`), but we describe their\n * shapes STRUCTURALLY here rather than importing them, to keep the no-hard-dep\n * rule. That also makes the merge engine fully unit-testable with zero\n * environment, decoupled from the runtime/native/model wiring (Slice 3).\n */\n\n/** Display-local bounding box `[x, y, w, h]`. */\nexport type Bbox = readonly [number, number, number, number];\n\n/** Structural shape of computeruse's `SceneOcrBox`. */\nexport interface OcrBoxLike {\n readonly id: string;\n readonly text: string;\n readonly bbox: Bbox;\n readonly conf?: number;\n readonly displayId: number;\n}\n\n/** Structural shape of computeruse's `SceneAxNode`. */\nexport interface AxNodeLike {\n readonly id: string;\n readonly role: string;\n readonly label?: string;\n readonly bbox: Bbox;\n readonly actions?: readonly string[];\n readonly displayId: number;\n}\n\n/** Structural shape of computeruse's `SceneVlmElement`. */\nexport interface VlmElementLike {\n readonly id: string;\n readonly kind: string;\n readonly desc: string;\n readonly bbox: Bbox;\n readonly displayId: number;\n}\n\nexport type GroundingSource = \"ocr\" | \"ax\" | \"vlm\";\n\n/** A single unified, grounded screen element in the GET_SCREEN envelope. 
*/\nexport interface GetScreenElement {\n /** Stable id, preferring the AX id, then OCR, then VLM. */\n id: string;\n /** Display-local bbox `[x, y, w, h]` of the representative (highest-priority) source. */\n bbox: [number, number, number, number];\n /** User-facing text/label: AX label, else OCR text, else VLM description. */\n text: string;\n /** Element kind/role when known: AX role, else VLM kind. */\n kind?: string;\n displayId: number;\n /** AX actions when the element is accessibility-grounded. */\n actions?: string[];\n /** Provenance — every source that contributed to this element, in fixed\n * `ocr < ax < vlm` order for stability. Always non-empty. */\n groundingSources: GroundingSource[];\n}\n\nexport interface MergeScreenInput {\n readonly ocr?: readonly OcrBoxLike[];\n readonly ax?: readonly AxNodeLike[];\n readonly vlm?: readonly VlmElementLike[];\n}\n\nexport interface MergeScreenOptions {\n /** Boxes whose IoU exceeds this collapse into one element (default 0.6). */\n readonly iouThreshold?: number;\n}\n\nconst DEFAULT_IOU_THRESHOLD = 0.6;\n\n/** Intersection-over-union of two `[x, y, w, h]` boxes. 0 when either is empty\n * or they don't overlap. */\nexport function bboxIou(a: Bbox, b: Bbox): number {\n const [ax, ay, aw, ah] = a;\n const [bx, by, bw, bh] = b;\n if (aw <= 0 || ah <= 0 || bw <= 0 || bh <= 0) return 0;\n const ix = Math.max(ax, bx);\n const iy = Math.max(ay, by);\n const ix2 = Math.min(ax + aw, bx + bw);\n const iy2 = Math.min(ay + ah, by + bh);\n const iw = Math.max(0, ix2 - ix);\n const ih = Math.max(0, iy2 - iy);\n const inter = iw * ih;\n if (inter <= 0) return 0;\n const union = aw * ah + bw * bh - inter;\n return union > 0 ? inter / union : 0;\n}\n\ninterface Cluster {\n displayId: number;\n bbox: Bbox;\n ax?: AxNodeLike;\n ocr?: OcrBoxLike;\n vlm?: VlmElementLike;\n}\n\n/** Stable sort key: top-to-bottom, then left-to-right, then by id for ties. 
*/\nfunction byPosition(\n a: { bbox: Bbox; id: string },\n b: { bbox: Bbox; id: string },\n): number {\n return (\n a.bbox[1] - b.bbox[1] || a.bbox[0] - b.bbox[0] || a.id.localeCompare(b.id)\n );\n}\n\n/**\n * Merge OCR boxes + AX clickables + VLM elements into one deduplicated,\n * deterministically-ordered element list.\n *\n * - Elements from different sources whose bboxes overlap above `iouThreshold`\n * (and share a `displayId`) collapse into one element that records all\n * contributing sources in `groundingSources`.\n * - Field precedence is AX > OCR > VLM (AX wins id/label/role; OCR text fills\n * in when AX has no label; VLM desc is the last resort).\n * - Output order is top-to-bottom, then left-to-right, so the envelope is\n * stable across turns regardless of input ordering.\n * - Degrades gracefully: any source may be absent/empty (e.g. accessibility off)\n * and the function never throws.\n */\nexport function mergeScreenElements(\n input: MergeScreenInput,\n options: MergeScreenOptions = {},\n): GetScreenElement[] {\n const threshold = options.iouThreshold ?? DEFAULT_IOU_THRESHOLD;\n const clusters: Cluster[] = [];\n\n const attach = (\n displayId: number,\n bbox: Bbox,\n set: (c: Cluster) => void,\n ): void => {\n const match = clusters.find(\n (c) => c.displayId === displayId && bboxIou(c.bbox, bbox) >= threshold,\n );\n if (match) {\n set(match);\n return;\n }\n const cluster: Cluster = { displayId, bbox };\n set(cluster);\n clusters.push(cluster);\n };\n\n // Process in precedence order so a cluster's representative bbox is set by its\n // highest-priority contributing source, and pre-sort each source by position\n // so cluster creation order is deterministic.\n for (const ax of [...(input.ax ?? [])].sort(byPosition)) {\n attach(ax.displayId, ax.bbox, (c) => {\n if (!c.ax) c.ax = ax;\n });\n }\n for (const ocr of [...(input.ocr ?? 
[])].sort(byPosition)) {\n attach(ocr.displayId, ocr.bbox, (c) => {\n if (!c.ocr) c.ocr = ocr;\n });\n }\n for (const vlm of [...(input.vlm ?? [])].sort(byPosition)) {\n attach(vlm.displayId, vlm.bbox, (c) => {\n if (!c.vlm) c.vlm = vlm;\n });\n }\n\n const elements = clusters.map((c): GetScreenElement => {\n const groundingSources: GroundingSource[] = [];\n if (c.ocr) groundingSources.push(\"ocr\");\n if (c.ax) groundingSources.push(\"ax\");\n if (c.vlm) groundingSources.push(\"vlm\");\n\n const id = c.ax?.id ?? c.ocr?.id ?? c.vlm?.id ?? \"el\";\n const text = c.ax?.label || c.ocr?.text || c.vlm?.desc || \"\";\n const kind = c.ax?.role ?? c.vlm?.kind;\n const [x, y, w, h] = c.bbox;\n\n const element: GetScreenElement = {\n id,\n bbox: [x, y, w, h],\n text,\n displayId: c.displayId,\n groundingSources,\n };\n if (kind !== undefined) element.kind = kind;\n if (c.ax?.actions && c.ax.actions.length > 0) {\n element.actions = [...c.ax.actions];\n }\n return element;\n });\n\n return elements.sort(byPosition);\n}\n",
|
|
6
|
+
"/**\n * Set-of-Marks (SoM) grounding — #9170 M9.\n *\n * Set-of-Marks is the grounding technique trycua/cua uses via OmniParser: take\n * the icon detections (GGUF YOLO) and the OCR text boxes (the `CoordOcrProvider`\n * seam plugin-vision already feeds), fuse them into ONE deduplicated set of\n * candidate targets, draw a 1-indexed numbered box over each on the screenshot,\n * and let the VLM pick a *number* instead of raw pixel coordinates. Numeric\n * selection is far more reliable than free-floating coordinate regression.\n *\n * This module is split into:\n * - a PURE core (`buildSetOfMarks`) — icon-over-text suppression + NMS +\n * deterministic reading-order numbering. Dependency-free and structurally\n * typed so it unit-tests with zero environment, mirroring\n * `get-screen-elements.ts` and `computeruse-ocr-bridge.ts`.\n * - a renderer (`renderSetOfMarksOverlay`) — composites a numbered SVG overlay\n * onto the source PNG via `sharp`.\n *\n * The OmniParser fusion rules we reproduce:\n * 1. Icon-over-text suppression — a text box mostly covered by an icon box is\n * dropped; the icon is the interactable, the text is its caption.\n * 2. Non-max suppression — overlapping boxes collapse to the highest-priority\n * one (icons outrank text on ties), so each target is marked once.\n * 3. 1-indexed reading-order numbering — top-to-bottom, then left-to-right,\n * with a row tolerance so a visual row isn't scrambled by sub-pixel y jitter.\n */\n\nimport { bboxIou } from \"./get-screen-elements.js\";\n\n/** Display-local bounding box `[x, y, w, h]`. */\nexport type SomBbox = readonly [number, number, number, number];\n\n/** Where a candidate mark came from. Icons outrank text during suppression. */\nexport type SomSource = \"icon\" | \"text\";\n\n/** A raw candidate box fed into the SoM fusion. */\nexport interface SomCandidate {\n readonly bbox: SomBbox;\n readonly source: SomSource;\n /** Class name (icon) or recognized text (text). Optional. 
*/\n readonly label?: string;\n /** Detector/OCR confidence in [0, 1]. Missing → treated as 0.5. */\n readonly score?: number;\n}\n\n/** A finalized, numbered mark in the overlay. */\nexport interface SomMark {\n /** 1-indexed mark number shown in the overlay. */\n readonly index: number;\n readonly bbox: [number, number, number, number];\n /** Box center `[x, y]` — the click target the VLM's number resolves to. */\n readonly center: [number, number];\n readonly source: SomSource;\n readonly label?: string;\n readonly score: number;\n}\n\nexport interface SetOfMarksOptions {\n /**\n * A text box is dropped when this fraction of its area is covered by an icon\n * box (icon-over-text suppression). Default 0.7.\n */\n readonly iconOverTextCoverage?: number;\n /** Boxes overlapping above this IoU collapse during NMS. Default 0.5. */\n readonly nmsIouThreshold?: number;\n /**\n * Rows within this many pixels of vertical offset are treated as the same\n * reading row (so left-to-right ordering holds across a row). Default 12.\n */\n readonly rowTolerance?: number;\n /** Drop candidates with a smaller score before fusion. Default 0 (keep all). */\n readonly minScore?: number;\n}\n\nconst DEFAULT_ICON_OVER_TEXT_COVERAGE = 0.7;\nconst DEFAULT_NMS_IOU = 0.5;\nconst DEFAULT_ROW_TOLERANCE = 12;\nconst DEFAULT_SCORE = 0.5;\n\ninterface NormCandidate {\n bbox: [number, number, number, number];\n source: SomSource;\n label?: string;\n score: number;\n}\n\n/** Area of intersection of two `[x, y, w, h]` boxes (0 when disjoint). Pure. */\nfunction intersectionArea(a: SomBbox, b: SomBbox): number {\n const ix = Math.max(a[0], b[0]);\n const iy = Math.max(a[1], b[1]);\n const ix2 = Math.min(a[0] + a[2], b[0] + b[2]);\n const iy2 = Math.min(a[1] + a[3], b[1] + b[3]);\n return Math.max(0, ix2 - ix) * Math.max(0, iy2 - iy);\n}\n\n/** Fraction of `inner`'s area covered by `outer` (containment ratio). Pure. 
*/\nexport function coverageRatio(inner: SomBbox, outer: SomBbox): number {\n const area = inner[2] * inner[3];\n if (area <= 0) return 0;\n return intersectionArea(inner, outer) / area;\n}\n\nfunction isValidBox(b: SomBbox): boolean {\n return (\n Number.isFinite(b[0]) &&\n Number.isFinite(b[1]) &&\n Number.isFinite(b[2]) &&\n Number.isFinite(b[3]) &&\n b[2] > 0 &&\n b[3] > 0\n );\n}\n\n/**\n * Fuse icon + text candidates into a deduplicated, 1-indexed set of marks.\n *\n * Pure and deterministic: same inputs → identical numbering, regardless of\n * input ordering. Degenerate boxes (non-finite / zero-area) are dropped.\n */\nexport function buildSetOfMarks(\n candidates: readonly SomCandidate[],\n options: SetOfMarksOptions = {},\n): SomMark[] {\n const iconOverText =\n options.iconOverTextCoverage ?? DEFAULT_ICON_OVER_TEXT_COVERAGE;\n const nmsIou = options.nmsIouThreshold ?? DEFAULT_NMS_IOU;\n const rowTolerance = options.rowTolerance ?? DEFAULT_ROW_TOLERANCE;\n const minScore = options.minScore ?? 0;\n\n const norm: NormCandidate[] = candidates\n .filter((c) => isValidBox(c.bbox))\n .map((c) => ({\n bbox: [c.bbox[0], c.bbox[1], c.bbox[2], c.bbox[3]] as [\n number,\n number,\n number,\n number,\n ],\n source: c.source,\n label: c.label,\n score: c.score ?? DEFAULT_SCORE,\n }))\n .filter((c) => c.score >= minScore);\n\n const icons = norm.filter((c) => c.source === \"icon\");\n const texts = norm.filter((c) => c.source === \"text\");\n\n // 1. Icon-over-text suppression: drop text mostly covered by some icon.\n const keptTexts = texts.filter(\n (t) =>\n !icons.some((icon) => coverageRatio(t.bbox, icon.bbox) >= iconOverText),\n );\n\n // 2. NMS over the survivors. Sort so icons outrank text, then by score desc,\n // then by a stable positional key — the greedy keep is deterministic.\n const pool = [...icons, ...keptTexts].sort((a, b) => {\n if (a.source !== b.source) return a.source === \"icon\" ? 
-1 : 1;\n if (b.score !== a.score) return b.score - a.score;\n return a.bbox[1] - b.bbox[1] || a.bbox[0] - b.bbox[0];\n });\n\n const kept: NormCandidate[] = [];\n for (const cand of pool) {\n const overlaps = kept.some((k) => bboxIou(cand.bbox, k.bbox) >= nmsIou);\n if (!overlaps) kept.push(cand);\n }\n\n // 3. Reading-order numbering: bucket into rows (within rowTolerance), then\n // order rows top-to-bottom and members left-to-right.\n const ordered = [...kept].sort((a, b) => {\n const sameRow = Math.abs(a.bbox[1] - b.bbox[1]) <= rowTolerance;\n if (sameRow) return a.bbox[0] - b.bbox[0] || a.bbox[1] - b.bbox[1];\n return a.bbox[1] - b.bbox[1];\n });\n\n return ordered.map((c, i): SomMark => {\n const [x, y, w, h] = c.bbox;\n const mark: SomMark = {\n index: i + 1,\n bbox: [x, y, w, h],\n center: [Math.round(x + w / 2), Math.round(y + h / 2)],\n source: c.source,\n score: c.score,\n };\n if (c.label !== undefined) (mark as { label?: string }).label = c.label;\n return mark;\n });\n}\n\n/** XML-escape a label so it is safe to embed in the SVG overlay. */\nfunction escapeXml(s: string): string {\n return s\n .replace(/&/g, \"&\")\n .replace(/</g, \"<\")\n .replace(/>/g, \">\")\n .replace(/\"/g, \""\")\n .replace(/'/g, \"'\");\n}\n\nexport interface SomOverlayOptions extends SetOfMarksOptions {\n /** Stroke / badge color for icon marks. Default \"#FF8C00\" (orange). */\n readonly iconColor?: string;\n /** Stroke / badge color for text marks. Default \"#1FA9FF\". */\n readonly textColor?: string;\n /** Box stroke width in px. Default 2. */\n readonly strokeWidth?: number;\n /** Badge font size in px. Default 13. */\n readonly fontSize?: number;\n}\n\n/**\n * Build the SVG overlay markup for a set of marks over a `width × height`\n * canvas. Pure (no I/O) — separated from the raster composite so it is\n * unit-testable and reusable by non-sharp consumers (e.g. 
a browser overlay).\n */\nexport function buildSetOfMarksSvg(\n marks: readonly SomMark[],\n width: number,\n height: number,\n options: SomOverlayOptions = {},\n): string {\n const iconColor = options.iconColor ?? \"#FF8C00\";\n const textColor = options.textColor ?? \"#1FA9FF\";\n const strokeWidth = options.strokeWidth ?? 2;\n const fontSize = options.fontSize ?? 13;\n const badgeW = Math.round(fontSize * 1.4);\n const badgeH = Math.round(fontSize * 1.3);\n\n const parts: string[] = [];\n for (const mark of marks) {\n const [x, y, w, h] = mark.bbox;\n const color = mark.source === \"icon\" ? iconColor : textColor;\n const label = String(mark.index);\n // Badge sits just inside the top-left of the box, clamped to the canvas.\n const bx = Math.max(0, Math.min(x, width - badgeW));\n const by = Math.max(0, Math.min(y, height - badgeH));\n parts.push(\n `<rect x=\"${x}\" y=\"${y}\" width=\"${w}\" height=\"${h}\" fill=\"none\" ` +\n `stroke=\"${color}\" stroke-width=\"${strokeWidth}\" />`,\n `<rect x=\"${bx}\" y=\"${by}\" width=\"${badgeW}\" height=\"${badgeH}\" ` +\n `fill=\"${color}\" />`,\n `<text x=\"${bx + badgeW / 2}\" y=\"${by + badgeH / 2}\" ` +\n `font-family=\"monospace\" font-size=\"${fontSize}\" font-weight=\"bold\" ` +\n `fill=\"#000000\" text-anchor=\"middle\" dominant-baseline=\"central\">` +\n `${escapeXml(label)}</text>`,\n );\n }\n return (\n `<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"${width}\" ` +\n `height=\"${height}\">${parts.join(\"\")}</svg>`\n );\n}\n\n// ── Source adapters ─────────────────────────────────────────────────────────\n//\n// Turn the two grounding sources we already have — GGUF YOLO icon detections\n// and `CoordOcrProvider` text blocks — into SoM candidates. Both are described\n// structurally so this module keeps no hard dependency on the detector or OCR\n// implementations (same no-hard-dep rule as `get-screen-elements.ts`).\n\n/** Structural shape of a `DetectedObject` (YOLO) — `{x,y,width,height}` box. 
*/\nexport interface DetectedObjectLike {\n readonly boundingBox: {\n readonly x: number;\n readonly y: number;\n readonly width: number;\n readonly height: number;\n };\n readonly type?: string;\n readonly confidence?: number;\n}\n\n/** Structural shape of an OCR block (`OcrWithCoordsBlock`). */\nexport interface OcrBlockLike {\n readonly text?: string;\n readonly confidence?: number;\n readonly bbox: {\n readonly x: number;\n readonly y: number;\n readonly width: number;\n readonly height: number;\n };\n}\n\n/** Adapt GGUF YOLO detections into icon-source SoM candidates. */\nexport function somCandidatesFromDetections(\n objects: readonly DetectedObjectLike[],\n): SomCandidate[] {\n return objects.map((o) => {\n const c: SomCandidate = {\n bbox: [\n o.boundingBox.x,\n o.boundingBox.y,\n o.boundingBox.width,\n o.boundingBox.height,\n ],\n source: \"icon\",\n score: o.confidence ?? DEFAULT_SCORE,\n };\n if (o.type !== undefined) (c as { label?: string }).label = o.type;\n return c;\n });\n}\n\n/** Adapt `CoordOcrProvider` text blocks into text-source SoM candidates. */\nexport function somCandidatesFromOcr(\n blocks: readonly OcrBlockLike[],\n): SomCandidate[] {\n return blocks.map((b) => {\n const c: SomCandidate = {\n bbox: [b.bbox.x, b.bbox.y, b.bbox.width, b.bbox.height],\n source: \"text\",\n score: b.confidence ?? DEFAULT_SCORE,\n };\n if (b.text !== undefined) (c as { label?: string }).label = b.text;\n return c;\n });\n}\n\n/**\n * Convenience: fuse YOLO detections + OCR blocks straight into a numbered mark\n * set. The seam `detect_elements`/grounding calls — pass whatever icon and text\n * boxes the scene already has.\n */\nexport function buildSceneSetOfMarks(\n args: {\n readonly detections?: readonly DetectedObjectLike[];\n readonly ocrBlocks?: readonly OcrBlockLike[];\n },\n options?: SetOfMarksOptions,\n): SomMark[] {\n return buildSetOfMarks(\n [\n ...somCandidatesFromDetections(args.detections ?? []),\n ...somCandidatesFromOcr(args.ocrBlocks ?? 
[]),\n ],\n options,\n );\n}\n\n/**\n * Composite a numbered Set-of-Marks overlay onto a source PNG.\n *\n * Returns PNG bytes the same size as the input. `sharp` is loaded lazily so the\n * pure core (`buildSetOfMarks`) carries no native dependency for consumers that\n * only need coordinates.\n */\nexport async function renderSetOfMarksOverlay(\n pngBytes: Uint8Array,\n marks: readonly SomMark[],\n options: SomOverlayOptions = {},\n): Promise<Buffer> {\n const { default: sharp } = await import(\"sharp\");\n const base = sharp(Buffer.from(pngBytes));\n const meta = await base.metadata();\n const width = meta.width ?? 0;\n const height = meta.height ?? 0;\n if (width <= 0 || height <= 0) {\n throw new Error(\n `[vision/som] cannot overlay marks: source image has no dimensions`,\n );\n }\n const svg = buildSetOfMarksSvg(marks, width, height, options);\n return base\n .composite([{ input: Buffer.from(svg), top: 0, left: 0 }])\n .png()\n .toBuffer();\n}\n"
|
|
7
|
+
],
|
|
8
|
+
"mappings": ";;;;;;;;;;;;;;;;;;;AAoFO,SAAS,OAAO,CAAC,GAAS,GAAiB;AAAA,EAChD,OAAO,IAAI,IAAI,IAAI,MAAM;AAAA,EACzB,OAAO,IAAI,IAAI,IAAI,MAAM;AAAA,EACzB,IAAI,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM;AAAA,IAAG,OAAO;AAAA,EACrD,MAAM,KAAK,KAAK,IAAI,IAAI,EAAE;AAAA,EAC1B,MAAM,KAAK,KAAK,IAAI,IAAI,EAAE;AAAA,EAC1B,MAAM,MAAM,KAAK,IAAI,KAAK,IAAI,KAAK,EAAE;AAAA,EACrC,MAAM,MAAM,KAAK,IAAI,KAAK,IAAI,KAAK,EAAE;AAAA,EACrC,MAAM,KAAK,KAAK,IAAI,GAAG,MAAM,EAAE;AAAA,EAC/B,MAAM,KAAK,KAAK,IAAI,GAAG,MAAM,EAAE;AAAA,EAC/B,MAAM,QAAQ,KAAK;AAAA,EACnB,IAAI,SAAS;AAAA,IAAG,OAAO;AAAA,EACvB,MAAM,QAAQ,KAAK,KAAK,KAAK,KAAK;AAAA,EAClC,OAAO,QAAQ,IAAI,QAAQ,QAAQ;AAAA;;;ACvBrC,IAAM,kCAAkC;AACxC,IAAM,kBAAkB;AACxB,IAAM,wBAAwB;AAC9B,IAAM,gBAAgB;AAUtB,SAAS,gBAAgB,CAAC,GAAY,GAAoB;AAAA,EACxD,MAAM,KAAK,KAAK,IAAI,EAAE,IAAI,EAAE,EAAE;AAAA,EAC9B,MAAM,KAAK,KAAK,IAAI,EAAE,IAAI,EAAE,EAAE;AAAA,EAC9B,MAAM,MAAM,KAAK,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE;AAAA,EAC7C,MAAM,MAAM,KAAK,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE;AAAA,EAC7C,OAAO,KAAK,IAAI,GAAG,MAAM,EAAE,IAAI,KAAK,IAAI,GAAG,MAAM,EAAE;AAAA;AAI9C,SAAS,aAAa,CAAC,OAAgB,OAAwB;AAAA,EACpE,MAAM,OAAO,MAAM,KAAK,MAAM;AAAA,EAC9B,IAAI,QAAQ;AAAA,IAAG,OAAO;AAAA,EACtB,OAAO,iBAAiB,OAAO,KAAK,IAAI;AAAA;AAG1C,SAAS,UAAU,CAAC,GAAqB;AAAA,EACvC,OACE,OAAO,SAAS,EAAE,EAAE,KACpB,OAAO,SAAS,EAAE,EAAE,KACpB,OAAO,SAAS,EAAE,EAAE,KACpB,OAAO,SAAS,EAAE,EAAE,KACpB,EAAE,KAAK,KACP,EAAE,KAAK;AAAA;AAUJ,SAAS,eAAe,CAC7B,YACA,UAA6B,CAAC,GACnB;AAAA,EACX,MAAM,eACJ,QAAQ,wBAAwB;AAAA,EAClC,MAAM,SAAS,QAAQ,mBAAmB;AAAA,EAC1C,MAAM,eAAe,QAAQ,gBAAgB;AAAA,EAC7C,MAAM,WAAW,QAAQ,YAAY;AAAA,EAErC,MAAM,OAAwB,WAC3B,OAAO,CAAC,MAAM,WAAW,EAAE,IAAI,CAAC,EAChC,IAAI,CAAC,OAAO;AAAA,IACX,MAAM,CAAC,EAAE,KAAK,IAAI,EAAE,KAAK,IAAI,EAAE,KAAK,IAAI,EAAE,KAAK,EAAE;AAAA,IAMjD,QAAQ,EAAE;AAAA,IACV,OAAO,EAAE;AAAA,IACT,OAAO,EAAE,SAAS;AAAA,EACpB,EAAE,EACD,OAAO,CAAC,MAAM,EAAE,SAAS,QAAQ;AAAA,EAEpC,MAAM,QAAQ,KAAK,OAAO,CAAC,MAAM,EAAE,WAAW,MAAM;AAAA,EACpD,MAAM,QAAQ,KAAK,OAAO,CAAC,MAAM,EAAE,WAAW,MAAM;AAAA,EAGpD,MAAM,YAAY,MAAM,OACtB,CAAC,MACC,CAAC,MAAM,KAAK,CA
AC,SAAS,cAAc,EAAE,MAAM,KAAK,IAAI,KAAK,YAAY,CAC1E;AAAA,EAIA,MAAM,OAAO,CAAC,GAAG,OAAO,GAAG,SAAS,EAAE,KAAK,CAAC,GAAG,MAAM;AAAA,IACnD,IAAI,EAAE,WAAW,EAAE;AAAA,MAAQ,OAAO,EAAE,WAAW,SAAS,KAAK;AAAA,IAC7D,IAAI,EAAE,UAAU,EAAE;AAAA,MAAO,OAAO,EAAE,QAAQ,EAAE;AAAA,IAC5C,OAAO,EAAE,KAAK,KAAK,EAAE,KAAK,MAAM,EAAE,KAAK,KAAK,EAAE,KAAK;AAAA,GACpD;AAAA,EAED,MAAM,OAAwB,CAAC;AAAA,EAC/B,WAAW,QAAQ,MAAM;AAAA,IACvB,MAAM,WAAW,KAAK,KAAK,CAAC,MAAM,QAAQ,KAAK,MAAM,EAAE,IAAI,KAAK,MAAM;AAAA,IACtE,IAAI,CAAC;AAAA,MAAU,KAAK,KAAK,IAAI;AAAA,EAC/B;AAAA,EAIA,MAAM,UAAU,CAAC,GAAG,IAAI,EAAE,KAAK,CAAC,GAAG,MAAM;AAAA,IACvC,MAAM,UAAU,KAAK,IAAI,EAAE,KAAK,KAAK,EAAE,KAAK,EAAE,KAAK;AAAA,IACnD,IAAI;AAAA,MAAS,OAAO,EAAE,KAAK,KAAK,EAAE,KAAK,MAAM,EAAE,KAAK,KAAK,EAAE,KAAK;AAAA,IAChE,OAAO,EAAE,KAAK,KAAK,EAAE,KAAK;AAAA,GAC3B;AAAA,EAED,OAAO,QAAQ,IAAI,CAAC,GAAG,MAAe;AAAA,IACpC,OAAO,GAAG,GAAG,GAAG,KAAK,EAAE;AAAA,IACvB,MAAM,OAAgB;AAAA,MACpB,OAAO,IAAI;AAAA,MACX,MAAM,CAAC,GAAG,GAAG,GAAG,CAAC;AAAA,MACjB,QAAQ,CAAC,KAAK,MAAM,IAAI,IAAI,CAAC,GAAG,KAAK,MAAM,IAAI,IAAI,CAAC,CAAC;AAAA,MACrD,QAAQ,EAAE;AAAA,MACV,OAAO,EAAE;AAAA,IACX;AAAA,IACA,IAAI,EAAE,UAAU;AAAA,MAAY,KAA4B,QAAQ,EAAE;AAAA,IAClE,OAAO;AAAA,GACR;AAAA;AAIH,SAAS,SAAS,CAAC,GAAmB;AAAA,EACpC,OAAO,EACJ,QAAQ,MAAM,OAAO,EACrB,QAAQ,MAAM,MAAM,EACpB,QAAQ,MAAM,MAAM,EACpB,QAAQ,MAAM,QAAQ,EACtB,QAAQ,MAAM,QAAQ;AAAA;AAmBpB,SAAS,kBAAkB,CAChC,OACA,OACA,QACA,UAA6B,CAAC,GACtB;AAAA,EACR,MAAM,YAAY,QAAQ,aAAa;AAAA,EACvC,MAAM,YAAY,QAAQ,aAAa;AAAA,EACvC,MAAM,cAAc,QAAQ,eAAe;AAAA,EAC3C,MAAM,WAAW,QAAQ,YAAY;AAAA,EACrC,MAAM,SAAS,KAAK,MAAM,WAAW,GAAG;AAAA,EACxC,MAAM,SAAS,KAAK,MAAM,WAAW,GAAG;AAAA,EAExC,MAAM,QAAkB,CAAC;AAAA,EACzB,WAAW,QAAQ,OAAO;AAAA,IACxB,OAAO,GAAG,GAAG,GAAG,KAAK,KAAK;AAAA,IAC1B,MAAM,QAAQ,KAAK,WAAW,SAAS,YAAY;AAAA,IACnD,MAAM,QAAQ,OAAO,KAAK,KAAK;AAAA,IAE/B,MAAM,KAAK,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,QAAQ,MAAM,CAAC;AAAA,IAClD,MAAM,KAAK,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,SAAS,MAAM,CAAC;AAAA,IACnD,MAAM,KACJ,YAAY,SAAS,aAAa,cAAc,oBAC9C,WAAW,wBAAwB,mBACrC,YAAY,UAAU,cAAc,mBAAmB,aACrD,SAAS,aACX,YAAY,KAAK,SAAS,SAAS,KAAK
,SAAS,QAC/C,sCAAsC,kCACtC,qEACA,GAAG,UAAU,KAAK,UACtB;AAAA,EACF;AAAA,EACA,OACE,kDAAkD,YAClD,WAAW,WAAW,MAAM,KAAK,EAAE;AAAA;AAoChC,SAAS,2BAA2B,CACzC,SACgB;AAAA,EAChB,OAAO,QAAQ,IAAI,CAAC,MAAM;AAAA,IACxB,MAAM,IAAkB;AAAA,MACtB,MAAM;AAAA,QACJ,EAAE,YAAY;AAAA,QACd,EAAE,YAAY;AAAA,QACd,EAAE,YAAY;AAAA,QACd,EAAE,YAAY;AAAA,MAChB;AAAA,MACA,QAAQ;AAAA,MACR,OAAO,EAAE,cAAc;AAAA,IACzB;AAAA,IACA,IAAI,EAAE,SAAS;AAAA,MAAY,EAAyB,QAAQ,EAAE;AAAA,IAC9D,OAAO;AAAA,GACR;AAAA;AAII,SAAS,oBAAoB,CAClC,QACgB;AAAA,EAChB,OAAO,OAAO,IAAI,CAAC,MAAM;AAAA,IACvB,MAAM,IAAkB;AAAA,MACtB,MAAM,CAAC,EAAE,KAAK,GAAG,EAAE,KAAK,GAAG,EAAE,KAAK,OAAO,EAAE,KAAK,MAAM;AAAA,MACtD,QAAQ;AAAA,MACR,OAAO,EAAE,cAAc;AAAA,IACzB;AAAA,IACA,IAAI,EAAE,SAAS;AAAA,MAAY,EAAyB,QAAQ,EAAE;AAAA,IAC9D,OAAO;AAAA,GACR;AAAA;AAQI,SAAS,oBAAoB,CAClC,MAIA,SACW;AAAA,EACX,OAAO,gBACL;AAAA,IACE,GAAG,4BAA4B,KAAK,cAAc,CAAC,CAAC;AAAA,IACpD,GAAG,qBAAqB,KAAK,aAAa,CAAC,CAAC;AAAA,EAC9C,GACA,OACF;AAAA;AAUF,eAAsB,uBAAuB,CAC3C,UACA,OACA,UAA6B,CAAC,GACb;AAAA,EACjB,QAAQ,SAAS,UAAU,MAAa;AAAA,EACxC,MAAM,OAAO,MAAM,OAAO,KAAK,QAAQ,CAAC;AAAA,EACxC,MAAM,OAAO,MAAM,KAAK,SAAS;AAAA,EACjC,MAAM,QAAQ,KAAK,SAAS;AAAA,EAC5B,MAAM,SAAS,KAAK,UAAU;AAAA,EAC9B,IAAI,SAAS,KAAK,UAAU,GAAG;AAAA,IAC7B,MAAM,IAAI,MACR,mEACF;AAAA,EACF;AAAA,EACA,MAAM,MAAM,mBAAmB,OAAO,OAAO,QAAQ,OAAO;AAAA,EAC5D,OAAO,KACJ,UAAU,CAAC,EAAE,OAAO,OAAO,KAAK,GAAG,GAAG,KAAK,GAAG,MAAM,EAAE,CAAC,CAAC,EACxD,IAAI,EACJ,SAAS;AAAA;",
|
|
9
|
+
"debugId": "55C971527120024164756E2164756E21",
|
|
10
|
+
"names": []
|
|
11
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test-input shim for plugin-vision.
|
|
3
|
+
*
|
|
4
|
+
* Lets tests inject a fixture image into the vision analysis pipeline
|
|
5
|
+
* without monkey-patching platform capture code.
|
|
6
|
+
*
|
|
7
|
+
* Selection: read `ELIZA_VISION_TEST_INPUT`:
|
|
8
|
+
* - "image" → return the fixture bytes (resolved via
|
|
9
|
+
`ELIZA_VISION_TEST_FIXTURE` env var, defaulting to the bundled
|
|
10
|
+
* test/fixtures/sample-scene.png path).
|
|
11
|
+
* - "camera" / "screen" / unset → return null, signaling that the caller
|
|
12
|
+
* should use the existing platform capture path.
|
|
13
|
+
*
|
|
14
|
+
* Single-purpose helper: anything that wants test-injected pixels reads
|
|
15
|
+
* `getTestImage()`. When it returns a Buffer, use those bytes. When it
|
|
16
|
+
* returns null, fall through to live capture.
|
|
17
|
+
*/
|
|
18
|
+
/**
 * Test-input selection modes. `"unset"` is a distinct member alongside the
 * three named sources.
 */
export type TestInputMode =
    | "image"
    | "camera"
    | "screen"
    | "unset";
|
|
19
|
+
/**
 * Report the currently selected test-input mode.
 *
 * NOTE(review): presumably derived from the `ELIZA_VISION_TEST_INPUT`
 * environment variable; the implementation is not visible in this
 * declaration file, so confirm against the source module.
 *
 * @returns The active {@link TestInputMode}.
 */
export declare function getTestInputMode(): TestInputMode;
|
|
20
|
+
/**
 * Return the fixture image bytes (a PNG buffer) when
 * `ELIZA_VISION_TEST_INPUT=image`; return `null` in every other case.
 *
 * The result is cached per fixture path for the lifetime of the process.
 *
 * @returns The fixture bytes, or `null` when no test image is selected.
 */
export declare function getTestImage(): Buffer | null;
|
|
25
|
+
//# sourceMappingURL=test-input.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"test-input.d.ts","sourceRoot":"","sources":["../src/test-input.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAMH,MAAM,MAAM,aAAa,GAAG,OAAO,GAAG,QAAQ,GAAG,QAAQ,GAAG,OAAO,CAAC;AAEpE,wBAAgB,gBAAgB,IAAI,aAAa,CAMhD;AAcD;;;GAGG;AACH,wBAAgB,YAAY,IAAI,MAAM,GAAG,IAAI,CAa5C"}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import type { DescribePauseReason } from "./describe-backpressure";
|
|
2
|
+
/**
 * Service-type constant object exposing a single key, `VISION`, whose value
 * is the string literal `"VISION"`.
 */
export declare const VisionServiceType: {
    VISION: "VISION";
};
|
|
5
|
+
/**
 * Module augmentation: adds a `VISION` entry to the `ServiceTypeRegistry`
 * interface declared by `@elizaos/core`.
 */
declare module "@elizaos/core" {
    interface ServiceTypeRegistry {
        VISION: "VISION";
    }
}
|
|
10
|
+
/** Descriptor for a camera: an id, a name, and a connection flag. */
export interface CameraInfo {
    /** Camera identifier. */
    id: string;
    /** Camera name. */
    name: string;
    /** Connection flag for the camera. */
    connected: boolean;
}
|
|
15
|
+
/**
 * Description of a scene: VLM prose plus detected objects, people, and
 * change signals, with optional staleness/backpressure flags.
 */
export interface SceneDescription {
    /** Timestamp of the freshest processed frame behind the object/person/change signals. */
    timestamp: number;
    /** Timestamp of the VLM prose held in `description`; it may be older than the frame data. */
    descriptionTimestamp?: number;
    /** Prose description of the scene. */
    description: string;
    objects: DetectedObject[];
    people: PersonInfo[];
    sceneChanged: boolean;
    changePercentage: number;
    /** Set to true when the VLM prose was reused after a describe skip. */
    descriptionStale?: boolean;
    /** Set to true while the VLM describe step is paused by backpressure. */
    describePaused?: boolean;
    /** Pause reason, with `null` excluded from the underlying reason type. */
    describePauseReason?: Exclude<DescribePauseReason, null>;
    audioTranscription?: string;
}
|
|
32
|
+
/** A detected object: id, type string, confidence value, and bounding box. */
export interface DetectedObject {
    id: string;
    /** Type string for the detected object. */
    type: string;
    /** Detection confidence (range not specified here). */
    confidence: number;
    boundingBox: BoundingBox;
}
|
|
38
|
+
/**
 * A detected person: pose and facing classifications, a confidence value, a
 * bounding box, and optional named keypoints.
 */
export interface PersonInfo {
    id: string;
    /** Pose classification; `"unknown"` is one of the allowed values. */
    pose: "sitting" | "standing" | "lying" | "unknown";
    /** Facing classification; `"unknown"` is one of the allowed values. */
    facing: "camera" | "away" | "left" | "right" | "unknown";
    confidence: number;
    boundingBox: BoundingBox;
    /** Optional keypoints, each with a part name, an `{x, y}` position, and a score. */
    keypoints?: {
        part: string;
        position: {
            x: number;
            y: number;
        };
        score: number;
    }[];
}
|
|
53
|
+
/** Rectangle given by an `x`/`y` position plus `width` and `height`. */
export interface BoundingBox {
    x: number;
    y: number;
    width: number;
    height: number;
}
|
|
59
|
+
/** A single frame: timestamp, dimensions, raw bytes, and the byte format tag. */
export interface VisionFrame {
    timestamp: number;
    width: number;
    height: number;
    /** Frame bytes; interpret according to `format`. */
    data: Buffer;
    /** Format tag for the bytes in `data`. */
    format: "rgb" | "rgba" | "jpeg" | "png";
}
|
|
66
|
+
/**
 * String enum of vision modes. Each member's value equals its own name.
 *
 * NOTE(review): member names suggest which capture sources are active —
 * confirm against the service implementation.
 */
export declare enum VisionMode {
    OFF = "OFF",
    CAMERA = "CAMERA",
    SCREEN = "SCREEN",
    BOTH = "BOTH"
}
|
|
72
|
+
/** A screen capture: timestamp, dimensions, bytes, and its list of tiles. */
export interface ScreenCapture {
    timestamp: number;
    width: number;
    height: number;
    data: Buffer;
    /** Tiles belonging to this capture. */
    tiles: ScreenTile[];
}
|
|
79
|
+
/**
 * One tile of a screen capture: grid coordinates, origin and size within the
 * capture, and optional pixel data, analysis, and source-display details.
 */
export interface ScreenTile {
    id: string;
    row: number;
    col: number;
    /** X of the tile origin inside the source capture, in display-local pixels. */
    x: number;
    /** Y of the tile origin inside the source capture, in display-local pixels. */
    y: number;
    width: number;
    height: number;
    data?: Buffer;
    analysis?: TileAnalysis;
    /**
     * Id of the source display when the tile came from a per-display capture
     * pass. Kept as a string so opaque platform ids (CGDirectDisplayID, sway
     * output names) round-trip without lossy coercion.
     */
    displayId?: string;
    /**
     * Absolute pixel X of the tile origin in the source display's native
     * space. Equal to `x` when `displayId` refers to a single-display capture;
     * it becomes load-bearing once the capture pipeline composes per-display
     * screenshots into a multi-monitor stream.
     */
    sourceX?: number;
    /** Absolute pixel Y of the tile origin in the source display's native space. */
    sourceY?: number;
}
|
|
107
|
+
/** Analysis attached to a tile: a timestamp plus optional OCR, objects, text, and summary. */
export interface TileAnalysis {
    timestamp: number;
    ocr?: OCRResult;
    objects?: DetectedObject[];
    text?: string;
    summary?: string;
}
|
|
114
|
+
/**
 * OCR output: `text` and `fullText` strings plus a list of blocks, each with
 * its own text, bounding box, confidence, and optional per-word entries.
 */
export interface OCRResult {
    text: string;
    /** Recognized blocks; each may carry a `words` breakdown of the same shape. */
    blocks: {
        text: string;
        bbox: BoundingBox;
        confidence: number;
        words?: {
            text: string;
            bbox: BoundingBox;
            confidence: number;
        }[];
    }[];
    fullText: string;
}
|
|
128
|
+
/**
 * A {@link SceneDescription} extended with an optional screen capture and an
 * optional screen-analysis record.
 */
export interface EnhancedSceneDescription extends SceneDescription {
    screenCapture?: ScreenCapture;
    /** Optional screen analysis; every field inside it is itself optional. */
    screenAnalysis?: {
        fullScreenOCR?: string;
        activeTile?: TileAnalysis;
        gridSummary?: string;
        focusedApp?: string;
        /** UI elements, each with a type, text, and bounding-box position. */
        uiElements?: {
            type: string;
            text: string;
            position: BoundingBox;
        }[];
    };
}
|
|
142
|
+
/**
 * Configuration options for the vision plugin.
 *
 * Every member is optional. Defaults are applied elsewhere and are not
 * visible from this declaration.
 * NOTE(review): units of the interval and threshold fields are not shown
 * here; confirm against the consuming code before documenting them.
 */
export interface VisionConfig {
    cameraName?: string;
    pixelChangeThreshold?: number;
    updateInterval?: number;
    enablePoseDetection?: boolean;
    enableObjectDetection?: boolean;
    tfUpdateInterval?: number;
    vlmUpdateInterval?: number;
    tfChangeThreshold?: number;
    vlmChangeThreshold?: number;
    visionMode?: VisionMode;
    screenCaptureInterval?: number;
    tileSize?: number;
    /** Order in which tiles are processed. */
    tileProcessingOrder?: "sequential" | "priority" | "random";
    ocrEnabled?: boolean;
    /** Rectangle given as origin (`x`, `y`) plus `width` and `height`. */
    screenRegion?: {
        x: number;
        y: number;
        width: number;
        height: number;
    };
    displayIndex?: number;
    captureAllDisplays?: boolean;
    targetScreenFPS?: number;
    /** List of rectangles, each with the same shape as `screenRegion`. */
    textRegions?: Array<{
        x: number;
        y: number;
        width: number;
        height: number;
    }>;
}
|
|
173
|
+
/**
 * An entity tracked over time: its identity, kind, first/last sighting,
 * last known position, recorded appearances and attributes.
 */
export interface TrackedEntity {
    id: string;
    entityType: "person" | "object" | "pet";
    /** NOTE(review): time unit is not visible here, presumably epoch milliseconds; confirm. */
    firstSeen: number;
    lastSeen: number;
    lastPosition: BoundingBox;
    appearances: EntityAppearance[];
    attributes: EntityAttributes;
    /** Optional world identifier for the entity. */
    worldId?: string;
    /** Optional room identifier for the entity. */
    roomId?: string;
}
|
|
184
|
+
/**
 * One recorded appearance of an entity: when, where (bounding box) and with
 * what confidence, plus an optional embedding vector and optional keypoints.
 *
 * Module-private: it is reachable only through types that reference it.
 */
interface EntityAppearance {
    timestamp: number;
    boundingBox: BoundingBox;
    confidence: number;
    /** Optional numeric embedding vector; its dimensionality is not fixed by this type. */
    embedding?: number[];
    /** Optional keypoints, each with a named part, an (x, y) position and a score. */
    keypoints?: Array<{
        part: string;
        position: {
            x: number;
            y: number;
        };
        score: number;
    }>;
}
|
|
198
|
+
/**
 * Open-ended attribute bag for an entity.
 *
 * The index signature admits arbitrary extra keys whose values are a string,
 * number, boolean, null, undefined, string array or number array. The named
 * members below are all optional and must stay assignable to that union.
 */
export interface EntityAttributes {
    [key: string]: string | number | boolean | null | undefined | string[] | number[];
    name?: string;
    faceEmbedding?: number[];
    faceId?: string;
    clothing?: string[];
    hairColor?: string;
    accessories?: string[];
    objectType?: string;
    color?: string;
    size?: "small" | "medium" | "large";
    description?: string;
    tags?: string[];
}
|
|
212
|
+
/**
 * In-memory collection of face profiles and their embedding vectors.
 *
 * NOTE(review): both maps are keyed by string, presumably the profile `id`;
 * confirm against the code that populates them.
 */
export interface FaceLibrary {
    /** Face profiles by key. */
    faces: Map<string, FaceProfile>;
    /** For each key, a list of embedding vectors (each a `number[]`). */
    embeddings: Map<string, number[][]>;
}
|
|
216
|
+
/**
 * Profile of a single face: identifier, optional name, stored embedding
 * vectors, sighting statistics and optional descriptive attributes.
 */
export interface FaceProfile {
    id: string;
    name?: string;
    /** Stored embedding vectors, one `number[]` per entry. */
    embeddings: number[][];
    /** NOTE(review): time unit is not visible here, presumably epoch milliseconds; confirm. */
    firstSeen: number;
    lastSeen: number;
    seenCount: number;
    /** Optional descriptive attributes; all are free-form strings. */
    attributes?: {
        age?: string;
        gender?: string;
        emotion?: string;
    };
}
|
|
229
|
+
/**
 * Snapshot of a tracked world: all known entities, which entity ids are
 * currently active, and a record of entities that recently left.
 */
export interface WorldState {
    worldId: string;
    /** Tracked entities by string key. NOTE(review): key is presumably the entity `id`; confirm. */
    entities: Map<string, TrackedEntity>;
    /** NOTE(review): time unit is not visible here, presumably epoch milliseconds; confirm. */
    lastUpdate: number;
    /** Identifiers of the currently active entities. */
    activeEntities: string[];
    /** Entities that left, with the departure time and their last position. */
    recentlyLeft: Array<{
        entityId: string;
        leftAt: number;
        lastPosition: BoundingBox;
    }>;
}
|
|
240
|
+
export {};
|
|
241
|
+
//# sourceMappingURL=types.d.ts.map
|