@elizaos/plugin-vision 2.0.0-beta.1 → 2.0.3-beta.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +73 -301
  3. package/dist/action.d.ts +3 -0
  4. package/dist/action.d.ts.map +1 -0
  5. package/dist/audio-capture-stream.d.ts +42 -0
  6. package/dist/audio-capture-stream.d.ts.map +1 -0
  7. package/dist/audio-capture.d.ts +25 -0
  8. package/dist/audio-capture.d.ts.map +1 -0
  9. package/dist/computeruse-ocr-bridge.d.ts +50 -0
  10. package/dist/computeruse-ocr-bridge.d.ts.map +1 -0
  11. package/dist/config.d.ts +68 -0
  12. package/dist/config.d.ts.map +1 -0
  13. package/dist/describe-backpressure.d.ts +90 -0
  14. package/dist/describe-backpressure.d.ts.map +1 -0
  15. package/dist/dirty-tile-describer.d.ts +102 -0
  16. package/dist/dirty-tile-describer.d.ts.map +1 -0
  17. package/dist/dirty-tile-scene.d.ts +56 -0
  18. package/dist/dirty-tile-scene.d.ts.map +1 -0
  19. package/dist/entity-tracker.d.ts +33 -0
  20. package/dist/entity-tracker.d.ts.map +1 -0
  21. package/dist/face-detector-ggml.d.ts +60 -0
  22. package/dist/face-detector-ggml.d.ts.map +1 -0
  23. package/dist/face-detector-mediapipe.d.ts +25 -0
  24. package/dist/face-detector-mediapipe.d.ts.map +1 -0
  25. package/dist/face-recognition-ggml.d.ts +94 -0
  26. package/dist/face-recognition-ggml.d.ts.map +1 -0
  27. package/dist/get-screen-elements.d.ts +90 -0
  28. package/dist/get-screen-elements.d.ts.map +1 -0
  29. package/dist/get-screen.d.ts +60 -0
  30. package/dist/get-screen.d.ts.map +1 -0
  31. package/dist/image/sharp-compat.d.ts +89 -0
  32. package/dist/image/sharp-compat.d.ts.map +1 -0
  33. package/dist/image-input.d.ts +15 -0
  34. package/dist/image-input.d.ts.map +1 -0
  35. package/dist/index.d.ts +4 -0
  36. package/dist/index.d.ts.map +1 -0
  37. package/dist/index.js +7957 -6238
  38. package/dist/index.js.map +41 -26
  39. package/dist/lifecycle.d.ts +94 -0
  40. package/dist/lifecycle.d.ts.map +1 -0
  41. package/dist/mobile/capacitor-camera.d.ts +85 -0
  42. package/dist/mobile/capacitor-camera.d.ts.map +1 -0
  43. package/dist/native/doctr-ffi.d.ts +40 -0
  44. package/dist/native/doctr-ffi.d.ts.map +1 -0
  45. package/dist/native/yolo-ffi.d.ts +21 -0
  46. package/dist/native/yolo-ffi.d.ts.map +1 -0
  47. package/dist/ocr-host-windows.d.ts +34 -0
  48. package/dist/ocr-host-windows.d.ts.map +1 -0
  49. package/dist/ocr-service-apple-vision-macos.d.ts +51 -0
  50. package/dist/ocr-service-apple-vision-macos.d.ts.map +1 -0
  51. package/dist/ocr-service-doctr.d.ts +61 -0
  52. package/dist/ocr-service-doctr.d.ts.map +1 -0
  53. package/dist/ocr-service-linux-tesseract.d.ts +85 -0
  54. package/dist/ocr-service-linux-tesseract.d.ts.map +1 -0
  55. package/dist/ocr-service-paddleocr.d.ts +59 -0
  56. package/dist/ocr-service-paddleocr.d.ts.map +1 -0
  57. package/dist/ocr-service-windows.d.ts +41 -0
  58. package/dist/ocr-service-windows.d.ts.map +1 -0
  59. package/dist/ocr-service.d.ts +91 -0
  60. package/dist/ocr-service.d.ts.map +1 -0
  61. package/dist/ocr-with-coords.d.ts +103 -0
  62. package/dist/ocr-with-coords.d.ts.map +1 -0
  63. package/dist/person-detector.d.ts +17 -0
  64. package/dist/person-detector.d.ts.map +1 -0
  65. package/dist/provider.d.ts +3 -0
  66. package/dist/provider.d.ts.map +1 -0
  67. package/dist/routes.d.ts +7 -0
  68. package/dist/routes.d.ts.map +1 -0
  69. package/dist/screen-capture-bridge.d.ts +51 -0
  70. package/dist/screen-capture-bridge.d.ts.map +1 -0
  71. package/dist/screen-capture.d.ts +17 -0
  72. package/dist/screen-capture.d.ts.map +1 -0
  73. package/dist/screen-tiler.d.ts +75 -0
  74. package/dist/screen-tiler.d.ts.map +1 -0
  75. package/dist/service.d.ts +176 -0
  76. package/dist/service.d.ts.map +1 -0
  77. package/dist/set-of-marks-provider.d.ts +64 -0
  78. package/dist/set-of-marks-provider.d.ts.map +1 -0
  79. package/dist/som.d.ts +135 -0
  80. package/dist/som.d.ts.map +1 -0
  81. package/dist/som.js +184 -0
  82. package/dist/som.js.map +11 -0
  83. package/dist/test-input.d.ts +25 -0
  84. package/dist/test-input.d.ts.map +1 -0
  85. package/dist/types.d.ts +241 -0
  86. package/dist/types.d.ts.map +1 -0
  87. package/dist/vision-context-augmenter.d.ts +93 -0
  88. package/dist/vision-context-augmenter.d.ts.map +1 -0
  89. package/dist/vision-worker-manager.d.ts +51 -0
  90. package/dist/vision-worker-manager.d.ts.map +1 -0
  91. package/dist/workers/ocr-worker.d.ts +2 -0
  92. package/dist/workers/ocr-worker.d.ts.map +1 -0
  93. package/dist/workers/ocr-worker.js +1075 -7821
  94. package/dist/workers/ocr-worker.js.map +10 -51
  95. package/dist/workers/screen-capture-worker.d.ts +2 -0
  96. package/dist/workers/screen-capture-worker.d.ts.map +1 -0
  97. package/dist/workers/screen-capture-worker.js +364 -6
  98. package/dist/workers/screen-capture-worker.js.map +5 -4
  99. package/dist/workers/worker-logger.d.ts +10 -0
  100. package/dist/workers/worker-logger.d.ts.map +1 -0
  101. package/dist/yolo-detector.d.ts +37 -0
  102. package/dist/yolo-detector.d.ts.map +1 -0
  103. package/native/doctr.cpp/CMakeLists.txt +58 -0
  104. package/native/doctr.cpp/README.md +62 -0
  105. package/native/doctr.cpp/include/doctr.h +91 -0
  106. package/native/doctr.cpp/scripts/convert.py +98 -0
  107. package/native/doctr.cpp/src/doctr_det.cpp +112 -0
  108. package/native/doctr.cpp/src/doctr_rec.cpp +103 -0
  109. package/native/macos-vision-ocr.swift +113 -0
  110. package/native/mobilefacenet.cpp/README.md +13 -0
  111. package/native/movenet.cpp/README.md +10 -0
  112. package/native/retinaface.cpp/README.md +12 -0
  113. package/native/yolo.cpp/CMakeLists.txt +57 -0
  114. package/native/yolo.cpp/README.md +64 -0
  115. package/native/yolo.cpp/build.mjs +76 -0
  116. package/native/yolo.cpp/include/yolo.h +62 -0
  117. package/native/yolo.cpp/scripts/convert.py +248 -0
  118. package/native/yolo.cpp/src/yolo.cpp +425 -0
  119. package/native/yolo.cpp/verify/compare.py +99 -0
  120. package/native/yolo.cpp/verify/make_ref.py +75 -0
  121. package/native/yolo.cpp/verify/run_ggml.mjs +78 -0
  122. package/native/yolo.cpp/verify/run_ts.mjs +26 -0
  123. package/package.json +39 -21
  124. package/registry-entry.json +43 -0
  125. package/scripts/vendor-tesseract-linux.mjs +177 -0
  126. package/build.config.ts +0 -89
  127. package/dist/workers/florence2-worker.js +0 -779
  128. package/dist/workers/florence2-worker.js.map +0 -13
@@ -0,0 +1,91 @@
1
+ import type { BoundingBox, OCRResult, ScreenTile } from "./types";
2
+ export type OCRBackendName = "doctr" | "apple-vision";
3
+ export interface OCRServiceConfig {
4
+ /**
5
+ * Force a specific backend. If unset, the chain is:
6
+ * 1. Apple Vision (darwin only, when a provider has been registered)
7
+ * 2. doCTR (ggml-backed CRNN+DBNet via native/doctr.cpp)
8
+ *
9
+ * There is no tesseract / onnx fallback — the migration removed both.
10
+ * If neither backend can initialize, `initialize()` throws.
11
+ */
12
+ backend?: OCRBackendName;
13
+ }
14
+ export interface StructuredOCRData {
15
+ tables: Array<{
16
+ rows: string[][];
17
+ bbox: BoundingBox;
18
+ }>;
19
+ forms: Array<{
20
+ label: string;
21
+ value: string;
22
+ bbox: BoundingBox;
23
+ }>;
24
+ lists: Array<{
25
+ items: string[];
26
+ bbox: BoundingBox;
27
+ }>;
28
+ }
29
+ export declare function extractStructuredDataFromOCR(ocr: OCRResult): StructuredOCRData;
30
+ /**
31
+ * External provider seam for the Apple Vision OCR backend.
32
+ *
33
+ * `plugin-vision` does not take a runtime dep on `@elizaos/plugin-computeruse`
34
+ * — that would invert the layering (computeruse is the higher-level seam).
35
+ * Instead, the runtime registers a provider here on iOS/macOS startup using
36
+ * `createIosVisionOcrProvider(...)` from
37
+ * `@elizaos/plugin-computeruse/mobile/ocr-provider`. Until a provider is
38
+ * registered, `AppleVisionBackend.extractText` throws so the chooser falls
39
+ * through to the doCTR ggml backend.
40
+ *
41
+ * The provider shape is intentionally structural so plugin-vision stays
42
+ * Node-importable on hosts that don't ship Capacitor.
43
+ */
44
+ export interface AppleVisionOcrProvider {
45
+ /** Stable id used in logs/telemetry. */
46
+ readonly name: string;
47
+ /** True when the underlying bridge is registered and ready. */
48
+ available(): boolean;
49
+ /**
50
+ * Recognize text in the JPEG/PNG bytes. The plugin-computeruse iOS provider
51
+ * returns `OcrResult`; we map to plugin-vision's `OCRResult` shape inline.
52
+ */
53
+ recognize(input: {
54
+ kind: "bytes";
55
+ data: Uint8Array;
56
+ }): Promise<{
57
+ readonly lines: ReadonlyArray<{
58
+ readonly text: string;
59
+ readonly confidence: number;
60
+ readonly boundingBox: {
61
+ readonly x: number;
62
+ readonly y: number;
63
+ readonly width: number;
64
+ readonly height: number;
65
+ };
66
+ }>;
67
+ readonly fullText: string;
68
+ }>;
69
+ }
70
+ export declare function registerAppleVisionOcrProvider(provider: AppleVisionOcrProvider | null): void;
71
+ export declare function getAppleVisionOcrProvider(): AppleVisionOcrProvider | null;
72
+ /**
73
+ * Walk the priority chain and pick the first backend that initializes.
74
+ * Backend instances are cached; per-call we just dispatch to the active one.
75
+ */
76
+ export declare class OCRService {
77
+ private backends;
78
+ private chosen;
79
+ private initialized;
80
+ private readonly forced?;
81
+ constructor(config?: OCRServiceConfig);
82
+ initialize(): Promise<void>;
83
+ extractText(imageBuffer: Buffer): Promise<OCRResult>;
84
+ extractFromTile(tile: ScreenTile): Promise<OCRResult>;
85
+ extractFromImage(imageBuffer: Buffer): Promise<OCRResult>;
86
+ extractStructuredData(imageBuffer: Buffer): Promise<StructuredOCRData>;
87
+ getActiveBackend(): OCRBackendName | null;
88
+ isInitialized(): boolean;
89
+ dispose(): Promise<void>;
90
+ }
91
+ //# sourceMappingURL=ocr-service.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ocr-service.d.ts","sourceRoot":"","sources":["../src/ocr-service.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,WAAW,EAAE,SAAS,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AAElE,MAAM,MAAM,cAAc,GAAG,OAAO,GAAG,cAAc,CAAC;AAEtD,MAAM,WAAW,gBAAgB;IAC/B;;;;;;;OAOG;IACH,OAAO,CAAC,EAAE,cAAc,CAAC;CAC1B;AASD,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,EAAE,EAAE,CAAC;QAAC,IAAI,EAAE,WAAW,CAAA;KAAE,CAAC,CAAC;IACvD,KAAK,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,WAAW,CAAA;KAAE,CAAC,CAAC;IAClE,KAAK,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,MAAM,EAAE,CAAC;QAAC,IAAI,EAAE,WAAW,CAAA;KAAE,CAAC,CAAC;CACtD;AAqBD,wBAAgB,4BAA4B,CAC1C,GAAG,EAAE,SAAS,GACb,iBAAiB,CAiEnB;AAgBD;;;;;;;;;;;;;GAaG;AACH,MAAM,WAAW,sBAAsB;IACrC,wCAAwC;IACxC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,+DAA+D;IAC/D,SAAS,IAAI,OAAO,CAAC;IACrB;;;OAGG;IACH,SAAS,CAAC,KAAK,EAAE;QAAE,IAAI,EAAE,OAAO,CAAC;QAAC,IAAI,EAAE,UAAU,CAAA;KAAE,GAAG,OAAO,CAAC;QAC7D,QAAQ,CAAC,KAAK,EAAE,aAAa,CAAC;YAC5B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;YACtB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;YAC5B,QAAQ,CAAC,WAAW,EAAE;gBACpB,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;gBACnB,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;gBACnB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;gBACvB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;aACzB,CAAC;SACH,CAAC,CAAC;QACH,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;KAC3B,CAAC,CAAC;CACJ;AAID,wBAAgB,8BAA8B,CAC5C,QAAQ,EAAE,sBAAsB,GAAG,IAAI,GACtC,IAAI,CAON;AAED,wBAAgB,yBAAyB,IAAI,sBAAsB,GAAG,IAAI,CAEzE;AAqED;;;GAGG;AACH,qBAAa,UAAU;IACrB,OAAO,CAAC,QAAQ,CAAoB;IACpC,OAAO,CAAC,MAAM,CAA2B;IACzC,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAiB;gBAE7B,MAAM,GAAE,gBAAqB;IAInC,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAkD3B,WAAW,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC;IAiCpD,eAAe,CAAC,IAAI,EAAE,UAAU,GAAG,OAAO,CAAC,SAAS,CAAC;IAOrD,gBAAgB,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC;IAIzD,qBAAqB,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,iBAAiB,CAAC;IAK5E,gBAAgB,IAAI,cAAc,GAAG,IAAI;IAIzC,aAAa,IAAI,OAAO;IAIlB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAW/B"}
@@ -0,0 +1,103 @@
1
+ /**
2
+ * OCR-with-coords — hierarchical (block / line / word) OCR with absolute
3
+ * source-display coordinates and a coarse semantic position label per
4
+ * recognized text element.
5
+ *
6
+ * Why this lives in plugin-vision:
7
+ * - plugin-computeruse needs OCR with coordinates so action targets can be
8
+ * computed in display-absolute coordinates without re-running detection.
9
+ * - It cannot take a runtime dep on plugin-vision (which would invert the
10
+ * layering: computeruse is the higher-level seam and plugin-vision must
11
+ * stay Node-importable on hosts that don't ship the action surface).
12
+ * - Mirroring the pattern used by `AppleVisionOcrProvider` in
13
+ * `./ocr-service.ts`, plugin-vision exports a structural interface plus a
14
+ * registry seam (`registerCoordOcrProvider` lives in
15
+ * plugin-computeruse/src/mobile/ocr-provider.ts) that the runtime wires up
16
+ * at boot.
17
+ *
18
+ * This file defines the canonical `OcrWithCoordsService` interface and the
19
+ * in-tree `RapidOcrCoordAdapter` provider. The adapter is backed by the
20
+ * existing `OCRService` (from `./ocr-service`) and computes `semantic_position`
21
+ * deterministically from the bbox center against tile-relative thirds. Native
22
+ * OCR providers can register the same interface without changing consumers.
23
+ */
24
+ import { OCRService } from "./ocr-service";
25
+ import type { BoundingBox } from "./types";
26
+ /** Coarse 3x3 location of a text element relative to the source tile. */
27
+ export type SemanticPosition = "upper-left" | "upper-center" | "upper-right" | "middle-left" | "center" | "middle-right" | "lower-left" | "lower-center" | "lower-right";
28
+ export interface OcrWithCoordsWord {
29
+ readonly text: string;
30
+ /** Absolute source-display coordinates. */
31
+ readonly bbox: BoundingBox;
32
+ readonly semantic_position: SemanticPosition;
33
+ }
34
+ export interface OcrWithCoordsBlock {
35
+ readonly text: string;
36
+ /** Absolute source-display coordinates. */
37
+ readonly bbox: BoundingBox;
38
+ readonly words: ReadonlyArray<OcrWithCoordsWord>;
39
+ readonly semantic_position: SemanticPosition;
40
+ }
41
+ export interface OcrWithCoordsResult {
42
+ readonly blocks: ReadonlyArray<OcrWithCoordsBlock>;
43
+ }
44
+ export interface OcrWithCoordsInput {
45
+ /** Stable identifier of the source display. Echoed in logs only. */
46
+ readonly displayId: string;
47
+ /** Absolute X offset of the tile within the source display. */
48
+ readonly sourceX: number;
49
+ /** Absolute Y offset of the tile within the source display. */
50
+ readonly sourceY: number;
51
+ /** Encoded PNG bytes of the tile. */
52
+ readonly pngBytes: Uint8Array;
53
+ }
54
+ export interface OcrWithCoordsService {
55
+ readonly name: string;
56
+ describe(input: OcrWithCoordsInput): Promise<OcrWithCoordsResult>;
57
+ }
58
+ /**
59
+ * Map a bbox center to one of nine semantic positions using strict thirds
60
+ * against the tile dimensions. Pure function — exported for tests so the
61
+ * thirds rule has a single source of truth.
62
+ *
63
+ * Rule:
64
+ * col = floor(centerX / (tileWidth / 3)) clamped to [0, 2]
65
+ * row = floor(centerY / (tileHeight / 3)) clamped to [0, 2]
66
+ * "middle" + "center" collapses to the literal "center".
67
+ *
68
+ * Inputs use tile-relative coordinates so the same function works for words
69
+ * inside their parent block too (callers can pass the parent block bbox as
70
+ * the tile dims for word-relative labeling, but for the canonical
71
+ * implementation here we always label against the source tile).
72
+ */
73
+ export declare function computeSemanticPosition(args: {
74
+ readonly bbox: BoundingBox;
75
+ readonly tileWidth: number;
76
+ readonly tileHeight: number;
77
+ }): SemanticPosition;
78
+ export declare function registerOcrWithCoordsService(service: OcrWithCoordsService | null): void;
79
+ export declare function getOcrWithCoordsService(): OcrWithCoordsService | null;
80
+ /**
81
+ * Wraps an `OCRService` (only `extractText` is required) and maps its line-level output to the
82
+ * hierarchical `OcrWithCoordsResult` shape, computing `semantic_position`
83
+ * deterministically against the source tile thirds.
84
+ */
85
+ export declare class RapidOcrCoordAdapter implements OcrWithCoordsService {
86
+ private readonly impl;
87
+ readonly name = "rapid-coord-adapter";
88
+ constructor(impl?: Pick<OCRService, "extractText">);
89
+ describe(input: OcrWithCoordsInput): Promise<OcrWithCoordsResult>;
90
+ }
91
+ /**
92
+ * Read width/height from the PNG IHDR chunk without pulling in sharp on the
93
+ * test path. PNG signature is 8 bytes; IHDR begins at offset 8 with a 4-byte
94
+ * length, 4-byte type ("IHDR"), then 4-byte width and 4-byte height (BE).
95
+ *
96
+ * The returned promise rejects on malformed input so a corrupt tile surfaces immediately rather
97
+ * than silently producing zero-sized semantic-position math.
98
+ */
99
+ export declare function readPngDimensions(pngBytes: Uint8Array): Promise<{
100
+ width: number;
101
+ height: number;
102
+ }>;
103
+ //# sourceMappingURL=ocr-with-coords.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ocr-with-coords.d.ts","sourceRoot":"","sources":["../src/ocr-with-coords.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAGH,OAAO,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC;AAC3C,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AAE3C,yEAAyE;AACzE,MAAM,MAAM,gBAAgB,GACxB,YAAY,GACZ,cAAc,GACd,aAAa,GACb,aAAa,GACb,QAAQ,GACR,cAAc,GACd,YAAY,GACZ,cAAc,GACd,aAAa,CAAC;AAElB,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,2CAA2C;IAC3C,QAAQ,CAAC,IAAI,EAAE,WAAW,CAAC;IAC3B,QAAQ,CAAC,iBAAiB,EAAE,gBAAgB,CAAC;CAC9C;AAED,MAAM,WAAW,kBAAkB;IACjC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,2CAA2C;IAC3C,QAAQ,CAAC,IAAI,EAAE,WAAW,CAAC;IAC3B,QAAQ,CAAC,KAAK,EAAE,aAAa,CAAC,iBAAiB,CAAC,CAAC;IACjD,QAAQ,CAAC,iBAAiB,EAAE,gBAAgB,CAAC;CAC9C;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,CAAC,MAAM,EAAE,aAAa,CAAC,kBAAkB,CAAC,CAAC;CACpD;AAED,MAAM,WAAW,kBAAkB;IACjC,oEAAoE;IACpE,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,+DAA+D;IAC/D,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,+DAA+D;IAC/D,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,qCAAqC;IACrC,QAAQ,CAAC,QAAQ,EAAE,UAAU,CAAC;CAC/B;AAED,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,KAAK,EAAE,kBAAkB,GAAG,OAAO,CAAC,mBAAmB,CAAC,CAAC;CACnE;AAeD;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,uBAAuB,CAAC,IAAI,EAAE;IAC5C,QAAQ,CAAC,IAAI,EAAE,WAAW,CAAC;IAC3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;CAC7B,GAAG,gBAAgB,CAenB;AAYD,wBAAgB,4BAA4B,CAC1C,OAAO,EAAE,oBAAoB,GAAG,IAAI,GACnC,IAAI,CAON;AAED,wBAAgB,uBAAuB,IAAI,oBAAoB,GAAG,IAAI,CAErE;AAID;;;;GAIG;AACH,qBAAa,oBAAqB,YAAW,oBAAoB;IAI7D,OAAO,CAAC,QAAQ,CAAC,IAAI;IAHvB,QAAQ,CAAC,IAAI,yBAAyB;gBAGnB,IAAI,GAAE,IAAI,CAAC,UAAU,EAAE,aAAa,CAAoB;IAGrE,QAAQ,CAAC,KAAK,EAAE,kBAAkB,GAAG,OAAO,CAAC,mBAAmB,CAAC;CAuDxE;AAMD;;;;;;;GAOG;AACH,wBAAsB,iBAAiB,CACrC,QAAQ,EAAE,UAAU,GACnB,OAAO,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE,CAAC,CA8B5C"}
@@ -0,0 +1,17 @@
1
+ import type { PersonInfo } from "./types";
2
+ import { type YOLOConfig } from "./yolo-detector";
3
+ export interface PersonDetectorConfig extends Omit<YOLOConfig, "classFilter"> {
4
+ /** Score threshold specifically for person detections (defaults to 0.4). */
5
+ scoreThreshold?: number;
6
+ }
7
+ export declare class PersonDetector {
8
+ private yolo;
9
+ private initialized;
10
+ constructor(config?: PersonDetectorConfig);
11
+ static isAvailable(): Promise<boolean>;
12
+ isInitialized(): boolean;
13
+ initialize(): Promise<void>;
14
+ detect(imageBuffer: Buffer): Promise<PersonInfo[]>;
15
+ dispose(): Promise<void>;
16
+ }
17
+ //# sourceMappingURL=person-detector.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"person-detector.d.ts","sourceRoot":"","sources":["../src/person-detector.ts"],"names":[],"mappings":"AAaA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AAC1C,OAAO,EAAE,KAAK,UAAU,EAAgB,MAAM,iBAAiB,CAAC;AAEhE,MAAM,WAAW,oBAAqB,SAAQ,IAAI,CAAC,UAAU,EAAE,aAAa,CAAC;IAC3E,4EAA4E;IAC5E,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,qBAAa,cAAc;IACzB,OAAO,CAAC,IAAI,CAAe;IAC3B,OAAO,CAAC,WAAW,CAAS;gBAEhB,MAAM,GAAE,oBAAyB;IAQ7C,MAAM,CAAC,WAAW,IAAI,OAAO,CAAC,OAAO,CAAC;IAItC,aAAa,IAAI,OAAO;IAIlB,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAO3B,MAAM,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,EAAE,CAAC;IAclD,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAI/B"}
@@ -0,0 +1,3 @@
1
+ import { type Provider } from "@elizaos/core";
2
+ export declare const visionProvider: Provider;
3
+ //# sourceMappingURL=provider.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"provider.d.ts","sourceRoot":"","sources":["../src/provider.ts"],"names":[],"mappings":"AAAA,OAAO,EAIL,KAAK,QAAQ,EAEd,MAAM,eAAe,CAAC;AAYvB,eAAO,MAAM,cAAc,EAAE,QAiW5B,CAAC"}
@@ -0,0 +1,7 @@
1
+ import type { Route } from "@elizaos/core";
2
+ /** GET — drain the queue of pending capture requests for the renderer poller. */
3
+ export declare const captureRequestsRoute: Route;
4
+ /** POST — accept a captured frame (or a skip) for a queued request. */
5
+ export declare const screenFrameRoute: Route;
6
+ export declare const visionRoutes: Route[];
7
+ //# sourceMappingURL=routes.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"routes.d.ts","sourceRoot":"","sources":["../src/routes.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,eAAe,CAAC;AAqC3C,iFAAiF;AACjF,eAAO,MAAM,oBAAoB,EAAE,KAWlC,CAAC;AAEF,uEAAuE;AACvE,eAAO,MAAM,gBAAgB,EAAE,KA2C9B,CAAC;AAEF,eAAO,MAAM,YAAY,EAAE,KAAK,EAA6C,CAAC"}
@@ -0,0 +1,51 @@
1
+ import { type IAgentRuntime, Service } from "@elizaos/core";
2
+ /** Service type used to resolve the bridge off the runtime. */
3
+ export declare const SCREEN_CAPTURE_BRIDGE_SERVICE_TYPE = "vision-screen-capture-bridge";
4
+ /** A single enqueued capture request, drained by the GET poll. */
5
+ export interface ScreenCaptureRequest {
6
+ requestId: string;
7
+ createdAt: number;
8
+ displayId?: number;
9
+ }
10
+ /** Result of a completed capture, returned to `requestFrame` callers. */
11
+ export interface ScreenCaptureFrame {
12
+ pngBytes: Uint8Array;
13
+ displayId: number;
14
+ capturedAt: number;
15
+ }
16
+ /**
17
+ * Renderer-pulled screen-capture bridge service.
18
+ *
19
+ * The agent calls `requestFrame()`; the renderer drains the queue via
20
+ * `takeRequests()` and delivers frames via `submitFrame()`.
21
+ */
22
+ export declare class ScreenCaptureBridgeService extends Service {
23
+ static serviceType: string;
24
+ capabilityDescription: string;
25
+ private readonly queue;
26
+ private readonly pending;
27
+ private readonly timeoutMs;
28
+ constructor(runtime?: IAgentRuntime, timeoutMs?: number);
29
+ static start(runtime: IAgentRuntime): Promise<ScreenCaptureBridgeService>;
30
+ /**
31
+ * Enqueue a capture request and wait for the renderer to deliver a frame.
32
+ * Resolves `null` if no frame arrives within the timeout (never hangs).
33
+ */
34
+ requestFrame(displayId?: number): Promise<ScreenCaptureFrame | null>;
35
+ /** Drain and return all queued requests (for the GET poll). */
36
+ takeRequests(): ScreenCaptureRequest[];
37
+ /**
38
+ * Deliver a captured frame for a queued request. Returns false if the
39
+ * requestId is unknown or already expired/resolved.
40
+ */
41
+ submitFrame(requestId: string, base64: string, _format: string, _width: number, _height: number): boolean;
42
+ /**
43
+ * Resolve a queued request as a skip/failure so the agent's pending promise
44
+ * settles promptly (as `null`) instead of waiting the full timeout. The
45
+ * renderer calls this when a capture throws or is unavailable. Returns false
46
+ * for unknown/expired requestIds.
47
+ */
48
+ failFrame(requestId: string, reason: string): boolean;
49
+ stop(): Promise<void>;
50
+ }
51
+ //# sourceMappingURL=screen-capture-bridge.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"screen-capture-bridge.d.ts","sourceRoot":"","sources":["../src/screen-capture-bridge.ts"],"names":[],"mappings":"AAWA,OAAO,EAAE,KAAK,aAAa,EAAU,OAAO,EAAE,MAAM,eAAe,CAAC;AAEpE,+DAA+D;AAC/D,eAAO,MAAM,kCAAkC,iCACf,CAAC;AASjC,kEAAkE;AAClE,MAAM,WAAW,oBAAoB;IACnC,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,yEAAyE;AACzE,MAAM,WAAW,kBAAkB;IACjC,QAAQ,EAAE,UAAU,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;CACpB;AAQD;;;;;GAKG;AACH,qBAAa,0BAA2B,SAAQ,OAAO;IACrD,OAAgB,WAAW,EAAE,MAAM,CAAsC;IAChE,qBAAqB,SACyC;IAEvE,OAAO,CAAC,QAAQ,CAAC,KAAK,CAA8B;IACpD,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAqC;IAC7D,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;gBAGjC,OAAO,CAAC,EAAE,aAAa,EACvB,SAAS,GAAE,MAAiC;WAMjC,KAAK,CAChB,OAAO,EAAE,aAAa,GACrB,OAAO,CAAC,0BAA0B,CAAC;IAItC;;;OAGG;IACH,YAAY,CAAC,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,GAAG,IAAI,CAAC;IA2BpE,+DAA+D;IAC/D,YAAY,IAAI,oBAAoB,EAAE;IAItC;;;OAGG;IACH,WAAW,CACT,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,MAAM,GACd,OAAO;IAcV;;;;;OAKG;IACH,SAAS,CAAC,SAAS,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,OAAO;IAY/C,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;CAQ5B"}
@@ -0,0 +1,17 @@
1
+ import type { ScreenCapture, ScreenTile, VisionConfig } from "./types";
2
+ export declare class ScreenCaptureService {
3
+ private config;
4
+ private activeTileIndex;
5
+ private lastCapture;
6
+ constructor(config: VisionConfig);
7
+ getScreenInfo(): Promise<{
8
+ width: number;
9
+ height: number;
10
+ } | null>;
11
+ captureScreen(): Promise<ScreenCapture>;
12
+ private captureScreenToFile;
13
+ getActiveTile(): ScreenTile | null;
14
+ getAllTiles(): ScreenTile[];
15
+ getProcessedTiles(): ScreenTile[];
16
+ }
17
+ //# sourceMappingURL=screen-capture.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"screen-capture.d.ts","sourceRoot":"","sources":["../src/screen-capture.ts"],"names":[],"mappings":"AAYA,OAAO,KAAK,EAAE,aAAa,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AAoEvE,qBAAa,oBAAoB;IAC/B,OAAO,CAAC,MAAM,CAAe;IAC7B,OAAO,CAAC,eAAe,CAAK;IAC5B,OAAO,CAAC,WAAW,CAA8B;gBAErC,MAAM,EAAE,YAAY;IAI1B,aAAa,IAAI,OAAO,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,GAAG,IAAI,CAAC;IA0DlE,aAAa,IAAI,OAAO,CAAC,aAAa,CAAC;YAkF/B,mBAAmB;IA+CjC,aAAa,IAAI,UAAU,GAAG,IAAI;IAOlC,WAAW,IAAI,UAAU,EAAE;IAI3B,iBAAiB,IAAI,UAAU,EAAE;CAGlC"}
@@ -0,0 +1,75 @@
1
+ /**
2
+ * One output tile from `tileScreenshot`.
3
+ *
4
+ * - `sourceX/sourceY` are absolute pixel coords in the *source display's*
5
+ * native space (i.e. the same space `displayId` is reported in by
6
+ * `plugin-computeruse/src/platform/displays.ts`).
7
+ * - `tileW/tileH` are the actual rendered dimensions of `pngBytes` and always
8
+ * equal `sourceW/sourceH` (no resize) — the tiler does not downscale; it
9
+ * only crops. Resizing is the model preprocessor's job.
10
+ */
11
+ export interface ScreenTile {
12
+ /** Stable id of the form `tile-<row>-<col>`. */
13
+ id: string;
14
+ /** Display this tile was sourced from. Stringified to keep types narrow. */
15
+ displayId: string;
16
+ /** Top-left X of the tile in the source display's pixel space. */
17
+ sourceX: number;
18
+ /** Top-left Y of the tile in the source display's pixel space. */
19
+ sourceY: number;
20
+ /** Width of the cropped region in source pixels. */
21
+ sourceW: number;
22
+ /** Height of the cropped region in source pixels. */
23
+ sourceH: number;
24
+ /** Pixel width of `pngBytes`. Equal to `sourceW` (no resize). */
25
+ tileW: number;
26
+ /** Pixel height of `pngBytes`. Equal to `sourceH` (no resize). */
27
+ tileH: number;
28
+ /** PNG-encoded crop. */
29
+ pngBytes: Buffer;
30
+ }
31
+ export interface TileScreenshotInput {
32
+ displayId: string;
33
+ width: number;
34
+ height: number;
35
+ pngBytes: Buffer;
36
+ }
37
+ export interface TileScreenshotOptions {
38
+ /** Maximum tile edge in pixels. Tiles never exceed this in either dim. */
39
+ maxEdge: number;
40
+ /**
41
+ * Fraction of `tileSize` that adjacent tiles overlap. 0.12 (default) is
42
+ * large enough to keep multi-glyph tokens intact across seams, small enough
43
+ * to keep tile count near minimum.
44
+ */
45
+ overlapFraction: number;
46
+ }
47
+ /** Default local-VLM tile budget for Gemma vision. */
48
+ export declare const DEFAULT_MAX_EDGE = 1280;
49
+ /** Default seam overlap (12%). */
50
+ export declare const DEFAULT_OVERLAP_FRACTION = 0.12;
51
+ /**
52
+ * Tile a captured screenshot into local-VLM-sized PNG patches with
53
+ * pixel-overlap between neighbours.
54
+ *
55
+ * Single-tile fast path: when both dims fit within `maxEdge`, the input is
56
+ * returned as a single `ScreenTile` whose pngBytes is the unmodified input.
57
+ *
58
+ * Grid path: chooses the smallest grid (cols, rows) such that no individual
59
+ * tile exceeds `maxEdge`, then computes a per-axis stride that yields
60
+ * `overlapFraction * tileSize` of overlap between adjacent tiles. The last
61
+ * column/row is anchored to the source's right/bottom edge so we never
62
+ * extend past the screen.
63
+ */
64
+ export declare function tileScreenshot(input: TileScreenshotInput, opts?: TileScreenshotOptions): Promise<ScreenTile[]>;
65
+ /**
66
+ * Map a (localX, localY) inside a tile back to the source display's
67
+ * absolute pixel coordinates. Use this to translate "model said click at
68
+ * (x, y) inside tile-0-1" into a coordinate the input driver can act on.
69
+ */
70
+ export declare function reconstructAbsoluteCoords(tile: ScreenTile, localX: number, localY: number): {
71
+ displayId: string;
72
+ absoluteX: number;
73
+ absoluteY: number;
74
+ };
75
+ //# sourceMappingURL=screen-tiler.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"screen-tiler.d.ts","sourceRoot":"","sources":["../src/screen-tiler.ts"],"names":[],"mappings":"AAoBA;;;;;;;;;GASG;AACH,MAAM,WAAW,UAAU;IACzB,gDAAgD;IAChD,EAAE,EAAE,MAAM,CAAC;IACX,4EAA4E;IAC5E,SAAS,EAAE,MAAM,CAAC;IAClB,kEAAkE;IAClE,OAAO,EAAE,MAAM,CAAC;IAChB,kEAAkE;IAClE,OAAO,EAAE,MAAM,CAAC;IAChB,oDAAoD;IACpD,OAAO,EAAE,MAAM,CAAC;IAChB,qDAAqD;IACrD,OAAO,EAAE,MAAM,CAAC;IAChB,iEAAiE;IACjE,KAAK,EAAE,MAAM,CAAC;IACd,kEAAkE;IAClE,KAAK,EAAE,MAAM,CAAC;IACd,wBAAwB;IACxB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,mBAAmB;IAClC,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,qBAAqB;IACpC,0EAA0E;IAC1E,OAAO,EAAE,MAAM,CAAC;IAChB;;;;OAIG;IACH,eAAe,EAAE,MAAM,CAAC;CACzB;AAED,sDAAsD;AACtD,eAAO,MAAM,gBAAgB,OAAO,CAAC;AACrC,kCAAkC;AAClC,eAAO,MAAM,wBAAwB,OAAO,CAAC;AAE7C;;;;;;;;;;;;GAYG;AACH,wBAAsB,cAAc,CAClC,KAAK,EAAE,mBAAmB,EAC1B,IAAI,GAAE,qBAGL,GACA,OAAO,CAAC,UAAU,EAAE,CAAC,CAuEvB;AAED;;;;GAIG;AACH,wBAAgB,yBAAyB,CACvC,IAAI,EAAE,UAAU,EAChB,MAAM,EAAE,MAAM,EACd,MAAM,EAAE,MAAM,GACb;IAAE,SAAS,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,CAgB7D"}
@@ -0,0 +1,176 @@
1
+ import { type IAgentRuntime, Service, type ServiceTypeName } from "@elizaos/core";
2
+ import { EntityTracker } from "./entity-tracker";
3
+ import type { FaceRecognition } from "./face-recognition-ggml";
4
+ import { type BoundingBox, type CameraInfo, type DetectedObject, type EnhancedSceneDescription, type SceneDescription, type ScreenCapture, type VisionFrame, VisionMode } from "./types";
5
+ export interface VisionContextSnapshot {
6
+ openApps: string[];
7
+ focusedWindow: {
8
+ app: string;
9
+ title: string;
10
+ bbox: [number, number, number, number] | null;
11
+ } | null;
12
+ recentActions: Array<{
13
+ action: string;
14
+ ts: number;
15
+ }>;
16
+ currentTaskGoal: string | null;
17
+ }
18
+ /**
19
+ * A face that the ggml face-recognition pipeline matched to a known profile,
20
+ * shaped for the VLM prompt. `label` is the profile's display name when set,
21
+ * otherwise the opaque profile id. `bbox` is the detected face region.
22
+ */
23
+ export interface RecognizedFace {
24
+ label: string;
25
+ bbox: BoundingBox;
26
+ }
27
+ export declare function buildSceneDescriptionPrompt(context: VisionContextSnapshot | null, ocrText?: string | null, detectedObjects?: DetectedObject[] | null, recognizedFaces?: RecognizedFace[] | null): string;
28
+ export declare class VisionService extends Service {
29
+ static serviceType: ServiceTypeName;
30
+ capabilityDescription: string;
31
+ private visionConfig;
32
+ private camera;
33
+ private lastFrame;
34
+ private lastSceneDescription;
35
+ private frameProcessingInterval;
36
+ private screenProcessingInterval;
37
+ private isProcessing;
38
+ private isProcessingScreen;
39
+ private objectDetector;
40
+ private hasObjectDetection;
41
+ private faceRecognition;
42
+ private entityTracker;
43
+ private audioCapture;
44
+ private streamingAudioCapture;
45
+ private screenCapture;
46
+ private ocrService;
47
+ private lastScreenCapture;
48
+ private lastEnhancedScene;
49
+ private workerManager;
50
+ private lastTfUpdateTime;
51
+ private lastVlmUpdateTime;
52
+ private lastTfDescription;
53
+ private readonly describeBackpressure;
54
+ private arbiterUnsubscribe;
55
+ private dirtyTileDescriber;
56
+ private dirtyTileDescriberInit;
57
+ private dirtyTileDescribeContext;
58
+ private readonly DEFAULT_CONFIG;
59
+ constructor(runtime?: IAgentRuntime);
60
+ private parseConfig;
61
+ static start(runtime: IAgentRuntime): Promise<VisionService>;
62
+ private checkCameraTools;
63
+ private initialize;
64
+ private initializeScreenVision;
65
+ private initializeCameraVision;
66
+ private initializeAudioCapture;
67
+ private storeAudioTranscription;
68
+ private startProcessing;
69
+ /**
70
+ * Subscribe the describe-backpressure controller to WS1 memory-pressure
71
+ * events. Resolves the arbiter dynamically (no hard dependency on
72
+ * `@elizaos/plugin-local-inference`); when none is registered the controller
73
+ * still pauses on the local RSS cap. Idempotent — a prior subscription is
74
+ * released first so a restart doesn't double-subscribe.
75
+ */
76
+ private attachMemoryArbiter;
77
+ /**
78
+ * Clear the arbiter-driven pause. Called by the WS1 bridge consumer when
79
+ * pressure returns to nominal; also exposed so an embedder can resume the
80
+ * describe loop explicitly.
81
+ */
82
+ resumeDescribeLoop(): void;
83
+ /** Current describe-backpressure stats (telemetry / tests). */
84
+ getBackpressureStats(): import("./describe-backpressure").DescribeBackpressureStats;
85
+ private startFrameProcessing;
86
+ private captureAndProcessFrame;
87
+ private processFrameData;
88
+ private calculatePixelChange;
89
+ private updateSceneDescription;
90
+ /**
91
+ * Normalize the various shapes that `useModel(IMAGE_DESCRIPTION, …)` may
92
+ * return into a non-empty string. Returns `null` when the result is the
93
+ * "I'm unable to analyze images" sentinel or empty.
94
+ */
95
+ private extractDescriptionFromUseModel;
96
+ /**
97
+ * Pull the latest desktop scene context from plugin-computeruse's
98
+ * VisionContextProvider when registered. Returns `null` when no provider is
99
+ * available (or when the lookup fails) so the VLM still receives a valid
100
+ * prompt — the context block is purely additive.
101
+ */
102
+ private collectVisionContext;
103
+ private collectCurrentOcrTextForPrompt;
104
+ /**
105
+ * Resolve the change-gated per-tile describer, building it once. Returns
106
+ * `null` (and degrades to full-frame describe) when no perceptual hash is
107
+ * available — i.e. plugin-computeruse's `frameDhash` cannot be imported.
108
+ *
109
+ * The dHash is resolved via a best-effort dynamic import so plugin-vision
110
+ * never eagerly pulls computeruse's module graph at boot (same idiom as the
111
+ * coord-OCR bridge wiring in `index.ts`).
112
+ */
113
+ private ensureDirtyTileDescriber;
114
+ private resolveFrameHash;
115
+ /**
116
+ * Build the per-tile describe call bound to this service's runtime. Reads the
117
+ * current frame's prompt context (`dirtyTileDescribeContext`) so each tile is
118
+ * described with the same context the full-frame path would use.
119
+ */
120
+ private buildTileDescribeFn;
121
+ /**
122
+ * Per-tile incremental scene describe. Re-describes only the tiles whose
123
+ * perceptual hash changed since the previous frame; unchanged tiles reuse
124
+ * their cached description. Returns `null` when the per-tile path is
125
+ * unavailable or yields no usable text, so the caller falls back to the
126
+ * full-frame describe.
127
+ */
128
+ private describeSceneWithDirtyTiles;
129
+ private describeSceneWithVLM;
130
+ private describeSceneWithVLMInTrajectory;
131
+ private detectMotionObjects;
132
+ private mergeAdjacentObjects;
133
+ private classifyObjectBySize;
134
+ private detectPeopleFromMotion;
135
+ private startScreenProcessing;
136
+ private captureAndProcessScreen;
137
+ private analyzeTile;
138
+ private updateEnhancedSceneDescription;
139
+ getCurrentFrame(): Promise<VisionFrame | null>;
140
+ getSceneDescription(): Promise<SceneDescription | null>;
141
+ getEnhancedSceneDescription(): Promise<EnhancedSceneDescription | null>;
142
+ getScreenCapture(): Promise<ScreenCapture | null>;
143
+ getVisionMode(): VisionMode;
144
+ /**
145
+ * Enable the camera input. If screen is already active, switches to BOTH;
146
+ * otherwise to CAMERA.
147
+ */
148
+ enableCamera(): Promise<void>;
149
+ /**
150
+ * Disable the camera input. Keeps screen capture if active; otherwise OFF.
151
+ */
152
+ disableCamera(): Promise<void>;
153
+ /**
154
+ * Enable screen capture. If displayIds are passed, the first id wins as
155
+ * the `displayIndex` (multi-display capture is still single-display
156
+ * upstream).
157
+ */
158
+ enableScreen(displayIds?: number[]): Promise<void>;
159
+ /**
160
+ * Disable screen capture. Keeps camera if active; otherwise OFF.
161
+ */
162
+ disableScreen(): Promise<void>;
163
+ setVisionMode(mode: VisionMode): Promise<void>;
164
+ private stopProcessing;
165
+ getCameraInfo(): CameraInfo | null;
166
+ isActive(): boolean;
167
+ private calculateBoxOverlap;
168
+ getEntityTracker(): EntityTracker;
169
+ getFaceRecognition(): Promise<FaceRecognition>;
170
+ stop(): Promise<void>;
171
+ private findCamera;
172
+ private listCameras;
173
+ private createCameraDevice;
174
+ captureImage(): Promise<Buffer | null>;
175
+ }
176
+ //# sourceMappingURL=service.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"service.d.ts","sourceRoot":"","sources":["../src/service.ts"],"names":[],"mappings":"AAMA,OAAO,EACL,KAAK,aAAa,EAGlB,OAAO,EACP,KAAK,eAAe,EAErB,MAAM,eAAe,CAAC;AAiBvB,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAU/D,OAAO,EACL,KAAK,WAAW,EAChB,KAAK,UAAU,EACf,KAAK,cAAc,EACnB,KAAK,wBAAwB,EAE7B,KAAK,gBAAgB,EACrB,KAAK,aAAa,EAIlB,KAAK,WAAW,EAChB,UAAU,EAEX,MAAM,SAAS,CAAC;AAmBjB,MAAM,WAAW,qBAAqB;IACpC,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,aAAa,EAAE;QACb,GAAG,EAAE,MAAM,CAAC;QACZ,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,GAAG,IAAI,CAAC;KAC/C,GAAG,IAAI,CAAC;IACT,aAAa,EAAE,KAAK,CAAC;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,EAAE,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACrD,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;CAChC;AAWD;;;;GAIG;AACH,MAAM,WAAW,cAAc;IAC7B,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,WAAW,CAAC;CACnB;AAgCD,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,EACvB,eAAe,CAAC,EAAE,cAAc,EAAE,GAAG,IAAI,EACzC,eAAe,CAAC,EAAE,cAAc,EAAE,GAAG,IAAI,GACxC,MAAM,CAaR;AA6DD,qBAAa,aAAc,SAAQ,OAAO;IACxC,OAAgB,WAAW,EAAE,eAAe,CAA4B;IAC/D,qBAAqB,SACgD;IAE9E,OAAO,CAAC,YAAY,CAAe;IACnC,OAAO,CAAC,MAAM,CAA6B;IAC3C,OAAO,CAAC,SAAS,CAA4B;IAC7C,OAAO,CAAC,oBAAoB,CAAiC;IAC7D,OAAO,CAAC,uBAAuB,CAA+B;IAC9D,OAAO,CAAC,wBAAwB,CAA+B;IAC/D,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,kBAAkB,CAAS;IACnC,OAAO,CAAC,cAAc,CAA6B;IACnD,OAAO,CAAC,kBAAkB,CAAS;IAInC,OAAO,CAAC,eAAe,CAAgC;IACvD,OAAO,CAAC,aAAa,CAAgB;IACrC,OAAO,CAAC,YAAY,CAAoC;IACxD,OAAO,CAAC,qBAAqB,CAA6C;IAG1E,OAAO,CAAC,aAAa,CAAuB;IAC5C,OAAO,CAAC,UAAU,CAAa;IAC/B,OAAO,CAAC,iBAAiB,CAA8B;IACvD,OAAO,CAAC,iBAAiB,CAAyC;IAGlE,OAAO,CAAC,aAAa,CAAoC;IAGzD,OAAO,CAAC,gBAAgB,CAAK;IAC7B,OAAO,CAAC,iBAAiB,CAAK;IAC9B,OAAO,CAAC,iBAAiB,CAAM;IAM/B,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAiC;IACtE,OAAO,CAAC,kBAAkB,CAA6B;IAMvD,OAAO,CAAC,kBAAkB,CAAmC;IAC7D,OAAO,CAAC,sBAAsB,CAAS;IAIvC,OAAO,CAAC,wBAAwB,CAGiB;IAGjD,OAAO,CAAC,QAAQ,CAAC,cAAc,CAc7B;gBAEU,OAAO,CAAC,EAAE,aAAa;IAyCnC,OAAO,CAAC,WAAW;WA0EN,KAAK,CAAC,OAAO
,EAAE,aAAa,GAAG,OAAO,CAAC,aAAa,CAAC;YAMpD,gBAAgB;YA2BhB,UAAU;YA0DV,sBAAsB;YAyCtB,sBAAsB;YAgCtB,sBAAsB;YAmHtB,uBAAuB;IAmBrC,OAAO,CAAC,eAAe;IAuBvB;;;;;;OAMG;IACH,OAAO,CAAC,mBAAmB;IAoB3B;;;;OAIG;IACI,kBAAkB,IAAI,IAAI;IAIjC,+DAA+D;IACxD,oBAAoB;IAI3B,OAAO,CAAC,oBAAoB;YAoBd,sBAAsB;YAuCtB,gBAAgB;YA+BhB,oBAAoB;YA+BpB,sBAAsB;IAyWpC;;;;OAIG;IACH,OAAO,CAAC,8BAA8B;IAsBtC;;;;;OAKG;YACW,oBAAoB;IAclC,OAAO,CAAC,8BAA8B;IAatC;;;;;;;;OAQG;YACW,wBAAwB;YAuBxB,gBAAgB;IAoB9B;;;;OAIG;IACH,OAAO,CAAC,mBAAmB;IAsB3B;;;;;;OAMG;YACW,2BAA2B;YAwB3B,oBAAoB;YAoBpB,gCAAgC;YAoFhC,mBAAmB;IA2EjC,OAAO,CAAC,oBAAoB;IA6E5B,OAAO,CAAC,oBAAoB;YAgBd,sBAAsB;IA6CpC,OAAO,CAAC,qBAAqB;YAoBf,uBAAuB;YAoBvB,WAAW;YAkBX,8BAA8B;IAkD/B,eAAe,IAAI,OAAO,CAAC,WAAW,GAAG,IAAI,CAAC;IAI9C,mBAAmB,IAAI,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC;IAIvD,2BAA2B,IAAI,OAAO,CAAC,wBAAwB,GAAG,IAAI,CAAC;IAUvE,gBAAgB,IAAI,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC;IAIvD,aAAa,IAAI,UAAU;IAIlC;;;OAGG;IACU,YAAY,IAAI,OAAO,CAAC,IAAI,CAAC;IAS1C;;OAEG;IACU,aAAa,IAAI,OAAO,CAAC,IAAI,CAAC;IAS3C;;;;OAIG;IACU,YAAY,CAAC,UAAU,CAAC,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAa/D;;OAEG;IACU,aAAa,IAAI,OAAO,CAAC,IAAI,CAAC;IAS9B,aAAa,CAAC,IAAI,EAAE,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAoC3D,OAAO,CAAC,cAAc;IAiBf,aAAa,IAAI,UAAU,GAAG,IAAI;IAYlC,QAAQ,IAAI,OAAO;IAK1B,OAAO,CAAC,mBAAmB;IAmBpB,gBAAgB,IAAI,aAAa;IAI3B,kBAAkB,IAAI,OAAO,CAAC,eAAe,CAAC;IAQrD,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;YA0Cb,UAAU;YAkCV,WAAW;IAyEzB,OAAO,CAAC,kBAAkB;IAkFb,YAAY,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC;CAapD"}
@@ -0,0 +1,64 @@
1
+ /**
2
+ * Bridge plugin-vision's Set-of-Marks fusion into plugin-computeruse's
3
+ * `SetOfMarksProvider` registry seam (#9170 M9).
4
+ *
5
+ * Mirrors `computeruse-ocr-bridge.ts`: plugin-vision owns the GGUF YOLO icon
6
+ * detector and the OCR engines; plugin-computeruse exposes a registration slot
7
+ * and consumes whatever is registered from `detect_elements`, with NO hard
8
+ * dependency on plugin-vision. The provider is built here and registered at
9
+ * boot via a best-effort dynamic import (see `index.ts`).
10
+ *
11
+ * Pure + injectable: the YOLO detector and OCR resolver are passed in, so the
12
+ * fusion wiring is unit-testable with fakes and degrades gracefully when the
13
+ * GGUF detector or OCR engine is unavailable (icons or text simply absent).
14
+ */
15
+ import { getOcrWithCoordsService } from "./ocr-with-coords.js";
16
+ import { type DetectedObjectLike, type SetOfMarksOptions, type SomMark } from "./som.js";
17
+ /** Structural shape of computeruse's `SetOfMarksInput`. */
18
+ export interface SetOfMarksInputLike {
19
+ readonly displayId: string;
20
+ readonly sourceX: number;
21
+ readonly sourceY: number;
22
+ readonly pngBytes: Uint8Array;
23
+ readonly renderOverlay?: boolean;
24
+ }
25
+ /** Structural shape of computeruse's `SetOfMarksResult`. */
26
+ export interface SetOfMarksResultLike {
27
+ readonly marks: ReadonlyArray<SomMark>;
28
+ readonly overlayPngBase64?: string;
29
+ }
30
+ /** Structural shape of computeruse's `SetOfMarksProvider`. */
31
+ export interface SetOfMarksProviderLike {
32
+ readonly name: string;
33
+ describe(input: SetOfMarksInputLike): Promise<SetOfMarksResultLike>;
34
+ }
35
+ export type RegisterSetOfMarksProvider = (provider: SetOfMarksProviderLike | null) => void;
36
+ export declare const VISION_SET_OF_MARKS_BRIDGE_NAME = "vision-set-of-marks-bridge";
37
+ export interface SetOfMarksProviderDeps {
38
+ /**
39
+ * Detect icon-ish boxes from PNG bytes (GGUF YOLO). Returns `[]` when the
40
+ * detector is unavailable — Set-of-Marks then falls back to text-only marks.
41
+ */
42
+ readonly detectIcons?: (pngBytes: Uint8Array) => Promise<DetectedObjectLike[]>;
43
+ /** Resolve the OCR-with-coords service (defaults to the registered one). */
44
+ readonly resolveOcr?: typeof getOcrWithCoordsService;
45
+ /** Fusion tuning forwarded to `buildSetOfMarks`. */
46
+ readonly options?: SetOfMarksOptions;
47
+ }
48
+ /**
49
+ * Lazily-instantiated default GGUF YOLO icon detector. Best-effort: if the
50
+ * native bindings or GGUF weights are missing, every call resolves to `[]` so
51
+ * Set-of-Marks degrades to OCR-only text marks instead of throwing.
52
+ */
53
+ export declare function createDefaultIconDetector(): (pngBytes: Uint8Array) => Promise<DetectedObjectLike[]>;
54
+ /**
55
+ * Build a `SetOfMarksProvider`-shaped bridge that fuses GGUF YOLO icon
56
+ * detections + OCR text blocks into a numbered mark set (and optional overlay).
57
+ */
58
+ export declare function buildVisionSetOfMarksProvider(deps?: SetOfMarksProviderDeps): SetOfMarksProviderLike;
59
+ /**
60
+ * Register the vision Set-of-Marks bridge into computeruse's seam. Idempotent
61
+ * (last-call-wins). Returns true once registered.
62
+ */
63
+ export declare function wireComputerUseSetOfMarksBridge(register: RegisterSetOfMarksProvider, deps?: SetOfMarksProviderDeps): boolean;
64
+ //# sourceMappingURL=set-of-marks-provider.d.ts.map