@elizaos/plugin-vision 2.0.0-beta.1 → 2.0.3-beta.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. package/LICENSE +21 -0
  2. package/README.md +73 -301
  3. package/dist/action.d.ts +3 -0
  4. package/dist/action.d.ts.map +1 -0
  5. package/dist/audio-capture-stream.d.ts +42 -0
  6. package/dist/audio-capture-stream.d.ts.map +1 -0
  7. package/dist/audio-capture.d.ts +25 -0
  8. package/dist/audio-capture.d.ts.map +1 -0
  9. package/dist/computeruse-ocr-bridge.d.ts +50 -0
  10. package/dist/computeruse-ocr-bridge.d.ts.map +1 -0
  11. package/dist/config.d.ts +68 -0
  12. package/dist/config.d.ts.map +1 -0
  13. package/dist/describe-backpressure.d.ts +90 -0
  14. package/dist/describe-backpressure.d.ts.map +1 -0
  15. package/dist/dirty-tile-describer.d.ts +102 -0
  16. package/dist/dirty-tile-describer.d.ts.map +1 -0
  17. package/dist/dirty-tile-scene.d.ts +56 -0
  18. package/dist/dirty-tile-scene.d.ts.map +1 -0
  19. package/dist/entity-tracker.d.ts +33 -0
  20. package/dist/entity-tracker.d.ts.map +1 -0
  21. package/dist/face-detector-ggml.d.ts +60 -0
  22. package/dist/face-detector-ggml.d.ts.map +1 -0
  23. package/dist/face-detector-mediapipe.d.ts +25 -0
  24. package/dist/face-detector-mediapipe.d.ts.map +1 -0
  25. package/dist/face-recognition-ggml.d.ts +94 -0
  26. package/dist/face-recognition-ggml.d.ts.map +1 -0
  27. package/dist/get-screen-elements.d.ts +90 -0
  28. package/dist/get-screen-elements.d.ts.map +1 -0
  29. package/dist/get-screen.d.ts +60 -0
  30. package/dist/get-screen.d.ts.map +1 -0
  31. package/dist/image/sharp-compat.d.ts +89 -0
  32. package/dist/image/sharp-compat.d.ts.map +1 -0
  33. package/dist/image-input.d.ts +15 -0
  34. package/dist/image-input.d.ts.map +1 -0
  35. package/dist/index.d.ts +4 -0
  36. package/dist/index.d.ts.map +1 -0
  37. package/dist/index.js +7957 -6238
  38. package/dist/index.js.map +41 -26
  39. package/dist/lifecycle.d.ts +94 -0
  40. package/dist/lifecycle.d.ts.map +1 -0
  41. package/dist/mobile/capacitor-camera.d.ts +85 -0
  42. package/dist/mobile/capacitor-camera.d.ts.map +1 -0
  43. package/dist/native/doctr-ffi.d.ts +40 -0
  44. package/dist/native/doctr-ffi.d.ts.map +1 -0
  45. package/dist/native/yolo-ffi.d.ts +21 -0
  46. package/dist/native/yolo-ffi.d.ts.map +1 -0
  47. package/dist/ocr-host-windows.d.ts +34 -0
  48. package/dist/ocr-host-windows.d.ts.map +1 -0
  49. package/dist/ocr-service-apple-vision-macos.d.ts +51 -0
  50. package/dist/ocr-service-apple-vision-macos.d.ts.map +1 -0
  51. package/dist/ocr-service-doctr.d.ts +61 -0
  52. package/dist/ocr-service-doctr.d.ts.map +1 -0
  53. package/dist/ocr-service-linux-tesseract.d.ts +85 -0
  54. package/dist/ocr-service-linux-tesseract.d.ts.map +1 -0
  55. package/dist/ocr-service-paddleocr.d.ts +59 -0
  56. package/dist/ocr-service-paddleocr.d.ts.map +1 -0
  57. package/dist/ocr-service-windows.d.ts +41 -0
  58. package/dist/ocr-service-windows.d.ts.map +1 -0
  59. package/dist/ocr-service.d.ts +91 -0
  60. package/dist/ocr-service.d.ts.map +1 -0
  61. package/dist/ocr-with-coords.d.ts +103 -0
  62. package/dist/ocr-with-coords.d.ts.map +1 -0
  63. package/dist/person-detector.d.ts +17 -0
  64. package/dist/person-detector.d.ts.map +1 -0
  65. package/dist/provider.d.ts +3 -0
  66. package/dist/provider.d.ts.map +1 -0
  67. package/dist/routes.d.ts +7 -0
  68. package/dist/routes.d.ts.map +1 -0
  69. package/dist/screen-capture-bridge.d.ts +51 -0
  70. package/dist/screen-capture-bridge.d.ts.map +1 -0
  71. package/dist/screen-capture.d.ts +17 -0
  72. package/dist/screen-capture.d.ts.map +1 -0
  73. package/dist/screen-tiler.d.ts +75 -0
  74. package/dist/screen-tiler.d.ts.map +1 -0
  75. package/dist/service.d.ts +176 -0
  76. package/dist/service.d.ts.map +1 -0
  77. package/dist/set-of-marks-provider.d.ts +64 -0
  78. package/dist/set-of-marks-provider.d.ts.map +1 -0
  79. package/dist/som.d.ts +135 -0
  80. package/dist/som.d.ts.map +1 -0
  81. package/dist/som.js +184 -0
  82. package/dist/som.js.map +11 -0
  83. package/dist/test-input.d.ts +25 -0
  84. package/dist/test-input.d.ts.map +1 -0
  85. package/dist/types.d.ts +241 -0
  86. package/dist/types.d.ts.map +1 -0
  87. package/dist/vision-context-augmenter.d.ts +93 -0
  88. package/dist/vision-context-augmenter.d.ts.map +1 -0
  89. package/dist/vision-worker-manager.d.ts +51 -0
  90. package/dist/vision-worker-manager.d.ts.map +1 -0
  91. package/dist/workers/ocr-worker.d.ts +2 -0
  92. package/dist/workers/ocr-worker.d.ts.map +1 -0
  93. package/dist/workers/ocr-worker.js +1075 -7821
  94. package/dist/workers/ocr-worker.js.map +10 -51
  95. package/dist/workers/screen-capture-worker.d.ts +2 -0
  96. package/dist/workers/screen-capture-worker.d.ts.map +1 -0
  97. package/dist/workers/screen-capture-worker.js +364 -6
  98. package/dist/workers/screen-capture-worker.js.map +5 -4
  99. package/dist/workers/worker-logger.d.ts +10 -0
  100. package/dist/workers/worker-logger.d.ts.map +1 -0
  101. package/dist/yolo-detector.d.ts +37 -0
  102. package/dist/yolo-detector.d.ts.map +1 -0
  103. package/native/doctr.cpp/CMakeLists.txt +58 -0
  104. package/native/doctr.cpp/README.md +62 -0
  105. package/native/doctr.cpp/include/doctr.h +91 -0
  106. package/native/doctr.cpp/scripts/convert.py +98 -0
  107. package/native/doctr.cpp/src/doctr_det.cpp +112 -0
  108. package/native/doctr.cpp/src/doctr_rec.cpp +103 -0
  109. package/native/macos-vision-ocr.swift +113 -0
  110. package/native/mobilefacenet.cpp/README.md +13 -0
  111. package/native/movenet.cpp/README.md +10 -0
  112. package/native/retinaface.cpp/README.md +12 -0
  113. package/native/yolo.cpp/CMakeLists.txt +57 -0
  114. package/native/yolo.cpp/README.md +64 -0
  115. package/native/yolo.cpp/build.mjs +76 -0
  116. package/native/yolo.cpp/include/yolo.h +62 -0
  117. package/native/yolo.cpp/scripts/convert.py +248 -0
  118. package/native/yolo.cpp/src/yolo.cpp +425 -0
  119. package/native/yolo.cpp/verify/compare.py +99 -0
  120. package/native/yolo.cpp/verify/make_ref.py +75 -0
  121. package/native/yolo.cpp/verify/run_ggml.mjs +78 -0
  122. package/native/yolo.cpp/verify/run_ts.mjs +26 -0
  123. package/package.json +39 -21
  124. package/registry-entry.json +43 -0
  125. package/scripts/vendor-tesseract-linux.mjs +177 -0
  126. package/build.config.ts +0 -89
  127. package/dist/workers/florence2-worker.js +0 -779
  128. package/dist/workers/florence2-worker.js.map +0 -13
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Minimal contract a memory arbiter must implement so vision can plug into
3
+ * WS1's load/unload pipeline. Mirrors the (forthcoming) interface in
4
+ * `@elizaos/plugin-local-inference/src/services/memory-arbiter.ts` but is
5
+ * declared here so plugin-vision compiles standalone.
6
+ */
7
+ export interface IModelArbiter {
8
+ /**
9
+ * Reserve `bytes` of model memory for `holder`. Returning `false` means the
10
+ * arbiter refused — the caller must skip the load.
11
+ */
12
+ acquire(holder: string, bytes: number): Promise<boolean> | boolean;
13
+ /**
14
+ * Release the prior reservation for `holder`.
15
+ */
16
+ release(holder: string): Promise<void> | void;
17
+ /**
18
+ * Subscribe to memory-pressure events. The arbiter calls the listener with
19
+ * a non-empty list of holders when pressure is high enough that those
20
+ * holders should release.
21
+ */
22
+ onPressure(listener: (holders: string[]) => void): () => void;
23
+ }
24
+ export interface VisionSubServiceHandle {
25
+ /** Stable holder id (e.g. "vision:yolo"). */
26
+ id: string;
27
+ /** Approximate VRAM/RAM cost in bytes. Used by the arbiter; ignored if 0. */
28
+ memoryBytes: number;
29
+ /** Hook invoked when the sub-service has been released. */
30
+ unload(): Promise<void> | void;
31
+ /** Optional hook invoked to re-load after a prior release. */
32
+ acquire?(): Promise<void> | void;
33
+ }
34
+ export interface VisionLifecycleConfig {
35
+ /** Milliseconds of inactivity before a sub-service is released. */
36
+ idleUnloadMs?: number;
37
+ /** Tick interval for the idle watchdog. */
38
+ watchdogIntervalMs?: number;
39
+ }
40
+ export declare class VisionServiceLifecycleManager {
41
+ private readonly subs;
42
+ private readonly idleUnloadMs;
43
+ private readonly watchdogIntervalMs;
44
+ private arbiter;
45
+ private unsubscribePressure;
46
+ private watchdogTimer;
47
+ private stopped;
48
+ constructor(config?: VisionLifecycleConfig);
49
+ attachArbiter(arbiter: IModelArbiter | null): void;
50
+ register(handle: VisionSubServiceHandle): void;
51
+ unregister(id: string): void;
52
+ /**
53
+ * Mark a sub-service as in-use. If it was previously released, re-acquire
54
+ * via the registered `acquire` callback (if any).
55
+ *
56
+ * Returns `true` if the sub-service is loaded after the call.
57
+ */
58
+ touch(id: string): Promise<boolean>;
59
+ /**
60
+ * Force-release a single holder.
61
+ */
62
+ release(id: string): Promise<void>;
63
+ /**
64
+ * Drop every registered sub-service (used during plugin stop()).
65
+ */
66
+ stop(): Promise<void>;
67
+ /** Test-only: return current snapshot. */
68
+ snapshot(): Array<{
69
+ id: string;
70
+ loaded: boolean;
71
+ lastUsed: number;
72
+ }>;
73
+ private ensureWatchdog;
74
+ private runWatchdog;
75
+ private handlePressure;
76
+ }
77
+ /**
78
+ * Try to resolve a model arbiter from the runtime, dynamically. This avoids
79
+ * a hard dependency on `@elizaos/plugin-local-inference` (WS1) — vision still
80
+ * works standalone when WS1 isn't installed.
81
+ *
82
+ * Two resolution paths:
83
+ * 1. Direct: a service named `MEMORY_ARBITER` / `memory_arbiter` /
84
+ * `memoryArbiter` that already implements the `IModelArbiter` shape.
85
+ * Used by tests and standalone arbiter services.
86
+ * 2. WS1 bridge: a `localInferenceLoader` / `localInference` service that
87
+ * exposes `getMemoryArbiter()` returning the WS1 `MemoryArbiter`. We
88
+ * adapt it to `IModelArbiter` via `adaptWS1ArbiterToIModelArbiter` so
89
+ * memory-pressure events cascade into vision sub-service release.
90
+ */
91
+ export declare function resolveArbiterFromRuntime(runtime: {
92
+ getService?: (name: string) => unknown;
93
+ }): IModelArbiter | null;
94
+ //# sourceMappingURL=lifecycle.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"lifecycle.d.ts","sourceRoot":"","sources":["../src/lifecycle.ts"],"names":[],"mappings":"AAmBA;;;;;GAKG;AACH,MAAM,WAAW,aAAa;IAC5B;;;OAGG;IACH,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,OAAO,CAAC;IAEnE;;OAEG;IACH,OAAO,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;IAE9C;;;;OAIG;IACH,UAAU,CAAC,QAAQ,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,IAAI,GAAG,MAAM,IAAI,CAAC;CAC/D;AAED,MAAM,WAAW,sBAAsB;IACrC,6CAA6C;IAC7C,EAAE,EAAE,MAAM,CAAC;IACX,6EAA6E;IAC7E,WAAW,EAAE,MAAM,CAAC;IACpB,oEAAoE;IACpE,MAAM,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;IAC/B,8DAA8D;IAC9D,OAAO,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;CAClC;AAED,MAAM,WAAW,qBAAqB;IACpC,mEAAmE;IACnE,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,2CAA2C;IAC3C,kBAAkB,CAAC,EAAE,MAAM,CAAC;CAC7B;AAYD,qBAAa,6BAA6B;IACxC,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAoC;IACzD,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAS;IACtC,OAAO,CAAC,QAAQ,CAAC,kBAAkB,CAAS;IAC5C,OAAO,CAAC,OAAO,CAA8B;IAC7C,OAAO,CAAC,mBAAmB,CAA6B;IACxD,OAAO,CAAC,aAAa,CAA+B;IACpD,OAAO,CAAC,OAAO,CAAS;gBAEZ,MAAM,GAAE,qBAA0B;IAM9C,aAAa,CAAC,OAAO,EAAE,aAAa,GAAG,IAAI,GAAG,IAAI;IAelD,QAAQ,CAAC,MAAM,EAAE,sBAAsB,GAAG,IAAI;IAW9C,UAAU,CAAC,EAAE,EAAE,MAAM,GAAG,IAAI;IAI5B;;;;;OAKG;IACG,KAAK,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAgCzC;;OAEG;IACG,OAAO,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAYxC;;OAEG;IACG,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAkB3B,0CAA0C;IAC1C,QAAQ,IAAI,KAAK,CAAC;QAAE,EAAE,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,OAAO,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,CAAC;IAQpE,OAAO,CAAC,cAAc;YAWR,WAAW;YAYX,cAAc;CAa7B;AAyDD;;;;;;;;;;;;;GAaG;AACH,wBAAgB,yBAAyB,CAAC,OAAO,EAAE;IACjD,UAAU,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC;CACxC,GAAG,aAAa,GAAG,IAAI,CAqCvB"}
@@ -0,0 +1,85 @@
1
+ import type { CameraInfo, VisionFrame } from "../types";
2
+ interface MobileCameraOpenOptions {
3
+ /** Stable camera id (typically `back` / `front` / a per-device id). */
4
+ cameraId?: string;
5
+ /** Desired frame width in pixels — the native side may snap to nearest. */
6
+ width?: number;
7
+ /** Desired frame height in pixels. */
8
+ height?: number;
9
+ /** Desired frame rate. */
10
+ fps?: number;
11
+ }
12
+ /**
13
+ * Minimal interface every mobile camera implementation must satisfy.
14
+ *
15
+ * Implementations live in:
16
+ * - plugin-aosp (Android NNAPI / CameraX) — WS8
17
+ * - plugin-ios (Core ML / AVFoundation) — WS9
18
+ * - plugin-capacitor-bridge (cross-platform Capacitor plugin) — planned bridge package
19
+ */
20
+ export interface MobileCameraSource {
21
+ /** Discover cameras visible to the OS. */
22
+ listCameras(): Promise<CameraInfo[]>;
23
+ /** Open a session when the native source supports continuous capture. */
24
+ open(opts?: MobileCameraOpenOptions): Promise<void>;
25
+ /** Capture a single frame as a JPEG buffer. */
26
+ captureJpeg(): Promise<Buffer>;
27
+ /** Capture and return a fully-decoded RGBA frame. */
28
+ captureRgbaFrame?(): Promise<VisionFrame>;
29
+ /** Tear down the session. */
30
+ close(): Promise<void>;
31
+ /** Optional capability declaration — UIs use this to gate buttons. */
32
+ capabilities?(): {
33
+ supportsContinuousFrames: boolean;
34
+ supportsExposureLock: boolean;
35
+ supportsTorch: boolean;
36
+ };
37
+ }
38
+ interface CapacitorVisionPlugin {
39
+ listCameras?: () => Promise<CameraInfo[]>;
40
+ open?: (opts?: MobileCameraOpenOptions) => Promise<void>;
41
+ captureJpeg?: () => Promise<Buffer | Uint8Array | string | {
42
+ data?: string;
43
+ }>;
44
+ captureRgbaFrame?: () => Promise<VisionFrame | {
45
+ data: string;
46
+ }>;
47
+ close?: () => Promise<void>;
48
+ capabilities?: () => Promise<{
49
+ supportsContinuousFrames: boolean;
50
+ supportsExposureLock: boolean;
51
+ supportsTorch: boolean;
52
+ }>;
53
+ }
54
+ export declare class CapacitorCameraSource implements MobileCameraSource {
55
+ private readonly plugin;
56
+ constructor(plugin: CapacitorVisionPlugin);
57
+ listCameras(): Promise<CameraInfo[]>;
58
+ open(opts?: MobileCameraOpenOptions): Promise<void>;
59
+ captureJpeg(): Promise<Buffer>;
60
+ captureRgbaFrame(): Promise<VisionFrame>;
61
+ close(): Promise<void>;
62
+ capabilities(): {
63
+ supportsContinuousFrames: boolean;
64
+ supportsExposureLock: boolean;
65
+ supportsTorch: boolean;
66
+ };
67
+ }
68
+ /**
69
+ * Default unavailable implementation. Returns no cameras and refuses captures.
70
+ * This keeps the plugin-vision JS surface buildable on Node platforms where no
71
+ * native bridge is registered.
72
+ */
73
+ export declare class UnavailableMobileCameraSource implements MobileCameraSource {
74
+ listCameras(): Promise<CameraInfo[]>;
75
+ open(): Promise<void>;
76
+ captureJpeg(): Promise<Buffer>;
77
+ close(): Promise<void>;
78
+ }
79
+ /** Compatibility alias for older imports. */
80
+ export declare const CapacitorCameraStub: typeof UnavailableMobileCameraSource;
81
+ export declare function registerMobileCameraSource(source: MobileCameraSource): void;
82
+ export declare function getMobileCameraSource(): MobileCameraSource | null;
83
+ export declare function clearMobileCameraSource(): void;
84
+ export {};
85
+ //# sourceMappingURL=capacitor-camera.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"capacitor-camera.d.ts","sourceRoot":"","sources":["../../src/mobile/capacitor-camera.ts"],"names":[],"mappings":"AAYA,OAAO,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,UAAU,CAAC;AAExD,UAAU,uBAAuB;IAC/B,uEAAuE;IACvE,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,2EAA2E;IAC3E,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,sCAAsC;IACtC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,0BAA0B;IAC1B,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAED;;;;;;;GAOG;AACH,MAAM,WAAW,kBAAkB;IACjC,0CAA0C;IAC1C,WAAW,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC;IACrC,yEAAyE;IACzE,IAAI,CAAC,IAAI,CAAC,EAAE,uBAAuB,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IACpD,+CAA+C;IAC/C,WAAW,IAAI,OAAO,CAAC,MAAM,CAAC,CAAC;IAC/B,qDAAqD;IACrD,gBAAgB,CAAC,IAAI,OAAO,CAAC,WAAW,CAAC,CAAC;IAC1C,6BAA6B;IAC7B,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IACvB,sEAAsE;IACtE,YAAY,CAAC,IAAI;QACf,wBAAwB,EAAE,OAAO,CAAC;QAClC,oBAAoB,EAAE,OAAO,CAAC;QAC9B,aAAa,EAAE,OAAO,CAAC;KACxB,CAAC;CACH;AAED,UAAU,qBAAqB;IAC7B,WAAW,CAAC,EAAE,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC;IAC1C,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,uBAAuB,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;IACzD,WAAW,CAAC,EAAE,MAAM,OAAO,CAAC,MAAM,GAAG,UAAU,GAAG,MAAM,GAAG;QAAE,IAAI,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC9E,gBAAgB,CAAC,EAAE,MAAM,OAAO,CAAC,WAAW,GAAG;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACjE,KAAK,CAAC,EAAE,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,OAAO,CAAC;QAC3B,wBAAwB,EAAE,OAAO,CAAC;QAClC,oBAAoB,EAAE,OAAO,CAAC;QAC9B,aAAa,EAAE,OAAO,CAAC;KACxB,CAAC,CAAC;CACJ;AAsBD,qBAAa,qBAAsB,YAAW,kBAAkB;IAClD,OAAO,CAAC,QAAQ,CAAC,MAAM;gBAAN,MAAM,EAAE,qBAAqB;IAEpD,WAAW,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;IAIpC,IAAI,CAAC,IAAI,CAAC,EAAE,uBAAuB,GAAG,OAAO,CAAC,IAAI,CAAC;IAOnD,WAAW,IAAI,OAAO,CAAC,MAAM,CAAC;IAO9B,gBAAgB,IAAI,OAAO,CAAC,WAAW,CAAC;IAaxC,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAI5B,YAAY,IAAI;QACd,wBAAwB,EAAE,OAAO,CAAC;QAClC,oBAAoB,EAAE,OAAO,CAAC;QAC9B,aAAa,EAAE,OAAO,CAAC;KACxB;CAOF;AAED;;;;GAIG;AACH,qBAAa,6BAA8B,YAAW,kBAAkB;IAChE,WAAW,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;IAMpC,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAGrB,WAAW,IAAI,OAAO,CAAC,MAAM,CAAC;IAG9B,KAAK,IAAI,OAAO,CAAC,IAAI
,CAAC;CAC7B;AAED,6CAA6C;AAC7C,eAAO,MAAM,mBAAmB,sCAAgC,CAAC;AAiBjE,wBAAgB,0BAA0B,CAAC,MAAM,EAAE,kBAAkB,GAAG,IAAI,CAwB3E;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAMjE;AAED,wBAAgB,uBAAuB,IAAI,IAAI,CAE9C"}
@@ -0,0 +1,40 @@
1
+ /** Where the runtime expects GGUF weights. */
2
+ export declare function defaultDetWeightsPath(): string;
3
+ export declare function defaultRecWeightsPath(): string;
4
+ interface DocTRBindings {
5
+ /** Detection forward pass. Output: prob map at H/4 × W/4. */
6
+ detect(detGGUFPath: string, rgbCHW: Float32Array, h: number, w: number): Promise<{
7
+ probMap: Float32Array;
8
+ h: number;
9
+ w: number;
10
+ }>;
11
+ /** Recognition forward pass on a cropped line image. */
12
+ recognize(recGGUFPath: string, rgbCHW: Float32Array, h: number, w: number): Promise<{
13
+ logits: Float32Array;
14
+ T: number;
15
+ C: number;
16
+ }>;
17
+ /** Returns the recognition charset (utf-8, newline separated). */
18
+ charset(recGGUFPath: string): Promise<string>;
19
+ dispose(): Promise<void>;
20
+ }
21
+ /**
22
+ * Load the doctr.cpp shared library via `bun:ffi`. Returns null when either
23
+ * the library or the GGUF weights are missing — the caller is expected to
24
+ * throw a clear error in that case rather than silently fall back.
25
+ */
26
+ export declare function loadDoctrBindings(): Promise<DocTRBindings | null>;
27
+ /**
28
+ * `true` when both the native library and the GGUF weights exist on disk. Does
29
+ * not actually initialize anything — callers should still expect the C++ side
30
+ * to return DOCTR_ERR_BACKEND until the ggml graph is wired.
31
+ */
32
+ export declare function isDoctrReady(opts?: {
33
+ detPath?: string;
34
+ recPath?: string;
35
+ }): Promise<{
36
+ ready: boolean;
37
+ reason?: string;
38
+ }>;
39
+ export {};
40
+ //# sourceMappingURL=doctr-ffi.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"doctr-ffi.d.ts","sourceRoot":"","sources":["../../src/native/doctr-ffi.ts"],"names":[],"mappings":"AA0CA,8CAA8C;AAC9C,wBAAgB,qBAAqB,IAAI,MAAM,CAO9C;AAED,wBAAgB,qBAAqB,IAAI,MAAM,CAO9C;AAsBD,UAAU,aAAa;IACrB,6DAA6D;IAC7D,MAAM,CACJ,WAAW,EAAE,MAAM,EACnB,MAAM,EAAE,YAAY,EACpB,CAAC,EAAE,MAAM,EACT,CAAC,EAAE,MAAM,GACR,OAAO,CAAC;QAAE,OAAO,EAAE,YAAY,CAAC;QAAC,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAE5D,wDAAwD;IACxD,SAAS,CACP,WAAW,EAAE,MAAM,EACnB,MAAM,EAAE,YAAY,EACpB,CAAC,EAAE,MAAM,EACT,CAAC,EAAE,MAAM,GACR,OAAO,CAAC;QAAE,MAAM,EAAE,YAAY,CAAC;QAAC,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAE3D,kEAAkE;IAClE,OAAO,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IAE9C,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CAC1B;AAID;;;;GAIG;AACH,wBAAsB,iBAAiB,IAAI,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,CAsLvE;AAED;;;;GAIG;AACH,wBAAsB,YAAY,CAAC,IAAI,CAAC,EAAE;IACxC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB,GAAG,OAAO,CAAC;IAAE,KAAK,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC,CAqB/C"}
@@ -0,0 +1,21 @@
1
+ export declare function defaultYoloWeightsPath(): string;
2
+ interface YoloBindings {
3
+ /** Run forward pass. Returns the raw (channels, anchors) logits tensor. */
4
+ run(ggufPath: string, rgbCHW: Float32Array, h: number, w: number): Promise<{
5
+ logits: Float32Array;
6
+ channels: number;
7
+ anchors: number;
8
+ }>;
9
+ /** Returns the embedded class names (newline-separated). */
10
+ classes(ggufPath: string): Promise<string>;
11
+ dispose(): Promise<void>;
12
+ }
13
+ export declare function loadYoloBindings(): Promise<YoloBindings | null>;
14
+ export declare function isYoloReady(opts?: {
15
+ weightsPath?: string;
16
+ }): Promise<{
17
+ ready: boolean;
18
+ reason?: string;
19
+ }>;
20
+ export {};
21
+ //# sourceMappingURL=yolo-ffi.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"yolo-ffi.d.ts","sourceRoot":"","sources":["../../src/native/yolo-ffi.ts"],"names":[],"mappings":"AAuCA,wBAAgB,sBAAsB,IAAI,MAAM,CAO/C;AAED,UAAU,YAAY;IACpB,2EAA2E;IAC3E,GAAG,CACD,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,YAAY,EACpB,CAAC,EAAE,MAAM,EACT,CAAC,EAAE,MAAM,GACR,OAAO,CAAC;QAAE,MAAM,EAAE,YAAY,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAExE,4DAA4D;IAC5D,OAAO,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IAE3C,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CAC1B;AAmBD,wBAAsB,gBAAgB,IAAI,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC,CA0HrE;AAED,wBAAsB,WAAW,CAAC,IAAI,CAAC,EAAE;IACvC,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC;IAAE,KAAK,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC,CAe/C"}
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Persistent WinRT OCR host (Windows-only) — kills the per-OCR cold-spawn tax.
3
+ *
4
+ * `WindowsMediaOcrService.describe()` previously ran `powershell -File
5
+ * windows-ocr.ps1` for EVERY recognized region. On Defender-heavy hosts a cold
6
+ * `powershell.exe` spawn is ~10-16s (#9581), and OCR fires on every dirty region
7
+ * every turn — and the scene pipeline OCRs regions in parallel, so a turn would
8
+ * spawn N cold processes at once and thrash the AV scanner.
9
+ *
10
+ * This keeps ONE long-lived `powershell.exe` that loads the (expensive) WinRT
11
+ * projection + `OcrEngine` ONCE in its parent scope, then loops: read an image
12
+ * path on stdin, recognize, emit one compact JSON line on stdout. So each call
13
+ * pays neither the process spawn NOR the WinRT type-load — only the recognize
14
+ * (~0.3-1s). Requests are serialized over the one pipe (fine — each is fast).
15
+ *
16
+ * It is a pure latency optimization: `describe()` falls back to the original
17
+ * one-shot `-File` spawn whenever the host is unavailable / disabled / errors,
18
+ * so output is unchanged. No-op off Windows. Disable with `ELIZA_VISION_OCR_HOST=0`.
19
+ *
20
+ * Protocol: JS writes `<absolute-image-path>\n`; the host writes exactly one
21
+ * line of compact JSON (`{width,height,lines}` — same shape as the one-shot
22
+ * script). base64 isn't needed: temp image paths never contain newlines, and
23
+ * `ConvertTo-Json -Compress` output is always a single physical line.
24
+ */
25
+ export declare function ocrHostAvailable(): boolean;
26
+ export declare function shutdownOcrHost(): void;
27
+ /**
28
+ * Recognize the image at `imagePath` via the warm host, returning the raw JSON
29
+ * line (same shape the one-shot script emits). Serialized against other calls.
30
+ * Rejects (so the caller can fall back to a one-shot spawn) on host-start
31
+ * failure, timeout, or unexpected exit.
32
+ */
33
+ export declare function runOcrHost(imagePath: string): Promise<string>;
34
+ //# sourceMappingURL=ocr-host-windows.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ocr-host-windows.d.ts","sourceRoot":"","sources":["../src/ocr-host-windows.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAqFH,wBAAgB,gBAAgB,IAAI,OAAO,CAK1C;AA2BD,wBAAgB,eAAe,IAAI,IAAI,CAsBtC;AAuFD;;;;;GAKG;AACH,wBAAgB,UAAU,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAQ7D"}
@@ -0,0 +1,51 @@
1
+ /**
2
+ * macOS Apple Vision OCR provider (issue #9105 — per-OS native OCR fallback).
3
+ *
4
+ * Implements the structural `AppleVisionOcrProvider` seam from `ocr-service.ts`
5
+ * by shelling out to a bundled Swift helper (`native/macos-vision-ocr.swift`)
6
+ * that runs `VNRecognizeTextRequest` (accurate level, language correction on).
7
+ * The helper reads PNG/JPEG bytes from stdin and prints a single JSON object;
8
+ * this module pipes the bytes in and maps the result onto the provider shape.
9
+ *
10
+ * Zero LLM tokens, no model download — Apple Vision ships with macOS. This is
11
+ * the darwin sibling of `WindowsMediaOcrService` (Windows.Media.Ocr) and the
12
+ * iOS `createIosVisionOcrProvider` (Capacitor bridge): same VNRecognizeText
13
+ * engine, reached without Capacitor on a desktop host.
14
+ *
15
+ * Coordinate convention: Vision returns normalized BOTTOM-LEFT bboxes; the
16
+ * Swift helper converts them to TOP-LEFT PIXEL coordinates so the result
17
+ * matches the display-absolute convention used by every other provider.
18
+ *
19
+ * Fails soft: `available()` is false off darwin or when `swift` is missing, and
20
+ * `recognize()` returns an empty result rather than throwing on a helper
21
+ * failure, so the `OCRService` chain falls through to the docTR backend.
22
+ */
23
+ import type { AppleVisionOcrProvider } from "./ocr-service";
24
+ interface MacosVisionAvailabilityOptions {
25
+ platform?: NodeJS.Platform;
26
+ env?: NodeJS.ProcessEnv;
27
+ pathExists?: (candidate: string) => boolean;
28
+ executableExists?: (name: string, env: NodeJS.ProcessEnv) => boolean;
29
+ }
30
+ /**
31
+ * Resolve the bundled Swift helper. Works from both the dev tree (`src/`) and
32
+ * the published build (`dist/`) — the `native/` directory sits alongside both,
33
+ * at the package root. `ELIZA_MACOS_VISION_OCR_SCRIPT` overrides for tests.
34
+ */
35
+ declare function resolveScriptPath(env?: NodeJS.ProcessEnv, pathExists?: (candidate: string) => boolean): string | null;
36
+ /** True when running on macOS with the `swift` toolchain and the helper present. */
37
+ declare function macosVisionAvailable(options?: MacosVisionAvailabilityOptions): boolean;
38
+ /**
39
+ * Build an `AppleVisionOcrProvider` backed by macOS Apple Vision. Register it
40
+ * via `registerAppleVisionOcrProvider(createMacosVisionOcrProvider())` on
41
+ * darwin so the `OCRService` Apple-Vision backend resolves a real engine.
42
+ */
43
+ export declare function createMacosVisionOcrProvider(): AppleVisionOcrProvider;
44
+ /** Exposed for the runtime wire-up + tests; mirrors `macosVisionAvailable`. */
45
+ export declare function isMacosVisionOcrAvailable(): boolean;
46
+ export declare const __test__: {
47
+ macosVisionAvailable: typeof macosVisionAvailable;
48
+ resolveScriptPath: typeof resolveScriptPath;
49
+ };
50
+ export {};
51
+ //# sourceMappingURL=ocr-service-apple-vision-macos.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ocr-service-apple-vision-macos.d.ts","sourceRoot":"","sources":["../src/ocr-service-apple-vision-macos.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAOH,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,eAAe,CAAC;AAe5D,UAAU,8BAA8B;IACtC,QAAQ,CAAC,EAAE,MAAM,CAAC,QAAQ,CAAC;IAC3B,GAAG,CAAC,EAAE,MAAM,CAAC,UAAU,CAAC;IACxB,UAAU,CAAC,EAAE,CAAC,SAAS,EAAE,MAAM,KAAK,OAAO,CAAC;IAC5C,gBAAgB,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,CAAC,UAAU,KAAK,OAAO,CAAC;CACtE;AAED;;;;GAIG;AACH,iBAAS,iBAAiB,CACxB,GAAG,GAAE,MAAM,CAAC,UAAwB,EACpC,UAAU,GAAE,CAAC,SAAS,EAAE,MAAM,KAAK,OAAoB,GACtD,MAAM,GAAG,IAAI,CAQf;AAsBD,oFAAoF;AACpF,iBAAS,oBAAoB,CAC3B,OAAO,GAAE,8BAAmC,GAC3C,OAAO,CAQT;AA8BD;;;;GAIG;AACH,wBAAgB,4BAA4B,IAAI,sBAAsB,CAoCrE;AAED,+EAA+E;AAC/E,wBAAgB,yBAAyB,IAAI,OAAO,CAEnD;AAED,eAAO,MAAM,QAAQ;;;CAGpB,CAAC"}
@@ -0,0 +1,61 @@
1
+ import type { OCRResult } from "./types";
2
+ export interface DoctrOCRConfig {
3
+ /** GGUF detection weights path. */
4
+ detPath?: string;
5
+ /** GGUF recognition weights path. */
6
+ recPath?: string;
7
+ /** Detection input resolution (square). Default 1024. */
8
+ inputSize?: number;
9
+ /** Probability threshold for the DBNet output. */
10
+ probThreshold?: number;
11
+ /** Minimum connected-component pixel count for a detection. */
12
+ minComponentSize?: number;
13
+ }
14
+ /**
15
+ * Detect platforms where Apple Vision is the better OCR choice.
16
+ *
17
+ * macOS Sonoma+ and iOS expose VNRecognizeTextRequest, which is faster and
18
+ * higher-quality than any community OCR for Latin scripts. The integration
19
+ * lives in `plugin-computeruse/mobile`; we just refuse to claim availability
20
+ * so the higher-priority Apple Vision backend wins on darwin.
21
+ */
22
+ export declare function shouldPreferAppleVision(): boolean;
23
+ export declare class DoctrOCRService {
24
+ private readonly cfg;
25
+ private initPromise;
26
+ private initialized;
27
+ private charset;
28
+ constructor(config?: DoctrOCRConfig);
29
+ /**
30
+ * Best-effort availability check. Confirms the native lib loads and the
31
+ * GGUF files are on disk. Does NOT prove the ggml forward pass works —
32
+ * that's discovered on the first `extractText` call.
33
+ */
34
+ static isAvailable(opts?: {
35
+ detPath?: string;
36
+ recPath?: string;
37
+ }): Promise<boolean>;
38
+ isInitialized(): boolean;
39
+ initialize(): Promise<void>;
40
+ private _initialize;
41
+ extractText(imageBuffer: Buffer): Promise<OCRResult>;
42
+ private toCHWFloat32;
43
+ /**
44
+ * DBNet contouring: scan the probability map, group above-threshold pixels
45
+ * into connected components, return axis-aligned bboxes in original image
46
+ * coordinates.
47
+ *
48
+ * This is the same algorithm as the previous PP-OCRv5 path — it's a
49
+ * standard DBNet post-process and works for both detection backbones.
50
+ */
51
+ private probMapToBoxes;
52
+ /**
53
+ * Recognition step: crop the bbox, resize to 32xN, run through the CRNN
54
+ * recognizer, CTC-decode the output.
55
+ */
56
+ private recognizeCrop;
57
+ /** CTC greedy decoding. Blank index = 0. */
58
+ private ctcDecode;
59
+ dispose(): Promise<void>;
60
+ }
61
+ //# sourceMappingURL=ocr-service-doctr.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ocr-service-doctr.d.ts","sourceRoot":"","sources":["../src/ocr-service-doctr.ts"],"names":[],"mappings":"AA0BA,OAAO,KAAK,EAAe,SAAS,EAAE,MAAM,SAAS,CAAC;AAEtD,MAAM,WAAW,cAAc;IAC7B,mCAAmC;IACnC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,qCAAqC;IACrC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,yDAAyD;IACzD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,kDAAkD;IAClD,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,+DAA+D;IAC/D,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED;;;;;;;GAOG;AACH,wBAAgB,uBAAuB,IAAI,OAAO,CAKjD;AAED,qBAAa,eAAe;IAC1B,OAAO,CAAC,QAAQ,CAAC,GAAG,CAGH;IACjB,OAAO,CAAC,WAAW,CAA8B;IACjD,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,OAAO,CAAgB;gBAEnB,MAAM,GAAE,cAAmB;IAUvC;;;;OAIG;WACU,WAAW,CAAC,IAAI,CAAC,EAAE;QAC9B,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,GAAG,OAAO,CAAC,OAAO,CAAC;IAKpB,aAAa,IAAI,OAAO;IAIlB,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;YAOnB,WAAW;IA0BnB,WAAW,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC;IAmD1D,OAAO,CAAC,YAAY;IAWpB;;;;;;;OAOG;IACH,OAAO,CAAC,cAAc;IAoEtB;;;OAGG;YACW,aAAa;IAwC3B,4CAA4C;IAC5C,OAAO,CAAC,SAAS;IAwBX,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAO/B"}
@@ -0,0 +1,85 @@
1
+ /**
2
+ * Native Linux OCR-with-coords via the classic `tesseract` CLI (issue #9105 /
3
+ * M4).
4
+ *
5
+ * Zero LLM tokens, no in-repo model download, no ONNX — `tesseract` is a
6
+ * standalone C++ engine packaged by every Linux distro (`apt install
7
+ * tesseract-ocr`). We shell to it with `tsv` output, which emits one row per
8
+ * recognized element with a `level` column (1=page, 2=block, 3=para, 4=line,
9
+ * 5=word) plus per-element `left/top/width/height` boxes and a per-word `conf`.
10
+ * Output maps onto `OcrWithCoordsResult`, so this plugs straight into the
11
+ * `OcrWithCoordsService` registry seam and (via the M1 bridge) into
12
+ * plugin-computeruse's `CoordOcrProvider`.
13
+ *
14
+ * We read the word rows (`level == 5`), group them by their parent
15
+ * `(block, paragraph, line)` triple into one `OcrWithCoordsBlock` per text
16
+ * line (block bbox = union of its word rects), compute the semantic position
17
+ * against the source-tile thirds, and shift every bbox into display-absolute
18
+ * coordinates via `sourceX/sourceY` — the same shape as the Windows provider.
19
+ *
20
+ * Availability is feature-detected on the `tesseract` binary and cached for the
21
+ * process lifetime. When the binary is absent the provider reports unavailable
22
+ * and `describe()` returns empty blocks; it never throws so the boot chain
23
+ * falls through to the docTR ggml backend cleanly.
24
+ */
25
+ import { type OcrWithCoordsInput, type OcrWithCoordsResult, type OcrWithCoordsService } from "./ocr-with-coords.js";
26
+ /** A single `level == 5` (word) row parsed from the tesseract TSV. */
27
+ interface TesseractWordRow {
28
+ readonly blockNum: number;
29
+ readonly parNum: number;
30
+ readonly lineNum: number;
31
+ readonly left: number;
32
+ readonly top: number;
33
+ readonly width: number;
34
+ readonly height: number;
35
+ /** Tesseract confidence in [0, 100]; `-1` for non-word rows (filtered out). */
36
+ readonly conf: number;
37
+ readonly text: string;
38
+ }
39
+ /**
40
+ * Parse the raw tesseract TSV into word rows. Pure — exported for tests so the
41
+ * column mapping has a single source of truth and CI never needs a real
42
+ * tesseract binary. Skips the header row, non-word levels, blank text, and any
43
+ * row with too few columns.
44
+ */
45
+ export declare function parseTesseractTsv(tsv: string): TesseractWordRow[];
46
+ /**
47
+ * Pure mapper: raw tesseract TSV → `OcrWithCoordsResult`. Exported for
48
+ * cross-platform unit tests that inject a fixed TSV string (no real binary).
49
+ * Word rows are grouped by their `(block, paragraph, line)` triple — one
50
+ * `OcrWithCoordsBlock` per recognized text line — in first-seen order.
51
+ */
52
+ export declare function mapTesseractTsvToResult(tsv: string, tileWidth: number, tileHeight: number, sourceX: number, sourceY: number): OcrWithCoordsResult;
53
+ /** Resolved tesseract invocation: the binary path + the env it must run under. */
54
+ export interface TesseractResolution {
55
+ bin: string;
56
+ /** Extra env merged onto `process.env` for child runs (LD_LIBRARY_PATH,
57
+ * TESSDATA_PREFIX) — non-empty only when a bundled tesseract is used. */
58
+ env: Record<string, string>;
59
+ }
60
+ /**
61
+ * Resolve which tesseract binary to run, so the OCR path "just ships and works"
62
+ * without a system `apt install tesseract-ocr` (#9105). Resolution order:
63
+ *
64
+ * 1. `ELIZA_TESSERACT_BIN` — an explicit binary path (CI / power users).
65
+ * 2. A vendored bundle the app ships, found at
66
+ * `${ELIZA_VISION_VENDOR_DIR}/tesseract/{bin/tesseract, lib/*.so*,
67
+ * tessdata/<lang>.traineddata}`. The desktop build stages a portable
68
+ * tesseract there (binary + libtesseract/libleptonica + eng.traineddata);
69
+ * we then run it with that `lib/` on `LD_LIBRARY_PATH` and that `tessdata/`
70
+ * as `TESSDATA_PREFIX`, so no host install is needed.
71
+ * 3. `tesseract` on `PATH` (a system install, the legacy path).
72
+ *
73
+ * Exported for tests. Not strictly pure: it reads env and the filesystem, and
74
+ * the result is cached for the process lifetime.
75
+ */
76
+ export declare function resolveTesseract(): TesseractResolution;
77
+ /** Test-only: reset the cached availability + resolution probes between cases. */
78
+ export declare function _resetTesseractAvailabilityForTests(): void;
79
+ export declare class LinuxTesseractOcrService implements OcrWithCoordsService {
80
+ readonly name = "linux-tesseract";
81
+ static isAvailable(): boolean;
82
+ describe(input: OcrWithCoordsInput): Promise<OcrWithCoordsResult>;
83
+ }
84
+ export {};
85
+ //# sourceMappingURL=ocr-service-linux-tesseract.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ocr-service-linux-tesseract.d.ts","sourceRoot":"","sources":["../src/ocr-service-linux-tesseract.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAOH,OAAO,EAGL,KAAK,kBAAkB,EACvB,KAAK,mBAAmB,EACxB,KAAK,oBAAoB,EAE1B,MAAM,sBAAsB,CAAC;AAG9B,sEAAsE;AACtE,UAAU,gBAAgB;IACxB,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,+EAA+E;IAC/E,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAkBD;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,MAAM,GAAG,gBAAgB,EAAE,CA2BjE;AA4ED;;;;;GAKG;AACH,wBAAgB,uBAAuB,CACrC,GAAG,EAAE,MAAM,EACX,SAAS,EAAE,MAAM,EACjB,UAAU,EAAE,MAAM,EAClB,OAAO,EAAE,MAAM,EACf,OAAO,EAAE,MAAM,GACd,mBAAmB,CA4BrB;AAED,kFAAkF;AAClF,MAAM,WAAW,mBAAmB;IAClC,GAAG,EAAE,MAAM,CAAC;IACZ;6EACyE;IACzE,GAAG,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAC7B;AAED;;;;;;;;;;;;;;;GAeG;AACH,wBAAgB,gBAAgB,IAAI,mBAAmB,CA2BtD;AA2CD,kFAAkF;AAClF,wBAAgB,mCAAmC,IAAI,IAAI,CAG1D;AA8CD,qBAAa,wBAAyB,YAAW,oBAAoB;IACnE,QAAQ,CAAC,IAAI,qBAAqB;IAElC,MAAM,CAAC,WAAW,IAAI,OAAO;IAIvB,QAAQ,CAAC,KAAK,EAAE,kBAAkB,GAAG,OAAO,CAAC,mBAAmB,CAAC;CAwBxE"}
@@ -0,0 +1,59 @@
1
+ /**
2
+ * PaddleOCR / Paddle-Lite OCR-with-coords backend (issue #9581).
3
+ *
4
+ * The alternate coord-OCR provider beyond the shipped Tesseract + RapidOCR
5
+ * adapters. PaddleOCR is a standalone, cross-platform OCR engine (pip install
6
+ * paddleocr) with strong multilingual detection. We drive it through a small
7
+ * self-contained Python wrapper so the JS side parses a stable JSON shape we
8
+ * control — not PaddleOCR's version-sensitive raw `ocr.ocr()` return — and so
9
+ * the wrapper absorbs the numpy/tuple conversion and the 2.x detection layout
10
+ * (`[page][det] = [box4pts, (text, conf)]`).
11
+ *
12
+ * The wrapper emits one object per recognized text line:
13
+ * `[{ "box": [[x,y],[x,y],[x,y],[x,y]], "text": "...", "conf": 0.0..1.0 }, …]`
14
+ * PaddleOCR returns line-level (not word-level) detections, so each entry maps
15
+ * to one `OcrWithCoordsBlock` whose single word is the line; the block bbox is
16
+ * the axis-aligned hull of the (possibly rotated) detection quad, shifted into
17
+ * display-absolute coordinates via `sourceX/sourceY` — the same output shape as
18
+ * the Tesseract and Windows providers, so it plugs straight into the
19
+ * `OcrWithCoordsService` registry seam (and via the bridge into
20
+ * plugin-computeruse's `CoordOcrProvider`).
21
+ *
22
+ * Opt-in: this provider is only selected when `ELIZA_VISION_OCR_BACKEND` is
23
+ * `paddleocr`, so it never displaces a verified default provider. When PaddleOCR
24
+ * (or python3) is absent it reports unavailable and `describe()` returns empty
25
+ * blocks; it never throws, so the boot chain falls through cleanly.
26
+ *
27
+ * NOTE (#9581): the JSON parser below is unit-tested without the engine (CI
28
+ * needs no PaddleOCR install). End-to-end behaviour against a real PaddleOCR
29
+ * install still needs on-target verification before this becomes a default.
30
+ */
31
+ import { type OcrWithCoordsInput, type OcrWithCoordsResult, type OcrWithCoordsService } from "./ocr-with-coords.js";
32
+ /** One detection from the wrapper's stable JSON: a quad + line text + score. */
33
+ interface PaddleOcrDetection {
34
+ readonly box: ReadonlyArray<readonly [number, number]>;
35
+ readonly text: string;
36
+ readonly conf: number;
37
+ }
38
+ /**
39
+ * Parse the wrapper's stable JSON into typed detections. Pure — exported for
40
+ * tests so the contract with `PADDLE_PY` has a single source of truth and CI
41
+ * never needs a real PaddleOCR install. Drops entries without a 4-point box,
42
+ * blank text, or a non-finite score.
43
+ */
44
+ export declare function parsePaddleOcrJson(raw: string): PaddleOcrDetection[];
45
+ /**
46
+ * Pure mapper: wrapper JSON → `OcrWithCoordsResult`. Exported for unit tests
47
+ * that inject a fixed JSON string (no real engine). One block per detected line
48
+ * (PaddleOCR is line-level), in first-seen order; the single word is the line.
49
+ */
50
+ export declare function mapPaddleOcrJsonToResult(raw: string, tileWidth: number, tileHeight: number, sourceX: number, sourceY: number): OcrWithCoordsResult;
51
+ /** Test-only: reset the cached availability probe between cases. */
52
+ export declare function _resetPaddleOcrAvailabilityForTests(): void;
53
+ export declare class PaddleOcrService implements OcrWithCoordsService {
54
+ readonly name = "paddleocr";
55
+ static isAvailable(): boolean;
56
+ describe(input: OcrWithCoordsInput): Promise<OcrWithCoordsResult>;
57
+ }
58
+ export {};
59
+ //# sourceMappingURL=ocr-service-paddleocr.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ocr-service-paddleocr.d.ts","sourceRoot":"","sources":["../src/ocr-service-paddleocr.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAOH,OAAO,EAGL,KAAK,kBAAkB,EACvB,KAAK,mBAAmB,EACxB,KAAK,oBAAoB,EAC1B,MAAM,sBAAsB,CAAC;AAG9B,gFAAgF;AAChF,UAAU,kBAAkB;IAC1B,QAAQ,CAAC,GAAG,EAAE,aAAa,CAAC,SAAS,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;IACvD,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAwDD;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,MAAM,GAAG,kBAAkB,EAAE,CAuCpE;AAED;;;;GAIG;AACH,wBAAgB,wBAAwB,CACtC,GAAG,EAAE,MAAM,EACX,SAAS,EAAE,MAAM,EACjB,UAAU,EAAE,MAAM,EAClB,OAAO,EAAE,MAAM,EACf,OAAO,EAAE,MAAM,GACd,mBAAmB,CA0BrB;AAgDD,oEAAoE;AACpE,wBAAgB,mCAAmC,IAAI,IAAI,CAE1D;AAmBD,qBAAa,gBAAiB,YAAW,oBAAoB;IAC3D,QAAQ,CAAC,IAAI,eAAe;IAE5B,MAAM,CAAC,WAAW,IAAI,OAAO;IAIvB,QAAQ,CAAC,KAAK,EAAE,kBAAkB,GAAG,OAAO,CAAC,mBAAmB,CAAC;CAwBxE"}
@@ -0,0 +1,41 @@
1
+ /**
2
+ * Native Windows OCR-with-coords via the built-in WinRT `Windows.Media.Ocr`
3
+ * engine (issue #9105 / M4a).
4
+ *
5
+ * Zero LLM tokens, no model download, NPU-accelerated where available. The
6
+ * WinRT projection is only reachable from Windows PowerShell 5.1 (`powershell`),
7
+ * not PowerShell 7 (`pwsh`), so we shell to `powershell` with an embedded
8
+ * script. Output is `OcrWithCoordsResult`, so this plugs straight into the
9
+ * `OcrWithCoordsService` registry seam and (via the M1 bridge) into
10
+ * plugin-computeruse's `CoordOcrProvider`.
11
+ *
12
+ * The engine returns text LINES, each with WORDS that carry bounding rects.
13
+ * We map each line to one `OcrWithCoordsBlock` (block bbox = union of its word
14
+ * rects), compute the semantic position against the source-tile thirds, and
15
+ * shift every bbox into display-absolute coordinates via `sourceX/sourceY`.
16
+ */
17
+ import { type OcrWithCoordsInput, type OcrWithCoordsResult, type OcrWithCoordsService } from "./ocr-with-coords.js";
18
+ /** Shape emitted by the embedded PowerShell script (parsed from stdout JSON). */
19
+ interface WinOcrRaw {
20
+ width: number;
21
+ height: number;
22
+ lines: Array<{
23
+ text: string;
24
+ words: Array<{
25
+ text: string;
26
+ x: number;
27
+ y: number;
28
+ width: number;
29
+ height: number;
30
+ }>;
31
+ }>;
32
+ }
33
+ /** Pure mapper (exported for cross-platform unit tests). */
34
+ export declare function mapWinOcrToResult(raw: WinOcrRaw, sourceX: number, sourceY: number): OcrWithCoordsResult;
35
+ export declare class WindowsMediaOcrService implements OcrWithCoordsService {
36
+ readonly name = "windows-media-ocr";
37
+ static isAvailable(): boolean;
38
+ describe(input: OcrWithCoordsInput): Promise<OcrWithCoordsResult>;
39
+ }
40
+ export {};
41
+ //# sourceMappingURL=ocr-service-windows.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ocr-service-windows.d.ts","sourceRoot":"","sources":["../src/ocr-service-windows.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAQH,OAAO,EAGL,KAAK,kBAAkB,EACvB,KAAK,mBAAmB,EACxB,KAAK,oBAAoB,EAE1B,MAAM,sBAAsB,CAAC;AAG9B,iFAAiF;AACjF,UAAU,SAAS;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,KAAK,CAAC;QACX,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,KAAK,CAAC;YACX,IAAI,EAAE,MAAM,CAAC;YACb,CAAC,EAAE,MAAM,CAAC;YACV,CAAC,EAAE,MAAM,CAAC;YACV,KAAK,EAAE,MAAM,CAAC;YACd,MAAM,EAAE,MAAM,CAAC;SAChB,CAAC,CAAC;KACJ,CAAC,CAAC;CACJ;AA+JD,4DAA4D;AAC5D,wBAAgB,iBAAiB,CAC/B,GAAG,EAAE,SAAS,EACd,OAAO,EAAE,MAAM,EACf,OAAO,EAAE,MAAM,GACd,mBAAmB,CAOrB;AAED,qBAAa,sBAAuB,YAAW,oBAAoB;IACjE,QAAQ,CAAC,IAAI,uBAAuB;IAEpC,MAAM,CAAC,WAAW,IAAI,OAAO;IAIvB,QAAQ,CAAC,KAAK,EAAE,kBAAkB,GAAG,OAAO,CAAC,mBAAmB,CAAC;CAyCxE"}