@elizaos/plugin-vision 2.0.0-beta.1 → 2.0.3-beta.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +73 -301
- package/dist/action.d.ts +3 -0
- package/dist/action.d.ts.map +1 -0
- package/dist/audio-capture-stream.d.ts +42 -0
- package/dist/audio-capture-stream.d.ts.map +1 -0
- package/dist/audio-capture.d.ts +25 -0
- package/dist/audio-capture.d.ts.map +1 -0
- package/dist/computeruse-ocr-bridge.d.ts +50 -0
- package/dist/computeruse-ocr-bridge.d.ts.map +1 -0
- package/dist/config.d.ts +68 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/describe-backpressure.d.ts +90 -0
- package/dist/describe-backpressure.d.ts.map +1 -0
- package/dist/dirty-tile-describer.d.ts +102 -0
- package/dist/dirty-tile-describer.d.ts.map +1 -0
- package/dist/dirty-tile-scene.d.ts +56 -0
- package/dist/dirty-tile-scene.d.ts.map +1 -0
- package/dist/entity-tracker.d.ts +33 -0
- package/dist/entity-tracker.d.ts.map +1 -0
- package/dist/face-detector-ggml.d.ts +60 -0
- package/dist/face-detector-ggml.d.ts.map +1 -0
- package/dist/face-detector-mediapipe.d.ts +25 -0
- package/dist/face-detector-mediapipe.d.ts.map +1 -0
- package/dist/face-recognition-ggml.d.ts +94 -0
- package/dist/face-recognition-ggml.d.ts.map +1 -0
- package/dist/get-screen-elements.d.ts +90 -0
- package/dist/get-screen-elements.d.ts.map +1 -0
- package/dist/get-screen.d.ts +60 -0
- package/dist/get-screen.d.ts.map +1 -0
- package/dist/image/sharp-compat.d.ts +89 -0
- package/dist/image/sharp-compat.d.ts.map +1 -0
- package/dist/image-input.d.ts +15 -0
- package/dist/image-input.d.ts.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +7957 -6238
- package/dist/index.js.map +41 -26
- package/dist/lifecycle.d.ts +94 -0
- package/dist/lifecycle.d.ts.map +1 -0
- package/dist/mobile/capacitor-camera.d.ts +85 -0
- package/dist/mobile/capacitor-camera.d.ts.map +1 -0
- package/dist/native/doctr-ffi.d.ts +40 -0
- package/dist/native/doctr-ffi.d.ts.map +1 -0
- package/dist/native/yolo-ffi.d.ts +21 -0
- package/dist/native/yolo-ffi.d.ts.map +1 -0
- package/dist/ocr-host-windows.d.ts +34 -0
- package/dist/ocr-host-windows.d.ts.map +1 -0
- package/dist/ocr-service-apple-vision-macos.d.ts +51 -0
- package/dist/ocr-service-apple-vision-macos.d.ts.map +1 -0
- package/dist/ocr-service-doctr.d.ts +61 -0
- package/dist/ocr-service-doctr.d.ts.map +1 -0
- package/dist/ocr-service-linux-tesseract.d.ts +85 -0
- package/dist/ocr-service-linux-tesseract.d.ts.map +1 -0
- package/dist/ocr-service-paddleocr.d.ts +59 -0
- package/dist/ocr-service-paddleocr.d.ts.map +1 -0
- package/dist/ocr-service-windows.d.ts +41 -0
- package/dist/ocr-service-windows.d.ts.map +1 -0
- package/dist/ocr-service.d.ts +91 -0
- package/dist/ocr-service.d.ts.map +1 -0
- package/dist/ocr-with-coords.d.ts +103 -0
- package/dist/ocr-with-coords.d.ts.map +1 -0
- package/dist/person-detector.d.ts +17 -0
- package/dist/person-detector.d.ts.map +1 -0
- package/dist/provider.d.ts +3 -0
- package/dist/provider.d.ts.map +1 -0
- package/dist/routes.d.ts +7 -0
- package/dist/routes.d.ts.map +1 -0
- package/dist/screen-capture-bridge.d.ts +51 -0
- package/dist/screen-capture-bridge.d.ts.map +1 -0
- package/dist/screen-capture.d.ts +17 -0
- package/dist/screen-capture.d.ts.map +1 -0
- package/dist/screen-tiler.d.ts +75 -0
- package/dist/screen-tiler.d.ts.map +1 -0
- package/dist/service.d.ts +176 -0
- package/dist/service.d.ts.map +1 -0
- package/dist/set-of-marks-provider.d.ts +64 -0
- package/dist/set-of-marks-provider.d.ts.map +1 -0
- package/dist/som.d.ts +135 -0
- package/dist/som.d.ts.map +1 -0
- package/dist/som.js +184 -0
- package/dist/som.js.map +11 -0
- package/dist/test-input.d.ts +25 -0
- package/dist/test-input.d.ts.map +1 -0
- package/dist/types.d.ts +241 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/vision-context-augmenter.d.ts +93 -0
- package/dist/vision-context-augmenter.d.ts.map +1 -0
- package/dist/vision-worker-manager.d.ts +51 -0
- package/dist/vision-worker-manager.d.ts.map +1 -0
- package/dist/workers/ocr-worker.d.ts +2 -0
- package/dist/workers/ocr-worker.d.ts.map +1 -0
- package/dist/workers/ocr-worker.js +1075 -7821
- package/dist/workers/ocr-worker.js.map +10 -51
- package/dist/workers/screen-capture-worker.d.ts +2 -0
- package/dist/workers/screen-capture-worker.d.ts.map +1 -0
- package/dist/workers/screen-capture-worker.js +364 -6
- package/dist/workers/screen-capture-worker.js.map +5 -4
- package/dist/workers/worker-logger.d.ts +10 -0
- package/dist/workers/worker-logger.d.ts.map +1 -0
- package/dist/yolo-detector.d.ts +37 -0
- package/dist/yolo-detector.d.ts.map +1 -0
- package/native/doctr.cpp/CMakeLists.txt +58 -0
- package/native/doctr.cpp/README.md +62 -0
- package/native/doctr.cpp/include/doctr.h +91 -0
- package/native/doctr.cpp/scripts/convert.py +98 -0
- package/native/doctr.cpp/src/doctr_det.cpp +112 -0
- package/native/doctr.cpp/src/doctr_rec.cpp +103 -0
- package/native/macos-vision-ocr.swift +113 -0
- package/native/mobilefacenet.cpp/README.md +13 -0
- package/native/movenet.cpp/README.md +10 -0
- package/native/retinaface.cpp/README.md +12 -0
- package/native/yolo.cpp/CMakeLists.txt +57 -0
- package/native/yolo.cpp/README.md +64 -0
- package/native/yolo.cpp/build.mjs +76 -0
- package/native/yolo.cpp/include/yolo.h +62 -0
- package/native/yolo.cpp/scripts/convert.py +248 -0
- package/native/yolo.cpp/src/yolo.cpp +425 -0
- package/native/yolo.cpp/verify/compare.py +99 -0
- package/native/yolo.cpp/verify/make_ref.py +75 -0
- package/native/yolo.cpp/verify/run_ggml.mjs +78 -0
- package/native/yolo.cpp/verify/run_ts.mjs +26 -0
- package/package.json +39 -21
- package/registry-entry.json +43 -0
- package/scripts/vendor-tesseract-linux.mjs +177 -0
- package/build.config.ts +0 -89
- package/dist/workers/florence2-worker.js +0 -779
- package/dist/workers/florence2-worker.js.map +0 -13
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Minimal contract a memory arbiter must implement so vision can plug into
|
|
3
|
+
* WS1's load/unload pipeline. Mirrors the (forthcoming) interface in
|
|
4
|
+
* `@elizaos/plugin-local-inference/src/services/memory-arbiter.ts` but is
|
|
5
|
+
* declared here so plugin-vision compiles standalone.
|
|
6
|
+
*/
|
|
7
|
+
export interface IModelArbiter {
|
|
8
|
+
/**
|
|
9
|
+
* Reserve `bytes` of model memory for `holder`. Returning `false` means the
|
|
10
|
+
* arbiter refused — the caller must skip the load.
|
|
11
|
+
*/
|
|
12
|
+
acquire(holder: string, bytes: number): Promise<boolean> | boolean;
|
|
13
|
+
/**
|
|
14
|
+
* Release the prior reservation for `holder`.
|
|
15
|
+
*/
|
|
16
|
+
release(holder: string): Promise<void> | void;
|
|
17
|
+
/**
|
|
18
|
+
* Subscribe to memory-pressure events. The arbiter calls the listener with
|
|
19
|
+
* a non-empty list of holders when pressure is high enough that those
|
|
20
|
+
* holders should release.
|
|
21
|
+
*/
|
|
22
|
+
onPressure(listener: (holders: string[]) => void): () => void;
|
|
23
|
+
}
|
|
24
|
+
export interface VisionSubServiceHandle {
|
|
25
|
+
/** Stable holder id (e.g. "vision:yolo"). */
|
|
26
|
+
id: string;
|
|
27
|
+
/** Approximate VRAM/RAM cost in bytes. Used by the arbiter; ignored if 0. */
|
|
28
|
+
memoryBytes: number;
|
|
29
|
+
/** Required hook invoked when the sub-service has been released. */
|
|
30
|
+
unload(): Promise<void> | void;
|
|
31
|
+
/** Optional hook invoked to re-load after a prior release. */
|
|
32
|
+
acquire?(): Promise<void> | void;
|
|
33
|
+
}
|
|
34
|
+
export interface VisionLifecycleConfig {
|
|
35
|
+
/** Milliseconds of inactivity before a sub-service is released. */
|
|
36
|
+
idleUnloadMs?: number;
|
|
37
|
+
/** Tick interval for the idle watchdog. */
|
|
38
|
+
watchdogIntervalMs?: number;
|
|
39
|
+
}
|
|
40
|
+
export declare class VisionServiceLifecycleManager {
|
|
41
|
+
private readonly subs;
|
|
42
|
+
private readonly idleUnloadMs;
|
|
43
|
+
private readonly watchdogIntervalMs;
|
|
44
|
+
private arbiter;
|
|
45
|
+
private unsubscribePressure;
|
|
46
|
+
private watchdogTimer;
|
|
47
|
+
private stopped;
|
|
48
|
+
constructor(config?: VisionLifecycleConfig);
|
|
49
|
+
attachArbiter(arbiter: IModelArbiter | null): void;
|
|
50
|
+
register(handle: VisionSubServiceHandle): void;
|
|
51
|
+
unregister(id: string): void;
|
|
52
|
+
/**
|
|
53
|
+
* Mark a sub-service as in-use. If it was previously released, re-acquire
|
|
54
|
+
* via the registered `acquire` callback (if any).
|
|
55
|
+
*
|
|
56
|
+
* Returns `true` if the sub-service is loaded after the call.
|
|
57
|
+
*/
|
|
58
|
+
touch(id: string): Promise<boolean>;
|
|
59
|
+
/**
|
|
60
|
+
* Force-release a single holder.
|
|
61
|
+
*/
|
|
62
|
+
release(id: string): Promise<void>;
|
|
63
|
+
/**
|
|
64
|
+
* Drop every registered sub-service (used during plugin stop()).
|
|
65
|
+
*/
|
|
66
|
+
stop(): Promise<void>;
|
|
67
|
+
/** Test-only: return current snapshot. */
|
|
68
|
+
snapshot(): Array<{
|
|
69
|
+
id: string;
|
|
70
|
+
loaded: boolean;
|
|
71
|
+
lastUsed: number;
|
|
72
|
+
}>;
|
|
73
|
+
private ensureWatchdog;
|
|
74
|
+
private runWatchdog;
|
|
75
|
+
private handlePressure;
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Try to resolve a model arbiter from the runtime, dynamically. This avoids
|
|
79
|
+
* a hard dependency on `@elizaos/plugin-local-inference` (WS1) — vision still
|
|
80
|
+
* works standalone when WS1 isn't installed.
|
|
81
|
+
*
|
|
82
|
+
* Two resolution paths:
|
|
83
|
+
* 1. Direct: a service named `MEMORY_ARBITER` / `memory_arbiter` /
|
|
84
|
+
* `memoryArbiter` that already implements the `IModelArbiter` shape.
|
|
85
|
+
* Used by tests and standalone arbiter services.
|
|
86
|
+
* 2. WS1 bridge: a `localInferenceLoader` / `localInference` service that
|
|
87
|
+
* exposes `getMemoryArbiter()` returning the WS1 `MemoryArbiter`. We
|
|
88
|
+
* adapt it to `IModelArbiter` via `adaptWS1ArbiterToIModelArbiter` so
|
|
89
|
+
* memory-pressure events cascade into vision sub-service release.
|
|
90
|
+
*/
|
|
91
|
+
export declare function resolveArbiterFromRuntime(runtime: {
|
|
92
|
+
getService?: (name: string) => unknown;
|
|
93
|
+
}): IModelArbiter | null;
|
|
94
|
+
//# sourceMappingURL=lifecycle.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"lifecycle.d.ts","sourceRoot":"","sources":["../src/lifecycle.ts"],"names":[],"mappings":"AAmBA;;;;;GAKG;AACH,MAAM,WAAW,aAAa;IAC5B;;;OAGG;IACH,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,OAAO,CAAC;IAEnE;;OAEG;IACH,OAAO,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;IAE9C;;;;OAIG;IACH,UAAU,CAAC,QAAQ,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,IAAI,GAAG,MAAM,IAAI,CAAC;CAC/D;AAED,MAAM,WAAW,sBAAsB;IACrC,6CAA6C;IAC7C,EAAE,EAAE,MAAM,CAAC;IACX,6EAA6E;IAC7E,WAAW,EAAE,MAAM,CAAC;IACpB,oEAAoE;IACpE,MAAM,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;IAC/B,8DAA8D;IAC9D,OAAO,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;CAClC;AAED,MAAM,WAAW,qBAAqB;IACpC,mEAAmE;IACnE,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,2CAA2C;IAC3C,kBAAkB,CAAC,EAAE,MAAM,CAAC;CAC7B;AAYD,qBAAa,6BAA6B;IACxC,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAoC;IACzD,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAS;IACtC,OAAO,CAAC,QAAQ,CAAC,kBAAkB,CAAS;IAC5C,OAAO,CAAC,OAAO,CAA8B;IAC7C,OAAO,CAAC,mBAAmB,CAA6B;IACxD,OAAO,CAAC,aAAa,CAA+B;IACpD,OAAO,CAAC,OAAO,CAAS;gBAEZ,MAAM,GAAE,qBAA0B;IAM9C,aAAa,CAAC,OAAO,EAAE,aAAa,GAAG,IAAI,GAAG,IAAI;IAelD,QAAQ,CAAC,MAAM,EAAE,sBAAsB,GAAG,IAAI;IAW9C,UAAU,CAAC,EAAE,EAAE,MAAM,GAAG,IAAI;IAI5B;;;;;OAKG;IACG,KAAK,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAgCzC;;OAEG;IACG,OAAO,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAYxC;;OAEG;IACG,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAkB3B,0CAA0C;IAC1C,QAAQ,IAAI,KAAK,CAAC;QAAE,EAAE,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,OAAO,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,CAAC;IAQpE,OAAO,CAAC,cAAc;YAWR,WAAW;YAYX,cAAc;CAa7B;AAyDD;;;;;;;;;;;;;GAaG;AACH,wBAAgB,yBAAyB,CAAC,OAAO,EAAE;IACjD,UAAU,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC;CACxC,GAAG,aAAa,GAAG,IAAI,CAqCvB"}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import type { CameraInfo, VisionFrame } from "../types";
|
|
2
|
+
interface MobileCameraOpenOptions {
|
|
3
|
+
/** Stable camera id (typically `back` / `front` / a per-device id). */
|
|
4
|
+
cameraId?: string;
|
|
5
|
+
/** Desired frame width in pixels — the native side may snap to nearest. */
|
|
6
|
+
width?: number;
|
|
7
|
+
/** Desired frame height in pixels. */
|
|
8
|
+
height?: number;
|
|
9
|
+
/** Desired frame rate. */
|
|
10
|
+
fps?: number;
|
|
11
|
+
}
|
|
12
|
+
/**
|
|
13
|
+
* Minimal interface every mobile camera implementation must satisfy.
|
|
14
|
+
*
|
|
15
|
+
* Implementations live in:
|
|
16
|
+
* - plugin-aosp (Android NNAPI / CameraX) — WS8
|
|
17
|
+
* - plugin-ios (Core ML / AVFoundation) — WS9
|
|
18
|
+
* - plugin-capacitor-bridge (cross-platform Capacitor plugin) — planned bridge package
|
|
19
|
+
*/
|
|
20
|
+
export interface MobileCameraSource {
|
|
21
|
+
/** Discover cameras visible to the OS. */
|
|
22
|
+
listCameras(): Promise<CameraInfo[]>;
|
|
23
|
+
/** Open a session when the native source supports continuous capture. */
|
|
24
|
+
open(opts?: MobileCameraOpenOptions): Promise<void>;
|
|
25
|
+
/** Capture a single frame as a JPEG buffer. */
|
|
26
|
+
captureJpeg(): Promise<Buffer>;
|
|
27
|
+
/** Capture and return a fully-decoded RGBA frame. */
|
|
28
|
+
captureRgbaFrame?(): Promise<VisionFrame>;
|
|
29
|
+
/** Tear down the session. */
|
|
30
|
+
close(): Promise<void>;
|
|
31
|
+
/** Optional capability declaration — UIs use this to gate buttons. */
|
|
32
|
+
capabilities?(): {
|
|
33
|
+
supportsContinuousFrames: boolean;
|
|
34
|
+
supportsExposureLock: boolean;
|
|
35
|
+
supportsTorch: boolean;
|
|
36
|
+
};
|
|
37
|
+
}
|
|
38
|
+
interface CapacitorVisionPlugin {
|
|
39
|
+
listCameras?: () => Promise<CameraInfo[]>;
|
|
40
|
+
open?: (opts?: MobileCameraOpenOptions) => Promise<void>;
|
|
41
|
+
captureJpeg?: () => Promise<Buffer | Uint8Array | string | {
|
|
42
|
+
data?: string;
|
|
43
|
+
}>;
|
|
44
|
+
captureRgbaFrame?: () => Promise<VisionFrame | {
|
|
45
|
+
data: string;
|
|
46
|
+
}>;
|
|
47
|
+
close?: () => Promise<void>;
|
|
48
|
+
capabilities?: () => Promise<{
|
|
49
|
+
supportsContinuousFrames: boolean;
|
|
50
|
+
supportsExposureLock: boolean;
|
|
51
|
+
supportsTorch: boolean;
|
|
52
|
+
}>;
|
|
53
|
+
}
|
|
54
|
+
export declare class CapacitorCameraSource implements MobileCameraSource {
|
|
55
|
+
private readonly plugin;
|
|
56
|
+
constructor(plugin: CapacitorVisionPlugin);
|
|
57
|
+
listCameras(): Promise<CameraInfo[]>;
|
|
58
|
+
open(opts?: MobileCameraOpenOptions): Promise<void>;
|
|
59
|
+
captureJpeg(): Promise<Buffer>;
|
|
60
|
+
captureRgbaFrame(): Promise<VisionFrame>;
|
|
61
|
+
close(): Promise<void>;
|
|
62
|
+
capabilities(): {
|
|
63
|
+
supportsContinuousFrames: boolean;
|
|
64
|
+
supportsExposureLock: boolean;
|
|
65
|
+
supportsTorch: boolean;
|
|
66
|
+
};
|
|
67
|
+
}
|
|
68
|
+
/**
|
|
69
|
+
* Default unavailable implementation. Returns no cameras and refuses captures.
|
|
70
|
+
* This keeps the plugin-vision JS surface buildable on Node platforms where no
|
|
71
|
+
* native bridge is registered.
|
|
72
|
+
*/
|
|
73
|
+
export declare class UnavailableMobileCameraSource implements MobileCameraSource {
|
|
74
|
+
listCameras(): Promise<CameraInfo[]>;
|
|
75
|
+
open(): Promise<void>;
|
|
76
|
+
captureJpeg(): Promise<Buffer>;
|
|
77
|
+
close(): Promise<void>;
|
|
78
|
+
}
|
|
79
|
+
/** Compatibility alias for older imports. */
|
|
80
|
+
export declare const CapacitorCameraStub: typeof UnavailableMobileCameraSource;
|
|
81
|
+
export declare function registerMobileCameraSource(source: MobileCameraSource): void;
|
|
82
|
+
export declare function getMobileCameraSource(): MobileCameraSource | null;
|
|
83
|
+
export declare function clearMobileCameraSource(): void;
|
|
84
|
+
export {};
|
|
85
|
+
//# sourceMappingURL=capacitor-camera.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"capacitor-camera.d.ts","sourceRoot":"","sources":["../../src/mobile/capacitor-camera.ts"],"names":[],"mappings":"AAYA,OAAO,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,UAAU,CAAC;AAExD,UAAU,uBAAuB;IAC/B,uEAAuE;IACvE,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,2EAA2E;IAC3E,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,sCAAsC;IACtC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,0BAA0B;IAC1B,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAED;;;;;;;GAOG;AACH,MAAM,WAAW,kBAAkB;IACjC,0CAA0C;IAC1C,WAAW,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC;IACrC,yEAAyE;IACzE,IAAI,CAAC,IAAI,CAAC,EAAE,uBAAuB,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IACpD,+CAA+C;IAC/C,WAAW,IAAI,OAAO,CAAC,MAAM,CAAC,CAAC;IAC/B,qDAAqD;IACrD,gBAAgB,CAAC,IAAI,OAAO,CAAC,WAAW,CAAC,CAAC;IAC1C,6BAA6B;IAC7B,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IACvB,sEAAsE;IACtE,YAAY,CAAC,IAAI;QACf,wBAAwB,EAAE,OAAO,CAAC;QAClC,oBAAoB,EAAE,OAAO,CAAC;QAC9B,aAAa,EAAE,OAAO,CAAC;KACxB,CAAC;CACH;AAED,UAAU,qBAAqB;IAC7B,WAAW,CAAC,EAAE,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC;IAC1C,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,uBAAuB,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;IACzD,WAAW,CAAC,EAAE,MAAM,OAAO,CAAC,MAAM,GAAG,UAAU,GAAG,MAAM,GAAG;QAAE,IAAI,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC9E,gBAAgB,CAAC,EAAE,MAAM,OAAO,CAAC,WAAW,GAAG;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACjE,KAAK,CAAC,EAAE,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,OAAO,CAAC;QAC3B,wBAAwB,EAAE,OAAO,CAAC;QAClC,oBAAoB,EAAE,OAAO,CAAC;QAC9B,aAAa,EAAE,OAAO,CAAC;KACxB,CAAC,CAAC;CACJ;AAsBD,qBAAa,qBAAsB,YAAW,kBAAkB;IAClD,OAAO,CAAC,QAAQ,CAAC,MAAM;gBAAN,MAAM,EAAE,qBAAqB;IAEpD,WAAW,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;IAIpC,IAAI,CAAC,IAAI,CAAC,EAAE,uBAAuB,GAAG,OAAO,CAAC,IAAI,CAAC;IAOnD,WAAW,IAAI,OAAO,CAAC,MAAM,CAAC;IAO9B,gBAAgB,IAAI,OAAO,CAAC,WAAW,CAAC;IAaxC,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAI5B,YAAY,IAAI;QACd,wBAAwB,EAAE,OAAO,CAAC;QAClC,oBAAoB,EAAE,OAAO,CAAC;QAC9B,aAAa,EAAE,OAAO,CAAC;KACxB;CAOF;AAED;;;;GAIG;AACH,qBAAa,6BAA8B,YAAW,kBAAkB;IAChE,WAAW,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;IAMpC,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAGrB,WAAW,IAAI,OAAO,CAAC,MAAM,CAAC;IAG9B,KAAK,IAAI,OAAO,CAAC,IAAI,C
AAC;CAC7B;AAED,6CAA6C;AAC7C,eAAO,MAAM,mBAAmB,sCAAgC,CAAC;AAiBjE,wBAAgB,0BAA0B,CAAC,MAAM,EAAE,kBAAkB,GAAG,IAAI,CAwB3E;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAMjE;AAED,wBAAgB,uBAAuB,IAAI,IAAI,CAE9C"}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/** Where the runtime expects GGUF weights. */
|
|
2
|
+
export declare function defaultDetWeightsPath(): string;
|
|
3
|
+
export declare function defaultRecWeightsPath(): string;
|
|
4
|
+
interface DocTRBindings {
|
|
5
|
+
/** Detection forward pass. Output: prob map at H/4 × W/4. */
|
|
6
|
+
detect(detGGUFPath: string, rgbCHW: Float32Array, h: number, w: number): Promise<{
|
|
7
|
+
probMap: Float32Array;
|
|
8
|
+
h: number;
|
|
9
|
+
w: number;
|
|
10
|
+
}>;
|
|
11
|
+
/** Recognition forward pass on a cropped line image. */
|
|
12
|
+
recognize(recGGUFPath: string, rgbCHW: Float32Array, h: number, w: number): Promise<{
|
|
13
|
+
logits: Float32Array;
|
|
14
|
+
T: number;
|
|
15
|
+
C: number;
|
|
16
|
+
}>;
|
|
17
|
+
/** Returns the recognition charset (UTF-8, newline-separated). */
|
|
18
|
+
charset(recGGUFPath: string): Promise<string>;
|
|
19
|
+
dispose(): Promise<void>;
|
|
20
|
+
}
|
|
21
|
+
/**
|
|
22
|
+
* Load the doctr.cpp shared library via `bun:ffi`. Returns null when either
|
|
23
|
+
* the library or the GGUF weights are missing — the caller is expected to
|
|
24
|
+
* throw a clear error in that case rather than silently fall back.
|
|
25
|
+
*/
|
|
26
|
+
export declare function loadDoctrBindings(): Promise<DocTRBindings | null>;
|
|
27
|
+
/**
|
|
28
|
+
* `true` when both the native library and the GGUF weights exist on disk. Does
|
|
29
|
+
* not actually initialize anything — callers should still expect the C++ side
|
|
30
|
+
* to return DOCTR_ERR_BACKEND until the ggml graph is wired.
|
|
31
|
+
*/
|
|
32
|
+
export declare function isDoctrReady(opts?: {
|
|
33
|
+
detPath?: string;
|
|
34
|
+
recPath?: string;
|
|
35
|
+
}): Promise<{
|
|
36
|
+
ready: boolean;
|
|
37
|
+
reason?: string;
|
|
38
|
+
}>;
|
|
39
|
+
export {};
|
|
40
|
+
//# sourceMappingURL=doctr-ffi.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"doctr-ffi.d.ts","sourceRoot":"","sources":["../../src/native/doctr-ffi.ts"],"names":[],"mappings":"AA0CA,8CAA8C;AAC9C,wBAAgB,qBAAqB,IAAI,MAAM,CAO9C;AAED,wBAAgB,qBAAqB,IAAI,MAAM,CAO9C;AAsBD,UAAU,aAAa;IACrB,6DAA6D;IAC7D,MAAM,CACJ,WAAW,EAAE,MAAM,EACnB,MAAM,EAAE,YAAY,EACpB,CAAC,EAAE,MAAM,EACT,CAAC,EAAE,MAAM,GACR,OAAO,CAAC;QAAE,OAAO,EAAE,YAAY,CAAC;QAAC,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAE5D,wDAAwD;IACxD,SAAS,CACP,WAAW,EAAE,MAAM,EACnB,MAAM,EAAE,YAAY,EACpB,CAAC,EAAE,MAAM,EACT,CAAC,EAAE,MAAM,GACR,OAAO,CAAC;QAAE,MAAM,EAAE,YAAY,CAAC;QAAC,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAE3D,kEAAkE;IAClE,OAAO,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IAE9C,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CAC1B;AAID;;;;GAIG;AACH,wBAAsB,iBAAiB,IAAI,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,CAsLvE;AAED;;;;GAIG;AACH,wBAAsB,YAAY,CAAC,IAAI,CAAC,EAAE;IACxC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB,GAAG,OAAO,CAAC;IAAE,KAAK,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC,CAqB/C"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
export declare function defaultYoloWeightsPath(): string;
|
|
2
|
+
interface YoloBindings {
|
|
3
|
+
/** Run forward pass. Returns the raw (channels, anchors) logits tensor. */
|
|
4
|
+
run(ggufPath: string, rgbCHW: Float32Array, h: number, w: number): Promise<{
|
|
5
|
+
logits: Float32Array;
|
|
6
|
+
channels: number;
|
|
7
|
+
anchors: number;
|
|
8
|
+
}>;
|
|
9
|
+
/** Returns the embedded class names (newline-separated). */
|
|
10
|
+
classes(ggufPath: string): Promise<string>;
|
|
11
|
+
dispose(): Promise<void>;
|
|
12
|
+
}
|
|
13
|
+
export declare function loadYoloBindings(): Promise<YoloBindings | null>;
|
|
14
|
+
export declare function isYoloReady(opts?: {
|
|
15
|
+
weightsPath?: string;
|
|
16
|
+
}): Promise<{
|
|
17
|
+
ready: boolean;
|
|
18
|
+
reason?: string;
|
|
19
|
+
}>;
|
|
20
|
+
export {};
|
|
21
|
+
//# sourceMappingURL=yolo-ffi.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"yolo-ffi.d.ts","sourceRoot":"","sources":["../../src/native/yolo-ffi.ts"],"names":[],"mappings":"AAuCA,wBAAgB,sBAAsB,IAAI,MAAM,CAO/C;AAED,UAAU,YAAY;IACpB,2EAA2E;IAC3E,GAAG,CACD,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,YAAY,EACpB,CAAC,EAAE,MAAM,EACT,CAAC,EAAE,MAAM,GACR,OAAO,CAAC;QAAE,MAAM,EAAE,YAAY,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAExE,4DAA4D;IAC5D,OAAO,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IAE3C,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CAC1B;AAmBD,wBAAsB,gBAAgB,IAAI,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC,CA0HrE;AAED,wBAAsB,WAAW,CAAC,IAAI,CAAC,EAAE;IACvC,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC;IAAE,KAAK,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC,CAe/C"}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Persistent WinRT OCR host (Windows-only) — kills the per-OCR cold-spawn tax.
|
|
3
|
+
*
|
|
4
|
+
* `WindowsMediaOcrService.describe()` previously ran `powershell -File
|
|
5
|
+
* windows-ocr.ps1` for EVERY recognized region. On Defender-heavy hosts a cold
|
|
6
|
+
* `powershell.exe` spawn is ~10-16s (#9581), and OCR fires on every dirty region
|
|
7
|
+
* every turn — and the scene pipeline OCRs regions in parallel, so a turn would
|
|
8
|
+
* spawn N cold processes at once and thrash the AV scanner.
|
|
9
|
+
*
|
|
10
|
+
* This keeps ONE long-lived `powershell.exe` that loads the (expensive) WinRT
|
|
11
|
+
* projection + `OcrEngine` ONCE in its parent scope, then loops: read an image
|
|
12
|
+
* path on stdin, recognize, emit one compact JSON line on stdout. So each call
|
|
13
|
+
* pays neither the process spawn NOR the WinRT type-load — only the recognize
|
|
14
|
+
* (~0.3-1s). Requests are serialized over the one pipe (fine — each is fast).
|
|
15
|
+
*
|
|
16
|
+
* It is a pure latency optimization: `describe()` falls back to the original
|
|
17
|
+
* one-shot `-File` spawn whenever the host is unavailable / disabled / errors,
|
|
18
|
+
* so output is unchanged. No-op off Windows. Disable with `ELIZA_VISION_OCR_HOST=0`.
|
|
19
|
+
*
|
|
20
|
+
* Protocol: JS writes `<absolute-image-path>\n`; the host writes exactly one
|
|
21
|
+
* line of compact JSON (`{width,height,lines}` — same shape as the one-shot
|
|
22
|
+
* script). base64 isn't needed: temp image paths never contain newlines, and
|
|
23
|
+
* `ConvertTo-Json -Compress` output is always a single physical line.
|
|
24
|
+
*/
|
|
25
|
+
export declare function ocrHostAvailable(): boolean;
|
|
26
|
+
export declare function shutdownOcrHost(): void;
|
|
27
|
+
/**
|
|
28
|
+
* Recognize the image at `imagePath` via the warm host, returning the raw JSON
|
|
29
|
+
* line (same shape the one-shot script emits). Serialized against other calls.
|
|
30
|
+
* Rejects (so the caller can fall back to a one-shot spawn) on host-start
|
|
31
|
+
* failure, timeout, or unexpected exit.
|
|
32
|
+
*/
|
|
33
|
+
export declare function runOcrHost(imagePath: string): Promise<string>;
|
|
34
|
+
//# sourceMappingURL=ocr-host-windows.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ocr-host-windows.d.ts","sourceRoot":"","sources":["../src/ocr-host-windows.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAqFH,wBAAgB,gBAAgB,IAAI,OAAO,CAK1C;AA2BD,wBAAgB,eAAe,IAAI,IAAI,CAsBtC;AAuFD;;;;;GAKG;AACH,wBAAgB,UAAU,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAQ7D"}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* macOS Apple Vision OCR provider (issue #9105 — per-OS native OCR fallback).
|
|
3
|
+
*
|
|
4
|
+
* Implements the structural `AppleVisionOcrProvider` seam from `ocr-service.ts`
|
|
5
|
+
* by shelling out to a bundled Swift helper (`native/macos-vision-ocr.swift`)
|
|
6
|
+
* that runs `VNRecognizeTextRequest` (accurate level, language correction on).
|
|
7
|
+
* The helper reads PNG/JPEG bytes from stdin and prints a single JSON object;
|
|
8
|
+
* this module pipes the bytes in and maps the result onto the provider shape.
|
|
9
|
+
*
|
|
10
|
+
* Zero LLM tokens, no model download — Apple Vision ships with macOS. This is
|
|
11
|
+
* the darwin sibling of `WindowsMediaOcrService` (Windows.Media.Ocr) and the
|
|
12
|
+
* iOS `createIosVisionOcrProvider` (Capacitor bridge): same VNRecognizeText
|
|
13
|
+
* engine, reached without Capacitor on a desktop host.
|
|
14
|
+
*
|
|
15
|
+
* Coordinate convention: Vision returns normalized BOTTOM-LEFT bboxes; the
|
|
16
|
+
* Swift helper converts them to TOP-LEFT PIXEL coordinates so the result
|
|
17
|
+
* matches the display-absolute convention used by every other provider.
|
|
18
|
+
*
|
|
19
|
+
* Fails soft: `available()` is false off darwin or when `swift` is missing, and
|
|
20
|
+
* `recognize()` returns an empty result rather than throwing on a helper
|
|
21
|
+
* failure, so the `OCRService` chain falls through to the docTR backend.
|
|
22
|
+
*/
|
|
23
|
+
import type { AppleVisionOcrProvider } from "./ocr-service";
|
|
24
|
+
interface MacosVisionAvailabilityOptions {
|
|
25
|
+
platform?: NodeJS.Platform;
|
|
26
|
+
env?: NodeJS.ProcessEnv;
|
|
27
|
+
pathExists?: (candidate: string) => boolean;
|
|
28
|
+
executableExists?: (name: string, env: NodeJS.ProcessEnv) => boolean;
|
|
29
|
+
}
|
|
30
|
+
/**
|
|
31
|
+
* Resolve the bundled Swift helper. Works from both the dev tree (`src/`) and
|
|
32
|
+
* the published build (`dist/`) — the `native/` directory sits alongside both,
|
|
33
|
+
* at the package root. `ELIZA_MACOS_VISION_OCR_SCRIPT` overrides for tests.
|
|
34
|
+
*/
|
|
35
|
+
declare function resolveScriptPath(env?: NodeJS.ProcessEnv, pathExists?: (candidate: string) => boolean): string | null;
|
|
36
|
+
/** True when running on macOS with the `swift` toolchain and the helper present. */
|
|
37
|
+
declare function macosVisionAvailable(options?: MacosVisionAvailabilityOptions): boolean;
|
|
38
|
+
/**
|
|
39
|
+
* Build an `AppleVisionOcrProvider` backed by macOS Apple Vision. Register it
|
|
40
|
+
* via `registerAppleVisionOcrProvider(createMacosVisionOcrProvider())` on
|
|
41
|
+
* darwin so the `OCRService` Apple-Vision backend resolves a real engine.
|
|
42
|
+
*/
|
|
43
|
+
export declare function createMacosVisionOcrProvider(): AppleVisionOcrProvider;
|
|
44
|
+
/** Exposed for the runtime wire-up + tests; mirrors `macosVisionAvailable`. */
|
|
45
|
+
export declare function isMacosVisionOcrAvailable(): boolean;
|
|
46
|
+
export declare const __test__: {
|
|
47
|
+
macosVisionAvailable: typeof macosVisionAvailable;
|
|
48
|
+
resolveScriptPath: typeof resolveScriptPath;
|
|
49
|
+
};
|
|
50
|
+
export {};
|
|
51
|
+
//# sourceMappingURL=ocr-service-apple-vision-macos.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ocr-service-apple-vision-macos.d.ts","sourceRoot":"","sources":["../src/ocr-service-apple-vision-macos.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAOH,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,eAAe,CAAC;AAe5D,UAAU,8BAA8B;IACtC,QAAQ,CAAC,EAAE,MAAM,CAAC,QAAQ,CAAC;IAC3B,GAAG,CAAC,EAAE,MAAM,CAAC,UAAU,CAAC;IACxB,UAAU,CAAC,EAAE,CAAC,SAAS,EAAE,MAAM,KAAK,OAAO,CAAC;IAC5C,gBAAgB,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,CAAC,UAAU,KAAK,OAAO,CAAC;CACtE;AAED;;;;GAIG;AACH,iBAAS,iBAAiB,CACxB,GAAG,GAAE,MAAM,CAAC,UAAwB,EACpC,UAAU,GAAE,CAAC,SAAS,EAAE,MAAM,KAAK,OAAoB,GACtD,MAAM,GAAG,IAAI,CAQf;AAsBD,oFAAoF;AACpF,iBAAS,oBAAoB,CAC3B,OAAO,GAAE,8BAAmC,GAC3C,OAAO,CAQT;AA8BD;;;;GAIG;AACH,wBAAgB,4BAA4B,IAAI,sBAAsB,CAoCrE;AAED,+EAA+E;AAC/E,wBAAgB,yBAAyB,IAAI,OAAO,CAEnD;AAED,eAAO,MAAM,QAAQ;;;CAGpB,CAAC"}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import type { OCRResult } from "./types";
|
|
2
|
+
export interface DoctrOCRConfig {
|
|
3
|
+
/** GGUF detection weights path. */
|
|
4
|
+
detPath?: string;
|
|
5
|
+
/** GGUF recognition weights path. */
|
|
6
|
+
recPath?: string;
|
|
7
|
+
/** Detection input resolution (square). Default 1024. */
|
|
8
|
+
inputSize?: number;
|
|
9
|
+
/** Probability threshold for the DBNet output. */
|
|
10
|
+
probThreshold?: number;
|
|
11
|
+
/** Minimum connected-component pixel count for a detection. */
|
|
12
|
+
minComponentSize?: number;
|
|
13
|
+
}
|
|
14
|
+
/**
|
|
15
|
+
* Detect platforms where Apple Vision is the better OCR choice.
|
|
16
|
+
*
|
|
17
|
+
* macOS Sonoma+ and iOS expose VNRecognizeTextRequest, which is faster and
|
|
18
|
+
* higher-quality than any community OCR for Latin scripts. The integration
|
|
19
|
+
* lives in `plugin-computeruse/mobile`; we just refuse to claim availability
|
|
20
|
+
* so the higher-priority Apple Vision backend wins on darwin.
|
|
21
|
+
*/
|
|
22
|
+
export declare function shouldPreferAppleVision(): boolean;
|
|
23
|
+
export declare class DoctrOCRService {
|
|
24
|
+
private readonly cfg;
|
|
25
|
+
private initPromise;
|
|
26
|
+
private initialized;
|
|
27
|
+
private charset;
|
|
28
|
+
constructor(config?: DoctrOCRConfig);
|
|
29
|
+
/**
|
|
30
|
+
* Best-effort availability check. Confirms the native lib loads and the
|
|
31
|
+
* GGUF files are on disk. Does NOT prove the ggml forward pass works —
|
|
32
|
+
* that's discovered on the first `extractText` call.
|
|
33
|
+
*/
|
|
34
|
+
static isAvailable(opts?: {
|
|
35
|
+
detPath?: string;
|
|
36
|
+
recPath?: string;
|
|
37
|
+
}): Promise<boolean>;
|
|
38
|
+
isInitialized(): boolean;
|
|
39
|
+
initialize(): Promise<void>;
|
|
40
|
+
private _initialize;
|
|
41
|
+
extractText(imageBuffer: Buffer): Promise<OCRResult>;
|
|
42
|
+
private toCHWFloat32;
|
|
43
|
+
/**
|
|
44
|
+
* DBNet contouring: scan the probability map, group above-threshold pixels
|
|
45
|
+
* into connected components, return axis-aligned bboxes in original image
|
|
46
|
+
* coordinates.
|
|
47
|
+
*
|
|
48
|
+
* This is the same algorithm as the previous PP-OCRv5 path — it's a
|
|
49
|
+
* standard DBNet post-process and works for both detection backbones.
|
|
50
|
+
*/
|
|
51
|
+
private probMapToBoxes;
|
|
52
|
+
/**
|
|
53
|
+
* Recognition step: crop the bbox, resize to 32xN, run through the CRNN
|
|
54
|
+
* recognizer, CTC-decode the output.
|
|
55
|
+
*/
|
|
56
|
+
private recognizeCrop;
|
|
57
|
+
/** CTC greedy decoding. Blank index = 0. */
|
|
58
|
+
private ctcDecode;
|
|
59
|
+
dispose(): Promise<void>;
|
|
60
|
+
}
|
|
61
|
+
//# sourceMappingURL=ocr-service-doctr.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ocr-service-doctr.d.ts","sourceRoot":"","sources":["../src/ocr-service-doctr.ts"],"names":[],"mappings":"AA0BA,OAAO,KAAK,EAAe,SAAS,EAAE,MAAM,SAAS,CAAC;AAEtD,MAAM,WAAW,cAAc;IAC7B,mCAAmC;IACnC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,qCAAqC;IACrC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,yDAAyD;IACzD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,kDAAkD;IAClD,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,+DAA+D;IAC/D,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED;;;;;;;GAOG;AACH,wBAAgB,uBAAuB,IAAI,OAAO,CAKjD;AAED,qBAAa,eAAe;IAC1B,OAAO,CAAC,QAAQ,CAAC,GAAG,CAGH;IACjB,OAAO,CAAC,WAAW,CAA8B;IACjD,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,OAAO,CAAgB;gBAEnB,MAAM,GAAE,cAAmB;IAUvC;;;;OAIG;WACU,WAAW,CAAC,IAAI,CAAC,EAAE;QAC9B,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,GAAG,OAAO,CAAC,OAAO,CAAC;IAKpB,aAAa,IAAI,OAAO;IAIlB,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;YAOnB,WAAW;IA0BnB,WAAW,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC;IAmD1D,OAAO,CAAC,YAAY;IAWpB;;;;;;;OAOG;IACH,OAAO,CAAC,cAAc;IAoEtB;;;OAGG;YACW,aAAa;IAwC3B,4CAA4C;IAC5C,OAAO,CAAC,SAAS;IAwBX,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAO/B"}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Native Linux OCR-with-coords via the classic `tesseract` CLI (issue #9105 /
|
|
3
|
+
* M4).
|
|
4
|
+
*
|
|
5
|
+
* Zero LLM tokens, no in-repo model download, no ONNX — `tesseract` is a
|
|
6
|
+
* standalone C++ engine packaged by every Linux distro (`apt install
|
|
7
|
+
* tesseract-ocr`). We shell to it with `tsv` output, which emits one row per
|
|
8
|
+
* recognized element with a `level` column (1=page, 2=block, 3=para, 4=line,
|
|
9
|
+
* 5=word) plus per-element `left/top/width/height` boxes and a per-word `conf`.
|
|
10
|
+
* Output maps onto `OcrWithCoordsResult`, so this plugs straight into the
|
|
11
|
+
* `OcrWithCoordsService` registry seam and (via the M1 bridge) into
|
|
12
|
+
* plugin-computeruse's `CoordOcrProvider`.
|
|
13
|
+
*
|
|
14
|
+
* We read the word rows (`level == 5`), group them by their parent
|
|
15
|
+
* `(block, paragraph, line)` triple into one `OcrWithCoordsBlock` per text
|
|
16
|
+
* line (block bbox = union of its word rects), compute the semantic position
|
|
17
|
+
* against the source-tile thirds, and shift every bbox into display-absolute
|
|
18
|
+
* coordinates via `sourceX/sourceY` — the same shape as the Windows provider.
|
|
19
|
+
*
|
|
20
|
+
* Availability is feature-detected on the `tesseract` binary and cached for the
|
|
21
|
+
* process lifetime. When the binary is absent the provider reports unavailable
|
|
22
|
+
* and `describe()` returns empty blocks; it never throws so the boot chain
|
|
23
|
+
* falls through to the docTR ggml backend cleanly.
|
|
24
|
+
*/
|
|
25
|
+
import { type OcrWithCoordsInput, type OcrWithCoordsResult, type OcrWithCoordsService } from "./ocr-with-coords.js";
|
|
26
|
+
/** A single `level == 5` (word) row parsed from the tesseract TSV. */
|
|
27
|
+
interface TesseractWordRow {
|
|
28
|
+
readonly blockNum: number;
|
|
29
|
+
readonly parNum: number;
|
|
30
|
+
readonly lineNum: number;
|
|
31
|
+
readonly left: number;
|
|
32
|
+
readonly top: number;
|
|
33
|
+
readonly width: number;
|
|
34
|
+
readonly height: number;
|
|
35
|
+
/** Tesseract confidence in [0, 100]; `-1` for non-word rows (filtered out). */
|
|
36
|
+
readonly conf: number;
|
|
37
|
+
readonly text: string;
|
|
38
|
+
}
|
|
39
|
+
/**
|
|
40
|
+
* Parse the raw tesseract TSV into word rows. Pure — exported for tests so the
|
|
41
|
+
* column mapping has a single source of truth and CI never needs a real
|
|
42
|
+
* tesseract binary. Skips the header row, non-word levels, blank text, and any
|
|
43
|
+
* row with too few columns.
|
|
44
|
+
*/
|
|
45
|
+
export declare function parseTesseractTsv(tsv: string): TesseractWordRow[];
|
|
46
|
+
/**
|
|
47
|
+
* Pure mapper: raw tesseract TSV → `OcrWithCoordsResult`. Exported for
|
|
48
|
+
* cross-platform unit tests that inject a fixed TSV string (no real binary).
|
|
49
|
+
* Word rows are grouped by their `(block, paragraph, line)` triple — one
|
|
50
|
+
* `OcrWithCoordsBlock` per recognized text line — in first-seen order.
|
|
51
|
+
*/
|
|
52
|
+
export declare function mapTesseractTsvToResult(tsv: string, tileWidth: number, tileHeight: number, sourceX: number, sourceY: number): OcrWithCoordsResult;
|
|
53
|
+
/** Resolved tesseract invocation: the binary path + the env it must run under. */
|
|
54
|
+
export interface TesseractResolution {
|
|
55
|
+
bin: string;
|
|
56
|
+
/** Extra env merged onto `process.env` for child runs (LD_LIBRARY_PATH,
|
|
57
|
+
* TESSDATA_PREFIX) — non-empty only when a bundled tesseract is used. */
|
|
58
|
+
env: Record<string, string>;
|
|
59
|
+
}
|
|
60
|
+
/**
|
|
61
|
+
* Resolve a tesseract to run, so the OCR path "just ships and works" without a
|
|
62
|
+
* system `apt install tesseract-ocr` (#9105). Order:
|
|
63
|
+
*
|
|
64
|
+
* 1. `ELIZA_TESSERACT_BIN` — an explicit binary path (CI / power users).
|
|
65
|
+
* 2. A vendored bundle the app ships, found at
|
|
66
|
+
* `${ELIZA_VISION_VENDOR_DIR}/tesseract/{bin/tesseract, lib/*.so*,
|
|
67
|
+
* tessdata/<lang>.traineddata}`. The desktop build stages a portable
|
|
68
|
+
* tesseract there (binary + libtesseract/libleptonica + eng.traineddata);
|
|
69
|
+
* we then run it with that `lib/` on `LD_LIBRARY_PATH` and that `tessdata/`
|
|
70
|
+
* as `TESSDATA_PREFIX`, so no host install is needed.
|
|
71
|
+
* 3. `tesseract` on `PATH` (a system install, the legacy path).
|
|
72
|
+
*
|
|
73
|
+
* Pure-ish + exported for tests (it only reads env + the filesystem); cached for
|
|
74
|
+
* the process lifetime.
|
|
75
|
+
*/
|
|
76
|
+
export declare function resolveTesseract(): TesseractResolution;
|
|
77
|
+
/** Test-only: reset the cached availability + resolution probes between cases. */
|
|
78
|
+
export declare function _resetTesseractAvailabilityForTests(): void;
|
|
79
|
+
export declare class LinuxTesseractOcrService implements OcrWithCoordsService {
|
|
80
|
+
readonly name = "linux-tesseract";
|
|
81
|
+
static isAvailable(): boolean;
|
|
82
|
+
describe(input: OcrWithCoordsInput): Promise<OcrWithCoordsResult>;
|
|
83
|
+
}
|
|
84
|
+
export {};
|
|
85
|
+
//# sourceMappingURL=ocr-service-linux-tesseract.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ocr-service-linux-tesseract.d.ts","sourceRoot":"","sources":["../src/ocr-service-linux-tesseract.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAOH,OAAO,EAGL,KAAK,kBAAkB,EACvB,KAAK,mBAAmB,EACxB,KAAK,oBAAoB,EAE1B,MAAM,sBAAsB,CAAC;AAG9B,sEAAsE;AACtE,UAAU,gBAAgB;IACxB,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,+EAA+E;IAC/E,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAkBD;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,MAAM,GAAG,gBAAgB,EAAE,CA2BjE;AA4ED;;;;;GAKG;AACH,wBAAgB,uBAAuB,CACrC,GAAG,EAAE,MAAM,EACX,SAAS,EAAE,MAAM,EACjB,UAAU,EAAE,MAAM,EAClB,OAAO,EAAE,MAAM,EACf,OAAO,EAAE,MAAM,GACd,mBAAmB,CA4BrB;AAED,kFAAkF;AAClF,MAAM,WAAW,mBAAmB;IAClC,GAAG,EAAE,MAAM,CAAC;IACZ;6EACyE;IACzE,GAAG,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAC7B;AAED;;;;;;;;;;;;;;;GAeG;AACH,wBAAgB,gBAAgB,IAAI,mBAAmB,CA2BtD;AA2CD,kFAAkF;AAClF,wBAAgB,mCAAmC,IAAI,IAAI,CAG1D;AA8CD,qBAAa,wBAAyB,YAAW,oBAAoB;IACnE,QAAQ,CAAC,IAAI,qBAAqB;IAElC,MAAM,CAAC,WAAW,IAAI,OAAO;IAIvB,QAAQ,CAAC,KAAK,EAAE,kBAAkB,GAAG,OAAO,CAAC,mBAAmB,CAAC;CAwBxE"}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PaddleOCR / Paddle-Lite OCR-with-coords backend (issue #9581).
|
|
3
|
+
*
|
|
4
|
+
* The alternate coord-OCR provider beyond the shipped Tesseract + RapidOCR
|
|
5
|
+
* adapters. PaddleOCR is a standalone, cross-platform OCR engine (pip install
|
|
6
|
+
* paddleocr) with strong multilingual detection. We drive it through a small
|
|
7
|
+
* self-contained Python wrapper so the JS side parses a stable JSON shape we
|
|
8
|
+
* control — not PaddleOCR's version-sensitive raw `ocr.ocr()` return — and so
|
|
9
|
+
* the wrapper absorbs the numpy/tuple conversion and the 2.x detection layout
|
|
10
|
+
* (`[page][det] = [box4pts, (text, conf)]`).
|
|
11
|
+
*
|
|
12
|
+
* The wrapper emits one object per recognized text line:
|
|
13
|
+
* `[{ "box": [[x,y],[x,y],[x,y],[x,y]], "text": "...", "conf": 0.0..1.0 }, …]`
|
|
14
|
+
* PaddleOCR returns line-level (not word-level) detections, so each entry maps
|
|
15
|
+
* to one `OcrWithCoordsBlock` whose single word is the line; the block bbox is
|
|
16
|
+
* the axis-aligned hull of the (possibly rotated) detection quad, shifted into
|
|
17
|
+
* display-absolute coordinates via `sourceX/sourceY` — the same output shape as
|
|
18
|
+
* the Tesseract and Windows providers, so it plugs straight into the
|
|
19
|
+
* `OcrWithCoordsService` registry seam (and via the bridge into
|
|
20
|
+
* plugin-computeruse's `CoordOcrProvider`).
|
|
21
|
+
*
|
|
22
|
+
* Opt-in: this provider is only selected when `ELIZA_VISION_OCR_BACKEND` is
|
|
23
|
+
* `paddleocr`, so it never displaces a verified default provider. When PaddleOCR
|
|
24
|
+
* (or python3) is absent it reports unavailable and `describe()` returns empty
|
|
25
|
+
* blocks; it never throws, so the boot chain falls through cleanly.
|
|
26
|
+
*
|
|
27
|
+
* NOTE (#9581): the JSON parser below is unit-tested without the engine (CI
|
|
28
|
+
* needs no PaddleOCR install). End-to-end behaviour against a real PaddleOCR
|
|
29
|
+
* install still needs on-target verification before this becomes a default.
|
|
30
|
+
*/
|
|
31
|
+
import { type OcrWithCoordsInput, type OcrWithCoordsResult, type OcrWithCoordsService } from "./ocr-with-coords.js";
|
|
32
|
+
/** One detection from the wrapper's stable JSON: a quad + line text + score. */
|
|
33
|
+
interface PaddleOcrDetection {
|
|
34
|
+
readonly box: ReadonlyArray<readonly [number, number]>;
|
|
35
|
+
readonly text: string;
|
|
36
|
+
readonly conf: number;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Parse the wrapper's stable JSON into typed detections. Pure — exported for
|
|
40
|
+
* tests so the contract with `PADDLE_PY` has a single source of truth and CI
|
|
41
|
+
* never needs a real PaddleOCR install. Drops entries without a 4-point box,
|
|
42
|
+
* blank text, or a non-finite score.
|
|
43
|
+
*/
|
|
44
|
+
export declare function parsePaddleOcrJson(raw: string): PaddleOcrDetection[];
|
|
45
|
+
/**
|
|
46
|
+
* Pure mapper: wrapper JSON → `OcrWithCoordsResult`. Exported for unit tests
|
|
47
|
+
* that inject a fixed JSON string (no real engine). One block per detected line
|
|
48
|
+
* (PaddleOCR is line-level), in first-seen order; the single word is the line.
|
|
49
|
+
*/
|
|
50
|
+
export declare function mapPaddleOcrJsonToResult(raw: string, tileWidth: number, tileHeight: number, sourceX: number, sourceY: number): OcrWithCoordsResult;
|
|
51
|
+
/** Test-only: reset the cached availability probe between cases. */
|
|
52
|
+
export declare function _resetPaddleOcrAvailabilityForTests(): void;
|
|
53
|
+
export declare class PaddleOcrService implements OcrWithCoordsService {
|
|
54
|
+
readonly name = "paddleocr";
|
|
55
|
+
static isAvailable(): boolean;
|
|
56
|
+
describe(input: OcrWithCoordsInput): Promise<OcrWithCoordsResult>;
|
|
57
|
+
}
|
|
58
|
+
export {};
|
|
59
|
+
//# sourceMappingURL=ocr-service-paddleocr.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ocr-service-paddleocr.d.ts","sourceRoot":"","sources":["../src/ocr-service-paddleocr.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAOH,OAAO,EAGL,KAAK,kBAAkB,EACvB,KAAK,mBAAmB,EACxB,KAAK,oBAAoB,EAC1B,MAAM,sBAAsB,CAAC;AAG9B,gFAAgF;AAChF,UAAU,kBAAkB;IAC1B,QAAQ,CAAC,GAAG,EAAE,aAAa,CAAC,SAAS,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;IACvD,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAwDD;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,MAAM,GAAG,kBAAkB,EAAE,CAuCpE;AAED;;;;GAIG;AACH,wBAAgB,wBAAwB,CACtC,GAAG,EAAE,MAAM,EACX,SAAS,EAAE,MAAM,EACjB,UAAU,EAAE,MAAM,EAClB,OAAO,EAAE,MAAM,EACf,OAAO,EAAE,MAAM,GACd,mBAAmB,CA0BrB;AAgDD,oEAAoE;AACpE,wBAAgB,mCAAmC,IAAI,IAAI,CAE1D;AAmBD,qBAAa,gBAAiB,YAAW,oBAAoB;IAC3D,QAAQ,CAAC,IAAI,eAAe;IAE5B,MAAM,CAAC,WAAW,IAAI,OAAO;IAIvB,QAAQ,CAAC,KAAK,EAAE,kBAAkB,GAAG,OAAO,CAAC,mBAAmB,CAAC;CAwBxE"}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Native Windows OCR-with-coords via the built-in WinRT `Windows.Media.Ocr`
|
|
3
|
+
* engine (issue #9105 / M4a).
|
|
4
|
+
*
|
|
5
|
+
* Zero LLM tokens, no model download, NPU-accelerated where available. The
|
|
6
|
+
* WinRT projection is only reachable from Windows PowerShell 5.1 (`powershell`),
|
|
7
|
+
* not PowerShell 7 (`pwsh`), so we shell to `powershell` with an embedded
|
|
8
|
+
* script. Output is `OcrWithCoordsResult`, so this plugs straight into the
|
|
9
|
+
* `OcrWithCoordsService` registry seam and (via the M1 bridge) into
|
|
10
|
+
* plugin-computeruse's `CoordOcrProvider`.
|
|
11
|
+
*
|
|
12
|
+
* The engine returns text LINES, each with WORDS that carry bounding rects.
|
|
13
|
+
* We map each line to one `OcrWithCoordsBlock` (block bbox = union of its word
|
|
14
|
+
* rects), compute the semantic position against the source tile thirds, and
|
|
15
|
+
* shift every bbox into display-absolute coordinates via `sourceX/sourceY`.
|
|
16
|
+
*/
|
|
17
|
+
import { type OcrWithCoordsInput, type OcrWithCoordsResult, type OcrWithCoordsService } from "./ocr-with-coords.js";
|
|
18
|
+
/** Shape emitted by the embedded PowerShell script (parsed from stdout JSON). */
|
|
19
|
+
interface WinOcrRaw {
|
|
20
|
+
width: number;
|
|
21
|
+
height: number;
|
|
22
|
+
lines: Array<{
|
|
23
|
+
text: string;
|
|
24
|
+
words: Array<{
|
|
25
|
+
text: string;
|
|
26
|
+
x: number;
|
|
27
|
+
y: number;
|
|
28
|
+
width: number;
|
|
29
|
+
height: number;
|
|
30
|
+
}>;
|
|
31
|
+
}>;
|
|
32
|
+
}
|
|
33
|
+
/** Pure mapper (exported for cross-platform unit tests). */
|
|
34
|
+
export declare function mapWinOcrToResult(raw: WinOcrRaw, sourceX: number, sourceY: number): OcrWithCoordsResult;
|
|
35
|
+
export declare class WindowsMediaOcrService implements OcrWithCoordsService {
|
|
36
|
+
readonly name = "windows-media-ocr";
|
|
37
|
+
static isAvailable(): boolean;
|
|
38
|
+
describe(input: OcrWithCoordsInput): Promise<OcrWithCoordsResult>;
|
|
39
|
+
}
|
|
40
|
+
export {};
|
|
41
|
+
//# sourceMappingURL=ocr-service-windows.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ocr-service-windows.d.ts","sourceRoot":"","sources":["../src/ocr-service-windows.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAQH,OAAO,EAGL,KAAK,kBAAkB,EACvB,KAAK,mBAAmB,EACxB,KAAK,oBAAoB,EAE1B,MAAM,sBAAsB,CAAC;AAG9B,iFAAiF;AACjF,UAAU,SAAS;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,KAAK,CAAC;QACX,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,KAAK,CAAC;YACX,IAAI,EAAE,MAAM,CAAC;YACb,CAAC,EAAE,MAAM,CAAC;YACV,CAAC,EAAE,MAAM,CAAC;YACV,KAAK,EAAE,MAAM,CAAC;YACd,MAAM,EAAE,MAAM,CAAC;SAChB,CAAC,CAAC;KACJ,CAAC,CAAC;CACJ;AA+JD,4DAA4D;AAC5D,wBAAgB,iBAAiB,CAC/B,GAAG,EAAE,SAAS,EACd,OAAO,EAAE,MAAM,EACf,OAAO,EAAE,MAAM,GACd,mBAAmB,CAOrB;AAED,qBAAa,sBAAuB,YAAW,oBAAoB;IACjE,QAAQ,CAAC,IAAI,uBAAuB;IAEpC,MAAM,CAAC,WAAW,IAAI,OAAO;IAIvB,QAAQ,CAAC,KAAK,EAAE,kBAAkB,GAAG,OAAO,CAAC,mBAAmB,CAAC;CAyCxE"}
|