@elizaos/plugin-computeruse 2.0.0-beta.1 → 2.0.3-beta.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +80 -0
- package/dist/actions/clipboard.d.ts +22 -0
- package/dist/actions/clipboard.d.ts.map +1 -0
- package/dist/actions/progress.d.ts +26 -0
- package/dist/actions/progress.d.ts.map +1 -0
- package/dist/actions/use-computer-agent.d.ts +113 -0
- package/dist/actions/use-computer-agent.d.ts.map +1 -0
- package/dist/actions/use-computer.d.ts.map +1 -1
- package/dist/actions/window-handlers.d.ts +11 -0
- package/dist/actions/window-handlers.d.ts.map +1 -0
- package/dist/actions/window.d.ts +11 -0
- package/dist/actions/window.d.ts.map +1 -0
- package/dist/actor/actor.d.ts +84 -0
- package/dist/actor/actor.d.ts.map +1 -0
- package/dist/actor/agent-callbacks.d.ts +128 -0
- package/dist/actor/agent-callbacks.d.ts.map +1 -0
- package/dist/actor/agent-loop.d.ts +134 -0
- package/dist/actor/agent-loop.d.ts.map +1 -0
- package/dist/actor/aosp-input-actor.d.ts +87 -0
- package/dist/actor/aosp-input-actor.d.ts.map +1 -0
- package/dist/actor/brain.d.ts +195 -0
- package/dist/actor/brain.d.ts.map +1 -0
- package/dist/actor/cascade.d.ts +92 -0
- package/dist/actor/cascade.d.ts.map +1 -0
- package/dist/actor/computer-interface.d.ts +276 -0
- package/dist/actor/computer-interface.d.ts.map +1 -0
- package/dist/actor/dispatch.d.ts +24 -0
- package/dist/actor/dispatch.d.ts.map +1 -0
- package/dist/actor/index.d.ts +12 -0
- package/dist/actor/index.d.ts.map +1 -0
- package/dist/actor/types.d.ts +94 -0
- package/dist/actor/types.d.ts.map +1 -0
- package/dist/approval-manager.d.ts.map +1 -1
- package/dist/index.d.ts +19 -6
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +12001 -5484
- package/dist/index.js.map +59 -25
- package/dist/mcp/index.d.ts +8 -0
- package/dist/mcp/index.d.ts.map +1 -0
- package/dist/mcp/server.d.ts +42 -0
- package/dist/mcp/server.d.ts.map +1 -0
- package/dist/mcp/tools.d.ts +53 -0
- package/dist/mcp/tools.d.ts.map +1 -0
- package/dist/mobile/android-bridge.d.ts +263 -0
- package/dist/mobile/android-bridge.d.ts.map +1 -0
- package/dist/mobile/android-scene.d.ts +52 -0
- package/dist/mobile/android-scene.d.ts.map +1 -0
- package/dist/mobile/android-trajectory.d.ts +66 -0
- package/dist/mobile/android-trajectory.d.ts.map +1 -0
- package/dist/mobile/index.d.ts +19 -0
- package/dist/mobile/index.d.ts.map +1 -0
- package/dist/mobile/ios-app-intent-registry.d.ts +20 -0
- package/dist/mobile/ios-app-intent-registry.d.ts.map +1 -0
- package/dist/mobile/ios-bridge.d.ts +359 -0
- package/dist/mobile/ios-bridge.d.ts.map +1 -0
- package/dist/mobile/ios-computer-interface.d.ts +160 -0
- package/dist/mobile/ios-computer-interface.d.ts.map +1 -0
- package/dist/mobile/mobile-computer-interface.d.ts +142 -0
- package/dist/mobile/mobile-computer-interface.d.ts.map +1 -0
- package/dist/mobile/mobile-screen-capture.d.ts +64 -0
- package/dist/mobile/mobile-screen-capture.d.ts.map +1 -0
- package/dist/mobile/ocr-provider.d.ts +187 -0
- package/dist/mobile/ocr-provider.d.ts.map +1 -0
- package/dist/mobile/ocr-provider.js +111 -0
- package/dist/mobile/ocr-provider.js.map +10 -0
- package/dist/osworld/action-converter.d.ts +4 -1
- package/dist/osworld/action-converter.d.ts.map +1 -1
- package/dist/osworld/adapter.d.ts +1 -0
- package/dist/osworld/adapter.d.ts.map +1 -1
- package/dist/parity/index.d.ts +9 -0
- package/dist/parity/index.d.ts.map +1 -0
- package/dist/parity/parity-matrix.d.ts +82 -0
- package/dist/parity/parity-matrix.d.ts.map +1 -0
- package/dist/parity/screenspot.d.ts +56 -0
- package/dist/parity/screenspot.d.ts.map +1 -0
- package/dist/platform/a11y.d.ts +29 -1
- package/dist/platform/a11y.d.ts.map +1 -1
- package/dist/platform/browser.d.ts +1 -1
- package/dist/platform/browser.d.ts.map +1 -1
- package/dist/platform/capabilities.d.ts +23 -0
- package/dist/platform/capabilities.d.ts.map +1 -1
- package/dist/platform/capture.d.ts +65 -0
- package/dist/platform/capture.d.ts.map +1 -0
- package/dist/platform/clipboard.d.ts +24 -0
- package/dist/platform/clipboard.d.ts.map +1 -0
- package/dist/platform/coords.d.ts +73 -0
- package/dist/platform/coords.d.ts.map +1 -0
- package/dist/platform/desktop.d.ts +23 -0
- package/dist/platform/desktop.d.ts.map +1 -1
- package/dist/platform/displays.d.ts +97 -0
- package/dist/platform/displays.d.ts.map +1 -0
- package/dist/platform/driver.d.ts +22 -0
- package/dist/platform/driver.d.ts.map +1 -1
- package/dist/platform/file-ops.d.ts +17 -0
- package/dist/platform/file-ops.d.ts.map +1 -1
- package/dist/platform/helpers.d.ts +2 -3
- package/dist/platform/helpers.d.ts.map +1 -1
- package/dist/platform/launch.d.ts +54 -0
- package/dist/platform/launch.d.ts.map +1 -0
- package/dist/platform/normalized-coords.d.ts +46 -0
- package/dist/platform/normalized-coords.d.ts.map +1 -0
- package/dist/platform/nut-driver.d.ts +67 -0
- package/dist/platform/nut-driver.d.ts.map +1 -1
- package/dist/platform/permissions.d.ts +12 -0
- package/dist/platform/permissions.d.ts.map +1 -1
- package/dist/platform/process-list.d.ts +32 -0
- package/dist/platform/process-list.d.ts.map +1 -0
- package/dist/platform/ps-host.d.ts +77 -0
- package/dist/platform/ps-host.d.ts.map +1 -0
- package/dist/platform/screenshot-errors.d.ts +54 -0
- package/dist/platform/screenshot-errors.d.ts.map +1 -0
- package/dist/platform/screenshot-quality.d.ts +11 -0
- package/dist/platform/screenshot-quality.d.ts.map +1 -0
- package/dist/platform/screenshot.d.ts.map +1 -1
- package/dist/platform/security.d.ts +8 -0
- package/dist/platform/security.d.ts.map +1 -1
- package/dist/platform/wayland-portal.d.ts +25 -0
- package/dist/platform/wayland-portal.d.ts.map +1 -0
- package/dist/platform/windows-list.d.ts +43 -1
- package/dist/platform/windows-list.d.ts.map +1 -1
- package/dist/providers/computer-state.d.ts.map +1 -1
- package/dist/providers/scene.d.ts +21 -0
- package/dist/providers/scene.d.ts.map +1 -0
- package/dist/register-routes.js +11715 -4990
- package/dist/register-routes.js.map +61 -24
- package/dist/routes/computer-use-compat-routes.d.ts +1 -1
- package/dist/routes/computer-use-compat-routes.d.ts.map +1 -1
- package/dist/sandbox/docker-backend.d.ts +69 -0
- package/dist/sandbox/docker-backend.d.ts.map +1 -0
- package/dist/sandbox/index.d.ts +62 -0
- package/dist/sandbox/index.d.ts.map +1 -0
- package/dist/sandbox/qemu-backend.d.ts +48 -0
- package/dist/sandbox/qemu-backend.d.ts.map +1 -0
- package/dist/sandbox/remote-guest.d.ts +72 -0
- package/dist/sandbox/remote-guest.d.ts.map +1 -0
- package/dist/sandbox/sandbox-driver.d.ts +41 -0
- package/dist/sandbox/sandbox-driver.d.ts.map +1 -0
- package/dist/sandbox/surface-types.d.ts +17 -0
- package/dist/sandbox/surface-types.d.ts.map +1 -0
- package/dist/sandbox/types.d.ts +138 -0
- package/dist/sandbox/types.d.ts.map +1 -0
- package/dist/sandbox/wsb-backend.d.ts +48 -0
- package/dist/sandbox/wsb-backend.d.ts.map +1 -0
- package/dist/scene/a11y-provider.d.ts +83 -0
- package/dist/scene/a11y-provider.d.ts.map +1 -0
- package/dist/scene/apps.d.ts +39 -0
- package/dist/scene/apps.d.ts.map +1 -0
- package/dist/scene/dhash.d.ts +105 -0
- package/dist/scene/dhash.d.ts.map +1 -0
- package/dist/scene/ocr-adapter.d.ts +64 -0
- package/dist/scene/ocr-adapter.d.ts.map +1 -0
- package/dist/scene/scene-builder.d.ts +107 -0
- package/dist/scene/scene-builder.d.ts.map +1 -0
- package/dist/scene/scene-types.d.ts +70 -0
- package/dist/scene/scene-types.d.ts.map +1 -0
- package/dist/scene/screen-state.d.ts +105 -0
- package/dist/scene/screen-state.d.ts.map +1 -0
- package/dist/scene/serialize.d.ts +28 -0
- package/dist/scene/serialize.d.ts.map +1 -0
- package/dist/security/browser-script-policy.d.ts +9 -0
- package/dist/security/browser-script-policy.d.ts.map +1 -0
- package/dist/services/computer-use-service.d.ts +78 -2
- package/dist/services/computer-use-service.d.ts.map +1 -1
- package/dist/services/index.d.ts +7 -0
- package/dist/services/index.d.ts.map +1 -0
- package/dist/services/vision-context-provider.d.ts +32 -0
- package/dist/services/vision-context-provider.d.ts.map +1 -0
- package/dist/types.d.ts +115 -5
- package/dist/types.d.ts.map +1 -1
- package/package.json +47 -10
- package/registry-entry.json +74 -0
- package/dist/actions/desktop-handlers.d.ts +0 -20
- package/dist/actions/desktop-handlers.d.ts.map +0 -1
- package/dist/actions/desktop.d.ts +0 -11
- package/dist/actions/desktop.d.ts.map +0 -1
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WS7 ↔ WS8 — `MobileComputerInterface` adapts the WS7 `ComputerInterface`
|
|
3
|
+
* port to the Android `AndroidComputerUseBridge` so the cascade + dispatcher
|
|
4
|
+
* run unchanged on mobile.
|
|
5
|
+
*
|
|
6
|
+
* Mapping (display-local pixel coords → Android screen-pixel gestures):
|
|
7
|
+
*
|
|
8
|
+
* leftClick({x, y}) → dispatchGesture({type:"tap", x, y})
|
|
9
|
+
* doubleClick({x, y}) → dispatchGesture(tap) twice (no native double-tap)
|
|
10
|
+
* rightClick({x, y}) → dispatchGesture(tap) [no right-click on Android;
|
|
11
|
+
* cascade should prefer longClick semantics via
|
|
12
|
+
* the AX node's `longClick` action]
|
|
13
|
+
* dragTo({x, y}) / drag(path)→ dispatchGesture({type:"swipe", x1,y1,x2,y2})
|
|
14
|
+
* scroll({x, y, dx, dy}) → dispatchGesture(swipe) anchored at (x, y),
|
|
15
|
+
* direction inverted (scrolling DOWN visually
|
|
16
|
+
* means swiping UP physically)
|
|
17
|
+
* pressKey({key:"back"}) → performGlobalAction("back")
|
|
18
|
+
* pressKey({key:"home"}) → performGlobalAction("home")
|
|
19
|
+
* hotkey → not supported on Android; throws
|
|
20
|
+
* typeText → setText({text}) against the focused editable
|
|
21
|
+
* AccessibilityNodeInfo.
|
|
22
|
+
*
|
|
23
|
+
* `getScreenSize`, `getCursorPosition`, and the coord-conversion helpers
|
|
24
|
+
* keep their desktop behavior — they're metadata calls, not input.
|
|
25
|
+
*
|
|
26
|
+
* Errors:
|
|
27
|
+
* Every method that calls the bridge propagates `ok:false` as a thrown
|
|
28
|
+
* `Error` whose message carries the `code` + `message`. The WS7 dispatcher
|
|
29
|
+
* maps that to `ActionResult.error.driver_error`, exactly like desktop.
|
|
30
|
+
*/
|
|
31
|
+
import type { ComputerInterface, CursorPosition, DisplayPoint, DragPath, MouseButton, ScreenshotResult, ScrollDelta } from "../actor/computer-interface.js";
|
|
32
|
+
import type { Scene, SceneAxNode } from "../scene/scene-types.js";
|
|
33
|
+
import type { DisplayDescriptor } from "../types.js";
|
|
34
|
+
import type { AndroidComputerUseBridge } from "./android-bridge.js";
|
|
35
|
+
export interface MobileComputerInterfaceDeps {
|
|
36
|
+
/** Capacitor plugin handle (null when off-platform). */
|
|
37
|
+
getBridge: () => AndroidComputerUseBridge | null;
|
|
38
|
+
/** Latest scene accessor — used for `getAccessibilityTree`. */
|
|
39
|
+
getScene?: () => Scene | null;
|
|
40
|
+
/** Display descriptor; mobile devices have exactly one. */
|
|
41
|
+
getDisplay?: () => DisplayDescriptor;
|
|
42
|
+
/** Internal cursor-position state, mostly for tests. */
|
|
43
|
+
cursorState?: {
|
|
44
|
+
current: CursorPosition;
|
|
45
|
+
};
|
|
46
|
+
/**
|
|
47
|
+
* Override fetched screenshot bytes. Defaults to a one-shot
|
|
48
|
+
* `bridge.captureFrame()` so the cascade's pull contract just works.
|
|
49
|
+
*/
|
|
50
|
+
decodeJpeg?: (b64: string) => Buffer;
|
|
51
|
+
/**
|
|
52
|
+
* Map a WS7 key name (`"back"`, `"home"`, `"recents"`, `"notifications"`)
|
|
53
|
+
* to an Android `performGlobalAction` invocation. Other keys throw —
|
|
54
|
+
* Android has no equivalent of arbitrary keystrokes from a non-system app.
|
|
55
|
+
*/
|
|
56
|
+
globalActionMap?: ReadonlyMap<string, "back" | "home" | "recents" | "notifications">;
|
|
57
|
+
}
|
|
58
|
+
export declare class MobileComputerInterface implements ComputerInterface {
|
|
59
|
+
private readonly deps;
|
|
60
|
+
private readonly cursorState;
|
|
61
|
+
constructor(deps: MobileComputerInterfaceDeps);
|
|
62
|
+
screenshot(opts?: {
|
|
63
|
+
displayId?: number;
|
|
64
|
+
}): Promise<ScreenshotResult>;
|
|
65
|
+
mouseDown(point: DisplayPoint & {
|
|
66
|
+
button?: MouseButton;
|
|
67
|
+
}): Promise<void>;
|
|
68
|
+
mouseUp(point: DisplayPoint & {
|
|
69
|
+
button?: MouseButton;
|
|
70
|
+
}): Promise<void>;
|
|
71
|
+
leftClick(point: DisplayPoint): Promise<void>;
|
|
72
|
+
rightClick(point: DisplayPoint): Promise<void>;
|
|
73
|
+
doubleClick(point: DisplayPoint): Promise<void>;
|
|
74
|
+
moveCursor(point: DisplayPoint): Promise<void>;
|
|
75
|
+
dragTo(point: DisplayPoint): Promise<void>;
|
|
76
|
+
drag(path: DragPath): Promise<void>;
|
|
77
|
+
keyDown(args: {
|
|
78
|
+
key: string;
|
|
79
|
+
}): Promise<void>;
|
|
80
|
+
keyUp(_args: {
|
|
81
|
+
key: string;
|
|
82
|
+
}): Promise<void>;
|
|
83
|
+
typeText(args: {
|
|
84
|
+
text: string;
|
|
85
|
+
}): Promise<void>;
|
|
86
|
+
pressKey(args: {
|
|
87
|
+
key: string;
|
|
88
|
+
}): Promise<void>;
|
|
89
|
+
hotkey(_args: {
|
|
90
|
+
keys: string[];
|
|
91
|
+
}): Promise<void>;
|
|
92
|
+
scroll(delta: ScrollDelta): Promise<void>;
|
|
93
|
+
scrollUp(args: {
|
|
94
|
+
displayId: number;
|
|
95
|
+
clicks: number;
|
|
96
|
+
}): Promise<void>;
|
|
97
|
+
scrollDown(args: {
|
|
98
|
+
displayId: number;
|
|
99
|
+
clicks: number;
|
|
100
|
+
}): Promise<void>;
|
|
101
|
+
getScreenSize(_args: {
|
|
102
|
+
displayId: number;
|
|
103
|
+
}): {
|
|
104
|
+
w: number;
|
|
105
|
+
h: number;
|
|
106
|
+
};
|
|
107
|
+
getCursorPosition(): CursorPosition;
|
|
108
|
+
toScreenCoordinates(args: {
|
|
109
|
+
displayId: number;
|
|
110
|
+
imgX: number;
|
|
111
|
+
imgY: number;
|
|
112
|
+
imgW: number;
|
|
113
|
+
imgH: number;
|
|
114
|
+
}): {
|
|
115
|
+
x: number;
|
|
116
|
+
y: number;
|
|
117
|
+
};
|
|
118
|
+
toScreenshotCoordinates(args: {
|
|
119
|
+
displayId: number;
|
|
120
|
+
x: number;
|
|
121
|
+
y: number;
|
|
122
|
+
imgW: number;
|
|
123
|
+
imgH: number;
|
|
124
|
+
}): {
|
|
125
|
+
imgX: number;
|
|
126
|
+
imgY: number;
|
|
127
|
+
};
|
|
128
|
+
getAccessibilityTree(args: {
|
|
129
|
+
displayId?: number;
|
|
130
|
+
}): SceneAxNode[];
|
|
131
|
+
private dispatchTap;
|
|
132
|
+
private dispatchSwipe;
|
|
133
|
+
private moveTracker;
|
|
134
|
+
private requireBridge;
|
|
135
|
+
private requireDisplayId;
|
|
136
|
+
private requireFiniteCoords;
|
|
137
|
+
private resolveGlobalAction;
|
|
138
|
+
private getDisplay;
|
|
139
|
+
}
|
|
140
|
+
/** Convenience factory. */
|
|
141
|
+
export declare function makeMobileComputerInterface(deps: MobileComputerInterfaceDeps): ComputerInterface;
|
|
142
|
+
//# sourceMappingURL=mobile-computer-interface.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mobile-computer-interface.d.ts","sourceRoot":"","sources":["../../src/mobile/mobile-computer-interface.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAGH,OAAO,KAAK,EACV,iBAAiB,EACjB,cAAc,EACd,YAAY,EACZ,QAAQ,EACR,WAAW,EACX,gBAAgB,EAChB,WAAW,EACZ,MAAM,gCAAgC,CAAC;AACxC,OAAO,KAAK,EAAE,KAAK,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AAClE,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AACrD,OAAO,KAAK,EAEV,wBAAwB,EAEzB,MAAM,qBAAqB,CAAC;AAM7B,MAAM,WAAW,2BAA2B;IAC1C,wDAAwD;IACxD,SAAS,EAAE,MAAM,wBAAwB,GAAG,IAAI,CAAC;IACjD,+DAA+D;IAC/D,QAAQ,CAAC,EAAE,MAAM,KAAK,GAAG,IAAI,CAAC;IAC9B,2DAA2D;IAC3D,UAAU,CAAC,EAAE,MAAM,iBAAiB,CAAC;IACrC,wDAAwD;IACxD,WAAW,CAAC,EAAE;QAAE,OAAO,EAAE,cAAc,CAAA;KAAE,CAAC;IAC1C;;;OAGG;IACH,UAAU,CAAC,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,MAAM,CAAC;IACrC;;;;OAIG;IACH,eAAe,CAAC,EAAE,WAAW,CAC3B,MAAM,EACN,MAAM,GAAG,MAAM,GAAG,SAAS,GAAG,eAAe,CAC9C,CAAC;CACH;AAED,qBAAa,uBAAwB,YAAW,iBAAiB;IAC/D,OAAO,CAAC,QAAQ,CAAC,IAAI,CAA8B;IACnD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAA8B;gBAE9C,IAAI,EAAE,2BAA2B;IASvC,UAAU,CACd,IAAI,GAAE;QAAE,SAAS,CAAC,EAAE,MAAM,CAAA;KAAO,GAChC,OAAO,CAAC,gBAAgB,CAAC;IAiBtB,SAAS,CACb,KAAK,EAAE,YAAY,GAAG;QAAE,MAAM,CAAC,EAAE,WAAW,CAAA;KAAE,GAC7C,OAAO,CAAC,IAAI,CAAC;IAOV,OAAO,CAAC,KAAK,EAAE,YAAY,GAAG;QAAE,MAAM,CAAC,EAAE,WAAW,CAAA;KAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAItE,SAAS,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC;IAI7C,UAAU,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC;IAO9C,WAAW,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC;IAQ/C,UAAU,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC;IAM9C,MAAM,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC;IAc1C,IAAI,CAAC,IAAI,EAAE,QAAQ,GAAG,OAAO,CAAC,IAAI,CAAC;IA0BnC,OAAO,CAAC,IAAI,EAAE;QAAE,GAAG,EAAE,MAAM,CAAA;KAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7C,KAAK,CAAC,KAAK,EAAE;QAAE,GAAG,EAAE,MAAM,CAAA;KAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAI5C,QAAQ,CAAC,IAAI,EAAE;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAU/C,QAAQ,CAAC,IAAI,EAAE;QAAE,GAAG,EAAE,MAAM,CAAA;KAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAc9C,MAAM,CAAC,KAAK,EAAE;QAAE,IAAI,
EAAE,MAAM,EAAE,CAAA;KAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAQhD,MAAM,CAAC,KAAK,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC;IA+BzC,QAAQ,CAAC,IAAI,EAAE;QAAE,SAAS,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAapE,UAAU,CAAC,IAAI,EAAE;QAAE,SAAS,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAe5E,aAAa,CAAC,KAAK,EAAE;QAAE,SAAS,EAAE,MAAM,CAAA;KAAE,GAAG;QAAE,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE;IAKrE,iBAAiB,IAAI,cAAc;IAInC,mBAAmB,CAAC,IAAI,EAAE;QACxB,SAAS,EAAE,MAAM,CAAC;QAClB,IAAI,EAAE,MAAM,CAAC;QACb,IAAI,EAAE,MAAM,CAAC;QACb,IAAI,EAAE,MAAM,CAAC;QACb,IAAI,EAAE,MAAM,CAAC;KACd,GAAG;QAAE,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE;IAgB5B,uBAAuB,CAAC,IAAI,EAAE;QAC5B,SAAS,EAAE,MAAM,CAAC;QAClB,CAAC,EAAE,MAAM,CAAC;QACV,CAAC,EAAE,MAAM,CAAC;QACV,IAAI,EAAE,MAAM,CAAC;QACb,IAAI,EAAE,MAAM,CAAC;KACd,GAAG;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE;IAgBlC,oBAAoB,CAAC,IAAI,EAAE;QAAE,SAAS,CAAC,EAAE,MAAM,CAAA;KAAE,GAAG,WAAW,EAAE;YASnD,WAAW;YASX,aAAa;IAoB3B,OAAO,CAAC,WAAW;IAMnB,OAAO,CAAC,aAAa;IAUrB,OAAO,CAAC,gBAAgB;IASxB,OAAO,CAAC,mBAAmB;IAQ3B,OAAO,CAAC,mBAAmB;IAsB3B,OAAO,CAAC,UAAU;CAUnB;AAED,2BAA2B;AAC3B,wBAAgB,2BAA2B,CACzC,IAAI,EAAE,2BAA2B,GAChC,iBAAiB,CAEnB"}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WS8 — `MobileScreenCaptureSource` adapts the Android `captureFrame()`
|
|
3
|
+
* bridge call into a WS5-shape `DisplayCapture` so the WS7 cascade can run
|
|
4
|
+
* unmodified on Android. iOS uses ReplayKit and is treated separately; this
|
|
5
|
+
* module focuses on Android because that's where the consumer-build cascade
|
|
6
|
+
* actually runs.
|
|
7
|
+
*
|
|
8
|
+
* Contract:
|
|
9
|
+
* - `captureDisplay(displayId?)` returns the latest screen frame as
|
|
10
|
+
* `{ display, frame }` — the frame is the decoded JPEG byte buffer
|
|
11
|
+
* exactly as the Kotlin ImageReader pipeline emits it. The Brain encodes
|
|
12
|
+
* it as a `data:image/jpeg;base64,...` URL; downstream model adapters
|
|
13
|
+
* do not require PNG specifically, only "image".
|
|
14
|
+
* - `captureAllDisplays()` returns a single-element array since mobile
|
|
15
|
+
* devices report one logical display. Multi-display Android (DeX,
|
|
16
|
+
* foldables in two-pane mode) is out of scope for v1.
|
|
17
|
+
* - On bridge `ok:false`, the call rejects with an `Error` whose message
|
|
18
|
+
* carries the bridge's `code` and `message`. The cascade surfaces this
|
|
19
|
+
* through `safeCapture` so the agent loop reports it as
|
|
20
|
+
* `reason: "error"`.
|
|
21
|
+
*
|
|
22
|
+
* The capture path is intentionally pull-based — Kotlin keeps the latest
|
|
23
|
+
* frame in a ring-buffer, and this module just drains it. The fps knob is
|
|
24
|
+
* set when MediaProjection was started; this module doesn't manage it.
|
|
25
|
+
*/
|
|
26
|
+
import type { DisplayCapture } from "../platform/capture.js";
|
|
27
|
+
import type { DisplayDescriptor } from "../types.js";
|
|
28
|
+
import type { AndroidComputerUseBridge, CapturedScreenFrame } from "./android-bridge.js";
|
|
29
|
+
export declare const ANDROID_LOGICAL_DISPLAY_ID: 0;
|
|
30
|
+
export interface MobileScreenCaptureSourceDeps {
|
|
31
|
+
/** Returns the Capacitor `ComputerUse` plugin handle, or null when off-platform. */
|
|
32
|
+
getBridge: () => AndroidComputerUseBridge | null;
|
|
33
|
+
/**
|
|
34
|
+
* Override the display descriptor — primarily for tests. In production,
|
|
35
|
+
* the descriptor is derived from the captured frame's width/height (the
|
|
36
|
+
* device only knows its own size).
|
|
37
|
+
*/
|
|
38
|
+
getDisplay?: (frame: CapturedScreenFrame | null) => DisplayDescriptor;
|
|
39
|
+
/**
|
|
40
|
+
* Override how raw JPEG bytes are produced from the base64 payload. The
|
|
41
|
+
* default uses Node's `Buffer.from(..., "base64")`; tests can inject a
|
|
42
|
+
* decoder when running in a browser-only context.
|
|
43
|
+
*/
|
|
44
|
+
decodeJpeg?: (jpegBase64: string) => Buffer;
|
|
45
|
+
}
|
|
46
|
+
/**
|
|
47
|
+
* Pull-based capture source for Android MediaProjection. Mirrors the public
|
|
48
|
+
* surface of `captureAllDisplays` / `captureDisplay` from
|
|
49
|
+
* `platform/capture.ts` so the WS7 cascade can substitute it via DI.
|
|
50
|
+
*/
|
|
51
|
+
export declare class MobileScreenCaptureSource {
|
|
52
|
+
private readonly deps;
|
|
53
|
+
constructor(deps: MobileScreenCaptureSourceDeps);
|
|
54
|
+
/**
|
|
55
|
+
* Drain the latest frame for the (single) logical Android display.
|
|
56
|
+
* `displayId` is honored only when it equals `ANDROID_LOGICAL_DISPLAY_ID`;
|
|
57
|
+
* any other id throws — mirrors `captureDisplay`'s unknown-display
|
|
58
|
+
* behavior on desktop.
|
|
59
|
+
*/
|
|
60
|
+
captureDisplay(displayId?: number): Promise<DisplayCapture>;
|
|
61
|
+
/** Single-element array — mobile devices have one logical display. */
|
|
62
|
+
captureAllDisplays(): Promise<DisplayCapture[]>;
|
|
63
|
+
}
|
|
64
|
+
//# sourceMappingURL=mobile-screen-capture.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mobile-screen-capture.d.ts","sourceRoot":"","sources":["../../src/mobile/mobile-screen-capture.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AAC7D,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AACrD,OAAO,KAAK,EACV,wBAAwB,EACxB,mBAAmB,EACpB,MAAM,qBAAqB,CAAC;AAE7B,eAAO,MAAM,0BAA0B,EAAG,CAAU,CAAC;AAErD,MAAM,WAAW,6BAA6B;IAC5C,oFAAoF;IACpF,SAAS,EAAE,MAAM,wBAAwB,GAAG,IAAI,CAAC;IACjD;;;;OAIG;IACH,UAAU,CAAC,EAAE,CAAC,KAAK,EAAE,mBAAmB,GAAG,IAAI,KAAK,iBAAiB,CAAC;IACtE;;;;OAIG;IACH,UAAU,CAAC,EAAE,CAAC,UAAU,EAAE,MAAM,KAAK,MAAM,CAAC;CAC7C;AAED;;;;GAIG;AACH,qBAAa,yBAAyB;IACxB,OAAO,CAAC,QAAQ,CAAC,IAAI;gBAAJ,IAAI,EAAE,6BAA6B;IAEhE;;;;;OAKG;IACG,cAAc,CAClB,SAAS,GAAE,MAAmC,GAC7C,OAAO,CAAC,cAAc,CAAC;IA2B1B,sEAAsE;IAChE,kBAAkB,IAAI,OAAO,CAAC,cAAc,EAAE,CAAC;CAGtD"}
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OCR provider chain — used by WS6 (scene-builder) and any other consumer
|
|
3
|
+
* that needs text-from-image extraction.
|
|
4
|
+
*
|
|
5
|
+
* Defined here in plugin-computeruse so the mobile bridge can publish the
|
|
6
|
+
* iOS Apple Vision implementation alongside the rest of the iOS surface.
|
|
7
|
+
* WS6 will register additional providers (cloud, Tesseract fallback) and
|
|
8
|
+
* pick a provider per-call via `selectOcrProvider`.
|
|
9
|
+
*
|
|
10
|
+
* Contract:
|
|
11
|
+
* - `name` : stable string id for routing/telemetry.
|
|
12
|
+
* - `priority` : higher wins when multiple providers report `available`.
|
|
13
|
+
* - `available()` : cheap synchronous availability probe; the registry
|
|
14
|
+
* caches the result for the process lifetime.
|
|
15
|
+
* - `recognize()` : async OCR call. Throws on hard failures so callers can
|
|
16
|
+
* fall back to the next provider; never returns empty
|
|
17
|
+
* lines silently.
|
|
18
|
+
*/
|
|
19
|
+
import type { IosComputerUseBridge } from "./ios-bridge.js";
|
|
20
|
+
export interface OcrLine {
|
|
21
|
+
readonly text: string;
|
|
22
|
+
readonly confidence: number;
|
|
23
|
+
readonly boundingBox: {
|
|
24
|
+
readonly x: number;
|
|
25
|
+
readonly y: number;
|
|
26
|
+
readonly width: number;
|
|
27
|
+
readonly height: number;
|
|
28
|
+
};
|
|
29
|
+
}
|
|
30
|
+
export interface OcrResult {
|
|
31
|
+
readonly lines: readonly OcrLine[];
|
|
32
|
+
readonly fullText: string;
|
|
33
|
+
readonly elapsedMs: number;
|
|
34
|
+
readonly providerName: string;
|
|
35
|
+
readonly languagesUsed: readonly string[];
|
|
36
|
+
}
|
|
37
|
+
export interface OcrRecognizeOptions {
|
|
38
|
+
readonly languages?: readonly string[];
|
|
39
|
+
readonly recognitionLevel?: "fast" | "accurate";
|
|
40
|
+
readonly minimumTextHeight?: number;
|
|
41
|
+
}
|
|
42
|
+
export interface OcrProvider {
|
|
43
|
+
readonly name: string;
|
|
44
|
+
readonly priority: number;
|
|
45
|
+
available(): boolean;
|
|
46
|
+
recognize(input: OcrInput, options?: OcrRecognizeOptions): Promise<OcrResult>;
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Image input. Either a base64-encoded PNG/JPEG or raw bytes. The provider
|
|
50
|
+
* is responsible for normalizing into whatever its native side expects.
|
|
51
|
+
*/
|
|
52
|
+
export type OcrInput = {
|
|
53
|
+
readonly kind: "base64";
|
|
54
|
+
readonly data: string;
|
|
55
|
+
} | {
|
|
56
|
+
readonly kind: "bytes";
|
|
57
|
+
readonly data: Uint8Array;
|
|
58
|
+
};
|
|
59
|
+
export declare function registerOcrProvider(provider: OcrProvider): void;
|
|
60
|
+
export declare function unregisterOcrProvider(name: string): void;
|
|
61
|
+
export declare function listOcrProviders(): readonly OcrProvider[];
|
|
62
|
+
/**
|
|
63
|
+
* Returns the highest-priority provider that reports `available()`. Throws if
|
|
64
|
+
* none are available — callers must handle that explicitly rather than
|
|
65
|
+
* silently degrading. WS6's scene-builder catches and reports.
|
|
66
|
+
*/
|
|
67
|
+
export declare function selectOcrProvider(): OcrProvider;
|
|
68
|
+
/**
|
|
69
|
+
* Builds an OcrProvider that delegates to the Capacitor `ComputerUse` plugin's
|
|
70
|
+
* `visionOcr` method. Pass in a getter that lazily resolves the bridge so this
|
|
71
|
+
* module stays free of Capacitor imports (which would break Node test runs).
|
|
72
|
+
*/
|
|
73
|
+
export declare function createIosVisionOcrProvider(getBridge: () => IosComputerUseBridge | null, options?: {
|
|
74
|
+
readonly priority?: number;
|
|
75
|
+
}): OcrProvider;
|
|
76
|
+
/**
|
|
77
|
+
* Test helper. Drops every provider from the registry; callers re-register
|
|
78
|
+
* in `beforeEach`.
|
|
79
|
+
*/
|
|
80
|
+
export declare function _resetOcrProvidersForTests(): void;
|
|
81
|
+
/**
|
|
82
|
+
* Hierarchical (block / line / word) OCR with absolute source-display
|
|
83
|
+
* coordinates and a coarse 3x3 semantic position label per element. This is
|
|
84
|
+
* a *separate* slot from `OcrProvider` above — that one is line-only and
|
|
85
|
+
* shaped around Apple Vision's API, while this one carries the structure
|
|
86
|
+
* that plugin-computeruse needs to compute action targets without re-running
|
|
87
|
+
* detection.
|
|
88
|
+
*
|
|
89
|
+
* The provider implementation lives in `@elizaos/plugin-vision`'s
|
|
90
|
+
* `ocr-with-coords.ts`. plugin-computeruse intentionally does not take a
|
|
91
|
+
* runtime dep on plugin-vision (computeruse is the higher-level seam), so
|
|
92
|
+
* the runtime registers a provider here at boot.
|
|
93
|
+
*
|
|
94
|
+
* The current in-tree provider is the RapidOCR-backed adapter registered by
|
|
95
|
+
* `@elizaos/plugin-vision`; native providers can register the same interface
|
|
96
|
+
* when they are available.
|
|
97
|
+
*/
|
|
98
|
+
export interface CoordOcrSemantic {
|
|
99
|
+
readonly position: "upper-left" | "upper-center" | "upper-right" | "middle-left" | "center" | "middle-right" | "lower-left" | "lower-center" | "lower-right";
|
|
100
|
+
}
|
|
101
|
+
export interface CoordOcrWord {
|
|
102
|
+
readonly text: string;
|
|
103
|
+
readonly bbox: {
|
|
104
|
+
readonly x: number;
|
|
105
|
+
readonly y: number;
|
|
106
|
+
readonly width: number;
|
|
107
|
+
readonly height: number;
|
|
108
|
+
};
|
|
109
|
+
readonly semantic_position: CoordOcrSemantic["position"];
|
|
110
|
+
}
|
|
111
|
+
export interface CoordOcrBlock {
|
|
112
|
+
readonly text: string;
|
|
113
|
+
readonly bbox: {
|
|
114
|
+
readonly x: number;
|
|
115
|
+
readonly y: number;
|
|
116
|
+
readonly width: number;
|
|
117
|
+
readonly height: number;
|
|
118
|
+
};
|
|
119
|
+
readonly words: ReadonlyArray<CoordOcrWord>;
|
|
120
|
+
readonly semantic_position: CoordOcrSemantic["position"];
|
|
121
|
+
}
|
|
122
|
+
export interface CoordOcrResult {
|
|
123
|
+
readonly blocks: ReadonlyArray<CoordOcrBlock>;
|
|
124
|
+
}
|
|
125
|
+
export interface CoordOcrInput {
|
|
126
|
+
readonly displayId: string;
|
|
127
|
+
readonly sourceX: number;
|
|
128
|
+
readonly sourceY: number;
|
|
129
|
+
readonly pngBytes: Uint8Array;
|
|
130
|
+
}
|
|
131
|
+
export interface CoordOcrProvider {
|
|
132
|
+
readonly name: string;
|
|
133
|
+
describe(input: CoordOcrInput): Promise<CoordOcrResult>;
|
|
134
|
+
}
|
|
135
|
+
/**
|
|
136
|
+
* Register the hierarchical / coord-aware OCR provider. Idempotent — last
|
|
137
|
+
* call wins so a hot-reload of the bridge swaps cleanly. Pass `null` to
|
|
138
|
+
* unregister.
|
|
139
|
+
*/
|
|
140
|
+
export declare function registerCoordOcrProvider(provider: CoordOcrProvider | null): void;
|
|
141
|
+
export declare function getCoordOcrProvider(): CoordOcrProvider | null;
|
|
142
|
+
/**
|
|
143
|
+
* Set-of-Marks grounding (trycua/cua's OmniParser technique): fuse GGUF YOLO
|
|
144
|
+
* icon detections + OCR text boxes into ONE deduplicated, 1-indexed set of
|
|
145
|
+
* numbered targets, optionally with a numbered-overlay PNG. The VLM picks a
|
|
146
|
+
* *number* instead of regressing raw pixels.
|
|
147
|
+
*
|
|
148
|
+
* The fusion + overlay implementation lives in `@elizaos/plugin-vision`
|
|
149
|
+
* (`som.ts`) because that package owns the YOLO detector and OCR engines.
|
|
150
|
+
* computeruse exposes this registration slot (same no-hard-dep pattern as the
|
|
151
|
+
* CoordOcrProvider above); `detect_elements` consumes whatever is registered.
|
|
152
|
+
*/
|
|
153
|
+
export interface SetOfMarksMark {
|
|
154
|
+
/** 1-indexed mark number shown in the overlay. */
|
|
155
|
+
readonly index: number;
|
|
156
|
+
/** Display-local box `[x, y, w, h]`. */
|
|
157
|
+
readonly bbox: readonly [number, number, number, number];
|
|
158
|
+
/** Box center `[x, y]` — the click target the number resolves to. */
|
|
159
|
+
readonly center: readonly [number, number];
|
|
160
|
+
readonly source: "icon" | "text";
|
|
161
|
+
readonly label?: string;
|
|
162
|
+
readonly score: number;
|
|
163
|
+
}
|
|
164
|
+
export interface SetOfMarksResult {
|
|
165
|
+
readonly marks: ReadonlyArray<SetOfMarksMark>;
|
|
166
|
+
/** Base64 PNG of the numbered overlay (present only when requested). */
|
|
167
|
+
readonly overlayPngBase64?: string;
|
|
168
|
+
}
|
|
169
|
+
export interface SetOfMarksInput {
|
|
170
|
+
readonly displayId: string;
|
|
171
|
+
readonly sourceX: number;
|
|
172
|
+
readonly sourceY: number;
|
|
173
|
+
readonly pngBytes: Uint8Array;
|
|
174
|
+
/** Render and return the numbered-overlay PNG. Default false (marks only). */
|
|
175
|
+
readonly renderOverlay?: boolean;
|
|
176
|
+
}
|
|
177
|
+
export interface SetOfMarksProvider {
|
|
178
|
+
readonly name: string;
|
|
179
|
+
describe(input: SetOfMarksInput): Promise<SetOfMarksResult>;
|
|
180
|
+
}
|
|
181
|
+
/**
|
|
182
|
+
* Register the Set-of-Marks provider. Idempotent — last call wins. Pass `null`
|
|
183
|
+
* to unregister.
|
|
184
|
+
*/
|
|
185
|
+
export declare function registerSetOfMarksProvider(provider: SetOfMarksProvider | null): void;
|
|
186
|
+
export declare function getSetOfMarksProvider(): SetOfMarksProvider | null;
|
|
187
|
+
//# sourceMappingURL=ocr-provider.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ocr-provider.d.ts","sourceRoot":"","sources":["../../src/mobile/ocr-provider.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;GAiBG;AAEH,OAAO,KAAK,EACV,oBAAoB,EAIrB,MAAM,iBAAiB,CAAC;AAEzB,MAAM,WAAW,OAAO;IACtB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,WAAW,EAAE;QACpB,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;QACnB,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;QACnB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;QACvB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;KACzB,CAAC;CACH;AAED,MAAM,WAAW,SAAS;IACxB,QAAQ,CAAC,KAAK,EAAE,SAAS,OAAO,EAAE,CAAC;IACnC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,aAAa,EAAE,SAAS,MAAM,EAAE,CAAC;CAC3C;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,CAAC,SAAS,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IACvC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,MAAM,GAAG,UAAU,CAAC;IAChD,QAAQ,CAAC,iBAAiB,CAAC,EAAE,MAAM,CAAC;CACrC;AAED,MAAM,WAAW,WAAW;IAC1B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,SAAS,IAAI,OAAO,CAAC;IACrB,SAAS,CAAC,KAAK,EAAE,QAAQ,EAAE,OAAO,CAAC,EAAE,mBAAmB,GAAG,OAAO,CAAC,SAAS,CAAC,CAAC;CAC/E;AAED;;;GAGG;AACH,MAAM,MAAM,QAAQ,GAChB;IAAE,QAAQ,CAAC,IAAI,EAAE,QAAQ,CAAC;IAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAA;CAAE,GAClD;IAAE,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC;IAAC,QAAQ,CAAC,IAAI,EAAE,UAAU,CAAA;CAAE,CAAC;AAM1D,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,WAAW,GAAG,IAAI,CAE/D;AAED,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI,CAExD;AAED,wBAAgB,gBAAgB,IAAI,SAAS,WAAW,EAAE,CAEzD;AAED;;;;GAIG;AACH,wBAAgB,iBAAiB,IAAI,WAAW,CAO/C;AAID;;;;GAIG;AACH,wBAAgB,0BAA0B,CACxC,SAAS,EAAE,MAAM,oBAAoB,GAAG,IAAI,EAC5C,OAAO,GAAE;IAAE,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAO,GAC3C,WAAW,CA2Cb;AAsCD;;;GAGG;AACH,wBAAgB,0BAA0B,IAAI,IAAI,CAEjD;AAID;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,QAAQ,EACb,YAAY,GACZ,cAAc,GACd,aAAa,GACb,aAAa,GACb,QAAQ,GACR,cAAc,GACd,YAAY,GACZ,cAAc,GACd,aAAa,CAAC;CACnB;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,IAAI,EAAE;QACb,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;QACnB,QAAQ,CAAC,CAA
C,EAAE,MAAM,CAAC;QACnB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;QACvB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;KACzB,CAAC;IACF,QAAQ,CAAC,iBAAiB,EAAE,gBAAgB,CAAC,UAAU,CAAC,CAAC;CAC1D;AAED,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,IAAI,EAAE;QACb,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;QACnB,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;QACnB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;QACvB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;KACzB,CAAC;IACF,QAAQ,CAAC,KAAK,EAAE,aAAa,CAAC,YAAY,CAAC,CAAC;IAC5C,QAAQ,CAAC,iBAAiB,EAAE,gBAAgB,CAAC,UAAU,CAAC,CAAC;CAC1D;AAED,MAAM,WAAW,cAAc;IAC7B,QAAQ,CAAC,MAAM,EAAE,aAAa,CAAC,aAAa,CAAC,CAAC;CAC/C;AAED,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,QAAQ,EAAE,UAAU,CAAC;CAC/B;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,KAAK,EAAE,aAAa,GAAG,OAAO,CAAC,cAAc,CAAC,CAAC;CACzD;AAID;;;;GAIG;AACH,wBAAgB,wBAAwB,CACtC,QAAQ,EAAE,gBAAgB,GAAG,IAAI,GAChC,IAAI,CAEN;AAED,wBAAgB,mBAAmB,IAAI,gBAAgB,GAAG,IAAI,CAE7D;AAID;;;;;;;;;;GAUG;AACH,MAAM,WAAW,cAAc;IAC7B,kDAAkD;IAClD,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,wCAAwC;IACxC,QAAQ,CAAC,IAAI,EAAE,SAAS,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;IACzD,qEAAqE;IACrE,QAAQ,CAAC,MAAM,EAAE,SAAS,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC3C,QAAQ,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAAC;IACjC,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;CACxB;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,KAAK,EAAE,aAAa,CAAC,cAAc,CAAC,CAAC;IAC9C,wEAAwE;IACxE,QAAQ,CAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC;CACpC;AAED,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,QAAQ,EAAE,UAAU,CAAC;IAC9B,8EAA8E;IAC9E,QAAQ,CAAC,aAAa,CAAC,EAAE,OAAO,CAAC;CAClC;AAED,MAAM,WAAW,kBAAkB;IACjC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,KAAK,EAAE,eAAe,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAAC;CAC7D;AAID;;;GAGG;AACH,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,kBAAkB,GAAG,IAAI,GAClC,IAAI,CAEN;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,C
AEjE"}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
// src/mobile/ocr-provider.ts
|
|
2
|
+
/**
 * Module-level store of OCR providers, keyed by each provider's `name`.
 * Because it is a Map, registering a second provider under an existing name
 * replaces the earlier entry.
 */
var REGISTRY = new Map();
|
|
3
|
+
/**
 * Stores `provider` in the OCR registry under its `name`.
 * An entry already stored under the same name is overwritten.
 *
 * @param {{ name: string }} provider - Provider object to register.
 * @returns {void}
 */
function registerOcrProvider(provider) {
  const { name } = provider;
  REGISTRY.set(name, provider);
}
|
|
6
|
+
/**
 * Drops the provider stored under `name` from the OCR registry.
 * Names that are not registered are ignored.
 *
 * @param {string} name - Registry key of the provider to remove.
 * @returns {void}
 */
function unregisterOcrProvider(name) {
  REGISTRY.delete(name);
}
|
|
9
|
+
/**
 * Returns a fresh array of every registered OCR provider, ordered from the
 * highest `priority` to the lowest. The registry itself is not modified.
 *
 * @returns {Array<{ priority: number }>} Snapshot sorted by descending priority.
 */
function listOcrProviders() {
  const providers = Array.from(REGISTRY.values());
  providers.sort((left, right) => right.priority - left.priority);
  return providers;
}
|
|
12
|
+
/**
 * Picks the first provider, in `listOcrProviders()` order, whose
 * `available()` probe returns a truthy value. Providers after the match are
 * not probed.
 *
 * @returns {object} The selected provider.
 * @throws {Error} When no listed provider reports itself available.
 */
function selectOcrProvider() {
  const chosen = listOcrProviders().find((candidate) => candidate.available());
  if (chosen !== undefined) {
    return chosen;
  }
  throw new Error("No OCR provider available. Register at least one provider before calling selectOcrProvider().");
}
|
|
19
|
+
/**
 * Builds an OCR provider named "ios-apple-vision" that delegates recognition
 * to the `visionOcr` method of the bridge object returned by `getBridge`.
 * The bridge is resolved lazily on every call, so it may appear or disappear
 * after the provider has been created.
 *
 * @param {() => (object|null|undefined)} getBridge - Getter for the bridge;
 *   a falsy return value means "no bridge".
 * @param {{ priority?: number }} [options] - `priority` defaults to 100.
 * @returns {object} Provider with `name`, `priority`, `available()` and
 *   async `recognize(input, recognizeOptions)`.
 */
function createIosVisionOcrProvider(getBridge, options = {}) {
  return {
    name: "ios-apple-vision",
    priority: options.priority ?? 100,
    /**
     * Reports whether a bridge is currently resolvable.
     * Uses the same truthiness guard as `recognize()` so that a getter
     * yielding `undefined` is not advertised as available and then rejected.
     */
    available() {
      return Boolean(getBridge());
    },
    /**
     * Runs OCR through the bridge.
     *
     * @param {object} input - Image input, converted with `toBase64`.
     * @param {object} [recognizeOptions] - Optional `languages`,
     *   `recognitionLevel` and `minimumTextHeight`.
     * @returns {Promise<object>} Result of `mapVisionResult(result.data)`.
     * @throws {Error} When no bridge is resolvable or the bridge reports a
     *   non-ok result.
     */
    async recognize(input, recognizeOptions) {
      const bridge = getBridge();
      if (!bridge) {
        throw new Error("ios-apple-vision provider invoked but Capacitor ComputerUse plugin is not registered.");
      }
      const imageBase64 = toBase64(input);
      // Forward only the options the caller supplied. `minimumTextHeight` is
      // compared against undefined so that an explicit 0 is still passed on.
      const visionOptions = {
        ...recognizeOptions?.languages ? { languages: recognizeOptions.languages } : {},
        ...recognizeOptions?.recognitionLevel ? { recognitionLevel: recognizeOptions.recognitionLevel } : {},
        ...recognizeOptions?.minimumTextHeight !== undefined ? { minimumTextHeight: recognizeOptions.minimumTextHeight } : {}
      };
      const result = await bridge.visionOcr({
        imageBase64,
        options: visionOptions
      });
      if (!result.ok) {
        const failure = result;
        throw new Error(`ios-apple-vision OCR failed: ${failure.code} — ${failure.message}`);
      }
      return mapVisionResult(result.data);
    }
  };
}
|
|
49
|
+
/**
 * Normalises an OCR input to a base64 string: inputs tagged
 * `kind: "base64"` are returned unchanged, anything else has its `data`
 * encoded with `uint8ArrayToBase64`.
 *
 * @param {{ kind: string, data: (string|Uint8Array) }} input
 * @returns {string}
 */
function toBase64(input) {
  return input.kind === "base64" ? input.data : uint8ArrayToBase64(input.data);
}
|
|
54
|
+
/**
 * Encodes raw bytes as a base64 string.
 * Uses `Buffer` when the runtime defines it; otherwise builds a string with
 * one character per byte and encodes that string with `btoa`.
 *
 * @param {Uint8Array} bytes - Bytes to encode.
 * @returns {string} Base64 text.
 */
function uint8ArrayToBase64(bytes) {
  const hasBuffer = typeof Buffer !== "undefined";
  if (hasBuffer) {
    return Buffer.from(bytes).toString("base64");
  }
  // Fallback path for runtimes without Buffer.
  let binary = "";
  let index = 0;
  while (index < bytes.length) {
    binary += String.fromCharCode(bytes[index]);
    index += 1;
  }
  return btoa(binary);
}
|
|
64
|
+
/**
 * Converts a bridge OCR result into the provider result shape: every line is
 * passed through `mapLine`, the text/timing/language fields are copied, and
 * `providerName` is fixed to "ios-apple-vision".
 *
 * @param {{ lines: Array, fullText: *, elapsedMs: *, languagesUsed: * }} result
 * @returns {object}
 */
function mapVisionResult(result) {
  const { lines, fullText, elapsedMs, languagesUsed } = result;
  return {
    lines: lines.map(mapLine),
    fullText,
    elapsedMs,
    providerName: "ios-apple-vision",
    languagesUsed
  };
}
|
|
73
|
+
/**
 * Copies the `text`, `confidence` and `boundingBox` fields of one bridge OCR
 * line into a new object; any other fields on `line` are left out.
 *
 * @param {{ text: *, confidence: *, boundingBox: * }} line
 * @returns {{ text: *, confidence: *, boundingBox: * }}
 */
function mapLine(line) {
  const { text, confidence, boundingBox } = line;
  return { text, confidence, boundingBox };
}
|
|
80
|
+
/**
 * Test helper: empties the OCR provider registry.
 *
 * @returns {void}
 */
function _resetOcrProvidersForTests() {
  REGISTRY.clear();
}
|
|
83
|
+
/** Currently registered coord-aware OCR provider; `null` while none is set. */
var registeredCoordOcrProvider = null;

/**
 * Replaces the registered coord-aware OCR provider. The most recent call
 * wins; passing `null` clears the slot.
 *
 * @param {object|null} provider - Provider to store, or `null`.
 * @returns {void}
 */
function registerCoordOcrProvider(provider) {
  registeredCoordOcrProvider = provider;
}

/**
 * Returns whatever was last passed to `registerCoordOcrProvider`, or `null`
 * if nothing has been registered.
 *
 * @returns {object|null}
 */
function getCoordOcrProvider() {
  return registeredCoordOcrProvider;
}
|
|
90
|
+
/** Currently registered Set-of-Marks provider; `null` while none is set. */
var registeredSetOfMarksProvider = null;

/**
 * Replaces the registered Set-of-Marks provider. The most recent call wins;
 * passing `null` clears the slot.
 *
 * @param {object|null} provider - Provider to store, or `null`.
 * @returns {void}
 */
function registerSetOfMarksProvider(provider) {
  registeredSetOfMarksProvider = provider;
}

/**
 * Returns whatever was last passed to `registerSetOfMarksProvider`, or
 * `null` if nothing has been registered.
 *
 * @returns {object|null}
 */
function getSetOfMarksProvider() {
  return registeredSetOfMarksProvider;
}
|
|
97
|
+
export {
|
|
98
|
+
unregisterOcrProvider,
|
|
99
|
+
selectOcrProvider,
|
|
100
|
+
registerSetOfMarksProvider,
|
|
101
|
+
registerOcrProvider,
|
|
102
|
+
registerCoordOcrProvider,
|
|
103
|
+
listOcrProviders,
|
|
104
|
+
getSetOfMarksProvider,
|
|
105
|
+
getCoordOcrProvider,
|
|
106
|
+
createIosVisionOcrProvider,
|
|
107
|
+
_resetOcrProvidersForTests
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
//# debugId=ECB47B4DF356B75E64756E2164756E21
|
|
111
|
+
//# sourceMappingURL=ocr-provider.js.map
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 3,
|
|
3
|
+
"sources": ["../../src/mobile/ocr-provider.ts"],
|
|
4
|
+
"sourcesContent": [
|
|
5
|
+
"/**\n * OCR provider chain — used by WS6 (scene-builder) and any other consumer\n * that needs text-from-image extraction.\n *\n * Defined here in plugin-computeruse so the mobile bridge can publish the\n * iOS Apple Vision implementation alongside the rest of the iOS surface.\n * WS6 will register additional providers (cloud, Tesseract fallback) and\n * pick a provider per-call via `selectOcrProvider`.\n *\n * Contract:\n * - `name` : stable string id for routing/telemetry.\n * - `priority` : higher wins when multiple providers report `available`.\n * - `available()` : cheap synchronous availability probe; the registry\n * caches the result for the process lifetime.\n * - `recognize()` : async OCR call. Throws on hard failures so callers can\n * fall back to the next provider; never returns empty\n * lines silently.\n */\n\nimport type {\n IosComputerUseBridge,\n VisionOcrLine,\n VisionOcrOptions,\n VisionOcrResult,\n} from \"./ios-bridge.js\";\n\nexport interface OcrLine {\n readonly text: string;\n readonly confidence: number;\n readonly boundingBox: {\n readonly x: number;\n readonly y: number;\n readonly width: number;\n readonly height: number;\n };\n}\n\nexport interface OcrResult {\n readonly lines: readonly OcrLine[];\n readonly fullText: string;\n readonly elapsedMs: number;\n readonly providerName: string;\n readonly languagesUsed: readonly string[];\n}\n\nexport interface OcrRecognizeOptions {\n readonly languages?: readonly string[];\n readonly recognitionLevel?: \"fast\" | \"accurate\";\n readonly minimumTextHeight?: number;\n}\n\nexport interface OcrProvider {\n readonly name: string;\n readonly priority: number;\n available(): boolean;\n recognize(input: OcrInput, options?: OcrRecognizeOptions): Promise<OcrResult>;\n}\n\n/**\n * Image input. Either a base64-encoded PNG/JPEG or raw bytes. 
The provider\n * is responsible for normalizing into whatever its native side expects.\n */\nexport type OcrInput =\n | { readonly kind: \"base64\"; readonly data: string }\n | { readonly kind: \"bytes\"; readonly data: Uint8Array };\n\n// ── Registry ─────────────────────────────────────────────────────────────────\n\nconst REGISTRY = new Map<string, OcrProvider>();\n\nexport function registerOcrProvider(provider: OcrProvider): void {\n REGISTRY.set(provider.name, provider);\n}\n\nexport function unregisterOcrProvider(name: string): void {\n REGISTRY.delete(name);\n}\n\nexport function listOcrProviders(): readonly OcrProvider[] {\n return [...REGISTRY.values()].sort((a, b) => b.priority - a.priority);\n}\n\n/**\n * Returns the highest-priority provider that reports `available()`. Throws if\n * none are available — callers must handle that explicitly rather than\n * silently degrading. WS6's scene-builder catches and reports.\n */\nexport function selectOcrProvider(): OcrProvider {\n for (const provider of listOcrProviders()) {\n if (provider.available()) return provider;\n }\n throw new Error(\n \"No OCR provider available. Register at least one provider before calling selectOcrProvider().\",\n );\n}\n\n// ── iOS Apple Vision provider ────────────────────────────────────────────────\n\n/**\n * Builds an OcrProvider that delegates to the Capacitor `ComputerUse` plugin's\n * `visionOcr` method. Pass in a getter that lazily resolves the bridge so this\n * module stays free of Capacitor imports (which would break Node test runs).\n */\nexport function createIosVisionOcrProvider(\n getBridge: () => IosComputerUseBridge | null,\n options: { readonly priority?: number } = {},\n): OcrProvider {\n return {\n name: \"ios-apple-vision\",\n priority: options.priority ?? 
100,\n available(): boolean {\n return getBridge() !== null;\n },\n async recognize(input, recognizeOptions): Promise<OcrResult> {\n const bridge = getBridge();\n if (!bridge) {\n throw new Error(\n \"ios-apple-vision provider invoked but Capacitor ComputerUse plugin is not registered.\",\n );\n }\n const imageBase64 = toBase64(input);\n const visionOptions: VisionOcrOptions = {\n ...(recognizeOptions?.languages\n ? { languages: recognizeOptions.languages }\n : {}),\n ...(recognizeOptions?.recognitionLevel\n ? { recognitionLevel: recognizeOptions.recognitionLevel }\n : {}),\n ...(recognizeOptions?.minimumTextHeight !== undefined\n ? { minimumTextHeight: recognizeOptions.minimumTextHeight }\n : {}),\n };\n const result = await bridge.visionOcr({\n imageBase64,\n options: visionOptions,\n });\n if (!result.ok) {\n // Narrow to the failure arm explicitly — some consumer tsconfigs\n // run with `strict: false`, which disables discriminated-union\n // narrowing on `!result.ok` and surfaces TS2339 on `result.code`\n // / `result.message`. 
The runtime invariant is unchanged.\n const failure = result as Extract<typeof result, { ok: false }>;\n throw new Error(\n `ios-apple-vision OCR failed: ${failure.code} — ${failure.message}`,\n );\n }\n return mapVisionResult(result.data);\n },\n };\n}\n\nfunction toBase64(input: OcrInput): string {\n if (input.kind === \"base64\") return input.data;\n return uint8ArrayToBase64(input.data);\n}\n\nfunction uint8ArrayToBase64(bytes: Uint8Array): string {\n if (typeof Buffer !== \"undefined\") {\n return Buffer.from(bytes).toString(\"base64\");\n }\n let binary = \"\";\n for (let i = 0; i < bytes.length; i += 1) {\n binary += String.fromCharCode(bytes[i]);\n }\n // btoa is the only universal browser/Capacitor path.\n // eslint-disable-next-line no-undef\n return btoa(binary);\n}\n\nfunction mapVisionResult(result: VisionOcrResult): OcrResult {\n return {\n lines: result.lines.map(mapLine),\n fullText: result.fullText,\n elapsedMs: result.elapsedMs,\n providerName: \"ios-apple-vision\",\n languagesUsed: result.languagesUsed,\n };\n}\n\nfunction mapLine(line: VisionOcrLine): OcrLine {\n return {\n text: line.text,\n confidence: line.confidence,\n boundingBox: line.boundingBox,\n };\n}\n\n/**\n * Test helper. Drops every provider from the registry; callers re-register\n * in `beforeEach`.\n */\nexport function _resetOcrProvidersForTests(): void {\n REGISTRY.clear();\n}\n\n// ── Coord-aware OCR provider slot ────────────────────────────────────────────\n\n/**\n * Hierarchical (block / line / word) OCR with absolute source-display\n * coordinates and a coarse 3x3 semantic position label per element. This is\n * a *separate* slot from `OcrProvider` above — that one is line-only and\n * shaped around Apple Vision's API, while this one carries the structure\n * that plugin-computeruse needs to compute action targets without re-running\n * detection.\n *\n * The provider implementation lives in `@elizaos/plugin-vision`'s\n * `ocr-with-coords.ts`. 
plugin-computeruse intentionally does not take a\n * runtime dep on plugin-vision (computeruse is the higher-level seam), so\n * the runtime registers a provider here at boot.\n *\n * The current in-tree provider is the RapidOCR-backed adapter registered by\n * `@elizaos/plugin-vision`; native providers can register the same interface\n * when they are available.\n */\nexport interface CoordOcrSemantic {\n readonly position:\n | \"upper-left\"\n | \"upper-center\"\n | \"upper-right\"\n | \"middle-left\"\n | \"center\"\n | \"middle-right\"\n | \"lower-left\"\n | \"lower-center\"\n | \"lower-right\";\n}\n\nexport interface CoordOcrWord {\n readonly text: string;\n readonly bbox: {\n readonly x: number;\n readonly y: number;\n readonly width: number;\n readonly height: number;\n };\n readonly semantic_position: CoordOcrSemantic[\"position\"];\n}\n\nexport interface CoordOcrBlock {\n readonly text: string;\n readonly bbox: {\n readonly x: number;\n readonly y: number;\n readonly width: number;\n readonly height: number;\n };\n readonly words: ReadonlyArray<CoordOcrWord>;\n readonly semantic_position: CoordOcrSemantic[\"position\"];\n}\n\nexport interface CoordOcrResult {\n readonly blocks: ReadonlyArray<CoordOcrBlock>;\n}\n\nexport interface CoordOcrInput {\n readonly displayId: string;\n readonly sourceX: number;\n readonly sourceY: number;\n readonly pngBytes: Uint8Array;\n}\n\nexport interface CoordOcrProvider {\n readonly name: string;\n describe(input: CoordOcrInput): Promise<CoordOcrResult>;\n}\n\nlet registeredCoordOcrProvider: CoordOcrProvider | null = null;\n\n/**\n * Register the hierarchical / coord-aware OCR provider. Idempotent — last\n * call wins so a hot-reload of the bridge swaps cleanly. 
Pass `null` to\n * unregister.\n */\nexport function registerCoordOcrProvider(\n provider: CoordOcrProvider | null,\n): void {\n registeredCoordOcrProvider = provider;\n}\n\nexport function getCoordOcrProvider(): CoordOcrProvider | null {\n return registeredCoordOcrProvider;\n}\n\n// ── Set-of-Marks provider slot (#9170 M9) ────────────────────────────────────\n\n/**\n * Set-of-Marks grounding (trycua/cua's OmniParser technique): fuse GGUF YOLO\n * icon detections + OCR text boxes into ONE deduplicated, 1-indexed set of\n * numbered targets, optionally with a numbered-overlay PNG. The VLM picks a\n * *number* instead of regressing raw pixels.\n *\n * The fusion + overlay implementation lives in `@elizaos/plugin-vision`\n * (`som.ts`) because that package owns the YOLO detector and OCR engines.\n * computeruse exposes this registration slot (same no-hard-dep pattern as the\n * CoordOcrProvider above); `detect_elements` consumes whatever is registered.\n */\nexport interface SetOfMarksMark {\n /** 1-indexed mark number shown in the overlay. */\n readonly index: number;\n /** Display-local box `[x, y, w, h]`. */\n readonly bbox: readonly [number, number, number, number];\n /** Box center `[x, y]` — the click target the number resolves to. */\n readonly center: readonly [number, number];\n readonly source: \"icon\" | \"text\";\n readonly label?: string;\n readonly score: number;\n}\n\nexport interface SetOfMarksResult {\n readonly marks: ReadonlyArray<SetOfMarksMark>;\n /** Base64 PNG of the numbered overlay (present only when requested). */\n readonly overlayPngBase64?: string;\n}\n\nexport interface SetOfMarksInput {\n readonly displayId: string;\n readonly sourceX: number;\n readonly sourceY: number;\n readonly pngBytes: Uint8Array;\n /** Render and return the numbered-overlay PNG. Default false (marks only). 
*/\n readonly renderOverlay?: boolean;\n}\n\nexport interface SetOfMarksProvider {\n readonly name: string;\n describe(input: SetOfMarksInput): Promise<SetOfMarksResult>;\n}\n\nlet registeredSetOfMarksProvider: SetOfMarksProvider | null = null;\n\n/**\n * Register the Set-of-Marks provider. Idempotent — last call wins. Pass `null`\n * to unregister.\n */\nexport function registerSetOfMarksProvider(\n provider: SetOfMarksProvider | null,\n): void {\n registeredSetOfMarksProvider = provider;\n}\n\nexport function getSetOfMarksProvider(): SetOfMarksProvider | null {\n return registeredSetOfMarksProvider;\n}\n"
|
|
6
|
+
],
|
|
7
|
+
"mappings": ";AAoEA,IAAM,WAAW,IAAI;AAEd,SAAS,mBAAmB,CAAC,UAA6B;AAAA,EAC/D,SAAS,IAAI,SAAS,MAAM,QAAQ;AAAA;AAG/B,SAAS,qBAAqB,CAAC,MAAoB;AAAA,EACxD,SAAS,OAAO,IAAI;AAAA;AAGf,SAAS,gBAAgB,GAA2B;AAAA,EACzD,OAAO,CAAC,GAAG,SAAS,OAAO,CAAC,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,WAAW,EAAE,QAAQ;AAAA;AAQ/D,SAAS,iBAAiB,GAAgB;AAAA,EAC/C,WAAW,YAAY,iBAAiB,GAAG;AAAA,IACzC,IAAI,SAAS,UAAU;AAAA,MAAG,OAAO;AAAA,EACnC;AAAA,EACA,MAAM,IAAI,MACR,+FACF;AAAA;AAUK,SAAS,0BAA0B,CACxC,WACA,UAA0C,CAAC,GAC9B;AAAA,EACb,OAAO;AAAA,IACL,MAAM;AAAA,IACN,UAAU,QAAQ,YAAY;AAAA,IAC9B,SAAS,GAAY;AAAA,MACnB,OAAO,UAAU,MAAM;AAAA;AAAA,SAEnB,UAAS,CAAC,OAAO,kBAAsC;AAAA,MAC3D,MAAM,SAAS,UAAU;AAAA,MACzB,IAAI,CAAC,QAAQ;AAAA,QACX,MAAM,IAAI,MACR,uFACF;AAAA,MACF;AAAA,MACA,MAAM,cAAc,SAAS,KAAK;AAAA,MAClC,MAAM,gBAAkC;AAAA,WAClC,kBAAkB,YAClB,EAAE,WAAW,iBAAiB,UAAU,IACxC,CAAC;AAAA,WACD,kBAAkB,mBAClB,EAAE,kBAAkB,iBAAiB,iBAAiB,IACtD,CAAC;AAAA,WACD,kBAAkB,sBAAsB,YACxC,EAAE,mBAAmB,iBAAiB,kBAAkB,IACxD,CAAC;AAAA,MACP;AAAA,MACA,MAAM,SAAS,MAAM,OAAO,UAAU;AAAA,QACpC;AAAA,QACA,SAAS;AAAA,MACX,CAAC;AAAA,MACD,IAAI,CAAC,OAAO,IAAI;AAAA,QAKd,MAAM,UAAU;AAAA,QAChB,MAAM,IAAI,MACR,gCAAgC,QAAQ,UAAS,QAAQ,SAC3D;AAAA,MACF;AAAA,MACA,OAAO,gBAAgB,OAAO,IAAI;AAAA;AAAA,EAEtC;AAAA;AAGF,SAAS,QAAQ,CAAC,OAAyB;AAAA,EACzC,IAAI,MAAM,SAAS;AAAA,IAAU,OAAO,MAAM;AAAA,EAC1C,OAAO,mBAAmB,MAAM,IAAI;AAAA;AAGtC,SAAS,kBAAkB,CAAC,OAA2B;AAAA,EACrD,IAAI,OAAO,WAAW,aAAa;AAAA,IACjC,OAAO,OAAO,KAAK,KAAK,EAAE,SAAS,QAAQ;AAAA,EAC7C;AAAA,EACA,IAAI,SAAS;AAAA,EACb,SAAS,IAAI,EAAG,IAAI,MAAM,QAAQ,KAAK,GAAG;AAAA,IACxC,UAAU,OAAO,aAAa,MAAM,EAAE;AAAA,EACxC;AAAA,EAGA,OAAO,KAAK,MAAM;AAAA;AAGpB,SAAS,eAAe,CAAC,QAAoC;AAAA,EAC3D,OAAO;AAAA,IACL,OAAO,OAAO,MAAM,IAAI,OAAO;AAAA,IAC/B,UAAU,OAAO;AAAA,IACjB,WAAW,OAAO;AAAA,IAClB,cAAc;AAAA,IACd,eAAe,OAAO;AAAA,EACxB;AAAA;AAGF,SAAS,OAAO,CAAC,MAA8B;AAAA,EAC7C,OAAO;AAAA,IACL,MAAM,KAAK;AAAA,IACX,YAAY,KAAK;AAAA,IACjB,aAAa,KAAK;AAAA,EACpB;AAAA;AAOK,SAAS,0BAA0B,GAAS;AAAA,EACjD,SAAS,MAAM;AAAA;AA0EjB,IAAI,6BAAsD;AAOnD,SAAS,wBAAwB,CACtC,UACM;AAAA,EACN,6BAA6B;AAAA;AAGxB,SAAS,mBAAmB,GAA4B;A
AAA,EAC7D,OAAO;AAAA;AAgDT,IAAI,+BAA0D;AAMvD,SAAS,0BAA0B,CACxC,UACM;AAAA,EACN,+BAA+B;AAAA;AAG1B,SAAS,qBAAqB,GAA8B;AAAA,EACjE,OAAO;AAAA;",
|
|
8
|
+
"debugId": "ECB47B4DF356B75E64756E2164756E21",
|
|
9
|
+
"names": []
|
|
10
|
+
}
|
|
@@ -7,11 +7,14 @@
|
|
|
7
7
|
*/
|
|
8
8
|
import type { DesktopActionParams } from "../types.js";
|
|
9
9
|
import type { OSWorldAction } from "./types.js";
|
|
10
|
+
export interface OSWorldPointerState { // Type of the optional `pointerState` argument of `fromOSWorldAction`.
|
|
11
|
+
mouseDownAt?: [number, number]; // NOTE(review): presumably the [x, y] of a pending mouse-down — confirm against action-converter.ts.
|
|
12
|
+
}
|
|
10
13
|
/**
|
|
11
14
|
* Convert an OSWorld computer_13 action to a DesktopActionParams.
|
|
12
15
|
* Returns null for WAIT/DONE/FAIL (control flow, not desktop actions).
|
|
13
16
|
*/
|
|
14
|
-
export declare function fromOSWorldAction(action: OSWorldAction): DesktopActionParams | null;
|
|
17
|
+
export declare function fromOSWorldAction(action: OSWorldAction, pointerState?: OSWorldPointerState): DesktopActionParams | null;
|
|
15
18
|
/**
|
|
16
19
|
* Convert a DesktopActionParams to an OSWorld computer_13 action.
|
|
17
20
|
*/
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"action-converter.d.ts","sourceRoot":"","sources":["../../src/osworld/action-converter.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,aAAa,CAAC;AACvD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAIhD;;;GAGG;AACH,wBAAgB,iBAAiB,CAC/B,MAAM,EAAE,aAAa,
|
|
1
|
+
{"version":3,"file":"action-converter.d.ts","sourceRoot":"","sources":["../../src/osworld/action-converter.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,aAAa,CAAC;AACvD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAIhD,MAAM,WAAW,mBAAmB;IAClC,WAAW,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAChC;AAED;;;GAGG;AACH,wBAAgB,iBAAiB,CAC/B,MAAM,EAAE,aAAa,EACrB,YAAY,CAAC,EAAE,mBAAmB,GACjC,mBAAmB,GAAG,IAAI,CA8G5B;AAID;;GAEG;AACH,wBAAgB,eAAe,CAAC,MAAM,EAAE,mBAAmB,GAAG,aAAa,CAwE1E;AAID;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,mBAAmB,GAAG,IAAI,CA+GtE"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../src/osworld/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAIH,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,qCAAqC,CAAC;AAM9E,OAAO,KAAK,
|
|
1
|
+
{"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../src/osworld/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAIH,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,qCAAqC,CAAC;AAM9E,OAAO,KAAK,EACV,aAAa,EACb,kBAAkB,EAClB,kBAAkB,EAClB,iBAAiB,EAClB,MAAM,YAAY,CAAC;AAEpB,qBAAa,cAAc;IACzB,OAAO,CAAC,OAAO,CAAqB;IACpC,OAAO,CAAC,MAAM,CAAqB;IACnC,OAAO,CAAC,SAAS,CAAK;IACtB,OAAO,CAAC,YAAY,CAA2B;IAC/C,OAAO,CAAC,UAAU,CAIV;gBAGN,OAAO,EAAE,kBAAkB,EAC3B,MAAM,CAAC,EAAE,OAAO,CAAC,kBAAkB,CAAC;IActC;;OAEG;IACG,cAAc,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC;IA2BtE;;;OAGG;IACG,aAAa,CACjB,MAAM,EAAE,aAAa,GACpB,OAAO,CAAC;QAAE,QAAQ,EAAE,OAAO,CAAC;QAAC,IAAI,EAAE,OAAO,CAAC;QAAC,MAAM,EAAE,OAAO,CAAA;KAAE,CAAC;IAyBjE;;OAEG;IACG,gBAAgB,CACpB,IAAI,EAAE,MAAM,GACX,OAAO,CAAC;QAAE,QAAQ,EAAE,OAAO,CAAC;QAAC,IAAI,EAAE,OAAO,CAAC;QAAC,MAAM,EAAE,OAAO,CAAA;KAAE,CAAC;IA2BjE;;;OAGG;IACG,IAAI,CACR,MAAM,EAAE,aAAa,GAAG,MAAM,EAC9B,WAAW,EAAE,MAAM,GAClB,OAAO,CAAC,iBAAiB,CAAC;IAuC7B;;OAEG;IACH,KAAK,IAAI,IAAI;IAMb;;OAEG;IACH,aAAa;gBA5KH,aAAa,GAAG,MAAM;qBACjB,kBAAkB;mBACpB,MAAM;;IA8KnB;;OAEG;IACH,YAAY,IAAI,MAAM;IAItB;;OAEG;IACH,eAAe,IAAI,OAAO;IAI1B;;OAEG;IACH,SAAS,IAAI,kBAAkB;IAM/B,OAAO,CAAC,KAAK;CAGd"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* trycua/cua parity tooling (#9170 M14) — public surface.
|
|
3
|
+
*
|
|
4
|
+
* The machine-checkable capability matrix + its validator, and the ScreenSpot
|
|
5
|
+
* grounding harness. The OSWorld benchmark adapter lives under `src/osworld/`.
|
|
6
|
+
*/
|
|
7
|
+
export { type OsCoverage, type OsName, PARITY_MATRIX, type ParityCapability, type ParityCoverageByOs, type ParityStatus, type ParityValidationProblem, type ParityValidationResult, parityCoverageByOs, parityMatrixSummary, validateParityCoverage, validateParityMatrix, } from "./parity-matrix.js";
|
|
8
|
+
export { pointInBbox, type ScreenSpotPrediction, type ScreenSpotSample, type ScreenSpotSampleResult, type ScreenSpotScore, scoreScreenSpot, } from "./screenspot.js";
|
|
9
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/parity/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EACL,KAAK,UAAU,EACf,KAAK,MAAM,EACX,aAAa,EACb,KAAK,gBAAgB,EACrB,KAAK,kBAAkB,EACvB,KAAK,YAAY,EACjB,KAAK,uBAAuB,EAC5B,KAAK,sBAAsB,EAC3B,kBAAkB,EAClB,mBAAmB,EACnB,sBAAsB,EACtB,oBAAoB,GACrB,MAAM,oBAAoB,CAAC;AAC5B,OAAO,EACL,WAAW,EACX,KAAK,oBAAoB,EACzB,KAAK,gBAAgB,EACrB,KAAK,sBAAsB,EAC3B,KAAK,eAAe,EACpB,eAAe,GAChB,MAAM,iBAAiB,CAAC"}
|