@elizaos/plugin-computeruse 2.0.0-beta.1 → 2.0.3-beta.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +80 -0
- package/dist/actions/clipboard.d.ts +22 -0
- package/dist/actions/clipboard.d.ts.map +1 -0
- package/dist/actions/progress.d.ts +26 -0
- package/dist/actions/progress.d.ts.map +1 -0
- package/dist/actions/use-computer-agent.d.ts +113 -0
- package/dist/actions/use-computer-agent.d.ts.map +1 -0
- package/dist/actions/use-computer.d.ts.map +1 -1
- package/dist/actions/window-handlers.d.ts +11 -0
- package/dist/actions/window-handlers.d.ts.map +1 -0
- package/dist/actions/window.d.ts +11 -0
- package/dist/actions/window.d.ts.map +1 -0
- package/dist/actor/actor.d.ts +84 -0
- package/dist/actor/actor.d.ts.map +1 -0
- package/dist/actor/agent-callbacks.d.ts +128 -0
- package/dist/actor/agent-callbacks.d.ts.map +1 -0
- package/dist/actor/agent-loop.d.ts +134 -0
- package/dist/actor/agent-loop.d.ts.map +1 -0
- package/dist/actor/aosp-input-actor.d.ts +87 -0
- package/dist/actor/aosp-input-actor.d.ts.map +1 -0
- package/dist/actor/brain.d.ts +195 -0
- package/dist/actor/brain.d.ts.map +1 -0
- package/dist/actor/cascade.d.ts +92 -0
- package/dist/actor/cascade.d.ts.map +1 -0
- package/dist/actor/computer-interface.d.ts +276 -0
- package/dist/actor/computer-interface.d.ts.map +1 -0
- package/dist/actor/dispatch.d.ts +24 -0
- package/dist/actor/dispatch.d.ts.map +1 -0
- package/dist/actor/index.d.ts +12 -0
- package/dist/actor/index.d.ts.map +1 -0
- package/dist/actor/types.d.ts +94 -0
- package/dist/actor/types.d.ts.map +1 -0
- package/dist/approval-manager.d.ts.map +1 -1
- package/dist/index.d.ts +19 -6
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +12001 -5484
- package/dist/index.js.map +59 -25
- package/dist/mcp/index.d.ts +8 -0
- package/dist/mcp/index.d.ts.map +1 -0
- package/dist/mcp/server.d.ts +42 -0
- package/dist/mcp/server.d.ts.map +1 -0
- package/dist/mcp/tools.d.ts +53 -0
- package/dist/mcp/tools.d.ts.map +1 -0
- package/dist/mobile/android-bridge.d.ts +263 -0
- package/dist/mobile/android-bridge.d.ts.map +1 -0
- package/dist/mobile/android-scene.d.ts +52 -0
- package/dist/mobile/android-scene.d.ts.map +1 -0
- package/dist/mobile/android-trajectory.d.ts +66 -0
- package/dist/mobile/android-trajectory.d.ts.map +1 -0
- package/dist/mobile/index.d.ts +19 -0
- package/dist/mobile/index.d.ts.map +1 -0
- package/dist/mobile/ios-app-intent-registry.d.ts +20 -0
- package/dist/mobile/ios-app-intent-registry.d.ts.map +1 -0
- package/dist/mobile/ios-bridge.d.ts +359 -0
- package/dist/mobile/ios-bridge.d.ts.map +1 -0
- package/dist/mobile/ios-computer-interface.d.ts +160 -0
- package/dist/mobile/ios-computer-interface.d.ts.map +1 -0
- package/dist/mobile/mobile-computer-interface.d.ts +142 -0
- package/dist/mobile/mobile-computer-interface.d.ts.map +1 -0
- package/dist/mobile/mobile-screen-capture.d.ts +64 -0
- package/dist/mobile/mobile-screen-capture.d.ts.map +1 -0
- package/dist/mobile/ocr-provider.d.ts +187 -0
- package/dist/mobile/ocr-provider.d.ts.map +1 -0
- package/dist/mobile/ocr-provider.js +111 -0
- package/dist/mobile/ocr-provider.js.map +10 -0
- package/dist/osworld/action-converter.d.ts +4 -1
- package/dist/osworld/action-converter.d.ts.map +1 -1
- package/dist/osworld/adapter.d.ts +1 -0
- package/dist/osworld/adapter.d.ts.map +1 -1
- package/dist/parity/index.d.ts +9 -0
- package/dist/parity/index.d.ts.map +1 -0
- package/dist/parity/parity-matrix.d.ts +82 -0
- package/dist/parity/parity-matrix.d.ts.map +1 -0
- package/dist/parity/screenspot.d.ts +56 -0
- package/dist/parity/screenspot.d.ts.map +1 -0
- package/dist/platform/a11y.d.ts +29 -1
- package/dist/platform/a11y.d.ts.map +1 -1
- package/dist/platform/browser.d.ts +1 -1
- package/dist/platform/browser.d.ts.map +1 -1
- package/dist/platform/capabilities.d.ts +23 -0
- package/dist/platform/capabilities.d.ts.map +1 -1
- package/dist/platform/capture.d.ts +65 -0
- package/dist/platform/capture.d.ts.map +1 -0
- package/dist/platform/clipboard.d.ts +24 -0
- package/dist/platform/clipboard.d.ts.map +1 -0
- package/dist/platform/coords.d.ts +73 -0
- package/dist/platform/coords.d.ts.map +1 -0
- package/dist/platform/desktop.d.ts +23 -0
- package/dist/platform/desktop.d.ts.map +1 -1
- package/dist/platform/displays.d.ts +97 -0
- package/dist/platform/displays.d.ts.map +1 -0
- package/dist/platform/driver.d.ts +22 -0
- package/dist/platform/driver.d.ts.map +1 -1
- package/dist/platform/file-ops.d.ts +17 -0
- package/dist/platform/file-ops.d.ts.map +1 -1
- package/dist/platform/helpers.d.ts +2 -3
- package/dist/platform/helpers.d.ts.map +1 -1
- package/dist/platform/launch.d.ts +54 -0
- package/dist/platform/launch.d.ts.map +1 -0
- package/dist/platform/normalized-coords.d.ts +46 -0
- package/dist/platform/normalized-coords.d.ts.map +1 -0
- package/dist/platform/nut-driver.d.ts +67 -0
- package/dist/platform/nut-driver.d.ts.map +1 -1
- package/dist/platform/permissions.d.ts +12 -0
- package/dist/platform/permissions.d.ts.map +1 -1
- package/dist/platform/process-list.d.ts +32 -0
- package/dist/platform/process-list.d.ts.map +1 -0
- package/dist/platform/ps-host.d.ts +77 -0
- package/dist/platform/ps-host.d.ts.map +1 -0
- package/dist/platform/screenshot-errors.d.ts +54 -0
- package/dist/platform/screenshot-errors.d.ts.map +1 -0
- package/dist/platform/screenshot-quality.d.ts +11 -0
- package/dist/platform/screenshot-quality.d.ts.map +1 -0
- package/dist/platform/screenshot.d.ts.map +1 -1
- package/dist/platform/security.d.ts +8 -0
- package/dist/platform/security.d.ts.map +1 -1
- package/dist/platform/wayland-portal.d.ts +25 -0
- package/dist/platform/wayland-portal.d.ts.map +1 -0
- package/dist/platform/windows-list.d.ts +43 -1
- package/dist/platform/windows-list.d.ts.map +1 -1
- package/dist/providers/computer-state.d.ts.map +1 -1
- package/dist/providers/scene.d.ts +21 -0
- package/dist/providers/scene.d.ts.map +1 -0
- package/dist/register-routes.js +11715 -4990
- package/dist/register-routes.js.map +61 -24
- package/dist/routes/computer-use-compat-routes.d.ts +1 -1
- package/dist/routes/computer-use-compat-routes.d.ts.map +1 -1
- package/dist/sandbox/docker-backend.d.ts +69 -0
- package/dist/sandbox/docker-backend.d.ts.map +1 -0
- package/dist/sandbox/index.d.ts +62 -0
- package/dist/sandbox/index.d.ts.map +1 -0
- package/dist/sandbox/qemu-backend.d.ts +48 -0
- package/dist/sandbox/qemu-backend.d.ts.map +1 -0
- package/dist/sandbox/remote-guest.d.ts +72 -0
- package/dist/sandbox/remote-guest.d.ts.map +1 -0
- package/dist/sandbox/sandbox-driver.d.ts +41 -0
- package/dist/sandbox/sandbox-driver.d.ts.map +1 -0
- package/dist/sandbox/surface-types.d.ts +17 -0
- package/dist/sandbox/surface-types.d.ts.map +1 -0
- package/dist/sandbox/types.d.ts +138 -0
- package/dist/sandbox/types.d.ts.map +1 -0
- package/dist/sandbox/wsb-backend.d.ts +48 -0
- package/dist/sandbox/wsb-backend.d.ts.map +1 -0
- package/dist/scene/a11y-provider.d.ts +83 -0
- package/dist/scene/a11y-provider.d.ts.map +1 -0
- package/dist/scene/apps.d.ts +39 -0
- package/dist/scene/apps.d.ts.map +1 -0
- package/dist/scene/dhash.d.ts +105 -0
- package/dist/scene/dhash.d.ts.map +1 -0
- package/dist/scene/ocr-adapter.d.ts +64 -0
- package/dist/scene/ocr-adapter.d.ts.map +1 -0
- package/dist/scene/scene-builder.d.ts +107 -0
- package/dist/scene/scene-builder.d.ts.map +1 -0
- package/dist/scene/scene-types.d.ts +70 -0
- package/dist/scene/scene-types.d.ts.map +1 -0
- package/dist/scene/screen-state.d.ts +105 -0
- package/dist/scene/screen-state.d.ts.map +1 -0
- package/dist/scene/serialize.d.ts +28 -0
- package/dist/scene/serialize.d.ts.map +1 -0
- package/dist/security/browser-script-policy.d.ts +9 -0
- package/dist/security/browser-script-policy.d.ts.map +1 -0
- package/dist/services/computer-use-service.d.ts +78 -2
- package/dist/services/computer-use-service.d.ts.map +1 -1
- package/dist/services/index.d.ts +7 -0
- package/dist/services/index.d.ts.map +1 -0
- package/dist/services/vision-context-provider.d.ts +32 -0
- package/dist/services/vision-context-provider.d.ts.map +1 -0
- package/dist/types.d.ts +115 -5
- package/dist/types.d.ts.map +1 -1
- package/package.json +47 -10
- package/registry-entry.json +74 -0
- package/dist/actions/desktop-handlers.d.ts +0 -20
- package/dist/actions/desktop-handlers.d.ts.map +0 -1
- package/dist/actions/desktop.d.ts +0 -11
- package/dist/actions/desktop.d.ts.map +0 -1
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent-loop registry (#9170 M10).
|
|
3
|
+
*
|
|
4
|
+
* trycua/cua selects an agent *loop* from a model string: an `anthropic/...`
|
|
5
|
+
* model routes to the Claude computer-use loop, `openai/computer-use-preview`
|
|
6
|
+
* routes to the OpenAI operator loop, an OmniParser/grounder string routes to a
|
|
7
|
+
* local set-of-marks loop, etc. Each loop implements the same two-call seam —
|
|
8
|
+
* `predict_step` (observe + plan the next action) and `predict_click` (ground a
|
|
9
|
+
* target to a coordinate) — so the runner is decoupled from *how* a step is
|
|
10
|
+
* produced.
|
|
11
|
+
*
|
|
12
|
+
* elizaOS shipped a single hardcoded Brain→Cascade (ScreenSeekeR). This module
|
|
13
|
+
* replaces that hardcoding with a registry:
|
|
14
|
+
* - `AgentLoop` — the `predictStep` / `predictClick` seam.
|
|
15
|
+
* - `registerAgentLoop` — register a loop keyed by a model-string matcher.
|
|
16
|
+
* - `createAgentLoop(modelString, deps)` — pick the highest-priority matching
|
|
17
|
+
* loop and instantiate it.
|
|
18
|
+
*
|
|
19
|
+
* The built-in `local-grounder` loop wraps the existing Brain→Cascade and
|
|
20
|
+
* exposes the M5 grounding cache through `predictClick`. Anthropic / OpenAI
|
|
21
|
+
* computer-use loops are *pluggable*: a provider plugin calls
|
|
22
|
+
* `registerAgentLoop` with `matchesModelFamily("anthropic")` (etc.) and its own
|
|
23
|
+
* `predictStep`. With none registered, every model string falls through to the
|
|
24
|
+
* local grounder (which always matches at the lowest priority).
|
|
25
|
+
*/
|
|
26
|
+
import type { IAgentRuntime } from "@elizaos/core";
|
|
27
|
+
import type { DisplayCapture } from "../platform/capture.js";
|
|
28
|
+
import type { Scene } from "../scene/scene-types.js";
|
|
29
|
+
import { type Actor } from "./actor.js";
|
|
30
|
+
import { Brain } from "./brain.js";
|
|
31
|
+
import type { CascadeResult, GroundingResult } from "./types.js";
|
|
32
|
+
/** Default loop model-string — the local OCR/AX + actor grounder. */
|
|
33
|
+
export declare const DEFAULT_AGENT_LOOP_MODEL = "local-grounder";
|
|
34
|
+
/** Setting / env key the runner reads to choose a loop. */
|
|
35
|
+
export declare const AGENT_LOOP_SETTING = "COMPUTER_USE_AGENT_LOOP";
|
|
36
|
+
export interface AgentStepInput {
|
|
37
|
+
scene: Scene;
|
|
38
|
+
goal: string;
|
|
39
|
+
captures: Map<number, DisplayCapture>;
|
|
40
|
+
}
|
|
41
|
+
export interface PredictClickInput {
|
|
42
|
+
scene: Scene;
|
|
43
|
+
captures: Map<number, DisplayCapture>;
|
|
44
|
+
targetDisplayId: number;
|
|
45
|
+
/** OCR/AX id to ground (`t<d>-<n>` / `a<d>-<n>`). */
|
|
46
|
+
ref?: string;
|
|
47
|
+
/** Free-form instruction when no ref is available. */
|
|
48
|
+
instruction?: string;
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* `AgentLoop` is the two-call seam every loop implements. `predictStep` plans the next
|
|
52
|
+
* concrete action; `predictClick` grounds a target to a coordinate (used by
|
|
53
|
+
* loops that plan elsewhere but reuse our grounding, and by callers that want
|
|
54
|
+
* grounding without a full step).
|
|
55
|
+
*/
|
|
56
|
+
/**
|
|
57
|
+
* Per-run model-call accounting (#9105). `invocations` counts the token-bearing
|
|
58
|
+
* model calls a loop actually issued; `cacheHits` counts calls served without a
|
|
59
|
+
* model round-trip. Reported once per run as `evt:"computeruse.agent.tokens"`.
|
|
60
|
+
*/
|
|
61
|
+
export interface AgentLoopStats {
|
|
62
|
+
/** Token-bearing model calls actually issued during the run. */
|
|
63
|
+
invocations: number;
|
|
64
|
+
/** Calls served from cache (no model call, no tokens). */
|
|
65
|
+
cacheHits: number;
|
|
66
|
+
/** Model calls issued with no screenshot attached (#9105). */
|
|
67
|
+
imagelessCalls: number;
|
|
68
|
+
/** Estimated image tokens not sent because of imageless calls (#9105). */
|
|
69
|
+
estImageTokensSaved: number;
|
|
70
|
+
}
|
|
71
|
+
export interface AgentLoop {
|
|
72
|
+
readonly name: string;
|
|
73
|
+
predictStep(input: AgentStepInput): Promise<CascadeResult>;
|
|
74
|
+
predictClick(input: PredictClickInput): Promise<GroundingResult | null>;
|
|
75
|
+
/** Per-run model-call accounting, when the loop tracks it (#9105). */
|
|
76
|
+
getStats?(): AgentLoopStats;
|
|
77
|
+
}
|
|
78
|
+
export interface AgentLoopDeps {
|
|
79
|
+
runtime: IAgentRuntime | null;
|
|
80
|
+
/** Latest-scene accessor for the default actor. */
|
|
81
|
+
getScene: () => Scene | null;
|
|
82
|
+
/** Brain override (mostly tests). */
|
|
83
|
+
brain?: Brain;
|
|
84
|
+
/** Actor override (mostly tests). */
|
|
85
|
+
actor?: Actor | null;
|
|
86
|
+
}
|
|
87
|
+
export interface AgentLoopRegistration {
|
|
88
|
+
/** Stable id for telemetry + explicit selection. */
|
|
89
|
+
readonly name: string;
|
|
90
|
+
/** True when this loop handles `modelString`. */
|
|
91
|
+
matches: (modelString: string) => boolean;
|
|
92
|
+
/** Instantiate the loop for a run. */
|
|
93
|
+
create: (deps: AgentLoopDeps) => AgentLoop;
|
|
94
|
+
/** Higher wins when multiple registrations match. Default 0. */
|
|
95
|
+
priority?: number;
|
|
96
|
+
}
|
|
97
|
+
/**
|
|
98
|
+
* Wraps the existing ScreenSeekeR (Brain → Cascade). `predictStep` is the full
|
|
99
|
+
* observe→plan→ground cascade; `predictClick` calls the cascade's grounding-only
|
|
100
|
+
* path so the M5 per-Scene grounding cache is shared across both.
|
|
101
|
+
*/
|
|
102
|
+
export declare class LocalGrounderLoop implements AgentLoop {
|
|
103
|
+
readonly name = "local-grounder";
|
|
104
|
+
private readonly cascade;
|
|
105
|
+
private readonly brain;
|
|
106
|
+
constructor(deps: AgentLoopDeps);
|
|
107
|
+
predictStep(input: AgentStepInput): Promise<CascadeResult>;
|
|
108
|
+
predictClick(input: PredictClickInput): Promise<GroundingResult | null>;
|
|
109
|
+
/** Grounding cache hit/miss snapshot (delegates to the wrapped cascade). */
|
|
110
|
+
getGroundStats(): import("./cascade.js").CascadeGroundStats;
|
|
111
|
+
/** Model-call accounting from the wrapped Brain (#9105). */
|
|
112
|
+
getStats(): AgentLoopStats;
|
|
113
|
+
}
|
|
114
|
+
/**
|
|
115
|
+
* A matcher for a provider family — `anthropic`, `openai`, `google`, … A
|
|
116
|
+
* pluggable loop registers with `matches: matchesModelFamily("anthropic")` so a
|
|
117
|
+
* model string like `anthropic/claude-...` or `claude-3-7-sonnet` routes to it.
|
|
118
|
+
*/
|
|
119
|
+
export declare function matchesModelFamily(family: string): (modelString: string) => boolean;
|
|
120
|
+
/** Register (or replace, by name) an agent-loop. */
|
|
121
|
+
export declare function registerAgentLoop(registration: AgentLoopRegistration): void;
|
|
122
|
+
export declare function unregisterAgentLoop(name: string): void;
|
|
123
|
+
export declare function listAgentLoops(): readonly AgentLoopRegistration[];
|
|
124
|
+
/**
|
|
125
|
+
* Pick the registration for a model string: the highest-priority one whose
|
|
126
|
+
* `matches` returns true. The local grounder's match-anything floor guarantees
|
|
127
|
+
* a result, so this never throws.
|
|
128
|
+
*/
|
|
129
|
+
export declare function selectAgentLoopRegistration(modelString: string): AgentLoopRegistration;
|
|
130
|
+
/** Resolve + instantiate the loop for a model string. */
|
|
131
|
+
export declare function createAgentLoop(modelString: string, deps: AgentLoopDeps): AgentLoop;
|
|
132
|
+
/** Test helper — restore the registry to just the built-in local grounder. */
|
|
133
|
+
export declare function _resetAgentLoopsForTests(): void;
|
|
134
|
+
//# sourceMappingURL=agent-loop.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"agent-loop.d.ts","sourceRoot":"","sources":["../../src/actor/agent-loop.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AACnD,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AAC7D,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,yBAAyB,CAAC;AACrD,OAAO,EAAE,KAAK,KAAK,EAA+B,MAAM,YAAY,CAAC;AACrE,OAAO,EAAE,KAAK,EAA2B,MAAM,YAAY,CAAC;AAE5D,OAAO,KAAK,EAAE,aAAa,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAEjE,qEAAqE;AACrE,eAAO,MAAM,wBAAwB,mBAAmB,CAAC;AAEzD,2DAA2D;AAC3D,eAAO,MAAM,kBAAkB,4BAA4B,CAAC;AAE5D,MAAM,WAAW,cAAc;IAC7B,KAAK,EAAE,KAAK,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,GAAG,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;CACvC;AAED,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,KAAK,CAAC;IACb,QAAQ,EAAE,GAAG,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;IACtC,eAAe,EAAE,MAAM,CAAC;IACxB,qDAAqD;IACrD,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,sDAAsD;IACtD,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;;;;GAKG;AACH;;;;GAIG;AACH,MAAM,WAAW,cAAc;IAC7B,gEAAgE;IAChE,WAAW,EAAE,MAAM,CAAC;IACpB,0DAA0D;IAC1D,SAAS,EAAE,MAAM,CAAC;IAClB,8DAA8D;IAC9D,cAAc,EAAE,MAAM,CAAC;IACvB,0EAA0E;IAC1E,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAED,MAAM,WAAW,SAAS;IACxB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,WAAW,CAAC,KAAK,EAAE,cAAc,GAAG,OAAO,CAAC,aAAa,CAAC,CAAC;IAC3D,YAAY,CAAC,KAAK,EAAE,iBAAiB,GAAG,OAAO,CAAC,eAAe,GAAG,IAAI,CAAC,CAAC;IACxE,sEAAsE;IACtE,QAAQ,CAAC,IAAI,cAAc,CAAC;CAC7B;AAED,MAAM,WAAW,aAAa;IAC5B,OAAO,EAAE,aAAa,GAAG,IAAI,CAAC;IAC9B,mDAAmD;IACnD,QAAQ,EAAE,MAAM,KAAK,GAAG,IAAI,CAAC;IAC7B,qCAAqC;IACrC,KAAK,CAAC,EAAE,KAAK,CAAC;IACd,qCAAqC;IACrC,KAAK,CAAC,EAAE,KAAK,GAAG,IAAI,CAAC;CACtB;AAED,MAAM,WAAW,qBAAqB;IACpC,oDAAoD;IACpD,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,iDAAiD;IACjD,OAAO,EAAE,CAAC,WAAW,EAAE,MAAM,KAAK,OAAO,CAAC;IAC1C,sCAAsC;IACtC,MAAM,EAAE,CAAC,IAAI,EAAE,aAAa,KAAK,SAAS,CAAC;IAC3C,gEAAgE;IAChE,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAID;;;;GAIG;AACH,qBAAa,iBAAkB,YAAW,SAAS;IACjD,QAAQ,CAAC,IAAI,oBAA4B;IACzC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAU;IAClC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAQ;gBAElB,IAAI,EAAE,aAAa;IAc/B,WAAW,CAAC,KAAK,EAAE,cAAc,GAAG,OAAO,C
AAC,aAAa,CAAC;IAIpD,YAAY,CAChB,KAAK,EAAE,iBAAiB,GACvB,OAAO,CAAC,eAAe,GAAG,IAAI,CAAC;IAkBlC,4EAA4E;IAC5E,cAAc;IAId,4DAA4D;IAC5D,QAAQ,IAAI,cAAc;CAG3B;AAID;;;;GAIG;AACH,wBAAgB,kBAAkB,CAChC,MAAM,EAAE,MAAM,GACb,CAAC,WAAW,EAAE,MAAM,KAAK,OAAO,CAYlC;AAMD,oDAAoD;AACpD,wBAAgB,iBAAiB,CAAC,YAAY,EAAE,qBAAqB,GAAG,IAAI,CAE3E;AAED,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI,CAEtD;AAED,wBAAgB,cAAc,IAAI,SAAS,qBAAqB,EAAE,CAIjE;AA0BD;;;;GAIG;AACH,wBAAgB,2BAA2B,CACzC,WAAW,EAAE,MAAM,GAClB,qBAAqB,CAevB;AAED,yDAAyD;AACzD,wBAAgB,eAAe,CAC7B,WAAW,EAAE,MAAM,EACnB,IAAI,EAAE,aAAa,GAClB,SAAS,CAEX;AAED,8EAA8E;AAC9E,wBAAgB,wBAAwB,IAAI,IAAI,CAG/C"}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WS7 ↔ AOSP — Privileged-input actor.
|
|
3
|
+
*
|
|
4
|
+
* In the consumer build the cascade routes gestures through
|
|
5
|
+
* `MobileComputerInterface` → `AccessibilityGestureDescription` — which is
|
|
6
|
+
* coarse and blocks on touch-recognizer state in some apps (banking, DRM
|
|
7
|
+
* video, anything that sets `filterTouchesWhenObscured`).
|
|
8
|
+
*
|
|
9
|
+
* In an AOSP system-app build (see `docs/AOSP_SYSTEM_APP.md`), the
|
|
10
|
+
* privileged path uses `InputManager.injectInputEvent()` directly. That
|
|
11
|
+
* path lives behind `AospPrivilegedBridge.injectMotionEvent(...)` on the
|
|
12
|
+
* Kotlin side; the consumer-flavor `AospPrivilegedBridge` exports
|
|
13
|
+
* `createIfAvailable(): null` so this actor stays inert until a real AOSP
|
|
14
|
+
* bridge is linked in.
|
|
15
|
+
*
|
|
16
|
+
* `AospInputActor` maps a resolved WS7 `ProposedAction` (display-local
|
|
17
|
+
* pixel coords) into the privileged-bridge calls. It does NOT implement
|
|
18
|
+
* the `Actor` "grounding" contract — grounding stays with the OCR/AX or
|
|
19
|
+
* VLM actor; this is purely an *input-dispatch* shim. It's surfaced as
|
|
20
|
+
* an alternative to `ComputerInterface` for AOSP builds: the agent loop
|
|
21
|
+
* picks `AospInputActor.execute(action)` instead of `dispatch(action, {
|
|
22
|
+
* interface, ... })` when the privileged bridge is available.
|
|
23
|
+
*/
|
|
24
|
+
import type { ActionResult, ProposedAction } from "./types.js";
|
|
25
|
+
/** Minimal Kotlin-side surface this actor needs from the AOSP build. */
|
|
26
|
+
export interface AospPrivilegedInputBridge {
|
|
27
|
+
/**
|
|
28
|
+
* Inject a single motion event at the InputManager level. `action` follows
|
|
29
|
+
* `MotionEvent.ACTION_*` constants (DOWN=0, UP=1, MOVE=2). `downTimeMs`
|
|
30
|
+
* is the original-touch timestamp the gesture started at, in `uptimeMillis`
|
|
31
|
+
* units. Implementations enforce the INJECT_EVENTS permission.
|
|
32
|
+
*/
|
|
33
|
+
injectMotionEvent(args: {
|
|
34
|
+
x: number;
|
|
35
|
+
y: number;
|
|
36
|
+
action: number;
|
|
37
|
+
downTimeMs: number;
|
|
38
|
+
}): Promise<{
|
|
39
|
+
ok: boolean;
|
|
40
|
+
}>;
|
|
41
|
+
/** Capture the primary display frame buffer. The returned promise resolves with the frame as JPEG bytes. */
|
|
42
|
+
captureDisplayFrameBuffer?(): Promise<Uint8Array>;
|
|
43
|
+
}
|
|
44
|
+
export interface AospInputActorDeps {
|
|
45
|
+
/** Returns the AOSP bridge handle, or null in consumer builds. */
|
|
46
|
+
getBridge: () => AospPrivilegedInputBridge | null;
|
|
47
|
+
/** Override the clock for tests. */
|
|
48
|
+
now?: () => number;
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Motion-event action constants matching `android.view.MotionEvent.ACTION_*`.
|
|
52
|
+
* Re-exported here so callers don't need to import Android Kotlin enums.
|
|
53
|
+
*/
|
|
54
|
+
export declare const MOTION_EVENT_ACTION_DOWN: 0;
|
|
55
|
+
export declare const MOTION_EVENT_ACTION_UP: 1;
|
|
56
|
+
export declare const MOTION_EVENT_ACTION_MOVE: 2;
|
|
57
|
+
/**
|
|
58
|
+
* Translate a cascade-resolved `ProposedAction` into one or more
|
|
59
|
+
* `injectMotionEvent` calls. Returns the same `ActionResult` envelope the
|
|
60
|
+
* desktop dispatcher uses — invalid args or driver errors do not throw.
|
|
61
|
+
*
|
|
62
|
+
* Behavior parity with `dispatch.ts`:
|
|
63
|
+
* - unknown action.kind → invalid_args
|
|
64
|
+
* - missing coords → invalid_args
|
|
65
|
+
* - bridge throw → driver_error
|
|
66
|
+
* - bridge ok:false → driver_error
|
|
67
|
+
*
|
|
68
|
+
* Coverage:
|
|
69
|
+
* - click / double_click / right_click → tap(s)
|
|
70
|
+
* - drag → DOWN at start, MOVE/UP at end
|
|
71
|
+
* - scroll → swipe (DOWN, MOVE, UP)
|
|
72
|
+
* - wait / finish → success: true (no input event)
|
|
73
|
+
* - type / key / hotkey → invalid_args (use AccessibilityNodeInfo
|
|
74
|
+
* or a separate keymap actor; out of
|
|
75
|
+
* scope for this privileged path).
|
|
76
|
+
*/
|
|
77
|
+
export declare class AospInputActor {
|
|
78
|
+
private readonly deps;
|
|
79
|
+
readonly name = "aosp-input";
|
|
80
|
+
constructor(deps: AospInputActorDeps);
|
|
81
|
+
execute(action: ProposedAction): Promise<ActionResult>;
|
|
82
|
+
private tap;
|
|
83
|
+
private swipe;
|
|
84
|
+
private must;
|
|
85
|
+
private now;
|
|
86
|
+
}
|
|
87
|
+
//# sourceMappingURL=aosp-input-actor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"aosp-input-actor.d.ts","sourceRoot":"","sources":["../../src/actor/aosp-input-actor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAGH,OAAO,KAAK,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAK/D,wEAAwE;AACxE,MAAM,WAAW,yBAAyB;IACxC;;;;;OAKG;IACH,iBAAiB,CAAC,IAAI,EAAE;QACtB,CAAC,EAAE,MAAM,CAAC;QACV,CAAC,EAAE,MAAM,CAAC;QACV,MAAM,EAAE,MAAM,CAAC;QACf,UAAU,EAAE,MAAM,CAAC;KACpB,GAAG,OAAO,CAAC;QAAE,EAAE,EAAE,OAAO,CAAA;KAAE,CAAC,CAAC;IAC7B,0EAA0E;IAC1E,yBAAyB,CAAC,IAAI,OAAO,CAAC,UAAU,CAAC,CAAC;CACnD;AAED,MAAM,WAAW,kBAAkB;IACjC,kEAAkE;IAClE,SAAS,EAAE,MAAM,yBAAyB,GAAG,IAAI,CAAC;IAClD,oCAAoC;IACpC,GAAG,CAAC,EAAE,MAAM,MAAM,CAAC;CACpB;AAED;;;GAGG;AACH,eAAO,MAAM,wBAAwB,EAAG,CAAU,CAAC;AACnD,eAAO,MAAM,sBAAsB,EAAG,CAAU,CAAC;AACjD,eAAO,MAAM,wBAAwB,EAAG,CAAU,CAAC;AAEnD;;;;;;;;;;;;;;;;;;;GAmBG;AACH,qBAAa,cAAc;IAGb,OAAO,CAAC,QAAQ,CAAC,IAAI;IAFjC,QAAQ,CAAC,IAAI,gBAAgB;gBAEA,IAAI,EAAE,kBAAkB;IAE/C,OAAO,CAAC,MAAM,EAAE,cAAc,GAAG,OAAO,CAAC,YAAY,CAAC;YAwH9C,GAAG;YA0BH,KAAK;YAsCL,IAAI;IAYlB,OAAO,CAAC,GAAG;CAGZ"}
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WS7 — Brain (full-screen reasoning).
|
|
3
|
+
*
|
|
4
|
+
* Sends one image per display (each downscaled to ~1.3 MP, the OS-Atlas /
|
|
5
|
+
* Qwen3-VL `max_pixels` convention) to `runtime.useModel(IMAGE_DESCRIPTION,
|
|
6
|
+
* ...)`. The model is prompted to emit a JSON `BrainOutput` describing:
|
|
7
|
+
* - the scene in one paragraph,
|
|
8
|
+
* - which display to act on,
|
|
9
|
+
* - up to N ROIs the Actor should zoom into,
|
|
10
|
+
* - a single proposed action with rationale.
|
|
11
|
+
*
|
|
12
|
+
* The Brain itself doesn't dispatch — it just produces `BrainOutput`. The
|
|
13
|
+
* cascade ("ScreenSeekeR") is the orchestrator that takes a `BrainOutput`,
|
|
14
|
+
* optionally calls the Actor on cropped ROIs, and produces a concrete
|
|
15
|
+
* `ProposedAction` for the dispatcher.
|
|
16
|
+
*
|
|
17
|
+
* Image transport contract: we pass `imageUrl` as a `data:image/png;base64,...`
|
|
18
|
+
* URL. The WS2 MemoryArbiter intercepts at `ModelType.IMAGE_DESCRIPTION` and
|
|
19
|
+
* routes through its content-hash cache, so identical frames don't burn
|
|
20
|
+
* inference budget twice.
|
|
21
|
+
*
|
|
22
|
+
* Image policy (#9105): the compact scene already carries OCR text + AX boxes,
|
|
23
|
+
* which can suffice to pick the next target, so the `"on-escalation"` policy
|
|
24
|
+
* plans from that text-only context with NO image — routed through a TEXT model,
|
|
25
|
+
* since every IMAGE_DESCRIPTION provider rejects an empty imageUrl — and
|
|
26
|
+
* attaches the ~1.3 MP frame only when the planned target cannot be grounded
|
|
27
|
+
* against the OCR/AX boxes. The DEFAULT is `"always"` (legacy: image on every
|
|
28
|
+
* call) until a real-model CUA trajectory validates imageless planning accuracy;
|
|
29
|
+
* operators opt into `"on-escalation"` via the `COMPUTERUSE_BRAIN_IMAGE_POLICY`
|
|
30
|
+
* setting. `"never"` never attaches pixels.
|
|
31
|
+
*
|
|
32
|
+
* Parse strictness:
|
|
33
|
+
* - We try to parse the response as JSON (either the literal string or
|
|
34
|
+
* `result.description`).
|
|
35
|
+
* - On first parse failure, retry once with a stricter prompt.
|
|
36
|
+
* - On second failure, throw `BrainParseError` — the cascade surfaces this
|
|
37
|
+
* as a structured `ActionResult.error` and aborts the turn cleanly.
|
|
38
|
+
*/
|
|
39
|
+
import { type IAgentRuntime, type ImageDescriptionResult } from "@elizaos/core";
|
|
40
|
+
import type { DisplayCapture } from "../platform/capture.js";
|
|
41
|
+
import type { Scene } from "../scene/scene-types.js";
|
|
42
|
+
import type { BrainOutput } from "./types.js";
|
|
43
|
+
export declare const BRAIN_MAX_PIXELS = 1310720;
|
|
44
|
+
export declare const BRAIN_MAX_ROIS = 2;
|
|
45
|
+
/** Bound on the per-Brain dHash→BrainOutput cache (LRU-ish, oldest evicted). */
|
|
46
|
+
export declare const BRAIN_DHASH_CACHE_MAX = 16;
|
|
47
|
+
/**
|
|
48
|
+
* dHash Hamming threshold for cached-plan reuse (#9581 continuous-understanding
|
|
49
|
+
* tuning). Exact-equality (distance 0) re-burned the IMAGE_DESCRIPTION model on
|
|
50
|
+
* cosmetically-identical frames — cursor jitter, a blinking caret, anti-aliasing,
|
|
51
|
+
* and tiny scroll noise all flip a few dHash bits.
|
|
52
|
+
*
|
|
53
|
+
* This mirrors `SCREEN_STATE_HAMMING_THRESHOLD`: distances below the threshold
|
|
54
|
+
* are unchanged; distances at or above it are changed and must re-plan.
|
|
55
|
+
*/
|
|
56
|
+
export declare const BRAIN_DHASH_HAMMING_THRESHOLD = 5;
|
|
57
|
+
/**
|
|
58
|
+
* Source pixels per estimated image token for a vision model with the Qwen3-VL /
|
|
59
|
+
* OS-Atlas `max_pixels` convention: one visual token ≈ a 28×28 patch (784 px, rounded here to 750).
|
|
60
|
+
* Used only to quantify the saving when a frame is *not* attached (#9105).
|
|
61
|
+
*/
|
|
62
|
+
export declare const BRAIN_PIXELS_PER_IMAGE_TOKEN = 750;
|
|
63
|
+
/**
|
|
64
|
+
* When to attach the raw screenshot to the planning model (#9105):
|
|
65
|
+
* - `"always"` — attach the pixels on every call (legacy behaviour). Default.
|
|
66
|
+
* - `"on-escalation"` — plan from the compact OCR/AX scene with no image
|
|
67
|
+
* first (routed through a TEXT model); attach pixels
|
|
68
|
+
* only when the planned target cannot be grounded
|
|
69
|
+
* against the OCR/AX boxes or a strict-retry fires.
|
|
70
|
+
* - `"never"` — never attach the screenshot; plan from the scene alone.
|
|
71
|
+
*/
|
|
72
|
+
export type BrainImagePolicy = "always" | "on-escalation" | "never";
|
|
73
|
+
/**
|
|
74
|
+
* Default is `"always"` (proven legacy behaviour). `"on-escalation"` cuts the
|
|
75
|
+
* dominant per-frame image-token cost but plans the first pass blind to the
|
|
76
|
+
* pixels, so it stays opt-in (via the `COMPUTERUSE_BRAIN_IMAGE_POLICY` runtime
|
|
77
|
+
* setting / env var) until a real-model CUA trajectory validates its accuracy.
|
|
78
|
+
*/
|
|
79
|
+
export declare const DEFAULT_BRAIN_IMAGE_POLICY: BrainImagePolicy;
|
|
80
|
+
/**
|
|
81
|
+
* Resolve the Brain image policy from the `COMPUTERUSE_BRAIN_IMAGE_POLICY`
|
|
82
|
+
* runtime setting / env var, falling back to {@link DEFAULT_BRAIN_IMAGE_POLICY}.
|
|
83
|
+
* The operator escape hatch to enable imageless planning without a code change
|
|
84
|
+
* once it is validated against a real model.
|
|
85
|
+
*/
|
|
86
|
+
export declare function resolveBrainImagePolicy(runtime: IAgentRuntime | null): BrainImagePolicy;
|
|
87
|
+
/** Token-accounting snapshot for a Brain instance (#9105 M3). */
|
|
88
|
+
export interface BrainStats {
|
|
89
|
+
/** IMAGE_DESCRIPTION model calls actually issued. */
|
|
90
|
+
invocations: number;
|
|
91
|
+
/** Describe calls served from the frame-dHash cache (no model call). */
|
|
92
|
+
cacheHits: number;
|
|
93
|
+
/**
|
|
94
|
+
* Model calls issued with NO screenshot attached (the scene's OCR/AX text
|
|
95
|
+
* sufficed). Each one saved roughly one full-frame image's worth of tokens.
|
|
96
|
+
*/
|
|
97
|
+
imagelessCalls: number;
|
|
98
|
+
/**
|
|
99
|
+
* Estimated image tokens NOT sent because of imageless calls — the sum of
|
|
100
|
+
* `(width * height) / BRAIN_PIXELS_PER_IMAGE_TOKEN` over every imageless
|
|
101
|
+
* call, capped per frame at `BRAIN_MAX_PIXELS`.
|
|
102
|
+
*/
|
|
103
|
+
estImageTokensSaved: number;
|
|
104
|
+
}
|
|
105
|
+
export declare class BrainParseError extends Error {
|
|
106
|
+
readonly raw: string;
|
|
107
|
+
constructor(message: string, raw: string);
|
|
108
|
+
}
|
|
109
|
+
export interface BrainDeps {
|
|
110
|
+
/** Optional override for tests — bypasses runtime.useModel. */
|
|
111
|
+
invokeModel?: (args: {
|
|
112
|
+
imageUrl: string;
|
|
113
|
+
prompt: string;
|
|
114
|
+
displayId: number;
|
|
115
|
+
}) => Promise<string | ImageDescriptionResult>;
|
|
116
|
+
/**
|
|
117
|
+
* When to attach the raw screenshot to the planning model (#9105). Defaults
|
|
118
|
+
* to `"always"` (legacy); see {@link resolveBrainImagePolicy} for the
|
|
119
|
+
* operator opt-in to `"on-escalation"`.
|
|
120
|
+
*/
|
|
121
|
+
imagePolicy?: BrainImagePolicy;
|
|
122
|
+
}
|
|
123
|
+
export interface BrainInput {
|
|
124
|
+
scene: Scene;
|
|
125
|
+
goal: string;
|
|
126
|
+
/**
|
|
127
|
+
* Per-display capture buffers. If a display from `scene.displays` is
|
|
128
|
+
* missing here, the Brain skips it. The cascade is responsible for
|
|
129
|
+
* supplying these alongside the scene.
|
|
130
|
+
*/
|
|
131
|
+
captures: Map<number, DisplayCapture>;
|
|
132
|
+
}
|
|
133
|
+
/**
|
|
134
|
+
* Pure description of a "Brain" call. Created by `Cascade.runCascade` and
|
|
135
|
+
* test fixtures.
|
|
136
|
+
*/
|
|
137
|
+
export declare class Brain {
|
|
138
|
+
private readonly runtime;
|
|
139
|
+
private readonly deps;
|
|
140
|
+
/**
|
|
141
|
+
* Frame-dHash → BrainOutput cache. The WS2 MemoryArbiter only dedups the
|
|
142
|
+
* IMAGE_DESCRIPTION call for LOCAL backends; the remote/cloud path bypasses
|
|
143
|
+
* it, so an identical screen re-burns tokens every step. This call-site cache
|
|
144
|
+
* skips the model entirely when the same frame is observed for the same goal,
|
|
145
|
+
* cutting the dominant CUA-loop token cost regardless of backend (#9105 M3).
|
|
146
|
+
*/
|
|
147
|
+
private readonly dhashCache;
|
|
148
|
+
private readonly imagePolicy;
|
|
149
|
+
private invocations;
|
|
150
|
+
private cacheHits;
|
|
151
|
+
private imagelessCalls;
|
|
152
|
+
private estImageTokensSaved;
|
|
153
|
+
constructor(runtime: IAgentRuntime | null, deps?: BrainDeps);
|
|
154
|
+
/** Token-accounting snapshot (model calls, cache hits, imageless savings). */
|
|
155
|
+
getStats(): BrainStats;
|
|
156
|
+
private cacheKey;
|
|
157
|
+
/**
|
|
158
|
+
* Return a cached plan for the same goal whose frame is within
|
|
159
|
+
* `BRAIN_DHASH_HAMMING_THRESHOLD` bits of `dh` — a near-identical screen, not
|
|
160
|
+
* just a byte-identical one. On a hit the entry is moved to the end (LRU), so
|
|
161
|
+
* a steadily-evolving screen keeps its most-recent close match warm.
|
|
162
|
+
*/
|
|
163
|
+
private findCached;
|
|
164
|
+
private rememberOutput;
|
|
165
|
+
observeAndPlan(input: BrainInput): Promise<BrainOutput>;
|
|
166
|
+
/**
|
|
167
|
+
* True when the imageless plan needs no screenshot to dispatch: a
|
|
168
|
+
* non-coordinate action (type/hotkey/key/wait/finish) carries everything in
|
|
169
|
+
* its args, and a coordinate action is fine when its `ref`/rationale resolves
|
|
170
|
+
* to a concrete OCR/AX box. When the target cannot be grounded we escalate to
|
|
171
|
+
* the pixels — correctness over token saving.
|
|
172
|
+
*/
|
|
173
|
+
private resolvesWithoutImage;
|
|
174
|
+
private recordImageless;
|
|
175
|
+
private invoke;
|
|
176
|
+
}
|
|
177
|
+
export declare function brainPromptFor(compactSceneJson: string, goal: string, strict: boolean): string;
|
|
178
|
+
export declare function parseBrainOutput(raw: string): BrainOutput;
|
|
179
|
+
/**
|
|
180
|
+
* Encode a PNG buffer for transport to the IMAGE_DESCRIPTION model. We don't
|
|
181
|
+
* resize here — `runtime.useModel` adapters (and any vLLM backends behind
|
|
182
|
+
* them) handle the `max_pixels` downscale. The constant `BRAIN_MAX_PIXELS`
|
|
183
|
+
* is exported for the cascade so it can crop ROIs at the right native
|
|
184
|
+
* resolution before invoking the Actor.
|
|
185
|
+
*/
|
|
186
|
+
export declare function encodeForBrain(png: Buffer): Promise<string>;
|
|
187
|
+
/**
|
|
188
|
+
* Estimate the visual tokens a frame would have cost the planning model, for
|
|
189
|
+
* the "tokens saved by going imageless" telemetry (#9105). The backends apply a
|
|
190
|
+
* `max_pixels` downscale, so the per-frame estimate is capped at
|
|
191
|
+
* `BRAIN_MAX_PIXELS`. Falls back to the cap when the PNG header is unreadable —
|
|
192
|
+
* an attached frame would have been downscaled to that ceiling regardless.
|
|
193
|
+
*/
|
|
194
|
+
export declare function estimateImageTokens(png: Buffer): number;
|
|
195
|
+
//# sourceMappingURL=brain.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"brain.d.ts","sourceRoot":"","sources":["../../src/actor/brain.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AAEH,OAAO,EACL,KAAK,aAAa,EAClB,KAAK,sBAAsB,EAG5B,MAAM,eAAe,CAAC;AACvB,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AAE7D,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,yBAAyB,CAAC;AAGrD,OAAO,KAAK,EAAE,WAAW,EAAiC,MAAM,YAAY,CAAC;AAE7E,eAAO,MAAM,gBAAgB,UAAY,CAAC;AAC1C,eAAO,MAAM,cAAc,IAAI,CAAC;AAChC,gFAAgF;AAChF,eAAO,MAAM,qBAAqB,KAAK,CAAC;AACxC;;;;;;;;GAQG;AACH,eAAO,MAAM,6BAA6B,IAAI,CAAC;AAC/C;;;;GAIG;AACH,eAAO,MAAM,4BAA4B,MAAM,CAAC;AAEhD;;;;;;;;GAQG;AACH,MAAM,MAAM,gBAAgB,GAAG,QAAQ,GAAG,eAAe,GAAG,OAAO,CAAC;AAEpE;;;;;GAKG;AACH,eAAO,MAAM,0BAA0B,EAAE,gBAA2B,CAAC;AAErE;;;;;GAKG;AACH,wBAAgB,uBAAuB,CACrC,OAAO,EAAE,aAAa,GAAG,IAAI,GAC5B,gBAAgB,CAOlB;AAED,iEAAiE;AACjE,MAAM,WAAW,UAAU;IACzB,qDAAqD;IACrD,WAAW,EAAE,MAAM,CAAC;IACpB,wEAAwE;IACxE,SAAS,EAAE,MAAM,CAAC;IAClB;;;OAGG;IACH,cAAc,EAAE,MAAM,CAAC;IACvB;;;;OAIG;IACH,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAmDD,qBAAa,eAAgB,SAAQ,KAAK;aAGtB,GAAG,EAAE,MAAM;gBAD3B,OAAO,EAAE,MAAM,EACC,GAAG,EAAE,MAAM;CAK9B;AAED,MAAM,WAAW,SAAS;IACxB,+DAA+D;IAC/D,WAAW,CAAC,EAAE,CAAC,IAAI,EAAE;QACnB,QAAQ,EAAE,MAAM,CAAC;QACjB,MAAM,EAAE,MAAM,CAAC;QACf,SAAS,EAAE,MAAM,CAAC;KACnB,KAAK,OAAO,CAAC,MAAM,GAAG,sBAAsB,CAAC,CAAC;IAC/C;;;;OAIG;IACH,WAAW,CAAC,EAAE,gBAAgB,CAAC;CAChC;AAED,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,KAAK,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb;;;;OAIG;IACH,QAAQ,EAAE,GAAG,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;CACvC;AAED;;;GAGG;AACH,qBAAa,KAAK;IAqBd,OAAO,CAAC,QAAQ,CAAC,OAAO;IACxB,OAAO,CAAC,QAAQ,CAAC,IAAI;IArBvB;;;;;;OAMG;IACH,OAAO,CAAC,QAAQ,CAAC,UAAU,CAKnB;IACR,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAmB;IAC/C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,SAAS,CAAK;IACtB,OAAO,CAAC,cAAc,CAAK;IAC3B,OAAO,CAAC,mBAAmB,CAAK;gBAGb,OAAO,EAAE,aAAa,GAAG,IAAI,EAC7B,IAAI,GAAE,SAAc;IAKvC,8EAA8E;IAC9E,QAAQ,IAAI,UAAU;IAStB,OAAO,CAAC,QAAQ;IAShB;;;;;OAKG;IACH,OAAO,CAAC,UAAU;IAoBlB,OAAO,CAAC,cAAc;IAmBhB,cAAc,CAAC,KAAK,EAAE,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC;IA4I7D;;;;;;OAMG;IACH,OAAO,CAAC
,oBAAoB;IAa5B,OAAO,CAAC,eAAe;YAKT,MAAM;CA4BrB;AAID,wBAAgB,cAAc,CAC5B,gBAAgB,EAAE,MAAM,EACxB,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,OAAO,GACd,MAAM,CAgCR;AAID,wBAAgB,gBAAgB,CAAC,GAAG,EAAE,MAAM,GAAG,WAAW,CA6FzD;AAkBD;;;;;;GAMG;AACH,wBAAsB,cAAc,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAEjE;AAED;;;;;;GAMG;AACH,wBAAgB,mBAAmB,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAMvD"}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WS7 — ScreenSeekeR cascade.
|
|
3
|
+
*
|
|
4
|
+
* Step 1: Brain looks at the full-screen scene for `target_display_id`.
|
|
5
|
+
* Step 2: For each ROI (up to BRAIN_MAX_ROIS), crop the *native resolution*
|
|
6
|
+
* region from the captured PNG and hand it to the Actor for
|
|
7
|
+
* fine grounding. The Actor returns display-local coords.
|
|
8
|
+
* Step 3: Combine the Brain's proposed action with the Actor's coordinates
|
|
9
|
+
* (or fall back to the OCR/AX deterministic actor on `ref`) and
|
|
10
|
+
* produce a single `ProposedAction` for the dispatcher.
|
|
11
|
+
*
|
|
12
|
+
* Cropping notes:
|
|
13
|
+
* - We do NOT decode the PNG. The cropped buffer is what we hand to the
|
|
14
|
+
* Actor. For the built-in OCR/AX actor, the crop is just a pass-through.
|
|
15
|
+
* - When a real PNG cropper is wired in (sharp / native module), this
|
|
16
|
+
* module is the place to add it: `cropPngToRoi(frame, bbox)`.
|
|
17
|
+
* - The cascade tests use the actual frame bytes; assertions are on the
|
|
18
|
+
* resolved coords and the order of Actor calls.
|
|
19
|
+
*/
|
|
20
|
+
import type { DisplayCapture } from "../platform/capture.js";
|
|
21
|
+
import type { Scene } from "../scene/scene-types.js";
|
|
22
|
+
import type { Actor } from "./actor.js";
|
|
23
|
+
import { type Brain } from "./brain.js";
|
|
24
|
+
import type { BrainRoi, CascadeResult } from "./types.js";
|
|
25
|
+
export interface CascadeDeps {
|
|
26
|
+
brain: Brain;
|
|
27
|
+
actor?: Actor | null;
|
|
28
|
+
/** Cropper override (mostly for tests). Returns a Buffer for the bbox region. */
|
|
29
|
+
crop?: (frame: Buffer, bbox: [number, number, number, number]) => Buffer;
|
|
30
|
+
}
|
|
31
|
+
export interface CascadeInput {
|
|
32
|
+
scene: Scene;
|
|
33
|
+
goal: string;
|
|
34
|
+
/** Per-display PNG captures, keyed by displayId. */
|
|
35
|
+
captures: Map<number, DisplayCapture>;
|
|
36
|
+
}
|
|
37
|
+
/** Grounding-cache accounting (#9105 M5). */
|
|
38
|
+
export interface CascadeGroundStats {
|
|
39
|
+
/** Grounding resolutions served from the per-Scene cache. */
|
|
40
|
+
hits: number;
|
|
41
|
+
/** Grounding resolutions that ran the full resolve (OCR/AX + optional actor). */
|
|
42
|
+
misses: number;
|
|
43
|
+
}
|
|
44
|
+
export declare class Cascade {
|
|
45
|
+
private readonly deps;
|
|
46
|
+
/**
|
|
47
|
+
* Per-Scene grounding cache (predict/ground split, #9105 M5). Grounding the
|
|
48
|
+
* same target (ref or rationale) on the same Scene is deterministic, so the
|
|
49
|
+
* cheap GROUND step is memoized — re-grounding within a turn skips a repeat
|
|
50
|
+
* `resolveReference` (OCR/AX) scan and a repeat (possibly model-backed)
|
|
51
|
+
* `actor.ground` call. Keyed by Scene timestamp so a new screen invalidates.
|
|
52
|
+
*/
|
|
53
|
+
private readonly groundCache;
|
|
54
|
+
private groundStats;
|
|
55
|
+
constructor(deps: CascadeDeps);
|
|
56
|
+
/** Grounding cache hit/miss snapshot for token/work accounting. */
|
|
57
|
+
getGroundStats(): CascadeGroundStats;
|
|
58
|
+
run(input: CascadeInput): Promise<CascadeResult>;
|
|
59
|
+
/**
|
|
60
|
+
* Grounding-only entry (the `predict_click` half of the predict/ground split,
|
|
61
|
+
* #9105 M5 / #9170 M10). Resolves a `ref` (OCR/AX id) or free-form
|
|
62
|
+
* `instruction` to a display-local coordinate WITHOUT running the Brain —
|
|
63
|
+
* agent loops that do their own step planning (Anthropic / OpenAI
|
|
64
|
+
* computer-use) call this to reuse our deterministic OCR/AX + actor grounding
|
|
65
|
+
* and its per-Scene cache. Returns `null` when nothing can be grounded.
|
|
66
|
+
*/
|
|
67
|
+
groundTarget(args: {
|
|
68
|
+
scene: Scene;
|
|
69
|
+
captures: Map<number, DisplayCapture>;
|
|
70
|
+
targetDisplayId: number;
|
|
71
|
+
ref?: string;
|
|
72
|
+
instruction?: string;
|
|
73
|
+
/** Optional ROI to ground inside when no `ref` is available. */
|
|
74
|
+
roi?: BrainRoi;
|
|
75
|
+
}): Promise<{
|
|
76
|
+
displayId: number;
|
|
77
|
+
x: number;
|
|
78
|
+
y: number;
|
|
79
|
+
} | null>;
|
|
80
|
+
private resolveBrainOutput;
|
|
81
|
+
private resolveCoords;
|
|
82
|
+
/** Drop cache entries from any Scene other than `timestamp`, then store. */
|
|
83
|
+
private rememberGround;
|
|
84
|
+
private resolveCoordsUncached;
|
|
85
|
+
private coordsForRef;
|
|
86
|
+
private groundReference;
|
|
87
|
+
private groundRoi;
|
|
88
|
+
private cropFrame;
|
|
89
|
+
}
|
|
90
|
+
export declare function setActor(actor: Actor | null): void;
|
|
91
|
+
export declare function getRegisteredActor(): Actor | null;
|
|
92
|
+
//# sourceMappingURL=cascade.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cascade.d.ts","sourceRoot":"","sources":["../../src/actor/cascade.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAGH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AAC7D,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,yBAAyB,CAAC;AACrD,OAAO,KAAK,EAAE,KAAK,EAAmB,MAAM,YAAY,CAAC;AAEzD,OAAO,EAAkB,KAAK,KAAK,EAAE,MAAM,YAAY,CAAC;AACxD,OAAO,KAAK,EAEV,QAAQ,EACR,aAAa,EAEd,MAAM,YAAY,CAAC;AAEpB,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAE,KAAK,CAAC;IACb,KAAK,CAAC,EAAE,KAAK,GAAG,IAAI,CAAC;IACrB,6EAA6E;IAC7E,IAAI,CAAC,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,KAAK,MAAM,CAAC;CAC1E;AAED,MAAM,WAAW,YAAY;IAC3B,KAAK,EAAE,KAAK,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,oDAAoD;IACpD,QAAQ,EAAE,GAAG,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;CACvC;AAED,6CAA6C;AAC7C,MAAM,WAAW,kBAAkB;IACjC,6DAA6D;IAC7D,IAAI,EAAE,MAAM,CAAC;IACb,iFAAiF;IACjF,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,qBAAa,OAAO;IAcN,OAAO,CAAC,QAAQ,CAAC,IAAI;IAbjC;;;;;;OAMG;IACH,OAAO,CAAC,QAAQ,CAAC,WAAW,CAGxB;IACJ,OAAO,CAAC,WAAW,CAA8C;gBAEpC,IAAI,EAAE,WAAW;IAE9C,mEAAmE;IACnE,cAAc,IAAI,kBAAkB;IAI9B,GAAG,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,aAAa,CAAC;IAStD;;;;;;;OAOG;IACG,YAAY,CAAC,IAAI,EAAE;QACvB,KAAK,EAAE,KAAK,CAAC;QACb,QAAQ,EAAE,GAAG,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;QACtC,eAAe,EAAE,MAAM,CAAC;QACxB,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,gEAAgE;QAChE,GAAG,CAAC,EAAE,QAAQ,CAAC;KAChB,GAAG,OAAO,CAAC;QAAE,SAAS,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE,GAAG,IAAI,CAAC;YAsBjD,kBAAkB;YAwJlB,aAAa;IA2B3B,4EAA4E;IAC5E,OAAO,CAAC,cAAc;YAYR,qBAAqB;YAkCrB,YAAY;YAUZ,eAAe;YAiDf,SAAS;IAiCvB,OAAO,CAAC,SAAS;CAYlB;AAQD,wBAAgB,QAAQ,CAAC,KAAK,EAAE,KAAK,GAAG,IAAI,GAAG,IAAI,CAElD;AACD,wBAAgB,kBAAkB,IAAI,KAAK,GAAG,IAAI,CAEjD"}
|