@elizaos/plugin-computeruse 2.0.0-beta.1 → 2.0.3-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +80 -0
  3. package/dist/actions/clipboard.d.ts +22 -0
  4. package/dist/actions/clipboard.d.ts.map +1 -0
  5. package/dist/actions/progress.d.ts +26 -0
  6. package/dist/actions/progress.d.ts.map +1 -0
  7. package/dist/actions/use-computer-agent.d.ts +113 -0
  8. package/dist/actions/use-computer-agent.d.ts.map +1 -0
  9. package/dist/actions/use-computer.d.ts.map +1 -1
  10. package/dist/actions/window-handlers.d.ts +11 -0
  11. package/dist/actions/window-handlers.d.ts.map +1 -0
  12. package/dist/actions/window.d.ts +11 -0
  13. package/dist/actions/window.d.ts.map +1 -0
  14. package/dist/actor/actor.d.ts +84 -0
  15. package/dist/actor/actor.d.ts.map +1 -0
  16. package/dist/actor/agent-callbacks.d.ts +128 -0
  17. package/dist/actor/agent-callbacks.d.ts.map +1 -0
  18. package/dist/actor/agent-loop.d.ts +134 -0
  19. package/dist/actor/agent-loop.d.ts.map +1 -0
  20. package/dist/actor/aosp-input-actor.d.ts +87 -0
  21. package/dist/actor/aosp-input-actor.d.ts.map +1 -0
  22. package/dist/actor/brain.d.ts +195 -0
  23. package/dist/actor/brain.d.ts.map +1 -0
  24. package/dist/actor/cascade.d.ts +92 -0
  25. package/dist/actor/cascade.d.ts.map +1 -0
  26. package/dist/actor/computer-interface.d.ts +276 -0
  27. package/dist/actor/computer-interface.d.ts.map +1 -0
  28. package/dist/actor/dispatch.d.ts +24 -0
  29. package/dist/actor/dispatch.d.ts.map +1 -0
  30. package/dist/actor/index.d.ts +12 -0
  31. package/dist/actor/index.d.ts.map +1 -0
  32. package/dist/actor/types.d.ts +94 -0
  33. package/dist/actor/types.d.ts.map +1 -0
  34. package/dist/approval-manager.d.ts.map +1 -1
  35. package/dist/index.d.ts +19 -6
  36. package/dist/index.d.ts.map +1 -1
  37. package/dist/index.js +12001 -5484
  38. package/dist/index.js.map +59 -25
  39. package/dist/mcp/index.d.ts +8 -0
  40. package/dist/mcp/index.d.ts.map +1 -0
  41. package/dist/mcp/server.d.ts +42 -0
  42. package/dist/mcp/server.d.ts.map +1 -0
  43. package/dist/mcp/tools.d.ts +53 -0
  44. package/dist/mcp/tools.d.ts.map +1 -0
  45. package/dist/mobile/android-bridge.d.ts +263 -0
  46. package/dist/mobile/android-bridge.d.ts.map +1 -0
  47. package/dist/mobile/android-scene.d.ts +52 -0
  48. package/dist/mobile/android-scene.d.ts.map +1 -0
  49. package/dist/mobile/android-trajectory.d.ts +66 -0
  50. package/dist/mobile/android-trajectory.d.ts.map +1 -0
  51. package/dist/mobile/index.d.ts +19 -0
  52. package/dist/mobile/index.d.ts.map +1 -0
  53. package/dist/mobile/ios-app-intent-registry.d.ts +20 -0
  54. package/dist/mobile/ios-app-intent-registry.d.ts.map +1 -0
  55. package/dist/mobile/ios-bridge.d.ts +359 -0
  56. package/dist/mobile/ios-bridge.d.ts.map +1 -0
  57. package/dist/mobile/ios-computer-interface.d.ts +160 -0
  58. package/dist/mobile/ios-computer-interface.d.ts.map +1 -0
  59. package/dist/mobile/mobile-computer-interface.d.ts +142 -0
  60. package/dist/mobile/mobile-computer-interface.d.ts.map +1 -0
  61. package/dist/mobile/mobile-screen-capture.d.ts +64 -0
  62. package/dist/mobile/mobile-screen-capture.d.ts.map +1 -0
  63. package/dist/mobile/ocr-provider.d.ts +187 -0
  64. package/dist/mobile/ocr-provider.d.ts.map +1 -0
  65. package/dist/mobile/ocr-provider.js +111 -0
  66. package/dist/mobile/ocr-provider.js.map +10 -0
  67. package/dist/osworld/action-converter.d.ts +4 -1
  68. package/dist/osworld/action-converter.d.ts.map +1 -1
  69. package/dist/osworld/adapter.d.ts +1 -0
  70. package/dist/osworld/adapter.d.ts.map +1 -1
  71. package/dist/parity/index.d.ts +9 -0
  72. package/dist/parity/index.d.ts.map +1 -0
  73. package/dist/parity/parity-matrix.d.ts +82 -0
  74. package/dist/parity/parity-matrix.d.ts.map +1 -0
  75. package/dist/parity/screenspot.d.ts +56 -0
  76. package/dist/parity/screenspot.d.ts.map +1 -0
  77. package/dist/platform/a11y.d.ts +29 -1
  78. package/dist/platform/a11y.d.ts.map +1 -1
  79. package/dist/platform/browser.d.ts +1 -1
  80. package/dist/platform/browser.d.ts.map +1 -1
  81. package/dist/platform/capabilities.d.ts +23 -0
  82. package/dist/platform/capabilities.d.ts.map +1 -1
  83. package/dist/platform/capture.d.ts +65 -0
  84. package/dist/platform/capture.d.ts.map +1 -0
  85. package/dist/platform/clipboard.d.ts +24 -0
  86. package/dist/platform/clipboard.d.ts.map +1 -0
  87. package/dist/platform/coords.d.ts +73 -0
  88. package/dist/platform/coords.d.ts.map +1 -0
  89. package/dist/platform/desktop.d.ts +23 -0
  90. package/dist/platform/desktop.d.ts.map +1 -1
  91. package/dist/platform/displays.d.ts +97 -0
  92. package/dist/platform/displays.d.ts.map +1 -0
  93. package/dist/platform/driver.d.ts +22 -0
  94. package/dist/platform/driver.d.ts.map +1 -1
  95. package/dist/platform/file-ops.d.ts +17 -0
  96. package/dist/platform/file-ops.d.ts.map +1 -1
  97. package/dist/platform/helpers.d.ts +2 -3
  98. package/dist/platform/helpers.d.ts.map +1 -1
  99. package/dist/platform/launch.d.ts +54 -0
  100. package/dist/platform/launch.d.ts.map +1 -0
  101. package/dist/platform/normalized-coords.d.ts +46 -0
  102. package/dist/platform/normalized-coords.d.ts.map +1 -0
  103. package/dist/platform/nut-driver.d.ts +67 -0
  104. package/dist/platform/nut-driver.d.ts.map +1 -1
  105. package/dist/platform/permissions.d.ts +12 -0
  106. package/dist/platform/permissions.d.ts.map +1 -1
  107. package/dist/platform/process-list.d.ts +32 -0
  108. package/dist/platform/process-list.d.ts.map +1 -0
  109. package/dist/platform/ps-host.d.ts +77 -0
  110. package/dist/platform/ps-host.d.ts.map +1 -0
  111. package/dist/platform/screenshot-errors.d.ts +54 -0
  112. package/dist/platform/screenshot-errors.d.ts.map +1 -0
  113. package/dist/platform/screenshot-quality.d.ts +11 -0
  114. package/dist/platform/screenshot-quality.d.ts.map +1 -0
  115. package/dist/platform/screenshot.d.ts.map +1 -1
  116. package/dist/platform/security.d.ts +8 -0
  117. package/dist/platform/security.d.ts.map +1 -1
  118. package/dist/platform/wayland-portal.d.ts +25 -0
  119. package/dist/platform/wayland-portal.d.ts.map +1 -0
  120. package/dist/platform/windows-list.d.ts +43 -1
  121. package/dist/platform/windows-list.d.ts.map +1 -1
  122. package/dist/providers/computer-state.d.ts.map +1 -1
  123. package/dist/providers/scene.d.ts +21 -0
  124. package/dist/providers/scene.d.ts.map +1 -0
  125. package/dist/register-routes.js +11715 -4990
  126. package/dist/register-routes.js.map +61 -24
  127. package/dist/routes/computer-use-compat-routes.d.ts +1 -1
  128. package/dist/routes/computer-use-compat-routes.d.ts.map +1 -1
  129. package/dist/sandbox/docker-backend.d.ts +69 -0
  130. package/dist/sandbox/docker-backend.d.ts.map +1 -0
  131. package/dist/sandbox/index.d.ts +62 -0
  132. package/dist/sandbox/index.d.ts.map +1 -0
  133. package/dist/sandbox/qemu-backend.d.ts +48 -0
  134. package/dist/sandbox/qemu-backend.d.ts.map +1 -0
  135. package/dist/sandbox/remote-guest.d.ts +72 -0
  136. package/dist/sandbox/remote-guest.d.ts.map +1 -0
  137. package/dist/sandbox/sandbox-driver.d.ts +41 -0
  138. package/dist/sandbox/sandbox-driver.d.ts.map +1 -0
  139. package/dist/sandbox/surface-types.d.ts +17 -0
  140. package/dist/sandbox/surface-types.d.ts.map +1 -0
  141. package/dist/sandbox/types.d.ts +138 -0
  142. package/dist/sandbox/types.d.ts.map +1 -0
  143. package/dist/sandbox/wsb-backend.d.ts +48 -0
  144. package/dist/sandbox/wsb-backend.d.ts.map +1 -0
  145. package/dist/scene/a11y-provider.d.ts +83 -0
  146. package/dist/scene/a11y-provider.d.ts.map +1 -0
  147. package/dist/scene/apps.d.ts +39 -0
  148. package/dist/scene/apps.d.ts.map +1 -0
  149. package/dist/scene/dhash.d.ts +105 -0
  150. package/dist/scene/dhash.d.ts.map +1 -0
  151. package/dist/scene/ocr-adapter.d.ts +64 -0
  152. package/dist/scene/ocr-adapter.d.ts.map +1 -0
  153. package/dist/scene/scene-builder.d.ts +107 -0
  154. package/dist/scene/scene-builder.d.ts.map +1 -0
  155. package/dist/scene/scene-types.d.ts +70 -0
  156. package/dist/scene/scene-types.d.ts.map +1 -0
  157. package/dist/scene/screen-state.d.ts +105 -0
  158. package/dist/scene/screen-state.d.ts.map +1 -0
  159. package/dist/scene/serialize.d.ts +28 -0
  160. package/dist/scene/serialize.d.ts.map +1 -0
  161. package/dist/security/browser-script-policy.d.ts +9 -0
  162. package/dist/security/browser-script-policy.d.ts.map +1 -0
  163. package/dist/services/computer-use-service.d.ts +78 -2
  164. package/dist/services/computer-use-service.d.ts.map +1 -1
  165. package/dist/services/index.d.ts +7 -0
  166. package/dist/services/index.d.ts.map +1 -0
  167. package/dist/services/vision-context-provider.d.ts +32 -0
  168. package/dist/services/vision-context-provider.d.ts.map +1 -0
  169. package/dist/types.d.ts +115 -5
  170. package/dist/types.d.ts.map +1 -1
  171. package/package.json +47 -10
  172. package/registry-entry.json +74 -0
  173. package/dist/actions/desktop-handlers.d.ts +0 -20
  174. package/dist/actions/desktop-handlers.d.ts.map +0 -1
  175. package/dist/actions/desktop.d.ts +0 -11
  176. package/dist/actions/desktop.d.ts.map +0 -1
@@ -0,0 +1,134 @@
1
+ /**
2
+ * Agent-loop registry (#9170 M10).
3
+ *
4
+ * trycua/cua selects an agent *loop* from a model string: an `anthropic/...`
5
+ * model routes to the Claude computer-use loop, `openai/computer-use-preview`
6
+ * routes to the OpenAI operator loop, an OmniParser/grounder string routes to a
7
+ * local set-of-marks loop, etc. Each loop implements the same two-call seam —
8
+ * `predict_step` (observe + plan the next action) and `predict_click` (ground a
9
+ * target to a coordinate) — so the runner is decoupled from *how* a step is
10
+ * produced.
11
+ *
12
+ * elizaOS shipped a single hardcoded Brain→Cascade (ScreenSeekeR). This module
13
+ * replaces that hardcoding with a registry:
14
+ * - `AgentLoop` — the `predictStep` / `predictClick` seam.
15
+ * - `registerAgentLoop` — register a loop keyed by a model-string matcher.
16
+ * - `createAgentLoop(modelString, deps)` — pick the highest-priority matching
17
+ * loop and instantiate it.
18
+ *
19
+ * The built-in `local-grounder` loop wraps the existing Brain→Cascade and
20
+ * exposes the M5 grounding cache through `predictClick`. Anthropic / OpenAI
21
+ * computer-use loops are *pluggable*: a provider plugin calls
22
+ * `registerAgentLoop` with `matchesModelFamily("anthropic")` (etc.) and its own
23
+ * `predictStep`. With none registered, every model string falls through to the
24
+ * local grounder (which always matches at the lowest priority).
25
+ */
26
+ import type { IAgentRuntime } from "@elizaos/core";
27
+ import type { DisplayCapture } from "../platform/capture.js";
28
+ import type { Scene } from "../scene/scene-types.js";
29
+ import { type Actor } from "./actor.js";
30
+ import { Brain } from "./brain.js";
31
+ import type { CascadeResult, GroundingResult } from "./types.js";
32
+ /** Default loop model-string — the local OCR/AX + actor grounder. */
33
+ export declare const DEFAULT_AGENT_LOOP_MODEL = "local-grounder";
34
+ /** Setting / env key the runner reads to choose a loop. */
35
+ export declare const AGENT_LOOP_SETTING = "COMPUTER_USE_AGENT_LOOP";
36
+ export interface AgentStepInput {
37
+ scene: Scene;
38
+ goal: string;
39
+ captures: Map<number, DisplayCapture>;
40
+ }
41
+ export interface PredictClickInput {
42
+ scene: Scene;
43
+ captures: Map<number, DisplayCapture>;
44
+ targetDisplayId: number;
45
+ /** OCR/AX id to ground (`t<d>-<n>` / `a<d>-<n>`). */
46
+ ref?: string;
47
+ /** Free-form instruction when no ref is available. */
48
+ instruction?: string;
49
+ }
50
+ /**
51
+ * The two-call seam every loop implements (the `AgentLoop` interface declared
52
+ * below). `predictStep` plans the next concrete action; `predictClick` grounds
53
+ * a target to a coordinate (used by loops that plan elsewhere but reuse our
54
+ * grounding, and by callers that want grounding without a full step).
55
+ */
56
+ /**
57
+ * Per-run model-call accounting (#9105). `invocations` counts the token-bearing
58
+ * model calls a loop actually issued; `cacheHits` counts calls served without a
59
+ * model round-trip. Reported once per run as `evt:"computeruse.agent.tokens"`.
60
+ */
61
+ export interface AgentLoopStats {
62
+ /** Token-bearing model calls actually issued during the run. */
63
+ invocations: number;
64
+ /** Calls served from cache (no model call, no tokens). */
65
+ cacheHits: number;
66
+ /** Model calls issued with no screenshot attached (#9105). */
67
+ imagelessCalls: number;
68
+ /** Estimated image tokens not sent because of imageless calls (#9105). */
69
+ estImageTokensSaved: number;
70
+ }
71
+ export interface AgentLoop {
72
+ readonly name: string;
73
+ predictStep(input: AgentStepInput): Promise<CascadeResult>;
74
+ predictClick(input: PredictClickInput): Promise<GroundingResult | null>;
75
+ /** Per-run model-call accounting, when the loop tracks it (#9105). */
76
+ getStats?(): AgentLoopStats;
77
+ }
78
+ export interface AgentLoopDeps {
79
+ runtime: IAgentRuntime | null;
80
+ /** Latest-scene accessor for the default actor. */
81
+ getScene: () => Scene | null;
82
+ /** Brain override (mostly tests). */
83
+ brain?: Brain;
84
+ /** Actor override (mostly tests). */
85
+ actor?: Actor | null;
86
+ }
87
+ export interface AgentLoopRegistration {
88
+ /** Stable id for telemetry + explicit selection. */
89
+ readonly name: string;
90
+ /** True when this loop handles `modelString`. */
91
+ matches: (modelString: string) => boolean;
92
+ /** Instantiate the loop for a run. */
93
+ create: (deps: AgentLoopDeps) => AgentLoop;
94
+ /** Higher wins when multiple registrations match. Default 0. */
95
+ priority?: number;
96
+ }
97
+ /**
98
+ * Wraps the existing ScreenSeekeR (Brain → Cascade). `predictStep` is the full
99
+ * observe→plan→ground cascade; `predictClick` calls the cascade's grounding-only
100
+ * path so the M5 per-Scene grounding cache is shared across both.
101
+ */
102
+ export declare class LocalGrounderLoop implements AgentLoop {
103
+ readonly name = "local-grounder";
104
+ private readonly cascade;
105
+ private readonly brain;
106
+ constructor(deps: AgentLoopDeps);
107
+ predictStep(input: AgentStepInput): Promise<CascadeResult>;
108
+ predictClick(input: PredictClickInput): Promise<GroundingResult | null>;
109
+ /** Grounding cache hit/miss snapshot (delegates to the wrapped cascade). */
110
+ getGroundStats(): import("./cascade.js").CascadeGroundStats;
111
+ /** Model-call accounting from the wrapped Brain (#9105). */
112
+ getStats(): AgentLoopStats;
113
+ }
114
+ /**
115
+ * A matcher for a provider family — `anthropic`, `openai`, `google`, … A
116
+ * pluggable loop registers with `matches: matchesModelFamily("anthropic")` so a
117
+ * model string like `anthropic/claude-...` or `claude-3-7-sonnet` routes to it.
118
+ */
119
+ export declare function matchesModelFamily(family: string): (modelString: string) => boolean;
120
+ /** Register (or replace, by name) an agent-loop. */
121
+ export declare function registerAgentLoop(registration: AgentLoopRegistration): void;
122
+ export declare function unregisterAgentLoop(name: string): void;
123
+ export declare function listAgentLoops(): readonly AgentLoopRegistration[];
124
+ /**
125
+ * Pick the registration for a model string: the highest-priority one whose
126
+ * `matches` returns true. The local grounder's match-anything floor guarantees
127
+ * a result, so this never throws.
128
+ */
129
+ export declare function selectAgentLoopRegistration(modelString: string): AgentLoopRegistration;
130
+ /** Resolve + instantiate the loop for a model string. */
131
+ export declare function createAgentLoop(modelString: string, deps: AgentLoopDeps): AgentLoop;
132
+ /** Test helper — restore the registry to just the built-in local grounder. */
133
+ export declare function _resetAgentLoopsForTests(): void;
134
+ //# sourceMappingURL=agent-loop.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"agent-loop.d.ts","sourceRoot":"","sources":["../../src/actor/agent-loop.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AACnD,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AAC7D,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,yBAAyB,CAAC;AACrD,OAAO,EAAE,KAAK,KAAK,EAA+B,MAAM,YAAY,CAAC;AACrE,OAAO,EAAE,KAAK,EAA2B,MAAM,YAAY,CAAC;AAE5D,OAAO,KAAK,EAAE,aAAa,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAEjE,qEAAqE;AACrE,eAAO,MAAM,wBAAwB,mBAAmB,CAAC;AAEzD,2DAA2D;AAC3D,eAAO,MAAM,kBAAkB,4BAA4B,CAAC;AAE5D,MAAM,WAAW,cAAc;IAC7B,KAAK,EAAE,KAAK,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,GAAG,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;CACvC;AAED,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,KAAK,CAAC;IACb,QAAQ,EAAE,GAAG,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;IACtC,eAAe,EAAE,MAAM,CAAC;IACxB,qDAAqD;IACrD,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,sDAAsD;IACtD,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;;;;GAKG;AACH;;;;GAIG;AACH,MAAM,WAAW,cAAc;IAC7B,gEAAgE;IAChE,WAAW,EAAE,MAAM,CAAC;IACpB,0DAA0D;IAC1D,SAAS,EAAE,MAAM,CAAC;IAClB,8DAA8D;IAC9D,cAAc,EAAE,MAAM,CAAC;IACvB,0EAA0E;IAC1E,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAED,MAAM,WAAW,SAAS;IACxB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,WAAW,CAAC,KAAK,EAAE,cAAc,GAAG,OAAO,CAAC,aAAa,CAAC,CAAC;IAC3D,YAAY,CAAC,KAAK,EAAE,iBAAiB,GAAG,OAAO,CAAC,eAAe,GAAG,IAAI,CAAC,CAAC;IACxE,sEAAsE;IACtE,QAAQ,CAAC,IAAI,cAAc,CAAC;CAC7B;AAED,MAAM,WAAW,aAAa;IAC5B,OAAO,EAAE,aAAa,GAAG,IAAI,CAAC;IAC9B,mDAAmD;IACnD,QAAQ,EAAE,MAAM,KAAK,GAAG,IAAI,CAAC;IAC7B,qCAAqC;IACrC,KAAK,CAAC,EAAE,KAAK,CAAC;IACd,qCAAqC;IACrC,KAAK,CAAC,EAAE,KAAK,GAAG,IAAI,CAAC;CACtB;AAED,MAAM,WAAW,qBAAqB;IACpC,oDAAoD;IACpD,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,iDAAiD;IACjD,OAAO,EAAE,CAAC,WAAW,EAAE,MAAM,KAAK,OAAO,CAAC;IAC1C,sCAAsC;IACtC,MAAM,EAAE,CAAC,IAAI,EAAE,aAAa,KAAK,SAAS,CAAC;IAC3C,gEAAgE;IAChE,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAID;;;;GAIG;AACH,qBAAa,iBAAkB,YAAW,SAAS;IACjD,QAAQ,CAAC,IAAI,oBAA4B;IACzC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAU;IAClC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAQ;gBAElB,IAAI,EAAE,aAAa;IAc/B,WAAW,CAAC,KAAK,EAAE,cAAc,GAAG,OAAO
,CAAC,aAAa,CAAC;IAIpD,YAAY,CAChB,KAAK,EAAE,iBAAiB,GACvB,OAAO,CAAC,eAAe,GAAG,IAAI,CAAC;IAkBlC,4EAA4E;IAC5E,cAAc;IAId,4DAA4D;IAC5D,QAAQ,IAAI,cAAc;CAG3B;AAID;;;;GAIG;AACH,wBAAgB,kBAAkB,CAChC,MAAM,EAAE,MAAM,GACb,CAAC,WAAW,EAAE,MAAM,KAAK,OAAO,CAYlC;AAMD,oDAAoD;AACpD,wBAAgB,iBAAiB,CAAC,YAAY,EAAE,qBAAqB,GAAG,IAAI,CAE3E;AAED,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI,CAEtD;AAED,wBAAgB,cAAc,IAAI,SAAS,qBAAqB,EAAE,CAIjE;AA0BD;;;;GAIG;AACH,wBAAgB,2BAA2B,CACzC,WAAW,EAAE,MAAM,GAClB,qBAAqB,CAevB;AAED,yDAAyD;AACzD,wBAAgB,eAAe,CAC7B,WAAW,EAAE,MAAM,EACnB,IAAI,EAAE,aAAa,GAClB,SAAS,CAEX;AAED,8EAA8E;AAC9E,wBAAgB,wBAAwB,IAAI,IAAI,CAG/C"}
@@ -0,0 +1,87 @@
1
+ /**
2
+ * WS7 ↔ AOSP — Privileged-input actor.
3
+ *
4
+ * In the consumer build the cascade routes gestures through
5
+ * `MobileComputerInterface` → `AccessibilityGestureDescription` — which is
6
+ * coarse and blocks on touch-recognizer state in some apps (banking, DRM
7
+ * video, anything that sets `filterTouchesWhenObscured`).
8
+ *
9
+ * In an AOSP system-app build (see `docs/AOSP_SYSTEM_APP.md`), the
10
+ * privileged path uses `InputManager.injectInputEvent()` directly. That
11
+ * path lives behind `AospPrivilegedBridge.injectMotionEvent(...)` on the
12
+ * Kotlin side; the consumer-flavor `AospPrivilegedBridge` exports
13
+ * `createIfAvailable(): null` so this actor stays inert until a real AOSP
14
+ * bridge is linked in.
15
+ *
16
+ * `AospInputActor` maps a resolved WS7 `ProposedAction` (display-local
17
+ * pixel coords) into the privileged-bridge calls. It does NOT implement
18
+ * the `Actor` "grounding" contract — grounding stays with the OCR/AX or
19
+ * VLM actor; this is purely an *input-dispatch* shim. It's surfaced as
20
+ * an alternative to `ComputerInterface` for AOSP builds: the agent loop
21
+ * picks `AospInputActor.execute(action)` instead of `dispatch(action, {
22
+ * interface, ... })` when the privileged bridge is available.
23
+ */
24
+ import type { ActionResult, ProposedAction } from "./types.js";
25
+ /** Minimal Kotlin-side surface this actor needs from the AOSP build. */
26
+ export interface AospPrivilegedInputBridge {
27
+ /**
28
+ * Inject a single motion event at the InputManager level. `action` follows
29
+ * `MotionEvent.ACTION_*` constants (DOWN=0, UP=1, MOVE=2). `downTimeMs`
30
+ * is the original-touch timestamp the gesture started at, in `uptimeMillis`
31
+ * units. Implementations enforce the INJECT_EVENTS permission.
32
+ */
33
+ injectMotionEvent(args: {
34
+ x: number;
35
+ y: number;
36
+ action: number;
37
+ downTimeMs: number;
38
+ }): Promise<{
39
+ ok: boolean;
40
+ }>;
41
+ /** Capture the primary display frame buffer; the returned promise resolves with the JPEG bytes. */
42
+ captureDisplayFrameBuffer?(): Promise<Uint8Array>;
43
+ }
44
+ export interface AospInputActorDeps {
45
+ /** Returns the AOSP bridge handle, or null in consumer builds. */
46
+ getBridge: () => AospPrivilegedInputBridge | null;
47
+ /** Override the clock for tests. */
48
+ now?: () => number;
49
+ }
50
+ /**
51
+ * Motion-event action constants matching `android.view.MotionEvent.ACTION_*`.
52
+ * Re-exported here so callers don't need to import Android Kotlin enums.
53
+ */
54
+ export declare const MOTION_EVENT_ACTION_DOWN: 0;
55
+ export declare const MOTION_EVENT_ACTION_UP: 1;
56
+ export declare const MOTION_EVENT_ACTION_MOVE: 2;
57
+ /**
58
+ * Translate a cascade-resolved `ProposedAction` into one or more
59
+ * `injectMotionEvent` calls. Returns the same `ActionResult` envelope the
60
+ * desktop dispatcher uses — invalid args or driver errors do not throw.
61
+ *
62
+ * Behavior parity with `dispatch.ts`:
63
+ * - unknown action.kind → invalid_args
64
+ * - missing coords → invalid_args
65
+ * - bridge throw → driver_error
66
+ * - bridge ok:false → driver_error
67
+ *
68
+ * Coverage:
69
+ * - click / double_click / right_click → tap(s)
70
+ * - drag → DOWN at start, MOVE/UP at end
71
+ * - scroll → swipe (DOWN, MOVE, UP)
72
+ * - wait / finish → success: true (no input event)
73
+ * - type / key / hotkey → invalid_args (use AccessibilityNodeInfo
74
+ * or a separate keymap actor; out of
75
+ * scope for this privileged path).
76
+ */
77
+ export declare class AospInputActor {
78
+ private readonly deps;
79
+ readonly name = "aosp-input";
80
+ constructor(deps: AospInputActorDeps);
81
+ execute(action: ProposedAction): Promise<ActionResult>;
82
+ private tap;
83
+ private swipe;
84
+ private must;
85
+ private now;
86
+ }
87
+ //# sourceMappingURL=aosp-input-actor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"aosp-input-actor.d.ts","sourceRoot":"","sources":["../../src/actor/aosp-input-actor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAGH,OAAO,KAAK,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAK/D,wEAAwE;AACxE,MAAM,WAAW,yBAAyB;IACxC;;;;;OAKG;IACH,iBAAiB,CAAC,IAAI,EAAE;QACtB,CAAC,EAAE,MAAM,CAAC;QACV,CAAC,EAAE,MAAM,CAAC;QACV,MAAM,EAAE,MAAM,CAAC;QACf,UAAU,EAAE,MAAM,CAAC;KACpB,GAAG,OAAO,CAAC;QAAE,EAAE,EAAE,OAAO,CAAA;KAAE,CAAC,CAAC;IAC7B,0EAA0E;IAC1E,yBAAyB,CAAC,IAAI,OAAO,CAAC,UAAU,CAAC,CAAC;CACnD;AAED,MAAM,WAAW,kBAAkB;IACjC,kEAAkE;IAClE,SAAS,EAAE,MAAM,yBAAyB,GAAG,IAAI,CAAC;IAClD,oCAAoC;IACpC,GAAG,CAAC,EAAE,MAAM,MAAM,CAAC;CACpB;AAED;;;GAGG;AACH,eAAO,MAAM,wBAAwB,EAAG,CAAU,CAAC;AACnD,eAAO,MAAM,sBAAsB,EAAG,CAAU,CAAC;AACjD,eAAO,MAAM,wBAAwB,EAAG,CAAU,CAAC;AAEnD;;;;;;;;;;;;;;;;;;;GAmBG;AACH,qBAAa,cAAc;IAGb,OAAO,CAAC,QAAQ,CAAC,IAAI;IAFjC,QAAQ,CAAC,IAAI,gBAAgB;gBAEA,IAAI,EAAE,kBAAkB;IAE/C,OAAO,CAAC,MAAM,EAAE,cAAc,GAAG,OAAO,CAAC,YAAY,CAAC;YAwH9C,GAAG;YA0BH,KAAK;YAsCL,IAAI;IAYlB,OAAO,CAAC,GAAG;CAGZ"}
@@ -0,0 +1,195 @@
1
+ /**
2
+ * WS7 — Brain (full-screen reasoning).
3
+ *
4
+ * Sends one image per display (each downscaled to ~1.3 MP, the OS-Atlas /
5
+ * Qwen3-VL `max_pixels` convention) to `runtime.useModel(IMAGE_DESCRIPTION,
6
+ * ...)`. The model is prompted to emit a JSON `BrainOutput` describing:
7
+ * - the scene in one paragraph,
8
+ * - which display to act on,
9
+ * - up to N ROIs the Actor should zoom into,
10
+ * - a single proposed action with rationale.
11
+ *
12
+ * The Brain itself doesn't dispatch — it just produces `BrainOutput`. The
13
+ * cascade ("ScreenSeekeR") is the orchestrator that takes a `BrainOutput`,
14
+ * optionally calls the Actor on cropped ROIs, and produces a concrete
15
+ * `ProposedAction` for the dispatcher.
16
+ *
17
+ * Image transport contract: we pass `imageUrl` as a `data:image/png;base64,...`
18
+ * URL. The WS2 MemoryArbiter intercepts at `ModelType.IMAGE_DESCRIPTION` and
19
+ * routes through its content-hash cache, so identical frames don't burn
20
+ * inference budget twice.
21
+ *
22
+ * Image policy (#9105): the compact scene already carries OCR text + AX boxes,
23
+ * which can suffice to pick the next target, so the `"on-escalation"` policy
24
+ * plans from that text-only context with NO image — routed through a TEXT model,
25
+ * since every IMAGE_DESCRIPTION provider rejects an empty imageUrl — and
26
+ * attaches the ~1.3 MP frame only when the planned target cannot be grounded
27
+ * against the OCR/AX boxes. The DEFAULT is `"always"` (legacy: image on every
28
+ * call) until a real-model CUA trajectory validates imageless planning accuracy;
29
+ * operators opt into `"on-escalation"` via the `COMPUTERUSE_BRAIN_IMAGE_POLICY`
30
+ * setting. `"never"` never attaches pixels.
31
+ *
32
+ * Parse strictness:
33
+ * - We try to parse the response as JSON (either the literal string or
34
+ * `result.description`).
35
+ * - On first parse failure, retry once with a stricter prompt.
36
+ * - On second failure, throw `BrainParseError` — the cascade surfaces this
37
+ * as a structured `ActionResult.error` and aborts the turn cleanly.
38
+ */
39
+ import { type IAgentRuntime, type ImageDescriptionResult } from "@elizaos/core";
40
+ import type { DisplayCapture } from "../platform/capture.js";
41
+ import type { Scene } from "../scene/scene-types.js";
42
+ import type { BrainOutput } from "./types.js";
43
+ export declare const BRAIN_MAX_PIXELS = 1310720;
44
+ export declare const BRAIN_MAX_ROIS = 2;
45
+ /** Bound on the per-Brain dHash→BrainOutput cache (LRU-ish, oldest evicted). */
46
+ export declare const BRAIN_DHASH_CACHE_MAX = 16;
47
+ /**
48
+ * dHash Hamming threshold for cached-plan reuse (#9581 continuous-understanding
49
+ * tuning). Exact-equality (distance 0) re-burned the IMAGE_DESCRIPTION model on
50
+ * cosmetically-identical frames — cursor jitter, a blinking caret, anti-aliasing,
51
+ * and tiny scroll noise all flip a few dHash bits.
52
+ *
53
+ * This mirrors `SCREEN_STATE_HAMMING_THRESHOLD`: frames at a distance below the
54
+ * threshold count as unchanged; frames at or above it count as changed and must re-plan.
55
+ */
56
+ export declare const BRAIN_DHASH_HAMMING_THRESHOLD = 5;
57
+ /**
58
+ * Source pixels per estimated image token for a vision model with the Qwen3-VL /
59
+ * OS-Atlas `max_pixels` convention: one visual token ≈ a 28×28 patch (784 px, approximated as 750).
60
+ * Used only to quantify the saving when a frame is *not* attached (#9105).
61
+ */
62
+ export declare const BRAIN_PIXELS_PER_IMAGE_TOKEN = 750;
63
+ /**
64
+ * When to attach the raw screenshot to the planning model (#9105):
65
+ * - `"always"` — attach the pixels on every call (legacy behaviour). Default.
66
+ * - `"on-escalation"` — plan from the compact OCR/AX scene with no image
67
+ * first (routed through a TEXT model); attach pixels
68
+ * only when the planned target cannot be grounded
69
+ * against the OCR/AX boxes or a strict-retry fires.
70
+ * - `"never"` — never attach the screenshot; plan from the scene alone.
71
+ */
72
+ export type BrainImagePolicy = "always" | "on-escalation" | "never";
73
+ /**
74
+ * Default is `"always"` (proven legacy behaviour). `"on-escalation"` cuts the
75
+ * dominant per-frame image-token cost but plans the first pass blind to the
76
+ * pixels, so it stays opt-in (via the `COMPUTERUSE_BRAIN_IMAGE_POLICY` runtime
77
+ * setting / env var) until a real-model CUA trajectory validates its accuracy.
78
+ */
79
+ export declare const DEFAULT_BRAIN_IMAGE_POLICY: BrainImagePolicy;
80
+ /**
81
+ * Resolve the Brain image policy from the `COMPUTERUSE_BRAIN_IMAGE_POLICY`
82
+ * runtime setting / env var, falling back to {@link DEFAULT_BRAIN_IMAGE_POLICY}.
83
+ * The operator escape hatch to enable imageless planning without a code change
84
+ * once it is validated against a real model.
85
+ */
86
+ export declare function resolveBrainImagePolicy(runtime: IAgentRuntime | null): BrainImagePolicy;
87
+ /** Token-accounting snapshot for a Brain instance (#9105 M3). */
88
+ export interface BrainStats {
89
+ /** IMAGE_DESCRIPTION model calls actually issued. */
90
+ invocations: number;
91
+ /** Describe calls served from the frame-dHash cache (no model call). */
92
+ cacheHits: number;
93
+ /**
94
+ * Model calls issued with NO screenshot attached (the scene's OCR/AX text
95
+ * sufficed). Each one saved roughly one full-frame image's worth of tokens.
96
+ */
97
+ imagelessCalls: number;
98
+ /**
99
+ * Estimated image tokens NOT sent because of imageless calls — the sum of
100
+ * `(width * height) / BRAIN_PIXELS_PER_IMAGE_TOKEN` over every imageless
101
+ * call, capped per frame at `BRAIN_MAX_PIXELS`.
102
+ */
103
+ estImageTokensSaved: number;
104
+ }
105
+ export declare class BrainParseError extends Error {
106
+ readonly raw: string;
107
+ constructor(message: string, raw: string);
108
+ }
109
+ export interface BrainDeps {
110
+ /** Optional override for tests — bypasses runtime.useModel. */
111
+ invokeModel?: (args: {
112
+ imageUrl: string;
113
+ prompt: string;
114
+ displayId: number;
115
+ }) => Promise<string | ImageDescriptionResult>;
116
+ /**
117
+ * When to attach the raw screenshot to the planning model (#9105). Defaults
118
+ * to `"always"` (legacy); see {@link resolveBrainImagePolicy} for the
119
+ * operator opt-in to `"on-escalation"`.
120
+ */
121
+ imagePolicy?: BrainImagePolicy;
122
+ }
123
+ export interface BrainInput {
124
+ scene: Scene;
125
+ goal: string;
126
+ /**
127
+ * Per-display capture buffers. If a display from `scene.displays` is
128
+ * missing here, the Brain skips it. The cascade is responsible for
129
+ * supplying these alongside the scene.
130
+ */
131
+ captures: Map<number, DisplayCapture>;
132
+ }
133
+ /**
134
+ * Pure description of a "Brain" call. Created by `Cascade.runCascade` and
135
+ * test fixtures.
136
+ */
137
+ export declare class Brain {
138
+ private readonly runtime;
139
+ private readonly deps;
140
+ /**
141
+ * Frame-dHash → BrainOutput cache. The WS2 MemoryArbiter only dedups the
142
+ * IMAGE_DESCRIPTION call for LOCAL backends; the remote/cloud path bypasses
143
+ * it, so an identical screen re-burns tokens every step. This call-site cache
144
+ * skips the model entirely when the same frame is observed for the same goal,
145
+ * cutting the dominant CUA-loop token cost regardless of backend (#9105 M3).
146
+ */
147
+ private readonly dhashCache;
148
+ private readonly imagePolicy;
149
+ private invocations;
150
+ private cacheHits;
151
+ private imagelessCalls;
152
+ private estImageTokensSaved;
153
+ constructor(runtime: IAgentRuntime | null, deps?: BrainDeps);
154
+ /** Token-accounting snapshot (model calls, cache hits, imageless savings). */
155
+ getStats(): BrainStats;
156
+ private cacheKey;
157
+ /**
158
+ * Return a cached plan for the same goal whose frame is within
159
+ * `BRAIN_DHASH_HAMMING_THRESHOLD` bits of `dh` — a near-identical screen, not
160
+ * just a byte-identical one. On a hit the entry is moved to the end (LRU), so
161
+ * a steadily-evolving screen keeps its most-recent close match warm.
162
+ */
163
+ private findCached;
164
+ private rememberOutput;
165
+ observeAndPlan(input: BrainInput): Promise<BrainOutput>;
166
+ /**
167
+ * True when the imageless plan needs no screenshot to dispatch: a
168
+ * non-coordinate action (type/hotkey/key/wait/finish) carries everything in
169
+ * its args, and a coordinate action is fine when its `ref`/rationale resolves
170
+ * to a concrete OCR/AX box. When the target cannot be grounded we escalate to
171
+ * the pixels — correctness over token saving.
172
+ */
173
+ private resolvesWithoutImage;
174
+ private recordImageless;
175
+ private invoke;
176
+ }
177
+ export declare function brainPromptFor(compactSceneJson: string, goal: string, strict: boolean): string;
178
+ export declare function parseBrainOutput(raw: string): BrainOutput;
179
+ /**
180
+ * Encode a PNG buffer for transport to the IMAGE_DESCRIPTION model. We don't
181
+ * resize here — `runtime.useModel` adapters (and any vLLM backends behind
182
+ * them) handle the `max_pixels` downscale. The constant `BRAIN_MAX_PIXELS`
183
+ * is exported for the cascade so it can crop ROIs at the right native
184
+ * resolution before invoking the Actor.
185
+ */
186
+ export declare function encodeForBrain(png: Buffer): Promise<string>;
187
+ /**
188
+ * Estimate the visual tokens a frame would have cost the planning model, for
189
+ * the "tokens saved by going imageless" telemetry (#9105). The backends apply a
190
+ * `max_pixels` downscale, so the per-frame estimate is capped at
191
+ * `BRAIN_MAX_PIXELS`. Falls back to the cap when the PNG header is unreadable —
192
+ * an attached frame would have been downscaled to that ceiling regardless.
193
+ */
194
+ export declare function estimateImageTokens(png: Buffer): number;
195
+ //# sourceMappingURL=brain.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"brain.d.ts","sourceRoot":"","sources":["../../src/actor/brain.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AAEH,OAAO,EACL,KAAK,aAAa,EAClB,KAAK,sBAAsB,EAG5B,MAAM,eAAe,CAAC;AACvB,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AAE7D,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,yBAAyB,CAAC;AAGrD,OAAO,KAAK,EAAE,WAAW,EAAiC,MAAM,YAAY,CAAC;AAE7E,eAAO,MAAM,gBAAgB,UAAY,CAAC;AAC1C,eAAO,MAAM,cAAc,IAAI,CAAC;AAChC,gFAAgF;AAChF,eAAO,MAAM,qBAAqB,KAAK,CAAC;AACxC;;;;;;;;GAQG;AACH,eAAO,MAAM,6BAA6B,IAAI,CAAC;AAC/C;;;;GAIG;AACH,eAAO,MAAM,4BAA4B,MAAM,CAAC;AAEhD;;;;;;;;GAQG;AACH,MAAM,MAAM,gBAAgB,GAAG,QAAQ,GAAG,eAAe,GAAG,OAAO,CAAC;AAEpE;;;;;GAKG;AACH,eAAO,MAAM,0BAA0B,EAAE,gBAA2B,CAAC;AAErE;;;;;GAKG;AACH,wBAAgB,uBAAuB,CACrC,OAAO,EAAE,aAAa,GAAG,IAAI,GAC5B,gBAAgB,CAOlB;AAED,iEAAiE;AACjE,MAAM,WAAW,UAAU;IACzB,qDAAqD;IACrD,WAAW,EAAE,MAAM,CAAC;IACpB,wEAAwE;IACxE,SAAS,EAAE,MAAM,CAAC;IAClB;;;OAGG;IACH,cAAc,EAAE,MAAM,CAAC;IACvB;;;;OAIG;IACH,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAmDD,qBAAa,eAAgB,SAAQ,KAAK;aAGtB,GAAG,EAAE,MAAM;gBAD3B,OAAO,EAAE,MAAM,EACC,GAAG,EAAE,MAAM;CAK9B;AAED,MAAM,WAAW,SAAS;IACxB,+DAA+D;IAC/D,WAAW,CAAC,EAAE,CAAC,IAAI,EAAE;QACnB,QAAQ,EAAE,MAAM,CAAC;QACjB,MAAM,EAAE,MAAM,CAAC;QACf,SAAS,EAAE,MAAM,CAAC;KACnB,KAAK,OAAO,CAAC,MAAM,GAAG,sBAAsB,CAAC,CAAC;IAC/C;;;;OAIG;IACH,WAAW,CAAC,EAAE,gBAAgB,CAAC;CAChC;AAED,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,KAAK,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb;;;;OAIG;IACH,QAAQ,EAAE,GAAG,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;CACvC;AAED;;;GAGG;AACH,qBAAa,KAAK;IAqBd,OAAO,CAAC,QAAQ,CAAC,OAAO;IACxB,OAAO,CAAC,QAAQ,CAAC,IAAI;IArBvB;;;;;;OAMG;IACH,OAAO,CAAC,QAAQ,CAAC,UAAU,CAKnB;IACR,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAmB;IAC/C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,SAAS,CAAK;IACtB,OAAO,CAAC,cAAc,CAAK;IAC3B,OAAO,CAAC,mBAAmB,CAAK;gBAGb,OAAO,EAAE,aAAa,GAAG,IAAI,EAC7B,IAAI,GAAE,SAAc;IAKvC,8EAA8E;IAC9E,QAAQ,IAAI,UAAU;IAStB,OAAO,CAAC,QAAQ;IAShB;;;;;OAKG;IACH,OAAO,CAAC,UAAU;IAoBlB,OAAO,CAAC,cAAc;IAmBhB,cAAc,CAAC,KAAK,EAAE,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC;IA4I7D;;;;;;OAMG;IACH,OAAO,CA
AC,oBAAoB;IAa5B,OAAO,CAAC,eAAe;YAKT,MAAM;CA4BrB;AAID,wBAAgB,cAAc,CAC5B,gBAAgB,EAAE,MAAM,EACxB,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,OAAO,GACd,MAAM,CAgCR;AAID,wBAAgB,gBAAgB,CAAC,GAAG,EAAE,MAAM,GAAG,WAAW,CA6FzD;AAkBD;;;;;;GAMG;AACH,wBAAsB,cAAc,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAEjE;AAED;;;;;;GAMG;AACH,wBAAgB,mBAAmB,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAMvD"}
@@ -0,0 +1,92 @@
1
+ /**
2
+ * WS7 — ScreenSeekeR cascade.
3
+ *
4
+ * Step 1: Brain looks at the full-screen scene for `target_display_id`.
5
+ * Step 2: For each ROI (up to BRAIN_MAX_ROIS), crop the *native resolution*
6
+ * region from the captured PNG and hand it to the Actor for
7
+ * fine grounding. The Actor returns display-local coords.
8
+ * Step 3: Combine the Brain's proposed action with the Actor's coordinates
9
+ * (or fall back to the OCR/AX deterministic actor on `ref`) and
10
+ * produce a single `ProposedAction` for the dispatcher.
11
+ *
12
+ * Cropping notes:
13
+ * - We do NOT decode the PNG. The cropped buffer is what we hand to the
14
+ * Actor. For the built-in OCR/AX actor, the crop is just a pass-through.
15
+ * - When a real PNG cropper is wired in (sharp / native module), this
16
+ * module is the place to add it: `cropPngToRoi(frame, bbox)`.
17
+ * - The cascade tests use the actual frame bytes; assertions are on the
18
+ * resolved coords and the order of Actor calls.
19
+ */
20
+ import type { DisplayCapture } from "../platform/capture.js";
21
+ import type { Scene } from "../scene/scene-types.js";
22
+ import type { Actor } from "./actor.js";
23
+ import { type Brain } from "./brain.js";
24
+ import type { BrainRoi, CascadeResult } from "./types.js";
25
+ export interface CascadeDeps {
26
+ brain: Brain;
27
+ actor?: Actor | null;
28
+ /** Cropper override (mostly tests). Returns a Buffer for the bbox region. */
29
+ crop?: (frame: Buffer, bbox: [number, number, number, number]) => Buffer;
30
+ }
31
+ export interface CascadeInput {
32
+ scene: Scene;
33
+ goal: string;
34
+ /** Per-display PNG captures, keyed by displayId. */
35
+ captures: Map<number, DisplayCapture>;
36
+ }
37
+ /** Grounding-cache accounting (#9105 M5). */
38
+ export interface CascadeGroundStats {
39
+ /** Grounding resolutions served from the per-Scene cache. */
40
+ hits: number;
41
+ /** Grounding resolutions that ran the full resolve (OCR/AX + optional actor). */
42
+ misses: number;
43
+ }
44
+ export declare class Cascade {
45
+ private readonly deps;
46
+ /**
47
+ * Per-Scene grounding cache (predict/ground split, #9105 M5). Grounding the
48
+ * same target (ref or rationale) on the same Scene is deterministic, so the
49
+ * cheap GROUND step is memoized — re-grounding within a turn skips a repeat
50
+ * `resolveReference` (OCR/AX) scan and a repeat (possibly model-backed)
51
+ * `actor.ground` call. Keyed by Scene timestamp so a new screen invalidates.
52
+ */
53
+ private readonly groundCache;
54
+ private groundStats;
55
+ constructor(deps: CascadeDeps);
56
+ /** Grounding cache hit/miss snapshot for token/work accounting. */
57
+ getGroundStats(): CascadeGroundStats;
58
+ run(input: CascadeInput): Promise<CascadeResult>;
59
+ /**
60
+ * Grounding-only entry (the `predict_click` half of the predict/ground split,
61
+ * #9105 M5 / #9170 M10). Resolves a `ref` (OCR/AX id) or free-form
62
+ * `instruction` to a display-local coordinate WITHOUT running the Brain —
63
+ * agent loops that do their own step planning (Anthropic / OpenAI
64
+ * computer-use) call this to reuse our deterministic OCR/AX + actor grounding
65
+ * and its per-Scene cache. Returns `null` when nothing can be grounded.
66
+ */
67
+ groundTarget(args: {
68
+ scene: Scene;
69
+ captures: Map<number, DisplayCapture>;
70
+ targetDisplayId: number;
71
+ ref?: string;
72
+ instruction?: string;
73
+ /** Optional ROI to ground inside when no `ref` is available. */
74
+ roi?: BrainRoi;
75
+ }): Promise<{
76
+ displayId: number;
77
+ x: number;
78
+ y: number;
79
+ } | null>;
80
+ private resolveBrainOutput;
81
+ private resolveCoords;
82
+ /** Drop cache entries from any Scene other than `timestamp`, then store. */
83
+ private rememberGround;
84
+ private resolveCoordsUncached;
85
+ private coordsForRef;
86
+ private groundReference;
87
+ private groundRoi;
88
+ private cropFrame;
89
+ }
90
+ export declare function setActor(actor: Actor | null): void;
91
+ export declare function getRegisteredActor(): Actor | null;
92
+ //# sourceMappingURL=cascade.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cascade.d.ts","sourceRoot":"","sources":["../../src/actor/cascade.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAGH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AAC7D,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,yBAAyB,CAAC;AACrD,OAAO,KAAK,EAAE,KAAK,EAAmB,MAAM,YAAY,CAAC;AAEzD,OAAO,EAAkB,KAAK,KAAK,EAAE,MAAM,YAAY,CAAC;AACxD,OAAO,KAAK,EAEV,QAAQ,EACR,aAAa,EAEd,MAAM,YAAY,CAAC;AAEpB,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAE,KAAK,CAAC;IACb,KAAK,CAAC,EAAE,KAAK,GAAG,IAAI,CAAC;IACrB,6EAA6E;IAC7E,IAAI,CAAC,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,KAAK,MAAM,CAAC;CAC1E;AAED,MAAM,WAAW,YAAY;IAC3B,KAAK,EAAE,KAAK,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,oDAAoD;IACpD,QAAQ,EAAE,GAAG,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;CACvC;AAED,6CAA6C;AAC7C,MAAM,WAAW,kBAAkB;IACjC,6DAA6D;IAC7D,IAAI,EAAE,MAAM,CAAC;IACb,iFAAiF;IACjF,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,qBAAa,OAAO;IAcN,OAAO,CAAC,QAAQ,CAAC,IAAI;IAbjC;;;;;;OAMG;IACH,OAAO,CAAC,QAAQ,CAAC,WAAW,CAGxB;IACJ,OAAO,CAAC,WAAW,CAA8C;gBAEpC,IAAI,EAAE,WAAW;IAE9C,mEAAmE;IACnE,cAAc,IAAI,kBAAkB;IAI9B,GAAG,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,aAAa,CAAC;IAStD;;;;;;;OAOG;IACG,YAAY,CAAC,IAAI,EAAE;QACvB,KAAK,EAAE,KAAK,CAAC;QACb,QAAQ,EAAE,GAAG,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;QACtC,eAAe,EAAE,MAAM,CAAC;QACxB,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,gEAAgE;QAChE,GAAG,CAAC,EAAE,QAAQ,CAAC;KAChB,GAAG,OAAO,CAAC;QAAE,SAAS,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE,GAAG,IAAI,CAAC;YAsBjD,kBAAkB;YAwJlB,aAAa;IA2B3B,4EAA4E;IAC5E,OAAO,CAAC,cAAc;YAYR,qBAAqB;YAkCrB,YAAY;YAUZ,eAAe;YAiDf,SAAS;IAiCvB,OAAO,CAAC,SAAS;CAYlB;AAQD,wBAAgB,QAAQ,CAAC,KAAK,EAAE,KAAK,GAAG,IAAI,GAAG,IAAI,CAElD;AACD,wBAAgB,kBAAkB,IAAI,KAAK,GAAG,IAAI,CAEjD"}