@onkernel/cua-ai 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/CHANGELOG.md +100 -0
  2. package/README.md +341 -65
  3. package/dist/chunk-D7D4PA-g.js +13 -0
  4. package/dist/index.d.ts +576 -10
  5. package/dist/index.js +1999 -11
  6. package/docs/supported-models.md +77 -0
  7. package/examples/quickstart.ts +28 -22
  8. package/package.json +10 -6
  9. package/dist/api-keys.d.ts +0 -8
  10. package/dist/api-keys.d.ts.map +0 -1
  11. package/dist/api-keys.js +0 -48
  12. package/dist/api-keys.js.map +0 -1
  13. package/dist/index.d.ts.map +0 -1
  14. package/dist/index.js.map +0 -1
  15. package/dist/models.d.ts +0 -33
  16. package/dist/models.d.ts.map +0 -1
  17. package/dist/models.js +0 -159
  18. package/dist/models.js.map +0 -1
  19. package/dist/providers/anthropic/index.d.ts +0 -10
  20. package/dist/providers/anthropic/index.d.ts.map +0 -1
  21. package/dist/providers/anthropic/index.js +0 -16
  22. package/dist/providers/anthropic/index.js.map +0 -1
  23. package/dist/providers/common.d.ts +0 -111
  24. package/dist/providers/common.d.ts.map +0 -1
  25. package/dist/providers/common.js +0 -138
  26. package/dist/providers/common.js.map +0 -1
  27. package/dist/providers/gemini/index.d.ts +0 -11
  28. package/dist/providers/gemini/index.d.ts.map +0 -1
  29. package/dist/providers/gemini/index.js +0 -14
  30. package/dist/providers/gemini/index.js.map +0 -1
  31. package/dist/providers/openai/index.d.ts +0 -8
  32. package/dist/providers/openai/index.d.ts.map +0 -1
  33. package/dist/providers/openai/index.js +0 -22
  34. package/dist/providers/openai/index.js.map +0 -1
  35. package/dist/providers/tzafon/index.d.ts +0 -12
  36. package/dist/providers/tzafon/index.d.ts.map +0 -1
  37. package/dist/providers/tzafon/index.js +0 -18
  38. package/dist/providers/tzafon/index.js.map +0 -1
  39. package/dist/providers/tzafon/provider.d.ts +0 -8
  40. package/dist/providers/tzafon/provider.d.ts.map +0 -1
  41. package/dist/providers/tzafon/provider.js +0 -234
  42. package/dist/providers/tzafon/provider.js.map +0 -1
  43. package/dist/providers/yutori/index.d.ts +0 -12
  44. package/dist/providers/yutori/index.d.ts.map +0 -1
  45. package/dist/providers/yutori/index.js +0 -23
  46. package/dist/providers/yutori/index.js.map +0 -1
  47. package/dist/providers/yutori/provider.d.ts +0 -9
  48. package/dist/providers/yutori/provider.d.ts.map +0 -1
  49. package/dist/providers/yutori/provider.js +0 -307
  50. package/dist/providers/yutori/provider.js.map +0 -1
  51. package/dist/providers.d.ts +0 -6
  52. package/dist/providers.d.ts.map +0 -1
  53. package/dist/providers.js +0 -26
  54. package/dist/providers.js.map +0 -1
  55. package/dist/runtime-spec.d.ts +0 -29
  56. package/dist/runtime-spec.d.ts.map +0 -1
  57. package/dist/runtime-spec.js +0 -58
  58. package/dist/runtime-spec.js.map +0 -1
package/CHANGELOG.md CHANGED
@@ -1,5 +1,105 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.2.1 - 2026-06-11
4
+
5
+ - Add computer-use support for the `claude-fable-5` Anthropic model.
6
+
7
+ ## 0.2.0 - 2026-06-10
8
+
9
+ ### Fixed
10
+
11
+ - The published package is now importable under plain Node ESM. 0.1.0 shipped
12
+ extensionless relative imports in `dist/`, so `import "@onkernel/cua-ai"`
13
+ failed outside bundlers; `dist/` is now bundled with tsdown.
14
+ - The shipped `examples/quickstart.ts` imports `@onkernel/cua-ai` instead of a
15
+ `../src` path that does not exist in the tarball, checks `stopReason` so
16
+ provider errors are no longer silent, resolves its API key via
17
+ `requireCuaEnvApiKeyForModel`, and switches providers with the `CUA_MODEL`
18
+ env var.
19
+ - `docs/` (the supported-models list the README links to) is now included in
20
+ the npm tarball.
21
+ - A malformed Yutori tool call now degrades to an empty-arguments call instead
22
+ of failing the entire response, matching the existing Tzafon hardening.
23
+
24
+ ### Breaking changes
25
+
26
+ - Provider namespaces follow one convention. Every namespace now exports
27
+ `computerTools({ actions? })` / `computerToolExecutors({ actions? })`,
28
+ `createActionSchema`, `coordinateSystem()`, `providerModule`,
29
+ `<PROVIDER>_CUA_ACTION_TYPES`, `<PROVIDER>_COMPUTER_INSTRUCTIONS`, a
30
+ `<Provider>Action` type, and `ComputerToolsOptions`. This replaces 0.1.0's
31
+ `createComputerToolDefinitions(options)` /
32
+ `CreateComputerToolDefinitionsOptions`, the per-namespace
33
+ `COMPUTER_TOOL_COORDINATES` constants, `TZAFON_ACTION_TYPES` /
34
+ `YUTORI_ACTION_TYPES`, and the `OPENAI_BATCH_INSTRUCTIONS` /
35
+ `GEMINI_INSTRUCTIONS_RAW` / `TZAFON_INSTRUCTIONS_RAW` /
36
+ `YUTORI_INSTRUCTIONS_RAW` prompt constants.
37
+ - `CUA_BATCH_TOOL_NAME` is now `"computer_batch"` (was
38
+ `"batch_computer_actions"`), matching the batch tool Anthropic ships by
39
+ default. `anthropic.ANTHROPIC_BATCH_TOOL_NAME` carries the same new value;
40
+ the other per-namespace batch aliases (`TZAFON_BATCH_TOOL_NAME`,
41
+ `YUTORI_BATCH_TOOL_NAME`, `*_BATCH_DESCRIPTION`, `*BatchSchema`,
42
+ `*BatchInput`) were removed — use `CUA_BATCH_TOOL_NAME`,
43
+ `CUA_BATCH_TOOL_DESCRIPTION`, `CuaBatchSchema`, and `CuaBatchInput`.
44
+ - Anthropic tools are now the 13 canonical browser actions Anthropic supports
45
+ (no `back`/`forward`/`url`) plus a `computer_batch` batch tool by default;
46
+ pass `excludeBatch: true` to omit it. Unsupported `actions` entries throw.
47
+ `anthropic.ANTHROPIC_CUA_ACTION_TYPES` reflects the supported subset rather
48
+ than aliasing the full canonical list.
49
+ - Yutori models now use Yutori's documented native `tool_set` request field.
50
+ `streamYutori` strips canonical action tools from the outbound payload
51
+ (preserve specific tools via the `keepToolNames` stream option), selects the
52
+ n1.5 core tool set where applicable, and normalizes native tool calls back
53
+ to canonical names. `yutori.providerModule.toolDefinitions()` is `[]`;
54
+ `yutori.computerTools()` builds local mirrors for executor lookup, validates
55
+ `{ actions }` against the supported subset, and throws on unsupported
56
+ actions. `yutoriBuiltinToolsOnPayload` was replaced by
57
+ `yutoriNativeToolSetOnPayload`. The Yutori runtime spec also carries a
58
+ screenshot policy (append a 1280x800 webp screenshot to the latest message).
59
+ - Family model annotations now match only the family root plus numeric
60
+ revision or dated-snapshot suffixes (`claude-opus-4-7`,
61
+ `gpt-5.5-2026-04-23`). Named sibling variants such as `gpt-5.4-mini` are no
62
+ longer listed by `listCuaModels()` or accepted by `getCuaModel()` without
63
+ their own annotation.
64
+ - `google:gemini-2.5-computer-use-preview-10-2025` was removed from the
65
+ catalog: it rejects the standard function declarations this package sends
66
+ and requires Google's native `tools.computer_use` wrapper. Use
67
+ `google:gemini-3-flash-preview` or `google:gemini-3-pro-preview`.
68
+ - `streamTzafonResponses` no longer accepts a `maxOutputTokens` option — use
69
+ the standard `maxTokens` stream option.
70
+
71
+ ### Added
72
+
73
+ - `CuaProviderModule` contract plus a `providerModule` export per namespace,
74
+ and a richer `CuaRuntimeSpec`: `toolExecutors` (local adapters that turn
75
+ provider tool calls into canonical `CuaAction`s via `CuaToolExecutorSpec`),
76
+ `coordinateSystem`, and optional `screenshot` policy alongside the existing
77
+ tool definitions, default prompt, and payload middleware.
78
+ - `resolveCuaRuntimeSpec(input, options?)` accepts `ComputerToolsOptions` and
79
+ forwards it to the provider module, so runtime consumers can narrow tool
80
+ definitions and executors (e.g. `{ actions: ["click"] }`).
81
+ - `registerCuaProviders()` is exported: importing the package still registers
82
+ the Yutori/Tzafon stream providers automatically, and this restores them
83
+ after pi-ai registry mutators (`clearApiProviders`, `resetApiProviders`,
84
+ `unregisterApiProviders`).
85
+ - `parseCuaModelRef` / `getCuaModel` accept `"gemini:"` refs as an alias for
86
+ `"google:"`, and unsupported-provider errors now list the valid providers.
87
+ - `CuaMouseButton` and `CuaDragMouseButton` closed unions type the `button`
88
+ field on click/mouse_down/mouse_up and drag actions.
89
+ - `yutori.YutoriOptions` and `tzafon.TzafonResponsesOptions` are exported and
90
+ aligned; both support `keepToolNames` to preserve caller tools that collide
91
+ with canonical action names on the wire.
92
+ - Yutori native action vocabulary exports: `YUTORI_N1_ACTION_TYPES`,
93
+ `YUTORI_N15_CORE_ACTION_TYPES`, `YUTORI_N15_EXPANDED_ACTION_TYPES`,
94
+ tool-set ids, `yutoriToolSetForModel`, `yutoriNativeActionsForModel`, and
95
+ `toCanonicalActions`; Tzafon exports `toCanonicalActions`,
96
+ `TzafonCanonicalAction`, `tzafonComputerUseOnPayload`, and
97
+ `tzafonToolCallId`.
98
+ - README and JSDoc coverage across the public surface: API key prerequisites
99
+ and helpers, error handling (`stopReason` semantics), a multi-turn
100
+ tool-result example, the complete export list, and per-provider canonical
101
+ action subsets.
102
+
3
103
  ## 0.1.0
4
104
 
5
105
  - Provider-qualified CUA model catalog with support annotations and curated overrides.
package/README.md CHANGED
@@ -10,46 +10,190 @@ for building CUA agents on Kernel.
10
10
  npm install @onkernel/cua-ai
11
11
  ```
12
12
 
13
+ ## Prerequisites
14
+
15
+ You need an API key for each provider you call. The helpers in this package
16
+ check these environment variables, in order:
17
+
18
+ | Provider | Environment variables (checked in order) |
19
+ | ----------- | ------------------------------------------- |
20
+ | `openai` | `OPENAI_API_KEY` |
21
+ | `anthropic` | `ANTHROPIC_OAUTH_TOKEN`, `ANTHROPIC_API_KEY` |
22
+ | `google` | `GOOGLE_API_KEY`, `GEMINI_API_KEY` |
23
+ | `tzafon` | `TZAFON_API_KEY` |
24
+ | `yutori` | `YUTORI_API_KEY` |
25
+
26
+ The exported helpers wrap this table:
27
+
28
+ - `cuaApiKeyEnvVarsForProvider(provider)` — the env var names for a provider
29
+ (accepts `"gemini"` as an alias for `"google"`).
30
+ - `getCuaEnvApiKey(provider)` — read the key, or `undefined` when unset.
31
+ - `requireCuaEnvApiKey(provider)` — read the key, or throw naming the
32
+ variables to set.
33
+ - `getCuaEnvApiKeyForModel(refOrModel)` / `requireCuaEnvApiKeyForModel(refOrModel)`
34
+ — the same, keyed by a model ref like `"openai:gpt-5.5"` or a concrete
35
+ `Model<Api>`.
36
+
37
+ Pass the resolved key as the `apiKey` stream option (as in the Quick Start
38
+ below) so a missing key fails loudly before any request is made. If you omit
39
+ `apiKey`, pi-ai's built-in providers fall back to their own env lookup
40
+ (`OPENAI_API_KEY`; `ANTHROPIC_OAUTH_TOKEN`/`ANTHROPIC_API_KEY`; for `google`
41
+ only `GEMINI_API_KEY`, not `GOOGLE_API_KEY`), and this package's Tzafon/Yutori
42
+ stream adapters read `TZAFON_API_KEY`/`YUTORI_API_KEY`.
43
+
13
44
  ## Quick Start
14
45
 
15
46
  ```ts
16
47
  import { readFile } from "node:fs/promises";
17
- import { complete, getCuaModel, openai } from "@onkernel/cua-ai";
18
-
19
- const screenshot = await readFile("examples/screenshot.png");
48
+ import { complete, getCuaModel, openai, requireCuaEnvApiKeyForModel } from "@onkernel/cua-ai";
20
49
 
21
50
  const model = getCuaModel("openai:gpt-5.5");
22
-
23
- const response = await complete(model, {
24
- systemPrompt: "You are a browser automation agent.",
25
- messages: [
26
- {
27
- role: "user",
28
- content: [
29
- { type: "text", text: "Click the Login button in this screenshot." },
30
- { type: "image", data: screenshot.toString("base64"), mimeType: "image/png" },
31
- ],
32
- timestamp: Date.now(),
33
- },
34
- ],
35
- tools: openai.createComputerToolDefinitions({ actions: ["click"] }),
36
- });
51
+ const apiKey = requireCuaEnvApiKeyForModel("openai:gpt-5.5"); // throws unless OPENAI_API_KEY is set
52
+
53
+ // Any screenshot of the page you want to act on, resolved relative to this
54
+ // module so the snippet does not depend on the process working directory.
55
+ const screenshot = await readFile(new URL("./screenshot.png", import.meta.url));
56
+
57
+ const response = await complete(
58
+ model,
59
+ {
60
+ systemPrompt: "You are a browser automation agent.",
61
+ messages: [
62
+ {
63
+ role: "user",
64
+ content: [
65
+ { type: "text", text: "Click the sign in / up link in this screenshot." },
66
+ { type: "image", data: screenshot.toString("base64"), mimeType: "image/png" },
67
+ ],
68
+ timestamp: Date.now(),
69
+ },
70
+ ],
71
+ tools: openai.computerTools({ actions: ["click"] }),
72
+ },
73
+ { apiKey },
74
+ );
75
+
76
+ if (response.stopReason === "error" || response.stopReason === "aborted") {
77
+ throw new Error(response.errorMessage ?? `request ended with stopReason "${response.stopReason}"`);
78
+ }
37
79
 
38
80
  for (const block of response.content) {
39
- if (block.type === "toolCall" && block.name === "click_mouse") {
81
+ if (block.type === "toolCall" && block.name === "click") {
40
82
  console.log("click:", block.arguments);
41
83
  }
42
84
  }
43
85
  ```
44
86
 
87
+ A runnable version ships at [`examples/quickstart.ts`](./examples/quickstart.ts)
88
+ (with a sample screenshot). In this repo, run it from `packages/ai` with
89
+ `npm run example:quickstart`; switch providers with the `CUA_MODEL` env var,
90
+ e.g. `CUA_MODEL=anthropic:claude-opus-4-7`.
91
+
92
+ ## Error Handling
93
+
94
+ pi-ai's `complete()` and `stream()` **resolve instead of throwing** when a
95
+ request fails. The returned `AssistantMessage` carries the outcome on
96
+ `stopReason`:
97
+
98
+ - `"stop"`, `"length"`, `"toolUse"` — success; `content` holds the response.
99
+ - `"error"` — the provider call failed (bad API key, no model access, network
100
+ error, …). `content` is empty and `errorMessage` holds the provider error.
101
+ - `"aborted"` — the request was cancelled via the `signal` stream option.
102
+
103
+ Always check `stopReason` before reading `content` — otherwise a typo'd API
104
+ key looks like a successful run that produced nothing:
105
+
106
+ ```ts
107
+ if (response.stopReason === "error" || response.stopReason === "aborted") {
108
+ throw new Error(response.errorMessage ?? `request ended with stopReason "${response.stopReason}"`);
109
+ }
110
+ ```
111
+
112
+ `getCuaModel()`, `requireCuaEnvApiKey*()`, and `computerTools({ actions })`
113
+ validate eagerly and throw regular errors.
114
+
115
+ ## Continuing the Loop
116
+
117
+ [`@onkernel/cua-agent`](https://www.npmjs.com/package/@onkernel/cua-agent)
118
+ runs this loop for you — `CuaAgent`/`CuaAgentHarness` classes with browser
119
+ execution against a Kernel browser. Reach for it first; the rest of this
120
+ section is for driving the loop yourself against your own browser stack.
121
+
122
+ A computer-use session is a loop: the model calls a tool, you execute it
123
+ against a real browser, and you send the result (with a fresh screenshot) back
124
+ so the model can plan the next step. Tool results are pi-ai
125
+ `ToolResultMessage`s:
126
+
127
+ ```ts
128
+ type ToolResultMessage = {
129
+ role: "toolResult";
130
+ toolCallId: string; // ToolCall.id from the assistant message
131
+ toolName: string; // ToolCall.name
132
+ content: (TextContent | ImageContent)[];
133
+ details?: unknown; // optional executor metadata, not sent to the model
134
+ isError: boolean;
135
+ timestamp: number;
136
+ };
137
+ ```
138
+
139
+ A minimal two-turn loop:
140
+
141
+ ```ts
142
+ import { complete, getCuaModel, openai, requireCuaEnvApiKeyForModel, type Message } from "@onkernel/cua-ai";
143
+
144
+ const model = getCuaModel("openai:gpt-5.5");
145
+ const apiKey = requireCuaEnvApiKeyForModel("openai:gpt-5.5");
146
+ const tools = openai.computerTools({ actions: ["click", "type", "screenshot"] });
147
+
148
+ const messages: Message[] = [
149
+ {
150
+ role: "user",
151
+ content: [
152
+ { type: "text", text: "Click the sign in / up link in this screenshot." },
153
+ { type: "image", data: screenshotBase64, mimeType: "image/png" },
154
+ ],
155
+ timestamp: Date.now(),
156
+ },
157
+ ];
158
+
159
+ // Turn 1: the model responds with tool calls.
160
+ const first = await complete(model, { messages, tools }, { apiKey });
161
+ if (first.stopReason === "error" || first.stopReason === "aborted") {
162
+ throw new Error(first.errorMessage ?? `request ended with stopReason "${first.stopReason}"`);
163
+ }
164
+ messages.push(first); // the AssistantMessage joins the transcript as-is
165
+
166
+ // Execute each tool call against your browser stack, then append a
167
+ // toolResult message carrying a fresh screenshot.
168
+ for (const block of first.content) {
169
+ if (block.type !== "toolCall") continue;
170
+ const freshScreenshotBase64 = await runInYourBrowser(block.name, block.arguments);
171
+ messages.push({
172
+ role: "toolResult",
173
+ toolCallId: block.id,
174
+ toolName: block.name,
175
+ content: [
176
+ { type: "text", text: "done" },
177
+ { type: "image", data: freshScreenshotBase64, mimeType: "image/png" },
178
+ ],
179
+ isError: false,
180
+ timestamp: Date.now(),
181
+ });
182
+ }
183
+
184
+ // Turn 2: the model sees the results and plans the next action.
185
+ const second = await complete(model, { messages, tools }, { apiKey });
186
+ ```
187
+
45
188
  ## Core Concepts
46
189
 
47
- `@onkernel/cua-ai` re-exports the core primitives of
48
- [`@earendil-works/pi-ai`](https://github.com/earendil-works/pi/tree/main/packages/ai):
190
+ `@onkernel/cua-ai` re-exports the full surface of
191
+ [`@earendil-works/pi-ai`](https://github.com/earendil-works/pi/tree/main/packages/ai)
192
+ (`export * from "@earendil-works/pi-ai"`), including the core primitives:
49
193
  `Model`, `Context`, `Message`, `Tool`, `complete`, `stream`, `completeSimple`,
50
- `streamSimple`, `Type`, `Static`, `TSchema`, and the event/validation helpers
51
- that pi-ai exposes. Some familiarity with pi-ai is assumed; Kernel adds the
52
- computer-use model catalog and provider/tool metadata.
194
+ `streamSimple`, `Type`, `Static`, `TSchema`, and the event/validation helpers.
195
+ Some familiarity with pi-ai is assumed; Kernel adds the computer-use model
196
+ catalog and provider/tool metadata.
53
197
 
54
198
  ### Model Refs
55
199
 
@@ -59,13 +203,14 @@ computer-use model catalog and provider/tool metadata.
59
203
  ```ts
60
204
  getCuaModel("openai:gpt-5.5");
61
205
  getCuaModel("anthropic:claude-opus-4-7");
62
- getCuaModel("google:gemini-2.5-computer-use-preview-10-2025");
206
+ getCuaModel("google:gemini-3-flash-preview");
63
207
  getCuaModel("tzafon:tzafon.northstar-cua-fast");
64
208
  getCuaModel("yutori:n1.5-latest");
65
209
  ```
66
210
 
67
211
  `getCuaModel(ref)` returns a pi-ai `Model<Api>` you can pass to `complete()`
68
- or `stream()`.
212
+ or `stream()`. It throws when the ref names a model without a CUA-support
213
+ annotation.
69
214
 
70
215
  See [`docs/supported-models.md`](./docs/supported-models.md) for the current
71
216
  list of CUA-supporting models per provider.
@@ -95,59 +240,153 @@ interface CuaModelInfo {
95
240
  }
96
241
  ```
97
242
 
98
- ### Exports
243
+ ## Exports
244
+
245
+ Everything below is importable from the package root. pi-ai's full surface is
246
+ re-exported alongside (see [Core Concepts](#core-concepts)).
99
247
 
100
- Top-level exports:
248
+ ### Models and refs
101
249
 
102
250
  - `getCuaModel(ref: CuaModelRef): Model<Api>`
103
251
  - `listCuaModels(provider?: CuaProvider): CuaModelInfo[]`
252
+ - `parseCuaModelRef(ref: string): { provider: CuaProvider; model: string }` —
253
+ accepts the `"gemini:"` alias
254
+ - `formatCuaModelRef(provider, model): CuaModelRef`
104
255
  - `providerForModel(model: Model<Api>): CuaProvider`
105
- - `resolveCuaRuntimeSpec(input: CuaModelRef | Model<Api>): CuaRuntimeSpec`
256
+ - `isCuaProvider(value: string): value is CuaProvider`
257
+ - `findCuaAnnotation(provider, modelId): CuaModelAnnotation | undefined`
106
258
  - `CUA_PROVIDERS: readonly CuaProvider[]`
107
- - `CuaBatchSchema`, `CuaActionSchema`, `CuaNavigationSchema` TypeBox schemas
108
- - `createCuaActionSchema(actions?)`, `createCuaBatchSchema(actions?)`
259
+ - `CUA_MODEL_ANNOTATIONS: Record<CuaProvider, readonly CuaModelAnnotation[]>` —
260
+ the source-cited support table
261
+ - Types: `CuaProvider`, `CuaModelRef`, `CuaModelInfo`, `CuaModelAnnotation`,
262
+ `CuaModelMatch`
263
+
264
+ ### API keys
265
+
266
+ - `cuaApiKeyEnvVarsForProvider(provider): readonly string[]`
267
+ - `getCuaEnvApiKey(provider): string | undefined`
268
+ - `requireCuaEnvApiKey(provider): string`
269
+ - `getCuaEnvApiKeyForModel(refOrModel): string | undefined`
270
+ - `requireCuaEnvApiKeyForModel(refOrModel): string`
271
+
272
+ ### Runtime specs
273
+
274
+ - `resolveCuaRuntimeSpec(input: CuaModelRef | Model<Api>, options?: ComputerToolsOptions): CuaRuntimeSpec`
275
+ - Types: `CuaRuntimeSpec`, `CuaRuntimeSpecInput`, `CuaProviderModule`,
276
+ `CuaScreenshotSpec`, `CuaScreenshotTransformSpec`, `CuaPayloadHook`,
277
+ `CuaPayloadContext`
109
278
 
110
279
  `resolveCuaRuntimeSpec()` centralizes provider-specific defaults for
111
280
  runtime consumers:
112
281
 
113
282
  - canonical provider id
114
- - canonical CUA tool definitions
283
+ - provider-facing CUA tool definitions used in model requests
284
+ - local execution adapters used by `CuaAgent`/`CuaAgentHarness`
115
285
  - default system prompt text
286
+ - provider coordinate convention
287
+ - optional provider screenshot input policy
116
288
  - optional provider payload middleware (for protocol quirks)
117
289
 
118
- Provider namespaces expose `createComputerToolDefinitions({ actions? })` for
119
- building model-facing pi-ai `Tool[]` definitions. Omit `actions` for the
120
- provider's default computer tool set, or pass an action subset to narrow the
121
- schema for a single `complete()` call:
290
+ Pass `options` (e.g. `{ actions: ["click"] }`) to narrow the resolved tool
291
+ definitions and executors; it is forwarded to the provider module's
292
+ `toolDefinitions()`/`toolExecutors()`, so providers with a restricted subset
293
+ (Anthropic, Yutori) throw on unsupported actions.
294
+
295
+ ### Canonical actions and tools
296
+
297
+ - `CUA_ACTION_TYPES: readonly CuaActionType[]` — the 16 canonical action names
298
+ - `computerTools(options?: ComputerToolsOptions): Tool[]` /
299
+ `createCuaActionToolDefinitions(actions?)` — one `Tool` per canonical action
300
+ (the full canonical superset; provider namespaces apply provider defaults
301
+ and validation on top)
302
+ - `computerToolExecutors(options?)` / `createCuaActionToolExecutors(actions?)`
303
+ — matching `CuaToolExecutorSpec[]` execution adapters
304
+ - `createCuaActionSchema(actions?)`, `CuaActionSchema` — TypeBox union schema
305
+ - `createCuaBatchSchema(actions?)`, `CuaBatchSchema`,
306
+ `createCuaBatchToolDefinition(actions?, options?)`,
307
+ `createCuaBatchToolExecutor(actions?, options?)`,
308
+ `CUA_BATCH_TOOL_NAME` (`"computer_batch"`), `CUA_BATCH_TOOL_DESCRIPTION`
309
+ - `createCuaNavigationToolDefinition()`, `CuaNavigationSchema`,
310
+ `CUA_NAVIGATION_TOOL_NAME` (`"computer_use_extra"`),
311
+ `CUA_NAVIGATION_TOOL_DESCRIPTION`
312
+ - `canonicalToolCallName(action)`, `canonicalToolCallArguments(action)` — map
313
+ a normalized `CuaAction` back to its tool-call name/arguments
314
+ - `normalizeGotoUrl(value)` — prefix bare hostnames with `https://`
315
+ - Types: `CuaAction` (plus the 16 per-action interfaces), `CuaActionType`,
316
+ `CuaMouseButton`, `CuaDragMouseButton`, `CuaBatchInput`,
317
+ `CuaNavigationInput`, `CuaToolExecutorSpec`, `ComputerToolsOptions`,
318
+ `ComputerToolCoordinateSystem`
319
+
320
+ ### Provider registration
321
+
322
+ - `registerCuaProviders(): void` — re-register the Yutori/Tzafon stream
323
+ providers with pi-ai's global registry (runs automatically on import;
324
+ idempotent; call it after any pi-ai registry mutator)
325
+
326
+ ## Provider Tools
327
+
328
+ Provider namespaces expose `computerTools({ actions? })` for
329
+ building the provider's default CUA `Tool[]` definitions. These are the tools
330
+ sent to the model when you call `complete()` or `stream()` directly. The
331
+ default set can differ by provider: Anthropic includes its `computer_batch`
332
+ tool from the computer-use best-practices reference, while providers such as
333
+ OpenAI currently expose individual canonical browser actions. Omit `actions`
334
+ for the provider's default computer tool set, or pass an action subset to narrow
335
+ the schema for a single `complete()` call:
122
336
 
123
337
  ```ts
124
338
  import { openai } from "@onkernel/cua-ai";
125
339
 
126
- const allComputerTools = openai.createComputerToolDefinitions();
127
- const clickOnlyTools = openai.createComputerToolDefinitions({ actions: ["click"] });
340
+ const allComputerTools = openai.computerTools();
341
+ const clickOnlyTools = openai.computerTools({ actions: ["click"] });
128
342
  ```
129
343
 
130
- Every provider namespace synthesizes a `batch_computer_actions` tool definition.
131
- That gives models a consistent way to plan ordered browser actions even when the
132
- provider's native computer-use API has a different shape. Provider namespaces
133
- are still used so the definitions can diverge over time where provider protocol
134
- differences matter.
135
-
136
- Provider namespaces also expose `COMPUTER_TOOL_COORDINATES`, which describes
137
- the coordinates the provider's computer tool calls are expected to emit:
344
+ When `actions` is provided, it must be a subset of that provider's supported
345
+ canonical action set; unsupported actions throw (e.g.
346
+ `anthropic.computerTools({ actions: ["back"] })` throws
347
+ `unsupported Anthropic canonical action(s): back`).
348
+
349
+ Per-provider canonical action subsets (each namespace exports its list as
350
+ `<PROVIDER>_CUA_ACTION_TYPES`):
351
+
352
+ | Namespace | Canonical actions |
353
+ | ----------- | ---------------------------------------------------------------------------------- |
354
+ | `openai` | all 16 |
355
+ | `anthropic` | 13 — everything except `back`, `forward`, `url`; adds `computer_batch` by default |
356
+ | `gemini` | all 16 |
357
+ | `tzafon` | all 16 (replaced on the wire by Tzafon's native `computer_use` tool) |
358
+ | `yutori` | 13 — everything except `screenshot`, `url`, `cursor_position` (local mirrors only) |
359
+
360
+ Runtime specs also include `toolExecutors`: provider-owned adapters that use
361
+ the same tool-call names as the model-facing tools and translate their
362
+ arguments into canonical CUA actions for `@onkernel/cua-agent`. For most
363
+ providers, `toolDefinitions` and `toolExecutors` line up one-for-one. Some
364
+ providers are different on the wire: Yutori exposes browser actions through its
365
+ documented `tool_set` request field, so its runtime spec has no model-facing
366
+ `toolDefinitions` (`yutori.providerModule.toolDefinitions()` is `[]`) but
367
+ still provides local `toolExecutors` for the canonical actions emitted after
368
+ Yutori's native tool calls are normalized. `yutori.computerTools()` builds
369
+ local mirrors of those canonical tools — they are never sent to the API
370
+ (`streamYutori` strips them from the outbound payload) and exist so the
371
+ normalized tool calls have matching local definitions/executors. Caller-provided
372
+ tools that should remain on the provider payload can be preserved by payload
373
+ middleware via `CuaPayloadContext.keepToolNames`.
374
+
375
+ Provider namespaces also expose `coordinateSystem()`, which returns the
376
+ coordinates the provider's computer tool calls are expected to emit:
138
377
 
139
378
  ```ts
140
- openai.COMPUTER_TOOL_COORDINATES
379
+ openai.coordinateSystem()
141
380
  // { type: "pixel" }
142
381
 
143
- gemini.COMPUTER_TOOL_COORDINATES
382
+ gemini.coordinateSystem()
144
383
  // { type: "normalized", range: [0, 999] }
145
384
  ```
146
385
 
147
386
  Current coordinate contracts:
148
387
 
149
388
  - `openai`: pixel coordinates
150
- - `anthropic`: pixel coordinates
389
+ - `anthropic`: pixel coordinates, matching Anthropic's computer-use quickstart
151
390
  - `gemini`: normalized coordinates in the 0-999 range ([source](https://ai.google.dev/gemini-api/docs/computer-use))
152
391
  - `yutori`: normalized coordinates in the 0-1000 range ([source](https://docs.yutori.com/reference/navigator), [SDK helper](https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/coordinates.py))
153
392
  - `tzafon`: normalized coordinates in the 0-999 range ([source](https://docs.lightcone.ai/guides/coordinates/), [model card](https://huggingface.co/Tzafon/Northstar-CUA-Fast))
@@ -182,7 +421,7 @@ type CuaActionClick = {
182
421
  type: "click";
183
422
  x: number;
184
423
  y: number;
185
- button?: string;
424
+ button?: CuaMouseButton; // "left" | "right" | "middle" | "back" | "forward"
186
425
  hold_keys?: string[];
187
426
  };
188
427
 
@@ -192,8 +431,13 @@ type CuaActionGoto = {
192
431
  };
193
432
  ```
194
433
 
195
- The provider namespace `createComputerToolDefinitions()` emits a
196
- `batch_computer_actions` tool whose input is:
434
+ Mouse buttons are closed unions: `CuaMouseButton` for `click`/`mouse_down`/
435
+ `mouse_up` and `CuaDragMouseButton` (`"left" | "right" | "middle"`) for
436
+ `drag`. Executors coerce anything outside the set to `"left"`. `keys` stays
437
+ `string[]` — the agent-side key-alias table passes unrecognized keys through.
438
+
439
+ `createCuaBatchToolDefinition(actions?, options?)` builds a batch tool schema
440
+ whose input is:
197
441
 
198
442
  ```ts
199
443
  type CuaBatchInput = {
@@ -201,11 +445,14 @@ type CuaBatchInput = {
201
445
  };
202
446
  ```
203
447
 
204
- The model can plan several writes and reads in one call. Read actions such as
205
- `screenshot`, `url`, and `cursor_position` can be interleaved with writes so
206
- your executor can return fresh state in the same order.
448
+ Providers can include a batch tool when their model is expected to use one.
449
+ Anthropic does this by default with `computer_batch` (also exported as
450
+ `anthropic.ANTHROPIC_BATCH_TOOL_NAME`, equal to the top-level
451
+ `CUA_BATCH_TOOL_NAME`); Yutori does not.
452
+ `createCuaBatchToolExecutor()` is the matching execution adapter for turning
453
+ that provider-defined batch input into canonical CUA actions.
207
454
 
208
- When `actions` is omitted, the OpenAI namespace also emits a `computer_use_extra`
455
+ `createCuaNavigationToolDefinition()` can synthesize a `computer_use_extra`
209
456
  navigation tool whose input is:
210
457
 
211
458
  ```ts
@@ -215,15 +462,44 @@ type CuaNavigationInput = {
215
462
  };
216
463
  ```
217
464
 
218
- Provider namespaces:
219
-
220
- - `openai`: `createComputerToolDefinitions`, `COMPUTER_TOOL_COORDINATES`, OpenAI CUA action schemas, and `OPENAI_BATCH_INSTRUCTIONS`
221
- - `anthropic`: `createComputerToolDefinitions`, `COMPUTER_TOOL_COORDINATES`, prompt helpers, and CUA batch schema aliases
222
- - `gemini`: `createComputerToolDefinitions`, `COMPUTER_TOOL_COORDINATES`, prompt helpers, and CUA batch schema aliases
223
- - `tzafon`: `createComputerToolDefinitions`, `COMPUTER_TOOL_COORDINATES`, prompt helpers, and local `tzafon-responses` stream adapter
224
- - `yutori`: Yutori prompt helpers, local `yutori-chat-completions` stream
225
- adapter, `createComputerToolDefinitions`, `COMPUTER_TOOL_COORDINATES`, and
226
- `yutoriBuiltinToolsOnPayload`
465
+ ## Provider Namespaces
466
+
467
+ Every provider namespace (`openai`, `anthropic`, `gemini`, `tzafon`,
468
+ `yutori`) follows one convention:
469
+
470
+ - `computerTools(options?)` and `computerToolExecutors(options?)`
471
+ - `createActionSchema(actions?)` TypeBox schema for the provider's subset
472
+ - `coordinateSystem()`
473
+ - `build<Provider>SystemPrompt({ suffix? })` and
474
+ `<PROVIDER>_COMPUTER_INSTRUCTIONS` (the prompt text)
475
+ - `<PROVIDER>_CUA_ACTION_TYPES` — the supported canonical action subset
476
+ - `<Provider>Action` type — the canonical action union for that subset
477
+ - `ComputerToolsOptions` type (Anthropic's adds `excludeBatch`, also exported
478
+ as `AnthropicComputerToolsOptions`)
479
+ - `providerModule` — the uniform `CuaProviderModule` object that
480
+ `resolveCuaRuntimeSpec` looks up
481
+
482
+ Provider-specific extras:
483
+
484
+ - `openai`: `openaiResponsesStoreOnPayload` payload hook, plus the
485
+ `computer_use_extra` navigation aliases `OPENAI_EXTRA_TOOL_NAME`,
486
+ `OPENAI_EXTRA_TOOL_DESCRIPTION`, `OpenAIExtraSchema`, `OpenAIExtraInput`
487
+ - `anthropic`: `ANTHROPIC_BATCH_TOOL_NAME` (`"computer_batch"`)
488
+ - `tzafon`: the `tzafon-responses` stream adapter (`TZAFON_RESPONSES_API`,
489
+ `streamTzafonResponses`, `streamSimpleTzafonResponses`,
490
+ `TzafonResponsesOptions` with `keepToolNames`), `tzafonComputerUseOnPayload`
491
+ payload middleware, `tzafonToolCallId`, and the native-to-canonical
492
+ normalizer `toCanonicalActions` (+ `TzafonCanonicalAction`)
493
+ - `yutori`: the `yutori-chat-completions` stream adapter
494
+ (`YUTORI_CHAT_COMPLETIONS_API`, `streamYutori`, `streamSimpleYutori`,
495
+ `YutoriOptions` with `keepToolNames`), `yutoriNativeToolSetOnPayload`
496
+ payload middleware, the native Navigator action sets
497
+ (`YUTORI_N1_ACTION_TYPES`, `YUTORI_N15_CORE_ACTION_TYPES`,
498
+ `YUTORI_N15_EXPANDED_ACTION_TYPES`, `YUTORI_N15_ACTION_TYPES`, the
499
+ `YUTORI_N15_CORE_TOOL_SET`/`YUTORI_N15_EXPANDED_TOOL_SET` tool-set ids, and
500
+ the matching `Yutori*ActionType` types), `yutoriToolSetForModel`,
501
+ `yutoriNativeActionsForModel`, and the native-to-canonical normalizer
502
+ `toCanonicalActions`
227
503
 
228
504
  This package does not execute browser actions. Use `@onkernel/cua-agent` when
229
505
  you want model tool calls executed against a Kernel browser.
@@ -0,0 +1,13 @@
1
+ //#region \0rolldown/runtime.js
2
+ var __defProp = Object.defineProperty;
3
+ var __exportAll = (all, no_symbols) => {
4
+ let target = {};
5
+ for (var name in all) __defProp(target, name, {
6
+ get: all[name],
7
+ enumerable: true
8
+ });
9
+ if (!no_symbols) __defProp(target, Symbol.toStringTag, { value: "Module" });
10
+ return target;
11
+ };
12
+ //#endregion
13
+ export { __exportAll as t };