opensteer 0.4.11 → 0.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -110,6 +110,15 @@ function resolveProviderInfo(modelStr) {
110
110
  }
111
111
  return { pkg: "@ai-sdk/openai", providerFn: "openai" };
112
112
  }
113
/**
 * Drop a leading "<provider>/" segment from a model string.
 *
 * The segment is removed only when it matches (case-insensitively) one of the
 * recognized provider names; any other input is returned unchanged.
 *
 * @param {string} modelStr - Model identifier, optionally provider-prefixed.
 * @returns {string} The model identifier without a recognized provider prefix.
 */
function stripProviderPrefix(modelStr) {
  const separatorIndex = modelStr.indexOf("/");
  // No slash at all, or a slash in first position: there is no provider segment.
  if (separatorIndex < 1) {
    return modelStr;
  }
  const prefix = modelStr.slice(0, separatorIndex).toLowerCase();
  switch (prefix) {
    case "openai":
    case "anthropic":
    case "google":
    case "xai":
    case "groq":
      return modelStr.slice(separatorIndex + 1);
    default:
      return modelStr;
  }
}
113
122
  async function getModelProvider(modelStr) {
114
123
  const { pkg, providerFn } = resolveProviderInfo(modelStr);
115
124
  let mod;
@@ -126,7 +135,7 @@ async function getModelProvider(modelStr) {
126
135
  `Provider '${providerFn}' not found in '${pkg}'. Ensure you have the latest version installed.`
127
136
  );
128
137
  }
129
- const modelId = modelStr.startsWith("groq/") ? modelStr.slice("groq/".length) : modelStr;
138
+ const modelId = stripProviderPrefix(modelStr);
130
139
  return provider(modelId);
131
140
  }
132
141
  var PROVIDER_MAP;
@@ -134,6 +143,10 @@ var init_model = __esm({
134
143
  "src/ai/model.ts"() {
135
144
  "use strict";
136
145
  PROVIDER_MAP = {
146
+ "openai/": { pkg: "@ai-sdk/openai", providerFn: "openai" },
147
+ "anthropic/": { pkg: "@ai-sdk/anthropic", providerFn: "anthropic" },
148
+ "google/": { pkg: "@ai-sdk/google", providerFn: "google" },
149
+ "xai/": { pkg: "@ai-sdk/xai", providerFn: "xai" },
137
150
  "gpt-": { pkg: "@ai-sdk/openai", providerFn: "openai" },
138
151
  "o1-": { pkg: "@ai-sdk/openai", providerFn: "openai" },
139
152
  "o3-": { pkg: "@ai-sdk/openai", providerFn: "openai" },
@@ -7797,7 +7810,8 @@ var CloudCdpClient = class {
7797
7810
  const message = error instanceof Error ? error.message : "Failed to connect to cloud CDP endpoint.";
7798
7811
  throw new OpensteerCloudError("CLOUD_TRANSPORT_ERROR", message);
7799
7812
  }
7800
- const context = browser.contexts()[0];
7813
+ const contexts = browser.contexts();
7814
+ const context = contexts[0];
7801
7815
  if (!context) {
7802
7816
  await browser.close();
7803
7817
  throw new OpensteerCloudError(
@@ -7805,10 +7819,41 @@ var CloudCdpClient = class {
7805
7819
  "Cloud browser returned no context."
7806
7820
  );
7807
7821
  }
7822
+ const preferred = selectPreferredContextPage(browser, contexts);
7823
+ if (preferred) {
7824
+ return preferred;
7825
+ }
7808
7826
  const page = context.pages()[0] || await context.newPage();
7809
7827
  return { browser, context, page };
7810
7828
  }
7811
7829
  };
7830
/**
 * Pick the page to attach to among all pages of all contexts.
 *
 * The first page whose URL is neither empty nor internal wins immediately.
 * If none exists, the first page sitting exactly at "about:blank" is used.
 *
 * @param browser - Browser handle, passed through into the result.
 * @param contexts - Iterable of contexts, each exposing `pages()`.
 * @returns {{browser, context, page} | null} The selection, or null when no
 *   page qualifies.
 */
function selectPreferredContextPage(browser, contexts) {
  let fallback = null;
  for (const context of contexts) {
    for (const page of context.pages()) {
      const pageUrl = safePageUrl(page);
      if (isInternalOrEmptyUrl(pageUrl)) {
        // Remember only the first blank page; keep scanning for a real one.
        if (fallback === null && pageUrl === "about:blank") {
          fallback = { browser, context, page };
        }
        continue;
      }
      return { browser, context, page };
    }
  }
  return fallback;
}
7845
/**
 * Read a page's URL without letting a failure propagate.
 *
 * @param page - Object exposing a `url()` method.
 * @returns Whatever `page.url()` returns, or "" when that call throws.
 */
function safePageUrl(page) {
  let href = "";
  try {
    href = page.url();
  } catch {
    // Best effort: a page whose URL cannot be read is reported as empty.
  }
  return href;
}
7852
/**
 * Tell whether a URL is empty, "about:blank", or a browser-internal page
 * (chrome://, devtools:// or edge:// scheme).
 *
 * @param {string} url - URL to classify.
 * @returns {boolean} True for empty/blank/internal URLs.
 */
function isInternalOrEmptyUrl(url) {
  if (!url || url === "about:blank") {
    return true;
  }
  const internalSchemes = ["chrome://", "devtools://", "edge://"];
  return internalSchemes.some((scheme) => url.startsWith(scheme));
}
7812
7857
  function withTokenQuery2(wsUrl, token) {
7813
7858
  const url = new URL(wsUrl);
7814
7859
  url.searchParams.set("token", token);
@@ -8070,7 +8115,7 @@ function toCloudErrorCode(code) {
8070
8115
  }
8071
8116
 
8072
8117
  // src/cloud/runtime.ts
8073
- var DEFAULT_CLOUD_BASE_URL = "https://remote.opensteer.com";
8118
+ var DEFAULT_CLOUD_BASE_URL = "https://api.opensteer.com";
8074
8119
  function createCloudRuntimeState(key, baseUrl = resolveCloudBaseUrl(), authScheme = "api-key") {
8075
8120
  const normalizedBaseUrl = normalizeCloudBaseUrl(baseUrl);
8076
8121
  return {
@@ -8102,187 +8147,2006 @@ function readCloudActionDescription(payload) {
8102
8147
  return normalized.length ? normalized : void 0;
8103
8148
  }
8104
8149
 
8105
- // src/opensteer.ts
8106
- var CLOUD_INTERACTION_METHODS = /* @__PURE__ */ new Set([
8107
- "click",
8108
- "dblclick",
8109
- "rightclick",
8110
- "hover",
8111
- "input",
8112
- "select",
8113
- "scroll",
8114
- "uploadFile"
8150
+ // src/agent/errors.ts
8151
/**
 * Base class for every agent error defined in this module.
 *
 * @param {string} message - Human-readable description.
 * @param {unknown} [cause] - Underlying error, exposed as `error.cause`.
 */
var OpensteerAgentError = class extends Error {
  constructor(message, cause) {
    // Forward `cause` only when one was supplied. Passing `{ cause: undefined }`
    // still installs an own `cause` property, which then appears as
    // `[cause]: undefined` whenever the error is inspected or logged.
    super(message, cause === void 0 ? void 0 : { cause });
    this.name = "OpensteerAgentError";
  }
};
/** Agent configuration is missing or malformed. */
var OpensteerAgentConfigError = class extends OpensteerAgentError {
  constructor(message) {
    super(message);
    this.name = "OpensteerAgentConfigError";
  }
};
/** The requested provider is not supported. */
var OpensteerAgentProviderError = class extends OpensteerAgentError {
  constructor(message) {
    super(message);
    this.name = "OpensteerAgentProviderError";
  }
};
/** An agent execution was invoked incorrectly or failed; may carry a cause. */
var OpensteerAgentExecutionError = class extends OpensteerAgentError {
  constructor(message, cause) {
    super(message, cause);
    this.name = "OpensteerAgentExecutionError";
  }
};
/** An execution is already running on the same instance (fixed message). */
var OpensteerAgentBusyError = class extends OpensteerAgentError {
  constructor() {
    super("An OpenSteer agent execution is already in progress on this instance.");
    this.name = "OpensteerAgentBusyError";
  }
};
/** A single agent action was invalid or could not be carried out. */
var OpensteerAgentActionError = class extends OpensteerAgentError {
  constructor(message, cause) {
    super(message, cause);
    this.name = "OpensteerAgentActionError";
  }
};
/**
 * A provider API call failed.
 *
 * @param {string} provider - Provider identifier, stored on `provider`.
 * @param {string} message - Error message.
 * @param {number} [status] - Status value reported with the failure, stored on `status`.
 * @param {unknown} [cause] - Original error raised by the provider SDK.
 */
var OpensteerAgentApiError = class extends OpensteerAgentError {
  status;
  provider;
  constructor(provider, message, status, cause) {
    super(message, cause);
    this.name = "OpensteerAgentApiError";
    this.provider = provider;
    this.status = status;
  }
};
8197
+
8198
+ // src/agent/model.ts
8199
// Provider identifiers accepted for CUA models.
var SUPPORTED_CUA_PROVIDERS = /* @__PURE__ */ new Set(["openai", "anthropic", "google"]);
8116
- var Opensteer = class _Opensteer {
8117
- config;
8118
- aiResolve;
8119
- aiExtract;
8120
- namespace;
8121
- storage;
8122
- pool;
8123
- cloud;
8124
- browser = null;
8125
- pageRef = null;
8126
- contextRef = null;
8127
- ownsBrowser = false;
8128
- snapshotCache = null;
8129
- constructor(config = {}) {
8130
- const resolved = resolveConfig(config);
8131
- const cloudSelection = resolveCloudSelection({
8132
- cloud: resolved.cloud
8133
- });
8134
- const model = resolved.model;
8135
- this.config = resolved;
8136
- this.aiResolve = this.createLazyResolveCallback(model);
8137
- this.aiExtract = this.createLazyExtractCallback(model);
8138
- const rootDir = resolved.storage?.rootDir || process.cwd();
8139
- this.namespace = resolveNamespace(resolved, rootDir);
8140
- this.storage = new LocalSelectorStorage(rootDir, this.namespace);
8141
- this.pool = new BrowserPool(resolved.browser || {});
8142
- if (cloudSelection.cloud) {
8143
- const cloudConfig = resolved.cloud && typeof resolved.cloud === "object" ? resolved.cloud : void 0;
8144
- const apiKey = cloudConfig?.apiKey?.trim();
8145
- if (!apiKey) {
8146
- throw new Error(
8147
- "Cloud mode requires a non-empty API key via cloud.apiKey or OPENSTEER_API_KEY."
8148
- );
8149
- }
8150
- this.cloud = createCloudRuntimeState(
8151
- apiKey,
8152
- cloudConfig?.baseUrl,
8153
- cloudConfig?.authScheme
8154
- );
8155
- } else {
8156
- this.cloud = null;
8157
- }
8204
+ function resolveCuaModelConfig(args) {
8205
+ const env = args.env || process.env;
8206
+ const source = resolveModelSource(args.agentConfig.model, args.fallbackModel);
8207
+ const parsed = parseProviderModel(source.modelName);
8208
+ if (!SUPPORTED_CUA_PROVIDERS.has(parsed.provider)) {
8209
+ throw new OpensteerAgentProviderError(
8210
+ `Unsupported CUA provider "${parsed.provider}". Supported providers: openai, anthropic, google.`
8211
+ );
8158
8212
  }
8159
- createLazyResolveCallback(model) {
8160
- let resolverPromise = null;
8161
- return async (...args) => {
8162
- try {
8163
- if (!resolverPromise) {
8164
- resolverPromise = Promise.resolve().then(() => (init_resolver(), resolver_exports)).then(
8165
- (m) => m.createResolveCallback(model)
8166
- );
8167
- }
8168
- const resolver = await resolverPromise;
8169
- return resolver(...args);
8170
- } catch (err) {
8171
- resolverPromise = null;
8172
- throw err;
8173
- }
8213
+ const apiKey = resolveProviderApiKey(parsed.provider, source.options.apiKey, env);
8214
+ return {
8215
+ provider: parsed.provider,
8216
+ fullModelName: `${parsed.provider}/${parsed.modelName}`,
8217
+ providerModelName: parsed.modelName,
8218
+ apiKey,
8219
+ baseUrl: normalizeOptional(source.options.baseUrl),
8220
+ organization: normalizeOptional(source.options.organization),
8221
+ thinkingBudget: typeof source.options.thinkingBudget === "number" && Number.isFinite(source.options.thinkingBudget) ? source.options.thinkingBudget : void 0,
8222
+ environment: normalizeOptional(source.options.environment)
8223
+ };
8224
+ }
8225
+ function resolveModelSource(model, fallbackModel) {
8226
+ if (model && typeof model === "object") {
8227
+ const modelName2 = normalizeRequired(model.modelName, "agent.model.modelName");
8228
+ const { modelName: _, ...options } = model;
8229
+ return {
8230
+ modelName: modelName2,
8231
+ options
8174
8232
  };
8175
8233
  }
8176
- createLazyExtractCallback(model) {
8177
- let extractorPromise = null;
8178
- const extract = async (args) => {
8179
- try {
8180
- if (!extractorPromise) {
8181
- extractorPromise = Promise.resolve().then(() => (init_extractor(), extractor_exports)).then(
8182
- (m) => m.createExtractCallback(model)
8183
- );
8184
- }
8185
- const extractor = await extractorPromise;
8186
- return extractor(args);
8187
- } catch (err) {
8188
- extractorPromise = null;
8189
- throw err;
8190
- }
8234
+ const modelName = normalizeOptional(model) || normalizeOptional(fallbackModel);
8235
+ if (!modelName) {
8236
+ throw new OpensteerAgentConfigError(
8237
+ 'A CUA model is required. Pass agent.model (for example "openai/computer-use-preview").'
8238
+ );
8239
+ }
8240
+ return {
8241
+ modelName,
8242
+ options: {}
8243
+ };
8244
+ }
8245
/**
 * Split a "provider/model" string into its two parts.
 *
 * The provider segment is trimmed and lower-cased; the model segment is trimmed.
 *
 * @param {string} modelName - Model identifier in "provider/model" form.
 * @returns {{provider: string, modelName: string}} Parsed provider and model.
 * @throws {OpensteerAgentConfigError} When the slash is missing, leading or
 *   trailing, or the model segment is blank.
 * @throws {OpensteerAgentProviderError} When the provider is not openai,
 *   anthropic or google.
 */
function parseProviderModel(modelName) {
  const separatorIndex = modelName.indexOf("/");
  const hasProviderSegment = separatorIndex > 0;
  const hasModelSegment = separatorIndex !== modelName.length - 1;
  if (!(hasProviderSegment && hasModelSegment)) {
    throw new OpensteerAgentConfigError(
      `Invalid CUA model "${modelName}". Use "provider/model" format (for example "openai/computer-use-preview").`
    );
  }
  const provider = modelName.slice(0, separatorIndex).trim().toLowerCase();
  const model = modelName.slice(separatorIndex + 1).trim();
  // A whitespace-only model segment passes the slash check above but is still empty.
  if (model.length === 0) {
    throw new OpensteerAgentConfigError(
      `Invalid CUA model "${modelName}". The model name segment after the provider cannot be empty.`
    );
  }
  const knownProviders = ["openai", "anthropic", "google"];
  if (!knownProviders.includes(provider)) {
    throw new OpensteerAgentProviderError(
      `Unsupported CUA provider "${provider}". Supported providers: openai, anthropic, google.`
    );
  }
  return { provider, modelName: model };
}
8269
/**
 * Determine the API key for a provider.
 *
 * An explicitly configured key always wins. Otherwise the provider's
 * environment variable(s) are consulted; any provider other than "openai" or
 * "anthropic" is resolved through the Google variables.
 *
 * @param {string} provider - Provider identifier.
 * @param {unknown} explicitApiKey - Key supplied in configuration, if any.
 * @param {object} env - Environment variable map.
 * @returns {string} The resolved, trimmed API key.
 * @throws {OpensteerAgentConfigError} When no key can be found.
 */
function resolveProviderApiKey(provider, explicitApiKey, env) {
  const configuredKey = normalizeOptional(explicitApiKey);
  if (configuredKey) {
    return configuredKey;
  }
  switch (provider) {
    case "openai": {
      const envKey = normalizeOptional(env.OPENAI_API_KEY);
      if (!envKey) {
        throw new OpensteerAgentConfigError(
          "OpenAI CUA requires an API key via agent.model.apiKey or OPENAI_API_KEY."
        );
      }
      return envKey;
    }
    case "anthropic": {
      const envKey = normalizeOptional(env.ANTHROPIC_API_KEY);
      if (!envKey) {
        throw new OpensteerAgentConfigError(
          "Anthropic CUA requires an API key via agent.model.apiKey or ANTHROPIC_API_KEY."
        );
      }
      return envKey;
    }
    default: {
      // Variables are tried in order; the first non-blank value is used.
      const envKey =
        normalizeOptional(env.GOOGLE_GENERATIVE_AI_API_KEY) ||
        normalizeOptional(env.GEMINI_API_KEY) ||
        normalizeOptional(env.GOOGLE_API_KEY);
      if (!envKey) {
        throw new OpensteerAgentConfigError(
          "Google CUA requires an API key via agent.model.apiKey, GOOGLE_GENERATIVE_AI_API_KEY, GEMINI_API_KEY, or GOOGLE_API_KEY."
        );
      }
      return envKey;
    }
  }
}
8292
/**
 * Trim a string value, mapping non-strings and blank strings to undefined.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string | undefined} The trimmed text, or undefined when there is none.
 */
function normalizeOptional(value) {
  const text = typeof value === "string" ? value.trim() : "";
  return text === "" ? void 0 : text;
}
8297
/**
 * Like normalizeOptional, but a missing or blank value is an error.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} field - Field name used in the error message.
 * @returns {string} The trimmed, non-empty text.
 * @throws {OpensteerAgentConfigError} When the value normalizes to nothing.
 */
function normalizeRequired(value, field) {
  const text = normalizeOptional(value);
  if (text) {
    return text;
  }
  throw new OpensteerAgentConfigError(`${field} is required.`);
}
8304
+
8305
+ // src/agent/clients/openai.ts
8306
+ var import_openai = __toESM(require("openai"), 1);
8307
+
8308
+ // src/agent/client.ts
8309
+ var CuaClient = class {
8310
+ screenshotProvider = null;
8311
+ actionHandler = null;
8312
+ viewport = {
8313
+ width: 1288,
8314
+ height: 711
8315
+ };
8316
+ currentUrl = null;
8317
+ setViewport(width, height) {
8318
+ this.viewport = {
8319
+ width,
8320
+ height
8191
8321
  };
8192
- return extract;
8193
8322
  }
8194
- async invokeCloudActionAndResetCache(method, args) {
8195
- const result = await this.invokeCloudAction(method, args);
8196
- this.snapshotCache = null;
8197
- return result;
8323
+ setCurrentUrl(url) {
8324
+ this.currentUrl = url;
8198
8325
  }
8199
- async invokeCloudAction(method, args) {
8200
- const actionClient = this.cloud?.actionClient;
8201
- const sessionId = this.cloud?.sessionId;
8202
- if (!actionClient || !sessionId) {
8203
- throw cloudNotLaunchedError();
8326
+ setScreenshotProvider(provider) {
8327
+ this.screenshotProvider = provider;
8328
+ }
8329
+ setActionHandler(handler) {
8330
+ this.actionHandler = handler;
8331
+ }
8332
+ getScreenshotProvider() {
8333
+ if (!this.screenshotProvider) {
8334
+ throw new Error("CUA screenshot provider is not initialized.");
8204
8335
  }
8205
- const payload = args && typeof args === "object" ? args : {};
8206
- try {
8207
- return await actionClient.request(method, payload);
8208
- } catch (err) {
8209
- if (err instanceof OpensteerCloudError && err.code === "CLOUD_ACTION_FAILED" && CLOUD_INTERACTION_METHODS.has(method)) {
8210
- const detailsRecord = err.details && typeof err.details === "object" ? err.details : null;
8211
- const cloudFailure = normalizeActionFailure(
8212
- detailsRecord?.actionFailure
8213
- );
8214
- const failure = cloudFailure || classifyActionFailure({
8215
- action: method,
8216
- error: err,
8217
- fallbackMessage: defaultActionFailureMessage(method)
8218
- });
8219
- const description = readCloudActionDescription(payload);
8220
- throw this.buildActionError(
8221
- method,
8222
- description,
8223
- failure,
8224
- null,
8225
- err
8226
- );
8227
- }
8228
- throw err;
8336
+ return this.screenshotProvider;
8337
+ }
8338
+ getActionHandler() {
8339
+ if (!this.actionHandler) {
8340
+ throw new Error("CUA action handler is not initialized.");
8229
8341
  }
8342
+ return this.actionHandler;
8230
8343
  }
8231
- buildActionError(action, description, failure, selectorUsed, cause) {
8232
- return new OpensteerActionError({
8233
- action,
8234
- failure,
8235
- selectorUsed: selectorUsed || null,
8236
- message: formatActionFailureMessage(
8237
- action,
8238
- description,
8239
- failure.message
8240
- ),
8241
- cause
8242
- });
8344
+ };
8345
+ function normalizeExecuteOptions(instructionOrOptions) {
8346
+ if (typeof instructionOrOptions === "string") {
8347
+ return {
8348
+ instruction: normalizeInstruction(instructionOrOptions)
8349
+ };
8243
8350
  }
8244
- get page() {
8245
- if (!this.pageRef) {
8246
- throw new Error(
8247
- "Browser page is not initialized. Call launch() or Opensteer.from(page)."
8248
- );
8249
- }
8250
- return this.pageRef;
8351
+ if (!instructionOrOptions || typeof instructionOrOptions !== "object" || Array.isArray(instructionOrOptions)) {
8352
+ throw new OpensteerAgentExecutionError(
8353
+ "agent.execute(...) expects either a string instruction or an options object."
8354
+ );
8251
8355
  }
8252
- get context() {
8253
- if (!this.contextRef) {
8254
- throw new Error(
8255
- "Browser context is not initialized. Call launch() or Opensteer.from(page)."
8356
+ const normalized = {
8357
+ instruction: normalizeInstruction(instructionOrOptions.instruction)
8358
+ };
8359
+ if (instructionOrOptions.maxSteps !== void 0) {
8360
+ normalized.maxSteps = normalizeMaxSteps(instructionOrOptions.maxSteps);
8361
+ }
8362
+ if (instructionOrOptions.highlightCursor !== void 0) {
8363
+ if (typeof instructionOrOptions.highlightCursor !== "boolean") {
8364
+ throw new OpensteerAgentExecutionError(
8365
+ 'agent.execute(...) "highlightCursor" must be a boolean when provided.'
8256
8366
  );
8257
8367
  }
8258
- return this.contextRef;
8368
+ normalized.highlightCursor = instructionOrOptions.highlightCursor;
8259
8369
  }
8260
- getCloudSessionId() {
8261
- return this.cloud?.sessionId ?? null;
8370
+ return normalized;
8371
+ }
8372
+ function normalizeInstruction(instruction) {
8373
+ if (typeof instruction !== "string") {
8374
+ throw new OpensteerAgentExecutionError(
8375
+ 'agent.execute(...) requires a non-empty "instruction" string.'
8376
+ );
8262
8377
  }
8263
- getCloudSessionUrl() {
8264
- return this.cloud?.cloudSessionUrl ?? null;
8378
+ const normalized = instruction.trim();
8379
+ if (!normalized) {
8380
+ throw new OpensteerAgentExecutionError(
8381
+ 'agent.execute(...) requires a non-empty "instruction" string.'
8382
+ );
8265
8383
  }
8266
- announceCloudSession(args) {
8267
- if (!this.shouldAnnounceCloudSession()) {
8268
- return;
8269
- }
8270
- const fields = [
8271
- `sessionId=${args.sessionId}`,
8272
- `workspaceId=${args.workspaceId}`
8384
+ return normalized;
8385
+ }
8386
/**
 * Validate the `maxSteps` option of agent.execute(...).
 *
 * @param {unknown} maxSteps - Value supplied by the caller.
 * @returns {number} The same value, once confirmed to be a positive integer.
 * @throws {OpensteerAgentExecutionError} For anything else.
 */
function normalizeMaxSteps(maxSteps) {
  const isPositiveInteger =
    typeof maxSteps === "number" && Number.isInteger(maxSteps) && maxSteps > 0;
  if (isPositiveInteger) {
    return maxSteps;
  }
  throw new OpensteerAgentExecutionError(
    'agent.execute(...) "maxSteps" must be a positive integer when provided.'
  );
}
8394
+
8395
+ // src/agent/clients/openai.ts
8396
+ var OpenAICuaClient = class extends CuaClient {
8397
+ client;
8398
+ modelConfig;
8399
+ constructor(modelConfig) {
8400
+ super();
8401
+ this.modelConfig = modelConfig;
8402
+ this.client = new import_openai.default({
8403
+ apiKey: modelConfig.apiKey,
8404
+ baseURL: modelConfig.baseUrl,
8405
+ organization: modelConfig.organization
8406
+ });
8407
+ }
8408
+ async execute(input) {
8409
+ const actions = [];
8410
+ let finalMessage = "";
8411
+ let completed = false;
8412
+ let step = 0;
8413
+ let previousResponseId;
8414
+ let nextInputItems = [
8415
+ {
8416
+ role: "system",
8417
+ content: input.systemPrompt
8418
+ },
8419
+ {
8420
+ role: "user",
8421
+ content: input.instruction
8422
+ }
8273
8423
  ];
8274
- if (args.cloudSessionUrl) {
8275
- fields.push(`url=${args.cloudSessionUrl}`);
8424
+ let totalInputTokens = 0;
8425
+ let totalOutputTokens = 0;
8426
+ let totalReasoningTokens = 0;
8427
+ let totalInferenceTimeMs = 0;
8428
+ while (!completed && step < input.maxSteps) {
8429
+ const startedAt = Date.now();
8430
+ const response = await this.getAction(nextInputItems, previousResponseId);
8431
+ totalInferenceTimeMs += Date.now() - startedAt;
8432
+ totalInputTokens += toNumber(response.usage?.input_tokens);
8433
+ totalOutputTokens += toNumber(response.usage?.output_tokens);
8434
+ totalReasoningTokens += toNumber(response.usage?.output_tokens_details?.reasoning_tokens) || toNumber(toRecord(response.usage).reasoning_tokens);
8435
+ previousResponseId = normalizeString(response.id) || previousResponseId;
8436
+ const stepResult = await this.processResponse(response.output);
8437
+ actions.push(...stepResult.actions);
8438
+ nextInputItems = stepResult.nextInputItems;
8439
+ completed = stepResult.completed;
8440
+ if (stepResult.message) {
8441
+ finalMessage = stepResult.message;
8442
+ }
8443
+ step += 1;
8276
8444
  }
8277
- process.stderr.write(`[opensteer] cloud session ready ${fields.join(" ")}
8278
- `);
8445
+ return {
8446
+ success: completed,
8447
+ completed,
8448
+ message: finalMessage,
8449
+ actions,
8450
+ usage: {
8451
+ inputTokens: totalInputTokens,
8452
+ outputTokens: totalOutputTokens,
8453
+ reasoningTokens: totalReasoningTokens > 0 ? totalReasoningTokens : void 0,
8454
+ inferenceTimeMs: totalInferenceTimeMs
8455
+ }
8456
+ };
8279
8457
  }
8280
- shouldAnnounceCloudSession() {
8281
- const cloudConfig = this.config.cloud && typeof this.config.cloud === "object" ? this.config.cloud : null;
8282
- const announce = cloudConfig?.announce ?? "always";
8283
- if (announce === "off") {
8284
- return false;
8285
- }
8458
+ async getAction(inputItems, previousResponseId) {
8459
+ const request = {
8460
+ model: this.modelConfig.providerModelName,
8461
+ tools: [
8462
+ {
8463
+ type: "computer_use_preview",
8464
+ display_width: this.viewport.width,
8465
+ display_height: this.viewport.height,
8466
+ environment: "browser"
8467
+ }
8468
+ ],
8469
+ input: inputItems,
8470
+ truncation: "auto",
8471
+ ...previousResponseId ? { previous_response_id: previousResponseId } : {}
8472
+ };
8473
+ try {
8474
+ return await this.client.responses.create(request);
8475
+ } catch (error) {
8476
+ throw mapOpenAiApiError(error);
8477
+ }
8478
+ }
8479
+ async processResponse(output) {
8480
+ const actions = [];
8481
+ const nextInputItems = [];
8482
+ const messageParts = [];
8483
+ let hasComputerAction = false;
8484
+ for (const item of output) {
8485
+ if (item.type === "computer_call") {
8486
+ hasComputerAction = true;
8487
+ const action = toAgentAction(item.action);
8488
+ actions.push(action);
8489
+ let actionError;
8490
+ try {
8491
+ await this.getActionHandler()(action);
8492
+ } catch (error) {
8493
+ actionError = error instanceof Error ? error.message : String(error);
8494
+ }
8495
+ const outputItem = {
8496
+ type: "computer_call_output",
8497
+ call_id: item.call_id
8498
+ };
8499
+ const safetyChecks = item.pending_safety_checks.length ? item.pending_safety_checks : void 0;
8500
+ const screenshotDataUrl = await this.captureScreenshotDataUrl();
8501
+ const outputPayload = {
8502
+ type: "input_image",
8503
+ image_url: screenshotDataUrl
8504
+ };
8505
+ if (this.currentUrl) {
8506
+ outputPayload.current_url = this.currentUrl;
8507
+ }
8508
+ if (actionError) {
8509
+ outputPayload.error = actionError;
8510
+ }
8511
+ outputItem.output = outputPayload;
8512
+ if (safetyChecks) {
8513
+ outputItem.acknowledged_safety_checks = safetyChecks;
8514
+ }
8515
+ nextInputItems.push(outputItem);
8516
+ }
8517
+ if (item.type === "message") {
8518
+ for (const content of item.content) {
8519
+ if (content.type === "output_text") {
8520
+ messageParts.push(content.text);
8521
+ }
8522
+ }
8523
+ }
8524
+ }
8525
+ return {
8526
+ actions,
8527
+ nextInputItems,
8528
+ completed: !hasComputerAction,
8529
+ message: messageParts.join("\n").trim()
8530
+ };
8531
+ }
8532
+ async captureScreenshotDataUrl() {
8533
+ const base64 = await this.getScreenshotProvider()();
8534
+ return `data:image/png;base64,${base64}`;
8535
+ }
8536
+ };
8537
/**
 * Convert a provider action payload into an agent action object.
 *
 * All properties of the payload are copied; `type` is replaced by its trimmed
 * string form, or "unknown" when it is missing, blank, or not a string.
 *
 * @param {unknown} action - Raw action payload (non-objects are treated as empty).
 * @returns {object} A new object carrying the payload's fields and a normalized `type`.
 */
function toAgentAction(action) {
  const actionRecord = toRecord(action);
  return {
    ...actionRecord,
    // Written after the spread: with the spread last, the payload's raw `type`
    // would overwrite the normalized value whenever the key is present.
    type: normalizeString(actionRecord.type) || "unknown"
  };
}
8544
/**
 * Wrap a failure from the OpenAI client in an OpensteerAgentApiError.
 *
 * The message comes from the nested `error.message` field when it is a
 * non-blank string; otherwise from the Error's own message, or from the
 * stringified value. The original failure is attached as the cause.
 *
 * @param {unknown} error - Value thrown by the OpenAI client.
 * @returns {OpensteerAgentApiError} Error tagged with provider "openai".
 */
function mapOpenAiApiError(error) {
  const outer = toRecord(error);
  const inner = toRecord(outer.error);
  const httpStatus = toNumber(outer.status);
  let text = normalizeString(inner.message);
  if (!text) {
    text = error instanceof Error ? error.message : String(error);
  }
  return new OpensteerAgentApiError("openai", text, httpStatus, error);
}
8551
/**
 * Return the value itself when it is a non-null object, otherwise a fresh
 * empty object, so that property reads on the result never throw.
 */
function toRecord(value) {
  if (typeof value !== "object" || !value) {
    return {};
  }
  return value;
}
8554
/**
 * Return the value when it is a finite number, otherwise 0.
 */
function toNumber(value) {
  // Number.isFinite is false for every non-number, so no separate typeof test is needed.
  if (Number.isFinite(value)) {
    return value;
  }
  return 0;
}
8557
/**
 * Trim a string value; non-strings and blank strings yield undefined.
 */
function normalizeString(value) {
  const text = typeof value === "string" ? value.trim() : "";
  if (text === "") {
    return void 0;
  }
  return text;
}
8562
+
8563
+ // src/agent/clients/anthropic.ts
8564
+ var import_sdk = __toESM(require("@anthropic-ai/sdk"), 1);
8565
/**
 * CUA client backed by the Anthropic SDK's beta messages endpoint.
 *
 * `execute` runs a request/act loop: each response's "computer" tool_use items
 * are converted to agent actions, handed to the action handler, and answered
 * with a tool_result holding a screenshot, the current URL and any error text.
 * The loop ends when a response contains no such items or `maxSteps` is hit.
 */
var AnthropicCuaClient = class extends CuaClient {
  modelConfig;
  client;
  /**
   * @param modelConfig - Resolved model configuration; `apiKey`, `baseUrl`,
   *   `providerModelName` and `thinkingBudget` are read from it.
   */
  constructor(modelConfig) {
    super();
    this.modelConfig = modelConfig;
    const { apiKey, baseUrl } = modelConfig;
    this.client = new import_sdk.default({ apiKey, baseURL: baseUrl });
  }
  /**
   * Run the agent loop for one instruction.
   *
   * @param input - Object with `instruction`, `systemPrompt` and `maxSteps`.
   * @returns Result with `success`/`completed` flags, the last text message,
   *   the executed actions and accumulated usage figures.
   */
  async execute(input) {
    const actions = [];
    const messages = [{ role: "user", content: input.instruction }];
    const totals = { inputTokens: 0, outputTokens: 0, reasoningTokens: 0, inferenceTimeMs: 0 };
    let finalMessage = "";
    let completed = false;
    for (let step = 0; !completed && step < input.maxSteps; step += 1) {
      const startedAt = Date.now();
      const response = await this.getAction(messages, input.systemPrompt);
      totals.inferenceTimeMs += Date.now() - startedAt;
      totals.inputTokens += toNumber2(response?.usage?.input_tokens);
      totals.outputTokens += toNumber2(response?.usage?.output_tokens);
      totals.reasoningTokens += toNumber2(toRecord2(response.usage).reasoning_tokens);
      const content = response.content.map((item) => toRecord2(item));
      const toolUseItems = content.filter(
        (item) => item.type === "tool_use" && item.name === "computer"
      );
      const message = extractTextMessage(content);
      if (message) {
        finalMessage = message;
      }
      // The assistant turn is always recorded before any tool results follow it.
      messages.push({ role: "assistant", content });
      if (toolUseItems.length) {
        const stepResult = await this.processToolUseItems(toolUseItems);
        actions.push(...stepResult.actions);
        messages.push({ role: "user", content: stepResult.toolResults });
      } else {
        completed = true;
      }
    }
    return {
      success: completed,
      completed,
      message: finalMessage,
      actions,
      usage: {
        inputTokens: totals.inputTokens,
        outputTokens: totals.outputTokens,
        reasoningTokens: totals.reasoningTokens > 0 ? totals.reasoningTokens : void 0,
        inferenceTimeMs: totals.inferenceTimeMs
      }
    };
  }
  /**
   * Execute each tool_use item and build the matching tool_result blocks.
   *
   * Failures of the action handler or of the screenshot provider are not
   * rethrown; their message is reported to the model as an "Error: ..." text.
   *
   * @param items - tool_use records from one response.
   * @returns `{ actions, toolResults }` in the same order as `items`.
   */
  async processToolUseItems(items) {
    const actions = [];
    const toolResults = [];
    const describe = (error) => (error instanceof Error ? error.message : String(error));
    for (const item of items) {
      const toolUseId = normalizeString2(item.id);
      const input = item.input && typeof item.input === "object" ? item.input : {};
      const action = convertAnthropicAction(input);
      actions.push(action);
      let failure;
      try {
        await this.getActionHandler()(action);
      } catch (error) {
        failure = describe(error);
      }
      const resultContent = [];
      try {
        const screenshot = await this.getScreenshotProvider()();
        resultContent.push({
          type: "image",
          source: {
            type: "base64",
            media_type: "image/png",
            data: screenshot
          }
        });
      } catch (error) {
        // An action failure, if any, takes precedence over the screenshot failure.
        failure = failure || describe(error);
      }
      if (this.currentUrl) {
        resultContent.push({
          type: "text",
          text: `Current URL: ${this.currentUrl}`
        });
      }
      if (failure) {
        resultContent.push({
          type: "text",
          text: `Error: ${failure}`
        });
      }
      const fallbackContent = [
        {
          type: "text",
          text: "Action completed."
        }
      ];
      toolResults.push({
        type: "tool_result",
        tool_use_id: toolUseId || "unknown_tool_use_id",
        content: resultContent.length > 0 ? resultContent : fallbackContent
      });
    }
    return { actions, toolResults };
  }
  /**
   * Send the conversation to the model and return its raw response.
   *
   * @param messages - Conversation so far.
   * @param systemPrompt - System prompt for the request.
   * @throws The value produced by mapAnthropicApiError when the call fails.
   */
  async getAction(messages, systemPrompt) {
    const { providerModelName, thinkingBudget } = this.modelConfig;
    // Tool version and beta flag are always selected as a matching pair.
    const useNewestTool = requiresNewestAnthropicToolVersion(providerModelName);
    const toolVersion = useNewestTool ? "computer_20251124" : "computer_20250124";
    const betaFlag = useNewestTool ? "computer-use-2025-11-24" : "computer-use-2025-01-24";
    const computerTool = {
      type: toolVersion,
      name: "computer",
      display_width_px: this.viewport.width,
      display_height_px: this.viewport.height,
      display_number: 1
    };
    const request = {
      model: providerModelName,
      max_tokens: 4096,
      system: systemPrompt,
      messages,
      tools: [computerTool],
      betas: [betaFlag]
    };
    if (typeof thinkingBudget === "number") {
      request.thinking = {
        type: "enabled",
        budget_tokens: thinkingBudget
      };
    }
    try {
      return await this.client.beta.messages.create(request);
    } catch (error) {
      throw mapAnthropicApiError(error);
    }
  }
};
8731
/**
 * Translate the `input` of an Anthropic "computer" tool call into an agent action.
 *
 * Recognized `input.action` values are mapped onto the click / double_click /
 * drag / scroll / keypress / move / type action shapes. Anything else is
 * passed through as `{ type, ...input }`, with type "unknown" when
 * `input.action` is missing or blank.
 *
 * @param {object} input - Tool call input.
 * @returns {object} The agent action.
 */
function convertAnthropicAction(input) {
  const type = normalizeString2(input.action) || "unknown";
  switch (type) {
    case "left_click": {
      const { x, y } = resolveCoordinates(input, type);
      return { type: "click", x, y, button: "left" };
    }
    case "double_click":
    case "doubleClick": {
      const { x, y } = resolveCoordinates(input, type);
      return { type: "double_click", x, y };
    }
    case "drag":
    case "left_click_drag": {
      // The drag starts at `start_coordinate` and ends at the action's own coordinates.
      const from = resolveCoordinateArray(input.start_coordinate, type, "start_coordinate");
      const to = resolveCoordinates(input, type);
      return { type: "drag", path: [from, to] };
    }
    case "scroll": {
      const { x, y } = resolveCoordinates(input, type);
      const direction = normalizeScrollDirection(input.scroll_direction, type);
      const amount = resolvePositiveNumber(input.scroll_amount, type, "scroll_amount");
      // Each unit of scroll_amount is turned into 100 units of scroll delta.
      const magnitude = Math.max(1, amount) * 100;
      const scrollX = direction === "left" ? -magnitude : direction === "right" ? magnitude : 0;
      const scrollY = direction === "up" ? -magnitude : direction === "down" ? magnitude : 0;
      return { type: "scroll", x, y, scrollX, scrollY };
    }
    case "keypress":
    case "key": {
      const keyText = normalizeRequiredString(
        input.text,
        `Anthropic action "${type}" requires a non-empty text value.`
      );
      return { type: "keypress", keys: [keyText] };
    }
    case "move": {
      const { x, y } = resolveCoordinates(input, type);
      return { type: "move", x, y };
    }
    case "click": {
      const { x, y } = resolveCoordinates(input, type);
      return { type: "click", x, y, button: normalizeMouseButton(input.button) };
    }
    case "type": {
      const point = resolveCoordinates(input, type);
      return {
        type: "type",
        text: normalizeRequiredString(
          input.text,
          `Anthropic action "${type}" requires a non-empty text value.`
        ),
        x: point.x,
        y: point.y
      };
    }
    default:
      return { type, ...input };
  }
}
8829
/**
 * Joins the text of every `type: "text"` content block with newlines and
 * trims the result. Blocks whose `text` is not a string are ignored.
 *
 * @param {Array<Record<string, unknown>>} content - Message content blocks.
 * @returns {string} Combined text, possibly empty.
 */
function extractTextMessage(content) {
  const lines = [];
  content.forEach((block) => {
    if (block.type !== "text" || typeof block.text !== "string") {
      return;
    }
    lines.push(String(block.text));
  });
  const combined = lines.join("\n");
  return combined.trim();
}
8833
/**
 * Reports whether the given model name is one of the models this module
 * treats as needing the newest Anthropic tool version.
 *
 * @param {string} modelName - Exact model identifier.
 * @returns {boolean} True only for an exact match against the known list.
 */
function requiresNewestAnthropicToolVersion(modelName) {
  const newestToolModels = [
    "claude-opus-4-6",
    "claude-sonnet-4-6",
    "claude-opus-4-5-20251101"
  ];
  return newestToolModels.includes(modelName);
}
8836
/**
 * Trims a string value.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string | undefined} The trimmed string, or `undefined` when the
 *   value is not a string or is blank after trimming.
 */
function normalizeString2(value) {
  const trimmed = typeof value === "string" ? value.trim() : "";
  return trimmed === "" ? void 0 : trimmed;
}
8841
/**
 * Like `normalizeString2`, but a missing or blank value is an error.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} errorMessage2 - Message for the thrown error.
 * @returns {string} The trimmed, non-empty string.
 * @throws {OpensteerAgentActionError} When no usable string is present.
 */
function normalizeRequiredString(value, errorMessage2) {
  const text = normalizeString2(value);
  if (text) {
    return text;
  }
  throw new OpensteerAgentActionError(errorMessage2);
}
8848
/**
 * Returns the value when it is a finite number, otherwise 0.
 *
 * @param {unknown} value - Candidate value.
 * @returns {number} The finite number or 0.
 */
function toNumber2(value) {
  if (typeof value !== "number") {
    return 0;
  }
  return Number.isFinite(value) ? value : 0;
}
8851
/**
 * Reads the first two entries of an array as numbers.
 *
 * @param {unknown} value - Expected to be an array such as `[x, y]`.
 * @returns {[number, number]} Each slot is the entry when its type is
 *   `number`, otherwise `NaN`; `[NaN, NaN]` when `value` is not an array.
 */
function arrayNumber(value) {
  const isList = Array.isArray(value);
  const numberAt = (index) => isList && typeof value[index] === "number" ? value[index] : NaN;
  return [numberAt(0), numberAt(1)];
}
8858
/**
 * Resolves the target point of an Anthropic action. Each axis prefers the
 * `input.coordinate` pair and falls back to `input.x` / `input.y`.
 *
 * @param {Record<string, unknown>} input - Tool input.
 * @param {string} actionType - Action name, used in the error message.
 * @returns {{x: number, y: number}} The resolved point.
 * @throws {OpensteerAgentActionError} When either axis cannot be resolved.
 */
function resolveCoordinates(input, actionType) {
  const pair = arrayNumber(input.coordinate);
  let x = toFiniteNumber(input.x);
  let y = toFiniteNumber(input.y);
  if (Number.isFinite(pair[0])) {
    x = pair[0];
  }
  if (Number.isFinite(pair[1])) {
    y = pair[1];
  }
  const resolved = x != null && y != null;
  if (!resolved) {
    throw new OpensteerAgentActionError(
      `Anthropic action "${actionType}" requires numeric x/y coordinates.`
    );
  }
  return { x, y };
}
8871
/**
 * Converts an `[x, y]` array into a point, requiring both entries to be
 * finite numbers.
 *
 * @param {unknown} value - Candidate coordinate array.
 * @param {string} actionType - Action name, used in the error message.
 * @param {string} field - Field name, used in the error message.
 * @returns {{x: number, y: number}} The point.
 * @throws {OpensteerAgentActionError} When either entry is not a finite number.
 */
function resolveCoordinateArray(value, actionType, field) {
  const pair = arrayNumber(value);
  const x = pair[0];
  const y = pair[1];
  if (Number.isFinite(x) && Number.isFinite(y)) {
    return { x, y };
  }
  throw new OpensteerAgentActionError(
    `Anthropic action "${actionType}" requires numeric "${field}" coordinates.`
  );
}
8880
/**
 * Requires a finite number strictly greater than zero.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} actionType - Action name, used in the error message.
 * @param {string} field - Field name, used in the error message.
 * @returns {number} The validated number.
 * @throws {OpensteerAgentActionError} When the value is missing, non-finite,
 *   zero or negative.
 */
function resolvePositiveNumber(value, actionType, field) {
  const parsed = toFiniteNumber(value);
  const isPositive = parsed != null && !(parsed <= 0);
  if (isPositive) {
    return parsed;
  }
  throw new OpensteerAgentActionError(
    `Anthropic action "${actionType}" requires a positive numeric "${field}" value.`
  );
}
8889
/**
 * Validates an Anthropic `scroll_direction` value.
 *
 * @param {unknown} value - Candidate direction; surrounding whitespace is ignored.
 * @param {string} actionType - Action name, used in the error message.
 * @returns {"up" | "down" | "left" | "right"} The validated direction.
 * @throws {OpensteerAgentActionError} For any other value.
 */
function normalizeScrollDirection(value, actionType) {
  const allowed = ["up", "down", "left", "right"];
  const direction = normalizeString2(value);
  if (!allowed.includes(direction)) {
    throw new OpensteerAgentActionError(
      `Anthropic action "${actionType}" requires "scroll_direction" to be one of: up, down, left, right.`
    );
  }
  return direction;
}
8898
/**
 * Validates the `button` of an Anthropic "click" action, case-insensitively.
 *
 * @param {unknown} value - Candidate button name.
 * @returns {"left" | "right" | "middle"} The lower-cased button.
 * @throws {OpensteerAgentActionError} When the value is blank, not a string,
 *   or names an unsupported button.
 */
function normalizeMouseButton(value) {
  const required = normalizeRequiredString(
    value,
    'Anthropic action "click" requires a non-empty "button" value.'
  );
  const button = required.toLowerCase();
  const supported = ["left", "right", "middle"];
  if (!supported.includes(button)) {
    throw new OpensteerAgentActionError(
      `Anthropic action "click" has unsupported button "${button}".`
    );
  }
  return button;
}
8910
/**
 * Returns the value when it is a finite number, otherwise `null`.
 *
 * @param {unknown} value - Candidate value.
 * @returns {number | null} The finite number or `null`.
 */
function toFiniteNumber(value) {
  const isFiniteNumber = typeof value === "number" && Number.isFinite(value);
  return isFiniteNumber ? value : null;
}
8916
/**
 * Wraps an error thrown by the Anthropic SDK in an `OpensteerAgentApiError`.
 *
 * The message prefers `error.error.message`, then `error.message` for Error
 * instances, then the stringified value. A numeric `error.status` is carried over.
 *
 * @param {unknown} error - The thrown value.
 * @returns {OpensteerAgentApiError} Wrapped error with provider "anthropic".
 */
function mapAnthropicApiError(error) {
  const record = toRecord2(error);
  const nested = toRecord2(record.error);
  let status;
  if (typeof record.status === "number") {
    status = record.status;
  }
  let message = normalizeString2(nested.message);
  if (!message) {
    message = error instanceof Error ? error.message : String(error);
  }
  return new OpensteerAgentApiError("anthropic", message, status, error);
}
8923
/**
 * Returns the value itself when it is a non-null object, otherwise a fresh
 * empty object, so callers can read properties without guarding.
 *
 * @param {unknown} value - Candidate value.
 * @returns {object} The value or `{}`.
 */
function toRecord2(value) {
  if (value && typeof value === "object") {
    return value;
  }
  return {};
}
8926
+
8927
+ // src/agent/clients/google.ts
8928
+ var import_genai = require("@google/genai");
8929
+
8930
+ // src/agent/coords.ts
8931
// Default viewport dimensions for computer-use sessions.
var DEFAULT_CUA_VIEWPORT = { width: 1288, height: 711 };
8935
/**
 * Maps a point given on a 0-999 grid onto viewport pixels.
 * Each axis is clamped to [0, 999], divided by 1000, multiplied by the
 * viewport extent and floored.
 *
 * @param {number} x - Horizontal grid position.
 * @param {number} y - Vertical grid position.
 * @param {{width: number, height: number}} viewport - Target viewport size.
 * @returns {{x: number, y: number}} Pixel coordinates.
 */
function normalizeGoogleCoordinates(x, y, viewport) {
  const clamp = (gridValue) => Math.min(999, Math.max(0, gridValue));
  const toPixels = (gridValue, extent) => Math.floor(clamp(gridValue) / 1e3 * extent);
  return {
    x: toPixels(x, viewport.width),
    y: toPixels(y, viewport.height)
  };
}
8943
/**
 * Applies provider-specific coordinate scaling. Only the "google" provider is
 * rescaled; every other provider's coordinates are returned untouched.
 *
 * @param {string} provider - Provider identifier.
 * @param {number} x - Horizontal coordinate.
 * @param {number} y - Vertical coordinate.
 * @param {{width: number, height: number}} viewport - Viewport size.
 * @returns {{x: number, y: number}} Coordinates in viewport pixels.
 */
function maybeNormalizeCoordinates(provider, x, y, viewport) {
  return provider === "google" ? normalizeGoogleCoordinates(x, y, viewport) : { x, y };
}
8949
+
8950
+ // src/agent/key-mapping.ts
8951
// Upper-cased key aliases mapped to the key names passed to Playwright.
// Declared as [target, aliases] groups and flattened into a lookup table.
var KEY_MAP = Object.fromEntries(
  [
    ["Enter", ["ENTER", "RETURN"]],
    ["Escape", ["ESCAPE", "ESC"]],
    ["Backspace", ["BACKSPACE"]],
    ["Tab", ["TAB"]],
    [" ", ["SPACE"]],
    ["Delete", ["DELETE", "DEL"]],
    ["ArrowUp", ["ARROWUP", "ARROW_UP", "UP"]],
    ["ArrowDown", ["ARROWDOWN", "ARROW_DOWN", "DOWN"]],
    ["ArrowLeft", ["ARROWLEFT", "ARROW_LEFT", "LEFT"]],
    ["ArrowRight", ["ARROWRIGHT", "ARROW_RIGHT", "RIGHT"]],
    ["Shift", ["SHIFT"]],
    ["Control", ["CONTROL", "CTRL"]],
    ["Alt", ["ALT", "OPTION"]],
    ["Meta", ["META", "COMMAND", "CMD", "SUPER", "WINDOWS", "WIN"]],
    ["Home", ["HOME"]],
    ["End", ["END"]],
    ["PageUp", ["PAGEUP", "PAGE_UP", "PGUP"]],
    ["PageDown", ["PAGEDOWN", "PAGE_DOWN", "PGDN"]],
    // Resolved once at load time: Meta on macOS, Control everywhere else.
    [process.platform === "darwin" ? "Meta" : "Control", ["CONTROLORMETA"]]
  ].flatMap(([target, aliases]) => aliases.map((alias) => [alias, target]))
);
8994
/**
 * Translates a key name through `KEY_MAP` (case-insensitively). Names with no
 * entry are returned trimmed but otherwise unchanged.
 *
 * @param {string} key - Key name as produced by a model.
 * @returns {string} The mapped key name, or the trimmed input.
 */
function mapKeyToPlaywright(key) {
  const token = key.trim();
  if (token === "") {
    return token;
  }
  return KEY_MAP[token.toUpperCase()] || token;
}
9000
+
9001
+ // src/agent/clients/google.ts
9002
/**
 * Computer-use client that drives the Google GenAI `computerUse` tool.
 *
 * `execute` runs a request/act loop: it asks the model for content, runs any
 * function calls through the action handler, sends back a screenshot per call,
 * and repeats until the model answers without function calls or `maxSteps`
 * is reached.
 */
var GoogleCuaClient = class extends CuaClient {
  modelConfig;
  client;
  history = [];
  /**
   * @param {object} modelConfig - Reads `apiKey`, optional `baseUrl`,
   *   `providerModelName` and `environment`.
   */
  constructor(modelConfig) {
    super();
    this.modelConfig = modelConfig;
    this.client = new import_genai.GoogleGenAI({
      apiKey: modelConfig.apiKey,
      ...modelConfig.baseUrl ? { httpOptions: { baseUrl: modelConfig.baseUrl } } : {}
    });
  }
  /**
   * Runs the agent loop for one instruction.
   *
   * @param {{systemPrompt: string, instruction: string, maxSteps: number}} input
   * @returns {Promise<object>} `{ success, completed, message, actions, usage }`.
   * @throws {OpensteerAgentActionError} When the model returns function calls
   *   together with a finish reason other than "STOP", or a function call
   *   cannot be mapped to actions.
   */
  async execute(input) {
    // The conversation is rebuilt for every run; the system prompt is sent as
    // a leading user turn.
    this.history = [
      {
        role: "user",
        parts: [
          {
            text: `System prompt: ${input.systemPrompt}`
          }
        ]
      },
      {
        role: "user",
        parts: [
          {
            text: input.instruction
          }
        ]
      }
    ];
    const actions = [];
    let finalMessage = "";
    let completed = false;
    let step = 0;
    let totalInputTokens = 0;
    let totalOutputTokens = 0;
    let totalInferenceTimeMs = 0;
    while (!completed && step < input.maxSteps) {
      const startedAt = Date.now();
      const response = await this.generateContent();
      totalInferenceTimeMs += Date.now() - startedAt;
      const usageMetadata = response.usageMetadata || {};
      totalInputTokens += toFiniteNumberOrZero(usageMetadata.promptTokenCount);
      totalOutputTokens += toFiniteNumberOrZero(
        usageMetadata.candidatesTokenCount
      );
      const candidate = Array.isArray(response.candidates) ? response.candidates[0] : null;
      const content = candidate && typeof candidate === "object" && candidate.content && typeof candidate.content === "object" ? candidate.content : null;
      const parts = content && Array.isArray(content.parts) ? content.parts : [];
      const finishReason = extractFinishReason(candidate);
      if (content) {
        this.history.push({
          role: "model",
          parts
        });
      }
      const messageParts = [];
      const functionCalls = [];
      for (const part of parts) {
        if (typeof part.text === "string") {
          messageParts.push(part.text);
        }
        if (part.functionCall && typeof part.functionCall === "object") {
          functionCalls.push(part.functionCall);
        }
      }
      if (messageParts.length) {
        finalMessage = messageParts.join("\n").trim();
      }
      if (!functionCalls.length) {
        completed = isSuccessfulGoogleFinishReason(finishReason);
        if (!completed && !finalMessage) {
          finalMessage = `Google CUA stopped with finish reason: ${finishReason || "unknown"}.`;
        }
      } else {
        // Reject the response before touching the page: running the calls and
        // then throwing would leave side effects from a response we refuse.
        if (finishReason && finishReason !== "STOP") {
          throw new OpensteerAgentActionError(
            `Google CUA returned function calls with terminal finish reason "${finishReason}".`
          );
        }
        const functionResponses = [];
        for (const functionCall of functionCalls) {
          const mappedActions = mapGoogleFunctionCallToActions(
            functionCall,
            this.viewport
          );
          let executionError;
          for (const mappedAction of mappedActions) {
            // Only actions that were actually attempted are recorded.
            actions.push(mappedAction);
            try {
              await this.getActionHandler()(mappedAction);
            } catch (error) {
              executionError = error instanceof Error ? error.message : String(error);
              // Later sub-actions depend on earlier ones (e.g. click before
              // select-all and type), so stop at the first failure and report it.
              break;
            }
          }
          const screenshotBase64 = await this.getScreenshotProvider()();
          const responsePayload = {
            url: this.currentUrl || ""
          };
          const args = functionCall.args && typeof functionCall.args === "object" ? functionCall.args : null;
          if (args && args.safety_decision !== void 0) {
            responsePayload.safety_acknowledgement = "true";
          }
          if (executionError) {
            responsePayload.error = executionError;
          }
          functionResponses.push({
            functionResponse: {
              name: typeof functionCall.name === "string" && functionCall.name || "computer_use",
              response: responsePayload,
              parts: [
                {
                  inlineData: {
                    mimeType: "image/png",
                    data: screenshotBase64
                  }
                }
              ]
            }
          });
        }
        if (functionResponses.length) {
          this.history.push({
            role: "user",
            parts: functionResponses
          });
        }
        completed = false;
      }
      step += 1;
    }
    return {
      success: completed,
      completed,
      message: finalMessage,
      actions,
      usage: {
        inputTokens: totalInputTokens,
        outputTokens: totalOutputTokens,
        inferenceTimeMs: totalInferenceTimeMs
      }
    };
  }
  /**
   * Sends the current history to the model with the computer-use tool enabled.
   *
   * @returns {Promise<object>} The raw SDK response.
   * @throws The value returned by `mapGoogleApiError` when the SDK call fails.
   */
  async generateContent() {
    const params = {
      model: this.modelConfig.providerModelName,
      contents: this.history,
      config: {
        temperature: 1,
        topP: 0.95,
        topK: 40,
        maxOutputTokens: 8192,
        tools: [
          {
            computerUse: {
              environment: resolveGoogleEnvironment(
                this.modelConfig.environment
              )
            }
          }
        ]
      }
    };
    try {
      return await this.client.models.generateContent(params);
    } catch (error) {
      throw mapGoogleApiError(error);
    }
  }
};
9173
/**
 * Translates one Google computer-use function call into the ordered list of
 * provider-neutral actions that implement it.
 *
 * @param {{name?: unknown, args?: unknown}} functionCall - Function call part from the model.
 * @param {{width: number, height: number}} viewport - Viewport used to scale coordinates.
 * @returns {Array<Record<string, unknown>>} One or more actions, in execution order.
 * @throws {OpensteerAgentActionError} When the name is missing or unsupported,
 *   or a required argument is missing or invalid.
 */
function mapGoogleFunctionCallToActions(functionCall, viewport) {
  const name = normalizeString3(functionCall.name);
  const args = functionCall.args && typeof functionCall.args === "object" ? functionCall.args : {};
  if (!name) {
    throw new OpensteerAgentActionError(
      'Google CUA function call is missing a "name" value.'
    );
  }
  switch (name) {
    case "click_at": {
      const coordinates = normalizeCoordinates(args, viewport, name);
      return [
        {
          type: "click",
          x: coordinates.x,
          y: coordinates.y,
          button: normalizeString3(args.button) || "left"
        }
      ];
    }
    case "type_text_at": {
      const coordinates = normalizeCoordinates(args, viewport, name);
      // Defaults when the flags are absent: clear the field first, do not press Enter.
      const clearBeforeTyping = typeof args.clear_before_typing === "boolean" ? args.clear_before_typing : true;
      const pressEnter = typeof args.press_enter === "boolean" ? args.press_enter : false;
      const text = normalizeRequiredString2(
        args.text,
        'Google action "type_text_at" requires a non-empty "text" value.'
      );
      const actions = [
        {
          type: "click",
          x: coordinates.x,
          y: coordinates.y,
          button: "left"
        }
      ];
      if (clearBeforeTyping) {
        actions.push({
          type: "keypress",
          keys: ["ControlOrMeta+A"]
        });
        actions.push({
          type: "keypress",
          keys: ["Backspace"]
        });
      }
      actions.push({
        type: "type",
        text,
        x: coordinates.x,
        y: coordinates.y
      });
      if (pressEnter) {
        actions.push({
          type: "keypress",
          keys: ["Enter"]
        });
      }
      return actions;
    }
    case "key_combination": {
      const keysRaw = normalizeRequiredString2(
        args.keys,
        'Google action "key_combination" requires a non-empty "keys" value.'
      );
      const keys = keysRaw.split("+").map((part) => part.trim()).filter(Boolean).map((part) => mapKeyToPlaywright(part));
      if (!keys.length) {
        throw new OpensteerAgentActionError(
          'Google action "key_combination" did not produce any key tokens.'
        );
      }
      return [
        {
          type: "keypress",
          keys
        }
      ];
    }
    case "scroll_document": {
      const direction = normalizeVerticalDirection(
        args.direction,
        "scroll_document"
      );
      return [
        {
          type: "keypress",
          keys: [direction === "up" ? "PageUp" : "PageDown"]
        }
      ];
    }
    case "scroll_at": {
      const coordinates = normalizeCoordinates(args, viewport, name);
      const direction = normalizeScrollDirection2(args.direction, "scroll_at");
      // `magnitude` is optional in the tool definition (documented default 800);
      // a value that is present must still be a positive number.
      const defaultScrollMagnitude = 800;
      const magnitude = args.magnitude == null ? defaultScrollMagnitude : parsePositiveNumber(
        args.magnitude,
        "scroll_at",
        "magnitude"
      );
      let scrollX = 0;
      let scrollY = 0;
      if (direction === "up") scrollY = -magnitude;
      if (direction === "down") scrollY = magnitude;
      if (direction === "left") scrollX = -magnitude;
      if (direction === "right") scrollX = magnitude;
      return [
        {
          type: "scroll",
          x: coordinates.x,
          y: coordinates.y,
          scrollX,
          scrollY
        }
      ];
    }
    case "hover_at": {
      const coordinates = normalizeCoordinates(args, viewport, name);
      return [
        {
          type: "move",
          x: coordinates.x,
          y: coordinates.y
        }
      ];
    }
    case "drag_and_drop": {
      const startX = parseRequiredNumber(args.x, "drag_and_drop", "x");
      const startY = parseRequiredNumber(args.y, "drag_and_drop", "y");
      const endX = parseRequiredNumber(
        args.destination_x,
        "drag_and_drop",
        "destination_x"
      );
      const endY = parseRequiredNumber(
        args.destination_y,
        "drag_and_drop",
        "destination_y"
      );
      const start = maybeNormalizeCoordinates(
        "google",
        startX,
        startY,
        viewport
      );
      const end = maybeNormalizeCoordinates(
        "google",
        endX,
        endY,
        viewport
      );
      return [
        {
          type: "drag",
          path: [start, end]
        }
      ];
    }
    case "navigate":
      return [
        {
          type: "goto",
          url: normalizeRequiredString2(
            args.url,
            'Google action "navigate" requires a non-empty "url" value.'
          )
        }
      ];
    case "go_back":
      return [{ type: "back" }];
    case "go_forward":
      return [{ type: "forward" }];
    case "wait_5_seconds":
      return [{ type: "wait", timeMs: 5e3 }];
    case "search":
      return [
        {
          type: "goto",
          url: buildGoogleSearchUrl(args)
        }
      ];
    case "open_web_browser":
      return [{ type: "open_web_browser" }];
    default:
      throw new OpensteerAgentActionError(
        `Unsupported Google CUA function call "${name}".`
      );
  }
}
9360
/**
 * Reads the required numeric `x` / `y` arguments of a Google action and
 * scales them for the viewport.
 *
 * @param {Record<string, unknown>} args - Function-call arguments.
 * @param {{width: number, height: number}} viewport - Viewport size.
 * @param {string} actionName - Action name, used in error messages.
 * @returns {{x: number, y: number}} Scaled coordinates.
 * @throws {OpensteerAgentActionError} When `x` or `y` is not a finite number.
 */
function normalizeCoordinates(args, viewport, actionName) {
  const [gridX, gridY] = ["x", "y"].map(
    (axis) => parseRequiredNumber(args[axis], actionName, axis)
  );
  return maybeNormalizeCoordinates("google", gridX, gridY, viewport);
}
9370
/**
 * Requires a finite number.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} actionName - Action name, used in the error message.
 * @param {string} field - Field name, used in the error message.
 * @returns {number} The validated number.
 * @throws {OpensteerAgentActionError} When the value is not a finite number.
 */
function parseRequiredNumber(value, actionName, field) {
  const isUsable = typeof value === "number" && Number.isFinite(value);
  if (!isUsable) {
    throw new OpensteerAgentActionError(
      `Google action "${actionName}" requires numeric "${field}" coordinates.`
    );
  }
  return value;
}
9378
/**
 * Requires a finite number strictly greater than zero.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} actionName - Action name, used in the error message.
 * @param {string} field - Field name, used in the error message.
 * @returns {number} The validated number.
 * @throws {OpensteerAgentActionError} When the value is not a positive finite number.
 */
function parsePositiveNumber(value, actionName, field) {
  const isPositive = typeof value === "number" && Number.isFinite(value) && value > 0;
  if (!isPositive) {
    throw new OpensteerAgentActionError(
      `Google action "${actionName}" requires a positive numeric "${field}" value.`
    );
  }
  return value;
}
9386
/**
 * Returns the value when it is a finite number, otherwise 0.
 *
 * @param {unknown} value - Candidate value.
 * @returns {number} The finite number or 0.
 */
function toFiniteNumberOrZero(value) {
  if (typeof value === "number" && Number.isFinite(value)) {
    return value;
  }
  return 0;
}
9389
/**
 * Trims a string value.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string | undefined} The trimmed string, or `undefined` when the
 *   value is not a string or is blank after trimming.
 */
function normalizeString3(value) {
  if (typeof value === "string") {
    const trimmed = value.trim();
    if (trimmed.length > 0) {
      return trimmed;
    }
  }
  return void 0;
}
9394
/**
 * Like `normalizeString3`, but a missing or blank value is an error.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} errorMessage2 - Message for the thrown error.
 * @returns {string} The trimmed, non-empty string.
 * @throws {OpensteerAgentActionError} When no usable string is present.
 */
function normalizeRequiredString2(value, errorMessage2) {
  const text = normalizeString3(value);
  if (text) {
    return text;
  }
  throw new OpensteerAgentActionError(errorMessage2);
}
9401
/**
 * Validates the `direction` argument of a Google scroll action.
 *
 * @param {unknown} value - Candidate direction; surrounding whitespace is ignored.
 * @param {string} actionName - Action name, used in the error message.
 * @returns {"up" | "down" | "left" | "right"} The validated direction.
 * @throws {OpensteerAgentActionError} For any other value.
 */
function normalizeScrollDirection2(value, actionName) {
  const allowed = ["up", "down", "left", "right"];
  const direction = normalizeString3(value);
  if (!allowed.includes(direction)) {
    throw new OpensteerAgentActionError(
      `Google action "${actionName}" requires "direction" to be one of: up, down, left, right.`
    );
  }
  return direction;
}
9410
/**
 * Validates a direction that may only be "up" or "down".
 *
 * @param {unknown} value - Candidate direction; surrounding whitespace is ignored.
 * @param {string} actionName - Action name, used in the error message.
 * @returns {"up" | "down"} The validated direction.
 * @throws {OpensteerAgentActionError} For any other value.
 */
function normalizeVerticalDirection(value, actionName) {
  const direction = normalizeString3(value);
  const isVertical = direction === "up" || direction === "down";
  if (!isVertical) {
    throw new OpensteerAgentActionError(
      `Google action "${actionName}" requires "direction" to be "up" or "down".`
    );
  }
  return direction;
}
9419
/**
 * Builds a google.com search URL from the `query` argument, falling back to
 * `text` when `query` is null or undefined.
 *
 * @param {Record<string, unknown>} args - Function-call arguments.
 * @returns {string} Search URL with the query URI-encoded.
 * @throws {OpensteerAgentActionError} When no non-empty query is available.
 */
function buildGoogleSearchUrl(args) {
  const rawQuery = args.query ?? args.text;
  const query = normalizeRequiredString2(
    rawQuery,
    'Google action "search" requires a non-empty "query" value.'
  );
  const encodedQuery = encodeURIComponent(query);
  return `https://www.google.com/search?q=${encodedQuery}`;
}
9426
/**
 * Reads the trimmed `finishReason` of a response candidate.
 *
 * @param {unknown} candidate - Response candidate, possibly missing.
 * @returns {string | undefined} The finish reason, or `undefined` when the
 *   candidate is not an object or carries no usable reason.
 */
function extractFinishReason(candidate) {
  const isObject = Boolean(candidate) && typeof candidate === "object";
  return isObject ? normalizeString3(candidate.finishReason) : void 0;
}
9432
/**
 * A run counts as successfully finished when the model gave no finish reason
 * at all or the reason is exactly "STOP".
 *
 * @param {string | undefined} finishReason - Finish reason from the candidate.
 * @returns {boolean} Whether the reason indicates normal completion.
 */
function isSuccessfulGoogleFinishReason(finishReason) {
  if (!finishReason) {
    return true;
  }
  return finishReason === "STOP";
}
9435
/**
 * Chooses the computer-use environment enum value. Only an explicit
 * `ENVIRONMENT_UNSPECIFIED` request is honoured; anything else resolves to
 * the browser environment.
 *
 * @param {unknown} value - Configured environment.
 * @returns {string} A member of the SDK's `Environment` enum.
 */
function resolveGoogleEnvironment(value) {
  const { ENVIRONMENT_UNSPECIFIED, ENVIRONMENT_BROWSER } = import_genai.Environment;
  const requested = normalizeString3(value);
  return requested === ENVIRONMENT_UNSPECIFIED ? ENVIRONMENT_UNSPECIFIED : ENVIRONMENT_BROWSER;
}
9442
/**
 * Wraps an error thrown by the Google SDK in an `OpensteerAgentApiError`.
 *
 * The message prefers a non-blank `error.message`, then `error.message` for
 * Error instances, then the stringified value. A numeric `error.status` is
 * carried over.
 *
 * @param {unknown} error - The thrown value.
 * @returns {OpensteerAgentApiError} Wrapped error with provider "google".
 */
function mapGoogleApiError(error) {
  const record = toRecord3(error);
  let status;
  if (typeof record.status === "number") {
    status = record.status;
  }
  let message = normalizeString3(record.message);
  if (!message) {
    message = error instanceof Error ? error.message : String(error);
  }
  return new OpensteerAgentApiError("google", message, status, error);
}
9448
/**
 * Returns the value itself when it is a non-null object, otherwise a fresh
 * empty object, so callers can read properties without guarding.
 *
 * @param {unknown} value - Candidate value.
 * @returns {object} The value or `{}`.
 */
function toRecord3(value) {
  if (value && typeof value === "object") {
    return value;
  }
  return {};
}
9451
+
9452
+ // src/agent/provider.ts
9453
// System prompt used when the agent configuration does not supply its own.
var DEFAULT_SYSTEM_PROMPT = [
  "You are a browser automation agent.",
  "Complete the user instruction safely and efficiently.",
  "Do not ask follow-up questions.",
  "Finish as soon as the task is complete."
].join(" ");
9454
/**
 * Validates the user-supplied agent configuration and fills in defaults.
 *
 * @param {{agentConfig: object, fallbackModel?: unknown, env?: unknown}} args
 * @returns {{mode: "cua", systemPrompt: string, waitBetweenActionsMs: number, model: object}}
 *   The resolved configuration.
 * @throws {OpensteerAgentConfigError} When the configuration is not an object
 *   or its mode is not "cua".
 */
function resolveAgentConfig(args) {
  const agentConfig = args.agentConfig;
  const isConfigObject = Boolean(agentConfig) && typeof agentConfig === "object";
  if (!isConfigObject) {
    throw new OpensteerAgentConfigError(
      'agent() requires a configuration object with mode: "cua".'
    );
  }
  if (agentConfig.mode !== "cua") {
    throw new OpensteerAgentConfigError(
      `Unsupported agent mode "${String(agentConfig.mode)}". OpenSteer currently supports only mode: "cua".`
    );
  }
  const model = resolveCuaModelConfig({
    agentConfig,
    fallbackModel: args.fallbackModel,
    env: args.env
  });
  const customPrompt = normalizeNonEmptyString(agentConfig.systemPrompt);
  const systemPrompt = customPrompt || DEFAULT_SYSTEM_PROMPT;
  const waitBetweenActionsMs = normalizeWaitBetween(agentConfig.waitBetweenActionsMs);
  return { mode: "cua", systemPrompt, waitBetweenActionsMs, model };
}
9478
/**
 * Instantiates the computer-use client matching `config.model.provider`.
 *
 * @param {{model: {provider: string}}} config - Resolved agent configuration.
 * @returns {object} An OpenAI, Anthropic or Google client built from `config.model`.
 * @throws {OpensteerAgentProviderError} For any other provider.
 */
function createCuaClient(config) {
  const provider = config.model.provider;
  if (provider === "openai") {
    return new OpenAICuaClient(config.model);
  }
  if (provider === "anthropic") {
    return new AnthropicCuaClient(config.model);
  }
  if (provider === "google") {
    return new GoogleCuaClient(config.model);
  }
  throw new OpensteerAgentProviderError(
    `Unsupported CUA provider "${String(config.model.provider)}".`
  );
}
9492
/**
 * Trims a string value.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string | undefined} The trimmed string, or `undefined` when the
 *   value is not a string or is blank after trimming.
 */
function normalizeNonEmptyString(value) {
  const trimmed = typeof value === "string" ? value.trim() : "";
  return trimmed === "" ? void 0 : trimmed;
}
9497
/**
 * Resolves the pause between actions, in milliseconds.
 *
 * @param {unknown} value - Configured delay.
 * @returns {number} The delay rounded down to an integer, or 500 when the
 *   value is not a finite, non-negative number.
 */
function normalizeWaitBetween(value) {
  const isUsable = typeof value === "number" && Number.isFinite(value) && value >= 0;
  return isUsable ? Math.floor(value) : 500;
}
9503
+
9504
+ // src/agent/action-executor.ts
9505
/**
 * Performs one provider-neutral agent action against a page.
 *
 * Handled types (after `normalizeActionType`): click, doubleclick,
 * tripleclick, rightclick, type, keypress, scroll, drag, move/hover, wait,
 * goto, back, forward. "screenshot" and "open_web_browser" are no-ops.
 *
 * @param {object} page - Page object exposing `mouse`, `keyboard`, `goto`,
 *   `goBack` and `goForward`.
 * @param {Record<string, unknown>} action - Action to perform; `action.type` selects the behaviour.
 * @returns {Promise<void>}
 * @throws {OpensteerAgentActionError} For unsupported types, missing
 *   coordinates, an empty drag path or a missing goto URL.
 */
async function executeAgentAction(page, action) {
  const type = normalizeActionType(action.type);
  switch (type) {
    case "click": {
      const { x, y } = toPoint(action);
      await page.mouse.click(x, y, {
        button: normalizeMouseButton2(action.button, "left"),
        clickCount: normalizeClickCount(action.clickCount, 1)
      });
      return;
    }
    case "doubleclick": {
      const { x, y } = toPoint(action);
      await page.mouse.click(x, y, {
        button: normalizeMouseButton2(action.button, "left"),
        clickCount: 2
      });
      return;
    }
    case "tripleclick": {
      const { x, y } = toPoint(action);
      await page.mouse.click(x, y, {
        button: normalizeMouseButton2(action.button, "left"),
        clickCount: 3
      });
      return;
    }
    case "rightclick": {
      const { x, y } = toPoint(action);
      await page.mouse.click(x, y, {
        button: "right",
        clickCount: normalizeClickCount(action.clickCount, 1)
      });
      return;
    }
    case "type": {
      // Clicks the target first when the action carries a point.
      await maybeFocusPoint(page, action);
      const text = typeof action.text === "string" ? action.text : "";
      if (action.clearBeforeTyping === true) {
        await pressKeyCombo(page, "ControlOrMeta+A");
        await page.keyboard.press("Backspace");
      }
      await page.keyboard.type(text);
      if (action.pressEnter === true) {
        await page.keyboard.press("Enter");
      }
      return;
    }
    case "keypress": {
      const combos = normalizeKeyCombos(action.keys);
      for (const combo of combos) {
        await pressKeyCombo(page, combo);
      }
      return;
    }
    case "scroll": {
      const x = numberOr(action.scrollX, action.scroll_x, 0);
      const y = numberOr(action.scrollY, action.scroll_y, 0);
      const point = maybePoint(action);
      if (point) {
        await page.mouse.move(point.x, point.y);
      }
      await page.mouse.wheel(x, y);
      return;
    }
    case "drag": {
      const path5 = normalizePath(action.path);
      if (!path5.length) {
        throw new OpensteerAgentActionError(
          "Drag action requires a non-empty path."
        );
      }
      await page.mouse.move(path5[0].x, path5[0].y);
      await page.mouse.down();
      try {
        for (const point of path5.slice(1)) {
          await page.mouse.move(point.x, point.y);
        }
      } finally {
        // Always release the button, even if a move fails mid-drag, so later
        // actions do not run with the mouse still held down.
        await page.mouse.up();
      }
      return;
    }
    case "move":
    case "hover": {
      const { x, y } = toPoint(action);
      await page.mouse.move(x, y);
      return;
    }
    case "wait": {
      const ms = numberOr(action.timeMs, action.time_ms, 1e3);
      await sleep3(ms);
      return;
    }
    case "goto": {
      const url = normalizeRequiredString3(action.url, "Action URL is required for goto.");
      await page.goto(url, { waitUntil: "load" });
      return;
    }
    case "back": {
      // Best effort: history navigation failures are deliberately ignored.
      await page.goBack({ waitUntil: "load" }).catch(() => void 0);
      return;
    }
    case "forward": {
      // Best effort: history navigation failures are deliberately ignored.
      await page.goForward({ waitUntil: "load" }).catch(() => void 0);
      return;
    }
    case "screenshot":
    case "open_web_browser": {
      return;
    }
    default:
      throw new OpensteerAgentActionError(
        `Unsupported CUA action type "${String(action.type)}".`
      );
  }
}
9619
/**
 * Reports whether an action can change page state. Only "wait",
 * "screenshot" and "open_web_browser" are treated as non-mutating.
 *
 * @param {{type: unknown}} action - Action to classify.
 * @returns {boolean} True for every other action type.
 */
function isMutatingAgentAction(action) {
  const passiveTypes = ["wait", "screenshot", "open_web_browser"];
  return !passiveTypes.includes(normalizeActionType(action.type));
}
9623
/**
 * Canonicalises an action type: trims, lower-cases and folds known aliases
 * onto the names used by the executor's dispatch.
 *
 * @param {unknown} value - Raw action type.
 * @returns {string} The canonical type; "" when the value is not a string or is blank.
 */
function normalizeActionType(value) {
  if (typeof value !== "string") {
    return "";
  }
  const key = value.trim().toLowerCase();
  switch (key) {
    case "double_click":
    case "doubleclick":
      return "doubleclick";
    case "triple_click":
    case "tripleclick":
      return "tripleclick";
    case "left_click":
      return "click";
    case "right_click":
      return "rightclick";
    case "openwebbrowser":
    case "open_web_browser":
      return "open_web_browser";
    default:
      // Includes the blank case, which yields "".
      return key;
  }
}
9644
/**
 * Resolves the point an action targets, failing when none is available.
 *
 * @param {Record<string, unknown>} action - Action carrying coordinates.
 * @returns {{x: number, y: number}} The target point.
 * @throws {OpensteerAgentActionError} When `maybePoint` finds no coordinates.
 */
function toPoint(action) {
  const point = maybePoint(action);
  if (!point) {
    throw new OpensteerAgentActionError(
      `Action "${String(action.type)}" requires numeric x and y coordinates.`
    );
  }
  return point;
}
9653
/**
 * Extracts a target point from an action, if it has one. `x` / `y` take
 * precedence; otherwise the first two entries of `coordinate` (or, failing
 * that, `coordinates`) are used.
 *
 * @param {Record<string, unknown>} action - Action to inspect.
 * @returns {{x: number, y: number} | null} The point, or `null` when either
 *   axis is not a finite number.
 */
function maybePoint(action) {
  let pair = null;
  if (Array.isArray(action.coordinate)) {
    pair = action.coordinate;
  } else if (Array.isArray(action.coordinates)) {
    pair = action.coordinates;
  }
  const x = numberOr(action.x, pair?.[0]);
  const y = numberOr(action.y, pair?.[1]);
  const hasPoint = Number.isFinite(x) && Number.isFinite(y);
  return hasPoint ? { x, y } : null;
}
9665
/**
 * Clicks once at the action's point, when it has one; does nothing otherwise.
 *
 * @param {object} page - Page object exposing `mouse.click`.
 * @param {Record<string, unknown>} action - Action that may carry coordinates and a button.
 * @returns {Promise<void>}
 */
async function maybeFocusPoint(page, action) {
  const target = maybePoint(action);
  if (target) {
    const button = normalizeMouseButton2(action.button, "left");
    await page.mouse.click(target.x, target.y, { button, clickCount: 1 });
  }
}
9675
/**
 * Cleans a drag path: keeps only entries that are objects with usable `x`
 * and `y` values and returns them as plain `{ x, y }` points.
 *
 * Numbers and non-blank numeric strings are accepted. `null`, booleans,
 * blank strings and other types are rejected rather than being coerced,
 * because `Number()` would silently turn them into 0 and produce a bogus
 * (0, 0) drag point.
 *
 * @param {unknown} path5 - Candidate path.
 * @returns {Array<{x: number, y: number}>} Valid points in original order;
 *   empty when `path5` is not an array.
 */
function normalizePath(path5) {
  if (!Array.isArray(path5)) return [];
  const toCoordinate = (raw) => {
    if (typeof raw === "number") return raw;
    if (typeof raw === "string" && raw.trim() !== "") return Number(raw);
    return NaN;
  };
  const points = [];
  for (const entry of path5) {
    if (!entry || typeof entry !== "object") continue;
    const x = toCoordinate(entry.x);
    const y = toCoordinate(entry.y);
    if (!Number.isFinite(x) || !Number.isFinite(y)) continue;
    points.push({ x, y });
  }
  return points;
}
9688
/**
 * Resolves a mouse button name, case-insensitively.
 *
 * @param {unknown} value - Candidate button.
 * @param {string} fallback - Returned when the value is not a supported button.
 * @returns {string} "left", "right" or "middle", or the fallback.
 */
function normalizeMouseButton2(value, fallback) {
  if (typeof value !== "string") {
    return fallback;
  }
  const lowered = value.toLowerCase();
  const supported = ["left", "right", "middle"];
  return supported.includes(lowered) ? lowered : fallback;
}
9700
/**
 * Resolves a click count.
 *
 * @param {unknown} value - Candidate count.
 * @param {number} fallback - Returned when the value is not a positive finite number.
 * @returns {number} The count rounded down, or the fallback.
 */
function normalizeClickCount(value, fallback) {
  const isPositive = typeof value === "number" && Number.isFinite(value) && value > 0;
  return isPositive ? Math.floor(value) : fallback;
}
9706
/**
 * Turns an action's `keys` value into a list of key combos to press in order.
 *
 * A string becomes a single combo. In an array, non-string and blank entries
 * are dropped; if no remaining entry contains "+" and there is more than one,
 * the entries are treated as one chord and joined with "+".
 *
 * @param {unknown} value - A string, an array of strings, or anything else.
 * @returns {string[]} Combos to press; empty when nothing usable was given.
 */
function normalizeKeyCombos(value) {
  const cleanEntries = (entries) => entries.filter((entry) => typeof entry === "string").map((entry) => entry.trim()).filter(Boolean);
  if (typeof value === "string") {
    return cleanEntries([value]);
  }
  if (!Array.isArray(value)) {
    return [];
  }
  const keys = cleanEntries(value);
  const isImplicitChord = keys.length > 1 && !keys.some((entry) => entry.includes("+"));
  return isImplicitChord ? [keys.join("+")] : keys;
}
9724
/**
 * Returns the first argument that is a finite number.
 *
 * @param {...unknown} values - Candidates, in priority order.
 * @returns {number} The first finite number, or `NaN` when there is none.
 */
function numberOr(...values) {
  const firstFinite = values.find(
    (candidate) => typeof candidate === "number" && Number.isFinite(candidate)
  );
  return firstFinite === undefined ? NaN : firstFinite;
}
9730
/**
 * Requires a string with non-whitespace content and returns it trimmed.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} errorMessage2 - Message for the thrown error.
 * @returns {string} The trimmed string.
 * @throws {OpensteerAgentActionError} When the value is not a string or is blank.
 */
function normalizeRequiredString3(value, errorMessage2) {
  const text = typeof value === "string" ? value.trim() : "";
  if (text === "") {
    throw new OpensteerAgentActionError(errorMessage2);
  }
  return text;
}
9736
/**
 * Presses a single key or a "+"-separated chord such as "Control+Shift+A".
 *
 * For a chord, every token but the last is held down as a modifier, the last
 * token is pressed, and the modifiers are released in reverse order. Tokens
 * are translated with `mapKeyToPlaywright`. A blank combo is a no-op.
 *
 * @param {object} page - Page object exposing `keyboard.press`, `down` and `up`.
 * @param {string} combo - Key or chord to press.
 * @returns {Promise<void>}
 */
async function pressKeyCombo(page, combo) {
  const trimmed = combo.trim();
  if (!trimmed) return;
  // A lone "+" is the plus key itself, not a separator; splitting it would
  // leave no tokens and silently press nothing.
  if (trimmed === "+" || !trimmed.includes("+")) {
    await page.keyboard.press(mapKeyToPlaywright(trimmed));
    return;
  }
  const parts = trimmed.split("+").map((part) => part.trim()).filter(Boolean).map((part) => mapKeyToPlaywright(part));
  if (!parts.length) return;
  const modifiers = parts.slice(0, -1);
  const last = parts[parts.length - 1];
  // Track what was actually pressed so a failure part-way through pressing
  // the modifiers still releases the ones already held.
  const held = [];
  try {
    for (const modifier of modifiers) {
      await page.keyboard.down(modifier);
      held.push(modifier);
    }
    await page.keyboard.press(last);
  } finally {
    for (const modifier of held.reverse()) {
      await page.keyboard.up(modifier);
    }
  }
}
9758
/**
 * Resolves after roughly `ms` milliseconds; negative delays are treated as 0.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves with no value.
 */
function sleep3(ms) {
  return new Promise((done) => {
    const delay = Math.max(0, ms);
    setTimeout(done, delay);
  });
}
9761
+
9762
+ // src/agent/handler.ts
9763
/**
 * Drives a computer-use agent client against a page: wires the client's
 * action handler, viewport and screenshot provider to the page, runs the
 * instruction, and optionally draws a cursor marker for actions with coordinates.
 */
var OpensteerCuaAgentHandler = class {
  page;
  config;
  client;
  debug;
  onMutatingAction;
  // Set once the cursor overlay has been rendered at least once. It is NOT used
  // to skip re-injection, because the overlay node is lost whenever the page
  // document is replaced.
  cursorOverlayInjected = false;
  /**
   * @param {object} options
   * @param {object} options.page - Page the agent acts on.
   * @param {object} options.config - Resolved agent configuration.
   * @param {object} options.client - Agent client that plans and emits actions.
   * @param {boolean} options.debug - When true, cursor overlay failures are logged.
   * @param {Function} [options.onMutatingAction] - Called after each mutating action.
   */
  constructor(options) {
    this.page = options.page;
    this.config = options.config;
    this.client = options.client;
    this.debug = options.debug;
    this.onMutatingAction = options.onMutatingAction;
  }
  /**
   * Runs one agent instruction to completion.
   *
   * @param {object} options - `instruction`, optional `maxSteps` (default 20),
   *   optional `highlightCursor` (only the literal `true` enables it).
   * @returns {Promise<object>} The client's result plus `provider` and `model`.
   * @throws {OpensteerAgentExecutionError} Wrapping any failure of the client run.
   */
  async execute(options) {
    const instruction = options.instruction;
    const maxSteps = options.maxSteps ?? 20;
    await this.initializeClient();
    const highlightCursor = options.highlightCursor === true;
    this.client.setActionHandler(async (action) => {
      if (highlightCursor) {
        await this.maybeRenderCursor(action);
      }
      await executeAgentAction(this.page, action);
      // Keep the client's notion of the current URL in sync after every action.
      this.client.setCurrentUrl(this.page.url());
      if (isMutatingAgentAction(action)) {
        this.onMutatingAction?.(action);
      }
      await sleep4(this.config.waitBetweenActionsMs);
    });
    try {
      const result = await this.client.execute({
        instruction,
        maxSteps,
        systemPrompt: this.config.systemPrompt
      });
      return {
        ...result,
        provider: this.config.model.provider,
        model: this.config.model.fullModelName
      };
    } catch (error) {
      throw new OpensteerAgentExecutionError(
        `CUA agent execution failed: ${error instanceof Error ? error.message : String(error)}`,
        error
      );
    }
  }
  /**
   * Gives the client the viewport size, the current URL and a screenshot
   * provider that returns a base64-encoded PNG of the visible viewport.
   */
  async initializeClient() {
    const viewport = await this.resolveViewport();
    this.client.setViewport(viewport.width, viewport.height);
    this.client.setCurrentUrl(this.page.url());
    this.client.setScreenshotProvider(async () => {
      const buffer = await this.page.screenshot({
        fullPage: false,
        type: "png"
      });
      return buffer.toString("base64");
    });
  }
  /**
   * Determines the viewport size: the page's configured viewport if it has a
   * non-zero width and height, else the window's inner size measured in the
   * page, else `DEFAULT_CUA_VIEWPORT`.
   *
   * @returns {Promise<{width: number, height: number}>}
   */
  async resolveViewport() {
    const directViewport = this.page.viewportSize();
    if (directViewport?.width && directViewport?.height) {
      return directViewport;
    }
    try {
      const evaluated = await this.page.evaluate(() => ({
        width: window.innerWidth,
        height: window.innerHeight
      }));
      if (evaluated && typeof evaluated === "object" && typeof evaluated.width === "number" && typeof evaluated.height === "number" && evaluated.width > 0 && evaluated.height > 0) {
        return {
          width: Math.floor(evaluated.width),
          height: Math.floor(evaluated.height)
        };
      }
    } catch {
      // Best effort: measurement failures fall through to the default viewport.
    }
    return DEFAULT_CUA_VIEWPORT;
  }
  /**
   * Draws (or moves) a cursor marker at the action's coordinates. Actions
   * without numeric `x` and `y` are ignored. Failures never propagate; they
   * are logged only when `debug` is set.
   *
   * @param {object} action - Agent action, possibly carrying `x` / `y`.
   */
  async maybeRenderCursor(action) {
    const x = typeof action.x === "number" ? action.x : null;
    const y = typeof action.y === "number" ? action.y : null;
    if (x == null || y == null) {
      return;
    }
    try {
      // Ensure-and-move in a single evaluate. The overlay node is re-created
      // whenever it is missing, so the highlight keeps working after an action
      // navigates to a new document instead of silently disappearing.
      await this.page.evaluate(
        ({ px, py }) => {
          let cursor = document.getElementById("__opensteer_cua_cursor");
          if (!cursor) {
            cursor = document.createElement("div");
            cursor.id = "__opensteer_cua_cursor";
            cursor.style.position = "fixed";
            cursor.style.width = "14px";
            cursor.style.height = "14px";
            cursor.style.borderRadius = "999px";
            cursor.style.background = "rgba(255, 51, 51, 0.85)";
            cursor.style.border = "2px solid rgba(255, 255, 255, 0.95)";
            cursor.style.boxShadow = "0 0 0 3px rgba(255, 51, 51, 0.25)";
            cursor.style.pointerEvents = "none";
            cursor.style.zIndex = "2147483647";
            cursor.style.transition = "transform 80ms linear";
            document.documentElement.appendChild(cursor);
          }
          // Offset by 7px so the marker is roughly centred on the point.
          cursor.style.transform = `translate(${Math.round(px - 7)}px, ${Math.round(py - 7)}px)`;
        },
        { px: x, py: y }
      );
      this.cursorOverlayInjected = true;
    } catch (error) {
      if (this.debug) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`[opensteer] cursor overlay failed: ${message}`);
      }
    }
  }
};
9886
/**
 * Returns a promise that settles after `ms` milliseconds (never less than 0).
 *
 * @param {number} ms - Requested delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep4(ms) {
  const waitFor = (wake) => setTimeout(wake, Math.max(0, ms));
  return new Promise((wake) => {
    waitFor(wake);
  });
}
9889
+
9890
+ // src/opensteer.ts
9891
// Cloud action names treated as element interactions: a CLOUD_ACTION_FAILED
// error for one of these is converted into an action error by invokeCloudAction.
var CLOUD_INTERACTION_METHODS = /* @__PURE__ */ new Set([
  "click", "dblclick", "rightclick", "hover",
  "input", "select", "scroll", "uploadFile"
]);
9901
+ var Opensteer = class _Opensteer {
9902
+ config;
9903
+ aiResolve;
9904
+ aiExtract;
9905
+ namespace;
9906
+ storage;
9907
+ pool;
9908
+ cloud;
9909
+ browser = null;
9910
+ pageRef = null;
9911
+ contextRef = null;
9912
+ ownsBrowser = false;
9913
+ snapshotCache = null;
9914
+ agentExecutionInFlight = false;
9915
+ constructor(config = {}) {
9916
+ const resolved = resolveConfig(config);
9917
+ const cloudSelection = resolveCloudSelection({
9918
+ cloud: resolved.cloud
9919
+ });
9920
+ const model = resolved.model;
9921
+ this.config = resolved;
9922
+ this.aiResolve = this.createLazyResolveCallback(model);
9923
+ this.aiExtract = this.createLazyExtractCallback(model);
9924
+ const rootDir = resolved.storage?.rootDir || process.cwd();
9925
+ this.namespace = resolveNamespace(resolved, rootDir);
9926
+ this.storage = new LocalSelectorStorage(rootDir, this.namespace);
9927
+ this.pool = new BrowserPool(resolved.browser || {});
9928
+ if (cloudSelection.cloud) {
9929
+ const cloudConfig = resolved.cloud && typeof resolved.cloud === "object" ? resolved.cloud : void 0;
9930
+ const apiKey = cloudConfig?.apiKey?.trim();
9931
+ if (!apiKey) {
9932
+ throw new Error(
9933
+ "Cloud mode requires a non-empty API key via cloud.apiKey or OPENSTEER_API_KEY."
9934
+ );
9935
+ }
9936
+ this.cloud = createCloudRuntimeState(
9937
+ apiKey,
9938
+ cloudConfig?.baseUrl,
9939
+ cloudConfig?.authScheme
9940
+ );
9941
+ } else {
9942
+ this.cloud = null;
9943
+ }
9944
+ }
9945
+ createLazyResolveCallback(model) {
9946
+ let resolverPromise = null;
9947
+ return async (...args) => {
9948
+ try {
9949
+ if (!resolverPromise) {
9950
+ resolverPromise = Promise.resolve().then(() => (init_resolver(), resolver_exports)).then(
9951
+ (m) => m.createResolveCallback(model)
9952
+ );
9953
+ }
9954
+ const resolver = await resolverPromise;
9955
+ return resolver(...args);
9956
+ } catch (err) {
9957
+ resolverPromise = null;
9958
+ throw err;
9959
+ }
9960
+ };
9961
+ }
9962
+ createLazyExtractCallback(model) {
9963
+ let extractorPromise = null;
9964
+ const extract = async (args) => {
9965
+ try {
9966
+ if (!extractorPromise) {
9967
+ extractorPromise = Promise.resolve().then(() => (init_extractor(), extractor_exports)).then(
9968
+ (m) => m.createExtractCallback(model)
9969
+ );
9970
+ }
9971
+ const extractor = await extractorPromise;
9972
+ return extractor(args);
9973
+ } catch (err) {
9974
+ extractorPromise = null;
9975
+ throw err;
9976
+ }
9977
+ };
9978
+ return extract;
9979
+ }
9980
+ async invokeCloudActionAndResetCache(method, args) {
9981
+ const result = await this.invokeCloudAction(method, args);
9982
+ this.snapshotCache = null;
9983
+ return result;
9984
+ }
9985
+ async invokeCloudAction(method, args) {
9986
+ const actionClient = this.cloud?.actionClient;
9987
+ const sessionId = this.cloud?.sessionId;
9988
+ if (!actionClient || !sessionId) {
9989
+ throw cloudNotLaunchedError();
9990
+ }
9991
+ const payload = args && typeof args === "object" ? args : {};
9992
+ try {
9993
+ return await actionClient.request(method, payload);
9994
+ } catch (err) {
9995
+ if (err instanceof OpensteerCloudError && err.code === "CLOUD_ACTION_FAILED" && CLOUD_INTERACTION_METHODS.has(method)) {
9996
+ const detailsRecord = err.details && typeof err.details === "object" ? err.details : null;
9997
+ const cloudFailure = normalizeActionFailure(
9998
+ detailsRecord?.actionFailure
9999
+ );
10000
+ const failure = cloudFailure || classifyActionFailure({
10001
+ action: method,
10002
+ error: err,
10003
+ fallbackMessage: defaultActionFailureMessage(method)
10004
+ });
10005
+ const description = readCloudActionDescription(payload);
10006
+ throw this.buildActionError(
10007
+ method,
10008
+ description,
10009
+ failure,
10010
+ null,
10011
+ err
10012
+ );
10013
+ }
10014
+ throw err;
10015
+ }
10016
+ }
10017
+ buildActionError(action, description, failure, selectorUsed, cause) {
10018
+ return new OpensteerActionError({
10019
+ action,
10020
+ failure,
10021
+ selectorUsed: selectorUsed || null,
10022
+ message: formatActionFailureMessage(
10023
+ action,
10024
+ description,
10025
+ failure.message
10026
+ ),
10027
+ cause
10028
+ });
10029
+ }
10030
+ async syncCloudPageRef(args) {
10031
+ if (!this.cloud || !this.browser) return;
10032
+ let tabs;
10033
+ try {
10034
+ tabs = await this.invokeCloudAction("tabs", {});
10035
+ } catch {
10036
+ return;
10037
+ }
10038
+ if (!tabs.length) {
10039
+ return;
10040
+ }
10041
+ const contexts = this.browser.contexts();
10042
+ if (!contexts.length) return;
10043
+ const syncContext = this.contextRef && contexts.includes(this.contextRef) ? this.contextRef : contexts[0];
10044
+ const syncContextPages = syncContext.pages();
10045
+ const activeTab = tabs.find((tab) => tab.active) ?? null;
10046
+ if (activeTab && activeTab.index >= 0 && activeTab.index < syncContextPages.length) {
10047
+ this.contextRef = syncContext;
10048
+ this.pageRef = syncContextPages[activeTab.index];
10049
+ return;
10050
+ }
10051
+ const expectedUrl = args?.expectedUrl?.trim() || null;
10052
+ const expectedUrlInSyncContext = expectedUrl ? syncContextPages.find((page) => page.url() === expectedUrl) : void 0;
10053
+ if (expectedUrlInSyncContext) {
10054
+ this.contextRef = syncContext;
10055
+ this.pageRef = expectedUrlInSyncContext;
10056
+ return;
10057
+ }
10058
+ const firstNonInternalInSyncContext = syncContextPages.find(
10059
+ (page) => !isInternalOrBlankPageUrl(page.url())
10060
+ );
10061
+ if (firstNonInternalInSyncContext) {
10062
+ this.contextRef = syncContext;
10063
+ this.pageRef = firstNonInternalInSyncContext;
10064
+ return;
10065
+ }
10066
+ const firstAboutBlankInSyncContext = syncContextPages.find(
10067
+ (page) => page.url() === "about:blank"
10068
+ );
10069
+ if (firstAboutBlankInSyncContext) {
10070
+ this.contextRef = syncContext;
10071
+ this.pageRef = firstAboutBlankInSyncContext;
10072
+ return;
10073
+ }
10074
+ const pages = [];
10075
+ for (const context of contexts) {
10076
+ for (const page of context.pages()) {
10077
+ pages.push({
10078
+ context,
10079
+ page,
10080
+ url: page.url()
10081
+ });
10082
+ }
10083
+ }
10084
+ if (!pages.length) return;
10085
+ const expectedUrlMatch = expectedUrl ? pages.find(({ url }) => url === expectedUrl) : void 0;
10086
+ if (expectedUrlMatch) {
10087
+ this.contextRef = expectedUrlMatch.context;
10088
+ this.pageRef = expectedUrlMatch.page;
10089
+ return;
10090
+ }
10091
+ const firstNonInternal = pages.find(
10092
+ ({ url }) => !isInternalOrBlankPageUrl(url)
10093
+ );
10094
+ if (firstNonInternal) {
10095
+ this.contextRef = firstNonInternal.context;
10096
+ this.pageRef = firstNonInternal.page;
10097
+ return;
10098
+ }
10099
+ const firstAboutBlank = pages.find(({ url }) => url === "about:blank");
10100
+ if (firstAboutBlank) {
10101
+ this.contextRef = firstAboutBlank.context;
10102
+ this.pageRef = firstAboutBlank.page;
10103
+ return;
10104
+ }
10105
+ this.contextRef = pages[0].context;
10106
+ this.pageRef = pages[0].page;
10107
+ }
10108
+ get page() {
10109
+ if (!this.pageRef) {
10110
+ throw new Error(
10111
+ "Browser page is not initialized. Call launch() or Opensteer.from(page)."
10112
+ );
10113
+ }
10114
+ return this.pageRef;
10115
+ }
10116
+ get context() {
10117
+ if (!this.contextRef) {
10118
+ throw new Error(
10119
+ "Browser context is not initialized. Call launch() or Opensteer.from(page)."
10120
+ );
10121
+ }
10122
+ return this.contextRef;
10123
+ }
10124
+ getCloudSessionId() {
10125
+ return this.cloud?.sessionId ?? null;
10126
+ }
10127
+ getCloudSessionUrl() {
10128
+ return this.cloud?.cloudSessionUrl ?? null;
10129
+ }
10130
+ announceCloudSession(args) {
10131
+ if (!this.shouldAnnounceCloudSession()) {
10132
+ return;
10133
+ }
10134
+ const fields = [
10135
+ `sessionId=${args.sessionId}`,
10136
+ `workspaceId=${args.workspaceId}`
10137
+ ];
10138
+ if (args.cloudSessionUrl) {
10139
+ fields.push(`url=${args.cloudSessionUrl}`);
10140
+ }
10141
+ process.stderr.write(`[opensteer] cloud session ready ${fields.join(" ")}
10142
+ `);
10143
+ }
10144
+ shouldAnnounceCloudSession() {
10145
+ const cloudConfig = this.config.cloud && typeof this.config.cloud === "object" ? this.config.cloud : null;
10146
+ const announce = cloudConfig?.announce ?? "always";
10147
+ if (announce === "off") {
10148
+ return false;
10149
+ }
8286
10150
  if (announce === "tty") {
8287
10151
  return Boolean(process.stderr.isTTY);
8288
10152
  }
@@ -8343,6 +10207,7 @@ var Opensteer = class _Opensteer {
8343
10207
  this.cloud.actionClient = actionClient;
8344
10208
  this.cloud.sessionId = sessionId;
8345
10209
  this.cloud.cloudSessionUrl = session3.cloudSessionUrl;
10210
+ await this.syncCloudPageRef().catch(() => void 0);
8346
10211
  this.announceCloudSession({
8347
10212
  sessionId: session3.sessionId,
8348
10213
  workspaceId: session3.cloudSession.workspaceId,
@@ -8438,6 +10303,9 @@ var Opensteer = class _Opensteer {
8438
10303
  async goto(url, options) {
8439
10304
  if (this.cloud) {
8440
10305
  await this.invokeCloudActionAndResetCache("goto", { url, options });
10306
+ await this.syncCloudPageRef({ expectedUrl: url }).catch(
10307
+ () => void 0
10308
+ );
8441
10309
  return;
8442
10310
  }
8443
10311
  const { waitUntil = "domcontentloaded", ...rest } = options ?? {};
@@ -8942,9 +10810,16 @@ var Opensteer = class _Opensteer {
8942
10810
  }
8943
10811
  async newTab(url) {
8944
10812
  if (this.cloud) {
8945
- return await this.invokeCloudActionAndResetCache("newTab", {
8946
- url
8947
- });
10813
+ const result = await this.invokeCloudActionAndResetCache(
10814
+ "newTab",
10815
+ {
10816
+ url
10817
+ }
10818
+ );
10819
+ await this.syncCloudPageRef({ expectedUrl: result.url }).catch(
10820
+ () => void 0
10821
+ );
10822
+ return result;
8948
10823
  }
8949
10824
  const { page, info } = await createTab(this.context, url);
8950
10825
  this.pageRef = page;
@@ -8954,6 +10829,7 @@ var Opensteer = class _Opensteer {
8954
10829
  async switchTab(index) {
8955
10830
  if (this.cloud) {
8956
10831
  await this.invokeCloudActionAndResetCache("switchTab", { index });
10832
+ await this.syncCloudPageRef().catch(() => void 0);
8957
10833
  return;
8958
10834
  }
8959
10835
  const page = await switchTab(this.context, index);
@@ -8963,6 +10839,7 @@ var Opensteer = class _Opensteer {
8963
10839
  async closeTab(index) {
8964
10840
  if (this.cloud) {
8965
10841
  await this.invokeCloudActionAndResetCache("closeTab", { index });
10842
+ await this.syncCloudPageRef().catch(() => void 0);
8966
10843
  return;
8967
10844
  }
8968
10845
  const newPage = await closeTab(this.context, this.page, index);
@@ -9373,6 +11250,37 @@ var Opensteer = class _Opensteer {
9373
11250
  this.storage.clearNamespace();
9374
11251
  this.snapshotCache = null;
9375
11252
  }
11253
+ agent(config) {
11254
+ const resolvedAgentConfig = resolveAgentConfig({
11255
+ agentConfig: config,
11256
+ fallbackModel: this.config.model
11257
+ });
11258
+ return {
11259
+ execute: async (instructionOrOptions) => {
11260
+ if (this.agentExecutionInFlight) {
11261
+ throw new OpensteerAgentBusyError();
11262
+ }
11263
+ this.agentExecutionInFlight = true;
11264
+ try {
11265
+ const options = normalizeExecuteOptions(instructionOrOptions);
11266
+ const handler = new OpensteerCuaAgentHandler({
11267
+ page: this.page,
11268
+ config: resolvedAgentConfig,
11269
+ client: createCuaClient(resolvedAgentConfig),
11270
+ debug: Boolean(this.config.debug),
11271
+ onMutatingAction: () => {
11272
+ this.snapshotCache = null;
11273
+ }
11274
+ });
11275
+ const result = await handler.execute(options);
11276
+ this.snapshotCache = null;
11277
+ return result;
11278
+ } finally {
11279
+ this.agentExecutionInFlight = false;
11280
+ }
11281
+ }
11282
+ };
11283
+ }
9376
11284
  async runWithPostActionWait(action, waitOverride, execute) {
9377
11285
  const waitSession = createPostActionWaitSession(
9378
11286
  this.page,
@@ -10395,6 +12303,11 @@ function getScrollDelta2(options) {
10395
12303
  return { x: 0, y: absoluteAmount };
10396
12304
  }
10397
12305
  }
12306
/**
 * Reports whether a page URL is empty, "about:blank", or uses one of the
 * browser-internal schemes `chrome://`, `devtools://` or `edge://`.
 *
 * @param {string} url - Page URL.
 * @returns {boolean}
 */
function isInternalOrBlankPageUrl(url) {
  if (!url || url === "about:blank") {
    return true;
  }
  const internalSchemes = ["chrome://", "devtools://", "edge://"];
  return internalSchemes.some((scheme) => url.startsWith(scheme));
}
10398
12311
  function buildLocalRunId(namespace) {
10399
12312
  const normalized = namespace.trim() || "default";
10400
12313
  return `${normalized}-${Date.now().toString(36)}-${(0, import_crypto2.randomUUID)().slice(0, 8)}`;