@oagi/oagi 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1536 @@
1
+ #!/usr/bin/env node
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __decorateClass = (decorators, target, key, kind) => {
5
+ var result = kind > 1 ? void 0 : kind ? __getOwnPropDesc(target, key) : target;
6
+ for (var i = decorators.length - 1, decorator; i >= 0; i--)
7
+ if (decorator = decorators[i])
8
+ result = (kind ? decorator(target, key, result) : decorator(result)) || result;
9
+ if (kind && result) __defProp(target, key, result);
10
+ return result;
11
+ };
12
+
13
+ // src/errors.ts
14
+ var OAGIError = class extends Error {
15
+ };
16
+ var APIError = class extends OAGIError {
17
+ constructor(response, message) {
18
+ super(message ?? response.statusText);
19
+ this.response = response;
20
+ }
21
+ toString() {
22
+ return `API Error [${this.response.status}]: ${this.message}`;
23
+ }
24
+ };
25
+ var AuthenticationError = class extends APIError {
26
+ };
27
+ var RateLimitError = class extends APIError {
28
+ };
29
+ var ValidationError = class extends APIError {
30
+ };
31
+ var NotFoundError = class extends APIError {
32
+ };
33
+ var ServerError = class extends APIError {
34
+ };
35
+ var ConfigurationError = class extends OAGIError {
36
+ };
37
+ var NetworkError = class extends OAGIError {
38
+ constructor(message, originalError) {
39
+ super(message);
40
+ this.originalError = originalError;
41
+ }
42
+ };
43
+ var RequestTimeoutError = class extends NetworkError {
44
+ };
45
+ var ValueError = class extends OAGIError {
46
+ };
47
+
48
+ // src/client.ts
49
+ import OpenAI from "openai";
50
+
51
+ // src/consts.ts
52
// Service locations and endpoint paths.
var DEFAULT_BASE_URL = "https://api.agiopen.org";
var API_KEY_HELP_URL = "https://developer.agiopen.org/api-keys";
var API_V1_FILE_UPLOAD_ENDPOINT = "/v1/file/upload";
var API_V1_GENERATE_ENDPOINT = "/v1/generate";

// Model identifiers and agent mode names.
var MODEL_ACTOR = "lux-actor-1";
var MODEL_THINKER = "lux-thinker-1";
var MODE_ACTOR = "actor";

// Step budgets: per-mode defaults, then the per-model hard caps.
var DEFAULT_MAX_STEPS = 20;
var DEFAULT_MAX_STEPS_THINKER = 100;
var MAX_STEPS_ACTOR = 30;
var MAX_STEPS_THINKER = 120;

// Pause between agent steps, in seconds.
var DEFAULT_STEP_DELAY = 0.3;

// Sampling temperatures.
var DEFAULT_TEMPERATURE = 0.5;
var DEFAULT_TEMPERATURE_LOW = 0.1;

// HTTP request timeout, in seconds.
var HTTP_CLIENT_TIMEOUT = 60;
// Retry count handed to the OpenAI SDK client.
var DEFAULT_MAX_RETRIES = 2;
68
+
69
+ // src/logger.ts
70
+ import pino from "pino";
71
// Requested level comes from OAGI_LOG (case-insensitive); "info" when unset.
var levelEnv = (process.env.OAGI_LOG ?? "info").toLowerCase();
var allowedLevels = ["debug", "info", "warn", "error", "fatal"];
// Root logger; output is routed through the pino-pretty transport.
var logger = pino({
  // Unrecognised level names fall back to "info".
  level: allowedLevels.includes(levelEnv) ? levelEnv : "info",
  base: null,
  timestamp: () => {
    const iso = new Date().toISOString();
    return `,"time":"${iso}"`;
  },
  messageKey: "msg",
  transport: {
    target: "pino-pretty",
    options: {
      colorize: false,
      translateTime: "SYS:yyyy-mm-dd HH:MM:ss",
      messageFormat: "{msg}",
      ignore: "pid,hostname"
    }
  }
});
/**
 * Create a child logger whose `name` binding is `oagi.<name>`.
 */
var getLogger = (name) => {
  const qualifiedName = `oagi.${name}`;
  return logger.child({ name: qualifiedName });
};
var logger_default = getLogger;
90
+ var logTraceOnFailure = (_, __, descriptor) => {
91
+ const original = descriptor.value;
92
+ descriptor.value = async function(...args) {
93
+ try {
94
+ return await original.apply(this, args);
95
+ } catch (err) {
96
+ if (err instanceof APIError) {
97
+ const requestId = err.response.headers.get("x-request-id") ?? "";
98
+ const traceId = err.response.headers.get("x-trace-id") ?? "";
99
+ logger.error(`Request Id: ${requestId}`);
100
+ logger.error(`Trace Id: ${traceId}`);
101
+ }
102
+ throw err;
103
+ }
104
+ };
105
+ return descriptor;
106
+ };
107
+
108
+ // src/types/models/action.ts
109
+ import * as z from "zod";
110
// Every action verb accepted by the output parser.
var ActionTypeSchema = z.enum([
  "click",
  "left_double",
  "left_triple",
  "right_single",
  "drag",
  "hotkey",
  "type",
  "scroll",
  "finish",
  "wait",
  "call_user"
]);
// A single parsed action.
var ActionSchema = z.object({
  /** Type of action to perform */
  type: ActionTypeSchema,
  /** Action argument in the specified format */
  argument: z.string(),
  /** Number of times to repeat the action (defaults to 1) */
  count: z.int().default(1)
});
137
/**
 * Extract the first `x, y` pair of non-negative integers from `args`.
 *
 * @returns `[x, y]` as numbers, or `null` when no pair is present.
 */
var parseCoords = (args) => {
  const found = /(\d+),\s*(\d+)/.exec(args);
  return found ? [Number(found[1]), Number(found[2])] : null;
};
144
/**
 * Extract the first `x1, y1, x2, y2` run of non-negative integers from `args`.
 *
 * @returns `[x1, y1, x2, y2]` as numbers, or `null` when no such run exists.
 */
var parseDragCoords = (args) => {
  const found = /(\d+),\s*(\d+),\s*(\d+),\s*(\d+)/.exec(args);
  if (found === null) return null;
  const [, x1, y1, x2, y2] = found;
  return [Number(x1), Number(y1), Number(x2), Number(y2)];
};
156
/**
 * Extract `x, y, direction` from `args`.
 *
 * @returns `[x, y, direction]` with numeric coordinates and the direction
 *   lower-cased, or `null` when the pattern is not present.
 */
var parseScroll = (args) => {
  const found = /(\d+),\s*(\d+),\s*(\w+)/.exec(args);
  if (found === null) return null;
  const [, x, y, direction] = found;
  return [Number(x), Number(y), direction.toLowerCase()];
};
163
+
164
+ // src/types/models/client.ts
165
+ import * as z2 from "zod";
166
// Token accounting for one completion.
var UsageSchema = z2.object({
  prompt_tokens: z2.int(),
  completion_tokens: z2.int(),
  total_tokens: z2.int()
});
// Code and message of an error reported by the API.
var ErrorDetailSchema = z2.object({
  code: z2.string(),
  message: z2.string()
});
// Error envelope; `error` may be null or absent.
var ErrorResponseSchema = z2.object({
  error: ErrorDetailSchema.nullish()
});
// Body returned by the file-upload endpoint.
var UploadFileResponseSchema = z2.object({
  url: z2.string(),
  uuid: z2.string(),
  expires_at: z2.int(),
  file_expires_at: z2.int(),
  download_url: z2.string()
});
// Body returned by the generate endpoint.
var GenerateResponseSchema = z2.object({
  response: z2.string(),
  prompt_tokens: z2.int(),
  completion_tokens: z2.int(),
  /** @deprecated This field is deprecated */
  cost: z2.float64().nullish(),
  request_id: z2.string().nullish()
});
195
+
196
+ // src/types/models/image-config.ts
197
+ import * as z3 from "zod";
198
// Screenshot encoding options. After validation, PNG configs always carry
// quality 85 regardless of the value supplied.
var ImageConfigSchema = z3.object({
  format: z3.enum(["PNG", "JPEG"]).default("JPEG"),
  quality: z3.int().min(1).max(100).default(85),
  width: z3.int().positive().nullish().default(1260),
  height: z3.int().positive().nullish().default(700),
  optimize: z3.boolean().default(false),
  resample: z3.enum(["NEAREST", "BILINEAR", "BICUBIC", "LANCZOS"]).default("LANCZOS")
}).transform(
  (config) => config.format === "PNG" ? { ...config, quality: 85 } : config
);
211
+
212
+ // src/types/step_observer.ts
213
+ import * as z4 from "zod";
214
// Fields shared by every observer event; `timestamp` defaults to "now".
var BaseEventSchema = z4.object({
  timestamp: z4.date().default(() => new Date())
});
// "image": a screenshot tied to a step number.
var ImageEventSchema = BaseEventSchema.extend({
  type: z4.literal("image"),
  step_num: z4.number(),
  image: z4.string()
});
// "step": the screenshot plus the parsed step, optionally with a task id.
var StepEventSchema = BaseEventSchema.extend({
  type: z4.literal("step"),
  step_num: z4.number(),
  image: z4.custom(),
  step: z4.custom(),
  task_id: z4.string().optional()
});
// "action": the actions of a step and, optionally, an error string.
var ActionEventSchema = BaseEventSchema.extend({
  type: z4.literal("action"),
  step_num: z4.number(),
  actions: z4.array(z4.custom()),
  error: z4.string().optional()
});
// "log": free-form message.
var LogEventSchema = BaseEventSchema.extend({
  type: z4.literal("log"),
  message: z4.string()
});
// "split": separator with an optional label.
var SplitEventSchema = BaseEventSchema.extend({
  type: z4.literal("split"),
  label: z4.string().optional()
});
// "plan": reasoning for one of the three planning phases.
var PlanEventSchema = BaseEventSchema.extend({
  type: z4.literal("plan"),
  phase: z4.enum(["initial", "reflection", "summary"]),
  image: z4.string().or(z4.custom()).optional(),
  reasoning: z4.string(),
  result: z4.string().optional(),
  request_id: z4.string().optional()
});
251
+ var StepObserver = class {
252
+ chain(observer) {
253
+ return new ChainedStepObserver([this, observer ?? null]);
254
+ }
255
+ };
256
+ var ChainedStepObserver = class extends StepObserver {
257
+ observers;
258
+ constructor(observers) {
259
+ super();
260
+ this.observers = observers;
261
+ }
262
+ async onEvent(event) {
263
+ return await this.observers.reduce(async (prev, observer) => {
264
+ await prev;
265
+ if (observer) await observer.onEvent(event);
266
+ }, Promise.resolve());
267
+ }
268
+ };
269
+
270
+ // src/utils/output-parser.ts
271
/**
 * Split an action block into individual action strings.
 *
 * Actions are separated by `&`. An `&` inside parentheses (for example in the
 * text of `type(a & b)`) belongs to the argument and does not split. The
 * separator itself is never part of a returned action, and empty segments are
 * dropped.
 *
 * @param actionBlock Raw text containing one or more actions.
 * @returns Trimmed, non-empty action strings in their original order.
 */
var splitActions = (actionBlock) => {
  const actions = [];
  let current = "";
  let parenLevel = 0;
  const flush = () => {
    const action = current.trim();
    if (action) actions.push(action);
    current = "";
  };
  for (const char of actionBlock) {
    if (char === "&" && parenLevel === 0) {
      // Top-level separator: close the current action and drop the "&".
      flush();
      continue;
    }
    if (char === "(") {
      parenLevel++;
    } else if (char === ")") {
      // Never go below zero: a stray ")" must not disable later splits.
      parenLevel = Math.max(0, parenLevel - 1);
    }
    current += char;
  }
  flush();
  return actions;
};
297
+ var parseAction = (action) => {
298
+ const match = /(\w+)\((.*)\)/.exec(action);
299
+ if (!match) return null;
300
+ const { data: actionType, success } = ActionTypeSchema.safeParse(match[1]);
301
+ if (!success) return null;
302
+ let argument = match[2].trim();
303
+ const args = argument.split(",");
304
+ let count = 1;
305
+ switch (actionType) {
306
+ // hotkey(key, c) - press key c times
307
+ case "hotkey":
308
+ if (args.length >= 2 && args[1].trim()) {
309
+ argument = args[0].trim();
310
+ count = Number(args[1].trim());
311
+ }
312
+ break;
313
+ case "scroll":
314
+ if (args.length >= 4) {
315
+ const x = args[0].trim();
316
+ const y = args[1].trim();
317
+ const direction = args[2].trim();
318
+ argument = `${x},${y},${direction}`;
319
+ count = Number(args[3].trim());
320
+ }
321
+ break;
322
+ default:
323
+ }
324
+ if (!Number.isInteger(count) || count <= 0) {
325
+ count = 1;
326
+ }
327
+ return { type: actionType, argument, count };
328
+ };
329
+ var parseRawOutput = (rawOutput) => {
330
+ const reason = /<\|think_start\|>(.*?)<\|think_end\|>/s.exec(rawOutput)?.[1] ?? "";
331
+ const action = /<\|action_start\|>(.*?)<\|action_end\|>/s.exec(rawOutput)?.[1] ?? "";
332
+ const actions = splitActions(action).map(parseAction).filter((action2) => !!action2);
333
+ return {
334
+ reason,
335
+ actions,
336
+ stop: actions.some((action2) => action2.type === "finish")
337
+ };
338
+ };
339
+
340
+ // src/utils/prompt-builder.ts
341
+ var buildPrompt = (taskDescription) => `You are a Desktop Agent completing computer use tasks from a user instruction.
342
+
343
+ Every step, you will look at the screenshot and output the desired actions in a format as:
344
+
345
+ <|think_start|> brief description of your intent and reasoning <|think_end|>
346
+ <|action_start|> one of the allowed actions as below <|action_end|>
347
+
348
+ In the action field, you have the following action formats:
349
+ 1. click(x, y) # left-click at the position (x, y), where x and y are integers normalized between 0 and 1000
350
+ 2. left_double(x, y) # left-double-click at the position (x, y), where x and y are integers normalized between 0 and 1000
351
+ 3. left_triple(x, y) # left-triple-click at the position (x, y), where x and y are integers normalized between 0 and 1000
352
+ 4. right_single(x, y) # right-click at the position (x, y), where x and y are integers normalized between 0 and 1000
353
+ 5. drag(x1, y1, x2, y2) # drag the mouse from (x1, y1) to (x2, y2) to select or move contents, where x1, y1, x2, y2 are integers normalized between 0 and 1000
354
+ 6. hotkey(key, c) # press the key for c times
355
+ 7. type(text) # type a text string on the keyboard
356
+ 8. scroll(x, y, direction, c) # scroll the mouse at position (x, y) in the direction of up or down for c times, where x and y are integers normalized between 0 and 1000
357
+ 9. wait() # wait for a while
358
+ 10. finish() # indicate the task is finished
359
+
360
+ Directly output the text beginning with <|think_start|>, no additional text is needed for this scenario.
361
+
362
+ The user instruction is:
363
+ ${taskDescription}
364
+ `;
365
+
366
+ // src/client.ts
367
+ var logger2 = logger_default("client");
368
+ var _Client = class _Client {
369
+ constructor(baseUrl = process.env.OAGI_BASE_URL ?? DEFAULT_BASE_URL, apiKey = process.env.OAGI_API_KEY ?? null, maxRetries = DEFAULT_MAX_RETRIES) {
370
+ this.baseUrl = baseUrl;
371
+ this.apiKey = apiKey;
372
+ if (!apiKey) {
373
+ throw new ConfigurationError(
374
+ `OAGI API key must be provided either as 'api_key' parameter or OAGI_API_KEY environment variable. Get your API key at ${API_KEY_HELP_URL}`
375
+ );
376
+ }
377
+ this.client = new OpenAI({
378
+ baseURL: new URL("./v1", baseUrl).href,
379
+ apiKey,
380
+ maxRetries
381
+ });
382
+ logger2.info(`Client initialized with base_url: ${baseUrl}`);
383
+ }
384
+ timeout = HTTP_CLIENT_TIMEOUT;
385
+ client;
386
+ fetch(input, init) {
387
+ if (typeof input === "string" || input instanceof URL) {
388
+ input = new URL(input, this.baseUrl);
389
+ } else {
390
+ input = new URL(input.url, this.baseUrl);
391
+ }
392
+ init ??= {};
393
+ const signal = AbortSignal.timeout(this.timeout * 1e3);
394
+ init.signal = init.signal ? AbortSignal.any([signal, init.signal]) : signal;
395
+ return fetch(input, init);
396
+ }
397
+ buildHeaders(apiVersion) {
398
+ const headers = {};
399
+ if (apiVersion) {
400
+ headers["x-api-version"] = apiVersion;
401
+ }
402
+ if (this.apiKey) {
403
+ headers["x-api-key"] = this.apiKey;
404
+ }
405
+ return headers;
406
+ }
407
+ async handleResponseError(response) {
408
+ const data = await response.json();
409
+ const cls = _Client.getErrorClass(response.status);
410
+ const err = new cls(response, data.error?.message);
411
+ logger2.error(err.toString());
412
+ throw err;
413
+ }
414
+ handleHttpErrors(err) {
415
+ if (err instanceof DOMException) {
416
+ if (err.name === "TimeoutError") {
417
+ const message = `Request timed out after ${this.timeout} seconds`;
418
+ logger2.error(message);
419
+ throw new RequestTimeoutError(message, err);
420
+ }
421
+ } else if (err instanceof TypeError) {
422
+ const message = `Network error: ${err}`;
423
+ logger2.error(message);
424
+ throw new NetworkError(message, err);
425
+ }
426
+ throw err;
427
+ }
428
+ static getErrorClass(statusCode) {
429
+ if (statusCode >= 500) return ServerError;
430
+ return {
431
+ 401: AuthenticationError,
432
+ 404: NotFoundError,
433
+ 422: ValidationError,
434
+ 429: RateLimitError
435
+ }[statusCode] ?? APIError;
436
+ }
437
+ /**
438
+ * Call OpenAI-compatible /v1/chat/completions endpoint.
439
+ *
440
+ * @param model Model to use for inference
441
+ * @param messages Full message history (OpenAI-compatible format)
442
+ * @param temperature Sampling temperature (0.0-2.0)
443
+ * @param taskId Optional task ID for multi-turn conversations
444
+ * @returns Tuple of (Step, raw_output, Usage)
445
+ * - Step: Parsed actions and reasoning
446
+ * - raw_output: Raw model output string (for message history)
447
+ * - Usage: Token usage statistics (or None if not available)
448
+ */
449
+ async chatCompletions(model, messages, temperature, taskId) {
450
+ logger2.info(`Making async chat completion request with model: ${model}`);
451
+ const response = await this.client.chat.completions.create({
452
+ model,
453
+ messages,
454
+ temperature,
455
+ // @ts-expect-error extra body
456
+ task_id: taskId
457
+ });
458
+ const rawOutput = response.choices[0].message.content ?? "";
459
+ const step = parseRawOutput(rawOutput);
460
+ taskId = response.task_id;
461
+ const task = taskId ? `task_id: ${taskId}, ` : "";
462
+ const usage = response.usage ? `, tokens: ${response.usage.prompt_tokens}+${response.usage.completion_tokens}` : "";
463
+ logger2.info(
464
+ `Chat completion successful - ${task}actions: ${step.actions.length}, stop: ${step.stop}${usage}`
465
+ );
466
+ return [step, rawOutput, response.usage];
467
+ }
468
+ /**
469
+ * Call the /v1/file/upload endpoint to get a S3 presigned URL
470
+ *
471
+ * @param apiVersion API version header
472
+ * @returns {Promise<UploadFileResponse>} The response from /v1/file/upload with uuid and presigned S3 URL
473
+ */
474
+ async getS3PresignedUrl(apiVersion) {
475
+ logger2.debug(`Making async API request to ${API_V1_FILE_UPLOAD_ENDPOINT}`);
476
+ try {
477
+ const headers = this.buildHeaders(apiVersion);
478
+ const response = await this.fetch(API_V1_FILE_UPLOAD_ENDPOINT, {
479
+ headers
480
+ });
481
+ if (!response.ok) {
482
+ await this.handleResponseError(response);
483
+ }
484
+ try {
485
+ const uploadFileResponse = UploadFileResponseSchema.parse(
486
+ await response.json()
487
+ );
488
+ logger2.debug("Calling /v1/file/upload successful");
489
+ return uploadFileResponse;
490
+ } catch (err) {
491
+ logger2.error(`Invalid upload response: ${response.status}`);
492
+ throw new APIError(
493
+ response,
494
+ `Invalid presigned S3 URL response: ${err}`
495
+ );
496
+ }
497
+ } catch (err) {
498
+ this.handleHttpErrors(err);
499
+ }
500
+ }
501
+ /**
502
+ * Upload image bytes to S3 using presigned URL
503
+ *
504
+ * @param url S3 presigned URL
505
+ * @param content Image bytes to upload
506
+ * @throws {APIError} If upload fails
507
+ */
508
+ async uploadToS3(url, content) {
509
+ logger2.debug("Uploading image to S3");
510
+ let response = null;
511
+ try {
512
+ response = await this.fetch(url, {
513
+ body: content,
514
+ method: "PUT"
515
+ });
516
+ if (!response.ok) {
517
+ await this.handleResponseError(response);
518
+ }
519
+ } catch (err) {
520
+ logger2.error(`S3 upload failed ${err}`);
521
+ if (err instanceof APIError) {
522
+ throw err;
523
+ }
524
+ throw new APIError(
525
+ response ?? new Response(null, { status: 500 }),
526
+ `${err}`
527
+ );
528
+ }
529
+ }
530
+ /**
531
+ * Get S3 presigned URL and upload image (convenience method)
532
+ *
533
+ * @param screenshot Screenshot image bytes
534
+ * @param apiVersion API version header
535
+ * @returns {UploadFileResponse} The response from /v1/file/upload with uuid and presigned S3 URL
536
+ */
537
+ async putS3PresignedUrl(screenshot, apiVersion) {
538
+ const uploadFileResponse = await this.getS3PresignedUrl(apiVersion);
539
+ await this.uploadToS3(uploadFileResponse.url, screenshot);
540
+ return uploadFileResponse;
541
+ }
542
+ async callWorker({
543
+ workerId,
544
+ overallTodo,
545
+ taskDescription,
546
+ todos,
547
+ history = [],
548
+ currentTodoIndex,
549
+ taskExecutionSummary,
550
+ currentScreenshot,
551
+ currentSubtaskInstruction,
552
+ windowSteps,
553
+ windowScreenshots,
554
+ resultScreenshot,
555
+ priorNotes,
556
+ latestTodoSummary,
557
+ apiVersion
558
+ }) {
559
+ const validWorkers = ["oagi_first", "oagi_follow", "oagi_task_summary"];
560
+ if (!validWorkers.includes(workerId)) {
561
+ throw new ValueError(
562
+ `Invalid worker_id '${workerId}'. Must be one of: ${validWorkers}`
563
+ );
564
+ }
565
+ logger2.info(`Calling /v1/generate with worker_id: ${workerId}`);
566
+ const payload = {
567
+ external_worker_id: workerId,
568
+ overall_todo: overallTodo,
569
+ task_description: taskDescription,
570
+ todos,
571
+ history,
572
+ // Add optional memory fields
573
+ current_todo_index: currentTodoIndex,
574
+ task_execution_summary: taskExecutionSummary,
575
+ // Add optional screenshot/worker-specific fields
576
+ current_screenshot: currentScreenshot,
577
+ current_subtask_instruction: currentSubtaskInstruction,
578
+ window_steps: windowSteps,
579
+ window_screenshots: windowScreenshots,
580
+ result_screenshot: resultScreenshot,
581
+ prior_notes: priorNotes,
582
+ latest_todo_summary: latestTodoSummary
583
+ };
584
+ const headers = this.buildHeaders(apiVersion);
585
+ try {
586
+ const response = await this.fetch(API_V1_GENERATE_ENDPOINT, {
587
+ body: JSON.stringify(payload),
588
+ headers,
589
+ method: "POST"
590
+ });
591
+ if (!response.ok) {
592
+ await this.handleResponseError(response);
593
+ }
594
+ const result = GenerateResponseSchema.parse(await response.json());
595
+ result.request_id = response.headers.get("X-Request-ID");
596
+ logger2.info(
597
+ `Generate request successful - tokens: ${result.prompt_tokens}+${result.completion_tokens}, request_id: ${result.request_id}`
598
+ );
599
+ return result;
600
+ } catch (err) {
601
+ this.handleHttpErrors(err);
602
+ }
603
+ }
604
+ };
605
+ __decorateClass([
606
+ logTraceOnFailure
607
+ ], _Client.prototype, "callWorker", 1);
608
+ var Client = _Client;
609
+
610
+ // src/actor.ts
611
+ import { randomUUID } from "crypto";
612
+ var logger3 = logger_default("task");
613
+ var Actor = class {
614
+ constructor(apiKey, baseUrl, model = MODEL_ACTOR, temperature) {
615
+ this.model = model;
616
+ this.temperature = temperature;
617
+ this.client = new Client(baseUrl, apiKey);
618
+ }
619
+ /**
620
+ * Client-side generated UUID
621
+ */
622
+ taskId = randomUUID();
623
+ taskDescription = null;
624
+ /**
625
+ * OpenAI-compatible message history
626
+ */
627
+ messageHistory = [];
628
+ maxSteps = DEFAULT_MAX_STEPS;
629
+ /**
630
+ * Current step counter
631
+ */
632
+ currentStep = 0;
633
+ client;
634
+ validateAndIncrementStep() {
635
+ if (!this.taskDescription) {
636
+ throw new ValueError(
637
+ "Task description must be set. Call initTask() first."
638
+ );
639
+ }
640
+ if (this.currentStep >= this.maxSteps) {
641
+ throw new ValueError(
642
+ `Max steps limit (${this.maxSteps}) reached. Call initTask() to start a new task.`
643
+ );
644
+ }
645
+ this.currentStep++;
646
+ }
647
+ /**
648
+ * Get screenshot URL, uploading to S3 if needed (async version).
649
+ * @param screenshot Screenshot as URL string, or raw bytes
650
+ * @returns Screenshot URL (either direct or from S3 upload)
651
+ */
652
+ async ensureScreenshotUrl(screenshot) {
653
+ if (typeof screenshot === "string") return screenshot;
654
+ const uploadResponse = await this.client.putS3PresignedUrl(screenshot);
655
+ return uploadResponse.download_url;
656
+ }
657
+ /**
658
+ * Add user message with screenshot to message history.
659
+ *
660
+ * @param screenshot URL of the screenshot
661
+ * @param prompt Optional prompt text (for first message only)
662
+ */
663
+ addUserMessageToHistory(screenshot, prompt) {
664
+ const content = [];
665
+ if (prompt) {
666
+ content.push({
667
+ type: "text",
668
+ text: prompt
669
+ });
670
+ }
671
+ content.push({
672
+ type: "image_url",
673
+ image_url: {
674
+ url: screenshot
675
+ }
676
+ });
677
+ this.messageHistory.push({ role: "user", content });
678
+ }
679
+ /**
680
+ * Build prompt for first message only.
681
+ */
682
+ buildStepPrompt() {
683
+ if (this.messageHistory.length === 0) {
684
+ return buildPrompt(this.taskDescription);
685
+ }
686
+ }
687
+ /**
688
+ * Initialize a new task with the given description.
689
+ *
690
+ * @param taskDescription Task description
691
+ * @param maxSteps Maximum number of steps allowed
692
+ */
693
+ initTask(taskDescription, maxSteps = DEFAULT_MAX_STEPS) {
694
+ this.taskId = randomUUID();
695
+ this.taskDescription = taskDescription;
696
+ this.messageHistory = [];
697
+ const limit = this.model == MODEL_THINKER ? MAX_STEPS_THINKER : MAX_STEPS_ACTOR;
698
+ if (maxSteps > limit) {
699
+ logger3.warn(
700
+ `max_steps (${maxSteps}) exceeds limit for model '${this.model}'. Capping to ${limit}.`
701
+ );
702
+ maxSteps = limit;
703
+ }
704
+ this.maxSteps = maxSteps;
705
+ this.currentStep = 0;
706
+ logger3.info(
707
+ `Task initialized: '${taskDescription}' (max_steps: ${maxSteps})`
708
+ );
709
+ }
710
+ /**
711
+ * Send screenshot to the server and get the next actions.
712
+ *
713
+ * @param screenshot Screenshot as URL string, or raw bytes
714
+ * @param instruction Optional additional instruction for this step (currently unused)
715
+ * @param temperature Sampling temperature for this step (overrides task default if provided)
716
+ */
717
+ async step(screenshot, _instruction, temperature) {
718
+ this.validateAndIncrementStep();
719
+ logger3.debug(`Executing step for task: '${this.taskDescription}'`);
720
+ try {
721
+ const screenshotUrl = await this.ensureScreenshotUrl(screenshot);
722
+ this.addUserMessageToHistory(screenshotUrl, this.buildStepPrompt());
723
+ const [step, rawOutput] = await this.client.chatCompletions(
724
+ this.model,
725
+ this.messageHistory,
726
+ temperature ?? this.temperature,
727
+ this.taskId
728
+ );
729
+ if (rawOutput) {
730
+ this.messageHistory.push({
731
+ role: "assistant",
732
+ content: [
733
+ {
734
+ type: "text",
735
+ text: rawOutput
736
+ }
737
+ ]
738
+ });
739
+ }
740
+ if (step.stop) {
741
+ logger3.info("Task completed.");
742
+ } else {
743
+ logger3.debug(`Step completed with${step.actions.length} actions`);
744
+ }
745
+ return step;
746
+ } catch (err) {
747
+ logger3.error(`Error during step execution: ${err}`);
748
+ throw err;
749
+ }
750
+ }
751
+ };
752
+
753
+ // src/agent/default.ts
754
+ var logger4 = logger_default("agent.default");
755
+ var resetHandler = (handler) => {
756
+ if (typeof handler.reset === "function") {
757
+ handler.reset();
758
+ }
759
+ };
760
+ var sleep = (seconds) => new Promise((resolve) => setTimeout(resolve, seconds * 1e3));
761
+ var DefaultAgent = class {
762
+ /** Default asynchronous agent implementation using OAGI client. */
763
+ api_key;
764
+ base_url;
765
+ model;
766
+ max_steps;
767
+ temperature;
768
+ step_observer;
769
+ step_delay;
770
+ constructor(api_key, base_url, model = MODEL_ACTOR, max_steps = DEFAULT_MAX_STEPS, temperature = DEFAULT_TEMPERATURE, step_observer, step_delay = DEFAULT_STEP_DELAY) {
771
+ this.api_key = api_key;
772
+ this.base_url = base_url;
773
+ this.model = model;
774
+ this.max_steps = max_steps;
775
+ this.temperature = temperature;
776
+ this.step_observer = step_observer;
777
+ this.step_delay = step_delay;
778
+ }
779
+ async execute(instruction, action_handler, image_provider) {
780
+ const actor = new Actor(this.api_key, this.base_url, this.model);
781
+ logger4.info(`Starting async task execution: ${instruction}`);
782
+ await actor.initTask(instruction, this.max_steps);
783
+ resetHandler(action_handler);
784
+ for (let i = 0; i < this.max_steps; i++) {
785
+ const step_num = i + 1;
786
+ logger4.debug(`Executing step ${step_num}/${this.max_steps}`);
787
+ const image = await image_provider.provide();
788
+ const step = await actor.step(image, void 0, this.temperature);
789
+ if (step.reason) {
790
+ logger4.info(`Step ${step_num}: ${step.reason}`);
791
+ }
792
+ if (this.step_observer) {
793
+ const event = {
794
+ type: "step",
795
+ timestamp: /* @__PURE__ */ new Date(),
796
+ step_num,
797
+ image,
798
+ step,
799
+ task_id: actor.taskId
800
+ };
801
+ await this.step_observer.onEvent(event);
802
+ }
803
+ if (step.actions?.length) {
804
+ logger4.info(`Actions (${step.actions.length}):`);
805
+ for (const action of step.actions) {
806
+ const count_suffix = action.count && action.count > 1 ? ` x${action.count}` : "";
807
+ logger4.info(` [${action.type}] ${action.argument}${count_suffix}`);
808
+ }
809
+ let error = null;
810
+ try {
811
+ await action_handler.handle(step.actions);
812
+ } catch (e) {
813
+ error = String(e);
814
+ throw e;
815
+ } finally {
816
+ if (this.step_observer) {
817
+ const event = {
818
+ type: "action",
819
+ timestamp: /* @__PURE__ */ new Date(),
820
+ step_num,
821
+ actions: step.actions,
822
+ error: error ?? void 0
823
+ };
824
+ await this.step_observer.onEvent(event);
825
+ }
826
+ }
827
+ }
828
+ if (this.step_delay > 0) {
829
+ await sleep(this.step_delay);
830
+ }
831
+ if (step.stop) {
832
+ logger4.info(`Task completed successfully after ${step_num} steps`);
833
+ return true;
834
+ }
835
+ }
836
+ logger4.warn(
837
+ `Task reached max steps (${this.max_steps}) without completion`
838
+ );
839
+ return false;
840
+ }
841
+ };
842
+
843
+ // src/agent/registry.ts
844
+ var agentRegistry = {};
845
+ var asyncAgentRegister = (mode) => {
846
+ return (func) => {
847
+ if (mode in agentRegistry) {
848
+ throw new Error(
849
+ `Agent mode '${mode}' is already registered. Cannot register the same mode twice.`
850
+ );
851
+ }
852
+ agentRegistry[mode] = func;
853
+ return func;
854
+ };
855
+ };
856
+ var getAgentFactory = (mode) => {
857
+ if (!(mode in agentRegistry)) {
858
+ const availableModes = Object.keys(agentRegistry);
859
+ throw new Error(
860
+ `Unknown agent mode: '${mode}'. Available modes: ${availableModes}`
861
+ );
862
+ }
863
+ return agentRegistry[mode];
864
+ };
865
+ var listAgentModes = () => {
866
+ return Object.keys(agentRegistry);
867
+ };
868
+ var createAgent = (mode, options = {}) => {
869
+ const factory = getAgentFactory(mode);
870
+ const agent = factory(options);
871
+ if (!agent || typeof agent.execute !== "function") {
872
+ throw new TypeError(
873
+ `Factory for mode '${mode}' returned an object that doesn't implement Agent. Expected an object with an 'execute' method.`
874
+ );
875
+ }
876
+ return agent;
877
+ };
878
+
879
+ // src/agent/factories.ts
880
+ asyncAgentRegister("actor")((options = {}) => {
881
+ const {
882
+ apiKey,
883
+ baseUrl,
884
+ model = MODEL_ACTOR,
885
+ maxSteps = DEFAULT_MAX_STEPS,
886
+ temperature = DEFAULT_TEMPERATURE_LOW,
887
+ stepObserver,
888
+ stepDelay = DEFAULT_STEP_DELAY
889
+ } = options;
890
+ return new DefaultAgent(
891
+ apiKey,
892
+ baseUrl,
893
+ model,
894
+ maxSteps,
895
+ temperature,
896
+ stepObserver ?? void 0,
897
+ stepDelay
898
+ );
899
+ });
900
+ asyncAgentRegister("thinker")((options = {}) => {
901
+ const {
902
+ apiKey,
903
+ baseUrl,
904
+ model = MODEL_THINKER,
905
+ maxSteps = DEFAULT_MAX_STEPS_THINKER,
906
+ temperature = DEFAULT_TEMPERATURE_LOW,
907
+ stepObserver,
908
+ stepDelay = DEFAULT_STEP_DELAY
909
+ } = options;
910
+ return new DefaultAgent(
911
+ apiKey,
912
+ baseUrl,
913
+ model,
914
+ maxSteps,
915
+ temperature,
916
+ stepObserver ?? void 0,
917
+ stepDelay
918
+ );
919
+ });
920
+
921
+ // src/agent/observer/exporters.ts
922
+ import fs from "fs";
923
+ import path from "path";
924
+ import { fileURLToPath, pathToFileURL } from "url";
925
/**
 * Create `dirPath`, including missing parent directories. An existing
 * directory is left as it is.
 */
var ensureDir = (dirPath) => {
  const options = { recursive: true };
  fs.mkdirSync(dirPath, options);
};
928
+ var parseActionCoords = (action) => {
929
+ const arg = action.argument.replace(/^\(|\)$/g, "");
930
+ switch (action.type) {
931
+ case "click":
932
+ case "left_double":
933
+ case "left_triple":
934
+ case "right_single": {
935
+ const coords = parseCoords(arg);
936
+ if (coords) {
937
+ return { type: "click", x: coords[0], y: coords[1] };
938
+ }
939
+ return null;
940
+ }
941
+ case "drag": {
942
+ const coords = parseDragCoords(arg);
943
+ if (coords) {
944
+ return {
945
+ type: "drag",
946
+ x1: coords[0],
947
+ y1: coords[1],
948
+ x2: coords[2],
949
+ y2: coords[3]
950
+ };
951
+ }
952
+ return null;
953
+ }
954
+ case "scroll": {
955
+ const result = parseScroll(arg);
956
+ if (result) {
957
+ return {
958
+ type: "scroll",
959
+ x: result[0],
960
+ y: result[1],
961
+ direction: result[2]
962
+ };
963
+ }
964
+ return null;
965
+ }
966
+ default:
967
+ return null;
968
+ }
969
+ };
970
/**
 * Builds the screenshot fragment of a Markdown section.
 *
 * - no image (null/undefined): empty string
 * - string image: emitted as a screenshot URL line
 * - binary image without `imagesDir`: a placeholder with the byte count
 * - binary image with `imagesDir`: the bytes are written to
 *   `imagesDir/imageFilename` and an image link is returned
 *
 * The link target is computed relative to the report's directory and always
 * uses forward slashes so it renders the same on every platform.
 */
var renderMarkdownScreenshot = (image, altText, imageFilename, outputDir, imagesDir) => {
  if (image === null || image === void 0) {
    return "";
  }
  if (typeof image === "string") {
    return `\n**Screenshot URL:** ${image}\n`;
  }
  if (!imagesDir) {
    return `\n*[Screenshot captured - ${image.byteLength} bytes]*\n`;
  }
  fs.writeFileSync(path.join(imagesDir, imageFilename), Buffer.from(image));
  const relPath = path
    .join(path.relative(outputDir, imagesDir), imageFilename)
    .split(path.sep)
    .join("/");
  return `\n![${altText}](${relPath})\n`;
};

/**
 * Writes the recorded events as a Markdown report.
 *
 * @param events    Recorded observer events (step, action, log, split, image,
 *                  plan). "image" events are ignored.
 * @param filePath  Destination of the Markdown file; its directory is created
 *                  when missing.
 * @param imagesDir Optional directory for binary screenshots. When omitted,
 *                  binary screenshots are replaced by a size placeholder.
 */
var exportToMarkdown = (events, filePath, imagesDir) => {
  const outputDir = path.dirname(filePath);
  ensureDir(outputDir);
  if (imagesDir) {
    ensureDir(imagesDir);
  }
  const lines = ["# Agent Execution Report\n"];
  // Date.now() alone is not unique inside this synchronous loop, so plan
  // screenshots also get a running sequence number in their file name.
  let planImageCount = 0;
  for (const event of events) {
    const d = event.timestamp instanceof Date ? event.timestamp : new Date(event.timestamp);
    const timestamp = d.toTimeString().slice(0, 8);
    switch (event.type) {
      case "step":
        lines.push(`\n## Step ${event.step_num}\n`);
        lines.push(`**Time:** ${timestamp}\n`);
        if (event.task_id) {
          lines.push(`**Task ID:** \`${event.task_id}\`\n`);
        }
        lines.push(
          renderMarkdownScreenshot(
            event.image,
            `Step ${event.step_num}`,
            `step_${event.step_num}.png`,
            outputDir,
            imagesDir
          )
        );
        if (event.step.reason) {
          lines.push(`\n**Reasoning:**\n> ${event.step.reason}\n`);
        }
        if (event.step.actions?.length) {
          lines.push("\n**Planned Actions:**\n");
          for (const action of event.step.actions) {
            const countStr = action.count && action.count > 1 ? ` (x${action.count})` : "";
            lines.push(`- \`${action.type}\`: ${action.argument}${countStr}\n`);
          }
        }
        if (event.step.stop) {
          lines.push("\n**Status:** Task Complete\n");
        }
        break;
      case "action":
        lines.push(`\n### Actions Executed (${timestamp})\n`);
        if (event.error) {
          lines.push(`\n**Error:** ${event.error}\n`);
        } else {
          lines.push("\n**Result:** Success\n");
        }
        break;
      case "log":
        lines.push(`\n> **Log (${timestamp}):** ${event.message}\n`);
        break;
      case "split":
        if (event.label) {
          lines.push(`\n---\n\n### ${event.label}\n`);
        } else {
          lines.push("\n---\n");
        }
        break;
      case "image":
        // Stand-alone image events are not rendered in the Markdown report.
        break;
      case "plan": {
        const phaseTitles = {
          initial: "Initial Planning",
          reflection: "Reflection",
          summary: "Summary"
        };
        const phaseTitle = phaseTitles[event.phase] ?? event.phase;
        lines.push(`\n### ${phaseTitle} (${timestamp})\n`);
        if (event.request_id) {
          lines.push(`**Request ID:** \`${event.request_id}\`\n`);
        }
        if (event.image) {
          planImageCount += 1;
          lines.push(
            renderMarkdownScreenshot(
              event.image,
              phaseTitle,
              `plan_${event.phase}_${Date.now()}_${planImageCount}.png`,
              outputDir,
              imagesDir
            )
          );
        }
        if (event.reasoning) {
          lines.push(`\n**Reasoning:**\n> ${event.reasoning}\n`);
        }
        if (event.result) {
          lines.push(`\n**Result:** ${event.result}\n`);
        }
        break;
      }
    }
  }
  fs.writeFileSync(filePath, lines.join(""), "utf-8");
};
1117
/**
 * Flattens observer events into the plain records embedded in the HTML
 * report.
 *
 * Every record carries `event_type` and an `HH:MM:SS` local-time
 * `timestamp`. Binary images are base64-encoded, string images are passed
 * through unchanged. "image" events and unknown event types produce no
 * record.
 *
 * @param events Recorded observer events.
 * @returns Array of JSON-serialisable records, in event order.
 */
var convertEventsForHtml = (events) => {
  const encodeImage = (img) => {
    if (typeof img === "string") {
      return img;
    }
    return Buffer.from(img).toString("base64");
  };

  const records = [];
  for (const event of events) {
    const when = event.timestamp instanceof Date ? event.timestamp : new Date(event.timestamp);
    const timestamp = when.toTimeString().slice(0, 8);
    const kind = event.type;

    if (kind === "step") {
      const action_coords = [];
      const actions = [];
      if (event.step.actions?.length) {
        for (const planned of event.step.actions) {
          const marker = parseActionCoords(planned);
          if (marker) {
            action_coords.push(marker);
          }
          actions.push({
            type: planned.type,
            argument: planned.argument,
            count: planned.count ?? 1
          });
        }
      }
      const image = encodeImage(event.image);
      records.push({
        event_type: "step",
        timestamp,
        step_num: event.step_num,
        image,
        action_coords,
        reason: event.step.reason,
        actions,
        stop: event.step.stop,
        task_id: event.task_id
      });
    } else if (kind === "action") {
      records.push({
        event_type: "action",
        timestamp,
        error: event.error ?? null
      });
    } else if (kind === "log") {
      records.push({ event_type: "log", timestamp, message: event.message });
    } else if (kind === "split") {
      records.push({ event_type: "split", timestamp, label: event.label });
    } else if (kind === "plan") {
      // A missing or empty plan image is reported as null.
      const image = event.image ? encodeImage(event.image) : null;
      records.push({
        event_type: "plan",
        timestamp,
        phase: event.phase,
        image,
        reasoning: event.reasoning,
        result: event.result ?? null,
        request_id: event.request_id ?? null
      });
    }
  }
  return records;
};
1197
/**
 * Writes the recorded events as a self-contained HTML report.
 *
 * The report is produced by substituting the `{EVENTS_DATA}` placeholder of
 * `report_template.html` (looked up next to this module) with the events
 * serialised as JSON.
 *
 * @param events   Recorded observer events.
 * @param filePath Destination of the HTML file; its directory is created when
 *                 missing.
 */
var exportToHtml = (events, filePath) => {
  const outputDir = path.dirname(filePath);
  ensureDir(outputDir);
  const moduleUrl = import.meta?.url ? import.meta.url : pathToFileURL(__filename).href;
  const moduleDir = path.dirname(fileURLToPath(moduleUrl));
  const templatePath = path.join(moduleDir, "report_template.html");
  const template = fs.readFileSync(templatePath, "utf-8");
  const eventsData = convertEventsForHtml(events);
  // Event text is model/user generated. "<" is written as the JSON escape
  // \u003c so a literal "</script>" inside it cannot terminate the element
  // holding the data. NOTE(review): assumes the placeholder lives inside a
  // <script> block of the template - confirm against report_template.html.
  const eventsJson = JSON.stringify(eventsData).replace(/</g, "\\u003c");
  // A replacer function is used instead of a replacement string: with a
  // string, sequences such as "$&", "$'" or "$$" occurring in the JSON would
  // be interpreted as substitution patterns and corrupt the report.
  const htmlContent = template.replace("{EVENTS_DATA}", () => eventsJson);
  fs.writeFileSync(filePath, htmlContent, "utf-8");
};
1209
/**
 * Writes the recorded events as pretty-printed JSON.
 *
 * Timestamps are converted to ISO-8601 strings. Binary images (an
 * `ArrayBuffer`, or any typed array / `Buffer` / `DataView`) are replaced by
 * their base64 text and flagged with `image_encoding: "base64"`; string
 * images are left untouched.
 *
 * @param events   Recorded observer events.
 * @param filePath Destination of the JSON file; its directory is created when
 *                 missing.
 */
var exportToJson = (events, filePath) => {
  const outputDir = path.dirname(filePath);
  ensureDir(outputDir);

  // Returns the image bytes as a Buffer, or null when the value is not
  // binary. Views are wrapped via their underlying buffer so the exact bytes
  // are kept regardless of the view's element type.
  const asBinary = (image) => {
    if (image instanceof ArrayBuffer) {
      return Buffer.from(image);
    }
    if (ArrayBuffer.isView(image)) {
      return Buffer.from(image.buffer, image.byteOffset, image.byteLength);
    }
    return null;
  };

  const jsonEvents = events.map((event) => {
    const date = event.timestamp instanceof Date ? event.timestamp : new Date(event.timestamp);
    const serialized = { ...event, timestamp: date.toISOString() };
    const bytes = "image" in event ? asBinary(event.image) : null;
    if (bytes) {
      serialized.image = bytes.toString("base64");
      serialized.image_encoding = "base64";
    }
    return serialized;
  });
  fs.writeFileSync(filePath, JSON.stringify(jsonEvents, null, 2), "utf-8");
};
1229
+
1230
+ // src/agent/observer/agent_observer.ts
1231
/**
 * Observer that keeps every event it receives in memory and can write the
 * collected history out as a Markdown, HTML or JSON report.
 */
var AsyncAgentObserver = class extends StepObserver {
  /** Events recorded so far, in arrival order. */
  events = [];

  /** Records an event emitted during agent execution. */
  async onEvent(event) {
    this.events.push(event);
  }

  /** Appends a free-form log entry stamped with the current time. */
  addLog(message) {
    this.events.push({
      type: "log",
      timestamp: /* @__PURE__ */ new Date(),
      message
    });
  }

  /** Appends a visual separator, optionally labelled, stamped with the current time. */
  addSplit(label = "") {
    this.events.push({
      type: "split",
      timestamp: /* @__PURE__ */ new Date(),
      label
    });
  }

  /** Discards all recorded events. */
  clear() {
    this.events = [];
  }

  /** Returns the recorded events whose `step_num` equals the given step number. */
  getEventsByStep(step_num) {
    const matches = [];
    for (const candidate of this.events) {
      // Events without a step number never match, even for an undefined query.
      if (candidate.step_num === void 0) {
        continue;
      }
      if (candidate.step_num === step_num) {
        matches.push(candidate);
      }
    }
    return matches;
  }

  /**
   * Writes the recorded events to `path2` in the requested format.
   *
   * @param format     "markdown", "html" or "json" (strings are matched
   *                   case-insensitively).
   * @param path2      Output file path.
   * @param images_dir Optional screenshot directory, used by the Markdown
   *                   export only.
   * @throws Error when the format is not recognised.
   */
  export(format, path2, images_dir) {
    const normalized = typeof format === "string" ? format.toLowerCase() : format;
    if (normalized === "markdown" /* MARKDOWN */) {
      exportToMarkdown(this.events, path2, images_dir ?? void 0);
    } else if (normalized === "html" /* HTML */) {
      exportToHtml(this.events, path2);
    } else if (normalized === "json" /* JSON */) {
      exportToJson(this.events, path2);
    } else {
      throw new Error(`Unknown export format: ${String(format)}`);
    }
  }
};
1284
+
1285
+ // src/handler.ts
1286
+ import robot from "robotjs";
1287
+ import sharp from "sharp";
1288
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 */
function sleep2(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
1289
/**
 * Maps a resample filter name from the image config to the kernel name
 * passed to sharp's resize.
 *
 * "NEAREST" -> "nearest", "BICUBIC" -> "cubic", "BILINEAR" -> "mitchell";
 * "LANCZOS" and anything unrecognised fall back to "lanczos3".
 */
var toSharpKernel = (resample) => {
  if (resample === "NEAREST") {
    return "nearest";
  }
  if (resample === "BICUBIC") {
    return "cubic";
  }
  if (resample === "BILINEAR") {
    return "mitchell";
  }
  return "lanczos3";
};
1302
/**
 * Canonicalises a single key name from a hotkey string.
 *
 * The name is trimmed and lower-cased, then common aliases are folded:
 * caps_lock/caps -> capslock, page_up -> pageup, page_down -> pagedown,
 * cmd -> command, ctrl -> control. On macOS, when `opts.macosCtrlToCmd` is
 * set, ctrl becomes command instead. Any other name is returned as-is.
 *
 * @param raw  Key name as written in the hotkey argument.
 * @param opts Options object; `macosCtrlToCmd` toggles the macOS remap.
 */
function normalizeKey(raw, opts) {
  const key = raw.trim().toLowerCase();

  const aliases = new Map([
    ["caps_lock", "capslock"],
    ["caps", "capslock"],
    ["page_up", "pageup"],
    ["pageup", "pageup"],
    ["page_down", "pagedown"],
    ["pagedown", "pagedown"],
    ["cmd", "command"]
  ]);
  const alias = aliases.get(key);
  if (alias !== void 0) {
    return alias;
  }

  const remapCtrl = opts.macosCtrlToCmd && process.platform === "darwin";
  if (key !== "ctrl") {
    return key;
  }
  return remapCtrl ? "command" : "control";
}
1314
/**
 * Splits a hotkey argument such as "(ctrl+shift+t)" into normalised key
 * names.
 *
 * One surrounding pair of parentheses is dropped, the rest is split on "+",
 * each part is normalised, and parts that normalise to an empty string are
 * discarded.
 *
 * @param arg  Raw hotkey argument.
 * @param opts Options forwarded to the key normaliser.
 * @returns Key names in the order written.
 */
function parseHotkey(arg, opts) {
  const inner = arg.trim().replace(/^\(/, "").replace(/\)$/, "");
  const keys = [];
  for (const part of inner.split("+")) {
    const key = normalizeKey(part, opts);
    if (key) {
      keys.push(key);
    }
  }
  return keys;
}
1318
/**
 * Trims the string, then removes at most one leading "(" and at most one
 * trailing ")".
 */
function stripOuterParens(s) {
  const trimmed = s.trim();
  const withoutOpen = trimmed.startsWith("(") ? trimmed.slice(1) : trimmed;
  return withoutOpen.endsWith(")") ? withoutOpen.slice(0, -1) : withoutOpen;
}
1319
/**
 * Emulates caps lock for typed text: when `enabled`, every ASCII letter is
 * upper-cased and all other characters are left alone; when disabled the
 * text is returned unchanged.
 */
function applySessionCaps(text, enabled) {
  if (enabled) {
    return text.replace(/[a-z]/g, (letter) => letter.toUpperCase());
  }
  return text;
}
1323
/**
 * Builds a fresh copy of the default desktop-automation settings.
 *
 * `scrollAmount` is platform dependent: 2 on macOS, 100 elsewhere.
 */
function defaultDesktopAutomationConfig() {
  const onMac = process.platform === "darwin";
  const config = {
    dragDurationMs: 500,
    scrollAmount: onMac ? 2 : 100,
    waitDurationMs: 1e3,
    hotkeyDelayMs: 100,
    macosCtrlToCmd: true,
    capslockMode: "session"
  };
  return config;
}
1331
/**
 * Captures the screen with robotjs and encodes it with sharp according to
 * an image configuration (size, resample filter, format, quality).
 */
var ScreenshotMaker = class _ScreenshotMaker {
  /** Effective image configuration: schema defaults overlaid with caller overrides. */
  #cfg;

  /**
   * @param cfg Optional overrides applied on top of the schema defaults.
   */
  constructor(cfg) {
    this.#cfg = { ...ImageConfigSchema.parse({}), ...cfg };
  }

  /**
   * Copies the bytes of a buffer-like value into a new ArrayBuffer of
   * exactly `buffer.length` bytes.
   */
  static toArrayBuffer(buffer) {
    const copy = new ArrayBuffer(buffer.length);
    new Uint8Array(copy).set(buffer);
    return copy;
  }

  /**
   * Takes a full-screen capture and returns the encoded image bytes.
   *
   * @returns ArrayBuffer holding a PNG when the configured format is "PNG",
   *          otherwise a JPEG.
   */
  async provide() {
    const { width, height } = robot.getScreenSize();
    const capture = robot.screen.capture(0, 0, width, height);
    const channels = 3;
    const rgb = new Uint8Array(capture.width * capture.height * channels);

    // Repack the capture into tightly packed 3-byte pixels. The source
    // bytes at offsets +2, +1, +0 of each pixel are written in that order,
    // i.e. the first and third channel are swapped.
    for (let row = 0; row < capture.height; ++row) {
      for (let col = 0; col < capture.width; ++col) {
        const dst = (row * capture.width + col) * channels;
        const src = capture.byteWidth * row + col * capture.bytesPerPixel;
        rgb[dst] = capture.image.readUInt8(src + 2);
        rgb[dst + 1] = capture.image.readUInt8(src + 1);
        rgb[dst + 2] = capture.image.readUInt8(src + 0);
      }
    }

    let pipeline = sharp(Buffer.from(rgb), {
      raw: {
        width: capture.width,
        height: capture.height,
        channels
      }
    });

    const cfg = this.#cfg;
    if (cfg.width || cfg.height) {
      // A missing dimension falls back to the reported screen size.
      pipeline = pipeline.resize(cfg.width ?? width, cfg.height ?? height, {
        fit: "fill",
        kernel: toSharpKernel(cfg.resample)
      });
    }

    let encoded;
    if (cfg.format === "PNG") {
      encoded = await pipeline.png({ compressionLevel: cfg.optimize ? 9 : 6 }).toBuffer();
    } else {
      encoded = await pipeline.jpeg({ quality: cfg.quality }).toBuffer();
    }
    return _ScreenshotMaker.toArrayBuffer(encoded);
  }
};
1378
/**
 * Executes agent actions on the local desktop through robotjs.
 *
 * Coordinates in action arguments are expressed on a 0-1000 grid and are
 * scaled to the current screen size before use.
 */
var DefaultActionHandler = class {
  /** Effective automation settings: defaults overlaid with caller overrides. */
  #cfg;
  /** Emulated caps-lock state used when `capslockMode` is not "system". */
  #sessionCapsEnabled = false;

  /**
   * @param cfg Optional overrides for the default automation settings.
   */
  constructor(cfg) {
    this.#cfg = { ...defaultDesktopAutomationConfig(), ...cfg };
  }

  /** Clears per-session state (the emulated caps-lock flag). */
  reset() {
    this.#sessionCapsEnabled = false;
  }

  /**
   * Runs the actions in order; an action with `count` n is executed n times
   * (once when `count` is absent).
   *
   * @param actions Actions to execute.
   * @throws Error when an action has an unparsable argument or unknown type.
   */
  async handle(actions) {
    for (const action of actions) {
      const count = action.count ?? 1;
      for (let i = 0; i < count; i++) {
        await this.#handleOne(action);
      }
    }
  }

  /**
   * Scales a point from the 0-1000 grid to screen pixels, clamped to
   * [1, size - 1] on both axes so the pointer never sits on a screen edge.
   */
  #denormalize(x, y) {
    const { width, height } = robot.getScreenSize();
    const px = Math.min(Math.max(Math.floor(x * width / 1e3), 1), width - 1);
    const py = Math.min(Math.max(Math.floor(y * height / 1e3), 1), height - 1);
    return { x: px, y: py };
  }

  /**
   * Parses an "x, y" argument and moves the pointer there.
   * Shared by all click variants.
   */
  #moveToCoords(arg) {
    const coords = parseCoords(arg);
    if (!coords) throw new Error(`Invalid coords: ${arg}`);
    const p = this.#denormalize(coords[0], coords[1]);
    robot.moveMouse(p.x, p.y);
  }

  /** Executes a single action once. */
  async #handleOne(action) {
    const arg = stripOuterParens(action.argument);
    switch (action.type) {
      case "click": {
        this.#moveToCoords(arg);
        robot.mouseClick("left", false);
        return;
      }
      case "left_double": {
        this.#moveToCoords(arg);
        robot.mouseClick("left", true);
        return;
      }
      case "left_triple": {
        this.#moveToCoords(arg);
        // Triple click is issued as a double click followed by a single one.
        robot.mouseClick("left", true);
        robot.mouseClick("left", false);
        return;
      }
      case "right_single": {
        this.#moveToCoords(arg);
        robot.mouseClick("right", false);
        return;
      }
      case "drag": {
        const coords = parseDragCoords(arg);
        if (!coords) throw new Error(`Invalid drag coords: ${arg}`);
        const p1 = this.#denormalize(coords[0], coords[1]);
        const p2 = this.#denormalize(coords[2], coords[3]);
        robot.moveMouse(p1.x, p1.y);
        robot.mouseToggle("down", "left");
        try {
          robot.dragMouse(p2.x, p2.y);
          await sleep2(this.#cfg.dragDurationMs);
        } finally {
          // Always release the button, otherwise a failure mid-drag would
          // leave the left button held down for every following action.
          robot.mouseToggle("up", "left");
        }
        return;
      }
      case "hotkey": {
        const keys = parseHotkey(arg, {
          macosCtrlToCmd: this.#cfg.macosCtrlToCmd
        });
        if (keys.length === 1 && keys[0] === "capslock") {
          if (this.#cfg.capslockMode === "system") {
            robot.keyTap("capslock");
          } else {
            // Session mode: only flip the emulated flag, applied when typing.
            this.#sessionCapsEnabled = !this.#sessionCapsEnabled;
          }
          return;
        }
        // The last key is tapped; everything before it acts as a modifier.
        const last = keys.at(-1);
        if (!last) return;
        const modifiers = keys.slice(0, -1);
        robot.keyTap(last, modifiers);
        await sleep2(this.#cfg.hotkeyDelayMs);
        return;
      }
      case "type": {
        // Drop one surrounding quote character on each side, if present.
        const raw = arg.replace(/^['"]/, "").replace(/['"]$/, "");
        const text = applySessionCaps(raw, this.#sessionCapsEnabled);
        robot.typeString(text);
        return;
      }
      case "scroll": {
        const parsed = parseScroll(arg);
        if (!parsed) throw new Error(`Invalid scroll: ${arg}`);
        const p = this.#denormalize(parsed[0], parsed[1]);
        const direction = parsed[2];
        robot.moveMouse(p.x, p.y);
        // NOTE(review): every direction other than "up" scrolls with a
        // negative vertical amount - confirm parseScroll only yields
        // "up"/"down".
        const amount = direction === "up" ? this.#cfg.scrollAmount : -this.#cfg.scrollAmount;
        robot.scrollMouse(0, amount);
        return;
      }
      case "wait": {
        await sleep2(this.#cfg.waitDurationMs);
        return;
      }
      case "finish": {
        this.reset();
        return;
      }
      case "call_user": {
        // Nothing to automate; handling is left to the caller.
        return;
      }
      default: {
        const exhaustive = action.type;
        throw new Error(`Unknown action type: ${String(exhaustive)}`);
      }
    }
  }
};
1506
+
1507
+ export {
1508
+ DEFAULT_BASE_URL,
1509
+ API_KEY_HELP_URL,
1510
+ MODEL_ACTOR,
1511
+ MODEL_THINKER,
1512
+ MODE_ACTOR,
1513
+ DEFAULT_MAX_STEPS_THINKER,
1514
+ DEFAULT_STEP_DELAY,
1515
+ OAGIError,
1516
+ APIError,
1517
+ AuthenticationError,
1518
+ RateLimitError,
1519
+ ValidationError,
1520
+ NotFoundError,
1521
+ ServerError,
1522
+ ConfigurationError,
1523
+ NetworkError,
1524
+ RequestTimeoutError,
1525
+ logger_default,
1526
+ StepObserver,
1527
+ Client,
1528
+ Actor,
1529
+ DefaultAgent,
1530
+ listAgentModes,
1531
+ createAgent,
1532
+ AsyncAgentObserver,
1533
+ ScreenshotMaker,
1534
+ DefaultActionHandler
1535
+ };
1536
+ //# sourceMappingURL=chunk-SDBYP57G.js.map