gpt-driver-node 1.0.0-alpha.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,5 +1,11 @@
1
+ import { promises } from 'node:fs';
2
+ import path from 'node:path';
1
3
  import axios from 'axios';
2
4
  import sharp from 'sharp';
5
+ import { attach } from 'webdriverio';
6
+ import winston from 'winston';
7
+ import { z } from 'zod';
8
+ import crypto from 'node:crypto';
3
9
 
4
10
  const delay = async (milliseconds) => {
5
11
  await new Promise((resolve) => setTimeout(resolve, milliseconds));
@@ -15,12 +21,600 @@ function buildUrl(base, extraPath) {
15
21
  return `${baseUrl}${extraPath}`;
16
22
  }
17
23
 
/** ANSI escape sequences used to decorate console log output. */
const colors = {
  reset: "\x1B[0m",
  bold: "\x1B[1m",
  gray: "\x1B[90m",
  red: "\x1B[31m",
  green: "\x1B[32m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m",
};

/**
 * Text decorators keyed by style name. Each decorator prefixes its argument
 * with one or more escape sequences from `colors` and appends the reset
 * sequence.
 */
const logStyles = (() => {
  // Sequences are looked up when the decorator is called, not when it is built.
  const decorate = (...styleNames) => (text) =>
    `${styleNames.map((styleName) => colors[styleName]).join("")}${text}${colors.reset}`;

  return {
    bold: decorate("bold"),
    cyan: decorate("cyan"),
    yellow: decorate("yellow"),
    green: decorate("green"),
    red: decorate("red"),
    gray: decorate("gray"),
    highlight: decorate("bold", "cyan"),
    success: decorate("bold", "green"),
    error: decorate("bold", "red"),
    warning: decorate("bold", "yellow"),
  };
})();
/**
 * Module-wide winston logger.
 *
 * The minimum level is read from the GPT_DRIVER_LOG_LEVEL environment variable
 * and falls back to "info". The logger-level format adds a timestamp and error
 * stack handling and renders a plain line; the single Console transport
 * re-renders the line with ANSI colors.
 */
const globalLogger = (() => {
  // Levels that receive a dedicated decoration; any other level is printed undecorated.
  const levelDecorators = new Map([
    ["error", logStyles.error],
    ["warn", logStyles.warning],
    ["info", logStyles.cyan],
    ["debug", logStyles.gray],
  ]);

  // Uncolored "<timestamp> [LEVEL]: message", followed by the stack when present.
  const plainLine = winston.format.printf(({ timestamp, level, message, stack }) => {
    const line = `${timestamp} [${level.toUpperCase()}]: ${message}`;
    return stack ? `${line}\n${stack}` : line;
  });

  // Same layout with a gray timestamp/stack and a level tag colored per level.
  const coloredLine = winston.format.printf(({ timestamp, level, message, stack }) => {
    const upperLevel = level.toUpperCase();
    const decorateLevel = levelDecorators.get(level);
    const levelTag = decorateLevel ? decorateLevel(upperLevel) : upperLevel;
    const line = `${logStyles.gray(timestamp)} [${levelTag}]: ${message}`;
    return stack ? `${line}\n${logStyles.gray(stack)}` : line;
  });

  return winston.createLogger({
    level: process.env.GPT_DRIVER_LOG_LEVEL || "info",
    format: winston.format.combine(
      winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
      winston.format.errors({ stack: true }),
      plainLine
    ),
    transports: [
      new winston.transports.Console({
        format: winston.format.combine(coloredLine),
      }),
    ],
  });
})();
84
+
/** Fields every savable step may carry; all of them are optional. */
const SavableStepBaseSchema = z.object({
  id: z.number().optional(),
  descriptionText: z.string().optional(),
  optional: z.boolean().optional(),
});

/**
 * Builds a step schema: the base fields, then the `type` discriminator literal,
 * then the step-specific fields (in that key order).
 */
const defineSavableStep = (type, fields = {}) =>
  SavableStepBaseSchema.extend({ type: z.literal(type), ...fields });

/** Optional fields shared by the "tap" and "assert" steps. */
const elementTargetFields = () => ({
  elementId: z.string().optional(),
  timeout: z.number().optional(),
  useLlmOnly: z.boolean().optional(),
  cropBase64: z.string().optional(),
});

const SavableTapStepSchema = defineSavableStep("tap", elementTargetFields());

const SavableAssertStepSchema = defineSavableStep("assert", elementTargetFields());

const SavableTypeStepSchema = defineSavableStep("type", {
  text: z.string(),
});

const SavableScrollStepSchema = defineSavableStep("scroll", {
  direction: z.enum(["up", "down"]),
});

const SavableZoomStepSchema = defineSavableStep("zoom", {
  direction: z.enum(["in", "out"]),
});

const SavableScrollUntilStepSchema = defineSavableStep("scrollUntil", {
  text: z.string().optional(),
  elementId: z.string().optional(),
  direction: z.enum(["up", "down"]),
  maxScrolls: z.number().optional(),
});

const SavableDeeplinkStepSchema = defineSavableStep("deeplink", {
  url: z.string(),
});

const SavableAIStepSchema = defineSavableStep("ai", {
  instruction: z.string(),
});

const SavableFileRefStepSchema = defineSavableStep("fileRef", {
  path: z.string(),
  overrides: z.record(z.string(), z.string()).optional(),
});

/** Any savable step, discriminated on its `type` field. */
const SavableStepSchema = z.discriminatedUnion("type", [
  SavableTapStepSchema,
  SavableAssertStepSchema,
  SavableTypeStepSchema,
  SavableScrollStepSchema,
  SavableZoomStepSchema,
  SavableScrollUntilStepSchema,
  SavableDeeplinkStepSchema,
  SavableAIStepSchema,
  SavableFileRefStepSchema,
]);

/** A stored test: a name, its ordered steps, and optional string parameters. */
const SavableTestStoreSchema = z.object({
  name: z.string(),
  steps: z.array(SavableStepSchema),
  params: z.record(z.string(), z.string()).optional(),
});
161
+
/** Base URL of the cache service (execute-from-cache / populate-cache endpoints). */
const CACHE_SERVER_URL = "https://cache.mobileboost.io";

/** Base URL of the GPT Driver API. */
const GPT_DRIVER_BASE_URL = "https://api.mobileboost.io";

/** Divisor applied to screenshot width, and to coordinates in commands, before upload. */
const RESCALE_FACTOR = 4;

/** Upper bound on smart-loop iterations for a single step. */
const SMART_LOOP_MAX_ITERATIONS = 15;

/** Total time budget, in milliseconds, for retrying a cache lookup within one iteration. */
const CACHE_RETRY_MS = 2000;

/** Pause, in milliseconds, between two cache lookup attempts. */
const CACHE_CHECK_INTERVAL_MS = 500;
168
+
/**
 * Derives the cache key for one test step.
 *
 * The key is the hex SHA-256 digest of the plain concatenation (no separators)
 * of: API key, file path, step number, step description, lower-cased platform
 * and "<width>x<height>". A falsy file path, platform or resolution contributes
 * an empty string.
 *
 * @param {string} apiKey
 * @param {string} [filepath]
 * @param {number} stepNumber
 * @param {string} description
 * @param {string} [platform]
 * @param {{ width: number, height: number }} [resolution]
 * @returns {string} Hex-encoded SHA-256 digest.
 */
function generateCacheHash(apiKey, filepath, stepNumber, description, platform, resolution) {
  const resolutionPart = resolution ? `${resolution.width}x${resolution.height}` : "";
  const platformPart = platform?.toLowerCase() || "";
  const pathPart = filepath || "";

  // Stringify each piece exactly as template interpolation would before joining.
  const material = [apiKey, pathPart, stepNumber, description, platformPart, resolutionPart]
    .map((piece) => `${piece}`)
    .join("");

  const hasher = crypto.createHash("sha256");
  hasher.update(material);
  return hasher.digest("hex");
}
/**
 * Rescales the pixel coordinates embedded in a command string.
 *
 * Two coordinate notations are recognized:
 *  - keyed: every `x=<int>` / `y=<int>` occurrence is rescaled;
 *  - positional: the first `<int>;<int>` pair delimited by `;` or the string
 *    boundaries is rescaled (only used when no keyed coordinate is present).
 *
 * `type:` commands are returned untouched: their payload is literal text to be
 * typed, so digits inside it must never be rewritten.
 *
 * @param {string} cmd Command string to rescale.
 * @param {"multiply"|"divide"} operation "multiply" scales up; any other value divides.
 * @param {number} [factor=RESCALE_FACTOR] Scale factor to apply.
 * @returns {string} The command with rounded, rescaled coordinates.
 */
function scaleCommand(cmd, operation, factor = RESCALE_FACTOR) {
  if (cmd.startsWith("type:")) {
    return cmd;
  }

  // Single place where the arithmetic happens, shared by both notations.
  const rescale = (digits) => {
    const value = Number.parseInt(digits, 10);
    return Math.round(operation === "multiply" ? value * factor : value / factor);
  };

  if (/([xy])=(\d+)/.test(cmd)) {
    return cmd.replace(/([xy])=(\d+)/g, (_match, axis, digits) => `${axis}=${rescale(digits)}`);
  }

  return cmd.replace(
    /(^|;)(\d+);(\d+)(;|$)/,
    (_match, prefix, xDigits, yDigits, suffix) =>
      `${prefix}${rescale(xDigits)};${rescale(yDigits)}${suffix}`
  );
}
/**
 * Shrinks a base64 screenshot before it is sent to the cache service.
 *
 * An optional `data:image/<type>;base64,` prefix is removed first. The target
 * width is the source width divided by RESCALE_FACTOR (1080 is assumed when the
 * width cannot be read); the image is never enlarged.
 *
 * @param {string} screenshotBase64 Base64 image, with or without a data-URI prefix.
 * @returns {Promise<Buffer>} The resized image bytes.
 */
async function resizeScreenshotForCache(screenshotBase64) {
  const rawBase64 = screenshotBase64.replace(/^data:image\/\w+;base64,/, "");
  const imageBytes = Buffer.from(rawBase64, "base64");

  const { width } = await sharp(imageBytes).metadata();
  const targetWidth = Math.round((width ?? 1080) / RESCALE_FACTOR);

  const resizer = sharp(imageBytes).resize({ width: targetWidth, withoutEnlargement: true });
  return resizer.toBuffer();
}
213
+
/**
 * Looks up cached commands for a step, given the current screenshot.
 *
 * The downscaled screenshot and the step's cache hash are posted to the cache
 * service as multipart form data. Coordinates in returned commands are scaled
 * back up with `scaleCommand(cmd, "multiply")`.
 *
 * Failures are logged and reported as a miss; this function does not reject.
 *
 * @param {object} params
 * @param {string} params.apiKey
 * @param {string} [params.filepath]
 * @param {number} params.stepNumber
 * @param {string} params.stepDescription
 * @param {string} [params.platform]
 * @param {{ width: number, height: number }} [params.screenResolution]
 * @param {string} params.screenshot Base64 screenshot.
 * @param {number} [params.highestUsedIndex] Sent as `highest_used_index` when not nullish.
 * @returns {Promise<{ found: boolean, cacheCommands?: string[], cacheIndex?: number }>}
 */
async function executeFromCache(params) {
  try {
    const hash = generateCacheHash(
      params.apiKey,
      params.filepath,
      params.stepNumber,
      params.stepDescription,
      params.platform,
      params.screenResolution
    );
    const resizedBuffer = await resizeScreenshotForCache(params.screenshot);

    const formData = new FormData();
    formData.append("hash", hash);
    const blob = new Blob([new Uint8Array(resizedBuffer)], { type: "image/png" });
    const blobSizeMB = (blob.size / (1024 * 1024)).toFixed(2);
    globalLogger.debug(`[Cache] Executing from cache with screenshot size: ${blobSizeMB} MB`);
    formData.append("screenshot", blob, "screenshot.png");
    if (params.highestUsedIndex != null) {
      globalLogger.debug(`[Cache] Sending highest_used_index: ${params.highestUsedIndex}`);
      formData.append("highest_used_index", String(params.highestUsedIndex));
    }

    const response = await axios.post(`${CACHE_SERVER_URL}/execute-from-cache`, formData);
    const result = response.data;
    if (result.found && result.cacheCommands) {
      return {
        found: true,
        cacheCommands: result.cacheCommands.map((cmd) => scaleCommand(cmd, "multiply")),
        cacheIndex: result.cacheIndex,
      };
    }
    return { found: false };
  } catch (error) {
    if (axios.isAxiosError(error)) {
      // A structured response body would otherwise be logged as "[object Object]".
      const detail = error.response?.data || error.message;
      const detailText = typeof detail === "string" ? detail : JSON.stringify(detail);
      globalLogger.warn(`[Cache] Cache lookup failed: ${detailText}`);
    } else {
      globalLogger.error(`[Cache] Error executing from cache: ${error}`);
    }
    return { found: false };
  }
}
/**
 * Uploads the frames recorded for a step to the cache service.
 *
 * Each frame's screenshot is downscaled and base64-encoded, and coordinates in
 * its commands are scaled down with `scaleCommand(cmd, "divide")`. The step's
 * cache hash is sent as the `hash` query parameter.
 *
 * Failures are logged and reported through the result; this function does not
 * reject.
 *
 * @param {object} params
 * @param {string} params.apiKey
 * @param {string} [params.filepath]
 * @param {number} params.stepNumber
 * @param {string} params.stepDescription
 * @param {string} [params.platform]
 * @param {{ width: number, height: number }} [params.screenResolution]
 * @param {Array<{ screenshot: string, commands: string[] }>} params.executionData
 * @returns {Promise<{ success: boolean }>}
 */
async function populateCache(params) {
  try {
    const hash = generateCacheHash(
      params.apiKey,
      params.filepath,
      params.stepNumber,
      params.stepDescription,
      params.platform,
      params.screenResolution
    );

    const payload = await Promise.all(
      params.executionData.map(async (item) => {
        const resizedBuffer = await resizeScreenshotForCache(item.screenshot);
        return {
          screenshot: resizedBuffer.toString("base64"),
          commands: item.commands.map((cmd) => scaleCommand(cmd, "divide")),
        };
      })
    );

    const payloadSizeMB = (JSON.stringify(payload).length / (1024 * 1024)).toFixed(2);
    globalLogger.debug(`[Cache] Populating cache with payload size: ~${payloadSizeMB} MB (Hash: ${hash})`);
    await axios.post(`${CACHE_SERVER_URL}/populate-cache`, payload, {
      params: { hash },
    });
    return { success: true };
  } catch (error) {
    if (axios.isAxiosError(error)) {
      // A structured response body would otherwise be logged as "[object Object]".
      const detail = error.response?.data || error.message;
      const detailText = typeof detail === "string" ? detail : JSON.stringify(detail);
      globalLogger.error(`[Cache] Failed to populate cache: ${detailText}`);
    } else {
      globalLogger.error(`[Cache] Error populating cache: ${error}`);
    }
    return { success: false };
  }
}
292
+
/** Endpoint that returns the next actions for a screenshot and an instruction. */
const AI_AGENT_ENDPOINT = "https://api.mobileboost.io/call_lambda";

/**
 * Asks the AI backend for the next commands to run for one instruction.
 *
 * The screenshot is downscaled by RESCALE_FACTOR before upload, and the
 * coordinates in the returned `appetizeCommands` are scaled back up with
 * `scaleCommand(cmd, "multiply")`.
 *
 * @param {object} params
 * @param {string} params.apiKey Sent to the backend as `orgKey`.
 * @param {string} params.base64_screenshot Base64 image, with or without a
 *   `data:image/<type>;base64,` prefix.
 * @param {string} params.instruction Natural-language instruction for the step.
 * @param {string[]} params.action_history Previously recorded actions.
 * @returns {Promise<object>} The backend response body.
 * @throws {Error} "AI Backend Error: ..." for axios failures (the axios error is
 *   attached as `cause`); any other error is rethrown unchanged.
 */
async function executeAgentStep(params) {
  // Strip an optional data-URI prefix, as resizeScreenshotForCache does for the same input.
  const rawBase64 = params.base64_screenshot.replace(/^data:image\/\w+;base64,/, "");
  const imageBuffer = Buffer.from(rawBase64, "base64");

  const metadata = await sharp(imageBuffer).metadata();
  const originalWidth = metadata.width ?? 1080;
  const originalHeight = metadata.height ?? 1920;
  const desiredWidth = Math.round(originalWidth / RESCALE_FACTOR);
  const resizedBuffer = await sharp(imageBuffer)
    .resize({ width: desiredWidth, withoutEnlargement: true })
    .toBuffer();

  // Re-read the dimensions actually produced; fall back to the computed ones.
  const resizedMetadata = await sharp(resizedBuffer).metadata();
  const resizedWidth = resizedMetadata.width ?? desiredWidth;
  const resizedHeight =
    resizedMetadata.height ?? Math.round(originalHeight * (desiredWidth / originalWidth));
  globalLogger.debug(`[AI Client] Resized screenshot: ${originalWidth}x${originalHeight} -> ${resizedWidth}x${resizedHeight}`);

  const payload = {
    lambda_flow: "get_next_step",
    current_date: new Date().toLocaleDateString("en-GB", {
      day: "numeric",
      month: "long",
      year: "numeric",
    }),
    base64_screenshot: resizedBuffer.toString("base64"),
    getUI_elements: [],
    uiHierarchy: [],
    test_task_string: JSON.stringify([
      {
        id: "step-1",
        text: `1. ${params.instruction}`,
        plainText: params.instruction,
      },
    ]),
    image_width: resizedWidth,
    image_height: resizedHeight,
    action_history: params.action_history,
    orgKey: params.apiKey,
    template_images: {},
    model_provider: "vellum",
    model_version: "claude-agent",
    fallbackModel: "claude-agent",
    utilize_fullTextAnnotation: false,
    enableSortingOCR: true,
    enableActionHistoryCut: true,
    removeOverlappingText: false,
    currentAndPreviousScreenMatch: false,
    popupDetectionEnabled: true,
    ocrProvider: "gcp",
  };

  globalLogger.debug(`[AI Client] Sending request to ${AI_AGENT_ENDPOINT}`);
  try {
    const response = await axios.post(AI_AGENT_ENDPOINT, payload, {
      headers: {
        "Content-Type": "application/json",
      },
    });
    const result = response.data;
    globalLogger.debug("[AI Client] Received response from backend");
    if (result.appetizeCommands) {
      result.appetizeCommands = result.appetizeCommands.map((cmd) => scaleCommand(cmd, "multiply"));
    }
    return result;
  } catch (error) {
    if (axios.isAxiosError(error)) {
      const status = error.response?.status ?? "unknown";
      const errorText = error.response?.data ?? error.message;
      globalLogger.error(`[AI Client] Backend error (${status}): ${JSON.stringify(errorText)}`);
      // Keep the original axios error reachable for callers that need the details.
      throw new Error(`AI Backend Error: ${status} - ${error.message}`, { cause: error });
    }
    throw error;
  }
}
367
+
/**
 * Extracts tap coordinates from a command string.
 *
 * Keyed notation (`x=<int>` and `y=<int>` both present) wins. Otherwise the
 * command is split on `;` and the second and third fields are parsed as
 * integers.
 *
 * @param {string} cmd
 * @returns {{ x: number, y: number } | null} Coordinates, or null when none can be parsed.
 */
function parseTapCoordinates(cmd) {
  const keyedX = /x=(\d+)/.exec(cmd);
  const keyedY = /y=(\d+)/.exec(cmd);
  if (keyedX !== null && keyedY !== null) {
    return {
      x: Number.parseInt(keyedX[1], 10),
      y: Number.parseInt(keyedY[1], 10),
    };
  }

  // Positional notation: "<anything>;<x>;<y>[;...]".
  const [, rawX, rawY] = cmd.split(";");
  if (rawY === undefined) {
    return null;
  }
  const x = Number.parseInt(rawX, 10);
  const y = Number.parseInt(rawY, 10);
  return Number.isNaN(x) || Number.isNaN(y) ? null : { x, y };
}
/**
 * Reads the whole number of seconds from a `wait:` command.
 *
 * @param {string} cmd
 * @returns {number | null} The integer after `wait:`, or null when absent.
 */
function parseWaitSeconds(cmd) {
  const found = /wait:\s*(\d+)/.exec(cmd);
  if (found === null) {
    return null;
  }
  return Number.parseInt(found[1], 10);
}
/**
 * Reads the direction from a `scroll:` command (matched case-insensitively).
 *
 * @param {string} cmd
 * @returns {string | null} "up" or "down" in lower case, or null when absent.
 */
function parseScrollDirection(cmd) {
  const found = /scroll:\s*(up|down)/i.exec(cmd);
  if (found === null) {
    return null;
  }
  const [, direction] = found;
  return direction.toLowerCase();
}
/**
 * Extracts the text payload of a `type:` command.
 *
 * Whitespace directly after `type:` is skipped. The dotAll (`s`) flag lets the
 * payload span several lines; without it a payload containing a line break
 * would not match and the command would be dropped.
 *
 * @param {string} cmd
 * @returns {string | null} The text to type, or null when the command has no payload.
 */
function parseTypeText(cmd) {
  const found = /^type:\s*(.+)$/s.exec(cmd);
  return found ? found[1] : null;
}
/**
 * Tells whether a command contains the "task complete:" marker anywhere,
 * ignoring letter case.
 *
 * @param {string} cmd
 * @returns {boolean}
 */
function isTaskComplete(cmd) {
  const lowered = cmd.toLowerCase();
  return lowered.indexOf("task complete:") !== -1;
}
/**
 * Tells whether a command contains the "error detected:" marker anywhere,
 * ignoring letter case.
 *
 * @param {string} cmd
 * @returns {boolean}
 */
function isErrorDetected(cmd) {
  const lowered = cmd.toLowerCase();
  return lowered.indexOf("error detected:") !== -1;
}
/**
 * Tells whether a command begins with the case-sensitive `remember:` prefix.
 *
 * @param {string} cmd
 * @returns {boolean}
 */
function isRememberCommand(cmd) {
  const prefix = "remember:";
  return cmd.slice(0, prefix.length) === prefix;
}
/**
 * Tells whether a command begins with `tapOn:` or its `tabOn:` variant
 * (case-sensitive).
 *
 * @param {string} cmd
 * @returns {boolean}
 */
function isTapCommand(cmd) {
  return ["tapOn:", "tabOn:"].some((prefix) => cmd.startsWith(prefix));
}
/**
 * Tells whether a command begins with the case-sensitive `wait:` prefix.
 *
 * @param {string} cmd
 * @returns {boolean}
 */
function isWaitCommand(cmd) {
  const prefix = "wait:";
  return cmd.slice(0, prefix.length) === prefix;
}
/**
 * Tells whether a command begins with the case-sensitive `scroll:` prefix.
 *
 * @param {string} cmd
 * @returns {boolean}
 */
function isScrollCommand(cmd) {
  const prefix = "scroll:";
  return cmd.slice(0, prefix.length) === prefix;
}
/**
 * Tells whether a command begins with the case-sensitive `type:` prefix.
 *
 * @param {string} cmd
 * @returns {boolean}
 */
function isTypeCommand(cmd) {
  const prefix = "type:";
  return cmd.slice(0, prefix.length) === prefix;
}
420
+
/**
 * Executes one test step by repeatedly capturing the screen, obtaining commands
 * (from the cache when possible, otherwise from the AI agent) and running them
 * until a "task complete:" command is seen.
 *
 * Per iteration:
 *  1. Take a screenshot and query the cache, retrying up to
 *     floor(CACHE_RETRY_MS / CACHE_CHECK_INTERVAL_MS) times with a fresh
 *     screenshot each time.
 *  2. On a miss, ask the AI agent, using the last captured screenshot.
 *  3. Record the frame, log it through `ctx.logCodeExecution`, then run the
 *     tap / wait / scroll / type commands in order.
 *  4. On completion, upload the recorded frames to the cache if any iteration
 *     missed the cache.
 *
 * Errors raised inside the loop (including an "error detected:" command and
 * exhausting SMART_LOOP_MAX_ITERATIONS) are caught and reported in the result.
 *
 * @param {object} ctx Callbacks and session data: `apiKey`, `platform`,
 *   `screenSize`, `globalActionHistory`, `getScreenshot`, `performTap`,
 *   `performScroll`, `performType`, `logCodeExecution`.
 * @param {object} params `stepNumber`, `description`, `instruction`, `filepath`.
 * @returns {Promise<{ success: boolean, iterations: number, cacheHit: boolean, error?: string }>}
 */
async function executeSmartLoop(ctx, params) {
  const maxCacheAttempts = Math.floor(CACHE_RETRY_MS / CACHE_CHECK_INTERVAL_MS);
  let loopCount = 0;
  // Local copy: grows with every iteration, while ctx.globalActionHistory only
  // receives the "remember:" entries.
  let actionHistory = [...ctx.globalActionHistory];
  let lastCacheIndex;
  let anyCacheMiss = false;
  let everHadCacheHit = false;
  const currentExecutionData = [];
  globalLogger.info(`[SmartLoop] Starting for step ${params.stepNumber}: "${params.description}"`);
  try {
    while (loopCount < SMART_LOOP_MAX_ITERATIONS) {
      let screenshot = "";
      let commands = [];
      let isCacheHit = false;

      for (let attempt = 0; attempt < maxCacheAttempts; attempt++) {
        screenshot = await ctx.getScreenshot();
        // Base64 carries 3 bytes per 4 characters.
        const sizeInBytes = screenshot.length * 0.75;
        const sizeInMB = (sizeInBytes / (1024 * 1024)).toFixed(2);
        globalLogger.debug(`[SmartLoop] Captured screenshot: ~${sizeInMB} MB`);
        try {
          globalLogger.debug(`[SmartLoop] Checking cache (Attempt ${attempt + 1}/${maxCacheAttempts})`);
          const cacheResult = await executeFromCache({
            apiKey: ctx.apiKey,
            stepNumber: params.stepNumber,
            stepDescription: params.description,
            screenshot,
            screenResolution: ctx.screenSize,
            highestUsedIndex: lastCacheIndex,
            platform: ctx.platform,
            filepath: params.filepath,
          });
          if (cacheResult.found && cacheResult.cacheCommands) {
            commands = cacheResult.cacheCommands;
            lastCacheIndex = cacheResult.cacheIndex;
            isCacheHit = true;
            everHadCacheHit = true;
            globalLogger.info(`[SmartLoop] Cache Hit! (${commands.length} commands)`);
            break;
          }
        } catch (e) {
          globalLogger.warn(`[SmartLoop] Cache check failed: ${e.message}`);
        }
        if (attempt < maxCacheAttempts - 1) {
          globalLogger.debug(`[SmartLoop] Cache miss, retrying in ${CACHE_CHECK_INTERVAL_MS}ms...`);
          await delay(CACHE_CHECK_INTERVAL_MS);
        }
      }

      let aiCommands = [];
      if (!isCacheHit) {
        anyCacheMiss = true;
        globalLogger.info(`[SmartLoop] Cache Miss. Requesting AI agent...`);
        const agentResponse = await executeAgentStep({
          apiKey: ctx.apiKey,
          base64_screenshot: screenshot,
          instruction: params.instruction,
          action_history: actionHistory,
        });
        aiCommands = agentResponse.appetizeCommands || [];
        const gptCommands = agentResponse.gptCommands || [];
        // Only entries from the first "reasoning:" line onwards are kept as history.
        const reasoningIndex = gptCommands.findIndex((entry) => entry.startsWith("reasoning:"));
        if (reasoningIndex !== -1) {
          const parsedCommands = gptCommands.slice(reasoningIndex);
          const rememberCommands = parsedCommands.filter((cmd) => isRememberCommand(cmd));
          if (rememberCommands.length > 0) {
            ctx.globalActionHistory.push(...rememberCommands);
          }
          actionHistory = [...actionHistory, ...parsedCommands];
        }
        commands = [...aiCommands];
        globalLogger.debug(`[SmartLoop] AI returned ${commands.length} command(s)`);
      }

      currentExecutionData.push({
        screenshot,
        commands: aiCommands.length > 0 ? aiCommands : commands,
      });
      await ctx.logCodeExecution(screenshot, commands.join("\n"));

      let actionExecuted = false;
      let taskCompleted = false;
      if (commands.length > 0) {
        globalLogger.debug(`[SmartLoop] Executing ${commands.length} command(s)`);
      }
      for (const cmd of commands) {
        if (isTaskComplete(cmd)) {
          taskCompleted = true;
          globalLogger.info(`[SmartLoop] Task completed signal received`);
          continue;
        }
        if (isErrorDetected(cmd)) {
          throw new Error(`AI Reported Error: ${cmd}`);
        }
        if (isRememberCommand(cmd)) {
          ctx.globalActionHistory.push(cmd);
        }
        if (isTapCommand(cmd)) {
          const coords = parseTapCoordinates(cmd);
          if (coords) {
            globalLogger.debug(`[SmartLoop] Executing tap at (${coords.x}, ${coords.y})`);
            await ctx.performTap(coords.x, coords.y);
            actionExecuted = true;
          }
        } else if (isWaitCommand(cmd)) {
          const seconds = parseWaitSeconds(cmd);
          if (seconds) {
            globalLogger.debug(`[SmartLoop] Waiting ${seconds}s`);
            await delay(seconds * 1e3);
            actionExecuted = true;
          }
        } else if (isScrollCommand(cmd)) {
          const direction = parseScrollDirection(cmd);
          if (direction) {
            globalLogger.debug(`[SmartLoop] Scrolling ${direction}`);
            await ctx.performScroll(direction);
            actionExecuted = true;
          }
        } else if (isTypeCommand(cmd)) {
          const text = parseTypeText(cmd);
          if (text) {
            globalLogger.debug(`[SmartLoop] Typing text`);
            await ctx.performType(text);
            actionExecuted = true;
          }
        }
      }

      if (actionExecuted) {
        // AI-path commands were already appended to the history above.
        if (isCacheHit) {
          actionHistory.push(...commands);
        }
        await delay(100);
      }

      if (taskCompleted) {
        globalLogger.info(`[SmartLoop] Task completed successfully`);
        if (anyCacheMiss && currentExecutionData.length > 0) {
          globalLogger.info(`[SmartLoop] Populating cache with ${currentExecutionData.length} frame(s)...`);
          try {
            await populateCache({
              apiKey: ctx.apiKey,
              stepNumber: params.stepNumber,
              stepDescription: params.description,
              executionData: currentExecutionData,
              screenResolution: ctx.screenSize,
              platform: ctx.platform,
              filepath: params.filepath,
            });
            globalLogger.debug(`[SmartLoop] Cache populated successfully`);
          } catch (e) {
            globalLogger.warn(`[SmartLoop] Failed to populate cache: ${e.message}`);
          }
        } else if (!anyCacheMiss) {
          globalLogger.debug(`[SmartLoop] Skipping cache population (all actions were cached)`);
        }
        return {
          success: true,
          iterations: loopCount + 1,
          cacheHit: everHadCacheHit,
        };
      }
      loopCount++;
    }
    throw new Error(`Smart Loop timeout after ${SMART_LOOP_MAX_ITERATIONS} iterations`);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    globalLogger.error(`[SmartLoop] Error: ${message}`);
    return {
      success: false,
      error: message,
      // After a timeout loopCount already equals the maximum, so cap the
      // reported count instead of overshooting by one.
      iterations: Math.min(loopCount + 1, SMART_LOOP_MAX_ITERATIONS),
      cacheHit: everHadCacheHit,
    };
  }
}
591
+
18
592
  class GptDriver {
593
+ interpolateTemplate(input, params) {
594
+ if (typeof input !== "string" || !input.includes("{{")) return input;
595
+ const pattern = /{{\s*([^}]+?)\s*}}/g;
596
+ return input.replace(pattern, (_match, keyRaw) => {
597
+ const key = String(keyRaw);
598
+ if (!(key in params)) {
599
+ throw new Error(`Missing flow param: {{${key}}}`);
600
+ }
601
+ return params[key];
602
+ });
603
+ }
19
604
  apiKey;
20
605
  gptDriverSessionId;
21
606
  gptDriverBaseUrl;
22
607
  appiumSessionConfig;
608
+ cachingMode;
23
609
  driver;
610
+ appiumSessionStarted;
611
+ useGptDriverCloud;
612
+ gptDriverCloudConfig;
613
+ buildId;
614
+ testId;
615
+ step_number = 1;
616
+ // Smart loop state - maintains action history across steps for context
617
+ globalActionHistory = [];
24
618
  /**
25
619
  * Creates an instance of the GptDriver class.
26
620
  *
@@ -40,19 +634,34 @@ class GptDriver {
40
634
  * - `device.platform`: The platform name of the device (e.g., iOS, Android).
41
635
  */
42
636
  constructor(config) {
637
+ this.testId = config.testId;
43
638
  this.apiKey = config.apiKey;
44
- this.gptDriverBaseUrl = "https://api.mobileboost.io";
45
- this.initializeDriver(config);
46
- this.initializeAppiumConfig(config);
639
+ this.buildId = config.buildId;
640
+ this.useGptDriverCloud = config.useGptDriverCloud;
641
+ this.gptDriverBaseUrl = GPT_DRIVER_BASE_URL;
642
+ this.cachingMode = config.cachingMode ?? "NONE";
643
+ if (config.useGptDriverCloud) {
644
+ if (config.serverConfig.device?.platform == null) {
645
+ throw new Error("Platform is missing. Please specify the platform when using GPTDriver Cloud.");
646
+ }
647
+ this.gptDriverCloudConfig = {
648
+ platform: config.serverConfig.device.platform,
649
+ deviceName: config.serverConfig.device.deviceName,
650
+ platformVersion: config.serverConfig.device.platformVersion
651
+ };
652
+ } else {
653
+ this.initializeDriver(config);
654
+ this.initializeAppiumConfig(config);
655
+ }
47
656
  }
48
657
  initializeDriver(config) {
49
658
  if (config.driver) {
50
659
  this.driver = config.driver;
51
- if (!config.severConfig?.url) {
660
+ if (!config.serverConfig.url) {
52
661
  throw new Error("Server url is missing. Please specify the server url when providing a driver.");
53
662
  }
54
663
  } else {
55
- const isValidServerConfig = config.severConfig?.url && config.severConfig.device?.platform;
664
+ const isValidServerConfig = config.serverConfig.url && config.serverConfig.device?.platform;
56
665
  if (!isValidServerConfig) {
57
666
  throw new Error("Either provide a driver, or a valid severConfig object.");
58
667
  }
@@ -61,10 +670,10 @@ class GptDriver {
61
670
  initializeAppiumConfig(config) {
62
671
  const defaultPort = parseInt(process.env.APPIUM_PORT ?? "4723", 10);
63
672
  const defaultHost = process.env.APPIUM_HOST ?? "127.0.0.1";
64
- let serverUrl = config.severConfig?.url instanceof URL ? config.severConfig.url : new URL(config.severConfig?.url ?? `http://${defaultHost}:${defaultPort}`);
673
+ const serverUrl = config.serverConfig.url instanceof URL ? config.serverConfig.url : new URL(config.serverConfig.url ?? `http://${defaultHost}:${defaultPort}`);
65
674
  this.appiumSessionConfig = {
66
675
  serverUrl,
67
- ...config.severConfig?.device
676
+ ...config.serverConfig.device
68
677
  };
69
678
  }
70
679
  /**
@@ -74,50 +683,53 @@ class GptDriver {
74
683
  * @throws {Error} If the session cannot be started or the driver is not properly initialized.
75
684
  */
76
685
  async startSession() {
77
- console.log(">> Starting session...");
78
- if (this.driver) {
79
- let platform;
80
- let platformVersion;
81
- let deviceName;
82
- let sessionId;
83
- if (this.driver.sessionId == null) {
84
- const driver = this.driver;
85
- const capabilities = await driver.getCapabilities();
86
- platform = capabilities.get("platformName");
87
- platformVersion = capabilities.get("platformVersion") ?? this.appiumSessionConfig?.platformVersion;
88
- deviceName = this.appiumSessionConfig?.deviceName ?? capabilities.get("deviceName");
89
- const session = await driver.getSession();
90
- sessionId = session.getId();
686
+ globalLogger.info("Starting session...");
687
+ if (!this.useGptDriverCloud) {
688
+ if (this.driver) {
689
+ let platform;
690
+ let platformVersion;
691
+ let deviceName;
692
+ let sessionId;
693
+ if (this.driver.sessionId == null) {
694
+ const driver = this.driver;
695
+ const capabilities = await driver.getCapabilities();
696
+ platform = capabilities.get("platformName");
697
+ platformVersion = capabilities.get("platformVersion") ?? this.appiumSessionConfig?.platformVersion;
698
+ deviceName = this.appiumSessionConfig?.deviceName ?? capabilities.get("deviceName");
699
+ const session = await driver.getSession();
700
+ sessionId = session.getId();
701
+ } else {
702
+ const driver = this.driver;
703
+ platform = driver.capabilities["appium:platformName"] ?? driver.capabilities["platformName"];
704
+ platformVersion = driver.capabilities["appium:platformVersion"] ?? driver.capabilities["platformVersion"];
705
+ deviceName = this.appiumSessionConfig?.deviceName ?? driver.capabilities["appium:deviceName"] ?? driver.capabilities["deviceName"];
706
+ sessionId = driver.sessionId;
707
+ }
708
+ this.appiumSessionConfig = {
709
+ ...this.appiumSessionConfig,
710
+ id: sessionId,
711
+ platform,
712
+ platformVersion,
713
+ deviceName
714
+ };
715
+ globalLogger.debug(`Session config: ${JSON.stringify(this.appiumSessionConfig)}`);
91
716
  } else {
92
- const driver = this.driver;
93
- platform = driver.capabilities["appium:platformName"];
94
- platformVersion = driver.capabilities["appium:platformVersion"];
95
- deviceName = this.appiumSessionConfig?.deviceName ?? driver.capabilities["appium:deviceName"] ?? "";
96
- sessionId = driver.sessionId;
97
- }
98
- this.appiumSessionConfig = {
99
- ...this.appiumSessionConfig,
100
- id: sessionId,
101
- platform,
102
- platformVersion,
103
- deviceName
717
+ this.appiumSessionConfig.id = await this.createSession();
718
+ }
719
+ const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/window/rect`);
720
+ const rectResponse = await axios.get(url);
721
+ this.appiumSessionConfig.size = {
722
+ width: rectResponse.data.value.width,
723
+ height: rectResponse.data.value.height
104
724
  };
105
- } else {
106
- this.appiumSessionConfig.id = await this.createSession();
725
+ this.appiumSessionStarted = true;
107
726
  }
108
727
  await this.createGptDriverSession();
109
- const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/window/rect`);
110
- const rectResponse = await axios.get(
111
- url
112
- );
113
- this.appiumSessionConfig.size = {
114
- width: rectResponse.data.value.width,
115
- height: rectResponse.data.value.height
116
- };
117
- console.log(`>> Session created. Monitor execution at: ${this.getSessionLink()}`);
728
+ globalLogger.info(logStyles.highlight(`Session created. Monitor execution at: ${this.getSessionLink()}`));
118
729
  }
119
730
  async createSession() {
120
731
  const { platform, deviceName, platformVersion, serverUrl } = this.appiumSessionConfig;
732
+ globalLogger.debug(`Creating Appium session for ${platform} ${platformVersion} on ${deviceName}`);
121
733
  const url = buildUrl(serverUrl, `/session`);
122
734
  const response = await axios.post(
123
735
  url,
@@ -132,22 +744,41 @@ class GptDriver {
132
744
  }
133
745
  }
134
746
  );
135
- return response.data.value.sessionId;
747
+ const sessionId = response.data.value.sessionId;
748
+ globalLogger.debug(`Appium session created with ID: ${sessionId}`);
749
+ return sessionId;
136
750
  }
137
751
  async createGptDriverSession() {
752
+ globalLogger.debug("Creating GPT Driver session...");
138
753
  const response = await axios.post(
139
754
  `${this.gptDriverBaseUrl}/sessions/create`,
140
755
  {
756
+ ...this.testId && { test_id: this.testId },
141
757
  api_key: this.apiKey,
142
- appium_session_id: this.appiumSessionConfig.id,
758
+ appium_session_id: this.appiumSessionConfig?.id,
143
759
  device_config: {
144
- platform: this.appiumSessionConfig.platform,
145
- device: this.appiumSessionConfig.deviceName,
146
- os: this.appiumSessionConfig.platformVersion
147
- }
760
+ platform: this.appiumSessionConfig?.platform ?? this.gptDriverCloudConfig?.platform,
761
+ device: this.appiumSessionConfig?.deviceName ?? this.gptDriverCloudConfig?.deviceName,
762
+ os: this.appiumSessionConfig?.platformVersion ?? this.gptDriverCloudConfig?.platformVersion
763
+ },
764
+ use_internal_virtual_device: this.useGptDriverCloud,
765
+ build_id: this.buildId,
766
+ caching_mode: this.cachingMode
148
767
  }
149
768
  );
150
769
  this.gptDriverSessionId = response.data.sessionId;
770
+ globalLogger.debug(`GPT Driver session created with ID: ${this.gptDriverSessionId}`);
771
+ if (this.useGptDriverCloud) {
772
+ const parsedUrl = new URL(response.data.appiumServerUrl);
773
+ this.driver = await attach({
774
+ options: {
775
+ hostname: parsedUrl.hostname,
776
+ path: parsedUrl.pathname
777
+ },
778
+ sessionId: response.data.appiumSessionId
779
+ });
780
+ this.appiumSessionStarted = true;
781
+ }
151
782
  }
152
783
  getSessionLink() {
153
784
  return `https://app.mobileboost.io/gpt-driver/sessions/${this.gptDriverSessionId}`;
@@ -163,20 +794,174 @@ class GptDriver {
163
794
  *
164
795
  * @throws {Error} If the request to stop the session fails.
165
796
  */
166
- async stopSession(status) {
167
- console.log(">> Stopping session...");
168
- await axios.post(
169
- `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/stop`,
797
+ async setSessionStatus(status) {
798
+ if (this.gptDriverSessionId) {
799
+ globalLogger.info(`Stopping session with status: ${status}`);
800
+ await axios.post(
801
+ `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/stop`,
802
+ {
803
+ api_key: this.apiKey,
804
+ status
805
+ }
806
+ );
807
+ globalLogger.info("Session stopped successfully");
808
+ this.appiumSessionStarted = false;
809
+ this.gptDriverSessionId = void 0;
810
+ this.step_number = 1;
811
+ this.globalActionHistory = [];
812
+ }
813
+ }
814
+ // ─────────────────────────────────────────────────────────────────────────────
815
+ // SMART LOOP INTEGRATION
816
+ // ─────────────────────────────────────────────────────────────────────────────
817
+ /**
818
+ * Creates a SmartLoopContext for the current session.
819
+ * This context provides all the callbacks needed by the smart loop executor.
820
+ */
821
+ createSmartLoopContext() {
822
+ return {
823
+ apiKey: this.apiKey,
824
+ platform: this.appiumSessionConfig?.platform,
825
+ screenSize: this.appiumSessionConfig.size,
826
+ globalActionHistory: this.globalActionHistory,
827
+ getScreenshot: () => this.getScreenshot(this.appiumSessionConfig),
828
+ performTap: (x, y) => this.performTap(x, y),
829
+ performScroll: (direction) => this.performScroll(direction),
830
+ performType: (text) => this.performType(text),
831
+ logCodeExecution: async (screenshot, command) => this.logCodeExecution(screenshot, command)
832
+ };
833
+ }
834
+ /**
835
+ * Calls the AI agent to determine the next actions based on the current screenshot.
836
+ * This requires an active GPT Driver session.
837
+ */
838
+ async executeAgentStep(params) {
839
+ const response = await axios.post(
840
+ `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/agent/execute`,
170
841
  {
171
842
  api_key: this.apiKey,
172
- status
843
+ base64_screenshot: params.screenshot.replace(/^data:image\/\w+;base64,/, ""),
844
+ instruction: params.instruction,
845
+ action_history: params.actionHistory
173
846
  }
174
847
  );
175
- const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}`);
176
- await axios.delete(url);
177
- console.log(">> Session stopped.");
178
- this.gptDriverSessionId = void 0;
848
+ return {
849
+ gptCommands: response.data.gpt_commands,
850
+ appetizeCommands: response.data.appetize_commands,
851
+ actionHistory: response.data.action_history
852
+ };
853
+ }
854
+ // ─────────────────────────────────────────────────────────────────────────────
855
+ // DEVICE ACTION METHODS
856
+ // ─────────────────────────────────────────────────────────────────────────────
857
+ async getWdioClient() {
858
+ if (!this.appiumSessionStarted) {
859
+ await this.startSession();
860
+ }
861
+ if (this.driver?.sessionId != null) {
862
+ return this.driver;
863
+ }
864
+ const url = this.appiumSessionConfig.serverUrl;
865
+ const parsed = new URL(url);
866
+ const client = await attach({
867
+ sessionId: this.appiumSessionConfig.id,
868
+ options: {
869
+ protocol: parsed.protocol.replace(":", ""),
870
+ hostname: parsed.hostname,
871
+ port: parsed.port ? Number(parsed.port) : parsed.protocol === "https:" ? 443 : 80,
872
+ path: parsed.pathname && parsed.pathname !== "/" ? parsed.pathname : "/"
873
+ }
874
+ });
875
+ this.driver = client;
876
+ return client;
877
+ }
878
+ /**
879
+ * Performs a tap action at the specified coordinates.
880
+ */
881
+ async performTap(x, y) {
882
+ const client = await this.getWdioClient();
883
+ await client.performActions([
884
+ {
885
+ type: "pointer",
886
+ id: "finger1",
887
+ parameters: { pointerType: "touch" },
888
+ actions: [
889
+ { type: "pointerMove", duration: 0, x, y },
890
+ { type: "pointerDown", button: 0 },
891
+ { type: "pause", duration: 100 },
892
+ { type: "pointerUp", button: 0 }
893
+ ]
894
+ }
895
+ ]);
896
+ }
897
+ async performType(text) {
898
+ const client = await this.getWdioClient();
899
+ await client.keys(text.split(""));
900
+ }
901
+ async performScroll(direction) {
902
+ const client = await this.getWdioClient();
903
+ const w = this.appiumSessionConfig?.size?.width ?? 1080;
904
+ const h = this.appiumSessionConfig?.size?.height ?? 1920;
905
+ const x = Math.round(w / 2);
906
+ const startY = direction === "down" ? Math.round(h * 0.8) : Math.round(h * 0.2);
907
+ const endY = direction === "down" ? Math.round(h * 0.2) : Math.round(h * 0.8);
908
+ await client.performActions([
909
+ {
910
+ type: "pointer",
911
+ id: "finger1",
912
+ parameters: { pointerType: "touch" },
913
+ actions: [
914
+ { type: "pointerMove", duration: 0, x, y: startY },
915
+ { type: "pointerDown", button: 0 },
916
+ { type: "pause", duration: 100 },
917
+ { type: "pointerMove", duration: 500, x, y: endY },
918
+ { type: "pointerUp", button: 0 }
919
+ ]
920
+ }
921
+ ]);
922
+ }
923
+ async getPageSource() {
924
+ const client = await this.getWdioClient();
925
+ return client.getPageSource();
926
+ }
927
+ async performScrollUntil(params) {
928
+ const { direction, text, elementId } = params;
929
+ const max = params.maxScrolls ?? 10;
930
+ for (let i = 0; i < max; i++) {
931
+ const source = await this.getPageSource();
932
+ const found = elementId ? source.includes(elementId) : text ? source.includes(text) : false;
933
+ if (found) {
934
+ return;
935
+ }
936
+ await this.performScroll(direction);
937
+ await this._delay(500);
938
+ }
939
+ throw new Error(`scrollUntil target not found after ${max} scroll(s)`);
940
+ }
941
+ async getScreenshot(appiumSessionConfig) {
942
+ globalLogger.debug("Capturing screenshot...");
943
+ const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/screenshot`);
944
+ const screenshotResponse = await axios.get(url);
945
+ let screenshot = await screenshotResponse.data.value;
946
+ if (appiumSessionConfig.platform === "iOS") {
947
+ globalLogger.debug(`Resizing iOS screenshot to ${appiumSessionConfig.size.width}x${appiumSessionConfig.size.height}`);
948
+ const imageBuffer = Buffer.from(screenshot, "base64");
949
+ const transformedImage = await sharp(imageBuffer).resize(appiumSessionConfig.size.width, appiumSessionConfig.size.height).toBuffer();
950
+ screenshot = transformedImage.toString("base64");
951
+ }
952
+ return screenshot;
953
+ }
954
+ /**
955
+ * Helper method to delay execution.
956
+ *
957
+ * @private
958
+ */
959
+ _delay(ms) {
960
+ return new Promise((resolve) => setTimeout(resolve, ms));
179
961
  }
962
+ // ─────────────────────────────────────────────────────────────────────────────
963
+ // PUBLIC API METHODS
964
+ // ─────────────────────────────────────────────────────────────────────────────
180
965
  /**
181
966
  * Executes a specified command within the WebDriver session, optionally using an Appium handler.
182
967
  *
@@ -184,6 +969,7 @@ class GptDriver {
184
969
  * the command-specific operations. After executing the handler, the executed commands get logged on the GPTDriver servers.
185
970
  * If the handler execution fails or no handler is provided, the command gets executed by the GPTDriver using just natural language.
186
971
  *
972
+ * @deprecated Use `aiExecute()` instead. This method will be removed in a future version.
187
973
  * @param {string} command - The natural language command to be executed by the GPTDriver.
188
974
  * @param {AppiumHandler} [appiumHandler] - An optional function that processes Appium-specific commands.
189
975
  * If provided, this handler is executed instead of calling the GPTDriver serves.
@@ -191,24 +977,136 @@ class GptDriver {
191
977
  * @throws {Error} If an error occurs during the execution of the Appium handler or while processing the command by the GPTDriver.
192
978
  */
193
979
  async execute(command, appiumHandler) {
194
- console.log(">> Executing command:", command);
980
+ globalLogger.warn("Method 'execute()' is deprecated. Please use 'aiExecute()' instead.");
981
+ if (!this.appiumSessionStarted) {
982
+ await this.startSession();
983
+ }
984
+ globalLogger.info(`Executing command: ${command}`);
195
985
  const driver = this.driver;
196
986
  if (appiumHandler != null) {
197
987
  try {
988
+ await this.takeScreenshotAndLogCodeExecution(appiumHandler.toString());
198
989
  await appiumHandler(driver);
199
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
200
- await axios.post(`${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/log_code_execution`, {
201
- api_key: this.apiKey,
202
- base64_screenshot: screenshot,
203
- command: appiumHandler.toString()
204
- });
990
+ globalLogger.debug("Custom Appium handler executed successfully");
205
991
  } catch (e) {
992
+ globalLogger.warn("Custom Appium handler failed, falling back to GPT handler");
206
993
  await this.gptHandler(command);
207
994
  }
208
995
  } else {
209
996
  await this.gptHandler(command);
210
997
  }
211
998
  }
999
+ /**
1000
+ * Executes a specified command within the WebDriver session with configurable caching options.
1001
+ *
1002
+ * This is the recommended method for executing commands. It provides fine-grained control over
1003
+ * caching behavior, allowing you to optimize performance and costs for repetitive test scenarios.
1004
+ *
1005
+ * If an `appiumHandler` is provided, it will be invoked with the WebDriver instance to perform
1006
+ * the command-specific operations. After executing the handler, the executed commands get logged
1007
+ * on the GPTDriver servers. If the handler execution fails or no handler is provided, the command
1008
+ * gets executed by the GPTDriver using natural language processing.
1009
+ *
1010
+ * @param {Object} params - The execution parameters
1011
+ * @param {string} params.command - The natural language command to be executed by the GPTDriver.
1012
+ * Examples: "Click the login button", "Enter 'test@example.com' in the email field"
1013
+ * @param {AppiumHandler} [params.appiumHandler] - An optional function that processes Appium-specific commands.
1014
+ * If provided, this handler is executed instead of calling
1015
+ * the GPTDriver API. Useful for performance optimization when
1016
+ * you know the exact Appium commands to execute.
1017
+ * @param {CachingMode} [params.cachingMode] - Controls how the GPTDriver caches this command execution.
1018
+ * If not specified, uses the global caching mode set in the constructor.
1019
+ * Options:
1020
+ * - "NONE"
1021
+ * - "FULL_SCREEN"
1022
+ * - "INTERACTION_REGION"
1023
+ * @param {boolean} [params.useSmartLoop] - If true, uses the smart loop execution (Cache -> AI -> Execute -> Populate)
1024
+ * which optimizes execution by checking cache first and populating it after.
1025
+ * Default: false (uses legacy gptHandler)
1026
+ *
1027
+ * @returns {Promise<void>} A promise that resolves when the command execution is complete.
1028
+ *
1029
+ * @throws {Error} If an error occurs during the execution of the Appium handler or while processing
1030
+ * the command by the GPTDriver.
1031
+ *
1032
+ * @example
1033
+ * // Basic usage with natural language (no caching)
1034
+ * await driver.aiExecute({
1035
+ * command: "Click the submit button"
1036
+ * });
1037
+ *
1038
+ * @example
1039
+ * // Full screen caching for repetitive navigation on similar screens
1040
+ * await driver.aiExecute({
1041
+ * command: "Navigate to the settings page",
1042
+ * cachingMode: "FULL_SCREEN"
1043
+ * });
1044
+ *
1045
+ * @example
1046
+ * // Interaction region caching for repeated actions on the same button
1047
+ * await driver.aiExecute({
1048
+ * command: "Click the login button",
1049
+ * cachingMode: "INTERACTION_REGION"
1050
+ * });
1051
+ *
1052
+ * @example
1053
+ * // With custom Appium handler as fallback
1054
+ * await driver.aiExecute({
1055
+ * command: "Click the login button",
1056
+ * appiumHandler: async (driver) => {
1057
+ * const loginBtn = await driver.$('~loginButton');
1058
+ * await loginBtn.click();
1059
+ * },
1060
+ * cachingMode: "INTERACTION_REGION"
1061
+ * });
1062
+ *
1063
+ * @example
1064
+ * // Force fresh execution for dynamic content
1065
+ * await driver.aiExecute({
1066
+ * command: "Verify the current timestamp",
1067
+ * cachingMode: "NONE"
1068
+ * });
1069
+ *
1070
+ * @example
1071
+ * // Using smart loop for optimized caching
1072
+ * await driver.aiExecute({
1073
+ * command: "Click the login button",
1074
+ * useSmartLoop: true,
1075
+ * cachingMode: "FULL_SCREEN"
1076
+ * });
1077
+ */
1078
+ async aiExecute({ command, appiumHandler, cachingMode, useSmartLoop = false }) {
1079
+ if (!this.appiumSessionStarted) {
1080
+ await this.startSession();
1081
+ }
1082
+ globalLogger.info(`Executing command: ${command}`);
1083
+ const driver = this.driver;
1084
+ if (appiumHandler != null) {
1085
+ try {
1086
+ await this.takeScreenshotAndLogCodeExecution(appiumHandler.toString());
1087
+ await appiumHandler(driver);
1088
+ globalLogger.debug("Custom Appium handler executed successfully");
1089
+ this.step_number++;
1090
+ return;
1091
+ } catch (e) {
1092
+ globalLogger.warn("Custom Appium handler failed, falling back to AI execution");
1093
+ }
1094
+ }
1095
+ if (useSmartLoop) {
1096
+ const ctx = this.createSmartLoopContext();
1097
+ const result = await executeSmartLoop(ctx, {
1098
+ stepNumber: this.step_number,
1099
+ description: command,
1100
+ instruction: command
1101
+ });
1102
+ if (!result.success) {
1103
+ throw new Error(result.error || "Smart loop execution failed");
1104
+ }
1105
+ this.step_number++;
1106
+ } else {
1107
+ await this.gptHandler(command, cachingMode);
1108
+ }
1109
+ }
212
1110
  /**
213
1111
  * Asserts a single condition using the GPTDriver.
214
1112
  *
@@ -216,13 +1114,25 @@ class GptDriver {
216
1114
  * If the assertion fails, an error is thrown.
217
1115
  *
218
1116
  * @param {string} assertion - The condition to be asserted.
1117
+ * @param cachingMode - The caching mode to be used for the assertion.
219
1118
  * @throws {Error} If the assertion fails.
220
1119
  */
221
- async assert(assertion) {
222
- console.log(">> Asserting:", assertion);
223
- const results = await this.checkBulk([assertion]);
224
- if (!Object.values(results).at(0)) {
225
- throw new Error(`Failed assertion: ${assertion}`);
1120
+ async assert(assertion, cachingMode) {
1121
+ if (!this.appiumSessionStarted) {
1122
+ await this.startSession();
1123
+ }
1124
+ try {
1125
+ const results = await this.checkBulk([assertion], cachingMode);
1126
+ if (!Object.values(results).at(0)) {
1127
+ await this.setSessionStatus("failed");
1128
+ globalLogger.error(`Assertion failed: ${assertion}`);
1129
+ throw new Error(`Failed assertion: ${assertion}`);
1130
+ }
1131
+ this.step_number = this.step_number + 1;
1132
+ globalLogger.info(`Assertion passed: ${assertion}`);
1133
+ } catch (e) {
1134
+ await this.setSessionStatus("failed");
1135
+ throw e;
226
1136
  }
227
1137
  }
228
1138
  /**
@@ -232,43 +1142,100 @@ class GptDriver {
232
1142
  * If any assertion fails, an error is thrown listing all failed assertions.
233
1143
  *
234
1144
  * @param {string[]} assertions - An array of conditions to be asserted.
1145
+ * @param cachingMode - The caching mode to be used for the assertions.
235
1146
  * @throws {Error} If any of the assertions fail.
236
1147
  */
237
- async assertBulk(assertions) {
238
- console.log(">> Asserting:", assertions);
239
- const results = await this.checkBulk(assertions);
240
- const failedAssertions = Object.values(results).reduce((prev, current, currentIndex) => {
241
- if (!current) {
242
- return [...prev, assertions.at(currentIndex)];
1148
+ async assertBulk(assertions, cachingMode) {
1149
+ if (!this.appiumSessionStarted) {
1150
+ await this.startSession();
1151
+ }
1152
+ try {
1153
+ const results = await this.checkBulk(assertions, cachingMode);
1154
+ const failedAssertions = Object.values(results).reduce((prev, current, currentIndex) => {
1155
+ if (!current) {
1156
+ return [...prev, assertions.at(currentIndex)];
1157
+ }
1158
+ return prev;
1159
+ }, []);
1160
+ if (failedAssertions.length > 0) {
1161
+ await this.setSessionStatus("failed");
1162
+ globalLogger.error(`Multiple assertions failed: ${failedAssertions.join(", ")}`);
1163
+ throw new Error(`Failed assertions: ${failedAssertions.join(", ")}`);
243
1164
  }
244
- return prev;
245
- }, []);
246
- if (failedAssertions.length > 0) {
247
- throw new Error(`Failed assertions: ${failedAssertions.join(", ")}`);
1165
+ this.step_number = this.step_number + 1;
1166
+ globalLogger.info(`All ${assertions.length} assertions passed`);
1167
+ } catch (e) {
1168
+ await this.setSessionStatus("failed");
1169
+ throw e;
248
1170
  }
249
1171
  }
250
1172
  /**
251
1173
  * Checks multiple conditions and returns their results using the GPTDriver.
252
1174
  *
253
1175
  * This method sends a bulk condition request and returns the results of the conditions.
1176
+ * Failed conditions will be retried up to maxRetries times.
254
1177
  *
255
1178
  * @param {string[]} conditions - An array of conditions to be checked.
1179
+ * @param {CachingMode} cachingMode - The caching mode to be used for the conditions.
1180
+ * @param {number} maxRetries - The maximum number of retries if any condition fails (default: 2).
1181
+ * @param {number} retryDelayMs - The delay in milliseconds between retries (default: 1000).
256
1182
  * @returns {Promise<Record<string, boolean>>} A promise that resolves with an object mapping each condition
257
1183
  * to a boolean indicating whether the condition was met.
258
1184
  */
259
- async checkBulk(conditions) {
260
- console.log(">> Checking:", conditions);
261
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
262
- const response = await axios.post(
263
- `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/assert`,
264
- {
265
- api_key: this.apiKey,
266
- base64_screenshot: screenshot,
267
- assertions: conditions,
268
- command: `Assert: ${JSON.stringify(conditions)}`
1185
+ async checkBulk(conditions, cachingMode, maxRetries = 2, retryDelayMs = 1e3) {
1186
+ let attempt = 0;
1187
+ let results = {};
1188
+ while (attempt <= maxRetries) {
1189
+ results = await this._checkBulkOnce(conditions, cachingMode, attempt);
1190
+ const failedConditions = Object.entries(results).filter(([_, success]) => !success).map(([key, _]) => key);
1191
+ if (failedConditions.length === 0) {
1192
+ return results;
269
1193
  }
270
- );
271
- return response.data.results;
1194
+ attempt++;
1195
+ if (attempt <= maxRetries) {
1196
+ globalLogger.info(
1197
+ `>> Conditions failed ${JSON.stringify(failedConditions)}. Retrying in ${retryDelayMs}ms... (Attempt ${attempt}/${maxRetries})`
1198
+ );
1199
+ await this._delay(retryDelayMs);
1200
+ } else {
1201
+ globalLogger.info(`>> Conditions failed: ${JSON.stringify(failedConditions)}`);
1202
+ }
1203
+ }
1204
+ return results;
1205
+ }
1206
+ /**
1207
+ * Internal method to check conditions once without retry logic.
1208
+ *
1209
+ * @private
1210
+ */
1211
+ async _checkBulkOnce(conditions, cachingMode, attempt = 0) {
1212
+ if (!this.appiumSessionStarted) {
1213
+ await this.startSession();
1214
+ }
1215
+ globalLogger.info(`Checking conditions (attempt ${attempt}): ${conditions.join(", ")}`);
1216
+ try {
1217
+ let screenshot;
1218
+ if (!this.useGptDriverCloud) {
1219
+ screenshot = await this.getScreenshot(this.appiumSessionConfig);
1220
+ }
1221
+ const response = await axios.post(
1222
+ `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/assert`,
1223
+ {
1224
+ api_key: this.apiKey,
1225
+ base64_screenshot: screenshot,
1226
+ assertions: conditions,
1227
+ command: `Assert: ${JSON.stringify(conditions)}`,
1228
+ caching_mode: cachingMode ?? this.cachingMode,
1229
+ step_number: this.step_number
1230
+ }
1231
+ );
1232
+ globalLogger.debug(`Check results: ${JSON.stringify(response.data.results)}`);
1233
+ return response.data.results;
1234
+ } catch (e) {
1235
+ globalLogger.error("Failed to check conditions", e);
1236
+ await this.setSessionStatus("failed");
1237
+ throw e;
1238
+ }
272
1239
  }
273
1240
  /**
274
1241
  * Extracts specified information using the GPTDriver.
@@ -278,30 +1245,305 @@ class GptDriver {
278
1245
  *
279
1246
  * @param {string[]} extractions - An array of extraction criteria. Each criterion specifies what information
280
1247
  * should be extracted from the session.
1248
+ * @param cachingMode - The caching mode to be used for the extraction.
281
1249
  * @returns {Promise<Record<string, any>>} A promise that resolves with an object mapping each extraction criterion
282
1250
  * to the extracted data. The structure of the returned data depends on the
283
1251
  * specifics of the extraction criteria.
284
1252
  */
285
- async extract(extractions) {
286
- console.log(">> Extracting:", extractions);
287
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
1253
+ async extract(extractions, cachingMode) {
1254
+ if (!this.appiumSessionStarted) {
1255
+ await this.startSession();
1256
+ }
1257
+ globalLogger.info(`Extracting data: ${extractions.join(", ")}`);
1258
+ let screenshot;
1259
+ if (!this.useGptDriverCloud) {
1260
+ screenshot = await this.getScreenshot(this.appiumSessionConfig);
1261
+ }
288
1262
  const response = await axios.post(
289
1263
  `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/extract`,
290
1264
  {
291
1265
  api_key: this.apiKey,
292
1266
  base64_screenshot: screenshot,
293
1267
  extractions,
294
- command: `Extract: ${JSON.stringify(extractions)}`
1268
+ command: `Extract: ${JSON.stringify(extractions)}`,
1269
+ step_number: this.step_number
295
1270
  }
296
1271
  );
1272
+ this.step_number = this.step_number + 1;
1273
+ globalLogger.debug(`Extraction results: ${JSON.stringify(response.data.results)}`);
297
1274
  return response.data.results;
298
1275
  }
299
- async gptHandler(command) {
1276
+ /**
1277
+ * Opens a deep link url in the Appium session.
1278
+ *
1279
+ * This method sends a request to the GPT Driver server to open a deep link url in the Appium session.
1280
+ *
1281
+ * @param {OpenDeepLinkUrlParams} params - The parameters for opening the deep link url.
1282
+ * @returns {Promise<void>} A promise that resolves when the deep link url is opened.
1283
+ */
1284
+ async openDeepLinkUrl(params) {
1285
+ if (!this.appiumSessionStarted) {
1286
+ await this.startSession();
1287
+ }
1288
+ globalLogger.info(`Opening deep link: ${params.url}`);
1289
+ if (params.package == null && this.appiumSessionConfig?.platform === "Android") {
1290
+ throw new Error("Package is required for Android platform");
1291
+ }
1292
+ await this.executeCommand(
1293
+ {
1294
+ url: `http://localhost:4723/session/${this.appiumSessionConfig?.id}/execute/sync`,
1295
+ method: "POST",
1296
+ data: {
1297
+ "script": "mobile:deepLink",
1298
+ "args": [{
1299
+ url: params.url,
1300
+ ...params.bundleId && { bundleId: params.bundleId },
1301
+ ...params.package && { package: params.package }
1302
+ }]
1303
+ }
1304
+ }
1305
+ );
1306
+ this.step_number = this.step_number + 1;
1307
+ globalLogger.debug("Deep link opened successfully");
1308
+ }
1309
+ /**
1310
+ * Reads a flow JSON file from disk and validates it using the SavableTestStoreSchema.
1311
+ *
1312
+ * Returns the parsed and validated object on success; throws a detailed error on failure.
1313
+ *
1314
+ * @param filePath - Path to the flow file (JSON)
1315
+ * @param options - Optional execution options
1316
+ * @param options.useSmartLoop - If true, uses the smart loop execution (Cache -> AI -> Execute -> Populate)
1317
+ * for AI, tap, and assert steps. This optimizes execution by checking cache
1318
+ * first and populating it after successful execution. Default: false
1319
+ * @returns The validated flow data
1320
+ *
1321
+ * @example
1322
+ * // Execute flow with default settings (legacy gptHandler)
1323
+ * const result = await driver.executeFlow('tests/login-flow.json');
1324
+ *
1325
+ * @example
1326
+ * // Execute flow with smart loop enabled for optimized caching
1327
+ * const result = await driver.executeFlow('tests/login-flow.json', { useSmartLoop: true });
1328
+ */
1329
+ async executeFlow(filePath, options) {
1330
+ const useSmartLoop = options?.useSmartLoop ?? false;
1331
+ globalLogger.info(`Loading flow from file: ${filePath}`);
1332
+ const absolutePath = path.resolve(filePath);
1333
+ const baseDir = path.dirname(absolutePath);
1334
+ let raw;
1335
+ try {
1336
+ raw = await promises.readFile(absolutePath, "utf-8");
1337
+ } catch (e) {
1338
+ const msg = `Failed to read file at ${filePath}: ${e?.message ?? e}`;
1339
+ globalLogger.error(msg);
1340
+ throw new Error(msg);
1341
+ }
1342
+ let json;
1343
+ try {
1344
+ json = JSON.parse(raw);
1345
+ } catch (e) {
1346
+ const msg = `Invalid JSON in flow file ${filePath}: ${e?.message ?? e}`;
1347
+ globalLogger.error(msg);
1348
+ throw new Error(msg);
1349
+ }
1350
+ const parsed = SavableTestStoreSchema.safeParse(json);
1351
+ if (!parsed.success) {
1352
+ const issues = parsed.error.issues.map((iss) => `- ${iss.path.join(".") || "<root>"}: ${iss.message}`).join("\n");
1353
+ const msg = `Flow validation failed for ${filePath}:
1354
+ ${issues}`;
1355
+ globalLogger.error(msg);
1356
+ throw new Error(msg);
1357
+ }
1358
+ const rootFlow = parsed.data;
1359
+ globalLogger.info(`Flow file validated successfully: ${filePath}`);
1360
+ const visited = /* @__PURE__ */ new Set();
1361
+ const loadFlow = async (p) => {
1362
+ const abs = path.isAbsolute(p) ? p : path.resolve(baseDir, p);
1363
+ const rawChild = await promises.readFile(abs, "utf-8");
1364
+ const childJson = JSON.parse(rawChild);
1365
+ const val = SavableTestStoreSchema.safeParse(childJson);
1366
+ if (!val.success) {
1367
+ const issues = val.error.issues.map((iss) => `- ${iss.path.join(".") || "<root>"}: ${iss.message}`).join("\n");
1368
+ throw new Error(`Flow validation failed for referenced file ${abs}:
1369
+ ${issues}`);
1370
+ }
1371
+ return val.data;
1372
+ };
1373
+ const expandSteps = async (steps, inheritedParams, parentDir, stack) => {
1374
+ const out = [];
1375
+ for (const step of steps) {
1376
+ if (step.type === "fileRef") {
1377
+ const refPath = path.isAbsolute(step.path) ? step.path : path.resolve(parentDir, step.path);
1378
+ const refKey = path.normalize(refPath);
1379
+ if (visited.has(refKey)) {
1380
+ const cycle = [...stack, refKey].map((p) => path.basename(p)).join(" -> ");
1381
+ throw new Error(`Detected circular fileRef: ${cycle}`);
1382
+ }
1383
+ visited.add(refKey);
1384
+ const child = await loadFlow(refPath);
1385
+ const mergedParams = { ...inheritedParams, ...step.overrides ?? {} };
1386
+ const childDir = path.dirname(refPath);
1387
+ const childExpanded = await expandSteps(child.steps, mergedParams, childDir, [...stack, refKey]);
1388
+ out.push(...childExpanded);
1389
+ } else {
1390
+ const resolved = { ...step, __params: { ...inheritedParams } };
1391
+ out.push(resolved);
1392
+ }
1393
+ }
1394
+ return out;
1395
+ };
1396
+ const effectiveParams = { ...rootFlow.params ?? {} };
1397
+ const expandedSteps = await expandSteps(rootFlow.steps, effectiveParams, baseDir, [absolutePath]);
1398
+ if (!this.appiumSessionStarted) {
1399
+ await this.startSession();
1400
+ }
1401
+ globalLogger.info(`Executing flow '${rootFlow.name}' with ${expandedSteps.length} step(s)...`);
1402
+ let executed = 0;
1403
+ try {
1404
+ for (const step of expandedSteps) {
1405
+ const params = step.__params ?? effectiveParams;
1406
+ const prefix = `Step #${executed + 1} [${step.type}${step.optional ? ", optional" : ""}]`;
1407
+ try {
1408
+ switch (step.type) {
1409
+ case "ai": {
1410
+ const instruction = this.interpolateTemplate(step.instruction, params);
1411
+ globalLogger.info(`${prefix}: ${instruction}`);
1412
+ if (useSmartLoop) {
1413
+ const ctx = this.createSmartLoopContext();
1414
+ const result = await executeSmartLoop(ctx, {
1415
+ stepNumber: this.step_number,
1416
+ description: instruction,
1417
+ instruction
1418
+ });
1419
+ if (!result.success) {
1420
+ throw new Error(result.error || "Smart loop execution failed");
1421
+ }
1422
+ this.step_number++;
1423
+ } else {
1424
+ await this.aiExecute({ command: instruction });
1425
+ }
1426
+ break;
1427
+ }
1428
+ case "tap": {
1429
+ const description = step.descriptionText ? this.interpolateTemplate(step.descriptionText, params) : void 0;
1430
+ if (!description) {
1431
+ throw new Error("Tap step requires a descriptionText. Coordinate-based taps are no longer supported.");
1432
+ }
1433
+ globalLogger.info(`${prefix}: ${description}`);
1434
+ if (useSmartLoop) {
1435
+ const ctx = this.createSmartLoopContext();
1436
+ const result = await executeSmartLoop(ctx, {
1437
+ stepNumber: this.step_number,
1438
+ description,
1439
+ instruction: description
1440
+ });
1441
+ if (!result.success) {
1442
+ throw new Error(result.error || "Smart loop execution failed");
1443
+ }
1444
+ this.step_number++;
1445
+ } else {
1446
+ await this.aiExecute({ command: description });
1447
+ }
1448
+ break;
1449
+ }
1450
+ case "assert": {
1451
+ const description = step.descriptionText ? this.interpolateTemplate(step.descriptionText, params) : void 0;
1452
+ if (!description) {
1453
+ throw new Error("Assert step requires a descriptionText. Coordinate-based assertions are no longer supported.");
1454
+ }
1455
+ globalLogger.info(`${prefix}: ${description}`);
1456
+ if (useSmartLoop) {
1457
+ const instruction = `Verify that: ${description}`;
1458
+ const ctx = this.createSmartLoopContext();
1459
+ const result = await executeSmartLoop(ctx, {
1460
+ stepNumber: this.step_number,
1461
+ description,
1462
+ instruction
1463
+ });
1464
+ if (!result.success) {
1465
+ throw new Error(result.error || "Smart loop execution failed");
1466
+ }
1467
+ this.step_number++;
1468
+ } else {
1469
+ await this.assert(description);
1470
+ }
1471
+ break;
1472
+ }
1473
+ case "type": {
1474
+ const text = this.interpolateTemplate(step.text, params);
1475
+ globalLogger.info(`${prefix}: Type text`);
1476
+ await this.takeScreenshotAndLogCodeExecution(`type: text=${text}`);
1477
+ await this.performType(text);
1478
+ this.step_number++;
1479
+ break;
1480
+ }
1481
+ case "scroll": {
1482
+ globalLogger.info(`${prefix}: Scroll ${step.direction}`);
1483
+ await this.takeScreenshotAndLogCodeExecution(`scroll: direction=${step.direction}`);
1484
+ await this.performScroll(step.direction);
1485
+ this.step_number++;
1486
+ break;
1487
+ }
1488
+ case "zoom": {
1489
+ globalLogger.info(`${prefix}: Zoom ${step.direction}`);
1490
+ await this.takeScreenshotAndLogCodeExecution(`zoom: direction=${step.direction}`);
1491
+ this.step_number++;
1492
+ break;
1493
+ }
1494
+ case "scrollUntil": {
1495
+ const interpolatedText = step.text != null ? this.interpolateTemplate(step.text, params) : void 0;
1496
+ globalLogger.info(`${prefix}: Scroll until ${interpolatedText ?? step.elementId}`);
1497
+ await this.takeScreenshotAndLogCodeExecution(`scrollUntil: text=${interpolatedText}, elementId=${step.elementId}`);
1498
+ await this.performScrollUntil({
1499
+ direction: step.direction,
1500
+ text: interpolatedText,
1501
+ elementId: step.elementId,
1502
+ maxScrolls: step.maxScrolls
1503
+ });
1504
+ this.step_number++;
1505
+ break;
1506
+ }
1507
+ case "deeplink": {
1508
+ const pkg = params["package"];
1509
+ const bundleId = params["bundleId"];
1510
+ const url = this.interpolateTemplate(step.url, params);
1511
+ globalLogger.info(`${prefix}: Open deeplink ${url}`);
1512
+ await this.takeScreenshotAndLogCodeExecution(`openDeepLinkUrl: url=${url}`);
1513
+ await this.openDeepLinkUrl({ url, package: pkg, bundleId });
1514
+ break;
1515
+ }
1516
+ default: {
1517
+ throw new Error(`Unsupported step type at execution: ${step.type}`);
1518
+ }
1519
+ }
1520
+ executed++;
1521
+ } catch (err) {
1522
+ if (step.optional) {
1523
+ globalLogger.warn(`${prefix} failed but marked optional. Continuing. Error: ${err.message}`);
1524
+ continue;
1525
+ }
1526
+ throw err;
1527
+ }
1528
+ }
1529
+ } catch (e) {
1530
+ try {
1531
+ await this.setSessionStatus("failed");
1532
+ } catch {
1533
+ }
1534
+ throw e;
1535
+ }
1536
+ return rootFlow;
1537
+ }
1538
+ async gptHandler(command, cachingMode) {
300
1539
  try {
301
1540
  let conditionSucceeded = false;
302
1541
  while (!conditionSucceeded) {
303
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
304
- console.log(">> Asking GTP Driver for next action...");
1542
+ let screenshot;
1543
+ if (!this.useGptDriverCloud) {
1544
+ screenshot = await this.getScreenshot(this.appiumSessionConfig);
1545
+ }
1546
+ globalLogger.info("Requesting next action from GPT Driver...");
305
1547
  const response = await axios.request(
306
1548
  {
307
1549
  url: `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/execute`,
@@ -309,39 +1551,49 @@ class GptDriver {
309
1551
  data: {
310
1552
  api_key: this.apiKey,
311
1553
  command,
312
- base64_screenshot: screenshot
1554
+ base64_screenshot: screenshot,
1555
+ caching_mode: cachingMode ?? this.cachingMode,
1556
+ step_number: this.step_number
313
1557
  }
314
1558
  }
315
1559
  );
316
1560
  const executeStatus = response.data.status;
317
1561
  if (executeStatus === "failed") {
318
- const errorMessage = response?.data?.commands?.at(0)?.data;
1562
+ const errorMessage = response.data?.commands?.at(0)?.data;
1563
+ globalLogger.error(`Execution failed: ${errorMessage ?? "Unknown error"}`);
319
1564
  throw new Error(errorMessage ?? "Execution failed");
320
1565
  }
321
1566
  conditionSucceeded = executeStatus !== "inProgress";
322
1567
  const executeResponse = response.data;
323
- for (const command2 of executeResponse.commands) {
324
- await this.executeCommand(command2);
1568
+ globalLogger.debug(`Received ${executeResponse.commands.length} command(s) to execute`);
1569
+ for (const appiumCommand of executeResponse.commands) {
1570
+ await this.executeCommand(appiumCommand);
325
1571
  }
326
1572
  if (!conditionSucceeded) {
1573
+ globalLogger.debug("Command still in progress, waiting...");
327
1574
  await delay(1500);
328
1575
  }
329
1576
  }
1577
+ this.step_number = this.step_number + 1;
1578
+ globalLogger.info("Command execution completed successfully");
330
1579
  } catch (e) {
331
- await this.stopSession("failed");
1580
+ globalLogger.error("GPT handler failed", e);
1581
+ await this.setSessionStatus("failed");
332
1582
  throw e;
333
1583
  }
334
1584
  }
335
1585
  async executeCommand(command) {
336
- const firstAction = command.data.actions?.at(0);
1586
+ const firstAction = command.data?.actions?.at(0);
337
1587
  if (firstAction?.type === "pause" && firstAction.duration != null) {
1588
+ globalLogger.debug(`Pausing for ${firstAction.duration} seconds`);
338
1589
  await delay(firstAction * 1e3);
339
- } else {
1590
+ } else if (!this.useGptDriverCloud) {
340
1591
  const parsedUrl = new URL(command.url);
341
1592
  parsedUrl.protocol = this.appiumSessionConfig.serverUrl.protocol;
342
1593
  parsedUrl.host = this.appiumSessionConfig.serverUrl.host;
343
1594
  parsedUrl.port = this.appiumSessionConfig.serverUrl.port != "" ? `${this.appiumSessionConfig.serverUrl.port}` : "";
344
1595
  parsedUrl.pathname = this.appiumSessionConfig.serverUrl.pathname != "/" ? `${this.appiumSessionConfig.serverUrl.pathname}${parsedUrl.pathname}` : parsedUrl.pathname;
1596
+ globalLogger.debug(`Executing ${command.method} request to ${parsedUrl.pathname}`);
345
1597
  await axios.request({
346
1598
  url: parsedUrl.toString(),
347
1599
  method: command.method,
@@ -349,16 +1601,25 @@ class GptDriver {
349
1601
  });
350
1602
  }
351
1603
  }
352
- async getScreenshot(appiumSessionConfig) {
353
- const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/screenshot`);
354
- const screenshotResponse = await axios.get(url);
355
- let screenshot = await screenshotResponse.data.value;
356
- if (appiumSessionConfig.platform === "iOS") {
357
- const imageBuffer = Buffer.from(screenshot, "base64");
358
- const transformedImage = await sharp(imageBuffer).resize(appiumSessionConfig.size.width, appiumSessionConfig.size.height).toBuffer();
359
- screenshot = transformedImage.toString("base64");
1604
+ async logCodeExecution(screenshot, command) {
1605
+ try {
1606
+ const screenshot2 = await this.getScreenshot(this.appiumSessionConfig);
1607
+ await axios.post(`${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/log_code_execution`, {
1608
+ api_key: this.apiKey,
1609
+ base64_screenshot: screenshot2,
1610
+ command
1611
+ });
1612
+ } catch (e) {
1613
+ globalLogger.error("Failed to log code execution", e);
1614
+ }
1615
+ }
1616
+ async takeScreenshotAndLogCodeExecution(command) {
1617
+ try {
1618
+ const screenshot = await this.getScreenshot(this.appiumSessionConfig);
1619
+ await this.logCodeExecution(screenshot, command);
1620
+ } catch (e) {
1621
+ globalLogger.error("Failed to log code execution", e);
360
1622
  }
361
- return screenshot;
362
1623
  }
363
1624
  }
364
1625