gpt-driver-node 1.0.0-alpha.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1,7 +1,13 @@
1
1
  'use strict';
2
2
 
3
+ var node_fs = require('node:fs');
4
+ var path = require('node:path');
3
5
  var axios = require('axios');
4
6
  var sharp = require('sharp');
7
+ var webdriverio = require('webdriverio');
8
+ var winston = require('winston');
9
+ var zod = require('zod');
10
+ var crypto = require('node:crypto');
5
11
 
6
12
  const delay = async (milliseconds) => {
7
13
  await new Promise((resolve) => setTimeout(resolve, milliseconds));
@@ -17,12 +23,600 @@ function buildUrl(base, extraPath) {
17
23
  return `${baseUrl}${extraPath}`;
18
24
  }
19
25
 
26
/** ANSI SGR escape sequences used to colour console log output. */
const colors = {
  reset: "\x1B[0m",
  bold: "\x1B[1m",
  gray: "\x1B[90m",
  red: "\x1B[31m",
  green: "\x1B[32m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m"
};

/**
 * Text decorators built on `colors`. Each function wraps `text` in the
 * matching escape sequence(s) and always appends `colors.reset`, so styling
 * never leaks into whatever is printed next.
 */
const logStyles = {
  bold: (text) => `${colors.bold}${text}${colors.reset}`,
  cyan: (text) => `${colors.cyan}${text}${colors.reset}`,
  yellow: (text) => `${colors.yellow}${text}${colors.reset}`,
  green: (text) => `${colors.green}${text}${colors.reset}`,
  red: (text) => `${colors.red}${text}${colors.reset}`,
  gray: (text) => `${colors.gray}${text}${colors.reset}`,
  // Compound styles: bold + one colour.
  highlight: (text) => `${colors.bold}${colors.cyan}${text}${colors.reset}`,
  success: (text) => `${colors.bold}${colors.green}${text}${colors.reset}`,
  error: (text) => `${colors.bold}${colors.red}${text}${colors.reset}`,
  warning: (text) => `${colors.bold}${colors.yellow}${text}${colors.reset}`
};
47
/**
 * Shared winston logger for the whole module.
 *
 * The level comes from the GPT_DRIVER_LOG_LEVEL environment variable and
 * falls back to "info". The logger-level format adds a timestamp and error
 * stack; the Console transport then renders the same fields with ANSI colours.
 */
const globalLogger = winston.createLogger({
  level: process.env.GPT_DRIVER_LOG_LEVEL || "info",
  format: winston.format.combine(
    winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
    winston.format.errors({ stack: true }),
    winston.format.printf(({ timestamp, level, message, stack }) => {
      const logMessage = `${timestamp} [${level.toUpperCase()}]: ${message}`;
      // Append the stack trace on its own line when one is present.
      return stack ? `${logMessage}\n${stack}` : logMessage;
    })
  ),
  transports: [
    new winston.transports.Console({
      format: winston.format.combine(
        winston.format.printf(({ timestamp, level, message, stack }) => {
          let coloredLevel = level.toUpperCase();
          // Levels other than these four are printed uncoloured.
          switch (level) {
            case "error":
              coloredLevel = logStyles.error(coloredLevel);
              break;
            case "warn":
              coloredLevel = logStyles.warning(coloredLevel);
              break;
            case "info":
              coloredLevel = logStyles.cyan(coloredLevel);
              break;
            case "debug":
              coloredLevel = logStyles.gray(coloredLevel);
              break;
          }
          const formattedTimestamp = logStyles.gray(timestamp);
          const logMessage = `${formattedTimestamp} [${coloredLevel}]: ${message}`;
          return stack ? `${logMessage}\n${logStyles.gray(stack)}` : logMessage;
        })
      )
    })
  ]
});
86
+
87
// Zod schemas describing a saved test flow: a named list of typed steps.

/** Fields shared by every step type. All are optional. */
const SavableStepBaseSchema = zod.z.object({
  id: zod.z.number().optional(),
  descriptionText: zod.z.string().optional(),
  optional: zod.z.boolean().optional()
});

/** `type: "tap"` step. */
const SavableTapStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("tap"),
  elementId: zod.z.string().optional(),
  timeout: zod.z.number().optional(),
  useLlmOnly: zod.z.boolean().optional(),
  cropBase64: zod.z.string().optional()
});

/** `type: "assert"` step; carries the same optional fields as a tap step. */
const SavableAssertStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("assert"),
  elementId: zod.z.string().optional(),
  timeout: zod.z.number().optional(),
  useLlmOnly: zod.z.boolean().optional(),
  cropBase64: zod.z.string().optional()
});

/** `type: "type"` step; `text` is required. */
const SavableTypeStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("type"),
  text: zod.z.string()
});

/** `type: "scroll"` step; direction is "up" or "down". */
const SavableScrollStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("scroll"),
  direction: zod.z.enum(["up", "down"])
});

/** `type: "zoom"` step; direction is "in" or "out". */
const SavableZoomStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("zoom"),
  direction: zod.z.enum(["in", "out"])
});

/** `type: "scrollUntil"` step; `text` and `elementId` are both optional. */
const SavableScrollUntilStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("scrollUntil"),
  text: zod.z.string().optional(),
  elementId: zod.z.string().optional(),
  direction: zod.z.enum(["up", "down"]),
  maxScrolls: zod.z.number().optional()
});

/** `type: "deeplink"` step; `url` is required. */
const SavableDeeplinkStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("deeplink"),
  url: zod.z.string()
});

/** `type: "ai"` step; `instruction` is required. */
const SavableAIStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("ai"),
  instruction: zod.z.string()
});

/** `type: "fileRef"` step: a path plus optional string-to-string overrides. */
const SavableFileRefStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("fileRef"),
  path: zod.z.string(),
  overrides: zod.z.record(zod.z.string(), zod.z.string()).optional()
});

/** Union of all step schemas, discriminated on the `type` literal. */
const SavableStepSchema = zod.z.discriminatedUnion("type", [
  SavableTapStepSchema,
  // type: 'tap'
  SavableAssertStepSchema,
  // type: 'assert'
  SavableTypeStepSchema,
  // type: 'type'
  SavableScrollStepSchema,
  // type: 'scroll'
  SavableZoomStepSchema,
  // type: 'zoom'
  SavableScrollUntilStepSchema,
  // type: 'scrollUntil'
  SavableDeeplinkStepSchema,
  // type: 'deeplink'
  SavableAIStepSchema,
  // type: 'ai'
  SavableFileRefStepSchema
  // type: 'fileRef'
]);

/** A stored test: a name, its steps and optional string-to-string params. */
const SavableTestStoreSchema = zod.z.object({
  name: zod.z.string(),
  steps: zod.z.array(SavableStepSchema),
  params: zod.z.record(zod.z.string(), zod.z.string()).optional()
});
163
+
164
// Base URL that the cache lookup/populate requests are sent to.
const CACHE_SERVER_URL = "https://cache.mobileboost.io";
// Base URL of the GPT Driver API (session endpoints).
const GPT_DRIVER_BASE_URL = "https://api.mobileboost.io";
// Screenshots are downscaled by this factor before upload; command
// coordinates are divided/multiplied by the same factor to stay in sync.
const RESCALE_FACTOR = 4;
// Upper bound on screenshot -> decide -> act iterations for a single step.
const SMART_LOOP_MAX_ITERATIONS = 15;
// Total cache-polling budget per iteration and the pause between polls (ms);
// their quotient gives the number of cache lookup attempts.
const CACHE_RETRY_MS = 2000;
const CACHE_CHECK_INTERVAL_MS = 500;
170
+
171
/**
 * Builds the SHA-256 hex key that identifies one cached step.
 *
 * The hashed string is the plain concatenation (no separators) of the API
 * key, file path, step number, description, lower-cased platform and
 * "<width>x<height>". Missing file path, platform or resolution contribute
 * an empty string.
 *
 * NOTE(review): the exact concatenation must not change — presumably the
 * cache server and previously stored entries are keyed by this value.
 *
 * @param {string} apiKey
 * @param {string} [filepath]
 * @param {number} stepNumber
 * @param {string} description
 * @param {string} [platform]
 * @param {{width: number, height: number}} [resolution]
 * @returns {string} Lower-case hex SHA-256 digest.
 */
function generateCacheHash(apiKey, filepath, stepNumber, description, platform, resolution) {
  const resString = resolution ? `${resolution.width}x${resolution.height}` : "";
  const normalizedPlatform = platform?.toLowerCase() || "";
  const data = `${apiKey}${filepath || ""}${stepNumber}${description}${normalizedPlatform}${resString}`;
  return crypto.createHash("sha256").update(data).digest("hex");
}
177
/**
 * Rescales the screen coordinates embedded in a command string.
 *
 * Two coordinate notations are recognised:
 *   - `x=<n>` / `y=<n>` pairs (every occurrence is rescaled);
 *   - a `<x>;<y>` pair delimited by `;` or the string boundaries (only the
 *     first such pair is rescaled).
 *
 * Commands that carry free text (`type:` and `remember:`) are returned
 * untouched: their payload is user/agent text, and rewriting something like
 * "type: x=5" would silently corrupt what gets typed.
 *
 * @param {string} cmd - Command string.
 * @param {"multiply"|"divide"} operation - "multiply" scales up; any other
 *   value scales down.
 * @param {number} [factor=RESCALE_FACTOR] - Scale factor.
 * @returns {string} The command with rounded, rescaled coordinates.
 */
function scaleCommand(cmd, operation, factor = RESCALE_FACTOR) {
  if (cmd.startsWith("type:") || cmd.startsWith("remember:")) {
    return cmd;
  }
  const rescale = (value) => {
    const num = parseInt(value, 10);
    return operation === "multiply" ? Math.round(num * factor) : Math.round(num / factor);
  };
  if (/([xy])=(\d+)/.test(cmd)) {
    return cmd.replace(/([xy])=(\d+)/g, (_match, axis, val) => `${axis}=${rescale(val)}`);
  }
  // Positional form. Not global on purpose: this mirrors the original
  // single-pair handling.
  return cmd.replace(
    /(^|;)(\d+);(\d+)(;|$)/,
    (_match, prefix, xStr, yStr, suffix) => `${prefix}${rescale(xStr)};${rescale(yStr)}${suffix}`
  );
}
205
/**
 * Downscales a base64 screenshot by RESCALE_FACTOR for cache requests.
 *
 * An optional `data:image/<type>;base64,` prefix is stripped before decoding.
 * If sharp cannot report the width, 1080 is assumed. The image is never
 * enlarged (`withoutEnlargement`).
 *
 * @param {string} screenshotBase64 - Base64 image, with or without data-URL prefix.
 * @returns {Promise<Buffer>} The resized image bytes.
 */
async function resizeScreenshotForCache(screenshotBase64) {
  const rawBase64 = screenshotBase64.replace(/^data:image\/\w+;base64,/, "");
  const buffer = Buffer.from(rawBase64, "base64");
  const metadata = await sharp(buffer).metadata();
  const originalWidth = metadata.width ?? 1080;
  const desiredWidth = Math.round(originalWidth / RESCALE_FACTOR);
  return sharp(buffer).resize({ width: desiredWidth, withoutEnlargement: true }).toBuffer();
}
215
+
216
/**
 * Asks the cache server whether commands are stored for the current step
 * and screenshot.
 *
 * Sends a multipart form (`hash`, downscaled `screenshot`, optional
 * `highest_used_index`) to `/execute-from-cache`. Commands returned by the
 * server are scaled back up to device coordinates.
 *
 * This function never throws: any failure is logged and reported as a miss.
 *
 * @param {object} params
 * @param {string} params.apiKey
 * @param {string} [params.filepath]
 * @param {number} params.stepNumber
 * @param {string} params.stepDescription
 * @param {string} [params.platform]
 * @param {{width: number, height: number}} [params.screenResolution]
 * @param {string} params.screenshot - Base64 screenshot.
 * @param {number|null} [params.highestUsedIndex]
 * @returns {Promise<{found: boolean, cacheCommands?: string[], cacheIndex?: *}>}
 */
async function executeFromCache(params) {
  try {
    const hash = generateCacheHash(
      params.apiKey,
      params.filepath,
      params.stepNumber,
      params.stepDescription,
      params.platform,
      params.screenResolution
    );
    const resizedBuffer = await resizeScreenshotForCache(params.screenshot);
    const formData = new FormData();
    formData.append("hash", hash);
    const blob = new Blob([new Uint8Array(resizedBuffer)], { type: "image/png" });
    const blobSizeMB = (blob.size / (1024 * 1024)).toFixed(2);
    globalLogger.debug(`[Cache] Executing from cache with screenshot size: ${blobSizeMB} MB`);
    formData.append("screenshot", blob, "screenshot.png");
    if (params.highestUsedIndex !== void 0 && params.highestUsedIndex !== null) {
      globalLogger.debug(`[Cache] Sending highest_used_index: ${params.highestUsedIndex}`);
      formData.append("highest_used_index", String(params.highestUsedIndex));
    }
    const response = await axios.post(`${CACHE_SERVER_URL}/execute-from-cache`, formData);
    const result = response.data;
    if (result.found && result.cacheCommands) {
      // Cached coordinates are stored in downscaled space; restore device space.
      const scaledCommands = result.cacheCommands.map(
        (cmd) => scaleCommand(cmd, "multiply")
      );
      return {
        found: true,
        cacheCommands: scaledCommands,
        cacheIndex: result.cacheIndex
      };
    }
    return { found: false };
  } catch (error) {
    if (axios.isAxiosError(error)) {
      // Response bodies may be objects; stringify them so the log is not
      // just "[object Object]".
      const body = error.response?.data;
      const detail = body ? (typeof body === "string" ? body : JSON.stringify(body)) : error.message;
      globalLogger.warn(`[Cache] Cache lookup failed: ${detail}`);
    } else {
      globalLogger.error(`[Cache] Error executing from cache: ${error}`);
    }
    return { found: false };
  }
}
259
/**
 * Uploads the frames recorded for a step to the cache server.
 *
 * Each frame's screenshot is downscaled and its commands are scaled down to
 * the same coordinate space, then the list is POSTed to `/populate-cache`
 * with the step hash as a query parameter.
 *
 * This function never throws: failures are logged and reported through the
 * return value.
 *
 * @param {object} params
 * @param {string} params.apiKey
 * @param {string} [params.filepath]
 * @param {number} params.stepNumber
 * @param {string} params.stepDescription
 * @param {string} [params.platform]
 * @param {{width: number, height: number}} [params.screenResolution]
 * @param {{screenshot: string, commands: string[]}[]} params.executionData
 * @returns {Promise<{success: boolean}>}
 */
async function populateCache(params) {
  try {
    const hash = generateCacheHash(
      params.apiKey,
      params.filepath,
      params.stepNumber,
      params.stepDescription,
      params.platform,
      params.screenResolution
    );
    const payload = await Promise.all(params.executionData.map(async (item) => {
      const resizedBuffer = await resizeScreenshotForCache(item.screenshot);
      const scaledCommands = item.commands.map(
        (cmd) => scaleCommand(cmd, "divide")
      );
      return {
        screenshot: resizedBuffer.toString("base64"),
        commands: scaledCommands
      };
    }));
    // Size is approximate: it counts characters of the JSON text.
    const payloadSizeMB = (JSON.stringify(payload).length / (1024 * 1024)).toFixed(2);
    globalLogger.debug(`[Cache] Populating cache with payload size: ~${payloadSizeMB} MB (Hash: ${hash})`);
    await axios.post(`${CACHE_SERVER_URL}/populate-cache`, payload, {
      params: { hash }
    });
    return { success: true };
  } catch (error) {
    if (axios.isAxiosError(error)) {
      // Response bodies may be objects; stringify them so the log is not
      // just "[object Object]".
      const body = error.response?.data;
      const detail = body ? (typeof body === "string" ? body : JSON.stringify(body)) : error.message;
      globalLogger.error(`[Cache] Failed to populate cache: ${detail}`);
    } else {
      globalLogger.error(`[Cache] Error populating cache: ${error}`);
    }
    return { success: false };
  }
}
294
+
295
const AI_AGENT_ENDPOINT = "https://api.mobileboost.io/call_lambda";

/**
 * Sends the current screenshot and instruction to the AI backend and returns
 * its response, with `appetizeCommands` scaled back to device coordinates.
 *
 * The screenshot is downscaled by RESCALE_FACTOR before upload, and the
 * resized dimensions are reported as `image_width` / `image_height`.
 *
 * @param {object} params
 * @param {string} params.apiKey - Sent as `orgKey`.
 * @param {string} params.base64_screenshot - Base64 image; an optional
 *   `data:image/<type>;base64,` prefix is tolerated.
 * @param {string} params.instruction
 * @param {string[]} params.action_history
 * @returns {Promise<object>} The backend response body.
 * @throws {Error} "AI Backend Error: <status> - <message>" for HTTP/axios
 *   failures (original error attached as `cause`); other errors are rethrown.
 */
async function executeAgentStep(params) {
  // Strip a data-URL prefix, matching what resizeScreenshotForCache accepts.
  const rawBase64 = params.base64_screenshot.replace(/^data:image\/\w+;base64,/, "");
  const imageBuffer = Buffer.from(rawBase64, "base64");
  const metadata = await sharp(imageBuffer).metadata();
  const originalWidth = metadata.width ?? 1080;
  const originalHeight = metadata.height ?? 1920;
  const desiredWidth = Math.round(originalWidth / RESCALE_FACTOR);
  const resizedBuffer = await sharp(imageBuffer).resize({ width: desiredWidth, withoutEnlargement: true }).toBuffer();
  const resizedMetadata = await sharp(resizedBuffer).metadata();
  const resizedWidth = resizedMetadata.width ?? desiredWidth;
  // Fall back to a proportional height if sharp does not report one.
  const resizedHeight = resizedMetadata.height ?? Math.round(originalHeight * (desiredWidth / originalWidth));
  globalLogger.debug(`[AI Client] Resized screenshot: ${originalWidth}x${originalHeight} -> ${resizedWidth}x${resizedHeight}`);
  const payload = {
    lambda_flow: "get_next_step",
    current_date: (/* @__PURE__ */ new Date()).toLocaleDateString("en-GB", {
      day: "numeric",
      month: "long",
      year: "numeric"
    }),
    base64_screenshot: resizedBuffer.toString("base64"),
    getUI_elements: [],
    uiHierarchy: [],
    test_task_string: JSON.stringify([
      {
        id: "step-1",
        text: `1. ${params.instruction}`,
        plainText: params.instruction
      }
    ]),
    image_width: resizedWidth,
    image_height: resizedHeight,
    action_history: params.action_history,
    orgKey: params.apiKey,
    template_images: {},
    model_provider: "vellum",
    model_version: "claude-agent",
    fallbackModel: "claude-agent",
    utilize_fullTextAnnotation: false,
    enableSortingOCR: true,
    enableActionHistoryCut: true,
    removeOverlappingText: false,
    currentAndPreviousScreenMatch: false,
    popupDetectionEnabled: true,
    ocrProvider: "gcp"
  };
  globalLogger.debug(`[AI Client] Sending request to ${AI_AGENT_ENDPOINT}`);
  try {
    const response = await axios.post(
      AI_AGENT_ENDPOINT,
      payload,
      {
        headers: {
          "Content-Type": "application/json"
        }
      }
    );
    const result = response.data;
    globalLogger.debug("[AI Client] Received response from backend");
    if (result.appetizeCommands) {
      // The backend saw the downscaled image; map coordinates back up.
      result.appetizeCommands = result.appetizeCommands.map(
        (cmd) => scaleCommand(cmd, "multiply")
      );
    }
    return result;
  } catch (error) {
    if (axios.isAxiosError(error)) {
      const status = error.response?.status ?? "unknown";
      const errorText = error.response?.data ?? error.message;
      globalLogger.error(`[AI Client] Backend error (${status}): ${JSON.stringify(errorText)}`);
      throw new Error(`AI Backend Error: ${status} - ${error.message}`, { cause: error });
    }
    throw error;
  }
}
369
+
370
/**
 * Extracts tap coordinates from a command string.
 *
 * Tries the `x=<n>` / `y=<n>` notation first; otherwise treats the command
 * as `;`-separated and reads the second and third fields.
 *
 * @param {string} cmd
 * @returns {{x: number, y: number}|null} Coordinates, or null if none found.
 */
function parseTapCoordinates(cmd) {
  const xMatch = cmd.match(/x=(\d+)/);
  const yMatch = cmd.match(/y=(\d+)/);
  if (xMatch && yMatch) {
    return {
      x: Number.parseInt(xMatch[1], 10),
      y: Number.parseInt(yMatch[1], 10)
    };
  }
  const parts = cmd.split(";");
  if (parts.length >= 3) {
    const x = Number.parseInt(parts[1], 10);
    const y = Number.parseInt(parts[2], 10);
    if (!Number.isNaN(x) && !Number.isNaN(y)) {
      return { x, y };
    }
  }
  return null;
}
389
/**
 * Reads the duration from a `wait: <seconds>` command.
 *
 * Fractional values such as `wait: 0.5` are supported. Previously only the
 * integer part was read, so sub-second waits became 0.
 *
 * @param {string} cmd
 * @returns {number|null} Seconds to wait, or null if no number follows `wait:`.
 */
function parseWaitSeconds(cmd) {
  const match = cmd.match(/wait:\s*(\d+(?:\.\d+)?)/);
  return match ? Number.parseFloat(match[1]) : null;
}
393
/**
 * Reads the direction from a `scroll: up|down` command (case-insensitive).
 *
 * @param {string} cmd
 * @returns {"up"|"down"|null} Lower-cased direction, or null if absent.
 */
function parseScrollDirection(cmd) {
  const match = cmd.match(/scroll:\s*(up|down)/i);
  return match ? match[1].toLowerCase() : null;
}
397
/**
 * Extracts the text payload of a `type: <text>` command.
 *
 * The `s` (dotAll) flag lets the payload span several lines. Without it a
 * text containing a newline did not match and the command was dropped.
 *
 * @param {string} cmd
 * @returns {string|null} Text after `type:` (leading whitespace removed), or
 *   null if the command is not a non-empty `type:` command.
 */
function parseTypeText(cmd) {
  const match = cmd.match(/^type:\s*(.+)$/s);
  return match ? match[1] : null;
}
401
/**
 * @param {string} cmd
 * @returns {boolean} True if `cmd` contains "task complete:" anywhere (case-insensitive).
 */
function isTaskComplete(cmd) {
  return cmd.toLowerCase().includes("task complete:");
}
404
/**
 * @param {string} cmd
 * @returns {boolean} True if `cmd` contains "error detected:" anywhere (case-insensitive).
 */
function isErrorDetected(cmd) {
  return cmd.toLowerCase().includes("error detected:");
}
407
/**
 * @param {string} cmd
 * @returns {boolean} True if `cmd` starts with "remember:" (case-sensitive).
 */
function isRememberCommand(cmd) {
  return cmd.startsWith("remember:");
}
410
/**
 * @param {string} cmd
 * @returns {boolean} True if `cmd` starts with "tapOn:" or the "tabOn:" spelling.
 */
function isTapCommand(cmd) {
  return /^t(ap|ab)On:/.test(cmd);
}
413
/**
 * @param {string} cmd
 * @returns {boolean} True if `cmd` starts with "wait:".
 */
function isWaitCommand(cmd) {
  return cmd.startsWith("wait:");
}
416
/**
 * @param {string} cmd
 * @returns {boolean} True if `cmd` starts with "scroll:".
 */
function isScrollCommand(cmd) {
  return cmd.startsWith("scroll:");
}
419
/**
 * @param {string} cmd
 * @returns {boolean} True if `cmd` starts with "type:".
 */
function isTypeCommand(cmd) {
  return cmd.startsWith("type:");
}
422
+
423
/**
 * Runs one test step as a screenshot -> decide -> act loop.
 *
 * Each iteration:
 *   1. takes a screenshot and polls the cache (up to
 *      CACHE_RETRY_MS / CACHE_CHECK_INTERVAL_MS attempts, re-capturing the
 *      screenshot on every attempt);
 *   2. on a miss, asks the AI agent for commands and extends the action
 *      history with the agent's output from its "reasoning:" entry onward;
 *   3. logs and executes the commands (tap / wait / scroll / type);
 *   4. stops when a command signals "task complete:", uploading the recorded
 *      frames to the cache if any iteration was a cache miss.
 *
 * Errors never propagate: they are logged and returned as
 * `{ success: false, error }`.
 *
 * @param {object} ctx - Session callbacks and state: `apiKey`, `platform`,
 *   `screenSize`, `globalActionHistory`, `getScreenshot`, `performTap`,
 *   `performScroll`, `performType`, `logCodeExecution`.
 * @param {object} params - `stepNumber`, `description`, `instruction`,
 *   optional `filepath`.
 * @returns {Promise<{success: boolean, iterations: number, cacheHit: boolean, error?: string}>}
 */
async function executeSmartLoop(ctx, params) {
  const maxCacheAttempts = Math.floor(CACHE_RETRY_MS / CACHE_CHECK_INTERVAL_MS);
  let loopCount = 0;
  // Local copy: per-step history grows here; only "remember:" entries are
  // written back to the shared ctx.globalActionHistory.
  let actionHistory = [...ctx.globalActionHistory];
  let lastCacheIndex = void 0;
  let anyCacheMiss = false;
  let everHadCacheHit = false;
  const currentExecutionData = [];
  globalLogger.info(`[SmartLoop] Starting for step ${params.stepNumber}: "${params.description}"`);
  try {
    while (loopCount < SMART_LOOP_MAX_ITERATIONS) {
      let screenshot = "";
      let commands = [];
      let isCacheHit = false;
      for (let attempt = 0; attempt < maxCacheAttempts; attempt++) {
        screenshot = await ctx.getScreenshot();
        // Base64 encodes 3 bytes in 4 characters, hence the 0.75 estimate.
        const sizeInBytes = screenshot.length * 0.75;
        const sizeInMB = (sizeInBytes / (1024 * 1024)).toFixed(2);
        globalLogger.debug(`[SmartLoop] Captured screenshot: ~${sizeInMB} MB`);
        try {
          globalLogger.debug(`[SmartLoop] Checking cache (Attempt ${attempt + 1}/${maxCacheAttempts})`);
          const cacheResult = await executeFromCache({
            apiKey: ctx.apiKey,
            stepNumber: params.stepNumber,
            stepDescription: params.description,
            screenshot,
            screenResolution: ctx.screenSize,
            highestUsedIndex: lastCacheIndex,
            platform: ctx.platform,
            filepath: params.filepath
          });
          if (cacheResult.found && cacheResult.cacheCommands) {
            commands = cacheResult.cacheCommands;
            lastCacheIndex = cacheResult.cacheIndex;
            isCacheHit = true;
            everHadCacheHit = true;
            globalLogger.info(`[SmartLoop] Cache Hit! (${commands.length} commands)`);
            break;
          }
        } catch (e) {
          globalLogger.warn(`[SmartLoop] Cache check failed: ${e.message}`);
        }
        if (attempt < maxCacheAttempts - 1) {
          globalLogger.debug(`[SmartLoop] Cache miss, retrying in ${CACHE_CHECK_INTERVAL_MS}ms...`);
          await delay(CACHE_CHECK_INTERVAL_MS);
        }
      }
      let aiCommands = [];
      if (!isCacheHit) {
        anyCacheMiss = true;
        globalLogger.info(`[SmartLoop] Cache Miss. Requesting AI agent...`);
        const agentResponse = await executeAgentStep({
          apiKey: ctx.apiKey,
          base64_screenshot: screenshot,
          instruction: params.instruction,
          action_history: actionHistory
        });
        aiCommands = agentResponse.appetizeCommands || [];
        const gptCommands = agentResponse.gptCommands || [];
        // Everything from the "reasoning:" entry onward joins the history.
        const reasoningIndex = gptCommands.findIndex((entry) => entry.startsWith("reasoning:"));
        if (reasoningIndex !== -1) {
          const parsedCommands = gptCommands.slice(reasoningIndex);
          const rememberCommands = parsedCommands.filter((cmd) => isRememberCommand(cmd));
          if (rememberCommands.length > 0) {
            ctx.globalActionHistory.push(...rememberCommands);
          }
          actionHistory = [...actionHistory, ...parsedCommands];
        }
        commands = [...aiCommands];
        globalLogger.debug(`[SmartLoop] AI returned ${commands.length} command(s)`);
      }
      // Record the frame so it can be uploaded to the cache on completion.
      currentExecutionData.push({
        screenshot,
        commands: aiCommands.length > 0 ? aiCommands : commands
      });
      await ctx.logCodeExecution(screenshot, commands.join("\n"));
      let actionExecuted = false;
      let taskCompleted = false;
      if (commands.length > 0) {
        globalLogger.debug(`[SmartLoop] Executing ${commands.length} command(s)`);
      }
      for (const cmd of commands) {
        if (isTaskComplete(cmd)) {
          taskCompleted = true;
          globalLogger.info(`[SmartLoop] Task completed signal received`);
          continue;
        }
        if (isErrorDetected(cmd)) {
          throw new Error(`AI Reported Error: ${cmd}`);
        }
        if (isRememberCommand(cmd)) {
          ctx.globalActionHistory.push(cmd);
        }
        if (isTapCommand(cmd)) {
          const coords = parseTapCoordinates(cmd);
          if (coords) {
            globalLogger.debug(`[SmartLoop] Executing tap at (${coords.x}, ${coords.y})`);
            await ctx.performTap(coords.x, coords.y);
            actionExecuted = true;
          }
        } else if (isWaitCommand(cmd)) {
          const seconds = parseWaitSeconds(cmd);
          if (seconds) {
            globalLogger.debug(`[SmartLoop] Waiting ${seconds}s`);
            await delay(seconds * 1e3);
            actionExecuted = true;
          }
        } else if (isScrollCommand(cmd)) {
          const direction = parseScrollDirection(cmd);
          if (direction) {
            globalLogger.debug(`[SmartLoop] Scrolling ${direction}`);
            await ctx.performScroll(direction);
            actionExecuted = true;
          }
        } else if (isTypeCommand(cmd)) {
          const text = parseTypeText(cmd);
          if (text) {
            globalLogger.debug(`[SmartLoop] Typing text`);
            await ctx.performType(text);
            actionExecuted = true;
          }
        }
      }
      if (actionExecuted) {
        if (isCacheHit) {
          // AI output already entered the history above; cached commands
          // are added here so later agent calls see them too.
          actionHistory.push(...commands);
        }
        await delay(100);
      }
      if (taskCompleted) {
        globalLogger.info(`[SmartLoop] Task completed successfully`);
        if (anyCacheMiss && currentExecutionData.length > 0) {
          globalLogger.info(`[SmartLoop] Populating cache with ${currentExecutionData.length} frame(s)...`);
          try {
            await populateCache({
              apiKey: ctx.apiKey,
              stepNumber: params.stepNumber,
              stepDescription: params.description,
              executionData: currentExecutionData,
              screenResolution: ctx.screenSize,
              platform: ctx.platform,
              filepath: params.filepath
            });
            globalLogger.debug(`[SmartLoop] Cache populated successfully`);
          } catch (e) {
            globalLogger.warn(`[SmartLoop] Failed to populate cache: ${e.message}`);
          }
        } else if (!anyCacheMiss) {
          globalLogger.debug(`[SmartLoop] Skipping cache population (all actions were cached)`);
        }
        return {
          success: true,
          iterations: loopCount + 1,
          cacheHit: everHadCacheHit
        };
      }
      loopCount++;
    }
    throw new Error(`Smart Loop timeout after ${SMART_LOOP_MAX_ITERATIONS} iterations`);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    globalLogger.error(`[SmartLoop] Error: ${message}`);
    return {
      success: false,
      error: message,
      // After a timeout loopCount already equals the maximum; clamp so the
      // reported count never exceeds the iterations actually run.
      iterations: Math.min(loopCount + 1, SMART_LOOP_MAX_ITERATIONS),
      cacheHit: everHadCacheHit
    };
  }
}
593
+
20
594
  class GptDriver {
595
+ interpolateTemplate(input, params) {
596
+ if (typeof input !== "string" || !input.includes("{{")) return input;
597
+ const pattern = /{{\s*([^}]+?)\s*}}/g;
598
+ return input.replace(pattern, (_match, keyRaw) => {
599
+ const key = String(keyRaw);
600
+ if (!(key in params)) {
601
+ throw new Error(`Missing flow param: {{${key}}}`);
602
+ }
603
+ return params[key];
604
+ });
605
+ }
21
606
  apiKey;
22
607
  gptDriverSessionId;
23
608
  gptDriverBaseUrl;
24
609
  appiumSessionConfig;
610
+ cachingMode;
25
611
  driver;
612
+ appiumSessionStarted;
613
+ useGptDriverCloud;
614
+ gptDriverCloudConfig;
615
+ buildId;
616
+ testId;
617
+ step_number = 1;
618
+ // Smart loop state - maintains action history across steps for context
619
+ globalActionHistory = [];
26
620
  /**
27
621
  * Creates an instance of the GptDriver class.
28
622
  *
@@ -42,19 +636,34 @@ class GptDriver {
42
636
  * - `device.platform`: The platform name of the device (e.g., iOS, Android).
43
637
  */
44
638
  constructor(config) {
639
+ this.testId = config.testId;
45
640
  this.apiKey = config.apiKey;
46
- this.gptDriverBaseUrl = "https://api.mobileboost.io";
47
- this.initializeDriver(config);
48
- this.initializeAppiumConfig(config);
641
+ this.buildId = config.buildId;
642
+ this.useGptDriverCloud = config.useGptDriverCloud;
643
+ this.gptDriverBaseUrl = GPT_DRIVER_BASE_URL;
644
+ this.cachingMode = config.cachingMode ?? "NONE";
645
+ if (config.useGptDriverCloud) {
646
+ if (config.serverConfig.device?.platform == null) {
647
+ throw new Error("Platform is missing. Please specify the platform when using GPTDriver Cloud.");
648
+ }
649
+ this.gptDriverCloudConfig = {
650
+ platform: config.serverConfig.device.platform,
651
+ deviceName: config.serverConfig.device.deviceName,
652
+ platformVersion: config.serverConfig.device.platformVersion
653
+ };
654
+ } else {
655
+ this.initializeDriver(config);
656
+ this.initializeAppiumConfig(config);
657
+ }
49
658
  }
50
659
  initializeDriver(config) {
51
660
  if (config.driver) {
52
661
  this.driver = config.driver;
53
- if (!config.severConfig?.url) {
662
+ if (!config.serverConfig.url) {
54
663
  throw new Error("Server url is missing. Please specify the server url when providing a driver.");
55
664
  }
56
665
  } else {
57
- const isValidServerConfig = config.severConfig?.url && config.severConfig.device?.platform;
666
+ const isValidServerConfig = config.serverConfig.url && config.serverConfig.device?.platform;
58
667
  if (!isValidServerConfig) {
59
668
  throw new Error("Either provide a driver, or a valid severConfig object.");
60
669
  }
@@ -63,10 +672,10 @@ class GptDriver {
63
672
  initializeAppiumConfig(config) {
64
673
  const defaultPort = parseInt(process.env.APPIUM_PORT ?? "4723", 10);
65
674
  const defaultHost = process.env.APPIUM_HOST ?? "127.0.0.1";
66
- let serverUrl = config.severConfig?.url instanceof URL ? config.severConfig.url : new URL(config.severConfig?.url ?? `http://${defaultHost}:${defaultPort}`);
675
+ const serverUrl = config.serverConfig.url instanceof URL ? config.serverConfig.url : new URL(config.serverConfig.url ?? `http://${defaultHost}:${defaultPort}`);
67
676
  this.appiumSessionConfig = {
68
677
  serverUrl,
69
- ...config.severConfig?.device
678
+ ...config.serverConfig.device
70
679
  };
71
680
  }
72
681
  /**
@@ -76,50 +685,53 @@ class GptDriver {
76
685
  * @throws {Error} If the session cannot be started or the driver is not properly initialized.
77
686
  */
78
687
  async startSession() {
79
- console.log(">> Starting session...");
80
- if (this.driver) {
81
- let platform;
82
- let platformVersion;
83
- let deviceName;
84
- let sessionId;
85
- if (this.driver.sessionId == null) {
86
- const driver = this.driver;
87
- const capabilities = await driver.getCapabilities();
88
- platform = capabilities.get("platformName");
89
- platformVersion = capabilities.get("platformVersion") ?? this.appiumSessionConfig?.platformVersion;
90
- deviceName = this.appiumSessionConfig?.deviceName ?? capabilities.get("deviceName");
91
- const session = await driver.getSession();
92
- sessionId = session.getId();
688
+ globalLogger.info("Starting session...");
689
+ if (!this.useGptDriverCloud) {
690
+ if (this.driver) {
691
+ let platform;
692
+ let platformVersion;
693
+ let deviceName;
694
+ let sessionId;
695
+ if (this.driver.sessionId == null) {
696
+ const driver = this.driver;
697
+ const capabilities = await driver.getCapabilities();
698
+ platform = capabilities.get("platformName");
699
+ platformVersion = capabilities.get("platformVersion") ?? this.appiumSessionConfig?.platformVersion;
700
+ deviceName = this.appiumSessionConfig?.deviceName ?? capabilities.get("deviceName");
701
+ const session = await driver.getSession();
702
+ sessionId = session.getId();
703
+ } else {
704
+ const driver = this.driver;
705
+ platform = driver.capabilities["appium:platformName"] ?? driver.capabilities["platformName"];
706
+ platformVersion = driver.capabilities["appium:platformVersion"] ?? driver.capabilities["platformVersion"];
707
+ deviceName = this.appiumSessionConfig?.deviceName ?? driver.capabilities["appium:deviceName"] ?? driver.capabilities["deviceName"];
708
+ sessionId = driver.sessionId;
709
+ }
710
+ this.appiumSessionConfig = {
711
+ ...this.appiumSessionConfig,
712
+ id: sessionId,
713
+ platform,
714
+ platformVersion,
715
+ deviceName
716
+ };
717
+ globalLogger.debug(`Session config: ${JSON.stringify(this.appiumSessionConfig)}`);
93
718
  } else {
94
- const driver = this.driver;
95
- platform = driver.capabilities["appium:platformName"];
96
- platformVersion = driver.capabilities["appium:platformVersion"];
97
- deviceName = this.appiumSessionConfig?.deviceName ?? driver.capabilities["appium:deviceName"] ?? "";
98
- sessionId = driver.sessionId;
99
- }
100
- this.appiumSessionConfig = {
101
- ...this.appiumSessionConfig,
102
- id: sessionId,
103
- platform,
104
- platformVersion,
105
- deviceName
719
+ this.appiumSessionConfig.id = await this.createSession();
720
+ }
721
+ const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/window/rect`);
722
+ const rectResponse = await axios.get(url);
723
+ this.appiumSessionConfig.size = {
724
+ width: rectResponse.data.value.width,
725
+ height: rectResponse.data.value.height
106
726
  };
107
- } else {
108
- this.appiumSessionConfig.id = await this.createSession();
727
+ this.appiumSessionStarted = true;
109
728
  }
110
729
  await this.createGptDriverSession();
111
- const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/window/rect`);
112
- const rectResponse = await axios.get(
113
- url
114
- );
115
- this.appiumSessionConfig.size = {
116
- width: rectResponse.data.value.width,
117
- height: rectResponse.data.value.height
118
- };
119
- console.log(`>> Session created. Monitor execution at: ${this.getSessionLink()}`);
730
+ globalLogger.info(logStyles.highlight(`Session created. Monitor execution at: ${this.getSessionLink()}`));
120
731
  }
121
732
  async createSession() {
122
733
  const { platform, deviceName, platformVersion, serverUrl } = this.appiumSessionConfig;
734
+ globalLogger.debug(`Creating Appium session for ${platform} ${platformVersion} on ${deviceName}`);
123
735
  const url = buildUrl(serverUrl, `/session`);
124
736
  const response = await axios.post(
125
737
  url,
@@ -134,22 +746,41 @@ class GptDriver {
134
746
  }
135
747
  }
136
748
  );
137
- return response.data.value.sessionId;
749
+ const sessionId = response.data.value.sessionId;
750
+ globalLogger.debug(`Appium session created with ID: ${sessionId}`);
751
+ return sessionId;
138
752
  }
139
753
  async createGptDriverSession() {
754
+ globalLogger.debug("Creating GPT Driver session...");
140
755
  const response = await axios.post(
141
756
  `${this.gptDriverBaseUrl}/sessions/create`,
142
757
  {
758
+ ...this.testId && { test_id: this.testId },
143
759
  api_key: this.apiKey,
144
- appium_session_id: this.appiumSessionConfig.id,
760
+ appium_session_id: this.appiumSessionConfig?.id,
145
761
  device_config: {
146
- platform: this.appiumSessionConfig.platform,
147
- device: this.appiumSessionConfig.deviceName,
148
- os: this.appiumSessionConfig.platformVersion
149
- }
762
+ platform: this.appiumSessionConfig?.platform ?? this.gptDriverCloudConfig?.platform,
763
+ device: this.appiumSessionConfig?.deviceName ?? this.gptDriverCloudConfig?.deviceName,
764
+ os: this.appiumSessionConfig?.platformVersion ?? this.gptDriverCloudConfig?.platformVersion
765
+ },
766
+ use_internal_virtual_device: this.useGptDriverCloud,
767
+ build_id: this.buildId,
768
+ caching_mode: this.cachingMode
150
769
  }
151
770
  );
152
771
  this.gptDriverSessionId = response.data.sessionId;
772
+ globalLogger.debug(`GPT Driver session created with ID: ${this.gptDriverSessionId}`);
773
+ if (this.useGptDriverCloud) {
774
+ const parsedUrl = new URL(response.data.appiumServerUrl);
775
+ this.driver = await webdriverio.attach({
776
+ options: {
777
+ hostname: parsedUrl.hostname,
778
+ path: parsedUrl.pathname
779
+ },
780
+ sessionId: response.data.appiumSessionId
781
+ });
782
+ this.appiumSessionStarted = true;
783
+ }
153
784
  }
154
785
  getSessionLink() {
155
786
  return `https://app.mobileboost.io/gpt-driver/sessions/${this.gptDriverSessionId}`;
@@ -165,20 +796,174 @@ class GptDriver {
165
796
  *
166
797
  * @throws {Error} If the request to stop the session fails.
167
798
  */
168
- async stopSession(status) {
169
- console.log(">> Stopping session...");
170
- await axios.post(
171
- `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/stop`,
799
+ async setSessionStatus(status) {
800
+ if (this.gptDriverSessionId) {
801
+ globalLogger.info(`Stopping session with status: ${status}`);
802
+ await axios.post(
803
+ `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/stop`,
804
+ {
805
+ api_key: this.apiKey,
806
+ status
807
+ }
808
+ );
809
+ globalLogger.info("Session stopped successfully");
810
+ this.appiumSessionStarted = false;
811
+ this.gptDriverSessionId = void 0;
812
+ this.step_number = 1;
813
+ this.globalActionHistory = [];
814
+ }
815
+ }
816
+ // ─────────────────────────────────────────────────────────────────────────────
817
+ // SMART LOOP INTEGRATION
818
+ // ─────────────────────────────────────────────────────────────────────────────
819
+ /**
820
+ * Creates a SmartLoopContext for the current session.
821
+ * This context provides all the callbacks needed by the smart loop executor.
822
+ */
823
+ createSmartLoopContext() {
824
+ return {
825
+ apiKey: this.apiKey,
826
+ platform: this.appiumSessionConfig?.platform,
827
+ screenSize: this.appiumSessionConfig.size,
828
+ globalActionHistory: this.globalActionHistory,
829
+ getScreenshot: () => this.getScreenshot(this.appiumSessionConfig),
830
+ performTap: (x, y) => this.performTap(x, y),
831
+ performScroll: (direction) => this.performScroll(direction),
832
+ performType: (text) => this.performType(text),
833
+ logCodeExecution: async (screenshot, command) => this.logCodeExecution(screenshot, command)
834
+ };
835
+ }
836
+ /**
837
+ * Calls the AI agent to determine the next actions based on the current screenshot.
838
+ * This requires an active GPT Driver session.
839
+ */
840
+ async executeAgentStep(params) {
841
+ const response = await axios.post(
842
+ `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/agent/execute`,
172
843
  {
173
844
  api_key: this.apiKey,
174
- status
845
+ base64_screenshot: params.screenshot.replace(/^data:image\/\w+;base64,/, ""),
846
+ instruction: params.instruction,
847
+ action_history: params.actionHistory
175
848
  }
176
849
  );
177
- const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}`);
178
- await axios.delete(url);
179
- console.log(">> Session stopped.");
180
- this.gptDriverSessionId = void 0;
850
+ return {
851
+ gptCommands: response.data.gpt_commands,
852
+ appetizeCommands: response.data.appetize_commands,
853
+ actionHistory: response.data.action_history
854
+ };
855
+ }
856
+ // ─────────────────────────────────────────────────────────────────────────────
857
+ // DEVICE ACTION METHODS
858
+ // ─────────────────────────────────────────────────────────────────────────────
859
+ async getWdioClient() {
860
+ if (!this.appiumSessionStarted) {
861
+ await this.startSession();
862
+ }
863
+ if (this.driver?.sessionId != null) {
864
+ return this.driver;
865
+ }
866
+ const url = this.appiumSessionConfig.serverUrl;
867
+ const parsed = new URL(url);
868
+ const client = await webdriverio.attach({
869
+ sessionId: this.appiumSessionConfig.id,
870
+ options: {
871
+ protocol: parsed.protocol.replace(":", ""),
872
+ hostname: parsed.hostname,
873
+ port: parsed.port ? Number(parsed.port) : parsed.protocol === "https:" ? 443 : 80,
874
+ path: parsed.pathname && parsed.pathname !== "/" ? parsed.pathname : "/"
875
+ }
876
+ });
877
+ this.driver = client;
878
+ return client;
879
+ }
880
+ /**
881
+ * Performs a tap action at the specified coordinates.
882
+ */
883
+ async performTap(x, y) {
884
+ const client = await this.getWdioClient();
885
+ await client.performActions([
886
+ {
887
+ type: "pointer",
888
+ id: "finger1",
889
+ parameters: { pointerType: "touch" },
890
+ actions: [
891
+ { type: "pointerMove", duration: 0, x, y },
892
+ { type: "pointerDown", button: 0 },
893
+ { type: "pause", duration: 100 },
894
+ { type: "pointerUp", button: 0 }
895
+ ]
896
+ }
897
+ ]);
898
+ }
899
+ async performType(text) {
900
+ const client = await this.getWdioClient();
901
+ await client.keys(text.split(""));
902
+ }
903
+ async performScroll(direction) {
904
+ const client = await this.getWdioClient();
905
+ const w = this.appiumSessionConfig?.size?.width ?? 1080;
906
+ const h = this.appiumSessionConfig?.size?.height ?? 1920;
907
+ const x = Math.round(w / 2);
908
+ const startY = direction === "down" ? Math.round(h * 0.8) : Math.round(h * 0.2);
909
+ const endY = direction === "down" ? Math.round(h * 0.2) : Math.round(h * 0.8);
910
+ await client.performActions([
911
+ {
912
+ type: "pointer",
913
+ id: "finger1",
914
+ parameters: { pointerType: "touch" },
915
+ actions: [
916
+ { type: "pointerMove", duration: 0, x, y: startY },
917
+ { type: "pointerDown", button: 0 },
918
+ { type: "pause", duration: 100 },
919
+ { type: "pointerMove", duration: 500, x, y: endY },
920
+ { type: "pointerUp", button: 0 }
921
+ ]
922
+ }
923
+ ]);
924
+ }
925
+ async getPageSource() {
926
+ const client = await this.getWdioClient();
927
+ return client.getPageSource();
928
+ }
929
+ async performScrollUntil(params) {
930
+ const { direction, text, elementId } = params;
931
+ const max = params.maxScrolls ?? 10;
932
+ for (let i = 0; i < max; i++) {
933
+ const source = await this.getPageSource();
934
+ const found = elementId ? source.includes(elementId) : text ? source.includes(text) : false;
935
+ if (found) {
936
+ return;
937
+ }
938
+ await this.performScroll(direction);
939
+ await this._delay(500);
940
+ }
941
+ throw new Error(`scrollUntil target not found after ${max} scroll(s)`);
942
+ }
943
+ async getScreenshot(appiumSessionConfig) {
944
+ globalLogger.debug("Capturing screenshot...");
945
+ const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/screenshot`);
946
+ const screenshotResponse = await axios.get(url);
947
+ let screenshot = await screenshotResponse.data.value;
948
+ if (appiumSessionConfig.platform === "iOS") {
949
+ globalLogger.debug(`Resizing iOS screenshot to ${appiumSessionConfig.size.width}x${appiumSessionConfig.size.height}`);
950
+ const imageBuffer = Buffer.from(screenshot, "base64");
951
+ const transformedImage = await sharp(imageBuffer).resize(appiumSessionConfig.size.width, appiumSessionConfig.size.height).toBuffer();
952
+ screenshot = transformedImage.toString("base64");
953
+ }
954
+ return screenshot;
955
+ }
956
+ /**
957
+ * Helper method to delay execution.
958
+ *
959
+ * @private
960
+ */
961
+ _delay(ms) {
962
+ return new Promise((resolve) => setTimeout(resolve, ms));
181
963
  }
964
+ // ─────────────────────────────────────────────────────────────────────────────
965
+ // PUBLIC API METHODS
966
+ // ─────────────────────────────────────────────────────────────────────────────
182
967
  /**
183
968
  * Executes a specified command within the WebDriver session, optionally using an Appium handler.
184
969
  *
@@ -186,6 +971,7 @@ class GptDriver {
186
971
  * the command-specific operations. After executing the handler, the executed commands get logged on the GPTDriver servers.
187
972
  * If the handler execution fails or no handler is provided, the command gets executed by the GPTDriver using just natural language.
188
973
  *
974
+ * @deprecated Use `aiExecute()` instead. This method will be removed in a future version.
189
975
  * @param {string} command - The natural language command to be executed by the GPTDriver.
190
976
  * @param {AppiumHandler} [appiumHandler] - An optional function that processes Appium-specific commands.
191
977
  * If provided, this handler is executed instead of calling the GPTDriver servers.
@@ -193,24 +979,136 @@ class GptDriver {
193
979
  * @throws {Error} If an error occurs during the execution of the Appium handler or while processing the command by the GPTDriver.
194
980
  */
195
981
  async execute(command, appiumHandler) {
196
- console.log(">> Executing command:", command);
982
+ globalLogger.warn("Method 'execute()' is deprecated. Please use 'aiExecute()' instead.");
983
+ if (!this.appiumSessionStarted) {
984
+ await this.startSession();
985
+ }
986
+ globalLogger.info(`Executing command: ${command}`);
197
987
  const driver = this.driver;
198
988
  if (appiumHandler != null) {
199
989
  try {
990
+ await this.takeScreenshotAndLogCodeExecution(appiumHandler.toString());
200
991
  await appiumHandler(driver);
201
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
202
- await axios.post(`${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/log_code_execution`, {
203
- api_key: this.apiKey,
204
- base64_screenshot: screenshot,
205
- command: appiumHandler.toString()
206
- });
992
+ globalLogger.debug("Custom Appium handler executed successfully");
207
993
  } catch (e) {
994
+ globalLogger.warn("Custom Appium handler failed, falling back to GPT handler");
208
995
  await this.gptHandler(command);
209
996
  }
210
997
  } else {
211
998
  await this.gptHandler(command);
212
999
  }
213
1000
  }
1001
+ /**
1002
+ * Executes a specified command within the WebDriver session with configurable caching options.
1003
+ *
1004
+ * This is the recommended method for executing commands. It provides fine-grained control over
1005
+ * caching behavior, allowing you to optimize performance and costs for repetitive test scenarios.
1006
+ *
1007
+ * If an `appiumHandler` is provided, it will be invoked with the WebDriver instance to perform
1008
+ * the command-specific operations. After executing the handler, the executed commands get logged
1009
+ * on the GPTDriver servers. If the handler execution fails or no handler is provided, the command
1010
+ * gets executed by the GPTDriver using natural language processing.
1011
+ *
1012
+ * @param {Object} params - The execution parameters
1013
+ * @param {string} params.command - The natural language command to be executed by the GPTDriver.
1014
+ * Examples: "Click the login button", "Enter 'test@example.com' in the email field"
1015
+ * @param {AppiumHandler} [params.appiumHandler] - An optional function that processes Appium-specific commands.
1016
+ * If provided, this handler is executed instead of calling
1017
+ * the GPTDriver API. Useful for performance optimization when
1018
+ * you know the exact Appium commands to execute.
1019
+ * @param {CachingMode} [params.cachingMode] - Controls how the GPTDriver caches this command execution.
1020
+ * If not specified, uses the global caching mode set in the constructor.
1021
+ * Options:
1022
+ * - "NONE"
1023
+ * - "FULL_SCREEN"
1024
+ * - "INTERACTION_REGION"
1025
+ * @param {boolean} [params.useSmartLoop] - If true, uses the smart loop execution (Cache -> AI -> Execute -> Populate)
1026
+ * which optimizes execution by checking cache first and populating it after.
1027
+ * Default: false (uses legacy gptHandler)
1028
+ *
1029
+ * @returns {Promise<void>} A promise that resolves when the command execution is complete.
1030
+ *
1031
+ * @throws {Error} If an error occurs during the execution of the Appium handler or while processing
1032
+ * the command by the GPTDriver.
1033
+ *
1034
+ * @example
1035
+ * // Basic usage with natural language (no caching)
1036
+ * await driver.aiExecute({
1037
+ * command: "Click the submit button"
1038
+ * });
1039
+ *
1040
+ * @example
1041
+ * // Full screen caching for repetitive navigation on similar screens
1042
+ * await driver.aiExecute({
1043
+ * command: "Navigate to the settings page",
1044
+ * cachingMode: "FULL_SCREEN"
1045
+ * });
1046
+ *
1047
+ * @example
1048
+ * // Interaction region caching for repeated actions on the same button
1049
+ * await driver.aiExecute({
1050
+ * command: "Click the login button",
1051
+ * cachingMode: "INTERACTION_REGION"
1052
+ * });
1053
+ *
1054
+ * @example
1055
+ * // With custom Appium handler as fallback
1056
+ * await driver.aiExecute({
1057
+ * command: "Click the login button",
1058
+ * appiumHandler: async (driver) => {
1059
+ * const loginBtn = await driver.$('~loginButton');
1060
+ * await loginBtn.click();
1061
+ * },
1062
+ * cachingMode: "INTERACTION_REGION"
1063
+ * });
1064
+ *
1065
+ * @example
1066
+ * // Force fresh execution for dynamic content
1067
+ * await driver.aiExecute({
1068
+ * command: "Verify the current timestamp",
1069
+ * cachingMode: "NONE"
1070
+ * });
1071
+ *
1072
+ * @example
1073
+ * // Using smart loop for optimized caching
1074
+ * await driver.aiExecute({
1075
+ * command: "Click the login button",
1076
+ * useSmartLoop: true,
1077
+ * cachingMode: "FULL_SCREEN"
1078
+ * });
1079
+ */
1080
+ async aiExecute({ command, appiumHandler, cachingMode, useSmartLoop = false }) {
1081
+ if (!this.appiumSessionStarted) {
1082
+ await this.startSession();
1083
+ }
1084
+ globalLogger.info(`Executing command: ${command}`);
1085
+ const driver = this.driver;
1086
+ if (appiumHandler != null) {
1087
+ try {
1088
+ await this.takeScreenshotAndLogCodeExecution(appiumHandler.toString());
1089
+ await appiumHandler(driver);
1090
+ globalLogger.debug("Custom Appium handler executed successfully");
1091
+ this.step_number++;
1092
+ return;
1093
+ } catch (e) {
1094
+ globalLogger.warn("Custom Appium handler failed, falling back to AI execution");
1095
+ }
1096
+ }
1097
+ if (useSmartLoop) {
1098
+ const ctx = this.createSmartLoopContext();
1099
+ const result = await executeSmartLoop(ctx, {
1100
+ stepNumber: this.step_number,
1101
+ description: command,
1102
+ instruction: command
1103
+ });
1104
+ if (!result.success) {
1105
+ throw new Error(result.error || "Smart loop execution failed");
1106
+ }
1107
+ this.step_number++;
1108
+ } else {
1109
+ await this.gptHandler(command, cachingMode);
1110
+ }
1111
+ }
214
1112
  /**
215
1113
  * Asserts a single condition using the GPTDriver.
216
1114
  *
@@ -218,13 +1116,25 @@ class GptDriver {
218
1116
  * If the assertion fails, an error is thrown.
219
1117
  *
220
1118
  * @param {string} assertion - The condition to be asserted.
1119
+ * @param cachingMode - The caching mode to be used for the assertion.
221
1120
  * @throws {Error} If the assertion fails.
222
1121
  */
223
- async assert(assertion) {
224
- console.log(">> Asserting:", assertion);
225
- const results = await this.checkBulk([assertion]);
226
- if (!Object.values(results).at(0)) {
227
- throw new Error(`Failed assertion: ${assertion}`);
1122
+ async assert(assertion, cachingMode) {
1123
+ if (!this.appiumSessionStarted) {
1124
+ await this.startSession();
1125
+ }
1126
+ try {
1127
+ const results = await this.checkBulk([assertion], cachingMode);
1128
+ if (!Object.values(results).at(0)) {
1129
+ await this.setSessionStatus("failed");
1130
+ globalLogger.error(`Assertion failed: ${assertion}`);
1131
+ throw new Error(`Failed assertion: ${assertion}`);
1132
+ }
1133
+ this.step_number = this.step_number + 1;
1134
+ globalLogger.info(`Assertion passed: ${assertion}`);
1135
+ } catch (e) {
1136
+ await this.setSessionStatus("failed");
1137
+ throw e;
228
1138
  }
229
1139
  }
230
1140
  /**
@@ -234,43 +1144,100 @@ class GptDriver {
234
1144
  * If any assertion fails, an error is thrown listing all failed assertions.
235
1145
  *
236
1146
  * @param {string[]} assertions - An array of conditions to be asserted.
1147
+ * @param cachingMode - The caching mode to be used for the assertions.
237
1148
  * @throws {Error} If any of the assertions fail.
238
1149
  */
239
- async assertBulk(assertions) {
240
- console.log(">> Asserting:", assertions);
241
- const results = await this.checkBulk(assertions);
242
- const failedAssertions = Object.values(results).reduce((prev, current, currentIndex) => {
243
- if (!current) {
244
- return [...prev, assertions.at(currentIndex)];
1150
+ async assertBulk(assertions, cachingMode) {
1151
+ if (!this.appiumSessionStarted) {
1152
+ await this.startSession();
1153
+ }
1154
+ try {
1155
+ const results = await this.checkBulk(assertions, cachingMode);
1156
+ const failedAssertions = Object.values(results).reduce((prev, current, currentIndex) => {
1157
+ if (!current) {
1158
+ return [...prev, assertions.at(currentIndex)];
1159
+ }
1160
+ return prev;
1161
+ }, []);
1162
+ if (failedAssertions.length > 0) {
1163
+ await this.setSessionStatus("failed");
1164
+ globalLogger.error(`Multiple assertions failed: ${failedAssertions.join(", ")}`);
1165
+ throw new Error(`Failed assertions: ${failedAssertions.join(", ")}`);
245
1166
  }
246
- return prev;
247
- }, []);
248
- if (failedAssertions.length > 0) {
249
- throw new Error(`Failed assertions: ${failedAssertions.join(", ")}`);
1167
+ this.step_number = this.step_number + 1;
1168
+ globalLogger.info(`All ${assertions.length} assertions passed`);
1169
+ } catch (e) {
1170
+ await this.setSessionStatus("failed");
1171
+ throw e;
250
1172
  }
251
1173
  }
252
1174
  /**
253
1175
  * Checks multiple conditions and returns their results using the GPTDriver.
254
1176
  *
255
1177
  * This method sends a bulk condition request and returns the results of the conditions.
1178
+ * Failed conditions will be retried up to maxRetries times.
256
1179
  *
257
1180
  * @param {string[]} conditions - An array of conditions to be checked.
1181
+ * @param {CachingMode} cachingMode - The caching mode to be used for the conditions.
1182
+ * @param {number} maxRetries - The maximum number of retries if any condition fails (default: 2).
1183
+ * @param {number} retryDelayMs - The delay in milliseconds between retries (default: 1000).
258
1184
  * @returns {Promise<Record<string, boolean>>} A promise that resolves with an object mapping each condition
259
1185
  * to a boolean indicating whether the condition was met.
260
1186
  */
261
- async checkBulk(conditions) {
262
- console.log(">> Checking:", conditions);
263
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
264
- const response = await axios.post(
265
- `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/assert`,
266
- {
267
- api_key: this.apiKey,
268
- base64_screenshot: screenshot,
269
- assertions: conditions,
270
- command: `Assert: ${JSON.stringify(conditions)}`
1187
+ async checkBulk(conditions, cachingMode, maxRetries = 2, retryDelayMs = 1e3) {
1188
+ let attempt = 0;
1189
+ let results = {};
1190
+ while (attempt <= maxRetries) {
1191
+ results = await this._checkBulkOnce(conditions, cachingMode, attempt);
1192
+ const failedConditions = Object.entries(results).filter(([_, success]) => !success).map(([key, _]) => key);
1193
+ if (failedConditions.length === 0) {
1194
+ return results;
271
1195
  }
272
- );
273
- return response.data.results;
1196
+ attempt++;
1197
+ if (attempt <= maxRetries) {
1198
+ globalLogger.info(
1199
+ `>> Conditions failed ${JSON.stringify(failedConditions)}. Retrying in ${retryDelayMs}ms... (Attempt ${attempt}/${maxRetries})`
1200
+ );
1201
+ await this._delay(retryDelayMs);
1202
+ } else {
1203
+ globalLogger.info(`>> Conditions failed: ${JSON.stringify(failedConditions)}`);
1204
+ }
1205
+ }
1206
+ return results;
1207
+ }
1208
+ /**
1209
+ * Internal method to check conditions once without retry logic.
1210
+ *
1211
+ * @private
1212
+ */
1213
+ async _checkBulkOnce(conditions, cachingMode, attempt = 0) {
1214
+ if (!this.appiumSessionStarted) {
1215
+ await this.startSession();
1216
+ }
1217
+ globalLogger.info(`Checking conditions (attempt ${attempt}): ${conditions.join(", ")}`);
1218
+ try {
1219
+ let screenshot;
1220
+ if (!this.useGptDriverCloud) {
1221
+ screenshot = await this.getScreenshot(this.appiumSessionConfig);
1222
+ }
1223
+ const response = await axios.post(
1224
+ `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/assert`,
1225
+ {
1226
+ api_key: this.apiKey,
1227
+ base64_screenshot: screenshot,
1228
+ assertions: conditions,
1229
+ command: `Assert: ${JSON.stringify(conditions)}`,
1230
+ caching_mode: cachingMode ?? this.cachingMode,
1231
+ step_number: this.step_number
1232
+ }
1233
+ );
1234
+ globalLogger.debug(`Check results: ${JSON.stringify(response.data.results)}`);
1235
+ return response.data.results;
1236
+ } catch (e) {
1237
+ globalLogger.error("Failed to check conditions", e);
1238
+ await this.setSessionStatus("failed");
1239
+ throw e;
1240
+ }
274
1241
  }
275
1242
  /**
276
1243
  * Extracts specified information using the GPTDriver.
@@ -280,30 +1247,305 @@ class GptDriver {
280
1247
  *
281
1248
  * @param {string[]} extractions - An array of extraction criteria. Each criterion specifies what information
282
1249
  * should be extracted from the session.
1250
+ * @param cachingMode - The caching mode to be used for the extraction.
283
1251
  * @returns {Promise<Record<string, any>>} A promise that resolves with an object mapping each extraction criterion
284
1252
  * to the extracted data. The structure of the returned data depends on the
285
1253
  * specifics of the extraction criteria.
286
1254
  */
287
- async extract(extractions) {
288
- console.log(">> Extracting:", extractions);
289
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
1255
+ async extract(extractions, cachingMode) {
1256
+ if (!this.appiumSessionStarted) {
1257
+ await this.startSession();
1258
+ }
1259
+ globalLogger.info(`Extracting data: ${extractions.join(", ")}`);
1260
+ let screenshot;
1261
+ if (!this.useGptDriverCloud) {
1262
+ screenshot = await this.getScreenshot(this.appiumSessionConfig);
1263
+ }
290
1264
  const response = await axios.post(
291
1265
  `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/extract`,
292
1266
  {
293
1267
  api_key: this.apiKey,
294
1268
  base64_screenshot: screenshot,
295
1269
  extractions,
296
- command: `Extract: ${JSON.stringify(extractions)}`
1270
+ command: `Extract: ${JSON.stringify(extractions)}`,
1271
+ step_number: this.step_number
297
1272
  }
298
1273
  );
1274
+ this.step_number = this.step_number + 1;
1275
+ globalLogger.debug(`Extraction results: ${JSON.stringify(response.data.results)}`);
299
1276
  return response.data.results;
300
1277
  }
301
- async gptHandler(command) {
1278
+ /**
1279
+ * Opens a deep link url in the Appium session.
1280
+ *
1281
+ * This method sends a request to the GPT Driver server to open a deep link url in the Appium session.
1282
+ *
1283
+ * @param {OpenDeepLinkUrlParams} params - The parameters for opening the deep link url.
1284
+ * @returns {Promise<void>} A promise that resolves when the deep link url is opened.
1285
+ */
1286
+ async openDeepLinkUrl(params) {
1287
+ if (!this.appiumSessionStarted) {
1288
+ await this.startSession();
1289
+ }
1290
+ globalLogger.info(`Opening deep link: ${params.url}`);
1291
+ if (params.package == null && this.appiumSessionConfig?.platform === "Android") {
1292
+ throw new Error("Package is required for Android platform");
1293
+ }
1294
+ await this.executeCommand(
1295
+ {
1296
+ url: `http://localhost:4723/session/${this.appiumSessionConfig?.id}/execute/sync`,
1297
+ method: "POST",
1298
+ data: {
1299
+ "script": "mobile:deepLink",
1300
+ "args": [{
1301
+ url: params.url,
1302
+ ...params.bundleId && { bundleId: params.bundleId },
1303
+ ...params.package && { package: params.package }
1304
+ }]
1305
+ }
1306
+ }
1307
+ );
1308
+ this.step_number = this.step_number + 1;
1309
+ globalLogger.debug("Deep link opened successfully");
1310
+ }
1311
+ /**
1312
+ * Reads a flow JSON file from disk and validates it using the SavableTestStoreSchema.
1313
+ *
1314
+ * Returns the parsed and validated object on success; throws a detailed error on failure.
1315
+ *
1316
+ * @param filePath - Path to the flow file (JSON)
1317
+ * @param options - Optional execution options
1318
+ * @param options.useSmartLoop - If true, uses the smart loop execution (Cache -> AI -> Execute -> Populate)
1319
+ * for AI, tap, and assert steps. This optimizes execution by checking cache
1320
+ * first and populating it after successful execution. Default: false
1321
+ * @returns The validated flow data
1322
+ *
1323
+ * @example
1324
+ * // Execute flow with default settings (legacy gptHandler)
1325
+ * const result = await driver.executeFlow('tests/login-flow.json');
1326
+ *
1327
+ * @example
1328
+ * // Execute flow with smart loop enabled for optimized caching
1329
+ * const result = await driver.executeFlow('tests/login-flow.json', { useSmartLoop: true });
1330
+ */
1331
+ async executeFlow(filePath, options) {
1332
+ const useSmartLoop = options?.useSmartLoop ?? false;
1333
+ globalLogger.info(`Loading flow from file: ${filePath}`);
1334
+ const absolutePath = path.resolve(filePath);
1335
+ const baseDir = path.dirname(absolutePath);
1336
+ let raw;
1337
+ try {
1338
+ raw = await node_fs.promises.readFile(absolutePath, "utf-8");
1339
+ } catch (e) {
1340
+ const msg = `Failed to read file at ${filePath}: ${e?.message ?? e}`;
1341
+ globalLogger.error(msg);
1342
+ throw new Error(msg);
1343
+ }
1344
+ let json;
1345
+ try {
1346
+ json = JSON.parse(raw);
1347
+ } catch (e) {
1348
+ const msg = `Invalid JSON in flow file ${filePath}: ${e?.message ?? e}`;
1349
+ globalLogger.error(msg);
1350
+ throw new Error(msg);
1351
+ }
1352
+ const parsed = SavableTestStoreSchema.safeParse(json);
1353
+ if (!parsed.success) {
1354
+ const issues = parsed.error.issues.map((iss) => `- ${iss.path.join(".") || "<root>"}: ${iss.message}`).join("\n");
1355
+ const msg = `Flow validation failed for ${filePath}:
1356
+ ${issues}`;
1357
+ globalLogger.error(msg);
1358
+ throw new Error(msg);
1359
+ }
1360
+ const rootFlow = parsed.data;
1361
+ globalLogger.info(`Flow file validated successfully: ${filePath}`);
1362
+ const visited = /* @__PURE__ */ new Set();
1363
+ const loadFlow = async (p) => {
1364
+ const abs = path.isAbsolute(p) ? p : path.resolve(baseDir, p);
1365
+ const rawChild = await node_fs.promises.readFile(abs, "utf-8");
1366
+ const childJson = JSON.parse(rawChild);
1367
+ const val = SavableTestStoreSchema.safeParse(childJson);
1368
+ if (!val.success) {
1369
+ const issues = val.error.issues.map((iss) => `- ${iss.path.join(".") || "<root>"}: ${iss.message}`).join("\n");
1370
+ throw new Error(`Flow validation failed for referenced file ${abs}:
1371
+ ${issues}`);
1372
+ }
1373
+ return val.data;
1374
+ };
1375
+ const expandSteps = async (steps, inheritedParams, parentDir, stack) => {
1376
+ const out = [];
1377
+ for (const step of steps) {
1378
+ if (step.type === "fileRef") {
1379
+ const refPath = path.isAbsolute(step.path) ? step.path : path.resolve(parentDir, step.path);
1380
+ const refKey = path.normalize(refPath);
1381
+ if (visited.has(refKey)) {
1382
+ const cycle = [...stack, refKey].map((p) => path.basename(p)).join(" -> ");
1383
+ throw new Error(`Detected circular fileRef: ${cycle}`);
1384
+ }
1385
+ visited.add(refKey);
1386
+ const child = await loadFlow(refPath);
1387
+ const mergedParams = { ...inheritedParams, ...step.overrides ?? {} };
1388
+ const childDir = path.dirname(refPath);
1389
+ const childExpanded = await expandSteps(child.steps, mergedParams, childDir, [...stack, refKey]);
1390
+ out.push(...childExpanded);
1391
+ } else {
1392
+ const resolved = { ...step, __params: { ...inheritedParams } };
1393
+ out.push(resolved);
1394
+ }
1395
+ }
1396
+ return out;
1397
+ };
1398
+ const effectiveParams = { ...rootFlow.params ?? {} };
1399
+ const expandedSteps = await expandSteps(rootFlow.steps, effectiveParams, baseDir, [absolutePath]);
1400
+ if (!this.appiumSessionStarted) {
1401
+ await this.startSession();
1402
+ }
1403
+ globalLogger.info(`Executing flow '${rootFlow.name}' with ${expandedSteps.length} step(s)...`);
1404
+ let executed = 0;
1405
+ try {
1406
+ for (const step of expandedSteps) {
1407
+ const params = step.__params ?? effectiveParams;
1408
+ const prefix = `Step #${executed + 1} [${step.type}${step.optional ? ", optional" : ""}]`;
1409
+ try {
1410
+ switch (step.type) {
1411
+ case "ai": {
1412
+ const instruction = this.interpolateTemplate(step.instruction, params);
1413
+ globalLogger.info(`${prefix}: ${instruction}`);
1414
+ if (useSmartLoop) {
1415
+ const ctx = this.createSmartLoopContext();
1416
+ const result = await executeSmartLoop(ctx, {
1417
+ stepNumber: this.step_number,
1418
+ description: instruction,
1419
+ instruction
1420
+ });
1421
+ if (!result.success) {
1422
+ throw new Error(result.error || "Smart loop execution failed");
1423
+ }
1424
+ this.step_number++;
1425
+ } else {
1426
+ await this.aiExecute({ command: instruction });
1427
+ }
1428
+ break;
1429
+ }
1430
+ case "tap": {
1431
+ const description = step.descriptionText ? this.interpolateTemplate(step.descriptionText, params) : void 0;
1432
+ if (!description) {
1433
+ throw new Error("Tap step requires a descriptionText. Coordinate-based taps are no longer supported.");
1434
+ }
1435
+ globalLogger.info(`${prefix}: ${description}`);
1436
+ if (useSmartLoop) {
1437
+ const ctx = this.createSmartLoopContext();
1438
+ const result = await executeSmartLoop(ctx, {
1439
+ stepNumber: this.step_number,
1440
+ description,
1441
+ instruction: description
1442
+ });
1443
+ if (!result.success) {
1444
+ throw new Error(result.error || "Smart loop execution failed");
1445
+ }
1446
+ this.step_number++;
1447
+ } else {
1448
+ await this.aiExecute({ command: description });
1449
+ }
1450
+ break;
1451
+ }
1452
+ case "assert": {
1453
+ const description = step.descriptionText ? this.interpolateTemplate(step.descriptionText, params) : void 0;
1454
+ if (!description) {
1455
+ throw new Error("Assert step requires a descriptionText. Coordinate-based assertions are no longer supported.");
1456
+ }
1457
+ globalLogger.info(`${prefix}: ${description}`);
1458
+ if (useSmartLoop) {
1459
+ const instruction = `Verify that: ${description}`;
1460
+ const ctx = this.createSmartLoopContext();
1461
+ const result = await executeSmartLoop(ctx, {
1462
+ stepNumber: this.step_number,
1463
+ description,
1464
+ instruction
1465
+ });
1466
+ if (!result.success) {
1467
+ throw new Error(result.error || "Smart loop execution failed");
1468
+ }
1469
+ this.step_number++;
1470
+ } else {
1471
+ await this.assert(description);
1472
+ }
1473
+ break;
1474
+ }
1475
+ case "type": {
1476
+ const text = this.interpolateTemplate(step.text, params);
1477
+ globalLogger.info(`${prefix}: Type text`);
1478
+ await this.takeScreenshotAndLogCodeExecution(`type: text=${text}`);
1479
+ await this.performType(text);
1480
+ this.step_number++;
1481
+ break;
1482
+ }
1483
+ case "scroll": {
1484
+ globalLogger.info(`${prefix}: Scroll ${step.direction}`);
1485
+ await this.takeScreenshotAndLogCodeExecution(`scroll: direction=${step.direction}`);
1486
+ await this.performScroll(step.direction);
1487
+ this.step_number++;
1488
+ break;
1489
+ }
1490
+ case "zoom": {
1491
+ globalLogger.info(`${prefix}: Zoom ${step.direction}`);
1492
+ await this.takeScreenshotAndLogCodeExecution(`zoom: direction=${step.direction}`);
1493
+ this.step_number++;
1494
+ break;
1495
+ }
1496
+ case "scrollUntil": {
1497
+ const interpolatedText = step.text != null ? this.interpolateTemplate(step.text, params) : void 0;
1498
+ globalLogger.info(`${prefix}: Scroll until ${interpolatedText ?? step.elementId}`);
1499
+ await this.takeScreenshotAndLogCodeExecution(`scrollUntil: text=${interpolatedText}, elementId=${step.elementId}`);
1500
+ await this.performScrollUntil({
1501
+ direction: step.direction,
1502
+ text: interpolatedText,
1503
+ elementId: step.elementId,
1504
+ maxScrolls: step.maxScrolls
1505
+ });
1506
+ this.step_number++;
1507
+ break;
1508
+ }
1509
+ case "deeplink": {
1510
+ const pkg = params["package"];
1511
+ const bundleId = params["bundleId"];
1512
+ const url = this.interpolateTemplate(step.url, params);
1513
+ globalLogger.info(`${prefix}: Open deeplink ${url}`);
1514
+ await this.takeScreenshotAndLogCodeExecution(`openDeepLinkUrl: url=${url}`);
1515
+ await this.openDeepLinkUrl({ url, package: pkg, bundleId });
1516
+ break;
1517
+ }
1518
+ default: {
1519
+ throw new Error(`Unsupported step type at execution: ${step.type}`);
1520
+ }
1521
+ }
1522
+ executed++;
1523
+ } catch (err) {
1524
+ if (step.optional) {
1525
+ globalLogger.warn(`${prefix} failed but marked optional. Continuing. Error: ${err.message}`);
1526
+ continue;
1527
+ }
1528
+ throw err;
1529
+ }
1530
+ }
1531
+ } catch (e) {
1532
+ try {
1533
+ await this.setSessionStatus("failed");
1534
+ } catch {
1535
+ }
1536
+ throw e;
1537
+ }
1538
+ return rootFlow;
1539
+ }
1540
+ async gptHandler(command, cachingMode) {
302
1541
  try {
303
1542
  let conditionSucceeded = false;
304
1543
  while (!conditionSucceeded) {
305
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
306
- console.log(">> Asking GTP Driver for next action...");
1544
+ let screenshot;
1545
+ if (!this.useGptDriverCloud) {
1546
+ screenshot = await this.getScreenshot(this.appiumSessionConfig);
1547
+ }
1548
+ globalLogger.info("Requesting next action from GPT Driver...");
307
1549
  const response = await axios.request(
308
1550
  {
309
1551
  url: `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/execute`,
@@ -311,39 +1553,49 @@ class GptDriver {
311
1553
  data: {
312
1554
  api_key: this.apiKey,
313
1555
  command,
314
- base64_screenshot: screenshot
1556
+ base64_screenshot: screenshot,
1557
+ caching_mode: cachingMode ?? this.cachingMode,
1558
+ step_number: this.step_number
315
1559
  }
316
1560
  }
317
1561
  );
318
1562
  const executeStatus = response.data.status;
319
1563
  if (executeStatus === "failed") {
320
- const errorMessage = response?.data?.commands?.at(0)?.data;
1564
+ const errorMessage = response.data?.commands?.at(0)?.data;
1565
+ globalLogger.error(`Execution failed: ${errorMessage ?? "Unknown error"}`);
321
1566
  throw new Error(errorMessage ?? "Execution failed");
322
1567
  }
323
1568
  conditionSucceeded = executeStatus !== "inProgress";
324
1569
  const executeResponse = response.data;
325
- for (const command2 of executeResponse.commands) {
326
- await this.executeCommand(command2);
1570
+ globalLogger.debug(`Received ${executeResponse.commands.length} command(s) to execute`);
1571
+ for (const appiumCommand of executeResponse.commands) {
1572
+ await this.executeCommand(appiumCommand);
327
1573
  }
328
1574
  if (!conditionSucceeded) {
1575
+ globalLogger.debug("Command still in progress, waiting...");
329
1576
  await delay(1500);
330
1577
  }
331
1578
  }
1579
+ this.step_number = this.step_number + 1;
1580
+ globalLogger.info("Command execution completed successfully");
332
1581
  } catch (e) {
333
- await this.stopSession("failed");
1582
+ globalLogger.error("GPT handler failed", e);
1583
+ await this.setSessionStatus("failed");
334
1584
  throw e;
335
1585
  }
336
1586
  }
337
1587
  async executeCommand(command) {
338
- const firstAction = command.data.actions?.at(0);
1588
+ const firstAction = command.data?.actions?.at(0);
339
1589
  if (firstAction?.type === "pause" && firstAction.duration != null) {
1590
+ globalLogger.debug(`Pausing for ${firstAction.duration} seconds`);
340
1591
  await delay(firstAction * 1e3);
341
- } else {
1592
+ } else if (!this.useGptDriverCloud) {
342
1593
  const parsedUrl = new URL(command.url);
343
1594
  parsedUrl.protocol = this.appiumSessionConfig.serverUrl.protocol;
344
1595
  parsedUrl.host = this.appiumSessionConfig.serverUrl.host;
345
1596
  parsedUrl.port = this.appiumSessionConfig.serverUrl.port != "" ? `${this.appiumSessionConfig.serverUrl.port}` : "";
346
1597
  parsedUrl.pathname = this.appiumSessionConfig.serverUrl.pathname != "/" ? `${this.appiumSessionConfig.serverUrl.pathname}${parsedUrl.pathname}` : parsedUrl.pathname;
1598
+ globalLogger.debug(`Executing ${command.method} request to ${parsedUrl.pathname}`);
347
1599
  await axios.request({
348
1600
  url: parsedUrl.toString(),
349
1601
  method: command.method,
@@ -351,16 +1603,25 @@ class GptDriver {
351
1603
  });
352
1604
  }
353
1605
  }
354
- async getScreenshot(appiumSessionConfig) {
355
- const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/screenshot`);
356
- const screenshotResponse = await axios.get(url);
357
- let screenshot = await screenshotResponse.data.value;
358
- if (appiumSessionConfig.platform === "iOS") {
359
- const imageBuffer = Buffer.from(screenshot, "base64");
360
- const transformedImage = await sharp(imageBuffer).resize(appiumSessionConfig.size.width, appiumSessionConfig.size.height).toBuffer();
361
- screenshot = transformedImage.toString("base64");
1606
+ async logCodeExecution(screenshot, command) {
1607
+ try {
1608
+ const screenshot2 = await this.getScreenshot(this.appiumSessionConfig);
1609
+ await axios.post(`${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/log_code_execution`, {
1610
+ api_key: this.apiKey,
1611
+ base64_screenshot: screenshot2,
1612
+ command
1613
+ });
1614
+ } catch (e) {
1615
+ globalLogger.error("Failed to log code execution", e);
1616
+ }
1617
+ }
1618
+ async takeScreenshotAndLogCodeExecution(command) {
1619
+ try {
1620
+ const screenshot = await this.getScreenshot(this.appiumSessionConfig);
1621
+ await this.logCodeExecution(screenshot, command);
1622
+ } catch (e) {
1623
+ globalLogger.error("Failed to log code execution", e);
362
1624
  }
363
- return screenshot;
364
1625
  }
365
1626
  }
366
1627