gpt-driver-node 1.0.0-alpha.8 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1,8 +1,13 @@
1
1
  'use strict';
2
2
 
3
+ var node_fs = require('node:fs');
4
+ var path = require('node:path');
3
5
  var axios = require('axios');
4
- var seleniumWebdriver = require('selenium-webdriver');
5
6
  var sharp = require('sharp');
7
+ var webdriverio = require('webdriverio');
8
+ var winston = require('winston');
9
+ var zod = require('zod');
10
+ var crypto = require('node:crypto');
6
11
 
7
12
  const delay = async (milliseconds) => {
8
13
  await new Promise((resolve) => setTimeout(resolve, milliseconds));
@@ -18,12 +23,600 @@ function buildUrl(base, extraPath) {
18
23
  return `${baseUrl}${extraPath}`;
19
24
  }
20
25
 
26
/** ANSI escape sequences used to decorate console log output. */
const colors = {
  reset: "\x1B[0m",
  bold: "\x1B[1m",
  gray: "\x1B[90m",
  red: "\x1B[31m",
  green: "\x1B[32m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m"
};

/**
 * Text decorators: each entry wraps its argument in one or more ANSI codes
 * from `colors` and always terminates with the reset code.
 */
const logStyles = (() => {
  // Builds a one-argument styler; codes are looked up on every call.
  const styler = (...names) => (text) => {
    const prefix = names.reduce((acc, name) => `${acc}${colors[name]}`, "");
    return `${prefix}${text}${colors.reset}`;
  };
  return {
    bold: styler("bold"),
    cyan: styler("cyan"),
    yellow: styler("yellow"),
    green: styler("green"),
    red: styler("red"),
    gray: styler("gray"),
    highlight: styler("bold", "cyan"),
    success: styler("bold", "green"),
    error: styler("bold", "red"),
    warning: styler("bold", "yellow")
  };
})();
47
/**
 * Shared winston logger for this bundle.
 *
 * The minimum level is read from the GPT_DRIVER_LOG_LEVEL environment
 * variable and falls back to "info". A single Console transport is
 * registered; its printf format colours the timestamp, the level tag and
 * any stack trace using `logStyles`.
 */
const globalLogger = winston.createLogger({
  level: process.env.GPT_DRIVER_LOG_LEVEL || "info",
  format: winston.format.combine(
    winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
    winston.format.errors({ stack: true }),
    // Uncoloured rendering: "<timestamp> [LEVEL]: <message>" plus optional stack.
    winston.format.printf((info) => {
      const { timestamp, level, message, stack } = info;
      const line = `${timestamp} [${level.toUpperCase()}]: ${message}`;
      if (!stack) {
        return line;
      }
      return `${line}\n${stack}`;
    })
  ),
  transports: [
    new winston.transports.Console({
      format: winston.format.combine(
        winston.format.printf((info) => {
          const { timestamp, level, message, stack } = info;
          const upperLevel = level.toUpperCase();
          // Levels without a dedicated colour keep the plain upper-case tag.
          let levelTag = upperLevel;
          if (level === "error") {
            levelTag = logStyles.error(upperLevel);
          } else if (level === "warn") {
            levelTag = logStyles.warning(upperLevel);
          } else if (level === "info") {
            levelTag = logStyles.cyan(upperLevel);
          } else if (level === "debug") {
            levelTag = logStyles.gray(upperLevel);
          }
          const line = `${logStyles.gray(timestamp)} [${levelTag}]: ${message}`;
          if (!stack) {
            return line;
          }
          return `${line}\n${logStyles.gray(stack)}`;
        })
      )
    })
  ]
});
86
+
87
/** Fields shared by every savable step: optional id, description and "optional" flag. */
const SavableStepBaseSchema = zod.z.object({
  id: zod.z.number().optional(),
  descriptionText: zod.z.string().optional(),
  optional: zod.z.boolean().optional()
});

/** Step with `type: "tap"`. */
const SavableTapStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("tap"),
  elementId: zod.z.string().optional(),
  timeout: zod.z.number().optional(),
  useLlmOnly: zod.z.boolean().optional(),
  cropBase64: zod.z.string().optional()
});

/** Step with `type: "assert"`; carries the same optional fields as the tap step. */
const SavableAssertStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("assert"),
  elementId: zod.z.string().optional(),
  timeout: zod.z.number().optional(),
  useLlmOnly: zod.z.boolean().optional(),
  cropBase64: zod.z.string().optional()
});

/** Step with `type: "type"`; `text` is required. */
const SavableTypeStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("type"),
  text: zod.z.string()
});

/** Step with `type: "scroll"`; direction is "up" or "down". */
const SavableScrollStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("scroll"),
  direction: zod.z.enum(["up", "down"])
});

/** Step with `type: "zoom"`; direction is "in" or "out". */
const SavableZoomStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("zoom"),
  direction: zod.z.enum(["in", "out"])
});

/** Step with `type: "scrollUntil"`; optional text / elementId target and scroll limit. */
const SavableScrollUntilStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("scrollUntil"),
  text: zod.z.string().optional(),
  elementId: zod.z.string().optional(),
  direction: zod.z.enum(["up", "down"]),
  maxScrolls: zod.z.number().optional()
});

/** Step with `type: "deeplink"`; `url` is required. */
const SavableDeeplinkStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("deeplink"),
  url: zod.z.string()
});

/** Step with `type: "ai"`; `instruction` is required. */
const SavableAIStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("ai"),
  instruction: zod.z.string()
});

/** Step with `type: "fileRef"`; `path` is required, `overrides` is an optional string map. */
const SavableFileRefStepSchema = SavableStepBaseSchema.extend({
  type: zod.z.literal("fileRef"),
  path: zod.z.string(),
  overrides: zod.z.record(zod.z.string(), zod.z.string()).optional()
});

/** Union of all step schemas, discriminated on the `type` literal. */
const SavableStepSchema = zod.z.discriminatedUnion("type", [
  SavableTapStepSchema, // type: 'tap'
  SavableAssertStepSchema, // type: 'assert'
  SavableTypeStepSchema, // type: 'type'
  SavableScrollStepSchema, // type: 'scroll'
  SavableZoomStepSchema, // type: 'zoom'
  SavableScrollUntilStepSchema, // type: 'scrollUntil'
  SavableDeeplinkStepSchema, // type: 'deeplink'
  SavableAIStepSchema, // type: 'ai'
  SavableFileRefStepSchema // type: 'fileRef'
]);

/** A stored test: a name, its ordered steps and an optional string-to-string params map. */
const SavableTestStoreSchema = zod.z.object({
  name: zod.z.string(),
  steps: zod.z.array(SavableStepSchema),
  params: zod.z.record(zod.z.string(), zod.z.string()).optional()
});
163
+
164
// Remote endpoints.
const CACHE_SERVER_URL = "https://cache.mobileboost.io";
const GPT_DRIVER_BASE_URL = "https://api.mobileboost.io";

// Screenshot widths are divided by this factor before upload, and command
// coordinates are scaled by the same factor in the opposite direction.
const RESCALE_FACTOR = 4;

// Upper bound on smart-loop iterations for a single step.
const SMART_LOOP_MAX_ITERATIONS = 15;

// Cache lookup budget: total retry window and the pause between attempts (ms).
const CACHE_RETRY_MS = 2000;
const CACHE_CHECK_INTERVAL_MS = 500;
170
+
171
/**
 * Derives the cache key for a step.
 *
 * The key is the hex SHA-256 of the plain concatenation (no separators) of
 * apiKey, filepath, stepNumber, description, the lower-cased platform and
 * "<width>x<height>". A falsy filepath, platform or resolution contributes
 * an empty string.
 *
 * @param {string} apiKey
 * @param {string} [filepath]
 * @param {number} stepNumber
 * @param {string} description
 * @param {string} [platform]
 * @param {{width: number, height: number}} [resolution]
 * @returns {string} Hex-encoded SHA-256 digest.
 */
function generateCacheHash(apiKey, filepath, stepNumber, description, platform, resolution) {
  let sizeKey = "";
  if (resolution) {
    sizeKey = `${resolution.width}x${resolution.height}`;
  }
  const platformKey = platform?.toLowerCase() || "";
  const fileKey = filepath || "";
  const hasher = crypto.createHash("sha256");
  hasher.update(`${apiKey}${fileKey}${stepNumber}${description}${platformKey}${sizeKey}`);
  return hasher.digest("hex");
}
177
/**
 * Rescales the coordinates embedded in a command string.
 *
 * Two coordinate notations are recognised:
 *  - keyed: every `x=<digits>` / `y=<digits>` occurrence is rescaled;
 *  - positional: when no keyed coordinate exists, the first `<digits>;<digits>`
 *    pair delimited by `;` or the string boundaries is rescaled.
 * Commands containing neither are returned unchanged.
 *
 * @param {string} cmd - Command text.
 * @param {string} operation - "multiply" scales up; any other value divides.
 * @param {number} [factor=RESCALE_FACTOR] - Scale factor to apply.
 * @returns {string} The command with rounded, rescaled coordinates.
 */
function scaleCommand(cmd, operation, factor = RESCALE_FACTOR) {
  // Single place for the multiply/divide decision and rounding.
  const scale = (digits) => {
    const value = Number.parseInt(digits, 10);
    return Math.round(operation === "multiply" ? value * factor : value / factor);
  };
  if (/([xy])=(\d+)/.test(cmd)) {
    return cmd.replace(/([xy])=(\d+)/g, (_match, axis, digits) => `${axis}=${scale(digits)}`);
  }
  return cmd.replace(
    /(^|;)(\d+);(\d+)(;|$)/,
    (_match, prefix, xDigits, yDigits, suffix) => `${prefix}${scale(xDigits)};${scale(yDigits)}${suffix}`
  );
}
205
/**
 * Downscales a base64 screenshot for cache requests.
 *
 * A leading `data:image/<type>;base64,` prefix is removed before decoding.
 * The target width is the source width (1080 when sharp reports none)
 * divided by RESCALE_FACTOR and rounded; the image is never enlarged.
 *
 * @param {string} screenshotBase64 - Base64 image data, with or without a data-URL prefix.
 * @returns {Promise<Buffer>} The resized image bytes.
 */
async function resizeScreenshotForCache(screenshotBase64) {
  const rawBase64 = screenshotBase64.replace(/^data:image\/\w+;base64,/, "");
  const imageBytes = Buffer.from(rawBase64, "base64");
  const { width } = await sharp(imageBytes).metadata();
  const targetWidth = Math.round((width ?? 1080) / RESCALE_FACTOR);
  const resizer = sharp(imageBytes).resize({ width: targetWidth, withoutEnlargement: true });
  return resizer.toBuffer();
}
215
+
216
/**
 * Asks the cache server for commands previously recorded for this step and
 * screenshot.
 *
 * The request is a multipart POST to `${CACHE_SERVER_URL}/execute-from-cache`
 * carrying the step hash, the downscaled screenshot and, when provided,
 * `highest_used_index`. Returned commands are scaled back up with
 * `scaleCommand(cmd, "multiply")`.
 *
 * Every failure is logged and reported as a miss; this function does not throw.
 *
 * @param {object} params - apiKey, filepath, stepNumber, stepDescription,
 *   platform, screenResolution, screenshot and optional highestUsedIndex.
 * @returns {Promise<{found: boolean, cacheCommands?: string[], cacheIndex?: *}>}
 */
async function executeFromCache(params) {
  try {
    const hash = generateCacheHash(
      params.apiKey,
      params.filepath,
      params.stepNumber,
      params.stepDescription,
      params.platform,
      params.screenResolution
    );
    const resizedBuffer = await resizeScreenshotForCache(params.screenshot);
    const formData = new FormData();
    formData.append("hash", hash);
    const blob = new Blob([new Uint8Array(resizedBuffer)], { type: "image/png" });
    const blobSizeMB = (blob.size / (1024 * 1024)).toFixed(2);
    globalLogger.debug(`[Cache] Executing from cache with screenshot size: ${blobSizeMB} MB`);
    formData.append("screenshot", blob, "screenshot.png");
    if (params.highestUsedIndex != null) {
      globalLogger.debug(`[Cache] Sending highest_used_index: ${params.highestUsedIndex}`);
      formData.append("highest_used_index", String(params.highestUsedIndex));
    }
    const response = await axios.post(`${CACHE_SERVER_URL}/execute-from-cache`, formData);
    const result = response.data;
    if (result.found && result.cacheCommands) {
      return {
        found: true,
        cacheCommands: result.cacheCommands.map((cmd) => scaleCommand(cmd, "multiply")),
        cacheIndex: result.cacheIndex
      };
    }
    return { found: false };
  } catch (error) {
    if (axios.isAxiosError(error)) {
      // A response body that is an object would otherwise print as "[object Object]".
      const detail = error.response?.data || error.message;
      const detailText = typeof detail === "string" ? detail : JSON.stringify(detail);
      globalLogger.warn(`[Cache] Cache lookup failed: ${detailText}`);
    } else {
      globalLogger.error(`[Cache] Error executing from cache: ${error}`);
    }
    return { found: false };
  }
}
259
/**
 * Uploads the frames recorded during a step to the cache server.
 *
 * Each frame's screenshot is downscaled and base64-encoded, and its commands
 * are scaled down with `scaleCommand(cmd, "divide")`. The frames are sent as
 * the JSON body of a POST to `${CACHE_SERVER_URL}/populate-cache`, with the
 * step hash as the `hash` query parameter.
 *
 * Every failure is logged and reported through the return value; this
 * function does not throw.
 *
 * @param {object} params - apiKey, filepath, stepNumber, stepDescription,
 *   platform, screenResolution and executionData (array of
 *   `{ screenshot, commands }`).
 * @returns {Promise<{success: boolean}>}
 */
async function populateCache(params) {
  try {
    const hash = generateCacheHash(
      params.apiKey,
      params.filepath,
      params.stepNumber,
      params.stepDescription,
      params.platform,
      params.screenResolution
    );
    const payload = await Promise.all(
      params.executionData.map(async (item) => {
        const resizedBuffer = await resizeScreenshotForCache(item.screenshot);
        return {
          screenshot: resizedBuffer.toString("base64"),
          commands: item.commands.map((cmd) => scaleCommand(cmd, "divide"))
        };
      })
    );
    const payloadSizeMB = (JSON.stringify(payload).length / (1024 * 1024)).toFixed(2);
    globalLogger.debug(`[Cache] Populating cache with payload size: ~${payloadSizeMB} MB (Hash: ${hash})`);
    await axios.post(`${CACHE_SERVER_URL}/populate-cache`, payload, {
      params: { hash }
    });
    return { success: true };
  } catch (error) {
    if (axios.isAxiosError(error)) {
      // A response body that is an object would otherwise print as "[object Object]".
      const detail = error.response?.data || error.message;
      const detailText = typeof detail === "string" ? detail : JSON.stringify(detail);
      globalLogger.error(`[Cache] Failed to populate cache: ${detailText}`);
    } else {
      globalLogger.error(`[Cache] Error populating cache: ${error}`);
    }
    return { success: false };
  }
}
294
+
295
const AI_AGENT_ENDPOINT = "https://api.mobileboost.io/call_lambda";

/**
 * Sends the current screenshot and instruction to the AI agent backend and
 * returns its response.
 *
 * The screenshot is downscaled by RESCALE_FACTOR before upload; any
 * `appetizeCommands` in the response are scaled back up with
 * `scaleCommand(cmd, "multiply")`.
 *
 * @param {object} params
 * @param {string} params.base64_screenshot - Base64 image, with or without a
 *   `data:image/...;base64,` prefix.
 * @param {string} params.instruction - Instruction sent as the single task step.
 * @param {Array} params.action_history - Forwarded unchanged as `action_history`.
 * @param {string} params.apiKey - Sent as `orgKey`.
 * @returns {Promise<object>} The backend response body.
 * @throws {Error} "AI Backend Error: ..." for axios failures (original error
 *   attached as `cause`); any other error is rethrown as-is.
 */
async function executeAgentStep(params) {
  // Strip a data-URL prefix the same way resizeScreenshotForCache does, so both
  // consumers of the same screenshot string decode identical bytes.
  const rawBase64 = params.base64_screenshot.replace(/^data:image\/\w+;base64,/, "");
  const imageBuffer = Buffer.from(rawBase64, "base64");
  const metadata = await sharp(imageBuffer).metadata();
  const originalWidth = metadata.width ?? 1080;
  const originalHeight = metadata.height ?? 1920;
  const desiredWidth = Math.round(originalWidth / RESCALE_FACTOR);
  const resizedBuffer = await sharp(imageBuffer).resize({ width: desiredWidth, withoutEnlargement: true }).toBuffer();
  const resizedMetadata = await sharp(resizedBuffer).metadata();
  const resizedWidth = resizedMetadata.width ?? desiredWidth;
  // Fallback height keeps the original aspect ratio.
  const resizedHeight = resizedMetadata.height ?? Math.round(originalHeight * (desiredWidth / originalWidth));
  globalLogger.debug(`[AI Client] Resized screenshot: ${originalWidth}x${originalHeight} -> ${resizedWidth}x${resizedHeight}`);
  const payload = {
    lambda_flow: "get_next_step",
    current_date: (/* @__PURE__ */ new Date()).toLocaleDateString("en-GB", {
      day: "numeric",
      month: "long",
      year: "numeric"
    }),
    base64_screenshot: resizedBuffer.toString("base64"),
    getUI_elements: [],
    uiHierarchy: [],
    test_task_string: JSON.stringify([
      {
        id: "step-1",
        text: `1. ${params.instruction}`,
        plainText: params.instruction
      }
    ]),
    image_width: resizedWidth,
    image_height: resizedHeight,
    action_history: params.action_history,
    orgKey: params.apiKey,
    template_images: {},
    model_provider: "vellum",
    model_version: "claude-agent",
    fallbackModel: "claude-agent",
    utilize_fullTextAnnotation: false,
    enableSortingOCR: true,
    enableActionHistoryCut: true,
    removeOverlappingText: false,
    currentAndPreviousScreenMatch: false,
    popupDetectionEnabled: true,
    ocrProvider: "gcp"
  };
  globalLogger.debug(`[AI Client] Sending request to ${AI_AGENT_ENDPOINT}`);
  try {
    const response = await axios.post(AI_AGENT_ENDPOINT, payload, {
      headers: {
        "Content-Type": "application/json"
      }
    });
    const result = response.data;
    globalLogger.debug("[AI Client] Received response from backend");
    if (result.appetizeCommands) {
      result.appetizeCommands = result.appetizeCommands.map(
        (cmd) => scaleCommand(cmd, "multiply")
      );
    }
    return result;
  } catch (error) {
    if (axios.isAxiosError(error)) {
      const status = error.response?.status ?? "unknown";
      const errorText = error.response?.data ?? error.message;
      globalLogger.error(`[AI Client] Backend error (${status}): ${JSON.stringify(errorText)}`);
      // Keep the axios error reachable for callers that need the full response.
      throw new Error(`AI Backend Error: ${status} - ${error.message}`, { cause: error });
    }
    throw error;
  }
}
369
+
370
/**
 * Extracts tap coordinates from a command.
 *
 * Keyed form (`x=<digits>` and `y=<digits>` both present) wins. Otherwise the
 * command is split on ";" and the second and third fields are parsed as
 * integers.
 *
 * @param {string} cmd - Command text.
 * @returns {{x: number, y: number} | null} Coordinates, or null when none can be parsed.
 */
function parseTapCoordinates(cmd) {
  const keyedX = cmd.match(/x=(\d+)/);
  const keyedY = cmd.match(/y=(\d+)/);
  if (keyedX && keyedY) {
    return {
      x: Number.parseInt(keyedX[1], 10),
      y: Number.parseInt(keyedY[1], 10)
    };
  }
  // Missing fields are undefined and parse to NaN, which falls through to null.
  const [, rawX, rawY] = cmd.split(";");
  const x = Number.parseInt(rawX, 10);
  const y = Number.parseInt(rawY, 10);
  if (Number.isNaN(x) || Number.isNaN(y)) {
    return null;
  }
  return { x, y };
}
389
/**
 * Reads the integer that follows "wait:" anywhere in the command.
 *
 * @param {string} cmd - Command text.
 * @returns {number | null} The parsed number, or null when the pattern is absent.
 */
function parseWaitSeconds(cmd) {
  const found = cmd.match(/wait:\s*(\d+)/);
  if (!found) {
    return null;
  }
  return Number.parseInt(found[1], 10);
}
393
/**
 * Reads the direction that follows "scroll:" (case-insensitive match).
 *
 * @param {string} cmd - Command text.
 * @returns {string | null} "up" or "down" in lower case, or null when absent.
 */
function parseScrollDirection(cmd) {
  const found = cmd.match(/scroll:\s*(up|down)/i);
  if (!found) {
    return null;
  }
  const [, direction] = found;
  return direction.toLowerCase();
}
397
/**
 * Extracts the text of a `type:` command.
 *
 * The dotAll flag lets the payload contain line breaks; without it a
 * multi-line payload fails to match and the command yields null.
 *
 * @param {string} cmd - Command text; must start with "type:".
 * @returns {string | null} Everything after "type:" and any following
 *   whitespace, or null when the command does not match or has no payload.
 */
function parseTypeText(cmd) {
  const found = cmd.match(/^type:\s*(.+)$/s);
  return found ? found[1] : null;
}
401
/**
 * Tells whether the command contains "task complete:" (case-insensitive, anywhere).
 *
 * @param {string} cmd - Command text.
 * @returns {boolean}
 */
function isTaskComplete(cmd) {
  const lowered = cmd.toLowerCase();
  return lowered.includes("task complete:");
}
404
/**
 * Tells whether the command contains "error detected:" (case-insensitive, anywhere).
 *
 * @param {string} cmd - Command text.
 * @returns {boolean}
 */
function isErrorDetected(cmd) {
  const lowered = cmd.toLowerCase();
  return lowered.includes("error detected:");
}
407
/**
 * Tells whether the command begins with the exact, case-sensitive prefix "remember:".
 *
 * @param {string} cmd - Command text.
 * @returns {boolean}
 */
function isRememberCommand(cmd) {
  const hasPrefix = cmd.startsWith("remember:");
  return hasPrefix;
}
410
/**
 * Tells whether the command begins with "tapOn:" or "tabOn:" (case-sensitive).
 *
 * @param {string} cmd - Command text.
 * @returns {boolean}
 */
function isTapCommand(cmd) {
  const tapPrefix = /^t(ap|ab)On:/;
  return tapPrefix.test(cmd);
}
413
/**
 * Tells whether the command begins with the exact, case-sensitive prefix "wait:".
 *
 * @param {string} cmd - Command text.
 * @returns {boolean}
 */
function isWaitCommand(cmd) {
  const hasPrefix = cmd.startsWith("wait:");
  return hasPrefix;
}
416
/**
 * Tells whether the command begins with the exact, case-sensitive prefix "scroll:".
 *
 * @param {string} cmd - Command text.
 * @returns {boolean}
 */
function isScrollCommand(cmd) {
  const hasPrefix = cmd.startsWith("scroll:");
  return hasPrefix;
}
419
/**
 * Tells whether the command begins with the exact, case-sensitive prefix "type:".
 *
 * @param {string} cmd - Command text.
 * @returns {boolean}
 */
function isTypeCommand(cmd) {
  const hasPrefix = cmd.startsWith("type:");
  return hasPrefix;
}
422
+
423
/**
 * Runs one step as a screenshot -> (cache | AI agent) -> execute loop until a
 * "task complete:" command is seen, an error occurs, or
 * SMART_LOOP_MAX_ITERATIONS iterations have run.
 *
 * Per iteration:
 *  1. Take a screenshot and query the cache, retrying up to
 *     floor(CACHE_RETRY_MS / CACHE_CHECK_INTERVAL_MS) times with a fresh
 *     screenshot each time.
 *  2. On a miss, ask the AI agent for commands and extend the action history
 *     with its `gptCommands` from the first "reasoning:" entry onward.
 *  3. Log the frame via `ctx.logCodeExecution` and execute tap / wait /
 *     scroll / type commands through the `ctx` callbacks.
 *  4. On completion, upload the recorded frames to the cache if any iteration
 *     missed the cache.
 *
 * Errors are caught and reported in the result; this function does not throw.
 *
 * @param {object} ctx - Needs apiKey, platform, screenSize,
 *   globalActionHistory (mutated: "remember:" commands are appended),
 *   getScreenshot, performTap, performScroll, performType, logCodeExecution.
 * @param {object} params - stepNumber, description, instruction, filepath.
 * @returns {Promise<{success: boolean, iterations: number, cacheHit: boolean, error?: string}>}
 */
async function executeSmartLoop(ctx, params) {
  // Thrown values are not guaranteed to be Error instances.
  const describeError = (err) => (err instanceof Error ? err.message : String(err));
  const maxCacheAttempts = Math.floor(CACHE_RETRY_MS / CACHE_CHECK_INTERVAL_MS);
  let loopCount = 0;
  let actionHistory = [...ctx.globalActionHistory];
  let lastCacheIndex;
  let anyCacheMiss = false;
  let everHadCacheHit = false;
  const currentExecutionData = [];
  globalLogger.info(`[SmartLoop] Starting for step ${params.stepNumber}: "${params.description}"`);
  try {
    while (loopCount < SMART_LOOP_MAX_ITERATIONS) {
      let screenshot = "";
      let commands = [];
      let isCacheHit = false;
      for (let attempt = 0; attempt < maxCacheAttempts; attempt++) {
        screenshot = await ctx.getScreenshot();
        // Base64 encodes 3 bytes in 4 characters, hence the 0.75 estimate.
        const sizeInBytes = screenshot.length * 0.75;
        const sizeInMB = (sizeInBytes / (1024 * 1024)).toFixed(2);
        globalLogger.debug(`[SmartLoop] Captured screenshot: ~${sizeInMB} MB`);
        try {
          globalLogger.debug(`[SmartLoop] Checking cache (Attempt ${attempt + 1}/${maxCacheAttempts})`);
          const cacheResult = await executeFromCache({
            apiKey: ctx.apiKey,
            stepNumber: params.stepNumber,
            stepDescription: params.description,
            screenshot,
            screenResolution: ctx.screenSize,
            highestUsedIndex: lastCacheIndex,
            platform: ctx.platform,
            filepath: params.filepath
          });
          if (cacheResult.found && cacheResult.cacheCommands) {
            commands = cacheResult.cacheCommands;
            lastCacheIndex = cacheResult.cacheIndex;
            isCacheHit = true;
            everHadCacheHit = true;
            globalLogger.info(`[SmartLoop] Cache Hit! (${commands.length} commands)`);
            break;
          }
        } catch (e) {
          globalLogger.warn(`[SmartLoop] Cache check failed: ${describeError(e)}`);
        }
        if (attempt < maxCacheAttempts - 1) {
          globalLogger.debug(`[SmartLoop] Cache miss, retrying in ${CACHE_CHECK_INTERVAL_MS}ms...`);
          await delay(CACHE_CHECK_INTERVAL_MS);
        }
      }
      let aiCommands = [];
      if (!isCacheHit) {
        anyCacheMiss = true;
        globalLogger.info(`[SmartLoop] Cache Miss. Requesting AI agent...`);
        const agentResponse = await executeAgentStep({
          apiKey: ctx.apiKey,
          base64_screenshot: screenshot,
          instruction: params.instruction,
          action_history: actionHistory
        });
        aiCommands = agentResponse.appetizeCommands || [];
        const gptCommands = agentResponse.gptCommands || [];
        const reasoningIndex = gptCommands.findIndex((entry) => entry.startsWith("reasoning:"));
        if (reasoningIndex !== -1) {
          const parsedCommands = gptCommands.slice(reasoningIndex);
          const rememberCommands = parsedCommands.filter((cmd) => isRememberCommand(cmd));
          if (rememberCommands.length > 0) {
            ctx.globalActionHistory.push(...rememberCommands);
          }
          actionHistory = [...actionHistory, ...parsedCommands];
        }
        commands = [...aiCommands];
        globalLogger.debug(`[SmartLoop] AI returned ${commands.length} command(s)`);
      }
      currentExecutionData.push({
        screenshot,
        commands: aiCommands.length > 0 ? aiCommands : commands
      });
      await ctx.logCodeExecution(screenshot, commands.join("\n"));
      let actionExecuted = false;
      let taskCompleted = false;
      if (commands.length > 0) {
        globalLogger.debug(`[SmartLoop] Executing ${commands.length} command(s)`);
      }
      for (const cmd of commands) {
        if (isTaskComplete(cmd)) {
          taskCompleted = true;
          globalLogger.info(`[SmartLoop] Task completed signal received`);
          continue;
        }
        if (isErrorDetected(cmd)) {
          throw new Error(`AI Reported Error: ${cmd}`);
        }
        if (isRememberCommand(cmd)) {
          ctx.globalActionHistory.push(cmd);
        }
        if (isTapCommand(cmd)) {
          const coords = parseTapCoordinates(cmd);
          if (coords) {
            globalLogger.debug(`[SmartLoop] Executing tap at (${coords.x}, ${coords.y})`);
            await ctx.performTap(coords.x, coords.y);
            actionExecuted = true;
          }
        } else if (isWaitCommand(cmd)) {
          const seconds = parseWaitSeconds(cmd);
          if (seconds) {
            globalLogger.debug(`[SmartLoop] Waiting ${seconds}s`);
            await delay(seconds * 1e3);
            actionExecuted = true;
          }
        } else if (isScrollCommand(cmd)) {
          const direction = parseScrollDirection(cmd);
          if (direction) {
            globalLogger.debug(`[SmartLoop] Scrolling ${direction}`);
            await ctx.performScroll(direction);
            actionExecuted = true;
          }
        } else if (isTypeCommand(cmd)) {
          const text = parseTypeText(cmd);
          if (text) {
            globalLogger.debug(`[SmartLoop] Typing text`);
            await ctx.performType(text);
            actionExecuted = true;
          }
        }
      }
      if (actionExecuted) {
        // AI-path history was already extended from gptCommands above.
        if (isCacheHit) {
          actionHistory.push(...commands);
        }
        await delay(100);
      }
      if (taskCompleted) {
        globalLogger.info(`[SmartLoop] Task completed successfully`);
        if (anyCacheMiss && currentExecutionData.length > 0) {
          globalLogger.info(`[SmartLoop] Populating cache with ${currentExecutionData.length} frame(s)...`);
          try {
            const populateResult = await populateCache({
              apiKey: ctx.apiKey,
              stepNumber: params.stepNumber,
              stepDescription: params.description,
              executionData: currentExecutionData,
              screenResolution: ctx.screenSize,
              platform: ctx.platform,
              filepath: params.filepath
            });
            // populateCache reports failure through its return value, not by throwing.
            if (populateResult?.success) {
              globalLogger.debug(`[SmartLoop] Cache populated successfully`);
            } else {
              globalLogger.warn(`[SmartLoop] Failed to populate cache (details logged by the cache client)`);
            }
          } catch (e) {
            globalLogger.warn(`[SmartLoop] Failed to populate cache: ${describeError(e)}`);
          }
        } else if (!anyCacheMiss) {
          globalLogger.debug(`[SmartLoop] Skipping cache population (all actions were cached)`);
        }
        return {
          success: true,
          iterations: loopCount + 1,
          cacheHit: everHadCacheHit
        };
      }
      loopCount++;
    }
    throw new Error(`Smart Loop timeout after ${SMART_LOOP_MAX_ITERATIONS} iterations`);
  } catch (error) {
    const message = describeError(error);
    globalLogger.error(`[SmartLoop] Error: ${message}`);
    return {
      success: false,
      error: message,
      // On timeout loopCount already equals the maximum, so cap the count.
      iterations: Math.min(loopCount + 1, SMART_LOOP_MAX_ITERATIONS),
      cacheHit: everHadCacheHit
    };
  }
}
593
+
21
594
  class GptDriver {
595
+ interpolateTemplate(input, params) {
596
+ if (typeof input !== "string" || !input.includes("{{")) return input;
597
+ const pattern = /{{\s*([^}]+?)\s*}}/g;
598
+ return input.replace(pattern, (_match, keyRaw) => {
599
+ const key = String(keyRaw);
600
+ if (!(key in params)) {
601
+ throw new Error(`Missing flow param: {{${key}}}`);
602
+ }
603
+ return params[key];
604
+ });
605
+ }
22
606
  apiKey;
23
607
  gptDriverSessionId;
24
608
  gptDriverBaseUrl;
25
609
  appiumSessionConfig;
610
+ cachingMode;
26
611
  driver;
612
+ appiumSessionStarted;
613
+ useGptDriverCloud;
614
+ gptDriverCloudConfig;
615
+ buildId;
616
+ testId;
617
+ step_number = 1;
618
+ // Smart loop state - maintains action history across steps for context
619
+ globalActionHistory = [];
27
620
  /**
28
621
  * Creates an instance of the GptDriver class.
29
622
  *
@@ -43,19 +636,34 @@ class GptDriver {
43
636
  * - `device.platform`: The platform name of the device (e.g., iOS, Android).
44
637
  */
45
638
  constructor(config) {
639
+ this.testId = config.testId;
46
640
  this.apiKey = config.apiKey;
47
- this.gptDriverBaseUrl = "https://api.mobileboost.io";
48
- this.initializeDriver(config);
49
- this.initializeAppiumConfig(config);
641
+ this.buildId = config.buildId;
642
+ this.useGptDriverCloud = config.useGptDriverCloud;
643
+ this.gptDriverBaseUrl = GPT_DRIVER_BASE_URL;
644
+ this.cachingMode = config.cachingMode ?? "NONE";
645
+ if (config.useGptDriverCloud) {
646
+ if (config.serverConfig.device?.platform == null) {
647
+ throw new Error("Platform is missing. Please specify the platform when using GPTDriver Cloud.");
648
+ }
649
+ this.gptDriverCloudConfig = {
650
+ platform: config.serverConfig.device.platform,
651
+ deviceName: config.serverConfig.device.deviceName,
652
+ platformVersion: config.serverConfig.device.platformVersion
653
+ };
654
+ } else {
655
+ this.initializeDriver(config);
656
+ this.initializeAppiumConfig(config);
657
+ }
50
658
  }
51
659
  initializeDriver(config) {
52
660
  if (config.driver) {
53
661
  this.driver = config.driver;
54
- if (!config.severConfig?.url) {
662
+ if (!config.serverConfig.url) {
55
663
  throw new Error("Server url is missing. Please specify the server url when providing a driver.");
56
664
  }
57
665
  } else {
58
- const isValidServerConfig = config.severConfig?.url && config.severConfig.device?.platform;
666
+ const isValidServerConfig = config.serverConfig.url && config.serverConfig.device?.platform;
59
667
  if (!isValidServerConfig) {
60
668
  throw new Error("Either provide a driver, or a valid severConfig object.");
61
669
  }
@@ -64,10 +672,10 @@ class GptDriver {
64
672
  initializeAppiumConfig(config) {
65
673
  const defaultPort = parseInt(process.env.APPIUM_PORT ?? "4723", 10);
66
674
  const defaultHost = process.env.APPIUM_HOST ?? "127.0.0.1";
67
- let serverUrl = config.severConfig?.url instanceof URL ? config.severConfig.url : new URL(config.severConfig?.url ?? `http://${defaultHost}:${defaultPort}`);
675
+ const serverUrl = config.serverConfig.url instanceof URL ? config.serverConfig.url : new URL(config.serverConfig.url ?? `http://${defaultHost}:${defaultPort}`);
68
676
  this.appiumSessionConfig = {
69
677
  serverUrl,
70
- ...config.severConfig?.device
678
+ ...config.serverConfig.device
71
679
  };
72
680
  }
73
681
  /**
@@ -77,48 +685,53 @@ class GptDriver {
77
685
  * @throws {Error} If the session cannot be started or the driver is not properly initialized.
78
686
  */
79
687
  async startSession() {
80
- console.log(">> Starting session...");
81
- if (this.driver) {
82
- let platform;
83
- let platformVersion;
84
- let deviceName;
85
- let sessionId;
86
- if (this.driver instanceof seleniumWebdriver.WebDriver) {
87
- const capabilities = await this.driver.getCapabilities();
88
- platform = capabilities.get("platformName");
89
- platformVersion = capabilities.get("platformVersion") ?? this.appiumSessionConfig?.platformVersion;
90
- deviceName = this.appiumSessionConfig?.deviceName ?? capabilities.get("deviceName");
91
- const session = await this.driver.getSession();
92
- sessionId = session.getId();
688
+ globalLogger.info("Starting session...");
689
+ if (!this.useGptDriverCloud) {
690
+ if (this.driver) {
691
+ let platform;
692
+ let platformVersion;
693
+ let deviceName;
694
+ let sessionId;
695
+ if (this.driver.sessionId == null) {
696
+ const driver = this.driver;
697
+ const capabilities = await driver.getCapabilities();
698
+ platform = capabilities.get("platformName");
699
+ platformVersion = capabilities.get("platformVersion") ?? this.appiumSessionConfig?.platformVersion;
700
+ deviceName = this.appiumSessionConfig?.deviceName ?? capabilities.get("deviceName");
701
+ const session = await driver.getSession();
702
+ sessionId = session.getId();
703
+ } else {
704
+ const driver = this.driver;
705
+ platform = driver.capabilities["appium:platformName"] ?? driver.capabilities["platformName"];
706
+ platformVersion = driver.capabilities["appium:platformVersion"] ?? driver.capabilities["platformVersion"];
707
+ deviceName = this.appiumSessionConfig?.deviceName ?? driver.capabilities["appium:deviceName"] ?? driver.capabilities["deviceName"];
708
+ sessionId = driver.sessionId;
709
+ }
710
+ this.appiumSessionConfig = {
711
+ ...this.appiumSessionConfig,
712
+ id: sessionId,
713
+ platform,
714
+ platformVersion,
715
+ deviceName
716
+ };
717
+ globalLogger.debug(`Session config: ${JSON.stringify(this.appiumSessionConfig)}`);
93
718
  } else {
94
- platform = this.driver.capabilities["appium:platformName"];
95
- platformVersion = this.driver.capabilities["appium:platformVersion"];
96
- deviceName = this.appiumSessionConfig?.deviceName ?? this.driver.capabilities["appium:deviceName"] ?? "";
97
- sessionId = this.driver.sessionId;
98
- }
99
- this.appiumSessionConfig = {
100
- ...this.appiumSessionConfig,
101
- id: sessionId,
102
- platform,
103
- platformVersion,
104
- deviceName
719
+ this.appiumSessionConfig.id = await this.createSession();
720
+ }
721
+ const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/window/rect`);
722
+ const rectResponse = await axios.get(url);
723
+ this.appiumSessionConfig.size = {
724
+ width: rectResponse.data.value.width,
725
+ height: rectResponse.data.value.height
105
726
  };
106
- } else {
107
- this.appiumSessionConfig.id = await this.createSession();
727
+ this.appiumSessionStarted = true;
108
728
  }
109
729
  await this.createGptDriverSession();
110
- const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/window/rect`);
111
- const rectResponse = await axios.get(
112
- url
113
- );
114
- this.appiumSessionConfig.size = {
115
- width: rectResponse.data.value.width,
116
- height: rectResponse.data.value.height
117
- };
118
- console.log(`>> Session created. Monitor execution at: ${this.getSessionLink()}`);
730
+ globalLogger.info(logStyles.highlight(`Session created. Monitor execution at: ${this.getSessionLink()}`));
119
731
  }
120
732
  async createSession() {
121
733
  const { platform, deviceName, platformVersion, serverUrl } = this.appiumSessionConfig;
734
+ globalLogger.debug(`Creating Appium session for ${platform} ${platformVersion} on ${deviceName}`);
122
735
  const url = buildUrl(serverUrl, `/session`);
123
736
  const response = await axios.post(
124
737
  url,
@@ -133,22 +746,41 @@ class GptDriver {
133
746
  }
134
747
  }
135
748
  );
136
- return response.data.value.sessionId;
749
+ const sessionId = response.data.value.sessionId;
750
+ globalLogger.debug(`Appium session created with ID: ${sessionId}`);
751
+ return sessionId;
137
752
  }
138
753
  async createGptDriverSession() {
754
+ globalLogger.debug("Creating GPT Driver session...");
139
755
  const response = await axios.post(
140
756
  `${this.gptDriverBaseUrl}/sessions/create`,
141
757
  {
758
+ ...this.testId && { test_id: this.testId },
142
759
  api_key: this.apiKey,
143
- appium_session_id: this.appiumSessionConfig.id,
760
+ appium_session_id: this.appiumSessionConfig?.id,
144
761
  device_config: {
145
- platform: this.appiumSessionConfig.platform,
146
- device: this.appiumSessionConfig.deviceName,
147
- os: this.appiumSessionConfig.platformVersion
148
- }
762
+ platform: this.appiumSessionConfig?.platform ?? this.gptDriverCloudConfig?.platform,
763
+ device: this.appiumSessionConfig?.deviceName ?? this.gptDriverCloudConfig?.deviceName,
764
+ os: this.appiumSessionConfig?.platformVersion ?? this.gptDriverCloudConfig?.platformVersion
765
+ },
766
+ use_internal_virtual_device: this.useGptDriverCloud,
767
+ build_id: this.buildId,
768
+ caching_mode: this.cachingMode
149
769
  }
150
770
  );
151
771
  this.gptDriverSessionId = response.data.sessionId;
772
+ globalLogger.debug(`GPT Driver session created with ID: ${this.gptDriverSessionId}`);
773
+ if (this.useGptDriverCloud) {
774
+ const parsedUrl = new URL(response.data.appiumServerUrl);
775
+ this.driver = await webdriverio.attach({
776
+ options: {
777
+ hostname: parsedUrl.hostname,
778
+ path: parsedUrl.pathname
779
+ },
780
+ sessionId: response.data.appiumSessionId
781
+ });
782
+ this.appiumSessionStarted = true;
783
+ }
152
784
  }
153
785
  getSessionLink() {
154
786
  return `https://app.mobileboost.io/gpt-driver/sessions/${this.gptDriverSessionId}`;
@@ -164,20 +796,174 @@ class GptDriver {
164
796
  *
165
797
  * @throws {Error} If the request to stop the session fails.
166
798
  */
167
- async stopSession(status) {
168
- console.log(">> Stopping session...");
169
- await axios.post(
170
- `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/stop`,
799
+ async setSessionStatus(status) {
800
+ if (this.gptDriverSessionId) {
801
+ globalLogger.info(`Stopping session with status: ${status}`);
802
+ await axios.post(
803
+ `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/stop`,
804
+ {
805
+ api_key: this.apiKey,
806
+ status
807
+ }
808
+ );
809
+ globalLogger.info("Session stopped successfully");
810
+ this.appiumSessionStarted = false;
811
+ this.gptDriverSessionId = void 0;
812
+ this.step_number = 1;
813
+ this.globalActionHistory = [];
814
+ }
815
+ }
816
+ // ─────────────────────────────────────────────────────────────────────────────
817
+ // SMART LOOP INTEGRATION
818
+ // ─────────────────────────────────────────────────────────────────────────────
819
+ /**
820
+ * Creates a SmartLoopContext for the current session.
821
+ * This context provides all the callbacks needed by the smart loop executor.
822
+ */
823
+ createSmartLoopContext() {
824
+ return {
825
+ apiKey: this.apiKey,
826
+ platform: this.appiumSessionConfig?.platform,
827
+ screenSize: this.appiumSessionConfig.size,
828
+ globalActionHistory: this.globalActionHistory,
829
+ getScreenshot: () => this.getScreenshot(this.appiumSessionConfig),
830
+ performTap: (x, y) => this.performTap(x, y),
831
+ performScroll: (direction) => this.performScroll(direction),
832
+ performType: (text) => this.performType(text),
833
+ logCodeExecution: async (screenshot, command) => this.logCodeExecution(screenshot, command)
834
+ };
835
+ }
836
+ /**
837
+ * Calls the AI agent to determine the next actions based on the current screenshot.
838
+ * This requires an active GPT Driver session.
839
+ */
840
+ async executeAgentStep(params) {
841
+ const response = await axios.post(
842
+ `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/agent/execute`,
171
843
  {
172
844
  api_key: this.apiKey,
173
- status
845
+ base64_screenshot: params.screenshot.replace(/^data:image\/\w+;base64,/, ""),
846
+ instruction: params.instruction,
847
+ action_history: params.actionHistory
174
848
  }
175
849
  );
176
- const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}`);
177
- await axios.delete(url);
178
- console.log(">> Session stopped.");
179
- this.gptDriverSessionId = void 0;
850
+ return {
851
+ gptCommands: response.data.gpt_commands,
852
+ appetizeCommands: response.data.appetize_commands,
853
+ actionHistory: response.data.action_history
854
+ };
855
+ }
856
+ // ─────────────────────────────────────────────────────────────────────────────
857
+ // DEVICE ACTION METHODS
858
+ // ─────────────────────────────────────────────────────────────────────────────
859
+ async getWdioClient() {
860
+ if (!this.appiumSessionStarted) {
861
+ await this.startSession();
862
+ }
863
+ if (this.driver?.sessionId != null) {
864
+ return this.driver;
865
+ }
866
+ const url = this.appiumSessionConfig.serverUrl;
867
+ const parsed = new URL(url);
868
+ const client = await webdriverio.attach({
869
+ sessionId: this.appiumSessionConfig.id,
870
+ options: {
871
+ protocol: parsed.protocol.replace(":", ""),
872
+ hostname: parsed.hostname,
873
+ port: parsed.port ? Number(parsed.port) : parsed.protocol === "https:" ? 443 : 80,
874
+ path: parsed.pathname && parsed.pathname !== "/" ? parsed.pathname : "/"
875
+ }
876
+ });
877
+ this.driver = client;
878
+ return client;
879
+ }
880
+ /**
881
+ * Performs a tap action at the specified coordinates.
882
+ */
883
+ async performTap(x, y) {
884
+ const client = await this.getWdioClient();
885
+ await client.performActions([
886
+ {
887
+ type: "pointer",
888
+ id: "finger1",
889
+ parameters: { pointerType: "touch" },
890
+ actions: [
891
+ { type: "pointerMove", duration: 0, x, y },
892
+ { type: "pointerDown", button: 0 },
893
+ { type: "pause", duration: 100 },
894
+ { type: "pointerUp", button: 0 }
895
+ ]
896
+ }
897
+ ]);
898
+ }
899
+ async performType(text) {
900
+ const client = await this.getWdioClient();
901
+ await client.keys(text.split(""));
902
+ }
903
+ async performScroll(direction) {
904
+ const client = await this.getWdioClient();
905
+ const w = this.appiumSessionConfig?.size?.width ?? 1080;
906
+ const h = this.appiumSessionConfig?.size?.height ?? 1920;
907
+ const x = Math.round(w / 2);
908
+ const startY = direction === "down" ? Math.round(h * 0.8) : Math.round(h * 0.2);
909
+ const endY = direction === "down" ? Math.round(h * 0.2) : Math.round(h * 0.8);
910
+ await client.performActions([
911
+ {
912
+ type: "pointer",
913
+ id: "finger1",
914
+ parameters: { pointerType: "touch" },
915
+ actions: [
916
+ { type: "pointerMove", duration: 0, x, y: startY },
917
+ { type: "pointerDown", button: 0 },
918
+ { type: "pause", duration: 100 },
919
+ { type: "pointerMove", duration: 500, x, y: endY },
920
+ { type: "pointerUp", button: 0 }
921
+ ]
922
+ }
923
+ ]);
924
+ }
925
+ async getPageSource() {
926
+ const client = await this.getWdioClient();
927
+ return client.getPageSource();
928
+ }
929
+ async performScrollUntil(params) {
930
+ const { direction, text, elementId } = params;
931
+ const max = params.maxScrolls ?? 10;
932
+ for (let i = 0; i < max; i++) {
933
+ const source = await this.getPageSource();
934
+ const found = elementId ? source.includes(elementId) : text ? source.includes(text) : false;
935
+ if (found) {
936
+ return;
937
+ }
938
+ await this.performScroll(direction);
939
+ await this._delay(500);
940
+ }
941
+ throw new Error(`scrollUntil target not found after ${max} scroll(s)`);
942
+ }
943
+ async getScreenshot(appiumSessionConfig) {
944
+ globalLogger.debug("Capturing screenshot...");
945
+ const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/screenshot`);
946
+ const screenshotResponse = await axios.get(url);
947
+ let screenshot = await screenshotResponse.data.value;
948
+ if (appiumSessionConfig.platform === "iOS") {
949
+ globalLogger.debug(`Resizing iOS screenshot to ${appiumSessionConfig.size.width}x${appiumSessionConfig.size.height}`);
950
+ const imageBuffer = Buffer.from(screenshot, "base64");
951
+ const transformedImage = await sharp(imageBuffer).resize(appiumSessionConfig.size.width, appiumSessionConfig.size.height).toBuffer();
952
+ screenshot = transformedImage.toString("base64");
953
+ }
954
+ return screenshot;
955
+ }
956
+ /**
957
+ * Helper method to delay execution.
958
+ *
959
+ * @private
960
+ */
961
+ _delay(ms) {
962
+ return new Promise((resolve) => setTimeout(resolve, ms));
180
963
  }
964
+ // ─────────────────────────────────────────────────────────────────────────────
965
+ // PUBLIC API METHODS
966
+ // ─────────────────────────────────────────────────────────────────────────────
181
967
  /**
182
968
  * Executes a specified command within the WebDriver session, optionally using an Appium handler.
183
969
  *
@@ -185,6 +971,7 @@ class GptDriver {
185
971
  * the command-specific operations. After executing the handler, the executed commands get logged on the GPTDriver servers.
186
972
  * If the handler execution fails or no handler is provided, the command gets executed by the GPTDriver using just natural language.
187
973
  *
974
+ * @deprecated Use `aiExecute()` instead. This method will be removed in a future version.
188
975
  * @param {string} command - The natural language command to be executed by the GPTDriver.
189
976
  * @param {AppiumHandler} [appiumHandler] - An optional function that processes Appium-specific commands.
190
977
  * If provided, this handler is executed instead of calling the GPTDriver servers.
@@ -192,24 +979,136 @@ class GptDriver {
192
979
  * @throws {Error} If an error occurs during the execution of the Appium handler or while processing the command by the GPTDriver.
193
980
  */
194
981
  async execute(command, appiumHandler) {
195
- console.log(">> Executing command:", command);
982
+ globalLogger.warn("Method 'execute()' is deprecated. Please use 'aiExecute()' instead.");
983
+ if (!this.appiumSessionStarted) {
984
+ await this.startSession();
985
+ }
986
+ globalLogger.info(`Executing command: ${command}`);
196
987
  const driver = this.driver;
197
988
  if (appiumHandler != null) {
198
989
  try {
990
+ await this.takeScreenshotAndLogCodeExecution(appiumHandler.toString());
199
991
  await appiumHandler(driver);
200
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
201
- await axios.post(`${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/log_code_execution`, {
202
- api_key: this.apiKey,
203
- base64_screenshot: screenshot,
204
- command: appiumHandler.toString()
205
- });
992
+ globalLogger.debug("Custom Appium handler executed successfully");
206
993
  } catch (e) {
994
+ globalLogger.warn("Custom Appium handler failed, falling back to GPT handler");
207
995
  await this.gptHandler(command);
208
996
  }
209
997
  } else {
210
998
  await this.gptHandler(command);
211
999
  }
212
1000
  }
1001
+ /**
1002
+ * Executes a specified command within the WebDriver session with configurable caching options.
1003
+ *
1004
+ * This is the recommended method for executing commands. It provides fine-grained control over
1005
+ * caching behavior, allowing you to optimize performance and costs for repetitive test scenarios.
1006
+ *
1007
+ * If an `appiumHandler` is provided, it will be invoked with the WebDriver instance to perform
1008
+ * the command-specific operations. After executing the handler, the executed commands get logged
1009
+ * on the GPTDriver servers. If the handler execution fails or no handler is provided, the command
1010
+ * gets executed by the GPTDriver using natural language processing.
1011
+ *
1012
+ * @param {Object} params - The execution parameters
1013
+ * @param {string} params.command - The natural language command to be executed by the GPTDriver.
1014
+ * Examples: "Click the login button", "Enter 'test@example.com' in the email field"
1015
+ * @param {AppiumHandler} [params.appiumHandler] - An optional function that processes Appium-specific commands.
1016
+ * If provided, this handler is executed instead of calling
1017
+ * the GPTDriver API. Useful for performance optimization when
1018
+ * you know the exact Appium commands to execute.
1019
+ * @param {CachingMode} [params.cachingMode] - Controls how the GPTDriver caches this command execution.
1020
+ * If not specified, uses the global caching mode set in the constructor.
1021
+ * Options:
1022
+ * - "NONE"
1023
+ * - "FULL_SCREEN"
1024
+ * - "INTERACTION_REGION"
1025
+ * @param {boolean} [params.useSmartLoop] - If true, uses the smart loop execution (Cache -> AI -> Execute -> Populate)
1026
+ * which optimizes execution by checking cache first and populating it after.
1027
+ * Default: false (uses legacy gptHandler)
1028
+ *
1029
+ * @returns {Promise<void>} A promise that resolves when the command execution is complete.
1030
+ *
1031
+ * @throws {Error} If an error occurs during the execution of the Appium handler or while processing
1032
+ * the command by the GPTDriver.
1033
+ *
1034
+ * @example
1035
+ * // Basic usage with natural language (no caching)
1036
+ * await driver.aiExecute({
1037
+ * command: "Click the submit button"
1038
+ * });
1039
+ *
1040
+ * @example
1041
+ * // Full screen caching for repetitive navigation on similar screens
1042
+ * await driver.aiExecute({
1043
+ * command: "Navigate to the settings page",
1044
+ * cachingMode: "FULL_SCREEN"
1045
+ * });
1046
+ *
1047
+ * @example
1048
+ * // Interaction region caching for repeated actions on the same button
1049
+ * await driver.aiExecute({
1050
+ * command: "Click the login button",
1051
+ * cachingMode: "INTERACTION_REGION"
1052
+ * });
1053
+ *
1054
+ * @example
1055
+ * // With custom Appium handler as fallback
1056
+ * await driver.aiExecute({
1057
+ * command: "Click the login button",
1058
+ * appiumHandler: async (driver) => {
1059
+ * const loginBtn = await driver.$('~loginButton');
1060
+ * await loginBtn.click();
1061
+ * },
1062
+ * cachingMode: "INTERACTION_REGION"
1063
+ * });
1064
+ *
1065
+ * @example
1066
+ * // Force fresh execution for dynamic content
1067
+ * await driver.aiExecute({
1068
+ * command: "Verify the current timestamp",
1069
+ * cachingMode: "NONE"
1070
+ * });
1071
+ *
1072
+ * @example
1073
+ * // Using smart loop for optimized caching
1074
+ * await driver.aiExecute({
1075
+ * command: "Click the login button",
1076
+ * useSmartLoop: true,
1077
+ * cachingMode: "FULL_SCREEN"
1078
+ * });
1079
+ */
1080
+ async aiExecute({ command, appiumHandler, cachingMode, useSmartLoop = false }) {
1081
+ if (!this.appiumSessionStarted) {
1082
+ await this.startSession();
1083
+ }
1084
+ globalLogger.info(`Executing command: ${command}`);
1085
+ const driver = this.driver;
1086
+ if (appiumHandler != null) {
1087
+ try {
1088
+ await this.takeScreenshotAndLogCodeExecution(appiumHandler.toString());
1089
+ await appiumHandler(driver);
1090
+ globalLogger.debug("Custom Appium handler executed successfully");
1091
+ this.step_number++;
1092
+ return;
1093
+ } catch (e) {
1094
+ globalLogger.warn("Custom Appium handler failed, falling back to AI execution");
1095
+ }
1096
+ }
1097
+ if (useSmartLoop) {
1098
+ const ctx = this.createSmartLoopContext();
1099
+ const result = await executeSmartLoop(ctx, {
1100
+ stepNumber: this.step_number,
1101
+ description: command,
1102
+ instruction: command
1103
+ });
1104
+ if (!result.success) {
1105
+ throw new Error(result.error || "Smart loop execution failed");
1106
+ }
1107
+ this.step_number++;
1108
+ } else {
1109
+ await this.gptHandler(command, cachingMode);
1110
+ }
1111
+ }
213
1112
  /**
214
1113
  * Asserts a single condition using the GPTDriver.
215
1114
  *
@@ -217,13 +1116,25 @@ class GptDriver {
217
1116
  * If the assertion fails, an error is thrown.
218
1117
  *
219
1118
  * @param {string} assertion - The condition to be asserted.
1119
+ * @param cachingMode - The caching mode to be used for the assertion.
220
1120
  * @throws {Error} If the assertion fails.
221
1121
  */
222
- async assert(assertion) {
223
- console.log(">> Asserting:", assertion);
224
- const results = await this.checkBulk([assertion]);
225
- if (!Object.values(results).at(0)) {
226
- throw new Error(`Failed assertion: ${assertion}`);
1122
+ async assert(assertion, cachingMode) {
1123
+ if (!this.appiumSessionStarted) {
1124
+ await this.startSession();
1125
+ }
1126
+ try {
1127
+ const results = await this.checkBulk([assertion], cachingMode);
1128
+ if (!Object.values(results).at(0)) {
1129
+ await this.setSessionStatus("failed");
1130
+ globalLogger.error(`Assertion failed: ${assertion}`);
1131
+ throw new Error(`Failed assertion: ${assertion}`);
1132
+ }
1133
+ this.step_number = this.step_number + 1;
1134
+ globalLogger.info(`Assertion passed: ${assertion}`);
1135
+ } catch (e) {
1136
+ await this.setSessionStatus("failed");
1137
+ throw e;
227
1138
  }
228
1139
  }
229
1140
  /**
@@ -233,43 +1144,100 @@ class GptDriver {
233
1144
  * If any assertion fails, an error is thrown listing all failed assertions.
234
1145
  *
235
1146
  * @param {string[]} assertions - An array of conditions to be asserted.
1147
+ * @param cachingMode - The caching mode to be used for the assertions.
236
1148
  * @throws {Error} If any of the assertions fail.
237
1149
  */
238
- async assertBulk(assertions) {
239
- console.log(">> Asserting:", assertions);
240
- const results = await this.checkBulk(assertions);
241
- const failedAssertions = Object.values(results).reduce((prev, current, currentIndex) => {
242
- if (!current) {
243
- return [...prev, assertions.at(currentIndex)];
1150
+ async assertBulk(assertions, cachingMode) {
1151
+ if (!this.appiumSessionStarted) {
1152
+ await this.startSession();
1153
+ }
1154
+ try {
1155
+ const results = await this.checkBulk(assertions, cachingMode);
1156
+ const failedAssertions = Object.values(results).reduce((prev, current, currentIndex) => {
1157
+ if (!current) {
1158
+ return [...prev, assertions.at(currentIndex)];
1159
+ }
1160
+ return prev;
1161
+ }, []);
1162
+ if (failedAssertions.length > 0) {
1163
+ await this.setSessionStatus("failed");
1164
+ globalLogger.error(`Multiple assertions failed: ${failedAssertions.join(", ")}`);
1165
+ throw new Error(`Failed assertions: ${failedAssertions.join(", ")}`);
244
1166
  }
245
- return prev;
246
- }, []);
247
- if (failedAssertions.length > 0) {
248
- throw new Error(`Failed assertions: ${failedAssertions.join(", ")}`);
1167
+ this.step_number = this.step_number + 1;
1168
+ globalLogger.info(`All ${assertions.length} assertions passed`);
1169
+ } catch (e) {
1170
+ await this.setSessionStatus("failed");
1171
+ throw e;
249
1172
  }
250
1173
  }
251
1174
  /**
252
1175
  * Checks multiple conditions and returns their results using the GPTDriver.
253
1176
  *
254
1177
  * This method sends a bulk condition request and returns the results of the conditions.
1178
+ * Failed conditions will be retried up to maxRetries times.
255
1179
  *
256
1180
  * @param {string[]} conditions - An array of conditions to be checked.
1181
+ * @param {CachingMode} cachingMode - The caching mode to be used for the conditions.
1182
+ * @param {number} maxRetries - The maximum number of retries if any condition fails (default: 2).
1183
+ * @param {number} retryDelayMs - The delay in milliseconds between retries (default: 1000).
257
1184
  * @returns {Promise<Record<string, boolean>>} A promise that resolves with an object mapping each condition
258
1185
  * to a boolean indicating whether the condition was met.
259
1186
  */
260
- async checkBulk(conditions) {
261
- console.log(">> Checking:", conditions);
262
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
263
- const response = await axios.post(
264
- `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/assert`,
265
- {
266
- api_key: this.apiKey,
267
- base64_screenshot: screenshot,
268
- assertions: conditions,
269
- command: `Assert: ${JSON.stringify(conditions)}`
1187
+ async checkBulk(conditions, cachingMode, maxRetries = 2, retryDelayMs = 1e3) {
1188
+ let attempt = 0;
1189
+ let results = {};
1190
+ while (attempt <= maxRetries) {
1191
+ results = await this._checkBulkOnce(conditions, cachingMode, attempt);
1192
+ const failedConditions = Object.entries(results).filter(([_, success]) => !success).map(([key, _]) => key);
1193
+ if (failedConditions.length === 0) {
1194
+ return results;
270
1195
  }
271
- );
272
- return response.data.results;
1196
+ attempt++;
1197
+ if (attempt <= maxRetries) {
1198
+ globalLogger.info(
1199
+ `>> Conditions failed ${JSON.stringify(failedConditions)}. Retrying in ${retryDelayMs}ms... (Attempt ${attempt}/${maxRetries})`
1200
+ );
1201
+ await this._delay(retryDelayMs);
1202
+ } else {
1203
+ globalLogger.info(`>> Conditions failed: ${JSON.stringify(failedConditions)}`);
1204
+ }
1205
+ }
1206
+ return results;
1207
+ }
1208
+ /**
1209
+ * Internal method to check conditions once without retry logic.
1210
+ *
1211
+ * @private
1212
+ */
1213
+ async _checkBulkOnce(conditions, cachingMode, attempt = 0) {
1214
+ if (!this.appiumSessionStarted) {
1215
+ await this.startSession();
1216
+ }
1217
+ globalLogger.info(`Checking conditions (attempt ${attempt}): ${conditions.join(", ")}`);
1218
+ try {
1219
+ let screenshot;
1220
+ if (!this.useGptDriverCloud) {
1221
+ screenshot = await this.getScreenshot(this.appiumSessionConfig);
1222
+ }
1223
+ const response = await axios.post(
1224
+ `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/assert`,
1225
+ {
1226
+ api_key: this.apiKey,
1227
+ base64_screenshot: screenshot,
1228
+ assertions: conditions,
1229
+ command: `Assert: ${JSON.stringify(conditions)}`,
1230
+ caching_mode: cachingMode ?? this.cachingMode,
1231
+ step_number: this.step_number
1232
+ }
1233
+ );
1234
+ globalLogger.debug(`Check results: ${JSON.stringify(response.data.results)}`);
1235
+ return response.data.results;
1236
+ } catch (e) {
1237
+ globalLogger.error("Failed to check conditions", e);
1238
+ await this.setSessionStatus("failed");
1239
+ throw e;
1240
+ }
273
1241
  }
274
1242
  /**
275
1243
  * Extracts specified information using the GPTDriver.
@@ -279,30 +1247,305 @@ class GptDriver {
279
1247
  *
280
1248
  * @param {string[]} extractions - An array of extraction criteria. Each criterion specifies what information
281
1249
  * should be extracted from the session.
1250
+ * @param cachingMode - The caching mode to be used for the extraction.
282
1251
  * @returns {Promise<Record<string, any>>} A promise that resolves with an object mapping each extraction criterion
283
1252
  * to the extracted data. The structure of the returned data depends on the
284
1253
  * specifics of the extraction criteria.
285
1254
  */
286
- async extract(extractions) {
287
- console.log(">> Extracting:", extractions);
288
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
1255
+ async extract(extractions, cachingMode) {
1256
+ if (!this.appiumSessionStarted) {
1257
+ await this.startSession();
1258
+ }
1259
+ globalLogger.info(`Extracting data: ${extractions.join(", ")}`);
1260
+ let screenshot;
1261
+ if (!this.useGptDriverCloud) {
1262
+ screenshot = await this.getScreenshot(this.appiumSessionConfig);
1263
+ }
289
1264
  const response = await axios.post(
290
1265
  `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/extract`,
291
1266
  {
292
1267
  api_key: this.apiKey,
293
1268
  base64_screenshot: screenshot,
294
1269
  extractions,
295
- command: `Extract: ${JSON.stringify(extractions)}`
1270
+ command: `Extract: ${JSON.stringify(extractions)}`,
1271
+ step_number: this.step_number
296
1272
  }
297
1273
  );
1274
+ this.step_number = this.step_number + 1;
1275
+ globalLogger.debug(`Extraction results: ${JSON.stringify(response.data.results)}`);
298
1276
  return response.data.results;
299
1277
  }
300
- async gptHandler(command) {
1278
+ /**
1279
+ * Opens a deep link url in the Appium session.
1280
+ *
1281
+ * This method sends a request to the GPT Driver server to open a deep link url in the Appium session.
1282
+ *
1283
+ * @param {OpenDeepLinkUrlParams} params - The parameters for opening the deep link url.
1284
+ * @returns {Promise<void>} A promise that resolves when the deep link url is opened.
1285
+ */
1286
+ async openDeepLinkUrl(params) {
1287
+ if (!this.appiumSessionStarted) {
1288
+ await this.startSession();
1289
+ }
1290
+ globalLogger.info(`Opening deep link: ${params.url}`);
1291
+ if (params.package == null && this.appiumSessionConfig?.platform === "Android") {
1292
+ throw new Error("Package is required for Android platform");
1293
+ }
1294
+ await this.executeCommand(
1295
+ {
1296
+ url: `http://localhost:4723/session/${this.appiumSessionConfig?.id}/execute/sync`,
1297
+ method: "POST",
1298
+ data: {
1299
+ "script": "mobile:deepLink",
1300
+ "args": [{
1301
+ url: params.url,
1302
+ ...params.bundleId && { bundleId: params.bundleId },
1303
+ ...params.package && { package: params.package }
1304
+ }]
1305
+ }
1306
+ }
1307
+ );
1308
+ this.step_number = this.step_number + 1;
1309
+ globalLogger.debug("Deep link opened successfully");
1310
+ }
1311
+ /**
1312
+ * Reads a flow JSON file from disk and validates it using the SavableTestStoreSchema.
1313
+ *
1314
+ * Returns the parsed and validated object on success; throws a detailed error on failure.
1315
+ *
1316
+ * @param filePath - Path to the flow file (JSON)
1317
+ * @param options - Optional execution options
1318
+ * @param options.useSmartLoop - If true, uses the smart loop execution (Cache -> AI -> Execute -> Populate)
1319
+ * for AI, tap, and assert steps. This optimizes execution by checking cache
1320
+ * first and populating it after successful execution. Default: false
1321
+ * @returns The validated flow data
1322
+ *
1323
+ * @example
1324
+ * // Execute flow with default settings (legacy gptHandler)
1325
+ * const result = await driver.executeFlow('tests/login-flow.json');
1326
+ *
1327
+ * @example
1328
+ * // Execute flow with smart loop enabled for optimized caching
1329
+ * const result = await driver.executeFlow('tests/login-flow.json', { useSmartLoop: true });
1330
+ */
1331
+ async executeFlow(filePath, options) {
1332
+ const useSmartLoop = options?.useSmartLoop ?? false;
1333
+ globalLogger.info(`Loading flow from file: ${filePath}`);
1334
+ const absolutePath = path.resolve(filePath);
1335
+ const baseDir = path.dirname(absolutePath);
1336
+ let raw;
1337
+ try {
1338
+ raw = await node_fs.promises.readFile(absolutePath, "utf-8");
1339
+ } catch (e) {
1340
+ const msg = `Failed to read file at ${filePath}: ${e?.message ?? e}`;
1341
+ globalLogger.error(msg);
1342
+ throw new Error(msg);
1343
+ }
1344
+ let json;
1345
+ try {
1346
+ json = JSON.parse(raw);
1347
+ } catch (e) {
1348
+ const msg = `Invalid JSON in flow file ${filePath}: ${e?.message ?? e}`;
1349
+ globalLogger.error(msg);
1350
+ throw new Error(msg);
1351
+ }
1352
+ const parsed = SavableTestStoreSchema.safeParse(json);
1353
+ if (!parsed.success) {
1354
+ const issues = parsed.error.issues.map((iss) => `- ${iss.path.join(".") || "<root>"}: ${iss.message}`).join("\n");
1355
+ const msg = `Flow validation failed for ${filePath}:
1356
+ ${issues}`;
1357
+ globalLogger.error(msg);
1358
+ throw new Error(msg);
1359
+ }
1360
+ const rootFlow = parsed.data;
1361
+ globalLogger.info(`Flow file validated successfully: ${filePath}`);
1362
+ const visited = /* @__PURE__ */ new Set();
1363
+ const loadFlow = async (p) => {
1364
+ const abs = path.isAbsolute(p) ? p : path.resolve(baseDir, p);
1365
+ const rawChild = await node_fs.promises.readFile(abs, "utf-8");
1366
+ const childJson = JSON.parse(rawChild);
1367
+ const val = SavableTestStoreSchema.safeParse(childJson);
1368
+ if (!val.success) {
1369
+ const issues = val.error.issues.map((iss) => `- ${iss.path.join(".") || "<root>"}: ${iss.message}`).join("\n");
1370
+ throw new Error(`Flow validation failed for referenced file ${abs}:
1371
+ ${issues}`);
1372
+ }
1373
+ return val.data;
1374
+ };
1375
+ const expandSteps = async (steps, inheritedParams, parentDir, stack) => {
1376
+ const out = [];
1377
+ for (const step of steps) {
1378
+ if (step.type === "fileRef") {
1379
+ const refPath = path.isAbsolute(step.path) ? step.path : path.resolve(parentDir, step.path);
1380
+ const refKey = path.normalize(refPath);
1381
+ if (visited.has(refKey)) {
1382
+ const cycle = [...stack, refKey].map((p) => path.basename(p)).join(" -> ");
1383
+ throw new Error(`Detected circular fileRef: ${cycle}`);
1384
+ }
1385
+ visited.add(refKey);
1386
+ const child = await loadFlow(refPath);
1387
+ const mergedParams = { ...inheritedParams, ...step.overrides ?? {} };
1388
+ const childDir = path.dirname(refPath);
1389
+ const childExpanded = await expandSteps(child.steps, mergedParams, childDir, [...stack, refKey]);
1390
+ out.push(...childExpanded);
1391
+ } else {
1392
+ const resolved = { ...step, __params: { ...inheritedParams } };
1393
+ out.push(resolved);
1394
+ }
1395
+ }
1396
+ return out;
1397
+ };
1398
+ const effectiveParams = { ...rootFlow.params ?? {} };
1399
+ const expandedSteps = await expandSteps(rootFlow.steps, effectiveParams, baseDir, [absolutePath]);
1400
+ if (!this.appiumSessionStarted) {
1401
+ await this.startSession();
1402
+ }
1403
+ globalLogger.info(`Executing flow '${rootFlow.name}' with ${expandedSteps.length} step(s)...`);
1404
+ let executed = 0;
1405
+ try {
1406
+ for (const step of expandedSteps) {
1407
+ const params = step.__params ?? effectiveParams;
1408
+ const prefix = `Step #${executed + 1} [${step.type}${step.optional ? ", optional" : ""}]`;
1409
+ try {
1410
+ switch (step.type) {
1411
+ case "ai": {
1412
+ const instruction = this.interpolateTemplate(step.instruction, params);
1413
+ globalLogger.info(`${prefix}: ${instruction}`);
1414
+ if (useSmartLoop) {
1415
+ const ctx = this.createSmartLoopContext();
1416
+ const result = await executeSmartLoop(ctx, {
1417
+ stepNumber: this.step_number,
1418
+ description: instruction,
1419
+ instruction
1420
+ });
1421
+ if (!result.success) {
1422
+ throw new Error(result.error || "Smart loop execution failed");
1423
+ }
1424
+ this.step_number++;
1425
+ } else {
1426
+ await this.aiExecute({ command: instruction });
1427
+ }
1428
+ break;
1429
+ }
1430
+ case "tap": {
1431
+ const description = step.descriptionText ? this.interpolateTemplate(step.descriptionText, params) : void 0;
1432
+ if (!description) {
1433
+ throw new Error("Tap step requires a descriptionText. Coordinate-based taps are no longer supported.");
1434
+ }
1435
+ globalLogger.info(`${prefix}: ${description}`);
1436
+ if (useSmartLoop) {
1437
+ const ctx = this.createSmartLoopContext();
1438
+ const result = await executeSmartLoop(ctx, {
1439
+ stepNumber: this.step_number,
1440
+ description,
1441
+ instruction: description
1442
+ });
1443
+ if (!result.success) {
1444
+ throw new Error(result.error || "Smart loop execution failed");
1445
+ }
1446
+ this.step_number++;
1447
+ } else {
1448
+ await this.aiExecute({ command: description });
1449
+ }
1450
+ break;
1451
+ }
1452
+ case "assert": {
1453
+ const description = step.descriptionText ? this.interpolateTemplate(step.descriptionText, params) : void 0;
1454
+ if (!description) {
1455
+ throw new Error("Assert step requires a descriptionText. Coordinate-based assertions are no longer supported.");
1456
+ }
1457
+ globalLogger.info(`${prefix}: ${description}`);
1458
+ if (useSmartLoop) {
1459
+ const instruction = `Verify that: ${description}`;
1460
+ const ctx = this.createSmartLoopContext();
1461
+ const result = await executeSmartLoop(ctx, {
1462
+ stepNumber: this.step_number,
1463
+ description,
1464
+ instruction
1465
+ });
1466
+ if (!result.success) {
1467
+ throw new Error(result.error || "Smart loop execution failed");
1468
+ }
1469
+ this.step_number++;
1470
+ } else {
1471
+ await this.assert(description);
1472
+ }
1473
+ break;
1474
+ }
1475
+ case "type": {
1476
+ const text = this.interpolateTemplate(step.text, params);
1477
+ globalLogger.info(`${prefix}: Type text`);
1478
+ await this.takeScreenshotAndLogCodeExecution(`type: text=${text}`);
1479
+ await this.performType(text);
1480
+ this.step_number++;
1481
+ break;
1482
+ }
1483
+ case "scroll": {
1484
+ globalLogger.info(`${prefix}: Scroll ${step.direction}`);
1485
+ await this.takeScreenshotAndLogCodeExecution(`scroll: direction=${step.direction}`);
1486
+ await this.performScroll(step.direction);
1487
+ this.step_number++;
1488
+ break;
1489
+ }
1490
+ case "zoom": {
1491
+ globalLogger.info(`${prefix}: Zoom ${step.direction}`);
1492
+ await this.takeScreenshotAndLogCodeExecution(`zoom: direction=${step.direction}`);
1493
+ this.step_number++;
1494
+ break;
1495
+ }
1496
+ case "scrollUntil": {
1497
+ const interpolatedText = step.text != null ? this.interpolateTemplate(step.text, params) : void 0;
1498
+ globalLogger.info(`${prefix}: Scroll until ${interpolatedText ?? step.elementId}`);
1499
+ await this.takeScreenshotAndLogCodeExecution(`scrollUntil: text=${interpolatedText}, elementId=${step.elementId}`);
1500
+ await this.performScrollUntil({
1501
+ direction: step.direction,
1502
+ text: interpolatedText,
1503
+ elementId: step.elementId,
1504
+ maxScrolls: step.maxScrolls
1505
+ });
1506
+ this.step_number++;
1507
+ break;
1508
+ }
1509
+ case "deeplink": {
1510
+ const pkg = params["package"];
1511
+ const bundleId = params["bundleId"];
1512
+ const url = this.interpolateTemplate(step.url, params);
1513
+ globalLogger.info(`${prefix}: Open deeplink ${url}`);
1514
+ await this.takeScreenshotAndLogCodeExecution(`openDeepLinkUrl: url=${url}`);
1515
+ await this.openDeepLinkUrl({ url, package: pkg, bundleId });
1516
+ break;
1517
+ }
1518
+ default: {
1519
+ throw new Error(`Unsupported step type at execution: ${step.type}`);
1520
+ }
1521
+ }
1522
+ executed++;
1523
+ } catch (err) {
1524
+ if (step.optional) {
1525
+ globalLogger.warn(`${prefix} failed but marked optional. Continuing. Error: ${err.message}`);
1526
+ continue;
1527
+ }
1528
+ throw err;
1529
+ }
1530
+ }
1531
+ } catch (e) {
1532
+ try {
1533
+ await this.setSessionStatus("failed");
1534
+ } catch {
1535
+ }
1536
+ throw e;
1537
+ }
1538
+ return rootFlow;
1539
+ }
1540
+ async gptHandler(command, cachingMode) {
301
1541
  try {
302
1542
  let conditionSucceeded = false;
303
1543
  while (!conditionSucceeded) {
304
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
305
- console.log(">> Asking GTP Driver for next action...");
1544
+ let screenshot;
1545
+ if (!this.useGptDriverCloud) {
1546
+ screenshot = await this.getScreenshot(this.appiumSessionConfig);
1547
+ }
1548
+ globalLogger.info("Requesting next action from GPT Driver...");
306
1549
  const response = await axios.request(
307
1550
  {
308
1551
  url: `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/execute`,
@@ -310,39 +1553,49 @@ class GptDriver {
310
1553
  data: {
311
1554
  api_key: this.apiKey,
312
1555
  command,
313
- base64_screenshot: screenshot
1556
+ base64_screenshot: screenshot,
1557
+ caching_mode: cachingMode ?? this.cachingMode,
1558
+ step_number: this.step_number
314
1559
  }
315
1560
  }
316
1561
  );
317
1562
  const executeStatus = response.data.status;
318
1563
  if (executeStatus === "failed") {
319
- const errorMessage = response?.data?.commands?.at(0)?.data;
1564
+ const errorMessage = response.data?.commands?.at(0)?.data;
1565
+ globalLogger.error(`Execution failed: ${errorMessage ?? "Unknown error"}`);
320
1566
  throw new Error(errorMessage ?? "Execution failed");
321
1567
  }
322
1568
  conditionSucceeded = executeStatus !== "inProgress";
323
1569
  const executeResponse = response.data;
324
- for (const command2 of executeResponse.commands) {
325
- await this.executeCommand(command2);
1570
+ globalLogger.debug(`Received ${executeResponse.commands.length} command(s) to execute`);
1571
+ for (const appiumCommand of executeResponse.commands) {
1572
+ await this.executeCommand(appiumCommand);
326
1573
  }
327
1574
  if (!conditionSucceeded) {
1575
+ globalLogger.debug("Command still in progress, waiting...");
328
1576
  await delay(1500);
329
1577
  }
330
1578
  }
1579
+ this.step_number = this.step_number + 1;
1580
+ globalLogger.info("Command execution completed successfully");
331
1581
  } catch (e) {
332
- await this.stopSession("failed");
1582
+ globalLogger.error("GPT handler failed", e);
1583
+ await this.setSessionStatus("failed");
333
1584
  throw e;
334
1585
  }
335
1586
  }
336
1587
  async executeCommand(command) {
337
- const firstAction = command.data.actions?.at(0);
1588
+ const firstAction = command.data?.actions?.at(0);
338
1589
  if (firstAction?.type === "pause" && firstAction.duration != null) {
1590
+ globalLogger.debug(`Pausing for ${firstAction.duration} seconds`);
339
1591
  await delay(firstAction * 1e3);
340
- } else {
1592
+ } else if (!this.useGptDriverCloud) {
341
1593
  const parsedUrl = new URL(command.url);
342
1594
  parsedUrl.protocol = this.appiumSessionConfig.serverUrl.protocol;
343
1595
  parsedUrl.host = this.appiumSessionConfig.serverUrl.host;
344
1596
  parsedUrl.port = this.appiumSessionConfig.serverUrl.port != "" ? `${this.appiumSessionConfig.serverUrl.port}` : "";
345
1597
  parsedUrl.pathname = this.appiumSessionConfig.serverUrl.pathname != "/" ? `${this.appiumSessionConfig.serverUrl.pathname}${parsedUrl.pathname}` : parsedUrl.pathname;
1598
+ globalLogger.debug(`Executing ${command.method} request to ${parsedUrl.pathname}`);
346
1599
  await axios.request({
347
1600
  url: parsedUrl.toString(),
348
1601
  method: command.method,
@@ -350,16 +1603,25 @@ class GptDriver {
350
1603
  });
351
1604
  }
352
1605
  }
353
- async getScreenshot(appiumSessionConfig) {
354
- const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/screenshot`);
355
- const screenshotResponse = await axios.get(url);
356
- let screenshot = await screenshotResponse.data.value;
357
- if (appiumSessionConfig.platform === "iOS") {
358
- const imageBuffer = Buffer.from(screenshot, "base64");
359
- const transformedImage = await sharp(imageBuffer).resize(appiumSessionConfig.size.width, appiumSessionConfig.size.height).toBuffer();
360
- screenshot = transformedImage.toString("base64");
1606
+ async logCodeExecution(screenshot, command) {
1607
+ try {
1608
+ const screenshot2 = await this.getScreenshot(this.appiumSessionConfig);
1609
+ await axios.post(`${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/log_code_execution`, {
1610
+ api_key: this.apiKey,
1611
+ base64_screenshot: screenshot2,
1612
+ command
1613
+ });
1614
+ } catch (e) {
1615
+ globalLogger.error("Failed to log code execution", e);
1616
+ }
1617
+ }
1618
+ async takeScreenshotAndLogCodeExecution(command) {
1619
+ try {
1620
+ const screenshot = await this.getScreenshot(this.appiumSessionConfig);
1621
+ await this.logCodeExecution(screenshot, command);
1622
+ } catch (e) {
1623
+ globalLogger.error("Failed to log code execution", e);
361
1624
  }
362
- return screenshot;
363
1625
  }
364
1626
  }
365
1627