gpt-driver-node 1.0.0-alpha.9 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1,7 +1,13 @@
1
1
  'use strict';
2
2
 
3
+ var node_fs = require('node:fs');
4
+ var path = require('node:path');
3
5
  var axios = require('axios');
4
6
  var sharp = require('sharp');
7
+ var webdriverio = require('webdriverio');
8
+ var winston = require('winston');
9
+ var zod = require('zod');
10
+ var crypto = require('node:crypto');
5
11
 
6
12
  const delay = async (milliseconds) => {
7
13
  await new Promise((resolve) => setTimeout(resolve, milliseconds));
@@ -17,12 +23,601 @@ function buildUrl(base, extraPath) {
17
23
  return `${baseUrl}${extraPath}`;
18
24
  }
19
25
 
26
/** ANSI escape sequences used to decorate console log output. */
const colors = {
  reset: "\x1B[0m",
  bold: "\x1B[1m",
  gray: "\x1B[90m",
  red: "\x1B[31m",
  green: "\x1B[32m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m"
};

/**
 * Text decorators. Each one prefixes the given value with one or more ANSI
 * codes from `colors` and appends the reset code.
 */
const logStyles = (() => {
  // Codes are read from `colors` at call time, like the literal templates would.
  const decorate = (...codeNames) => (text) => {
    const prefix = codeNames.map((codeName) => colors[codeName]).join("");
    return `${prefix}${text}${colors.reset}`;
  };
  return {
    bold: decorate("bold"),
    cyan: decorate("cyan"),
    yellow: decorate("yellow"),
    green: decorate("green"),
    red: decorate("red"),
    gray: decorate("gray"),
    highlight: decorate("bold", "cyan"),
    success: decorate("bold", "green"),
    error: decorate("bold", "red"),
    warning: decorate("bold", "yellow")
  };
})();
47
/**
 * Package-wide winston logger.
 *
 * The minimum level comes from the GPT_DRIVER_LOG_LEVEL environment variable
 * and falls back to "info". The logger-level format adds a timestamp and
 * error stack; the Console transport then renders the line with ANSI colors.
 */
const globalLogger = winston.createLogger({
  level: process.env.GPT_DRIVER_LOG_LEVEL || "info",
  format: winston.format.combine(
    winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
    winston.format.errors({ stack: true }),
    winston.format.printf(({ timestamp, level, message, stack }) => {
      const plainLine = `${timestamp} [${level.toUpperCase()}]: ${message}`;
      if (!stack) {
        return plainLine;
      }
      return `${plainLine}\n${stack}`;
    })
  ),
  transports: [
    new winston.transports.Console({
      format: winston.format.combine(
        winston.format.printf(({ timestamp, level, message, stack }) => {
          // Levels without an entry keep their plain upper-cased label.
          const painters = new Map([
            ["error", logStyles.error],
            ["warn", logStyles.warning],
            ["info", logStyles.cyan],
            ["debug", logStyles.gray]
          ]);
          const upperLevel = level.toUpperCase();
          const paint = painters.get(level);
          const levelLabel = paint ? paint(upperLevel) : upperLevel;
          const coloredLine = `${logStyles.gray(timestamp)} [${levelLabel}]: ${message}`;
          if (!stack) {
            return coloredLine;
          }
          return `${coloredLine}\n${logStyles.gray(stack)}`;
        })
      )
    })
  ]
});
86
+
87
// Fields every savable step may carry, whatever its type.
const SavableStepBaseSchema = zod.z.object({
  id: zod.z.number().optional(),
  descriptionText: zod.z.string().optional(),
  optional: zod.z.boolean().optional()
});

// Extends the base schema with a literal `type` discriminator plus the
// step-specific fields.
const defineSavableStep = (type, shape) => SavableStepBaseSchema.extend({
  type: zod.z.literal(type),
  ...shape
});

// Returns a fresh set of the fields that "tap" and "assert" steps have in common.
const savableElementTargetShape = () => ({
  elementId: zod.z.string().optional(),
  timeout: zod.z.number().optional(),
  useLlmOnly: zod.z.boolean().optional(),
  cropBase64: zod.z.string().optional()
});

const SavableTapStepSchema = defineSavableStep("tap", savableElementTargetShape());
const SavableAssertStepSchema = defineSavableStep("assert", savableElementTargetShape());
const SavableTypeStepSchema = defineSavableStep("type", {
  text: zod.z.string()
});
const SavableScrollStepSchema = defineSavableStep("scroll", {
  direction: zod.z.enum(["up", "down"])
});
const SavableZoomStepSchema = defineSavableStep("zoom", {
  direction: zod.z.enum(["in", "out"])
});
const SavableScrollUntilStepSchema = defineSavableStep("scrollUntil", {
  text: zod.z.string().optional(),
  elementId: zod.z.string().optional(),
  direction: zod.z.enum(["up", "down"]),
  maxScrolls: zod.z.number().optional()
});
const SavableDeeplinkStepSchema = defineSavableStep("deeplink", {
  url: zod.z.string()
});
const SavableAIStepSchema = defineSavableStep("ai", {
  instruction: zod.z.string()
});
const SavableFileRefStepSchema = defineSavableStep("fileRef", {
  path: zod.z.string(),
  overrides: zod.z.record(zod.z.string(), zod.z.string()).optional()
});

// Any single step, discriminated on its `type` field.
const SavableStepSchema = zod.z.discriminatedUnion("type", [
  SavableTapStepSchema, // type: 'tap'
  SavableAssertStepSchema, // type: 'assert'
  SavableTypeStepSchema, // type: 'type'
  SavableScrollStepSchema, // type: 'scroll'
  SavableZoomStepSchema, // type: 'zoom'
  SavableScrollUntilStepSchema, // type: 'scrollUntil'
  SavableDeeplinkStepSchema, // type: 'deeplink'
  SavableAIStepSchema, // type: 'ai'
  SavableFileRefStepSchema // type: 'fileRef'
]);

// A stored test: a name, its ordered steps and optional string parameters.
const SavableTestStoreSchema = zod.z.object({
  name: zod.z.string(),
  steps: zod.z.array(SavableStepSchema),
  params: zod.z.record(zod.z.string(), zod.z.string()).optional()
});
163
+
164
// Base URL of the cache service used by executeFromCache / populateCache.
const CACHE_SERVER_URL = "https://cache.mobileboost.io";
// Base URL of the GPT Driver API.
const GPT_DRIVER_BASE_URL = "https://api.mobileboost.io";
// Screenshots are shrunk by this factor before upload; command coordinates
// are scaled by the same factor in the opposite direction.
const RESCALE_FACTOR = 4;
// Upper bound on smart-loop iterations for a single step.
const SMART_LOOP_MAX_ITERATIONS = 15;
// Total time budget (ms) for cache lookups within one smart-loop iteration.
const CACHE_RETRY_MS = 2000;
// Pause (ms) between two consecutive cache lookups.
const CACHE_CHECK_INTERVAL_MS = 500;
170
+
171
/**
 * Derives the cache key for one test step.
 *
 * The key is the hex SHA-256 digest of the inputs concatenated without
 * separators: API key, file path, step number, description, lower-cased
 * platform and "<width>x<height>". A missing file path, platform or
 * resolution contributes an empty string.
 *
 * @param {string} apiKey
 * @param {string} [filepath]
 * @param {number} stepNumber
 * @param {string} description
 * @param {string} [platform]
 * @param {{width: number, height: number}} [resolution]
 * @returns {string} Hex-encoded SHA-256 digest.
 */
function generateCacheHash(apiKey, filepath, stepNumber, description, platform, resolution) {
  const platformPart = platform?.toLowerCase() || "";
  const resolutionPart = resolution ? `${resolution.width}x${resolution.height}` : "";
  // String.prototype.concat stringifies each argument the same way a template literal does.
  const material = "".concat(
    apiKey,
    filepath || "",
    stepNumber,
    description,
    platformPart,
    resolutionPart
  );
  const hasher = crypto.createHash("sha256");
  hasher.update(material);
  return hasher.digest("hex");
}
177
/**
 * Rescales the coordinates embedded in a command string by RESCALE_FACTOR.
 *
 * Two notations are handled:
 *  - named axes (`x=<n>` / `y=<n>`): every occurrence is rescaled;
 *  - a positional `<x>;<y>` pair delimited by `;` or the string boundaries:
 *    only the first such pair is rescaled.
 * Commands that contain neither are returned unchanged.
 *
 * @param {string} cmd - Command to rescale.
 * @param {string} operation - "multiply" scales up; any other value scales down.
 * @returns {string} The command with rounded, rescaled coordinates.
 */
function scaleCommand(cmd, operation) {
  const rescale = (digits) => {
    const value = parseInt(digits, 10);
    const result = operation === "multiply" ? value * RESCALE_FACTOR : value / RESCALE_FACTOR;
    return Math.round(result);
  };

  const hasNamedAxes = cmd.match(/([xy])=(\d+)/) !== null;
  if (hasNamedAxes) {
    return cmd.replace(/([xy])=(\d+)/g, (_whole, axis, digits) => `${axis}=${rescale(digits)}`);
  }

  return cmd.replace(
    /(^|;)(\d+);(\d+)(;|$)/,
    (_whole, lead, xDigits, yDigits, tail) => `${lead}${rescale(xDigits)};${rescale(yDigits)}${tail}`
  );
}
205
/**
 * Shrinks a base64 screenshot for the cache service.
 *
 * An optional `data:image/...;base64,` prefix is removed before decoding. The
 * target width is the original width divided by RESCALE_FACTOR (1080 is
 * assumed when the metadata has no width), and the image is never enlarged.
 *
 * @param {string} screenshotBase64 - Base64 image, with or without a data-URL prefix.
 * @returns {Promise<Buffer>} The resized image bytes.
 */
async function resizeScreenshotForCache(screenshotBase64) {
  const rawBase64 = screenshotBase64.replace(/^data:image\/\w+;base64,/, "");
  const imageBytes = Buffer.from(rawBase64, "base64");
  const { width } = await sharp(imageBytes).metadata();
  const targetWidth = Math.round((width ?? 1080) / RESCALE_FACTOR);
  const resizeOptions = { width: targetWidth, withoutEnlargement: true };
  return sharp(imageBytes).resize(resizeOptions).toBuffer();
}
215
+
216
/**
 * Asks the cache service for commands recorded for this step and screen.
 *
 * The resized screenshot and the step's cache hash are posted as multipart
 * form data to `${CACHE_SERVER_URL}/execute-from-cache`. Returned commands
 * are scaled back up with `scaleCommand(cmd, "multiply")`.
 *
 * Never throws: any failure is logged and reported as a cache miss.
 *
 * @param {object} params
 * @param {string} params.apiKey
 * @param {string} [params.filepath]
 * @param {number} params.stepNumber
 * @param {string} params.stepDescription
 * @param {string} [params.platform]
 * @param {{width: number, height: number}} [params.screenResolution]
 * @param {string} params.screenshot - Base64 screenshot.
 * @param {number} [params.highestUsedIndex] - Sent as `highest_used_index` when not null/undefined.
 * @returns {Promise<{found: boolean, cacheCommands?: string[], cacheIndex?: *}>}
 */
async function executeFromCache(params) {
  try {
    const hash = generateCacheHash(
      params.apiKey,
      params.filepath,
      params.stepNumber,
      params.stepDescription,
      params.platform,
      params.screenResolution
    );
    const resizedBuffer = await resizeScreenshotForCache(params.screenshot);
    const formData = new FormData();
    formData.append("hash", hash);
    const blob = new Blob([new Uint8Array(resizedBuffer)], { type: "image/png" });
    const blobSizeMB = (blob.size / (1024 * 1024)).toFixed(2);
    globalLogger.debug(`[Cache] Executing from cache with screenshot size: ${blobSizeMB} MB`);
    formData.append("screenshot", blob, "screenshot.png");
    if (params.highestUsedIndex !== void 0 && params.highestUsedIndex !== null) {
      globalLogger.debug(`[Cache] Sending highest_used_index: ${params.highestUsedIndex}`);
      formData.append("highest_used_index", String(params.highestUsedIndex));
    }
    const response = await axios.post(`${CACHE_SERVER_URL}/execute-from-cache`, formData);
    const result = response.data;
    if (result.found && result.cacheCommands) {
      const scaledCommands = result.cacheCommands.map(
        (cmd) => scaleCommand(cmd, "multiply")
      );
      return {
        found: true,
        cacheCommands: scaledCommands,
        cacheIndex: result.cacheIndex
      };
    }
    return { found: false };
  } catch (error) {
    if (axios.isAxiosError(error)) {
      // A parsed JSON error body would otherwise be interpolated as "[object Object]".
      const detail = error.response?.data || error.message;
      const detailText = typeof detail === "string" ? detail : JSON.stringify(detail);
      globalLogger.warn(`[Cache] Cache lookup failed: ${detailText}`);
    } else {
      globalLogger.error(`[Cache] Error executing from cache: ${error}`);
    }
    return { found: false };
  }
}
259
/**
 * Uploads the frames recorded for a step to the cache service.
 *
 * Each frame's screenshot is resized and base64-encoded, and its commands are
 * scaled down with `scaleCommand(cmd, "divide")`. The payload is posted to
 * `${CACHE_SERVER_URL}/populate-cache` with the step's cache hash as the
 * `hash` query parameter.
 *
 * Never throws: failures are logged and reported as `{ success: false }`.
 *
 * @param {object} params
 * @param {string} params.apiKey
 * @param {string} [params.filepath]
 * @param {number} params.stepNumber
 * @param {string} params.stepDescription
 * @param {string} [params.platform]
 * @param {{width: number, height: number}} [params.screenResolution]
 * @param {Array<{screenshot: string, commands: string[]}>} params.executionData
 * @returns {Promise<{success: boolean}>}
 */
async function populateCache(params) {
  try {
    const hash = generateCacheHash(
      params.apiKey,
      params.filepath,
      params.stepNumber,
      params.stepDescription,
      params.platform,
      params.screenResolution
    );
    const payload = await Promise.all(params.executionData.map(async (item) => {
      const resizedBuffer = await resizeScreenshotForCache(item.screenshot);
      const scaledCommands = item.commands.map(
        (cmd) => scaleCommand(cmd, "divide")
      );
      return {
        screenshot: resizedBuffer.toString("base64"),
        commands: scaledCommands
      };
    }));
    const payloadSizeMB = (JSON.stringify(payload).length / (1024 * 1024)).toFixed(2);
    globalLogger.debug(`[Cache] Populating cache with payload size: ~${payloadSizeMB} MB (Hash: ${hash})`);
    await axios.post(`${CACHE_SERVER_URL}/populate-cache`, payload, {
      params: { hash }
    });
    return { success: true };
  } catch (error) {
    if (axios.isAxiosError(error)) {
      // A parsed JSON error body would otherwise be interpolated as "[object Object]".
      const detail = error.response?.data || error.message;
      const detailText = typeof detail === "string" ? detail : JSON.stringify(detail);
      globalLogger.error(`[Cache] Failed to populate cache: ${detailText}`);
    } else {
      globalLogger.error(`[Cache] Error populating cache: ${error}`);
    }
    return { success: false };
  }
}
294
+
295
// Endpoint that returns the next agent action for a screenshot + instruction.
const AI_AGENT_ENDPOINT = "https://api.mobileboost.io/call_lambda";

/**
 * Sends the current screenshot and instruction to the AI agent backend and
 * returns its response.
 *
 * The screenshot is downscaled by RESCALE_FACTOR before upload; coordinates
 * in the returned `appetizeCommands` are scaled back up with
 * `scaleCommand(cmd, "multiply")`.
 *
 * @param {object} params
 * @param {string} params.apiKey - Sent to the backend as `orgKey`.
 * @param {string} params.base64_screenshot - Base64 image, with or without a data-URL prefix.
 * @param {string} params.instruction - Instruction for the single step to perform.
 * @param {string[]} params.action_history - Forwarded unchanged to the backend.
 * @returns {Promise<object>} The backend response body.
 * @throws {Error} "AI Backend Error: ..." for axios failures (the axios error is
 *   attached as `cause`); any other error is rethrown unchanged.
 */
async function executeAgentStep(params) {
  // Strip an optional data-URL prefix, as resizeScreenshotForCache does for the
  // same screenshot; decoding the prefix as base64 would corrupt the image bytes.
  const rawBase64 = params.base64_screenshot.replace(/^data:image\/\w+;base64,/, "");
  const imageBuffer = Buffer.from(rawBase64, "base64");
  const metadata = await sharp(imageBuffer).metadata();
  const originalWidth = metadata.width ?? 1080;
  const originalHeight = metadata.height ?? 1920;
  const desiredWidth = Math.round(originalWidth / RESCALE_FACTOR);
  const resizedBuffer = await sharp(imageBuffer).resize({ width: desiredWidth, withoutEnlargement: true }).toBuffer();
  const resizedMetadata = await sharp(resizedBuffer).metadata();
  const resizedWidth = resizedMetadata.width ?? desiredWidth;
  // Fallback keeps the original aspect ratio when the resized metadata has no height.
  const resizedHeight = resizedMetadata.height ?? Math.round(originalHeight * (desiredWidth / originalWidth));
  globalLogger.debug(`[AI Client] Resized screenshot: ${originalWidth}x${originalHeight} -> ${resizedWidth}x${resizedHeight}`);
  const payload = {
    lambda_flow: "get_next_step",
    current_date: (/* @__PURE__ */ new Date()).toLocaleDateString("en-GB", {
      day: "numeric",
      month: "long",
      year: "numeric"
    }),
    base64_screenshot: resizedBuffer.toString("base64"),
    getUI_elements: [],
    uiHierarchy: [],
    test_task_string: JSON.stringify([
      {
        id: "step-1",
        text: `1. ${params.instruction}`,
        plainText: params.instruction
      }
    ]),
    image_width: resizedWidth,
    image_height: resizedHeight,
    action_history: params.action_history,
    orgKey: params.apiKey,
    template_images: {},
    model_provider: "vellum",
    model_version: "claude-agent",
    fallbackModel: "claude-agent",
    utilize_fullTextAnnotation: false,
    enableSortingOCR: true,
    enableActionHistoryCut: true,
    removeOverlappingText: false,
    currentAndPreviousScreenMatch: false,
    popupDetectionEnabled: true,
    ocrProvider: "gcp"
  };
  globalLogger.debug(`[AI Client] Sending request to ${AI_AGENT_ENDPOINT}`);
  try {
    const response = await axios.post(
      AI_AGENT_ENDPOINT,
      payload,
      {
        headers: {
          "Content-Type": "application/json"
        }
      }
    );
    const result = response.data;
    globalLogger.debug("[AI Client] Received response from backend");
    if (result.appetizeCommands) {
      result.appetizeCommands = result.appetizeCommands.map(
        (cmd) => scaleCommand(cmd, "multiply")
      );
    }
    return result;
  } catch (error) {
    if (axios.isAxiosError(error)) {
      const status = error.response?.status ?? "unknown";
      const errorText = error.response?.data ?? error.message;
      globalLogger.error(`[AI Client] Backend error (${status}): ${JSON.stringify(errorText)}`);
      // Keep the axios error reachable for callers that need the response or stack.
      throw new Error(`AI Backend Error: ${status} - ${error.message}`, { cause: error });
    }
    throw error;
  }
}
369
+
370
/**
 * Extracts tap coordinates from a command.
 *
 * Named axes (`x=<n>` and `y=<n>`) take precedence. Otherwise the command is
 * split on `;` and the second and third fields are parsed as x and y.
 *
 * @param {string} cmd
 * @returns {{x: number, y: number} | null} Null when no coordinate pair is found.
 */
function parseTapCoordinates(cmd) {
  const xCapture = /x=(\d+)/.exec(cmd);
  const yCapture = /y=(\d+)/.exec(cmd);
  if (xCapture !== null && yCapture !== null) {
    return {
      x: parseInt(xCapture[1], 10),
      y: parseInt(yCapture[1], 10)
    };
  }

  // Positional form: fields 1 and 2 of the `;`-separated command.
  const [, rawX, rawY] = cmd.split(";");
  const x = parseInt(rawX, 10);
  const y = parseInt(rawY, 10);
  if (Number.isNaN(x) || Number.isNaN(y)) {
    return null;
  }
  return { x, y };
}
389
/**
 * Reads the number of seconds from a `wait: <n>` command.
 *
 * @param {string} cmd
 * @returns {number | null} Null when the command has no `wait:` value.
 */
function parseWaitSeconds(cmd) {
  const found = /wait:\s*(\d+)/.exec(cmd);
  if (found === null) {
    return null;
  }
  return parseInt(found[1], 10);
}
393
/**
 * Reads the direction from a `scroll: up|down` command (case-insensitive).
 *
 * @param {string} cmd
 * @returns {string | null} "up" or "down" in lower case, or null when absent.
 */
function parseScrollDirection(cmd) {
  const found = /scroll:\s*(up|down)/i.exec(cmd);
  if (found === null) {
    return null;
  }
  const [, direction] = found;
  return direction.toLowerCase();
}
397
/**
 * Extracts the text payload of a `type:` command.
 *
 * Whitespace directly after `type:` is skipped. The payload may span several
 * lines: the capture uses `[\s\S]` because `.` does not match line breaks,
 * which made a multi-line payload look like "no text".
 *
 * @param {string} cmd
 * @returns {string | null} The text to type, or null when `cmd` is not a
 *   `type:` command or carries no text.
 */
function parseTypeText(cmd) {
  const match = cmd.match(/^type:\s*([\s\S]+)$/);
  return match ? match[1] : null;
}
401
/** True when the command contains "task complete:" (case-insensitive). */
function isTaskComplete(cmd) {
  const normalized = cmd.toLowerCase();
  return normalized.indexOf("task complete:") !== -1;
}
404
/** True when the command contains "error detected:" (case-insensitive). */
function isErrorDetected(cmd) {
  const normalized = cmd.toLowerCase();
  return normalized.indexOf("error detected:") !== -1;
}
407
/** True when the command begins with the exact prefix "remember:". */
function isRememberCommand(cmd) {
  return cmd.indexOf("remember:") === 0;
}
410
/** True when the command begins with "tapOn:" or the "tabOn:" variant. */
function isTapCommand(cmd) {
  const tapPrefix = /^t(ap|ab)On:/;
  return tapPrefix.test(cmd);
}
413
/** True when the command begins with the exact prefix "wait:". */
function isWaitCommand(cmd) {
  return cmd.indexOf("wait:") === 0;
}
416
/** True when the command begins with the exact prefix "scroll:". */
function isScrollCommand(cmd) {
  return cmd.indexOf("scroll:") === 0;
}
419
/** True when the command begins with the exact prefix "type:". */
function isTypeCommand(cmd) {
  return cmd.indexOf("type:") === 0;
}
422
+
423
/**
 * Runs one test step until the agent signals completion, an error occurs, or
 * SMART_LOOP_MAX_ITERATIONS iterations have run.
 *
 * Each iteration:
 *  1. takes a screenshot and polls the cache up to
 *     floor(CACHE_RETRY_MS / CACHE_CHECK_INTERVAL_MS) times;
 *  2. on a miss, asks the AI agent for commands and appends its reasoning
 *     output to the action history;
 *  3. records the frame, logs it through `ctx.logCodeExecution`, and executes
 *     the tap / wait / scroll / type commands via the `ctx` callbacks.
 * When a "task complete:" command is seen and at least one iteration missed
 * the cache, the recorded frames are uploaded with `populateCache`.
 *
 * Never throws: failures are reported in the result object.
 *
 * @param {object} ctx - Smart-loop context: `organisationId`, `platform`,
 *   `screenSize`, `globalActionHistory`, and the `getScreenshot`, `performTap`,
 *   `performScroll`, `performType` and `logCodeExecution` callbacks.
 * @param {object} params
 * @param {number} params.stepNumber
 * @param {string} params.description - Step description (part of the cache key).
 * @param {string} params.instruction - Instruction sent to the AI agent.
 * @param {string} [params.filepath] - Test file path (part of the cache key).
 * @returns {Promise<{success: boolean, iterations: number, cacheHit: boolean, error?: string}>}
 */
async function executeSmartLoop(ctx, params) {
  const maxCacheAttempts = Math.floor(CACHE_RETRY_MS / CACHE_CHECK_INTERVAL_MS);
  let loopCount = 0;
  let actionHistory = [...ctx.globalActionHistory];
  let lastCacheIndex = void 0;
  let anyCacheMiss = false;
  let everHadCacheHit = false;
  const currentExecutionData = [];
  globalLogger.info(`[SmartLoop] Starting for step ${params.stepNumber}: "${params.description}"`);
  try {
    while (loopCount < SMART_LOOP_MAX_ITERATIONS) {
      let screenshot = "";
      let commands = [];
      let isCacheHit = false;
      // Poll the cache with a fresh screenshot on every attempt.
      for (let attempt = 0; attempt < maxCacheAttempts; attempt++) {
        screenshot = await ctx.getScreenshot();
        // A base64 string is roughly 4/3 the size of the bytes it encodes.
        const sizeInBytes = screenshot.length * 0.75;
        const sizeInMB = (sizeInBytes / (1024 * 1024)).toFixed(2);
        globalLogger.debug(`[SmartLoop] Captured screenshot: ~${sizeInMB} MB`);
        try {
          globalLogger.debug(`[SmartLoop] Checking cache (Attempt ${attempt + 1}/${maxCacheAttempts})`);
          const cacheResult = await executeFromCache({
            apiKey: ctx.organisationId,
            stepNumber: params.stepNumber,
            stepDescription: params.description,
            screenshot,
            screenResolution: ctx.screenSize,
            highestUsedIndex: lastCacheIndex,
            platform: ctx.platform,
            filepath: params.filepath
          });
          if (cacheResult.found && cacheResult.cacheCommands) {
            commands = cacheResult.cacheCommands;
            lastCacheIndex = cacheResult.cacheIndex;
            isCacheHit = true;
            everHadCacheHit = true;
            globalLogger.info(`[SmartLoop] Cache Hit! (${commands.length} commands)`);
            break;
          }
        } catch (e) {
          globalLogger.warn(`[SmartLoop] Cache check failed: ${e.message}`);
        }
        if (attempt < maxCacheAttempts - 1) {
          globalLogger.debug(`[SmartLoop] Cache miss, retrying in ${CACHE_CHECK_INTERVAL_MS}ms...`);
          await delay(CACHE_CHECK_INTERVAL_MS);
        }
      }
      let aiCommands = [];
      if (!isCacheHit) {
        anyCacheMiss = true;
        globalLogger.info(`[SmartLoop] Cache Miss. Requesting AI agent...`);
        const agentResponse = await executeAgentStep({
          apiKey: ctx.organisationId,
          base64_screenshot: screenshot,
          instruction: params.instruction,
          action_history: actionHistory
        });
        aiCommands = agentResponse.appetizeCommands || [];
        const gptCommands = agentResponse.gptCommands || [];
        // Only the entries from the first "reasoning:" line onward enter the history.
        const reasoningIndex = gptCommands.findIndex((entry) => entry.startsWith("reasoning:"));
        if (reasoningIndex !== -1) {
          const parsedCommands = gptCommands.slice(reasoningIndex);
          const rememberCommands = parsedCommands.filter((cmd) => isRememberCommand(cmd));
          if (rememberCommands.length > 0) {
            ctx.globalActionHistory.push(...rememberCommands);
          }
          actionHistory = [...actionHistory, ...parsedCommands];
        }
        commands = [...aiCommands];
        globalLogger.debug(`[SmartLoop] AI returned ${commands.length} command(s)`);
      }
      currentExecutionData.push({
        screenshot,
        commands: aiCommands.length > 0 ? aiCommands : commands
      });
      await ctx.logCodeExecution(screenshot, commands.join("\n"));
      let actionExecuted = false;
      let taskCompleted = false;
      if (commands.length > 0) {
        globalLogger.debug(`[SmartLoop] Executing ${commands.length} command(s)`);
      }
      for (const cmd of commands) {
        if (isTaskComplete(cmd)) {
          taskCompleted = true;
          globalLogger.info(`[SmartLoop] Task completed signal received`);
          continue;
        }
        if (isErrorDetected(cmd)) {
          throw new Error(`AI Reported Error: ${cmd}`);
        }
        if (isRememberCommand(cmd)) {
          ctx.globalActionHistory.push(cmd);
        }
        if (isTapCommand(cmd)) {
          const coords = parseTapCoordinates(cmd);
          if (coords) {
            globalLogger.debug(`[SmartLoop] Executing tap at (${coords.x}, ${coords.y})`);
            await ctx.performTap(coords.x, coords.y);
            actionExecuted = true;
          }
        } else if (isWaitCommand(cmd)) {
          const seconds = parseWaitSeconds(cmd);
          if (seconds) {
            globalLogger.debug(`[SmartLoop] Waiting ${seconds}s`);
            await delay(seconds * 1e3);
            actionExecuted = true;
          }
        } else if (isScrollCommand(cmd)) {
          const direction = parseScrollDirection(cmd);
          if (direction) {
            globalLogger.debug(`[SmartLoop] Scrolling ${direction}`);
            await ctx.performScroll(direction);
            actionExecuted = true;
          }
        } else if (isTypeCommand(cmd)) {
          const text = parseTypeText(cmd);
          if (text) {
            globalLogger.debug(`[SmartLoop] Typing text`);
            await ctx.performType(text);
            actionExecuted = true;
          }
        }
      }
      if (actionExecuted) {
        // Cached commands never went through the agent, so record them here.
        if (isCacheHit) {
          actionHistory.push(...commands);
        }
        await delay(100);
      }
      if (taskCompleted) {
        globalLogger.info(`[SmartLoop] Task completed successfully`);
        if (anyCacheMiss && currentExecutionData.length > 0) {
          globalLogger.info(`[SmartLoop] Populating cache with ${currentExecutionData.length} frame(s)...`);
          try {
            await populateCache({
              apiKey: ctx.organisationId,
              stepNumber: params.stepNumber,
              stepDescription: params.description,
              executionData: currentExecutionData,
              screenResolution: ctx.screenSize,
              platform: ctx.platform,
              filepath: params.filepath
            });
            globalLogger.debug(`[SmartLoop] Cache populated successfully`);
          } catch (e) {
            globalLogger.warn(`[SmartLoop] Failed to populate cache: ${e.message}`);
          }
        } else if (!anyCacheMiss) {
          globalLogger.debug(`[SmartLoop] Skipping cache population (all actions were cached)`);
        }
        return {
          success: true,
          iterations: loopCount + 1,
          cacheHit: everHadCacheHit
        };
      }
      loopCount++;
    }
    throw new Error(`Smart Loop timeout after ${SMART_LOOP_MAX_ITERATIONS} iterations`);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    globalLogger.error(`[SmartLoop] Error: ${message}`);
    return {
      success: false,
      error: message,
      // Mid-iteration failures happen in iteration loopCount + 1. After a timeout
      // loopCount already equals the number of iterations run, so cap the count.
      iterations: Math.min(loopCount + 1, SMART_LOOP_MAX_ITERATIONS),
      cacheHit: everHadCacheHit
    };
  }
}
593
+
20
594
  class GptDriver {
595
+ interpolateTemplate(input, params) {
596
+ if (typeof input !== "string" || !input.includes("{{")) return input;
597
+ const pattern = /{{\s*([^}]+?)\s*}}/g;
598
+ return input.replace(pattern, (_match, keyRaw) => {
599
+ const key = String(keyRaw);
600
+ if (!(key in params)) {
601
+ throw new Error(`Missing flow param: {{${key}}}`);
602
+ }
603
+ return params[key];
604
+ });
605
+ }
21
606
  apiKey;
607
+ organisationId;
22
608
  gptDriverSessionId;
23
609
  gptDriverBaseUrl;
24
610
  appiumSessionConfig;
611
+ cachingMode;
25
612
  driver;
613
+ appiumSessionStarted;
614
+ useGptDriverCloud;
615
+ gptDriverCloudConfig;
616
+ buildId;
617
+ testId;
618
+ step_number = 1;
619
+ // Smart loop state - maintains action history across steps for context
620
+ globalActionHistory = [];
26
621
  /**
27
622
  * Creates an instance of the GptDriver class.
28
623
  *
@@ -34,6 +629,7 @@ class GptDriver {
34
629
  *
35
630
  * @param {GptDriverConfig} config - The configuration object for initializing the GptDriver instance. This includes:
36
631
  * - `apiKey`: The API key for authenticating requests to the GPT Driver server.
632
+ * - `organisationId` (optional): The Organisation Identifier for authenticating requests to the GPT Driver Cache Server.
37
633
  * - `driver` (optional): An existing WebDriver instance.
38
634
  * - `serverConfig` (optional): Configuration for the Appium server, including URL and device settings.
39
635
  * @throws {Error} If a WebDriver instance is provided without a server URL, or if neither a WebDriver instance nor
@@ -42,19 +638,35 @@ class GptDriver {
42
638
  * - `device.platform`: The platform name of the device (e.g., iOS, Android).
43
639
  */
44
640
  constructor(config) {
641
+ this.testId = config.testId;
45
642
  this.apiKey = config.apiKey;
46
- this.gptDriverBaseUrl = "https://api.mobileboost.io";
47
- this.initializeDriver(config);
48
- this.initializeAppiumConfig(config);
643
+ this.organisationId = config.organisationId;
644
+ this.buildId = config.buildId;
645
+ this.useGptDriverCloud = config.useGptDriverCloud;
646
+ this.gptDriverBaseUrl = GPT_DRIVER_BASE_URL;
647
+ this.cachingMode = config.cachingMode ?? "NONE";
648
+ if (config.useGptDriverCloud) {
649
+ if (config.serverConfig.device?.platform == null) {
650
+ throw new Error("Platform is missing. Please specify the platform when using GPTDriver Cloud.");
651
+ }
652
+ this.gptDriverCloudConfig = {
653
+ platform: config.serverConfig.device.platform,
654
+ deviceName: config.serverConfig.device.deviceName,
655
+ platformVersion: config.serverConfig.device.platformVersion
656
+ };
657
+ } else {
658
+ this.initializeDriver(config);
659
+ this.initializeAppiumConfig(config);
660
+ }
49
661
  }
50
662
  initializeDriver(config) {
51
663
  if (config.driver) {
52
664
  this.driver = config.driver;
53
- if (!config.severConfig?.url) {
665
+ if (!config.serverConfig.url) {
54
666
  throw new Error("Server url is missing. Please specify the server url when providing a driver.");
55
667
  }
56
668
  } else {
57
- const isValidServerConfig = config.severConfig?.url && config.severConfig.device?.platform;
669
+ const isValidServerConfig = config.serverConfig.url && config.serverConfig.device?.platform;
58
670
  if (!isValidServerConfig) {
59
671
  throw new Error("Either provide a driver, or a valid severConfig object.");
60
672
  }
@@ -63,10 +675,10 @@ class GptDriver {
63
675
  initializeAppiumConfig(config) {
64
676
  const defaultPort = parseInt(process.env.APPIUM_PORT ?? "4723", 10);
65
677
  const defaultHost = process.env.APPIUM_HOST ?? "127.0.0.1";
66
- let serverUrl = config.severConfig?.url instanceof URL ? config.severConfig.url : new URL(config.severConfig?.url ?? `http://${defaultHost}:${defaultPort}`);
678
+ const serverUrl = config.serverConfig.url instanceof URL ? config.serverConfig.url : new URL(config.serverConfig.url ?? `http://${defaultHost}:${defaultPort}`);
67
679
  this.appiumSessionConfig = {
68
680
  serverUrl,
69
- ...config.severConfig?.device
681
+ ...config.serverConfig.device
70
682
  };
71
683
  }
72
684
  /**
@@ -76,50 +688,53 @@ class GptDriver {
76
688
  * @throws {Error} If the session cannot be started or the driver is not properly initialized.
77
689
  */
78
690
  async startSession() {
79
- console.log(">> Starting session...");
80
- if (this.driver) {
81
- let platform;
82
- let platformVersion;
83
- let deviceName;
84
- let sessionId;
85
- if (this.driver.sessionId == null) {
86
- const driver = this.driver;
87
- const capabilities = await driver.getCapabilities();
88
- platform = capabilities.get("platformName");
89
- platformVersion = capabilities.get("platformVersion") ?? this.appiumSessionConfig?.platformVersion;
90
- deviceName = this.appiumSessionConfig?.deviceName ?? capabilities.get("deviceName");
91
- const session = await driver.getSession();
92
- sessionId = session.getId();
691
+ globalLogger.info("Starting session...");
692
+ if (!this.useGptDriverCloud) {
693
+ if (this.driver) {
694
+ let platform;
695
+ let platformVersion;
696
+ let deviceName;
697
+ let sessionId;
698
+ if (this.driver.sessionId == null) {
699
+ const driver = this.driver;
700
+ const capabilities = await driver.getCapabilities();
701
+ platform = capabilities.get("platformName");
702
+ platformVersion = capabilities.get("platformVersion") ?? this.appiumSessionConfig?.platformVersion;
703
+ deviceName = this.appiumSessionConfig?.deviceName ?? capabilities.get("deviceName");
704
+ const session = await driver.getSession();
705
+ sessionId = session.getId();
706
+ } else {
707
+ const driver = this.driver;
708
+ platform = driver.capabilities["appium:platformName"] ?? driver.capabilities["platformName"];
709
+ platformVersion = driver.capabilities["appium:platformVersion"] ?? driver.capabilities["platformVersion"];
710
+ deviceName = this.appiumSessionConfig?.deviceName ?? driver.capabilities["appium:deviceName"] ?? driver.capabilities["deviceName"];
711
+ sessionId = driver.sessionId;
712
+ }
713
+ this.appiumSessionConfig = {
714
+ ...this.appiumSessionConfig,
715
+ id: sessionId,
716
+ platform,
717
+ platformVersion,
718
+ deviceName
719
+ };
720
+ globalLogger.debug(`Session config: ${JSON.stringify(this.appiumSessionConfig)}`);
93
721
  } else {
94
- const driver = this.driver;
95
- platform = driver.capabilities["appium:platformName"];
96
- platformVersion = driver.capabilities["appium:platformVersion"];
97
- deviceName = this.appiumSessionConfig?.deviceName ?? driver.capabilities["appium:deviceName"] ?? "";
98
- sessionId = driver.sessionId;
99
- }
100
- this.appiumSessionConfig = {
101
- ...this.appiumSessionConfig,
102
- id: sessionId,
103
- platform,
104
- platformVersion,
105
- deviceName
722
+ this.appiumSessionConfig.id = await this.createSession();
723
+ }
724
+ const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/window/rect`);
725
+ const rectResponse = await axios.get(url);
726
+ this.appiumSessionConfig.size = {
727
+ width: rectResponse.data.value.width,
728
+ height: rectResponse.data.value.height
106
729
  };
107
- } else {
108
- this.appiumSessionConfig.id = await this.createSession();
730
+ this.appiumSessionStarted = true;
109
731
  }
110
732
  await this.createGptDriverSession();
111
- const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/window/rect`);
112
- const rectResponse = await axios.get(
113
- url
114
- );
115
- this.appiumSessionConfig.size = {
116
- width: rectResponse.data.value.width,
117
- height: rectResponse.data.value.height
118
- };
119
- console.log(`>> Session created. Monitor execution at: ${this.getSessionLink()}`);
733
+ globalLogger.info(logStyles.highlight(`Session created. Monitor execution at: ${this.getSessionLink()}`));
120
734
  }
121
735
  async createSession() {
122
736
  const { platform, deviceName, platformVersion, serverUrl } = this.appiumSessionConfig;
737
+ globalLogger.debug(`Creating Appium session for ${platform} ${platformVersion} on ${deviceName}`);
123
738
  const url = buildUrl(serverUrl, `/session`);
124
739
  const response = await axios.post(
125
740
  url,
@@ -134,22 +749,41 @@ class GptDriver {
134
749
  }
135
750
  }
136
751
  );
137
- return response.data.value.sessionId;
752
+ const sessionId = response.data.value.sessionId;
753
+ globalLogger.debug(`Appium session created with ID: ${sessionId}`);
754
+ return sessionId;
138
755
  }
139
756
  async createGptDriverSession() {
757
+ globalLogger.debug("Creating GPT Driver session...");
140
758
  const response = await axios.post(
141
759
  `${this.gptDriverBaseUrl}/sessions/create`,
142
760
  {
761
+ ...this.testId && { test_id: this.testId },
143
762
  api_key: this.apiKey,
144
- appium_session_id: this.appiumSessionConfig.id,
763
+ appium_session_id: this.appiumSessionConfig?.id,
145
764
  device_config: {
146
- platform: this.appiumSessionConfig.platform,
147
- device: this.appiumSessionConfig.deviceName,
148
- os: this.appiumSessionConfig.platformVersion
149
- }
765
+ platform: this.appiumSessionConfig?.platform ?? this.gptDriverCloudConfig?.platform,
766
+ device: this.appiumSessionConfig?.deviceName ?? this.gptDriverCloudConfig?.deviceName,
767
+ os: this.appiumSessionConfig?.platformVersion ?? this.gptDriverCloudConfig?.platformVersion
768
+ },
769
+ use_internal_virtual_device: this.useGptDriverCloud,
770
+ build_id: this.buildId,
771
+ caching_mode: this.cachingMode
150
772
  }
151
773
  );
152
774
  this.gptDriverSessionId = response.data.sessionId;
775
+ globalLogger.debug(`GPT Driver session created with ID: ${this.gptDriverSessionId}`);
776
+ if (this.useGptDriverCloud) {
777
+ const parsedUrl = new URL(response.data.appiumServerUrl);
778
+ this.driver = await webdriverio.attach({
779
+ options: {
780
+ hostname: parsedUrl.hostname,
781
+ path: parsedUrl.pathname
782
+ },
783
+ sessionId: response.data.appiumSessionId
784
+ });
785
+ this.appiumSessionStarted = true;
786
+ }
153
787
  }
154
788
  getSessionLink() {
155
789
  return `https://app.mobileboost.io/gpt-driver/sessions/${this.gptDriverSessionId}`;
@@ -165,20 +799,178 @@ class GptDriver {
165
799
  *
166
800
  * @throws {Error} If the request to stop the session fails.
167
801
  */
168
- async stopSession(status) {
169
- console.log(">> Stopping session...");
170
- await axios.post(
171
- `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/stop`,
802
+ async setSessionStatus(status) {
803
+ if (this.gptDriverSessionId) {
804
+ globalLogger.info(`Stopping session with status: ${status}`);
805
+ await axios.post(
806
+ `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/stop`,
807
+ {
808
+ api_key: this.apiKey,
809
+ status
810
+ }
811
+ );
812
+ globalLogger.info("Session stopped successfully");
813
+ this.appiumSessionStarted = false;
814
+ this.gptDriverSessionId = void 0;
815
+ this.step_number = 1;
816
+ this.globalActionHistory = [];
817
+ }
818
+ }
819
+ // ─────────────────────────────────────────────────────────────────────────────
820
+ // SMART LOOP INTEGRATION
821
+ // ─────────────────────────────────────────────────────────────────────────────
822
+ /**
823
+ * Creates a SmartLoopContext for the current session.
824
+ * This context provides all the callbacks needed by the smart loop executor.
825
+ */
826
+ createSmartLoopContext() {
827
+ if (!this.organisationId) {
828
+ throw new Error("Organisation ID is missing, please set it in the GPTDriver constructor");
829
+ }
830
+ return {
831
+ apiKey: this.apiKey,
832
+ platform: this.appiumSessionConfig?.platform,
833
+ screenSize: this.appiumSessionConfig.size,
834
+ globalActionHistory: this.globalActionHistory,
835
+ getScreenshot: () => this.getScreenshot(this.appiumSessionConfig),
836
+ performTap: (x, y) => this.performTap(x, y),
837
+ performScroll: (direction) => this.performScroll(direction),
838
+ performType: (text) => this.performType(text),
839
+ logCodeExecution: async (screenshot, command) => this.logCodeExecution(screenshot, command),
840
+ organisationId: this.organisationId
841
+ };
842
+ }
843
+ /**
844
+ * Calls the AI agent to determine the next actions based on the current screenshot.
845
+ * This requires an active GPT Driver session.
846
+ */
847
+ async executeAgentStep(params) {
848
+ const response = await axios.post(
849
+ `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/agent/execute`,
172
850
  {
173
851
  api_key: this.apiKey,
174
- status
852
+ base64_screenshot: params.screenshot.replace(/^data:image\/\w+;base64,/, ""),
853
+ instruction: params.instruction,
854
+ action_history: params.actionHistory
175
855
  }
176
856
  );
177
- const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}`);
178
- await axios.delete(url);
179
- console.log(">> Session stopped.");
180
- this.gptDriverSessionId = void 0;
857
+ return {
858
+ gptCommands: response.data.gpt_commands,
859
+ appetizeCommands: response.data.appetize_commands,
860
+ actionHistory: response.data.action_history
861
+ };
862
+ }
863
+ // ─────────────────────────────────────────────────────────────────────────────
864
+ // DEVICE ACTION METHODS
865
+ // ─────────────────────────────────────────────────────────────────────────────
866
+ async getWdioClient() {
867
+ if (!this.appiumSessionStarted) {
868
+ await this.startSession();
869
+ }
870
+ if (this.driver?.sessionId != null) {
871
+ return this.driver;
872
+ }
873
+ const url = this.appiumSessionConfig.serverUrl;
874
+ const parsed = new URL(url);
875
+ const client = await webdriverio.attach({
876
+ sessionId: this.appiumSessionConfig.id,
877
+ options: {
878
+ protocol: parsed.protocol.replace(":", ""),
879
+ hostname: parsed.hostname,
880
+ port: parsed.port ? Number(parsed.port) : parsed.protocol === "https:" ? 443 : 80,
881
+ path: parsed.pathname && parsed.pathname !== "/" ? parsed.pathname : "/"
882
+ }
883
+ });
884
+ this.driver = client;
885
+ return client;
181
886
  }
887
+ /**
888
+ * Performs a tap action at the specified coordinates.
889
+ */
890
+ async performTap(x, y) {
891
+ const client = await this.getWdioClient();
892
+ await client.performActions([
893
+ {
894
+ type: "pointer",
895
+ id: "finger1",
896
+ parameters: { pointerType: "touch" },
897
+ actions: [
898
+ { type: "pointerMove", duration: 0, x, y },
899
+ { type: "pointerDown", button: 0 },
900
+ { type: "pause", duration: 100 },
901
+ { type: "pointerUp", button: 0 }
902
+ ]
903
+ }
904
+ ]);
905
+ }
906
+ async performType(text) {
907
+ const client = await this.getWdioClient();
908
+ await client.keys(text.split(""));
909
+ }
910
+ async performScroll(direction) {
911
+ const client = await this.getWdioClient();
912
+ const w = this.appiumSessionConfig?.size?.width ?? 1080;
913
+ const h = this.appiumSessionConfig?.size?.height ?? 1920;
914
+ const x = Math.round(w / 2);
915
+ const startY = direction === "down" ? Math.round(h * 0.8) : Math.round(h * 0.2);
916
+ const endY = direction === "down" ? Math.round(h * 0.2) : Math.round(h * 0.8);
917
+ await client.performActions([
918
+ {
919
+ type: "pointer",
920
+ id: "finger1",
921
+ parameters: { pointerType: "touch" },
922
+ actions: [
923
+ { type: "pointerMove", duration: 0, x, y: startY },
924
+ { type: "pointerDown", button: 0 },
925
+ { type: "pause", duration: 100 },
926
+ { type: "pointerMove", duration: 500, x, y: endY },
927
+ { type: "pointerUp", button: 0 }
928
+ ]
929
+ }
930
+ ]);
931
+ }
932
+ async getPageSource() {
933
+ const client = await this.getWdioClient();
934
+ return client.getPageSource();
935
+ }
936
+ async performScrollUntil(params) {
937
+ const { direction, text, elementId } = params;
938
+ const max = params.maxScrolls ?? 10;
939
+ for (let i = 0; i < max; i++) {
940
+ const source = await this.getPageSource();
941
+ const found = elementId ? source.includes(elementId) : text ? source.includes(text) : false;
942
+ if (found) {
943
+ return;
944
+ }
945
+ await this.performScroll(direction);
946
+ await this._delay(500);
947
+ }
948
+ throw new Error(`scrollUntil target not found after ${max} scroll(s)`);
949
+ }
950
+ async getScreenshot(appiumSessionConfig) {
951
+ globalLogger.debug("Capturing screenshot...");
952
+ const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/screenshot`);
953
+ const screenshotResponse = await axios.get(url);
954
+ let screenshot = await screenshotResponse.data.value;
955
+ if (appiumSessionConfig.platform === "iOS") {
956
+ globalLogger.debug(`Resizing iOS screenshot to ${appiumSessionConfig.size.width}x${appiumSessionConfig.size.height}`);
957
+ const imageBuffer = Buffer.from(screenshot, "base64");
958
+ const transformedImage = await sharp(imageBuffer).resize(appiumSessionConfig.size.width, appiumSessionConfig.size.height).toBuffer();
959
+ screenshot = transformedImage.toString("base64");
960
+ }
961
+ return screenshot;
962
+ }
963
+ /**
964
+ * Helper method to delay execution.
965
+ *
966
+ * @private
967
+ */
968
+ _delay(ms) {
969
+ return new Promise((resolve) => setTimeout(resolve, ms));
970
+ }
971
+ // ─────────────────────────────────────────────────────────────────────────────
972
+ // PUBLIC API METHODS
973
+ // ─────────────────────────────────────────────────────────────────────────────
182
974
  /**
183
975
  * Executes a specified command within the WebDriver session, optionally using an Appium handler.
184
976
  *
@@ -186,6 +978,7 @@ class GptDriver {
186
978
  * the command-specific operations. After executing the handler, the executed commands get logged on the GPTDriver servers.
187
979
  * If the handler execution fails or no handler is provided, the command gets executed by the GPTDriver using just natural language.
188
980
  *
981
+ * @deprecated Use `aiExecute()` instead. This method will be removed in a future version.
189
982
  * @param {string} command - The natural language command to be executed by the GPTDriver.
190
983
  * @param {AppiumHandler} [appiumHandler] - An optional function that processes Appium-specific commands.
191
984
  * If provided, this handler is executed instead of calling the GPTDriver servers.
@@ -193,24 +986,136 @@ class GptDriver {
193
986
  * @throws {Error} If an error occurs during the execution of the Appium handler or while processing the command by the GPTDriver.
194
987
  */
195
988
  async execute(command, appiumHandler) {
196
- console.log(">> Executing command:", command);
989
+ globalLogger.warn("Method 'execute()' is deprecated. Please use 'aiExecute()' instead.");
990
+ if (!this.appiumSessionStarted) {
991
+ await this.startSession();
992
+ }
993
+ globalLogger.info(`Executing command: ${command}`);
197
994
  const driver = this.driver;
198
995
  if (appiumHandler != null) {
199
996
  try {
997
+ await this.takeScreenshotAndLogCodeExecution(appiumHandler.toString());
200
998
  await appiumHandler(driver);
201
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
202
- await axios.post(`${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/log_code_execution`, {
203
- api_key: this.apiKey,
204
- base64_screenshot: screenshot,
205
- command: appiumHandler.toString()
206
- });
999
+ globalLogger.debug("Custom Appium handler executed successfully");
207
1000
  } catch (e) {
1001
+ globalLogger.warn("Custom Appium handler failed, falling back to GPT handler");
208
1002
  await this.gptHandler(command);
209
1003
  }
210
1004
  } else {
211
1005
  await this.gptHandler(command);
212
1006
  }
213
1007
  }
1008
+ /**
1009
+ * Executes a specified command within the WebDriver session with configurable caching options.
1010
+ *
1011
+ * This is the recommended method for executing commands. It provides fine-grained control over
1012
+ * caching behavior, allowing you to optimize performance and costs for repetitive test scenarios.
1013
+ *
1014
+ * If an `appiumHandler` is provided, it will be invoked with the WebDriver instance to perform
1015
+ * the command-specific operations. After executing the handler, the executed commands get logged
1016
+ * on the GPTDriver servers. If the handler execution fails or no handler is provided, the command
1017
+ * gets executed by the GPTDriver using natural language processing.
1018
+ *
1019
+ * @param {Object} params - The execution parameters
1020
+ * @param {string} params.command - The natural language command to be executed by the GPTDriver.
1021
+ * Examples: "Click the login button", "Enter 'test@example.com' in the email field"
1022
+ * @param {AppiumHandler} [params.appiumHandler] - An optional function that processes Appium-specific commands.
1023
+ * If provided, this handler is executed instead of calling
1024
+ * the GPTDriver API. Useful for performance optimization when
1025
+ * you know the exact Appium commands to execute.
1026
+ * @param {CachingMode} [params.cachingMode] - Controls how the GPTDriver caches this command execution.
1027
+ * If not specified, uses the global caching mode set in the constructor.
1028
+ * Options:
1029
+ * - "NONE"
1030
+ * - "FULL_SCREEN"
1031
+ * - "INTERACTION_REGION"
1032
+ * @param {boolean} [params.useSmartLoop] - If true, uses the smart loop execution (Cache -> AI -> Execute -> Populate)
1033
+ * which optimizes execution by checking cache first and populating it after.
1034
+ * Default: false (uses legacy gptHandler)
1035
+ *
1036
+ * @returns {Promise<void>} A promise that resolves when the command execution is complete.
1037
+ *
1038
+ * @throws {Error} If an error occurs during the execution of the Appium handler or while processing
1039
+ * the command by the GPTDriver.
1040
+ *
1041
+ * @example
1042
+ * // Basic usage with natural language (no caching)
1043
+ * await driver.aiExecute({
1044
+ * command: "Click the submit button"
1045
+ * });
1046
+ *
1047
+ * @example
1048
+ * // Full screen caching for repetitive navigation on similar screens
1049
+ * await driver.aiExecute({
1050
+ * command: "Navigate to the settings page",
1051
+ * cachingMode: "FULL_SCREEN"
1052
+ * });
1053
+ *
1054
+ * @example
1055
+ * // Interaction region caching for repeated actions on the same button
1056
+ * await driver.aiExecute({
1057
+ * command: "Click the login button",
1058
+ * cachingMode: "INTERACTION_REGION"
1059
+ * });
1060
+ *
1061
+ * @example
1062
+ * // With custom Appium handler as fallback
1063
+ * await driver.aiExecute({
1064
+ * command: "Click the login button",
1065
+ * appiumHandler: async (driver) => {
1066
+ * const loginBtn = await driver.$('~loginButton');
1067
+ * await loginBtn.click();
1068
+ * },
1069
+ * cachingMode: "INTERACTION_REGION"
1070
+ * });
1071
+ *
1072
+ * @example
1073
+ * // Force fresh execution for dynamic content
1074
+ * await driver.aiExecute({
1075
+ * command: "Verify the current timestamp",
1076
+ * cachingMode: "NONE"
1077
+ * });
1078
+ *
1079
+ * @example
1080
+ * // Using smart loop for optimized caching
1081
+ * await driver.aiExecute({
1082
+ * command: "Click the login button",
1083
+ * useSmartLoop: true,
1084
+ * cachingMode: "FULL_SCREEN"
1085
+ * });
1086
+ */
1087
+ async aiExecute({ command, appiumHandler, cachingMode, useSmartLoop = false }) {
1088
+ if (!this.appiumSessionStarted) {
1089
+ await this.startSession();
1090
+ }
1091
+ globalLogger.info(`Executing command: ${command}`);
1092
+ const driver = this.driver;
1093
+ if (appiumHandler != null) {
1094
+ try {
1095
+ await this.takeScreenshotAndLogCodeExecution(appiumHandler.toString());
1096
+ await appiumHandler(driver);
1097
+ globalLogger.debug("Custom Appium handler executed successfully");
1098
+ this.step_number++;
1099
+ return;
1100
+ } catch (e) {
1101
+ globalLogger.warn("Custom Appium handler failed, falling back to AI execution");
1102
+ }
1103
+ }
1104
+ if (useSmartLoop) {
1105
+ const ctx = this.createSmartLoopContext();
1106
+ const result = await executeSmartLoop(ctx, {
1107
+ stepNumber: this.step_number,
1108
+ description: command,
1109
+ instruction: command
1110
+ });
1111
+ if (!result.success) {
1112
+ throw new Error(result.error || "Smart loop execution failed");
1113
+ }
1114
+ this.step_number++;
1115
+ } else {
1116
+ await this.gptHandler(command, cachingMode);
1117
+ }
1118
+ }
214
1119
  /**
215
1120
  * Asserts a single condition using the GPTDriver.
216
1121
  *
@@ -218,13 +1123,25 @@ class GptDriver {
218
1123
  * If the assertion fails, an error is thrown.
219
1124
  *
220
1125
  * @param {string} assertion - The condition to be asserted.
1126
+ * @param cachingMode - The caching mode to be used for the assertion.
221
1127
  * @throws {Error} If the assertion fails.
222
1128
  */
223
- async assert(assertion) {
224
- console.log(">> Asserting:", assertion);
225
- const results = await this.checkBulk([assertion]);
226
- if (!Object.values(results).at(0)) {
227
- throw new Error(`Failed assertion: ${assertion}`);
1129
+ async assert(assertion, cachingMode) {
1130
+ if (!this.appiumSessionStarted) {
1131
+ await this.startSession();
1132
+ }
1133
+ try {
1134
+ const results = await this.checkBulk([assertion], cachingMode);
1135
+ if (!Object.values(results).at(0)) {
1136
+ await this.setSessionStatus("failed");
1137
+ globalLogger.error(`Assertion failed: ${assertion}`);
1138
+ throw new Error(`Failed assertion: ${assertion}`);
1139
+ }
1140
+ this.step_number = this.step_number + 1;
1141
+ globalLogger.info(`Assertion passed: ${assertion}`);
1142
+ } catch (e) {
1143
+ await this.setSessionStatus("failed");
1144
+ throw e;
228
1145
  }
229
1146
  }
230
1147
  /**
@@ -234,43 +1151,100 @@ class GptDriver {
234
1151
  * If any assertion fails, an error is thrown listing all failed assertions.
235
1152
  *
236
1153
  * @param {string[]} assertions - An array of conditions to be asserted.
1154
+ * @param cachingMode - The caching mode to be used for the assertions.
237
1155
  * @throws {Error} If any of the assertions fail.
238
1156
  */
239
- async assertBulk(assertions) {
240
- console.log(">> Asserting:", assertions);
241
- const results = await this.checkBulk(assertions);
242
- const failedAssertions = Object.values(results).reduce((prev, current, currentIndex) => {
243
- if (!current) {
244
- return [...prev, assertions.at(currentIndex)];
1157
+ async assertBulk(assertions, cachingMode) {
1158
+ if (!this.appiumSessionStarted) {
1159
+ await this.startSession();
1160
+ }
1161
+ try {
1162
+ const results = await this.checkBulk(assertions, cachingMode);
1163
+ const failedAssertions = Object.values(results).reduce((prev, current, currentIndex) => {
1164
+ if (!current) {
1165
+ return [...prev, assertions.at(currentIndex)];
1166
+ }
1167
+ return prev;
1168
+ }, []);
1169
+ if (failedAssertions.length > 0) {
1170
+ await this.setSessionStatus("failed");
1171
+ globalLogger.error(`Multiple assertions failed: ${failedAssertions.join(", ")}`);
1172
+ throw new Error(`Failed assertions: ${failedAssertions.join(", ")}`);
245
1173
  }
246
- return prev;
247
- }, []);
248
- if (failedAssertions.length > 0) {
249
- throw new Error(`Failed assertions: ${failedAssertions.join(", ")}`);
1174
+ this.step_number = this.step_number + 1;
1175
+ globalLogger.info(`All ${assertions.length} assertions passed`);
1176
+ } catch (e) {
1177
+ await this.setSessionStatus("failed");
1178
+ throw e;
250
1179
  }
251
1180
  }
252
1181
  /**
253
1182
  * Checks multiple conditions and returns their results using the GPTDriver.
254
1183
  *
255
1184
  * This method sends a bulk condition request and returns the results of the conditions.
1185
+ * Failed conditions will be retried up to maxRetries times.
256
1186
  *
257
1187
  * @param {string[]} conditions - An array of conditions to be checked.
1188
+ * @param {CachingMode} cachingMode - The caching mode to be used for the conditions.
1189
+ * @param {number} maxRetries - The maximum number of retries if any condition fails (default: 2).
1190
+ * @param {number} retryDelayMs - The delay in milliseconds between retries (default: 1000).
258
1191
  * @returns {Promise<Record<string, boolean>>} A promise that resolves with an object mapping each condition
259
1192
  * to a boolean indicating whether the condition was met.
260
1193
  */
261
- async checkBulk(conditions) {
262
- console.log(">> Checking:", conditions);
263
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
264
- const response = await axios.post(
265
- `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/assert`,
266
- {
267
- api_key: this.apiKey,
268
- base64_screenshot: screenshot,
269
- assertions: conditions,
270
- command: `Assert: ${JSON.stringify(conditions)}`
1194
+ async checkBulk(conditions, cachingMode, maxRetries = 2, retryDelayMs = 1e3) {
1195
+ let attempt = 0;
1196
+ let results = {};
1197
+ while (attempt <= maxRetries) {
1198
+ results = await this._checkBulkOnce(conditions, cachingMode, attempt);
1199
+ const failedConditions = Object.entries(results).filter(([_, success]) => !success).map(([key, _]) => key);
1200
+ if (failedConditions.length === 0) {
1201
+ return results;
271
1202
  }
272
- );
273
- return response.data.results;
1203
+ attempt++;
1204
+ if (attempt <= maxRetries) {
1205
+ globalLogger.info(
1206
+ `>> Conditions failed ${JSON.stringify(failedConditions)}. Retrying in ${retryDelayMs}ms... (Attempt ${attempt}/${maxRetries})`
1207
+ );
1208
+ await this._delay(retryDelayMs);
1209
+ } else {
1210
+ globalLogger.info(`>> Conditions failed: ${JSON.stringify(failedConditions)}`);
1211
+ }
1212
+ }
1213
+ return results;
1214
+ }
1215
+ /**
1216
+ * Internal method to check conditions once without retry logic.
1217
+ *
1218
+ * @private
1219
+ */
1220
+ async _checkBulkOnce(conditions, cachingMode, attempt = 0) {
1221
+ if (!this.appiumSessionStarted) {
1222
+ await this.startSession();
1223
+ }
1224
+ globalLogger.info(`Checking conditions (attempt ${attempt}): ${conditions.join(", ")}`);
1225
+ try {
1226
+ let screenshot;
1227
+ if (!this.useGptDriverCloud) {
1228
+ screenshot = await this.getScreenshot(this.appiumSessionConfig);
1229
+ }
1230
+ const response = await axios.post(
1231
+ `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/assert`,
1232
+ {
1233
+ api_key: this.apiKey,
1234
+ base64_screenshot: screenshot,
1235
+ assertions: conditions,
1236
+ command: `Assert: ${JSON.stringify(conditions)}`,
1237
+ caching_mode: cachingMode ?? this.cachingMode,
1238
+ step_number: this.step_number
1239
+ }
1240
+ );
1241
+ globalLogger.debug(`Check results: ${JSON.stringify(response.data.results)}`);
1242
+ return response.data.results;
1243
+ } catch (e) {
1244
+ globalLogger.error("Failed to check conditions", e);
1245
+ await this.setSessionStatus("failed");
1246
+ throw e;
1247
+ }
274
1248
  }
275
1249
  /**
276
1250
  * Extracts specified information using the GPTDriver.
@@ -280,30 +1254,305 @@ class GptDriver {
280
1254
  *
281
1255
  * @param {string[]} extractions - An array of extraction criteria. Each criterion specifies what information
282
1256
  * should be extracted from the session.
1257
+ * @param cachingMode - The caching mode to be used for the extraction.
283
1258
  * @returns {Promise<Record<string, any>>} A promise that resolves with an object mapping each extraction criterion
284
1259
  * to the extracted data. The structure of the returned data depends on the
285
1260
  * specifics of the extraction criteria.
286
1261
  */
287
- async extract(extractions) {
288
- console.log(">> Extracting:", extractions);
289
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
1262
+ async extract(extractions, cachingMode) {
1263
+ if (!this.appiumSessionStarted) {
1264
+ await this.startSession();
1265
+ }
1266
+ globalLogger.info(`Extracting data: ${extractions.join(", ")}`);
1267
+ let screenshot;
1268
+ if (!this.useGptDriverCloud) {
1269
+ screenshot = await this.getScreenshot(this.appiumSessionConfig);
1270
+ }
290
1271
  const response = await axios.post(
291
1272
  `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/extract`,
292
1273
  {
293
1274
  api_key: this.apiKey,
294
1275
  base64_screenshot: screenshot,
295
1276
  extractions,
296
- command: `Extract: ${JSON.stringify(extractions)}`
1277
+ command: `Extract: ${JSON.stringify(extractions)}`,
1278
+ step_number: this.step_number
297
1279
  }
298
1280
  );
1281
+ this.step_number = this.step_number + 1;
1282
+ globalLogger.debug(`Extraction results: ${JSON.stringify(response.data.results)}`);
299
1283
  return response.data.results;
300
1284
  }
301
- async gptHandler(command) {
1285
+ /**
1286
+ * Opens a deep link url in the Appium session.
1287
+ *
1288
+ * This method sends a request to the GPT Driver server to open a deep link url in the Appium session.
1289
+ *
1290
+ * @param {OpenDeepLinkUrlParams} params - The parameters for opening the deep link url.
1291
+ * @returns {Promise<void>} A promise that resolves when the deep link url is opened.
1292
+ */
1293
+ async openDeepLinkUrl(params) {
1294
+ if (!this.appiumSessionStarted) {
1295
+ await this.startSession();
1296
+ }
1297
+ globalLogger.info(`Opening deep link: ${params.url}`);
1298
+ if (params.package == null && this.appiumSessionConfig?.platform === "Android") {
1299
+ throw new Error("Package is required for Android platform");
1300
+ }
1301
+ await this.executeCommand(
1302
+ {
1303
+ url: `http://localhost:4723/session/${this.appiumSessionConfig?.id}/execute/sync`,
1304
+ method: "POST",
1305
+ data: {
1306
+ "script": "mobile:deepLink",
1307
+ "args": [{
1308
+ url: params.url,
1309
+ ...params.bundleId && { bundleId: params.bundleId },
1310
+ ...params.package && { package: params.package }
1311
+ }]
1312
+ }
1313
+ }
1314
+ );
1315
+ this.step_number = this.step_number + 1;
1316
+ globalLogger.debug("Deep link opened successfully");
1317
+ }
1318
+ /**
1319
+ * Reads a flow JSON file from disk, validates it using the SavableTestStoreSchema, and executes its steps.
1320
+ *
1321
+ * Returns the parsed and validated object on success; throws a detailed error on failure.
1322
+ *
1323
+ * @param filePath - Path to the flow file (JSON)
1324
+ * @param options - Optional execution options
1325
+ * @param options.useSmartLoop - If true, uses the smart loop execution (Cache -> AI -> Execute -> Populate)
1326
+ * for AI, tap, and assert steps. This optimizes execution by checking cache
1327
+ * first and populating it after successful execution. Default: false
1328
+ * @returns The validated flow data
1329
+ *
1330
+ * @example
1331
+ * // Execute flow with default settings (legacy gptHandler)
1332
+ * const result = await driver.executeFlow('tests/login-flow.json');
1333
+ *
1334
+ * @example
1335
+ * // Execute flow with smart loop enabled for optimized caching
1336
+ * const result = await driver.executeFlow('tests/login-flow.json', { useSmartLoop: true });
1337
+ */
1338
+ async executeFlow(filePath, options) {
1339
+ const useSmartLoop = options?.useSmartLoop ?? false;
1340
+ globalLogger.info(`Loading flow from file: ${filePath}`);
1341
+ const absolutePath = path.resolve(filePath);
1342
+ const baseDir = path.dirname(absolutePath);
1343
+ let raw;
1344
+ try {
1345
+ raw = await node_fs.promises.readFile(absolutePath, "utf-8");
1346
+ } catch (e) {
1347
+ const msg = `Failed to read file at ${filePath}: ${e?.message ?? e}`;
1348
+ globalLogger.error(msg);
1349
+ throw new Error(msg);
1350
+ }
1351
+ let json;
1352
+ try {
1353
+ json = JSON.parse(raw);
1354
+ } catch (e) {
1355
+ const msg = `Invalid JSON in flow file ${filePath}: ${e?.message ?? e}`;
1356
+ globalLogger.error(msg);
1357
+ throw new Error(msg);
1358
+ }
1359
+ const parsed = SavableTestStoreSchema.safeParse(json);
1360
+ if (!parsed.success) {
1361
+ const issues = parsed.error.issues.map((iss) => `- ${iss.path.join(".") || "<root>"}: ${iss.message}`).join("\n");
1362
+ const msg = `Flow validation failed for ${filePath}:
1363
+ ${issues}`;
1364
+ globalLogger.error(msg);
1365
+ throw new Error(msg);
1366
+ }
1367
+ const rootFlow = parsed.data;
1368
+ globalLogger.info(`Flow file validated successfully: ${filePath}`);
1369
+ const visited = /* @__PURE__ */ new Set();
1370
+ const loadFlow = async (p) => {
1371
+ const abs = path.isAbsolute(p) ? p : path.resolve(baseDir, p);
1372
+ const rawChild = await node_fs.promises.readFile(abs, "utf-8");
1373
+ const childJson = JSON.parse(rawChild);
1374
+ const val = SavableTestStoreSchema.safeParse(childJson);
1375
+ if (!val.success) {
1376
+ const issues = val.error.issues.map((iss) => `- ${iss.path.join(".") || "<root>"}: ${iss.message}`).join("\n");
1377
+ throw new Error(`Flow validation failed for referenced file ${abs}:
1378
+ ${issues}`);
1379
+ }
1380
+ return val.data;
1381
+ };
1382
+ const expandSteps = async (steps, inheritedParams, parentDir, stack) => {
1383
+ const out = [];
1384
+ for (const step of steps) {
1385
+ if (step.type === "fileRef") {
1386
+ const refPath = path.isAbsolute(step.path) ? step.path : path.resolve(parentDir, step.path);
1387
+ const refKey = path.normalize(refPath);
1388
+ if (visited.has(refKey)) {
1389
+ const cycle = [...stack, refKey].map((p) => path.basename(p)).join(" -> ");
1390
+ throw new Error(`Detected circular fileRef: ${cycle}`);
1391
+ }
1392
+ visited.add(refKey);
1393
+ const child = await loadFlow(refPath);
1394
+ const mergedParams = { ...inheritedParams, ...step.overrides ?? {} };
1395
+ const childDir = path.dirname(refPath);
1396
+ const childExpanded = await expandSteps(child.steps, mergedParams, childDir, [...stack, refKey]);
1397
+ out.push(...childExpanded);
1398
+ } else {
1399
+ const resolved = { ...step, __params: { ...inheritedParams } };
1400
+ out.push(resolved);
1401
+ }
1402
+ }
1403
+ return out;
1404
+ };
1405
+ const effectiveParams = { ...rootFlow.params ?? {} };
1406
+ const expandedSteps = await expandSteps(rootFlow.steps, effectiveParams, baseDir, [absolutePath]);
1407
+ if (!this.appiumSessionStarted) {
1408
+ await this.startSession();
1409
+ }
1410
+ globalLogger.info(`Executing flow '${rootFlow.name}' with ${expandedSteps.length} step(s)...`);
1411
+ let executed = 0;
1412
+ try {
1413
+ for (const step of expandedSteps) {
1414
+ const params = step.__params ?? effectiveParams;
1415
+ const prefix = `Step #${executed + 1} [${step.type}${step.optional ? ", optional" : ""}]`;
1416
+ try {
1417
+ switch (step.type) {
1418
+ case "ai": {
1419
+ const instruction = this.interpolateTemplate(step.instruction, params);
1420
+ globalLogger.info(`${prefix}: ${instruction}`);
1421
+ if (useSmartLoop) {
1422
+ const ctx = this.createSmartLoopContext();
1423
+ const result = await executeSmartLoop(ctx, {
1424
+ stepNumber: this.step_number,
1425
+ description: instruction,
1426
+ instruction
1427
+ });
1428
+ if (!result.success) {
1429
+ throw new Error(result.error || "Smart loop execution failed");
1430
+ }
1431
+ this.step_number++;
1432
+ } else {
1433
+ await this.aiExecute({ command: instruction });
1434
+ }
1435
+ break;
1436
+ }
1437
+ case "tap": {
1438
+ const description = step.descriptionText ? this.interpolateTemplate(step.descriptionText, params) : void 0;
1439
+ if (!description) {
1440
+ throw new Error("Tap step requires a descriptionText. Coordinate-based taps are no longer supported.");
1441
+ }
1442
+ globalLogger.info(`${prefix}: ${description}`);
1443
+ if (useSmartLoop) {
1444
+ const ctx = this.createSmartLoopContext();
1445
+ const result = await executeSmartLoop(ctx, {
1446
+ stepNumber: this.step_number,
1447
+ description,
1448
+ instruction: description
1449
+ });
1450
+ if (!result.success) {
1451
+ throw new Error(result.error || "Smart loop execution failed");
1452
+ }
1453
+ this.step_number++;
1454
+ } else {
1455
+ await this.aiExecute({ command: description });
1456
+ }
1457
+ break;
1458
+ }
1459
+ case "assert": {
1460
+ const description = step.descriptionText ? this.interpolateTemplate(step.descriptionText, params) : void 0;
1461
+ if (!description) {
1462
+ throw new Error("Assert step requires a descriptionText. Coordinate-based assertions are no longer supported.");
1463
+ }
1464
+ globalLogger.info(`${prefix}: ${description}`);
1465
+ if (useSmartLoop) {
1466
+ const instruction = `Verify that: ${description}`;
1467
+ const ctx = this.createSmartLoopContext();
1468
+ const result = await executeSmartLoop(ctx, {
1469
+ stepNumber: this.step_number,
1470
+ description,
1471
+ instruction
1472
+ });
1473
+ if (!result.success) {
1474
+ throw new Error(result.error || "Smart loop execution failed");
1475
+ }
1476
+ this.step_number++;
1477
+ } else {
1478
+ await this.assert(description);
1479
+ }
1480
+ break;
1481
+ }
1482
+ case "type": {
1483
+ const text = this.interpolateTemplate(step.text, params);
1484
+ globalLogger.info(`${prefix}: Type text`);
1485
+ await this.takeScreenshotAndLogCodeExecution(`type: text=${text}`);
1486
+ await this.performType(text);
1487
+ this.step_number++;
1488
+ break;
1489
+ }
1490
+ case "scroll": {
1491
+ globalLogger.info(`${prefix}: Scroll ${step.direction}`);
1492
+ await this.takeScreenshotAndLogCodeExecution(`scroll: direction=${step.direction}`);
1493
+ await this.performScroll(step.direction);
1494
+ this.step_number++;
1495
+ break;
1496
+ }
1497
+ case "zoom": {
1498
+ globalLogger.info(`${prefix}: Zoom ${step.direction}`);
1499
+ await this.takeScreenshotAndLogCodeExecution(`zoom: direction=${step.direction}`);
1500
+ this.step_number++;
1501
+ break;
1502
+ }
1503
+ case "scrollUntil": {
1504
+ const interpolatedText = step.text != null ? this.interpolateTemplate(step.text, params) : void 0;
1505
+ globalLogger.info(`${prefix}: Scroll until ${interpolatedText ?? step.elementId}`);
1506
+ await this.takeScreenshotAndLogCodeExecution(`scrollUntil: text=${interpolatedText}, elementId=${step.elementId}`);
1507
+ await this.performScrollUntil({
1508
+ direction: step.direction,
1509
+ text: interpolatedText,
1510
+ elementId: step.elementId,
1511
+ maxScrolls: step.maxScrolls
1512
+ });
1513
+ this.step_number++;
1514
+ break;
1515
+ }
1516
+ case "deeplink": {
1517
+ const pkg = params["package"];
1518
+ const bundleId = params["bundleId"];
1519
+ const url = this.interpolateTemplate(step.url, params);
1520
+ globalLogger.info(`${prefix}: Open deeplink ${url}`);
1521
+ await this.takeScreenshotAndLogCodeExecution(`openDeepLinkUrl: url=${url}`);
1522
+ await this.openDeepLinkUrl({ url, package: pkg, bundleId });
1523
+ break;
1524
+ }
1525
+ default: {
1526
+ throw new Error(`Unsupported step type at execution: ${step.type}`);
1527
+ }
1528
+ }
1529
+ executed++;
1530
+ } catch (err) {
1531
+ if (step.optional) {
1532
+ globalLogger.warn(`${prefix} failed but marked optional. Continuing. Error: ${err.message}`);
1533
+ continue;
1534
+ }
1535
+ throw err;
1536
+ }
1537
+ }
1538
+ } catch (e) {
1539
+ try {
1540
+ await this.setSessionStatus("failed");
1541
+ } catch {
1542
+ }
1543
+ throw e;
1544
+ }
1545
+ return rootFlow;
1546
+ }
1547
+ async gptHandler(command, cachingMode) {
302
1548
  try {
303
1549
  let conditionSucceeded = false;
304
1550
  while (!conditionSucceeded) {
305
- const screenshot = await this.getScreenshot(this.appiumSessionConfig);
306
- console.log(">> Asking GTP Driver for next action...");
1551
+ let screenshot;
1552
+ if (!this.useGptDriverCloud) {
1553
+ screenshot = await this.getScreenshot(this.appiumSessionConfig);
1554
+ }
1555
+ globalLogger.info("Requesting next action from GPT Driver...");
307
1556
  const response = await axios.request(
308
1557
  {
309
1558
  url: `${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/execute`,
@@ -311,39 +1560,49 @@ class GptDriver {
311
1560
  data: {
312
1561
  api_key: this.apiKey,
313
1562
  command,
314
- base64_screenshot: screenshot
1563
+ base64_screenshot: screenshot,
1564
+ caching_mode: cachingMode ?? this.cachingMode,
1565
+ step_number: this.step_number
315
1566
  }
316
1567
  }
317
1568
  );
318
1569
  const executeStatus = response.data.status;
319
1570
  if (executeStatus === "failed") {
320
- const errorMessage = response?.data?.commands?.at(0)?.data;
1571
+ const errorMessage = response.data?.commands?.at(0)?.data;
1572
+ globalLogger.error(`Execution failed: ${errorMessage ?? "Unknown error"}`);
321
1573
  throw new Error(errorMessage ?? "Execution failed");
322
1574
  }
323
1575
  conditionSucceeded = executeStatus !== "inProgress";
324
1576
  const executeResponse = response.data;
325
- for (const command2 of executeResponse.commands) {
326
- await this.executeCommand(command2);
1577
+ globalLogger.debug(`Received ${executeResponse.commands.length} command(s) to execute`);
1578
+ for (const appiumCommand of executeResponse.commands) {
1579
+ await this.executeCommand(appiumCommand);
327
1580
  }
328
1581
  if (!conditionSucceeded) {
1582
+ globalLogger.debug("Command still in progress, waiting...");
329
1583
  await delay(1500);
330
1584
  }
331
1585
  }
1586
+ this.step_number = this.step_number + 1;
1587
+ globalLogger.info("Command execution completed successfully");
332
1588
  } catch (e) {
333
- await this.stopSession("failed");
1589
+ globalLogger.error("GPT handler failed", e);
1590
+ await this.setSessionStatus("failed");
334
1591
  throw e;
335
1592
  }
336
1593
  }
337
1594
  async executeCommand(command) {
338
- const firstAction = command.data.actions?.at(0);
1595
+ const firstAction = command.data?.actions?.at(0);
339
1596
  if (firstAction?.type === "pause" && firstAction.duration != null) {
1597
+ globalLogger.debug(`Pausing for ${firstAction.duration} seconds`);
340
1598
  await delay(firstAction * 1e3);
341
- } else {
1599
+ } else if (!this.useGptDriverCloud) {
342
1600
  const parsedUrl = new URL(command.url);
343
1601
  parsedUrl.protocol = this.appiumSessionConfig.serverUrl.protocol;
344
1602
  parsedUrl.host = this.appiumSessionConfig.serverUrl.host;
345
1603
  parsedUrl.port = this.appiumSessionConfig.serverUrl.port != "" ? `${this.appiumSessionConfig.serverUrl.port}` : "";
346
1604
  parsedUrl.pathname = this.appiumSessionConfig.serverUrl.pathname != "/" ? `${this.appiumSessionConfig.serverUrl.pathname}${parsedUrl.pathname}` : parsedUrl.pathname;
1605
+ globalLogger.debug(`Executing ${command.method} request to ${parsedUrl.pathname}`);
347
1606
  await axios.request({
348
1607
  url: parsedUrl.toString(),
349
1608
  method: command.method,
@@ -351,16 +1610,25 @@ class GptDriver {
351
1610
  });
352
1611
  }
353
1612
  }
354
- async getScreenshot(appiumSessionConfig) {
355
- const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/screenshot`);
356
- const screenshotResponse = await axios.get(url);
357
- let screenshot = await screenshotResponse.data.value;
358
- if (appiumSessionConfig.platform === "iOS") {
359
- const imageBuffer = Buffer.from(screenshot, "base64");
360
- const transformedImage = await sharp(imageBuffer).resize(appiumSessionConfig.size.width, appiumSessionConfig.size.height).toBuffer();
361
- screenshot = transformedImage.toString("base64");
1613
+ async logCodeExecution(screenshot, command) {
1614
+ try {
1615
+ const screenshot2 = await this.getScreenshot(this.appiumSessionConfig);
1616
+ await axios.post(`${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/log_code_execution`, {
1617
+ api_key: this.apiKey,
1618
+ base64_screenshot: screenshot2,
1619
+ command
1620
+ });
1621
+ } catch (e) {
1622
+ globalLogger.error("Failed to log code execution", e);
1623
+ }
1624
+ }
1625
+ async takeScreenshotAndLogCodeExecution(command) {
1626
+ try {
1627
+ const screenshot = await this.getScreenshot(this.appiumSessionConfig);
1628
+ await this.logCodeExecution(screenshot, command);
1629
+ } catch (e) {
1630
+ globalLogger.error("Failed to log code execution", e);
362
1631
  }
363
- return screenshot;
364
1632
  }
365
1633
  }
366
1634