gpt-driver-node 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -20,6 +20,15 @@ function buildUrl(base, extraPath) {
20
20
  }
21
21
  return `${baseUrl}${extraPath}`;
22
22
  }
23
+ const getImageDimensions = async (base64) => {
24
+ const base64Data = base64.replace(/^data:image\/\w+;base64,/, "");
25
+ const buffer = Buffer.from(base64Data, "base64");
26
+ const metadata = await sharp(buffer).metadata();
27
+ if (!metadata.width || !metadata.height) {
28
+ throw new Error("Unable to get image dimensions");
29
+ }
30
+ return { width: metadata.width, height: metadata.height };
31
+ };
23
32
 
24
33
  const colors = {
25
34
  reset: "\x1B[0m",
@@ -105,6 +114,15 @@ const SavableTypeStepSchema = SavableStepBaseSchema.extend({
105
114
  type: z.literal("type"),
106
115
  text: z.string()
107
116
  });
117
+ const SavableSwipeStepSchema = SavableStepBaseSchema.extend({
118
+ type: z.literal("swipe"),
119
+ direction: z.enum(["left", "right", "up", "down"]),
120
+ x1: z.number().optional(),
121
+ y1: z.number().optional(),
122
+ x2: z.number().optional(),
123
+ y2: z.number().optional(),
124
+ duration: z.number().optional().default(500)
125
+ });
108
126
  const SavableScrollStepSchema = SavableStepBaseSchema.extend({
109
127
  type: z.literal("scroll"),
110
128
  direction: z.enum(["up", "down"])
@@ -140,6 +158,8 @@ const SavableStepSchema = z.discriminatedUnion("type", [
140
158
  // type: 'assert'
141
159
  SavableTypeStepSchema,
142
160
  // type: 'type'
161
+ SavableSwipeStepSchema,
162
+ // type: 'swipe'
143
163
  SavableScrollStepSchema,
144
164
  // type: 'scroll'
145
165
  SavableZoomStepSchema,
@@ -158,6 +178,14 @@ const SavableTestStoreSchema = z.object({
158
178
  steps: z.array(SavableStepSchema),
159
179
  params: z.record(z.string(), z.string()).optional()
160
180
  });
181
+ const VariablesSchema = z.record(z.string(), z.string()).optional().default({});
182
+ const ConfigSchema = z.object({
183
+ testDir: z.string(),
184
+ driver: z.string(),
185
+ port: z.number(),
186
+ apiKey: z.string(),
187
+ variables: VariablesSchema
188
+ });
161
189
 
162
190
  const CACHE_SERVER_URL = "https://cache.mobileboost.io";
163
191
  const GPT_DRIVER_BASE_URL = "https://api.mobileboost.io";
@@ -417,6 +445,30 @@ function isScrollCommand(cmd) {
417
445
  function isTypeCommand(cmd) {
418
446
  return cmd.startsWith("type:");
419
447
  }
448
+ function isSlideCommand(cmd) {
449
+ return cmd.startsWith("slide");
450
+ }
451
+ function parseSlideCommand(cmd) {
452
+ const slideMatch = cmd.match(
453
+ /slide\s+(up|down|left|right)\s+(\d+)%(?::\s*[^;]*)?;(\d+);(\d+)/i
454
+ );
455
+ if (!slideMatch) {
456
+ return null;
457
+ }
458
+ const extractedDirection = slideMatch[1].toLowerCase();
459
+ const directionMap = {
460
+ down: "up",
461
+ up: "down",
462
+ left: "right",
463
+ right: "left"
464
+ };
465
+ return {
466
+ direction: directionMap[extractedDirection],
467
+ percentage: parseInt(slideMatch[2], 10),
468
+ startX: parseInt(slideMatch[3], 10),
469
+ startY: parseInt(slideMatch[4], 10)
470
+ };
471
+ }
420
472
 
421
473
  async function executeSmartLoop(ctx, params) {
422
474
  const maxCacheAttempts = Math.floor(CACHE_RETRY_MS / CACHE_CHECK_INTERVAL_MS);
@@ -432,6 +484,8 @@ async function executeSmartLoop(ctx, params) {
432
484
  let screenshot = "";
433
485
  let commands = [];
434
486
  let isCacheHit = false;
487
+ const firstScreenshot = await ctx.getScreenshot();
488
+ const screenshotResolution = await getImageDimensions(firstScreenshot);
435
489
  for (let attempt = 0; attempt < maxCacheAttempts; attempt++) {
436
490
  screenshot = await ctx.getScreenshot();
437
491
  const sizeInBytes = screenshot.length * 0.75;
@@ -444,7 +498,7 @@ async function executeSmartLoop(ctx, params) {
444
498
  stepNumber: params.stepNumber,
445
499
  stepDescription: params.description,
446
500
  screenshot,
447
- screenResolution: ctx.screenSize,
501
+ screenResolution: screenshotResolution,
448
502
  highestUsedIndex: lastCacheIndex,
449
503
  platform: ctx.platform,
450
504
  filepath: params.filepath
@@ -515,7 +569,12 @@ async function executeSmartLoop(ctx, params) {
515
569
  const coords = parseTapCoordinates(cmd);
516
570
  if (coords) {
517
571
  globalLogger.debug(`[SmartLoop] Executing tap at (${coords.x}, ${coords.y})`);
518
- await ctx.performTap(coords.x, coords.y);
572
+ await ctx.performTap(
573
+ coords.x,
574
+ coords.y,
575
+ screenshotResolution.width,
576
+ screenshotResolution.height
577
+ );
519
578
  actionExecuted = true;
520
579
  }
521
580
  } else if (isWaitCommand(cmd)) {
@@ -529,7 +588,44 @@ async function executeSmartLoop(ctx, params) {
529
588
  const direction = parseScrollDirection(cmd);
530
589
  if (direction) {
531
590
  globalLogger.debug(`[SmartLoop] Scrolling ${direction}`);
532
- await ctx.performScroll(direction);
591
+ await ctx.performSwipe({ direction });
592
+ actionExecuted = true;
593
+ }
594
+ } else if (isSlideCommand(cmd)) {
595
+ const slideParams = parseSlideCommand(cmd);
596
+ if (slideParams) {
597
+ const { direction, percentage, startX, startY } = slideParams;
598
+ const isVertical = direction === "up" || direction === "down";
599
+ const distance = Math.round(
600
+ (isVertical ? screenshotResolution.height : screenshotResolution.width) * (percentage / 100)
601
+ );
602
+ let endX = startX;
603
+ let endY = startY;
604
+ switch (direction) {
605
+ case "up":
606
+ endY = startY + distance;
607
+ break;
608
+ case "down":
609
+ endY = startY - distance;
610
+ break;
611
+ case "left":
612
+ endX = startX - distance;
613
+ break;
614
+ case "right":
615
+ endX = startX + distance;
616
+ break;
617
+ }
618
+ globalLogger.debug(`[SmartLoop] Sliding ${direction} ${percentage}% from (${startX}, ${startY}) to (${endX}, ${endY})`);
619
+ await ctx.performSwipe({
620
+ direction,
621
+ x1: startX,
622
+ y1: startY,
623
+ x2: endX,
624
+ y2: endY,
625
+ screenshotWidth: screenshotResolution.width,
626
+ screenshotHeight: screenshotResolution.height,
627
+ duration: 500
628
+ });
533
629
  actionExecuted = true;
534
630
  }
535
631
  } else if (isTypeCommand(cmd)) {
@@ -557,7 +653,7 @@ async function executeSmartLoop(ctx, params) {
557
653
  stepNumber: params.stepNumber,
558
654
  stepDescription: params.description,
559
655
  executionData: currentExecutionData,
560
- screenResolution: ctx.screenSize,
656
+ screenResolution: screenshotResolution,
561
657
  platform: ctx.platform,
562
658
  filepath: params.filepath
563
659
  });
@@ -602,7 +698,6 @@ class GptDriver {
602
698
  });
603
699
  }
604
700
  apiKey;
605
- organisationId;
606
701
  gptDriverSessionId;
607
702
  gptDriverBaseUrl;
608
703
  appiumSessionConfig;
@@ -614,6 +709,8 @@ class GptDriver {
614
709
  buildId;
615
710
  testId;
616
711
  step_number = 1;
712
+ organisationId;
713
+ configFilePath;
617
714
  // Smart loop state - maintains action history across steps for context
618
715
  globalActionHistory = [];
619
716
  /**
@@ -638,11 +735,12 @@ class GptDriver {
638
735
  constructor(config) {
639
736
  this.testId = config.testId;
640
737
  this.apiKey = config.apiKey;
641
- this.organisationId = config.organisationId;
642
738
  this.buildId = config.buildId;
643
739
  this.useGptDriverCloud = config.useGptDriverCloud;
644
740
  this.gptDriverBaseUrl = GPT_DRIVER_BASE_URL;
645
741
  this.cachingMode = config.cachingMode ?? "NONE";
742
+ this.organisationId = config.organisationId;
743
+ this.configFilePath = config.configFilePath;
646
744
  if (config.useGptDriverCloud) {
647
745
  if (config.serverConfig.device?.platform == null) {
648
746
  throw new Error("Platform is missing. Please specify the platform when using GPTDriver Cloud.");
@@ -828,11 +926,10 @@ class GptDriver {
828
926
  return {
829
927
  apiKey: this.apiKey,
830
928
  platform: this.appiumSessionConfig?.platform,
831
- screenSize: this.appiumSessionConfig.size,
832
929
  globalActionHistory: this.globalActionHistory,
833
- getScreenshot: () => this.getScreenshot(this.appiumSessionConfig),
834
- performTap: (x, y) => this.performTap(x, y),
835
- performScroll: (direction) => this.performScroll(direction),
930
+ getScreenshot: () => this.getScreenshot(this.appiumSessionConfig, false),
931
+ performTap: (x, y, screenshotWidth, screenshotHeight) => this.performTap(x, y, screenshotWidth, screenshotHeight),
932
+ performSwipe: (params) => this.performSwipe(params),
836
933
  performType: (text) => this.performType(text),
837
934
  logCodeExecution: async (screenshot, command) => this.logCodeExecution(screenshot, command),
838
935
  organisationId: this.organisationId
@@ -885,15 +982,31 @@ class GptDriver {
885
982
  /**
886
983
  * Performs a tap action at the specified coordinates.
887
984
  */
888
- async performTap(x, y) {
985
+ async performTap(x, y, screenshotWidth, screenshotHeight) {
889
986
  const client = await this.getWdioClient();
987
+ const platform = this.appiumSessionConfig?.platform;
988
+ const { width: w, height: h } = this.appiumSessionConfig?.size ?? { width: 1080, height: 1920 };
989
+ const scaled = this.scaleForIOS(
990
+ x,
991
+ y,
992
+ platform,
993
+ screenshotWidth,
994
+ screenshotHeight,
995
+ w,
996
+ h
997
+ );
998
+ const clampedX = this.clamp(scaled.x, 0, w - 1);
999
+ const clampedY = this.clamp(scaled.y, 0, h - 1);
1000
+ globalLogger.debug(
1001
+ `[Tap] Platform: ${platform}, Input: ${x},${y}, Window: ${w}x${h}, Final: ${clampedX},${clampedY}`
1002
+ );
890
1003
  await client.performActions([
891
1004
  {
892
1005
  type: "pointer",
893
1006
  id: "finger1",
894
1007
  parameters: { pointerType: "touch" },
895
1008
  actions: [
896
- { type: "pointerMove", duration: 0, x, y },
1009
+ { type: "pointerMove", duration: 0, x: clampedX, y: clampedY },
897
1010
  { type: "pointerDown", button: 0 },
898
1011
  { type: "pause", duration: 100 },
899
1012
  { type: "pointerUp", button: 0 }
@@ -903,25 +1016,125 @@ class GptDriver {
903
1016
  }
904
1017
  async performType(text) {
905
1018
  const client = await this.getWdioClient();
906
- await client.keys(text.split(""));
1019
+ const platform = this.appiumSessionConfig?.platform;
1020
+ if (platform === "iOS") {
1021
+ const actions = text.split("").flatMap((char) => [
1022
+ { type: "keyDown", value: char },
1023
+ { type: "keyUp", value: char }
1024
+ ]);
1025
+ await client.performActions([
1026
+ {
1027
+ type: "key",
1028
+ id: "keyboard",
1029
+ actions
1030
+ }
1031
+ ]);
1032
+ } else {
1033
+ await client.keys(text.split(""));
1034
+ }
907
1035
  }
908
- async performScroll(direction) {
1036
+ clamp(value, min, max) {
1037
+ return Math.max(min, Math.min(max, value));
1038
+ }
1039
+ scaleForIOS(x, y, platform, screenshotWidth, screenshotHeight, windowWidth, windowHeight) {
1040
+ if (platform !== "iOS" || !screenshotWidth || !screenshotHeight || !windowWidth || !windowHeight) {
1041
+ return { x: Math.round(x), y: Math.round(y) };
1042
+ }
1043
+ const scaleX = windowWidth / screenshotWidth;
1044
+ const scaleY = windowHeight / screenshotHeight;
1045
+ return {
1046
+ x: Math.round(x * scaleX),
1047
+ y: Math.round(y * scaleY)
1048
+ };
1049
+ }
1050
+ async performSwipe(params) {
909
1051
  const client = await this.getWdioClient();
910
- const w = this.appiumSessionConfig?.size?.width ?? 1080;
911
- const h = this.appiumSessionConfig?.size?.height ?? 1920;
912
- const x = Math.round(w / 2);
913
- const startY = direction === "down" ? Math.round(h * 0.8) : Math.round(h * 0.2);
914
- const endY = direction === "down" ? Math.round(h * 0.2) : Math.round(h * 0.8);
1052
+ const platform = this.appiumSessionConfig?.platform;
1053
+ const { width: w, height: h } = this.appiumSessionConfig?.size ?? { width: 1080, height: 1920 };
1054
+ const defaultStartX = w / 2;
1055
+ let defaultStartY;
1056
+ if (params.direction === "down") {
1057
+ defaultStartY = h * 0.75;
1058
+ } else if (params.direction === "up") {
1059
+ defaultStartY = h * 0.25;
1060
+ } else {
1061
+ defaultStartY = h / 2;
1062
+ }
1063
+ let startX;
1064
+ let startY;
1065
+ if (params.x1 !== void 0 || params.y1 !== void 0) {
1066
+ const scaled = this.scaleForIOS(
1067
+ params.x1 ?? defaultStartX,
1068
+ params.y1 ?? defaultStartY,
1069
+ platform,
1070
+ params.screenshotWidth,
1071
+ params.screenshotHeight,
1072
+ w,
1073
+ h
1074
+ );
1075
+ startX = scaled.x;
1076
+ startY = scaled.y;
1077
+ } else {
1078
+ startX = Math.round(defaultStartX);
1079
+ startY = Math.round(defaultStartY);
1080
+ }
1081
+ startX = this.clamp(startX, 0, w - 1);
1082
+ startY = this.clamp(startY, 0, h - 1);
1083
+ let endX;
1084
+ let endY;
1085
+ if (params.x2 !== void 0 || params.y2 !== void 0) {
1086
+ const scaled = this.scaleForIOS(
1087
+ params.x2 ?? startX,
1088
+ params.y2 ?? startY,
1089
+ platform,
1090
+ params.screenshotWidth,
1091
+ params.screenshotHeight,
1092
+ w,
1093
+ h
1094
+ );
1095
+ endX = scaled.x;
1096
+ endY = scaled.y;
1097
+ } else {
1098
+ const deltaX = Math.round(w * 0.5);
1099
+ const deltaY = Math.round(h * 0.5);
1100
+ switch (params.direction) {
1101
+ case "left":
1102
+ endX = Math.max(0, startX - deltaX);
1103
+ endY = startY;
1104
+ break;
1105
+ case "right":
1106
+ endX = Math.min(w - 1, startX + deltaX);
1107
+ endY = startY;
1108
+ break;
1109
+ case "up":
1110
+ endX = startX;
1111
+ endY = Math.min(h - 1, startY + deltaY);
1112
+ break;
1113
+ case "down":
1114
+ endX = startX;
1115
+ endY = Math.max(0, startY - deltaY);
1116
+ break;
1117
+ default:
1118
+ endX = startX;
1119
+ endY = startY;
1120
+ }
1121
+ }
1122
+ endX = this.clamp(endX, 0, w - 1);
1123
+ endY = this.clamp(endY, 0, h - 1);
1124
+ globalLogger.debug(
1125
+ `[Swipe] Platform: ${platform}, Direction: ${params.direction}, Start: ${startX},${startY}, End: ${endX},${endY}`
1126
+ );
1127
+ const duration = params.duration ?? 500;
915
1128
  await client.performActions([
916
1129
  {
917
1130
  type: "pointer",
918
1131
  id: "finger1",
919
1132
  parameters: { pointerType: "touch" },
920
1133
  actions: [
921
- { type: "pointerMove", duration: 0, x, y: startY },
1134
+ { type: "pointerMove", duration: 0, x: startX, y: startY },
922
1135
  { type: "pointerDown", button: 0 },
923
1136
  { type: "pause", duration: 100 },
924
- { type: "pointerMove", duration: 500, x, y: endY },
1137
+ { type: "pointerMove", duration, x: endX, y: endY },
925
1138
  { type: "pointerUp", button: 0 }
926
1139
  ]
927
1140
  }
@@ -940,17 +1153,17 @@ class GptDriver {
940
1153
  if (found) {
941
1154
  return;
942
1155
  }
943
- await this.performScroll(direction);
1156
+ await this.performSwipe({ direction });
944
1157
  await this._delay(500);
945
1158
  }
946
1159
  throw new Error(`scrollUntil target not found after ${max} scroll(s)`);
947
1160
  }
948
- async getScreenshot(appiumSessionConfig) {
1161
+ async getScreenshot(appiumSessionConfig, shouldScale = true) {
949
1162
  globalLogger.debug("Capturing screenshot...");
950
1163
  const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/screenshot`);
951
1164
  const screenshotResponse = await axios.get(url);
952
1165
  let screenshot = await screenshotResponse.data.value;
953
- if (appiumSessionConfig.platform === "iOS") {
1166
+ if (appiumSessionConfig.platform === "iOS" && shouldScale) {
954
1167
  globalLogger.debug(`Resizing iOS screenshot to ${appiumSessionConfig.size.width}x${appiumSessionConfig.size.height}`);
955
1168
  const imageBuffer = Buffer.from(screenshot, "base64");
956
1169
  const transformedImage = await sharp(imageBuffer).resize(appiumSessionConfig.size.width, appiumSessionConfig.size.height).toBuffer();
@@ -1003,86 +1216,12 @@ class GptDriver {
1003
1216
  await this.gptHandler(command);
1004
1217
  }
1005
1218
  }
1006
- /**
1007
- * Executes a specified command within the WebDriver session with configurable caching options.
1008
- *
1009
- * This is the recommended method for executing commands. It provides fine-grained control over
1010
- * caching behavior, allowing you to optimize performance and costs for repetitive test scenarios.
1011
- *
1012
- * If an `appiumHandler` is provided, it will be invoked with the WebDriver instance to perform
1013
- * the command-specific operations. After executing the handler, the executed commands get logged
1014
- * on the GPTDriver servers. If the handler execution fails or no handler is provided, the command
1015
- * gets executed by the GPTDriver using natural language processing.
1016
- *
1017
- * @param {Object} params - The execution parameters
1018
- * @param {string} params.command - The natural language command to be executed by the GPTDriver.
1019
- * Examples: "Click the login button", "Enter 'test@example.com' in the email field"
1020
- * @param {AppiumHandler} [params.appiumHandler] - An optional function that processes Appium-specific commands.
1021
- * If provided, this handler is executed instead of calling
1022
- * the GPTDriver API. Useful for performance optimization when
1023
- * you know the exact Appium commands to execute.
1024
- * @param {CachingMode} [params.cachingMode] - Controls how the GPTDriver caches this command execution.
1025
- * If not specified, uses the global caching mode set in the constructor.
1026
- * Options:
1027
- * - "NONE"
1028
- * - "FULL_SCREEN"
1029
- * - "INTERACTION_REGION"
1030
- * @param {boolean} [params.useSmartLoop] - If true, uses the smart loop execution (Cache -> AI -> Execute -> Populate)
1031
- * which optimizes execution by checking cache first and populating it after.
1032
- * Default: false (uses legacy gptHandler)
1033
- *
1034
- * @returns {Promise<void>} A promise that resolves when the command execution is complete.
1035
- *
1036
- * @throws {Error} If an error occurs during the execution of the Appium handler or while processing
1037
- * the command by the GPTDriver.
1038
- *
1039
- * @example
1040
- * // Basic usage with natural language (no caching)
1041
- * await driver.aiExecute({
1042
- * command: "Click the submit button"
1043
- * });
1044
- *
1045
- * @example
1046
- * // Full screen caching for repetitive navigation on similar screens
1047
- * await driver.aiExecute({
1048
- * command: "Navigate to the settings page",
1049
- * cachingMode: "FULL_SCREEN"
1050
- * });
1051
- *
1052
- * @example
1053
- * // Interaction region caching for repeated actions on the same button
1054
- * await driver.aiExecute({
1055
- * command: "Click the login button",
1056
- * cachingMode: "INTERACTION_REGION"
1057
- * });
1058
- *
1059
- * @example
1060
- * // With custom Appium handler as fallback
1061
- * await driver.aiExecute({
1062
- * command: "Click the login button",
1063
- * appiumHandler: async (driver) => {
1064
- * const loginBtn = await driver.$('~loginButton');
1065
- * await loginBtn.click();
1066
- * },
1067
- * cachingMode: "INTERACTION_REGION"
1068
- * });
1069
- *
1070
- * @example
1071
- * // Force fresh execution for dynamic content
1072
- * await driver.aiExecute({
1073
- * command: "Verify the current timestamp",
1074
- * cachingMode: "NONE"
1075
- * });
1076
- *
1077
- * @example
1078
- * // Using smart loop for optimized caching
1079
- * await driver.aiExecute({
1080
- * command: "Click the login button",
1081
- * useSmartLoop: true,
1082
- * cachingMode: "FULL_SCREEN"
1083
- * });
1084
- */
1085
- async aiExecute({ command, appiumHandler, cachingMode, useSmartLoop = false }) {
1219
+ async aiExecute(commandOrOptions, options) {
1220
+ const command = typeof commandOrOptions === "string" ? commandOrOptions : commandOrOptions.command;
1221
+ const opts = typeof commandOrOptions === "string" ? options : commandOrOptions;
1222
+ const appiumHandler = opts?.appiumHandler;
1223
+ const cachingMode = opts?.cachingMode;
1224
+ const useSmartLoop = opts?.useSmartLoop ?? false;
1086
1225
  if (!this.appiumSessionStarted) {
1087
1226
  await this.startSession();
1088
1227
  }
@@ -1335,9 +1474,42 @@ class GptDriver {
1335
1474
  */
1336
1475
  async executeFlow(filePath, options) {
1337
1476
  const useSmartLoop = options?.useSmartLoop ?? false;
1477
+ const configFilePath = this.configFilePath;
1478
+ let baseDir;
1479
+ let absolutePath;
1480
+ if (configFilePath) {
1481
+ let raw2;
1482
+ try {
1483
+ raw2 = await promises.readFile(configFilePath, "utf-8");
1484
+ } catch (e) {
1485
+ const msg = `Failed to read file at ${configFilePath}: ${e?.message ?? e}`;
1486
+ globalLogger.error(msg);
1487
+ throw new Error(msg);
1488
+ }
1489
+ let json2;
1490
+ try {
1491
+ json2 = JSON.parse(raw2);
1492
+ } catch (e) {
1493
+ const msg = `Invalid JSON in flow file ${configFilePath}: ${e?.message ?? e}`;
1494
+ globalLogger.error(msg);
1495
+ throw new Error(msg);
1496
+ }
1497
+ const parsedConfigFile = ConfigSchema.parse(json2);
1498
+ if (path.isAbsolute(parsedConfigFile.testDir)) {
1499
+ baseDir = parsedConfigFile.testDir;
1500
+ } else {
1501
+ baseDir = path.resolve(path.dirname(configFilePath), parsedConfigFile.testDir);
1502
+ }
1503
+ absolutePath = path.isAbsolute(filePath) ? filePath : path.resolve(baseDir, filePath);
1504
+ } else {
1505
+ if (useSmartLoop) {
1506
+ throw new Error("Config file is required when using SmartLoop, please provide the path in the constructor");
1507
+ } else {
1508
+ absolutePath = path.resolve(filePath);
1509
+ baseDir = path.dirname(absolutePath);
1510
+ }
1511
+ }
1338
1512
  globalLogger.info(`Loading flow from file: ${filePath}`);
1339
- const absolutePath = path.resolve(filePath);
1340
- const baseDir = path.dirname(absolutePath);
1341
1513
  let raw;
1342
1514
  try {
1343
1515
  raw = await promises.readFile(absolutePath, "utf-8");
@@ -1377,7 +1549,7 @@ ${issues}`);
1377
1549
  }
1378
1550
  return val.data;
1379
1551
  };
1380
- const expandSteps = async (steps, inheritedParams, parentDir, stack) => {
1552
+ const expandSteps = async (steps, inheritedParams, parentDir, stack, currentFilePath) => {
1381
1553
  const out = [];
1382
1554
  for (const step of steps) {
1383
1555
  if (step.type === "fileRef") {
@@ -1391,17 +1563,23 @@ ${issues}`);
1391
1563
  const child = await loadFlow(refPath);
1392
1564
  const mergedParams = { ...inheritedParams, ...step.overrides ?? {} };
1393
1565
  const childDir = path.dirname(refPath);
1394
- const childExpanded = await expandSteps(child.steps, mergedParams, childDir, [...stack, refKey]);
1566
+ const childRelativePath = path.relative(baseDir, refPath).replace(/^\.\//, "");
1567
+ const childExpanded = await expandSteps(child.steps, mergedParams, childDir, [...stack, refKey], childRelativePath);
1395
1568
  out.push(...childExpanded);
1396
1569
  } else {
1397
- const resolved = { ...step, __params: { ...inheritedParams } };
1570
+ const resolved = {
1571
+ ...step,
1572
+ __params: { ...inheritedParams },
1573
+ __filepath: currentFilePath
1574
+ };
1398
1575
  out.push(resolved);
1399
1576
  }
1400
1577
  }
1401
1578
  return out;
1402
1579
  };
1403
1580
  const effectiveParams = { ...rootFlow.params ?? {} };
1404
- const expandedSteps = await expandSteps(rootFlow.steps, effectiveParams, baseDir, [absolutePath]);
1581
+ const rootRelativePath = path.relative(baseDir, absolutePath).replace(/^\.\//, "");
1582
+ const expandedSteps = await expandSteps(rootFlow.steps, effectiveParams, baseDir, [absolutePath], rootRelativePath);
1405
1583
  if (!this.appiumSessionStarted) {
1406
1584
  await this.startSession();
1407
1585
  }
@@ -1410,7 +1588,8 @@ ${issues}`);
1410
1588
  try {
1411
1589
  for (const step of expandedSteps) {
1412
1590
  const params = step.__params ?? effectiveParams;
1413
- const prefix = `Step #${executed + 1} [${step.type}${step.optional ? ", optional" : ""}]`;
1591
+ const filepath = step.__filepath ?? rootRelativePath;
1592
+ const prefix = `Step #${executed + 1} [${step.type}${step.optional ? ", optional" : ""}] (filepath: ${filepath})`;
1414
1593
  try {
1415
1594
  switch (step.type) {
1416
1595
  case "ai": {
@@ -1421,7 +1600,8 @@ ${issues}`);
1421
1600
  const result = await executeSmartLoop(ctx, {
1422
1601
  stepNumber: this.step_number,
1423
1602
  description: instruction,
1424
- instruction
1603
+ instruction,
1604
+ filepath
1425
1605
  });
1426
1606
  if (!result.success) {
1427
1607
  throw new Error(result.error || "Smart loop execution failed");
@@ -1443,7 +1623,8 @@ ${issues}`);
1443
1623
  const result = await executeSmartLoop(ctx, {
1444
1624
  stepNumber: this.step_number,
1445
1625
  description,
1446
- instruction: description
1626
+ instruction: description,
1627
+ filepath
1447
1628
  });
1448
1629
  if (!result.success) {
1449
1630
  throw new Error(result.error || "Smart loop execution failed");
@@ -1466,7 +1647,8 @@ ${issues}`);
1466
1647
  const result = await executeSmartLoop(ctx, {
1467
1648
  stepNumber: this.step_number,
1468
1649
  description,
1469
- instruction
1650
+ instruction,
1651
+ filepath
1470
1652
  });
1471
1653
  if (!result.success) {
1472
1654
  throw new Error(result.error || "Smart loop execution failed");
@@ -1485,11 +1667,39 @@ ${issues}`);
1485
1667
  this.step_number++;
1486
1668
  break;
1487
1669
  }
1488
- case "scroll": {
1489
- globalLogger.info(`${prefix}: Scroll ${step.direction}`);
1490
- await this.takeScreenshotAndLogCodeExecution(`scroll: direction=${step.direction}`);
1491
- await this.performScroll(step.direction);
1492
- this.step_number++;
1670
+ case "scroll":
1671
+ case "swipe": {
1672
+ const description = step.descriptionText ? this.interpolateTemplate(step.descriptionText, params) : void 0;
1673
+ if (description && useSmartLoop) {
1674
+ globalLogger.info(`${prefix}: ${description}`);
1675
+ const ctx = this.createSmartLoopContext();
1676
+ const result = await executeSmartLoop(ctx, {
1677
+ stepNumber: this.step_number,
1678
+ description,
1679
+ instruction: description,
1680
+ filepath
1681
+ });
1682
+ if (!result.success) {
1683
+ throw new Error(result.error || "Smart loop execution failed");
1684
+ }
1685
+ this.step_number++;
1686
+ } else {
1687
+ globalLogger.info(`${prefix}: ${step.type} ${step.direction}`);
1688
+ await this.takeScreenshotAndLogCodeExecution(`${step.type}: direction=${step.direction}`);
1689
+ if (step.type === "swipe") {
1690
+ await this.performSwipe({
1691
+ direction: step.direction,
1692
+ x1: step.x1,
1693
+ y1: step.y1,
1694
+ x2: step.x2,
1695
+ y2: step.y2,
1696
+ duration: step.duration
1697
+ });
1698
+ } else {
1699
+ await this.performSwipe({ direction: step.direction });
1700
+ }
1701
+ this.step_number++;
1702
+ }
1493
1703
  break;
1494
1704
  }
1495
1705
  case "zoom": {
@@ -1576,10 +1786,7 @@ ${issues}`);
1576
1786
  for (const appiumCommand of executeResponse.commands) {
1577
1787
  await this.executeCommand(appiumCommand);
1578
1788
  }
1579
- if (!conditionSucceeded) {
1580
- globalLogger.debug("Command still in progress, waiting...");
1581
- await delay(1500);
1582
- }
1789
+ await delay(1500);
1583
1790
  }
1584
1791
  this.step_number = this.step_number + 1;
1585
1792
  globalLogger.info("Command execution completed successfully");
@@ -1592,8 +1799,7 @@ ${issues}`);
1592
1799
  async executeCommand(command) {
1593
1800
  const firstAction = command.data?.actions?.at(0);
1594
1801
  if (firstAction?.type === "pause" && firstAction.duration != null) {
1595
- globalLogger.debug(`Pausing for ${firstAction.duration} seconds`);
1596
- await delay(firstAction * 1e3);
1802
+ await delay(firstAction.duration * 1e3);
1597
1803
  } else if (!this.useGptDriverCloud) {
1598
1804
  const parsedUrl = new URL(command.url);
1599
1805
  parsedUrl.protocol = this.appiumSessionConfig.serverUrl.protocol;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "gpt-driver-node",
3
- "version": "1.0.1",
3
+ "version": "1.0.3",
4
4
  "main": "./dist/index.cjs",
5
5
  "module": "./dist/index.mjs",
6
6
  "types": "./dist/index.d.cts",