gpt-driver-node 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -22,6 +22,15 @@ function buildUrl(base, extraPath) {
22
22
  }
23
23
  return `${baseUrl}${extraPath}`;
24
24
  }
25
/**
 * Reads the pixel dimensions of a base64-encoded image.
 *
 * @param {string} base64 - Image payload, either bare base64 or prefixed
 *   with a `data:image/<type>;base64,` header.
 * @returns {Promise<{width: number, height: number}>} Width and height taken
 *   from the metadata reported by sharp.
 * @throws {Error} When the metadata has no (or a zero) width or height.
 */
const getImageDimensions = async (base64) => {
  // Drop an optional data-URI header so only the base64 payload gets decoded.
  const payload = base64.replace(/^data:image\/\w+;base64,/, "");
  const { width, height } = await sharp(Buffer.from(payload, "base64")).metadata();
  if (width && height) {
    return { width, height };
  }
  throw new Error("Unable to get image dimensions");
};
25
34
 
26
35
  const colors = {
27
36
  reset: "\x1B[0m",
@@ -107,6 +116,15 @@ const SavableTypeStepSchema = SavableStepBaseSchema.extend({
107
116
  type: zod.z.literal("type"),
108
117
  text: zod.z.string()
109
118
  });
119
+ const SavableSwipeStepSchema = SavableStepBaseSchema.extend({
120
+ type: zod.z.literal("swipe"),
121
+ direction: zod.z.enum(["left", "right", "up", "down"]),
122
+ x1: zod.z.number().optional(),
123
+ y1: zod.z.number().optional(),
124
+ x2: zod.z.number().optional(),
125
+ y2: zod.z.number().optional(),
126
+ duration: zod.z.number().optional().default(500)
127
+ });
110
128
  const SavableScrollStepSchema = SavableStepBaseSchema.extend({
111
129
  type: zod.z.literal("scroll"),
112
130
  direction: zod.z.enum(["up", "down"])
@@ -142,6 +160,8 @@ const SavableStepSchema = zod.z.discriminatedUnion("type", [
142
160
  // type: 'assert'
143
161
  SavableTypeStepSchema,
144
162
  // type: 'type'
163
+ SavableSwipeStepSchema,
164
+ // type: 'swipe'
145
165
  SavableScrollStepSchema,
146
166
  // type: 'scroll'
147
167
  SavableZoomStepSchema,
@@ -427,6 +447,30 @@ function isScrollCommand(cmd) {
427
447
  function isTypeCommand(cmd) {
428
448
  return cmd.startsWith("type:");
429
449
  }
450
/**
 * Tells whether a command string is a "slide" command.
 *
 * The prefix is compared case-insensitively so that every spelling accepted
 * by parseSlideCommand (whose pattern carries the `i` flag) is also
 * recognised here; a case-sensitive check would silently drop "Slide ...".
 *
 * @param {string} cmd - Raw command text.
 * @returns {boolean} True when cmd begins with "slide" in any letter case.
 */
function isSlideCommand(cmd) {
  return cmd.slice(0, 5).toLowerCase() === "slide";
}
453
/**
 * Parses a slide command of the form
 * `slide <up|down|left|right> <N>%[: <free text>];<x>;<y>`.
 *
 * The direction found in the command is returned inverted
 * (up <-> down, left <-> right).
 *
 * @param {string} cmd - Raw command text.
 * @returns {{direction: string, percentage: number, startX: number, startY: number} | null}
 *   The parsed fields, or null when cmd does not match the pattern.
 */
function parseSlideCommand(cmd) {
  const match = cmd.match(
    /slide\s+(up|down|left|right)\s+(\d+)%(?::\s*[^;]*)?;(\d+);(\d+)/i
  );
  if (!match) {
    return null;
  }
  const [, rawDirection, rawPercentage, rawX, rawY] = match;
  // Each direction maps to its opposite.
  let direction;
  switch (rawDirection.toLowerCase()) {
    case "down":
      direction = "up";
      break;
    case "up":
      direction = "down";
      break;
    case "left":
      direction = "right";
      break;
    case "right":
      direction = "left";
      break;
  }
  return {
    direction,
    percentage: parseInt(rawPercentage, 10),
    startX: parseInt(rawX, 10),
    startY: parseInt(rawY, 10)
  };
}
430
474
 
431
475
  async function executeSmartLoop(ctx, params) {
432
476
  const maxCacheAttempts = Math.floor(CACHE_RETRY_MS / CACHE_CHECK_INTERVAL_MS);
@@ -442,6 +486,8 @@ async function executeSmartLoop(ctx, params) {
442
486
  let screenshot = "";
443
487
  let commands = [];
444
488
  let isCacheHit = false;
489
+ const firstScreenshot = await ctx.getScreenshot();
490
+ const screenshotResolution = await getImageDimensions(firstScreenshot);
445
491
  for (let attempt = 0; attempt < maxCacheAttempts; attempt++) {
446
492
  screenshot = await ctx.getScreenshot();
447
493
  const sizeInBytes = screenshot.length * 0.75;
@@ -454,7 +500,7 @@ async function executeSmartLoop(ctx, params) {
454
500
  stepNumber: params.stepNumber,
455
501
  stepDescription: params.description,
456
502
  screenshot,
457
- screenResolution: ctx.screenSize,
503
+ screenResolution: screenshotResolution,
458
504
  highestUsedIndex: lastCacheIndex,
459
505
  platform: ctx.platform,
460
506
  filepath: params.filepath
@@ -525,7 +571,12 @@ async function executeSmartLoop(ctx, params) {
525
571
  const coords = parseTapCoordinates(cmd);
526
572
  if (coords) {
527
573
  globalLogger.debug(`[SmartLoop] Executing tap at (${coords.x}, ${coords.y})`);
528
- await ctx.performTap(coords.x, coords.y);
574
+ await ctx.performTap(
575
+ coords.x,
576
+ coords.y,
577
+ screenshotResolution.width,
578
+ screenshotResolution.height
579
+ );
529
580
  actionExecuted = true;
530
581
  }
531
582
  } else if (isWaitCommand(cmd)) {
@@ -539,7 +590,44 @@ async function executeSmartLoop(ctx, params) {
539
590
  const direction = parseScrollDirection(cmd);
540
591
  if (direction) {
541
592
  globalLogger.debug(`[SmartLoop] Scrolling ${direction}`);
542
- await ctx.performScroll(direction);
593
+ await ctx.performSwipe({ direction });
594
+ actionExecuted = true;
595
+ }
596
+ } else if (isSlideCommand(cmd)) {
597
+ const slideParams = parseSlideCommand(cmd);
598
+ if (slideParams) {
599
+ const { direction, percentage, startX, startY } = slideParams;
600
+ const isVertical = direction === "up" || direction === "down";
601
+ const distance = Math.round(
602
+ (isVertical ? screenshotResolution.height : screenshotResolution.width) * (percentage / 100)
603
+ );
604
+ let endX = startX;
605
+ let endY = startY;
606
+ switch (direction) {
607
+ case "up":
608
+ endY = startY + distance;
609
+ break;
610
+ case "down":
611
+ endY = startY - distance;
612
+ break;
613
+ case "left":
614
+ endX = startX - distance;
615
+ break;
616
+ case "right":
617
+ endX = startX + distance;
618
+ break;
619
+ }
620
+ globalLogger.debug(`[SmartLoop] Sliding ${direction} ${percentage}% from (${startX}, ${startY}) to (${endX}, ${endY})`);
621
+ await ctx.performSwipe({
622
+ direction,
623
+ x1: startX,
624
+ y1: startY,
625
+ x2: endX,
626
+ y2: endY,
627
+ screenshotWidth: screenshotResolution.width,
628
+ screenshotHeight: screenshotResolution.height,
629
+ duration: 500
630
+ });
543
631
  actionExecuted = true;
544
632
  }
545
633
  } else if (isTypeCommand(cmd)) {
@@ -567,7 +655,7 @@ async function executeSmartLoop(ctx, params) {
567
655
  stepNumber: params.stepNumber,
568
656
  stepDescription: params.description,
569
657
  executionData: currentExecutionData,
570
- screenResolution: ctx.screenSize,
658
+ screenResolution: screenshotResolution,
571
659
  platform: ctx.platform,
572
660
  filepath: params.filepath
573
661
  });
@@ -840,11 +928,10 @@ class GptDriver {
840
928
  return {
841
929
  apiKey: this.apiKey,
842
930
  platform: this.appiumSessionConfig?.platform,
843
- screenSize: this.appiumSessionConfig.size,
844
931
  globalActionHistory: this.globalActionHistory,
845
- getScreenshot: () => this.getScreenshot(this.appiumSessionConfig),
846
- performTap: (x, y) => this.performTap(x, y),
847
- performScroll: (direction) => this.performScroll(direction),
932
+ getScreenshot: () => this.getScreenshot(this.appiumSessionConfig, false),
933
+ performTap: (x, y, screenshotWidth, screenshotHeight) => this.performTap(x, y, screenshotWidth, screenshotHeight),
934
+ performSwipe: (params) => this.performSwipe(params),
848
935
  performType: (text) => this.performType(text),
849
936
  logCodeExecution: async (screenshot, command) => this.logCodeExecution(screenshot, command),
850
937
  organisationId: this.organisationId
@@ -897,15 +984,31 @@ class GptDriver {
897
984
  /**
898
985
  * Performs a tap action at the specified coordinates.
899
986
  */
900
- async performTap(x, y) {
987
+ async performTap(x, y, screenshotWidth, screenshotHeight) {
901
988
  const client = await this.getWdioClient();
989
+ const platform = this.appiumSessionConfig?.platform;
990
+ const { width: w, height: h } = this.appiumSessionConfig?.size ?? { width: 1080, height: 1920 };
991
+ const scaled = this.scaleForIOS(
992
+ x,
993
+ y,
994
+ platform,
995
+ screenshotWidth,
996
+ screenshotHeight,
997
+ w,
998
+ h
999
+ );
1000
+ const clampedX = this.clamp(scaled.x, 0, w - 1);
1001
+ const clampedY = this.clamp(scaled.y, 0, h - 1);
1002
+ globalLogger.debug(
1003
+ `[Tap] Platform: ${platform}, Input: ${x},${y}, Window: ${w}x${h}, Final: ${clampedX},${clampedY}`
1004
+ );
902
1005
  await client.performActions([
903
1006
  {
904
1007
  type: "pointer",
905
1008
  id: "finger1",
906
1009
  parameters: { pointerType: "touch" },
907
1010
  actions: [
908
- { type: "pointerMove", duration: 0, x, y },
1011
+ { type: "pointerMove", duration: 0, x: clampedX, y: clampedY },
909
1012
  { type: "pointerDown", button: 0 },
910
1013
  { type: "pause", duration: 100 },
911
1014
  { type: "pointerUp", button: 0 }
@@ -915,25 +1018,125 @@ class GptDriver {
915
1018
  }
916
1019
  async performType(text) {
917
1020
  const client = await this.getWdioClient();
918
- await client.keys(text.split(""));
1021
+ const platform = this.appiumSessionConfig?.platform;
1022
+ if (platform === "iOS") {
1023
+ const actions = text.split("").flatMap((char) => [
1024
+ { type: "keyDown", value: char },
1025
+ { type: "keyUp", value: char }
1026
+ ]);
1027
+ await client.performActions([
1028
+ {
1029
+ type: "key",
1030
+ id: "keyboard",
1031
+ actions
1032
+ }
1033
+ ]);
1034
+ } else {
1035
+ await client.keys(text.split(""));
1036
+ }
1037
+ }
1038
+ clamp(value, min, max) {
1039
+ return Math.max(min, Math.min(max, value));
1040
+ }
1041
+ scaleForIOS(x, y, platform, screenshotWidth, screenshotHeight, windowWidth, windowHeight) {
1042
+ if (platform !== "iOS" || !screenshotWidth || !screenshotHeight || !windowWidth || !windowHeight) {
1043
+ return { x: Math.round(x), y: Math.round(y) };
1044
+ }
1045
+ const scaleX = windowWidth / screenshotWidth;
1046
+ const scaleY = windowHeight / screenshotHeight;
1047
+ return {
1048
+ x: Math.round(x * scaleX),
1049
+ y: Math.round(y * scaleY)
1050
+ };
919
1051
  }
920
- async performScroll(direction) {
1052
+ async performSwipe(params) {
921
1053
  const client = await this.getWdioClient();
922
- const w = this.appiumSessionConfig?.size?.width ?? 1080;
923
- const h = this.appiumSessionConfig?.size?.height ?? 1920;
924
- const x = Math.round(w / 2);
925
- const startY = direction === "down" ? Math.round(h * 0.8) : Math.round(h * 0.2);
926
- const endY = direction === "down" ? Math.round(h * 0.2) : Math.round(h * 0.8);
1054
+ const platform = this.appiumSessionConfig?.platform;
1055
+ const { width: w, height: h } = this.appiumSessionConfig?.size ?? { width: 1080, height: 1920 };
1056
+ const defaultStartX = w / 2;
1057
+ let defaultStartY;
1058
+ if (params.direction === "down") {
1059
+ defaultStartY = h * 0.75;
1060
+ } else if (params.direction === "up") {
1061
+ defaultStartY = h * 0.25;
1062
+ } else {
1063
+ defaultStartY = h / 2;
1064
+ }
1065
+ let startX;
1066
+ let startY;
1067
+ if (params.x1 !== void 0 || params.y1 !== void 0) {
1068
+ const scaled = this.scaleForIOS(
1069
+ params.x1 ?? defaultStartX,
1070
+ params.y1 ?? defaultStartY,
1071
+ platform,
1072
+ params.screenshotWidth,
1073
+ params.screenshotHeight,
1074
+ w,
1075
+ h
1076
+ );
1077
+ startX = scaled.x;
1078
+ startY = scaled.y;
1079
+ } else {
1080
+ startX = Math.round(defaultStartX);
1081
+ startY = Math.round(defaultStartY);
1082
+ }
1083
+ startX = this.clamp(startX, 0, w - 1);
1084
+ startY = this.clamp(startY, 0, h - 1);
1085
+ let endX;
1086
+ let endY;
1087
+ if (params.x2 !== void 0 || params.y2 !== void 0) {
1088
+ const scaled = this.scaleForIOS(
1089
+ params.x2 ?? startX,
1090
+ params.y2 ?? startY,
1091
+ platform,
1092
+ params.screenshotWidth,
1093
+ params.screenshotHeight,
1094
+ w,
1095
+ h
1096
+ );
1097
+ endX = scaled.x;
1098
+ endY = scaled.y;
1099
+ } else {
1100
+ const deltaX = Math.round(w * 0.5);
1101
+ const deltaY = Math.round(h * 0.5);
1102
+ switch (params.direction) {
1103
+ case "left":
1104
+ endX = Math.max(0, startX - deltaX);
1105
+ endY = startY;
1106
+ break;
1107
+ case "right":
1108
+ endX = Math.min(w - 1, startX + deltaX);
1109
+ endY = startY;
1110
+ break;
1111
+ case "up":
1112
+ endX = startX;
1113
+ endY = Math.min(h - 1, startY + deltaY);
1114
+ break;
1115
+ case "down":
1116
+ endX = startX;
1117
+ endY = Math.max(0, startY - deltaY);
1118
+ break;
1119
+ default:
1120
+ endX = startX;
1121
+ endY = startY;
1122
+ }
1123
+ }
1124
+ endX = this.clamp(endX, 0, w - 1);
1125
+ endY = this.clamp(endY, 0, h - 1);
1126
+ globalLogger.debug(
1127
+ `[Swipe] Platform: ${platform}, Direction: ${params.direction}, Start: ${startX},${startY}, End: ${endX},${endY}`
1128
+ );
1129
+ const duration = params.duration ?? 500;
927
1130
  await client.performActions([
928
1131
  {
929
1132
  type: "pointer",
930
1133
  id: "finger1",
931
1134
  parameters: { pointerType: "touch" },
932
1135
  actions: [
933
- { type: "pointerMove", duration: 0, x, y: startY },
1136
+ { type: "pointerMove", duration: 0, x: startX, y: startY },
934
1137
  { type: "pointerDown", button: 0 },
935
1138
  { type: "pause", duration: 100 },
936
- { type: "pointerMove", duration: 500, x, y: endY },
1139
+ { type: "pointerMove", duration, x: endX, y: endY },
937
1140
  { type: "pointerUp", button: 0 }
938
1141
  ]
939
1142
  }
@@ -952,17 +1155,17 @@ class GptDriver {
952
1155
  if (found) {
953
1156
  return;
954
1157
  }
955
- await this.performScroll(direction);
1158
+ await this.performSwipe({ direction });
956
1159
  await this._delay(500);
957
1160
  }
958
1161
  throw new Error(`scrollUntil target not found after ${max} scroll(s)`);
959
1162
  }
960
- async getScreenshot(appiumSessionConfig) {
1163
+ async getScreenshot(appiumSessionConfig, shouldScale = true) {
961
1164
  globalLogger.debug("Capturing screenshot...");
962
1165
  const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/screenshot`);
963
1166
  const screenshotResponse = await axios.get(url);
964
1167
  let screenshot = await screenshotResponse.data.value;
965
- if (appiumSessionConfig.platform === "iOS") {
1168
+ if (appiumSessionConfig.platform === "iOS" && shouldScale) {
966
1169
  globalLogger.debug(`Resizing iOS screenshot to ${appiumSessionConfig.size.width}x${appiumSessionConfig.size.height}`);
967
1170
  const imageBuffer = Buffer.from(screenshot, "base64");
968
1171
  const transformedImage = await sharp(imageBuffer).resize(appiumSessionConfig.size.width, appiumSessionConfig.size.height).toBuffer();
@@ -1348,7 +1551,7 @@ ${issues}`);
1348
1551
  }
1349
1552
  return val.data;
1350
1553
  };
1351
- const expandSteps = async (steps, inheritedParams, parentDir, stack) => {
1554
+ const expandSteps = async (steps, inheritedParams, parentDir, stack, currentFilePath) => {
1352
1555
  const out = [];
1353
1556
  for (const step of steps) {
1354
1557
  if (step.type === "fileRef") {
@@ -1362,17 +1565,23 @@ ${issues}`);
1362
1565
  const child = await loadFlow(refPath);
1363
1566
  const mergedParams = { ...inheritedParams, ...step.overrides ?? {} };
1364
1567
  const childDir = path.dirname(refPath);
1365
- const childExpanded = await expandSteps(child.steps, mergedParams, childDir, [...stack, refKey]);
1568
+ const childRelativePath = path.relative(baseDir, refPath).replace(/^\.\//, "");
1569
+ const childExpanded = await expandSteps(child.steps, mergedParams, childDir, [...stack, refKey], childRelativePath);
1366
1570
  out.push(...childExpanded);
1367
1571
  } else {
1368
- const resolved = { ...step, __params: { ...inheritedParams } };
1572
+ const resolved = {
1573
+ ...step,
1574
+ __params: { ...inheritedParams },
1575
+ __filepath: currentFilePath
1576
+ };
1369
1577
  out.push(resolved);
1370
1578
  }
1371
1579
  }
1372
1580
  return out;
1373
1581
  };
1374
1582
  const effectiveParams = { ...rootFlow.params ?? {} };
1375
- const expandedSteps = await expandSteps(rootFlow.steps, effectiveParams, baseDir, [absolutePath]);
1583
+ const rootRelativePath = path.relative(baseDir, absolutePath).replace(/^\.\//, "");
1584
+ const expandedSteps = await expandSteps(rootFlow.steps, effectiveParams, baseDir, [absolutePath], rootRelativePath);
1376
1585
  if (!this.appiumSessionStarted) {
1377
1586
  await this.startSession();
1378
1587
  }
@@ -1381,7 +1590,8 @@ ${issues}`);
1381
1590
  try {
1382
1591
  for (const step of expandedSteps) {
1383
1592
  const params = step.__params ?? effectiveParams;
1384
- const prefix = `Step #${executed + 1} [${step.type}${step.optional ? ", optional" : ""}]`;
1593
+ const filepath = step.__filepath ?? rootRelativePath;
1594
+ const prefix = `Step #${executed + 1} [${step.type}${step.optional ? ", optional" : ""}] (filepath: ${filepath})`;
1385
1595
  try {
1386
1596
  switch (step.type) {
1387
1597
  case "ai": {
@@ -1392,7 +1602,8 @@ ${issues}`);
1392
1602
  const result = await executeSmartLoop(ctx, {
1393
1603
  stepNumber: this.step_number,
1394
1604
  description: instruction,
1395
- instruction
1605
+ instruction,
1606
+ filepath
1396
1607
  });
1397
1608
  if (!result.success) {
1398
1609
  throw new Error(result.error || "Smart loop execution failed");
@@ -1414,7 +1625,8 @@ ${issues}`);
1414
1625
  const result = await executeSmartLoop(ctx, {
1415
1626
  stepNumber: this.step_number,
1416
1627
  description,
1417
- instruction: description
1628
+ instruction: description,
1629
+ filepath
1418
1630
  });
1419
1631
  if (!result.success) {
1420
1632
  throw new Error(result.error || "Smart loop execution failed");
@@ -1437,7 +1649,8 @@ ${issues}`);
1437
1649
  const result = await executeSmartLoop(ctx, {
1438
1650
  stepNumber: this.step_number,
1439
1651
  description,
1440
- instruction
1652
+ instruction,
1653
+ filepath
1441
1654
  });
1442
1655
  if (!result.success) {
1443
1656
  throw new Error(result.error || "Smart loop execution failed");
@@ -1456,11 +1669,39 @@ ${issues}`);
1456
1669
  this.step_number++;
1457
1670
  break;
1458
1671
  }
1459
- case "scroll": {
1460
- globalLogger.info(`${prefix}: Scroll ${step.direction}`);
1461
- await this.takeScreenshotAndLogCodeExecution(`scroll: direction=${step.direction}`);
1462
- await this.performScroll(step.direction);
1463
- this.step_number++;
1672
+ case "scroll":
1673
+ case "swipe": {
1674
+ const description = step.descriptionText ? this.interpolateTemplate(step.descriptionText, params) : void 0;
1675
+ if (description && useSmartLoop) {
1676
+ globalLogger.info(`${prefix}: ${description}`);
1677
+ const ctx = this.createSmartLoopContext();
1678
+ const result = await executeSmartLoop(ctx, {
1679
+ stepNumber: this.step_number,
1680
+ description,
1681
+ instruction: description,
1682
+ filepath
1683
+ });
1684
+ if (!result.success) {
1685
+ throw new Error(result.error || "Smart loop execution failed");
1686
+ }
1687
+ this.step_number++;
1688
+ } else {
1689
+ globalLogger.info(`${prefix}: ${step.type} ${step.direction}`);
1690
+ await this.takeScreenshotAndLogCodeExecution(`${step.type}: direction=${step.direction}`);
1691
+ if (step.type === "swipe") {
1692
+ await this.performSwipe({
1693
+ direction: step.direction,
1694
+ x1: step.x1,
1695
+ y1: step.y1,
1696
+ x2: step.x2,
1697
+ y2: step.y2,
1698
+ duration: step.duration
1699
+ });
1700
+ } else {
1701
+ await this.performSwipe({ direction: step.direction });
1702
+ }
1703
+ this.step_number++;
1704
+ }
1464
1705
  break;
1465
1706
  }
1466
1707
  case "zoom": {
package/dist/index.d.cts CHANGED
@@ -121,6 +121,40 @@ declare const SavableTestStoreSchema: z.ZodObject<{
121
121
  id: z.ZodOptional<z.ZodNumber>;
122
122
  descriptionText: z.ZodOptional<z.ZodString>;
123
123
  optional: z.ZodOptional<z.ZodBoolean>;
124
+ } & {
125
+ type: z.ZodLiteral<"swipe">;
126
+ direction: z.ZodEnum<["left", "right", "up", "down"]>;
127
+ x1: z.ZodOptional<z.ZodNumber>;
128
+ y1: z.ZodOptional<z.ZodNumber>;
129
+ x2: z.ZodOptional<z.ZodNumber>;
130
+ y2: z.ZodOptional<z.ZodNumber>;
131
+ duration: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
132
+ }, "strip", z.ZodTypeAny, {
133
+ type: "swipe";
134
+ direction: "left" | "right" | "up" | "down";
135
+ duration: number;
136
+ id?: number | undefined;
137
+ descriptionText?: string | undefined;
138
+ optional?: boolean | undefined;
139
+ x1?: number | undefined;
140
+ y1?: number | undefined;
141
+ x2?: number | undefined;
142
+ y2?: number | undefined;
143
+ }, {
144
+ type: "swipe";
145
+ direction: "left" | "right" | "up" | "down";
146
+ id?: number | undefined;
147
+ descriptionText?: string | undefined;
148
+ optional?: boolean | undefined;
149
+ x1?: number | undefined;
150
+ y1?: number | undefined;
151
+ x2?: number | undefined;
152
+ y2?: number | undefined;
153
+ duration?: number | undefined;
154
+ }>, z.ZodObject<{
155
+ id: z.ZodOptional<z.ZodNumber>;
156
+ descriptionText: z.ZodOptional<z.ZodString>;
157
+ optional: z.ZodOptional<z.ZodBoolean>;
124
158
  } & {
125
159
  type: z.ZodLiteral<"scroll">;
126
160
  direction: z.ZodEnum<["up", "down"]>;
@@ -271,6 +305,17 @@ declare const SavableTestStoreSchema: z.ZodObject<{
271
305
  id?: number | undefined;
272
306
  descriptionText?: string | undefined;
273
307
  optional?: boolean | undefined;
308
+ } | {
309
+ type: "swipe";
310
+ direction: "left" | "right" | "up" | "down";
311
+ duration: number;
312
+ id?: number | undefined;
313
+ descriptionText?: string | undefined;
314
+ optional?: boolean | undefined;
315
+ x1?: number | undefined;
316
+ y1?: number | undefined;
317
+ x2?: number | undefined;
318
+ y2?: number | undefined;
274
319
  } | {
275
320
  type: "scroll";
276
321
  direction: "up" | "down";
@@ -339,6 +384,17 @@ declare const SavableTestStoreSchema: z.ZodObject<{
339
384
  id?: number | undefined;
340
385
  descriptionText?: string | undefined;
341
386
  optional?: boolean | undefined;
387
+ } | {
388
+ type: "swipe";
389
+ direction: "left" | "right" | "up" | "down";
390
+ id?: number | undefined;
391
+ descriptionText?: string | undefined;
392
+ optional?: boolean | undefined;
393
+ x1?: number | undefined;
394
+ y1?: number | undefined;
395
+ x2?: number | undefined;
396
+ y2?: number | undefined;
397
+ duration?: number | undefined;
342
398
  } | {
343
399
  type: "scroll";
344
400
  direction: "up" | "down";
@@ -461,7 +517,9 @@ declare class GptDriver {
461
517
  */
462
518
  private performTap;
463
519
  private performType;
464
- private performScroll;
520
+ private clamp;
521
+ private scaleForIOS;
522
+ private performSwipe;
465
523
  private getPageSource;
466
524
  private performScrollUntil;
467
525
  private getScreenshot;
package/dist/index.mjs CHANGED
@@ -20,6 +20,15 @@ function buildUrl(base, extraPath) {
20
20
  }
21
21
  return `${baseUrl}${extraPath}`;
22
22
  }
23
/**
 * Reads the pixel dimensions of a base64-encoded image.
 *
 * @param {string} base64 - Image payload, either bare base64 or prefixed
 *   with a `data:image/<type>;base64,` header.
 * @returns {Promise<{width: number, height: number}>} Width and height taken
 *   from the metadata reported by sharp.
 * @throws {Error} When the metadata has no (or a zero) width or height.
 */
const getImageDimensions = async (base64) => {
  // Drop an optional data-URI header so only the base64 payload gets decoded.
  const payload = base64.replace(/^data:image\/\w+;base64,/, "");
  const { width, height } = await sharp(Buffer.from(payload, "base64")).metadata();
  if (width && height) {
    return { width, height };
  }
  throw new Error("Unable to get image dimensions");
};
23
32
 
24
33
  const colors = {
25
34
  reset: "\x1B[0m",
@@ -105,6 +114,15 @@ const SavableTypeStepSchema = SavableStepBaseSchema.extend({
105
114
  type: z.literal("type"),
106
115
  text: z.string()
107
116
  });
117
+ const SavableSwipeStepSchema = SavableStepBaseSchema.extend({
118
+ type: z.literal("swipe"),
119
+ direction: z.enum(["left", "right", "up", "down"]),
120
+ x1: z.number().optional(),
121
+ y1: z.number().optional(),
122
+ x2: z.number().optional(),
123
+ y2: z.number().optional(),
124
+ duration: z.number().optional().default(500)
125
+ });
108
126
  const SavableScrollStepSchema = SavableStepBaseSchema.extend({
109
127
  type: z.literal("scroll"),
110
128
  direction: z.enum(["up", "down"])
@@ -140,6 +158,8 @@ const SavableStepSchema = z.discriminatedUnion("type", [
140
158
  // type: 'assert'
141
159
  SavableTypeStepSchema,
142
160
  // type: 'type'
161
+ SavableSwipeStepSchema,
162
+ // type: 'swipe'
143
163
  SavableScrollStepSchema,
144
164
  // type: 'scroll'
145
165
  SavableZoomStepSchema,
@@ -425,6 +445,30 @@ function isScrollCommand(cmd) {
425
445
  function isTypeCommand(cmd) {
426
446
  return cmd.startsWith("type:");
427
447
  }
448
/**
 * Tells whether a command string is a "slide" command.
 *
 * The prefix is compared case-insensitively so that every spelling accepted
 * by parseSlideCommand (whose pattern carries the `i` flag) is also
 * recognised here; a case-sensitive check would silently drop "Slide ...".
 *
 * @param {string} cmd - Raw command text.
 * @returns {boolean} True when cmd begins with "slide" in any letter case.
 */
function isSlideCommand(cmd) {
  return cmd.slice(0, 5).toLowerCase() === "slide";
}
451
/**
 * Parses a slide command of the form
 * `slide <up|down|left|right> <N>%[: <free text>];<x>;<y>`.
 *
 * The direction found in the command is returned inverted
 * (up <-> down, left <-> right).
 *
 * @param {string} cmd - Raw command text.
 * @returns {{direction: string, percentage: number, startX: number, startY: number} | null}
 *   The parsed fields, or null when cmd does not match the pattern.
 */
function parseSlideCommand(cmd) {
  const match = cmd.match(
    /slide\s+(up|down|left|right)\s+(\d+)%(?::\s*[^;]*)?;(\d+);(\d+)/i
  );
  if (!match) {
    return null;
  }
  const [, rawDirection, rawPercentage, rawX, rawY] = match;
  // Each direction maps to its opposite.
  let direction;
  switch (rawDirection.toLowerCase()) {
    case "down":
      direction = "up";
      break;
    case "up":
      direction = "down";
      break;
    case "left":
      direction = "right";
      break;
    case "right":
      direction = "left";
      break;
  }
  return {
    direction,
    percentage: parseInt(rawPercentage, 10),
    startX: parseInt(rawX, 10),
    startY: parseInt(rawY, 10)
  };
}
428
472
 
429
473
  async function executeSmartLoop(ctx, params) {
430
474
  const maxCacheAttempts = Math.floor(CACHE_RETRY_MS / CACHE_CHECK_INTERVAL_MS);
@@ -440,6 +484,8 @@ async function executeSmartLoop(ctx, params) {
440
484
  let screenshot = "";
441
485
  let commands = [];
442
486
  let isCacheHit = false;
487
+ const firstScreenshot = await ctx.getScreenshot();
488
+ const screenshotResolution = await getImageDimensions(firstScreenshot);
443
489
  for (let attempt = 0; attempt < maxCacheAttempts; attempt++) {
444
490
  screenshot = await ctx.getScreenshot();
445
491
  const sizeInBytes = screenshot.length * 0.75;
@@ -452,7 +498,7 @@ async function executeSmartLoop(ctx, params) {
452
498
  stepNumber: params.stepNumber,
453
499
  stepDescription: params.description,
454
500
  screenshot,
455
- screenResolution: ctx.screenSize,
501
+ screenResolution: screenshotResolution,
456
502
  highestUsedIndex: lastCacheIndex,
457
503
  platform: ctx.platform,
458
504
  filepath: params.filepath
@@ -523,7 +569,12 @@ async function executeSmartLoop(ctx, params) {
523
569
  const coords = parseTapCoordinates(cmd);
524
570
  if (coords) {
525
571
  globalLogger.debug(`[SmartLoop] Executing tap at (${coords.x}, ${coords.y})`);
526
- await ctx.performTap(coords.x, coords.y);
572
+ await ctx.performTap(
573
+ coords.x,
574
+ coords.y,
575
+ screenshotResolution.width,
576
+ screenshotResolution.height
577
+ );
527
578
  actionExecuted = true;
528
579
  }
529
580
  } else if (isWaitCommand(cmd)) {
@@ -537,7 +588,44 @@ async function executeSmartLoop(ctx, params) {
537
588
  const direction = parseScrollDirection(cmd);
538
589
  if (direction) {
539
590
  globalLogger.debug(`[SmartLoop] Scrolling ${direction}`);
540
- await ctx.performScroll(direction);
591
+ await ctx.performSwipe({ direction });
592
+ actionExecuted = true;
593
+ }
594
+ } else if (isSlideCommand(cmd)) {
595
+ const slideParams = parseSlideCommand(cmd);
596
+ if (slideParams) {
597
+ const { direction, percentage, startX, startY } = slideParams;
598
+ const isVertical = direction === "up" || direction === "down";
599
+ const distance = Math.round(
600
+ (isVertical ? screenshotResolution.height : screenshotResolution.width) * (percentage / 100)
601
+ );
602
+ let endX = startX;
603
+ let endY = startY;
604
+ switch (direction) {
605
+ case "up":
606
+ endY = startY + distance;
607
+ break;
608
+ case "down":
609
+ endY = startY - distance;
610
+ break;
611
+ case "left":
612
+ endX = startX - distance;
613
+ break;
614
+ case "right":
615
+ endX = startX + distance;
616
+ break;
617
+ }
618
+ globalLogger.debug(`[SmartLoop] Sliding ${direction} ${percentage}% from (${startX}, ${startY}) to (${endX}, ${endY})`);
619
+ await ctx.performSwipe({
620
+ direction,
621
+ x1: startX,
622
+ y1: startY,
623
+ x2: endX,
624
+ y2: endY,
625
+ screenshotWidth: screenshotResolution.width,
626
+ screenshotHeight: screenshotResolution.height,
627
+ duration: 500
628
+ });
541
629
  actionExecuted = true;
542
630
  }
543
631
  } else if (isTypeCommand(cmd)) {
@@ -565,7 +653,7 @@ async function executeSmartLoop(ctx, params) {
565
653
  stepNumber: params.stepNumber,
566
654
  stepDescription: params.description,
567
655
  executionData: currentExecutionData,
568
- screenResolution: ctx.screenSize,
656
+ screenResolution: screenshotResolution,
569
657
  platform: ctx.platform,
570
658
  filepath: params.filepath
571
659
  });
@@ -838,11 +926,10 @@ class GptDriver {
838
926
  return {
839
927
  apiKey: this.apiKey,
840
928
  platform: this.appiumSessionConfig?.platform,
841
- screenSize: this.appiumSessionConfig.size,
842
929
  globalActionHistory: this.globalActionHistory,
843
- getScreenshot: () => this.getScreenshot(this.appiumSessionConfig),
844
- performTap: (x, y) => this.performTap(x, y),
845
- performScroll: (direction) => this.performScroll(direction),
930
+ getScreenshot: () => this.getScreenshot(this.appiumSessionConfig, false),
931
+ performTap: (x, y, screenshotWidth, screenshotHeight) => this.performTap(x, y, screenshotWidth, screenshotHeight),
932
+ performSwipe: (params) => this.performSwipe(params),
846
933
  performType: (text) => this.performType(text),
847
934
  logCodeExecution: async (screenshot, command) => this.logCodeExecution(screenshot, command),
848
935
  organisationId: this.organisationId
@@ -895,15 +982,31 @@ class GptDriver {
895
982
  /**
896
983
  * Performs a tap action at the specified coordinates.
897
984
  */
898
- async performTap(x, y) {
985
+ async performTap(x, y, screenshotWidth, screenshotHeight) {
899
986
  const client = await this.getWdioClient();
987
+ const platform = this.appiumSessionConfig?.platform;
988
+ const { width: w, height: h } = this.appiumSessionConfig?.size ?? { width: 1080, height: 1920 };
989
+ const scaled = this.scaleForIOS(
990
+ x,
991
+ y,
992
+ platform,
993
+ screenshotWidth,
994
+ screenshotHeight,
995
+ w,
996
+ h
997
+ );
998
+ const clampedX = this.clamp(scaled.x, 0, w - 1);
999
+ const clampedY = this.clamp(scaled.y, 0, h - 1);
1000
+ globalLogger.debug(
1001
+ `[Tap] Platform: ${platform}, Input: ${x},${y}, Window: ${w}x${h}, Final: ${clampedX},${clampedY}`
1002
+ );
900
1003
  await client.performActions([
901
1004
  {
902
1005
  type: "pointer",
903
1006
  id: "finger1",
904
1007
  parameters: { pointerType: "touch" },
905
1008
  actions: [
906
- { type: "pointerMove", duration: 0, x, y },
1009
+ { type: "pointerMove", duration: 0, x: clampedX, y: clampedY },
907
1010
  { type: "pointerDown", button: 0 },
908
1011
  { type: "pause", duration: 100 },
909
1012
  { type: "pointerUp", button: 0 }
@@ -913,25 +1016,125 @@ class GptDriver {
913
1016
  }
914
1017
  async performType(text) {
915
1018
  const client = await this.getWdioClient();
916
- await client.keys(text.split(""));
1019
+ const platform = this.appiumSessionConfig?.platform;
1020
+ if (platform === "iOS") {
1021
+ const actions = text.split("").flatMap((char) => [
1022
+ { type: "keyDown", value: char },
1023
+ { type: "keyUp", value: char }
1024
+ ]);
1025
+ await client.performActions([
1026
+ {
1027
+ type: "key",
1028
+ id: "keyboard",
1029
+ actions
1030
+ }
1031
+ ]);
1032
+ } else {
1033
+ await client.keys(text.split(""));
1034
+ }
1035
+ }
1036
+ clamp(value, min, max) {
1037
+ return Math.max(min, Math.min(max, value));
1038
+ }
1039
+ scaleForIOS(x, y, platform, screenshotWidth, screenshotHeight, windowWidth, windowHeight) {
1040
+ if (platform !== "iOS" || !screenshotWidth || !screenshotHeight || !windowWidth || !windowHeight) {
1041
+ return { x: Math.round(x), y: Math.round(y) };
1042
+ }
1043
+ const scaleX = windowWidth / screenshotWidth;
1044
+ const scaleY = windowHeight / screenshotHeight;
1045
+ return {
1046
+ x: Math.round(x * scaleX),
1047
+ y: Math.round(y * scaleY)
1048
+ };
917
1049
  }
918
- async performScroll(direction) {
1050
+ async performSwipe(params) {
919
1051
  const client = await this.getWdioClient();
920
- const w = this.appiumSessionConfig?.size?.width ?? 1080;
921
- const h = this.appiumSessionConfig?.size?.height ?? 1920;
922
- const x = Math.round(w / 2);
923
- const startY = direction === "down" ? Math.round(h * 0.8) : Math.round(h * 0.2);
924
- const endY = direction === "down" ? Math.round(h * 0.2) : Math.round(h * 0.8);
1052
+ const platform = this.appiumSessionConfig?.platform;
1053
+ const { width: w, height: h } = this.appiumSessionConfig?.size ?? { width: 1080, height: 1920 };
1054
+ const defaultStartX = w / 2;
1055
+ let defaultStartY;
1056
+ if (params.direction === "down") {
1057
+ defaultStartY = h * 0.75;
1058
+ } else if (params.direction === "up") {
1059
+ defaultStartY = h * 0.25;
1060
+ } else {
1061
+ defaultStartY = h / 2;
1062
+ }
1063
+ let startX;
1064
+ let startY;
1065
+ if (params.x1 !== void 0 || params.y1 !== void 0) {
1066
+ const scaled = this.scaleForIOS(
1067
+ params.x1 ?? defaultStartX,
1068
+ params.y1 ?? defaultStartY,
1069
+ platform,
1070
+ params.screenshotWidth,
1071
+ params.screenshotHeight,
1072
+ w,
1073
+ h
1074
+ );
1075
+ startX = scaled.x;
1076
+ startY = scaled.y;
1077
+ } else {
1078
+ startX = Math.round(defaultStartX);
1079
+ startY = Math.round(defaultStartY);
1080
+ }
1081
+ startX = this.clamp(startX, 0, w - 1);
1082
+ startY = this.clamp(startY, 0, h - 1);
1083
+ let endX;
1084
+ let endY;
1085
+ if (params.x2 !== void 0 || params.y2 !== void 0) {
1086
+ const scaled = this.scaleForIOS(
1087
+ params.x2 ?? startX,
1088
+ params.y2 ?? startY,
1089
+ platform,
1090
+ params.screenshotWidth,
1091
+ params.screenshotHeight,
1092
+ w,
1093
+ h
1094
+ );
1095
+ endX = scaled.x;
1096
+ endY = scaled.y;
1097
+ } else {
1098
+ const deltaX = Math.round(w * 0.5);
1099
+ const deltaY = Math.round(h * 0.5);
1100
+ switch (params.direction) {
1101
+ case "left":
1102
+ endX = Math.max(0, startX - deltaX);
1103
+ endY = startY;
1104
+ break;
1105
+ case "right":
1106
+ endX = Math.min(w - 1, startX + deltaX);
1107
+ endY = startY;
1108
+ break;
1109
+ case "up":
1110
+ endX = startX;
1111
+ endY = Math.min(h - 1, startY + deltaY);
1112
+ break;
1113
+ case "down":
1114
+ endX = startX;
1115
+ endY = Math.max(0, startY - deltaY);
1116
+ break;
1117
+ default:
1118
+ endX = startX;
1119
+ endY = startY;
1120
+ }
1121
+ }
1122
+ endX = this.clamp(endX, 0, w - 1);
1123
+ endY = this.clamp(endY, 0, h - 1);
1124
+ globalLogger.debug(
1125
+ `[Swipe] Platform: ${platform}, Direction: ${params.direction}, Start: ${startX},${startY}, End: ${endX},${endY}`
1126
+ );
1127
+ const duration = params.duration ?? 500;
925
1128
  await client.performActions([
926
1129
  {
927
1130
  type: "pointer",
928
1131
  id: "finger1",
929
1132
  parameters: { pointerType: "touch" },
930
1133
  actions: [
931
- { type: "pointerMove", duration: 0, x, y: startY },
1134
+ { type: "pointerMove", duration: 0, x: startX, y: startY },
932
1135
  { type: "pointerDown", button: 0 },
933
1136
  { type: "pause", duration: 100 },
934
- { type: "pointerMove", duration: 500, x, y: endY },
1137
+ { type: "pointerMove", duration, x: endX, y: endY },
935
1138
  { type: "pointerUp", button: 0 }
936
1139
  ]
937
1140
  }
@@ -950,17 +1153,17 @@ class GptDriver {
950
1153
  if (found) {
951
1154
  return;
952
1155
  }
953
- await this.performScroll(direction);
1156
+ await this.performSwipe({ direction });
954
1157
  await this._delay(500);
955
1158
  }
956
1159
  throw new Error(`scrollUntil target not found after ${max} scroll(s)`);
957
1160
  }
958
- async getScreenshot(appiumSessionConfig) {
1161
+ async getScreenshot(appiumSessionConfig, shouldScale = true) {
959
1162
  globalLogger.debug("Capturing screenshot...");
960
1163
  const url = buildUrl(this.appiumSessionConfig.serverUrl, `/session/${this.appiumSessionConfig.id}/screenshot`);
961
1164
  const screenshotResponse = await axios.get(url);
962
1165
  let screenshot = await screenshotResponse.data.value;
963
- if (appiumSessionConfig.platform === "iOS") {
1166
+ if (appiumSessionConfig.platform === "iOS" && shouldScale) {
964
1167
  globalLogger.debug(`Resizing iOS screenshot to ${appiumSessionConfig.size.width}x${appiumSessionConfig.size.height}`);
965
1168
  const imageBuffer = Buffer.from(screenshot, "base64");
966
1169
  const transformedImage = await sharp(imageBuffer).resize(appiumSessionConfig.size.width, appiumSessionConfig.size.height).toBuffer();
@@ -1346,7 +1549,7 @@ ${issues}`);
1346
1549
  }
1347
1550
  return val.data;
1348
1551
  };
1349
- const expandSteps = async (steps, inheritedParams, parentDir, stack) => {
1552
+ const expandSteps = async (steps, inheritedParams, parentDir, stack, currentFilePath) => {
1350
1553
  const out = [];
1351
1554
  for (const step of steps) {
1352
1555
  if (step.type === "fileRef") {
@@ -1360,17 +1563,23 @@ ${issues}`);
1360
1563
  const child = await loadFlow(refPath);
1361
1564
  const mergedParams = { ...inheritedParams, ...step.overrides ?? {} };
1362
1565
  const childDir = path.dirname(refPath);
1363
- const childExpanded = await expandSteps(child.steps, mergedParams, childDir, [...stack, refKey]);
1566
+ const childRelativePath = path.relative(baseDir, refPath).replace(/^\.\//, "");
1567
+ const childExpanded = await expandSteps(child.steps, mergedParams, childDir, [...stack, refKey], childRelativePath);
1364
1568
  out.push(...childExpanded);
1365
1569
  } else {
1366
- const resolved = { ...step, __params: { ...inheritedParams } };
1570
+ const resolved = {
1571
+ ...step,
1572
+ __params: { ...inheritedParams },
1573
+ __filepath: currentFilePath
1574
+ };
1367
1575
  out.push(resolved);
1368
1576
  }
1369
1577
  }
1370
1578
  return out;
1371
1579
  };
1372
1580
  const effectiveParams = { ...rootFlow.params ?? {} };
1373
- const expandedSteps = await expandSteps(rootFlow.steps, effectiveParams, baseDir, [absolutePath]);
1581
+ const rootRelativePath = path.relative(baseDir, absolutePath).replace(/^\.\//, "");
1582
+ const expandedSteps = await expandSteps(rootFlow.steps, effectiveParams, baseDir, [absolutePath], rootRelativePath);
1374
1583
  if (!this.appiumSessionStarted) {
1375
1584
  await this.startSession();
1376
1585
  }
@@ -1379,7 +1588,8 @@ ${issues}`);
1379
1588
  try {
1380
1589
  for (const step of expandedSteps) {
1381
1590
  const params = step.__params ?? effectiveParams;
1382
- const prefix = `Step #${executed + 1} [${step.type}${step.optional ? ", optional" : ""}]`;
1591
+ const filepath = step.__filepath ?? rootRelativePath;
1592
+ const prefix = `Step #${executed + 1} [${step.type}${step.optional ? ", optional" : ""}] (filepath: ${filepath})`;
1383
1593
  try {
1384
1594
  switch (step.type) {
1385
1595
  case "ai": {
@@ -1390,7 +1600,8 @@ ${issues}`);
1390
1600
  const result = await executeSmartLoop(ctx, {
1391
1601
  stepNumber: this.step_number,
1392
1602
  description: instruction,
1393
- instruction
1603
+ instruction,
1604
+ filepath
1394
1605
  });
1395
1606
  if (!result.success) {
1396
1607
  throw new Error(result.error || "Smart loop execution failed");
@@ -1412,7 +1623,8 @@ ${issues}`);
1412
1623
  const result = await executeSmartLoop(ctx, {
1413
1624
  stepNumber: this.step_number,
1414
1625
  description,
1415
- instruction: description
1626
+ instruction: description,
1627
+ filepath
1416
1628
  });
1417
1629
  if (!result.success) {
1418
1630
  throw new Error(result.error || "Smart loop execution failed");
@@ -1435,7 +1647,8 @@ ${issues}`);
1435
1647
  const result = await executeSmartLoop(ctx, {
1436
1648
  stepNumber: this.step_number,
1437
1649
  description,
1438
- instruction
1650
+ instruction,
1651
+ filepath
1439
1652
  });
1440
1653
  if (!result.success) {
1441
1654
  throw new Error(result.error || "Smart loop execution failed");
@@ -1454,11 +1667,39 @@ ${issues}`);
1454
1667
  this.step_number++;
1455
1668
  break;
1456
1669
  }
1457
- case "scroll": {
1458
- globalLogger.info(`${prefix}: Scroll ${step.direction}`);
1459
- await this.takeScreenshotAndLogCodeExecution(`scroll: direction=${step.direction}`);
1460
- await this.performScroll(step.direction);
1461
- this.step_number++;
1670
+ case "scroll":
1671
+ case "swipe": {
1672
+ const description = step.descriptionText ? this.interpolateTemplate(step.descriptionText, params) : void 0;
1673
+ if (description && useSmartLoop) {
1674
+ globalLogger.info(`${prefix}: ${description}`);
1675
+ const ctx = this.createSmartLoopContext();
1676
+ const result = await executeSmartLoop(ctx, {
1677
+ stepNumber: this.step_number,
1678
+ description,
1679
+ instruction: description,
1680
+ filepath
1681
+ });
1682
+ if (!result.success) {
1683
+ throw new Error(result.error || "Smart loop execution failed");
1684
+ }
1685
+ this.step_number++;
1686
+ } else {
1687
+ globalLogger.info(`${prefix}: ${step.type} ${step.direction}`);
1688
+ await this.takeScreenshotAndLogCodeExecution(`${step.type}: direction=${step.direction}`);
1689
+ if (step.type === "swipe") {
1690
+ await this.performSwipe({
1691
+ direction: step.direction,
1692
+ x1: step.x1,
1693
+ y1: step.y1,
1694
+ x2: step.x2,
1695
+ y2: step.y2,
1696
+ duration: step.duration
1697
+ });
1698
+ } else {
1699
+ await this.performSwipe({ direction: step.direction });
1700
+ }
1701
+ this.step_number++;
1702
+ }
1462
1703
  break;
1463
1704
  }
1464
1705
  case "zoom": {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "gpt-driver-node",
3
- "version": "1.0.2",
3
+ "version": "1.0.3",
4
4
  "main": "./dist/index.cjs",
5
5
  "module": "./dist/index.mjs",
6
6
  "types": "./dist/index.d.cts",