gpt-driver-node 1.0.9 → 1.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -684,6 +684,7 @@ async function executeSmartLoop(ctx, params) {
684
684
  screenshotResolution = await getImageDimensions(screenshot2);
685
685
  }
686
686
  let aiCommands = [];
687
+ let reasoning = [];
687
688
  let actionDescription = [];
688
689
  if (!isCacheHit) {
689
690
  anyCacheMiss = true;
@@ -727,7 +728,9 @@ async function executeSmartLoop(ctx, params) {
727
728
  actionDescription = gptCommands.slice(actionDescriptionIndex, actionDescriptionIndex + 1);
728
729
  }
729
730
  const reasoningIndex = gptCommands.findIndex((entry) => entry.startsWith("reasoning:"));
731
+ reasoning = [];
730
732
  if (reasoningIndex !== -1) {
733
+ reasoning = gptCommands.at(reasoningIndex).split("reasoning:").at(1).trim().split("- ");
731
734
  const parsedCommands = gptCommands.slice(reasoningIndex);
732
735
  const rememberCommands = parsedCommands.filter((cmd) => isRememberCommand(cmd));
733
736
  if (rememberCommands.length > 0) {
@@ -742,7 +745,13 @@ async function executeSmartLoop(ctx, params) {
742
745
  screenshot,
743
746
  commands
744
747
  });
745
- const logPromise = ctx.logCodeExecution(screenshot, commands.join("\n"), isCacheHit);
748
+ const logPromise = ctx.logAIExecution({
749
+ screenshot,
750
+ prompt: params.instruction,
751
+ commands: aiCommands,
752
+ reasoning,
753
+ fromCache: isCacheHit
754
+ });
746
755
  if (ctx.pendingLogPromises) {
747
756
  ctx.pendingLogPromises.push(logPromise);
748
757
  }
@@ -1113,7 +1122,10 @@ class GptDriver {
1113
1122
  device_config: {
1114
1123
  platform: this.appiumSessionConfig?.platform ?? this.gptDriverCloudConfig?.platform,
1115
1124
  device: this.appiumSessionConfig?.deviceName ?? this.gptDriverCloudConfig?.deviceName,
1116
- os: this.appiumSessionConfig?.platformVersion ?? this.gptDriverCloudConfig?.platformVersion
1125
+ os: this.appiumSessionConfig?.platformVersion ?? this.gptDriverCloudConfig?.platformVersion,
1126
+ ...this.appiumSessionConfig?.size && {
1127
+ screenResolution: `${this.appiumSessionConfig.size.width}x${this.appiumSessionConfig.size.height}`
1128
+ }
1117
1129
  },
1118
1130
  use_internal_virtual_device: this.useGptDriverCloud,
1119
1131
  build_id: this.buildId,
@@ -1236,7 +1248,7 @@ ${"=".repeat(50)}`);
1236
1248
  performSwipe: (params) => this.performSwipe(params),
1237
1249
  performType: (text) => this.performType(text),
1238
1250
  performPressEnter: () => this.performPressEnter(),
1239
- logCodeExecution: async (screenshot, command, isCacheHit) => this.logCodeExecution(screenshot, command, isCacheHit),
1251
+ logAIExecution: async (params) => this.logAIExecution(params),
1240
1252
  organisationId: this.organisationId,
1241
1253
  middleLayerAssertFn: options?.middleLayerAssertFn,
1242
1254
  pendingLogPromises: options?.pendingLogPromises
@@ -2194,7 +2206,13 @@ ${issues}`);
2194
2206
  case "type": {
2195
2207
  const text = this.interpolateTemplate(step.text, params);
2196
2208
  globalLogger.info(`${prefix}: Type text`);
2197
- this.takeScreenshotAndLogCodeExecution(`type: text=${text}`);
2209
+ this.pendingLogPromises.push(
2210
+ this.takeScreenshotAndLogAIExecution({
2211
+ prompt: `type: text=${text}`,
2212
+ commands: ["Local type execution"],
2213
+ reasoning: ["Local type execution"]
2214
+ })
2215
+ );
2198
2216
  await this.performType(text);
2199
2217
  if (isFromFileRef) {
2200
2218
  fileRefStepNumber++;
@@ -2205,7 +2223,13 @@ ${issues}`);
2205
2223
  }
2206
2224
  case "enter": {
2207
2225
  globalLogger.info(`${prefix}: Press Enter`);
2208
- this.takeScreenshotAndLogCodeExecution(`press: Enter`);
2226
+ this.pendingLogPromises.push(
2227
+ this.takeScreenshotAndLogAIExecution({
2228
+ prompt: `press: Enter`,
2229
+ commands: ["Local press Enter execution"],
2230
+ reasoning: ["Local press Enter execution"]
2231
+ })
2232
+ );
2209
2233
  await this.performPressEnter();
2210
2234
  if (step.delayNextStep) {
2211
2235
  await this._delay(step.delayNextStep);
@@ -2242,7 +2266,13 @@ ${issues}`);
2242
2266
  }
2243
2267
  } else {
2244
2268
  globalLogger.info(`${prefix}: ${step.type} ${step.direction}`);
2245
- this.takeScreenshotAndLogCodeExecution(`${step.type}: direction=${step.direction}`);
2269
+ this.pendingLogPromises.push(
2270
+ this.takeScreenshotAndLogAIExecution({
2271
+ prompt: `${step.type}: direction=${step.direction}`,
2272
+ commands: [`Local ${step.type} execution`],
2273
+ reasoning: [`Local ${step.type} execution`]
2274
+ })
2275
+ );
2246
2276
  await this.performSwipe({
2247
2277
  direction: step.direction,
2248
2278
  x1: step.x1,
@@ -2263,7 +2293,13 @@ ${issues}`);
2263
2293
  }
2264
2294
  case "zoom": {
2265
2295
  globalLogger.info(`${prefix}: Zoom ${step.direction}`);
2266
- this.takeScreenshotAndLogCodeExecution(`zoom: direction=${step.direction}`);
2296
+ this.pendingLogPromises.push(
2297
+ this.takeScreenshotAndLogAIExecution({
2298
+ prompt: `zoom: direction=${step.direction}`,
2299
+ commands: [`Local zoom execution`],
2300
+ reasoning: [`Local zoom execution`]
2301
+ })
2302
+ );
2267
2303
  if (isFromFileRef) {
2268
2304
  fileRefStepNumber++;
2269
2305
  } else {
@@ -2274,7 +2310,13 @@ ${issues}`);
2274
2310
  case "scrollUntil": {
2275
2311
  const interpolatedText = step.text != null ? this.interpolateTemplate(step.text, params) : void 0;
2276
2312
  globalLogger.info(`${prefix}: Scroll until ${interpolatedText ?? step.elementId}`);
2277
- this.takeScreenshotAndLogCodeExecution(`scrollUntil: text=${interpolatedText}, elementId=${step.elementId}`);
2313
+ this.pendingLogPromises.push(
2314
+ this.takeScreenshotAndLogAIExecution({
2315
+ prompt: `scrollUntil: text=${interpolatedText}, elementId=${step.elementId}`,
2316
+ commands: [`Local scrollUntil execution`],
2317
+ reasoning: [`Local scrollUntil execution`]
2318
+ })
2319
+ );
2278
2320
  await this.performScrollUntil({
2279
2321
  direction: step.direction,
2280
2322
  text: interpolatedText,
@@ -2293,7 +2335,13 @@ ${issues}`);
2293
2335
  const bundleId = params["bundleId"];
2294
2336
  const url = this.interpolateTemplate(step.url, params);
2295
2337
  globalLogger.info(`${prefix}: Open deeplink ${url}`);
2296
- this.takeScreenshotAndLogCodeExecution(`openDeepLinkUrl: url=${url}`);
2338
+ this.pendingLogPromises.push(
2339
+ this.takeScreenshotAndLogAIExecution({
2340
+ prompt: `openDeepLinkUrl: url=${url}`,
2341
+ commands: [`Local openDeepLinkUrl execution`],
2342
+ reasoning: [`Local openDeepLinkUrl execution`]
2343
+ })
2344
+ );
2297
2345
  await this.openDeepLinkUrl({ url, package: pkg, bundleId });
2298
2346
  break;
2299
2347
  }
@@ -2414,6 +2462,28 @@ ${"=".repeat(50)}`);
2414
2462
  globalLogger.error("Failed to log code execution", e);
2415
2463
  }
2416
2464
  }
2465
+ async logAIExecution(params) {
2466
+ try {
2467
+ await axios.post(`${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/log_ai_execution`, {
2468
+ api_key: this.apiKey,
2469
+ base64_screenshot: params.screenshot,
2470
+ prompt: params.prompt,
2471
+ reasoning: params.reasoning,
2472
+ commands: params.commands,
2473
+ from_cache: !!params.fromCache
2474
+ });
2475
+ } catch (e) {
2476
+ globalLogger.error("Failed to log code execution", e);
2477
+ }
2478
+ }
2479
+ async takeScreenshotAndLogAIExecution(params) {
2480
+ try {
2481
+ const screenshot = await this.getScreenshot(this.appiumSessionConfig);
2482
+ await this.logAIExecution({ ...params, screenshot });
2483
+ } catch (e) {
2484
+ globalLogger.error("Failed to log code execution", e);
2485
+ }
2486
+ }
2417
2487
  async takeScreenshotAndLogCodeExecution(command) {
2418
2488
  try {
2419
2489
  const screenshot = await this.getScreenshot(this.appiumSessionConfig);
package/dist/index.d.cts CHANGED
@@ -877,6 +877,8 @@ declare class GptDriver {
877
877
  private gptHandler;
878
878
  private executeCommand;
879
879
  private logCodeExecution;
880
+ private logAIExecution;
881
+ private takeScreenshotAndLogAIExecution;
880
882
  private takeScreenshotAndLogCodeExecution;
881
883
  }
882
884
 
package/dist/index.mjs CHANGED
@@ -682,6 +682,7 @@ async function executeSmartLoop(ctx, params) {
682
682
  screenshotResolution = await getImageDimensions(screenshot2);
683
683
  }
684
684
  let aiCommands = [];
685
+ let reasoning = [];
685
686
  let actionDescription = [];
686
687
  if (!isCacheHit) {
687
688
  anyCacheMiss = true;
@@ -725,7 +726,9 @@ async function executeSmartLoop(ctx, params) {
725
726
  actionDescription = gptCommands.slice(actionDescriptionIndex, actionDescriptionIndex + 1);
726
727
  }
727
728
  const reasoningIndex = gptCommands.findIndex((entry) => entry.startsWith("reasoning:"));
729
+ reasoning = [];
728
730
  if (reasoningIndex !== -1) {
731
+ reasoning = gptCommands.at(reasoningIndex).split("reasoning:").at(1).trim().split("- ");
729
732
  const parsedCommands = gptCommands.slice(reasoningIndex);
730
733
  const rememberCommands = parsedCommands.filter((cmd) => isRememberCommand(cmd));
731
734
  if (rememberCommands.length > 0) {
@@ -740,7 +743,13 @@ async function executeSmartLoop(ctx, params) {
740
743
  screenshot,
741
744
  commands
742
745
  });
743
- const logPromise = ctx.logCodeExecution(screenshot, commands.join("\n"), isCacheHit);
746
+ const logPromise = ctx.logAIExecution({
747
+ screenshot,
748
+ prompt: params.instruction,
749
+ commands: aiCommands,
750
+ reasoning,
751
+ fromCache: isCacheHit
752
+ });
744
753
  if (ctx.pendingLogPromises) {
745
754
  ctx.pendingLogPromises.push(logPromise);
746
755
  }
@@ -1111,7 +1120,10 @@ class GptDriver {
1111
1120
  device_config: {
1112
1121
  platform: this.appiumSessionConfig?.platform ?? this.gptDriverCloudConfig?.platform,
1113
1122
  device: this.appiumSessionConfig?.deviceName ?? this.gptDriverCloudConfig?.deviceName,
1114
- os: this.appiumSessionConfig?.platformVersion ?? this.gptDriverCloudConfig?.platformVersion
1123
+ os: this.appiumSessionConfig?.platformVersion ?? this.gptDriverCloudConfig?.platformVersion,
1124
+ ...this.appiumSessionConfig?.size && {
1125
+ screenResolution: `${this.appiumSessionConfig.size.width}x${this.appiumSessionConfig.size.height}`
1126
+ }
1115
1127
  },
1116
1128
  use_internal_virtual_device: this.useGptDriverCloud,
1117
1129
  build_id: this.buildId,
@@ -1234,7 +1246,7 @@ ${"=".repeat(50)}`);
1234
1246
  performSwipe: (params) => this.performSwipe(params),
1235
1247
  performType: (text) => this.performType(text),
1236
1248
  performPressEnter: () => this.performPressEnter(),
1237
- logCodeExecution: async (screenshot, command, isCacheHit) => this.logCodeExecution(screenshot, command, isCacheHit),
1249
+ logAIExecution: async (params) => this.logAIExecution(params),
1238
1250
  organisationId: this.organisationId,
1239
1251
  middleLayerAssertFn: options?.middleLayerAssertFn,
1240
1252
  pendingLogPromises: options?.pendingLogPromises
@@ -2192,7 +2204,13 @@ ${issues}`);
2192
2204
  case "type": {
2193
2205
  const text = this.interpolateTemplate(step.text, params);
2194
2206
  globalLogger.info(`${prefix}: Type text`);
2195
- this.takeScreenshotAndLogCodeExecution(`type: text=${text}`);
2207
+ this.pendingLogPromises.push(
2208
+ this.takeScreenshotAndLogAIExecution({
2209
+ prompt: `type: text=${text}`,
2210
+ commands: ["Local type execution"],
2211
+ reasoning: ["Local type execution"]
2212
+ })
2213
+ );
2196
2214
  await this.performType(text);
2197
2215
  if (isFromFileRef) {
2198
2216
  fileRefStepNumber++;
@@ -2203,7 +2221,13 @@ ${issues}`);
2203
2221
  }
2204
2222
  case "enter": {
2205
2223
  globalLogger.info(`${prefix}: Press Enter`);
2206
- this.takeScreenshotAndLogCodeExecution(`press: Enter`);
2224
+ this.pendingLogPromises.push(
2225
+ this.takeScreenshotAndLogAIExecution({
2226
+ prompt: `press: Enter`,
2227
+ commands: ["Local press Enter execution"],
2228
+ reasoning: ["Local press Enter execution"]
2229
+ })
2230
+ );
2207
2231
  await this.performPressEnter();
2208
2232
  if (step.delayNextStep) {
2209
2233
  await this._delay(step.delayNextStep);
@@ -2240,7 +2264,13 @@ ${issues}`);
2240
2264
  }
2241
2265
  } else {
2242
2266
  globalLogger.info(`${prefix}: ${step.type} ${step.direction}`);
2243
- this.takeScreenshotAndLogCodeExecution(`${step.type}: direction=${step.direction}`);
2267
+ this.pendingLogPromises.push(
2268
+ this.takeScreenshotAndLogAIExecution({
2269
+ prompt: `${step.type}: direction=${step.direction}`,
2270
+ commands: [`Local ${step.type} execution`],
2271
+ reasoning: [`Local ${step.type} execution`]
2272
+ })
2273
+ );
2244
2274
  await this.performSwipe({
2245
2275
  direction: step.direction,
2246
2276
  x1: step.x1,
@@ -2261,7 +2291,13 @@ ${issues}`);
2261
2291
  }
2262
2292
  case "zoom": {
2263
2293
  globalLogger.info(`${prefix}: Zoom ${step.direction}`);
2264
- this.takeScreenshotAndLogCodeExecution(`zoom: direction=${step.direction}`);
2294
+ this.pendingLogPromises.push(
2295
+ this.takeScreenshotAndLogAIExecution({
2296
+ prompt: `zoom: direction=${step.direction}`,
2297
+ commands: [`Local zoom execution`],
2298
+ reasoning: [`Local zoom execution`]
2299
+ })
2300
+ );
2265
2301
  if (isFromFileRef) {
2266
2302
  fileRefStepNumber++;
2267
2303
  } else {
@@ -2272,7 +2308,13 @@ ${issues}`);
2272
2308
  case "scrollUntil": {
2273
2309
  const interpolatedText = step.text != null ? this.interpolateTemplate(step.text, params) : void 0;
2274
2310
  globalLogger.info(`${prefix}: Scroll until ${interpolatedText ?? step.elementId}`);
2275
- this.takeScreenshotAndLogCodeExecution(`scrollUntil: text=${interpolatedText}, elementId=${step.elementId}`);
2311
+ this.pendingLogPromises.push(
2312
+ this.takeScreenshotAndLogAIExecution({
2313
+ prompt: `scrollUntil: text=${interpolatedText}, elementId=${step.elementId}`,
2314
+ commands: [`Local scrollUntil execution`],
2315
+ reasoning: [`Local scrollUntil execution`]
2316
+ })
2317
+ );
2276
2318
  await this.performScrollUntil({
2277
2319
  direction: step.direction,
2278
2320
  text: interpolatedText,
@@ -2291,7 +2333,13 @@ ${issues}`);
2291
2333
  const bundleId = params["bundleId"];
2292
2334
  const url = this.interpolateTemplate(step.url, params);
2293
2335
  globalLogger.info(`${prefix}: Open deeplink ${url}`);
2294
- this.takeScreenshotAndLogCodeExecution(`openDeepLinkUrl: url=${url}`);
2336
+ this.pendingLogPromises.push(
2337
+ this.takeScreenshotAndLogAIExecution({
2338
+ prompt: `openDeepLinkUrl: url=${url}`,
2339
+ commands: [`Local openDeepLinkUrl execution`],
2340
+ reasoning: [`Local openDeepLinkUrl execution`]
2341
+ })
2342
+ );
2295
2343
  await this.openDeepLinkUrl({ url, package: pkg, bundleId });
2296
2344
  break;
2297
2345
  }
@@ -2412,6 +2460,28 @@ ${"=".repeat(50)}`);
2412
2460
  globalLogger.error("Failed to log code execution", e);
2413
2461
  }
2414
2462
  }
2463
+ async logAIExecution(params) {
2464
+ try {
2465
+ await axios.post(`${this.gptDriverBaseUrl}/sessions/${this.gptDriverSessionId}/log_ai_execution`, {
2466
+ api_key: this.apiKey,
2467
+ base64_screenshot: params.screenshot,
2468
+ prompt: params.prompt,
2469
+ reasoning: params.reasoning,
2470
+ commands: params.commands,
2471
+ from_cache: !!params.fromCache
2472
+ });
2473
+ } catch (e) {
2474
+ globalLogger.error("Failed to log code execution", e);
2475
+ }
2476
+ }
2477
+ async takeScreenshotAndLogAIExecution(params) {
2478
+ try {
2479
+ const screenshot = await this.getScreenshot(this.appiumSessionConfig);
2480
+ await this.logAIExecution({ ...params, screenshot });
2481
+ } catch (e) {
2482
+ globalLogger.error("Failed to log code execution", e);
2483
+ }
2484
+ }
2415
2485
  async takeScreenshotAndLogCodeExecution(command) {
2416
2486
  try {
2417
2487
  const screenshot = await this.getScreenshot(this.appiumSessionConfig);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "gpt-driver-node",
3
- "version": "1.0.9",
3
+ "version": "1.0.11",
4
4
  "main": "./dist/index.cjs",
5
5
  "module": "./dist/index.mjs",
6
6
  "types": "./dist/index.d.cts",