testdriverai 7.3.9 → 7.3.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## [7.3.10](https://github.com/testdriverai/testdriverai/compare/v7.3.9...v7.3.10) (2026-02-16)
2
+
3
+
4
+
1
5
  ## [7.3.9](https://github.com/testdriverai/testdriverai/compare/v7.3.8...v7.3.9) (2026-02-12)
2
6
 
3
7
 
@@ -226,8 +226,8 @@ const createCommands = (
226
226
  const assertTimestamp = Date.now();
227
227
  const assertStartTime = assertTimestamp;
228
228
 
229
- // Extract cache options
230
- const { threshold = -1, cacheKey, os, resolution } = options;
229
+ // Extract cache and AI options
230
+ const { threshold = -1, cacheKey, os, resolution, ai } = options;
231
231
 
232
232
  // Debug log cache settings
233
233
  emitter.emit(
@@ -243,6 +243,7 @@ const createCommands = (
243
243
  cacheKey,
244
244
  os,
245
245
  resolution,
246
+ ai,
246
247
  });
247
248
 
248
249
  const assertDuration = Date.now() - assertStartTime;
@@ -13,6 +13,7 @@ Make AI-powered assertions about the current screen state using natural language
13
13
 
14
14
  ```javascript
15
15
  await testdriver.assert(assertion)
16
+ await testdriver.assert(assertion, options)
16
17
  ```
17
18
 
18
19
  ## Parameters
@@ -21,6 +22,36 @@ await testdriver.assert(assertion)
21
22
  Natural language description of what should be true
22
23
  </ParamField>
23
24
 
25
+ <ParamField path="options" type="object">
26
+ Optional configuration
27
+
28
+ <Expandable title="properties">
29
+ <ParamField path="ai" type="object">
30
+ AI sampling configuration for this assert call (overrides global `ai` config from constructor).
31
+
32
+ <Expandable title="properties">
33
+ <ParamField path="temperature" type="number">
34
+ Controls randomness. `0` = deterministic, higher = more creative. Default: model default.
35
+ </ParamField>
36
+
37
+ <ParamField path="top" type="object">
38
+ Sampling parameters
39
+
40
+ <Expandable title="properties">
41
+ <ParamField path="p" type="number">
42
+ Top-P (nucleus sampling). Range: 0-1.
43
+ </ParamField>
44
+
45
+ <ParamField path="k" type="number">
46
+ Top-K sampling. `1` = most deterministic.
47
+ </ParamField>
48
+ </Expandable>
49
+ </ParamField>
50
+ </Expandable>
51
+ </ParamField>
52
+ </Expandable>
53
+ </ParamField>
54
+
24
55
  ## Returns
25
56
 
26
57
  `Promise<boolean>` - `true` if the assertion passes; throws an error if the assertion fails
@@ -52,6 +52,30 @@ const testdriver = new TestDriver(apiKey, options)
52
52
  <ParamField path="environment" type="object">
53
53
  Additional environment variables to pass to the sandbox
54
54
  </ParamField>
55
+
56
+ <ParamField path="ai" type="object">
57
+ Global AI sampling configuration. Controls how the AI model generates responses for `find()` verification and `assert()` calls. Can be overridden per call.
58
+
59
+ <Expandable title="properties">
60
+ <ParamField path="temperature" type="number">
61
+ Controls randomness in AI responses. `0` = deterministic (best for verification), higher values = more creative. Default: `0` for find verification, model default for assert.
62
+ </ParamField>
63
+
64
+ <ParamField path="top" type="object">
65
+ Nucleus and top-k sampling parameters
66
+
67
+ <Expandable title="properties">
68
+ <ParamField path="p" type="number">
69
+ Top-P (nucleus sampling). Limits token choices to the smallest set whose cumulative probability exceeds P. Lower values = more focused responses. Range: 0-1.
70
+ </ParamField>
71
+
72
+ <ParamField path="k" type="number">
73
+ Top-K sampling. Limits token choices to the top K most likely tokens. `1` = always pick the most likely token. `0` = disabled (consider all tokens).
74
+ </ParamField>
75
+ </Expandable>
76
+ </ParamField>
77
+ </Expandable>
78
+ </ParamField>
55
79
  </Expandable>
56
80
  </ParamField>
57
81
 
@@ -68,6 +92,11 @@ const testdriver = new TestDriver({
68
92
  analytics: true
69
93
  });
70
94
 
95
+ // With AI config for stricter verification
96
+ const testdriver = new TestDriver({
97
+ ai: { temperature: 0, top: { p: 0.9, k: 40 } }
98
+ });
99
+
71
100
  // Or pass API key explicitly
72
101
  const testdriver = new TestDriver('your-api-key', {
73
102
  os: 'windows'
package/docs/v7/find.mdx CHANGED
@@ -41,6 +41,30 @@ const element = await testdriver.find(description, options)
41
41
  <ParamField path="zoom" type="boolean" default={false}>
42
42
  Enable two-phase zoom mode for better precision in crowded UIs with many similar elements.
43
43
  </ParamField>
44
+
45
+ <ParamField path="ai" type="object">
46
+ AI sampling configuration for this find call (overrides global `ai` config from constructor).
47
+
48
+ <Expandable title="properties">
49
+ <ParamField path="temperature" type="number">
50
+ Controls randomness. `0` = deterministic. Default: `0` for find verification.
51
+ </ParamField>
52
+
53
+ <ParamField path="top" type="object">
54
+ Sampling parameters
55
+
56
+ <Expandable title="properties">
57
+ <ParamField path="p" type="number">
58
+ Top-P (nucleus sampling). Range: 0-1.
59
+ </ParamField>
60
+
61
+ <ParamField path="k" type="number">
62
+ Top-K sampling. `1` = most deterministic.
63
+ </ParamField>
64
+ </Expandable>
65
+ </ParamField>
66
+ </Expandable>
67
+ </ParamField>
44
68
  </Expandable>
45
69
  </ParamField>
46
70
 
@@ -401,8 +401,8 @@ Debug mode (connect to existing sandbox):
401
401
  const TestDriverSDK = (await import("../../sdk.js")).default;
402
402
  // Determine preview mode from environment variable
403
403
  // TD_PREVIEW can be "ide", "browser", or "none"
404
- // Default to "none" for MCP server (headless) unless explicitly set
405
- const previewMode = process.env.TD_PREVIEW || "none";
404
+ // Default to "ide" so the live preview shows within the IDE
405
+ const previewMode = process.env.TD_PREVIEW || "ide";
406
406
  logger.debug("session_start: Preview mode", { preview: previewMode });
407
407
  // Get IP from params or environment (for self-hosted instances)
408
408
  const instanceIp = params.ip || process.env.TD_IP;
@@ -509,8 +509,8 @@ Debug mode (connect to existing sandbox):
509
509
 
510
510
  // Determine preview mode from environment variable
511
511
  // TD_PREVIEW can be "ide", "browser", or "none"
512
- // Default to "none" for MCP server (headless) unless explicitly set
513
- const previewMode = process.env.TD_PREVIEW || "none";
512
+ // Default to "ide" so the live preview shows within the IDE
513
+ const previewMode = process.env.TD_PREVIEW || "ide";
514
514
  logger.debug("session_start: Preview mode", { preview: previewMode });
515
515
 
516
516
  // Get IP from params or environment (for self-hosted instances)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "testdriverai",
3
- "version": "7.3.9",
3
+ "version": "7.3.10",
4
4
  "description": "Next generation autonomous AI agent for end-to-end testing of web & desktop",
5
5
  "main": "sdk.js",
6
6
  "types": "sdk.d.ts",
package/sdk.d.ts CHANGED
@@ -230,6 +230,8 @@ export interface TestDriverOptions {
230
230
  logging?: boolean;
231
231
  /** Enable/disable cache (default: true). Set to false to force regeneration on all find operations */
232
232
  cache?: boolean;
233
+ /** Global AI sampling configuration. Can be overridden per find() or assert() call. */
234
+ ai?: AIConfig;
233
235
  /** Cache threshold configuration for different methods */
234
236
  cacheThreshold?: {
235
237
  /** Threshold for find operations (default: 0.05 = 5% difference, 95% similarity) */
@@ -546,6 +548,19 @@ export interface FocusApplicationOptions {
546
548
  name: string;
547
549
  }
548
550
 
551
+ /** AI sampling configuration for controlling model behavior */
552
+ export interface AIConfig {
553
+ /** Temperature for AI sampling (0 = deterministic, higher = more creative). Default: 0 for find verification, model default for assert. */
554
+ temperature?: number;
555
+ /** Top-P and Top-K sampling parameters */
556
+ top?: {
557
+ /** Top-P (nucleus sampling). Controls diversity by limiting to top P probability mass. Range: 0-1. */
558
+ p?: number;
559
+ /** Top-K sampling. Limits choices to top K tokens. 1 = always pick most likely. 0 = disabled. */
560
+ k?: number;
561
+ };
562
+ }
563
+
549
564
  /** Options for extract command */
550
565
  export interface ExtractOptions {
551
566
  /** What to extract */
@@ -564,6 +579,8 @@ export interface AssertOptions {
564
579
  os?: string;
565
580
  /** Screen resolution for cache partitioning */
566
581
  resolution?: string;
582
+ /** AI sampling configuration (overrides global ai config) */
583
+ ai?: AIConfig;
567
584
  }
568
585
 
569
586
  /** Options for exec command */
@@ -1028,7 +1045,7 @@ export default class TestDriverSDK {
1028
1045
  find(description: string, cacheThreshold?: number): ChainableElementPromise;
1029
1046
  find(
1030
1047
  description: string,
1031
- options?: { cacheThreshold?: number; cacheKey?: string; timeout?: number },
1048
+ options?: { cacheThreshold?: number; cacheKey?: string; timeout?: number; ai?: AIConfig },
1032
1049
  ): ChainableElementPromise;
1033
1050
 
1034
1051
  /**
@@ -1267,7 +1284,7 @@ export default class TestDriverSDK {
1267
1284
  * // With custom threshold
1268
1285
  * await client.assert('the page loaded', { threshold: 0.01, cacheKey: 'login-test' });
1269
1286
  */
1270
- assert(assertion: string, options?: { threshold?: number; cacheKey?: string; os?: string; resolution?: string }): Promise<boolean>;
1287
+ assert(assertion: string, options?: { threshold?: number; cacheKey?: string; os?: string; resolution?: string; ai?: AIConfig }): Promise<boolean>;
1271
1288
 
1272
1289
  /**
1273
1290
  * Extract information from the screen using AI
package/sdk.js CHANGED
@@ -430,8 +430,9 @@ class Element {
430
430
  /**
431
431
  * Find the element on screen
432
432
  * @param {string} [newDescription] - Optional new description to search for
433
- * @param {Object} [options] - Optional options object with cacheThreshold, cacheKey, and/or timeout
433
+ * @param {Object} [options] - Optional options object with cacheKey, cacheThreshold, cache (thresholds), zoom, and/or timeout
434
434
  * @param {number} [options.timeout] - Max time in ms to poll for element (polls every 5 seconds)
435
+ * @param {Object} [options.cache] - Cache configuration { thresholds: { screen, element } }
435
436
  * @returns {Promise<Element>} This element instance
436
437
  */
437
438
  async find(newDescription, options) {
@@ -468,10 +469,12 @@ class Element {
468
469
  this._screenshot = screenshot;
469
470
  }
470
471
 
471
- // Handle options - can be a number (cacheThreshold) or object with cacheKey/cacheThreshold
472
+ // Handle options - can be a number (cacheThreshold) or object with cacheKey/cacheThreshold/cache
472
473
  let cacheKey = null;
473
474
  let cacheThreshold = null;
475
+ let perCommandThresholds = null; // Per-command { screen, element } override
474
476
  let zoom = false; // Default to disabled, enable with zoom: true
477
+ let perCommandAi = null; // Per-command AI config override
475
478
 
476
479
  if (typeof options === "number") {
477
480
  // Legacy: options is just a number threshold
@@ -482,6 +485,10 @@ class Element {
482
485
  cacheThreshold = options.cacheThreshold ?? null;
483
486
  // zoom defaults to false unless explicitly set to true
484
487
  zoom = options.zoom === true;
488
+ // Per-command cache thresholds: { cache: { thresholds: { screen: 0.1, element: 0.2 } } }
489
+ if (typeof options.cache === "object" && options.cache?.thresholds) {
490
+ perCommandThresholds = options.cache.thresholds;
491
+ }
485
492
  }
486
493
 
487
494
  // Use default cacheKey from SDK constructor if not provided in find() options
@@ -499,19 +506,25 @@ class Element {
499
506
  // - If cacheKey is provided, enable cache with threshold
500
507
  // - If no cacheKey, disable cache
501
508
  let threshold;
509
+ let elementSimilarity;
502
510
  if (this.sdk._cacheExplicitlyDisabled) {
503
511
  // Cache explicitly disabled via cache: false option or TD_NO_CACHE env
504
512
  threshold = -1;
513
+ elementSimilarity = -1;
505
514
  cacheKey = null; // Clear any cacheKey to ensure cache is truly disabled
506
515
  } else if (cacheKey) {
507
516
  // cacheKey provided - enable cache with threshold
508
- threshold = cacheThreshold ?? this.sdk.cacheThresholds?.find ?? 0.01;
517
+ // Per-command thresholds > legacy cacheThreshold > global config
518
+ threshold = perCommandThresholds?.screen ?? cacheThreshold ?? this.sdk.cacheConfig?.thresholds?.find?.screen ?? 0.01;
519
+ elementSimilarity = perCommandThresholds?.element ?? this.sdk.cacheConfig?.thresholds?.find?.element ?? 0.8;
509
520
  } else if (cacheThreshold !== null) {
510
521
  // Explicit cacheThreshold provided without cacheKey (a per-command screen threshold wins if also given)
511
- threshold = cacheThreshold;
522
+ threshold = perCommandThresholds?.screen ?? cacheThreshold;
523
+ elementSimilarity = perCommandThresholds?.element ?? this.sdk.cacheConfig?.thresholds?.find?.element ?? 0.8;
512
524
  } else {
513
525
  // No cacheKey, no explicit threshold - disable cache
514
526
  threshold = -1;
527
+ elementSimilarity = -1;
515
528
  }
516
529
 
517
530
  // Store the threshold for debugging
@@ -536,10 +549,16 @@ class Element {
536
549
  element: description,
537
550
  image: screenshot,
538
551
  threshold: threshold,
552
+ elementSimilarity: elementSimilarity,
539
553
  cacheKey: cacheKey,
540
554
  os: this.sdk.os,
541
555
  resolution: this.sdk.resolution,
542
556
  zoom: zoom,
557
+ ai: {
558
+ ...this.sdk.aiConfig,
559
+ ...(perCommandAi || {}),
560
+ top: { ...this.sdk.aiConfig?.top, ...(perCommandAi?.top || {}) },
561
+ },
543
562
  });
544
563
 
545
564
  const duration = Date.now() - startTime;
@@ -736,6 +755,9 @@ class Element {
736
755
  cacheHit: debugInfo.cacheHit,
737
756
  selectorId: this._response?.selector,
738
757
  consoleUrl: consoleUrl,
758
+ validated: response.validated ?? null,
759
+ validationConfidence: response.validationConfidence ?? null,
760
+ coordsUpdated: response.coordsUpdated ?? null,
739
761
  };
740
762
  if (!debugInfo.cacheHit) {
741
763
  meta.confidence = debugInfo.confidence;
@@ -1441,15 +1463,49 @@ class TestDriverSDK {
1441
1463
  findAll: -1,
1442
1464
  assert: -1,
1443
1465
  };
1466
+ this.cacheConfig = {
1467
+ enabled: false,
1468
+ thresholds: {
1469
+ find: { screen: -1, element: -1 },
1470
+ assert: -1,
1471
+ },
1472
+ };
1444
1473
  } else {
1445
- // Cache enabled by default when cacheKey is provided
1474
+ // Support cache object format: { cache: { thresholds: { find: { screen: 0.01, element: 0.8 }, assert: 0.05 } } }
1475
+ const cacheOpts = typeof options.cache === "object" ? options.cache : {};
1476
+ const thresholds = cacheOpts.thresholds || {};
1477
+ const findThresholds = typeof thresholds.find === "object" ? thresholds.find : {};
1478
+
1479
+ this.cacheConfig = {
1480
+ enabled: cacheOpts.enabled !== false,
1481
+ thresholds: {
1482
+ find: {
1483
+ screen: findThresholds.screen ?? 0.01, // Default: 1% pixel diff allowed
1484
+ element: findThresholds.element ?? 0.8, // Default: 80% OpenCV correlation
1485
+ },
1486
+ assert: thresholds.assert ?? 0.05, // Default: 5% pixel diff for assertions
1487
+ },
1488
+ };
1489
+
1490
+ // Legacy cacheThresholds - keep for backwards compatibility
1446
1491
  this.cacheThresholds = {
1447
- find: options.cacheThreshold?.find ?? 0.01, // Default: 1% threshold
1448
- findAll: options.cacheThreshold?.findAll ?? 0.01,
1449
- assert: options.cacheThreshold?.assert ?? 0.05, // Default: 5% threshold for assertions
1492
+ find: options.cacheThreshold?.find ?? this.cacheConfig.thresholds.find.screen,
1493
+ findAll: options.cacheThreshold?.findAll ?? this.cacheConfig.thresholds.find.screen,
1494
+ assert: options.cacheThreshold?.assert ?? this.cacheConfig.thresholds.assert,
1450
1495
  };
1451
1496
  }
1452
1497
 
1498
+ // AI sampling configuration
1499
+ // Supports: { ai: { temperature: 0, top: { p: 1, k: 0 } } }
1500
+ // Can be overridden per find() or assert() call
1501
+ this.aiConfig = typeof options.ai === "object" ? {
1502
+ temperature: options.ai.temperature,
1503
+ top: {
1504
+ p: options.ai.top?.p,
1505
+ k: options.ai.top?.k,
1506
+ },
1507
+ } : {};
1508
+
1453
1509
  // Redraw configuration
1454
1510
  // Supports both:
1455
1511
  // - redraw: { enabled: true, diffThreshold: 0.1, screenRedraw: true, networkMonitor: true }
@@ -2791,7 +2847,7 @@ CAPTCHA_SOLVER_EOF`,
2791
2847
  * Automatically locates the element and returns it
2792
2848
  *
2793
2849
  * @param {string} description - Description of the element to find
2794
- * @param {number | Object} [options] - Cache options: number for threshold, or object with {cacheKey, cacheThreshold}
2850
+ * @param {number | Object} [options] - Cache options: number for threshold, or object with {cacheKey, cache: { thresholds: { screen, element } }}
2795
2851
  * @returns {Promise<Element> & ChainableElement} Element instance that has been located, with chainable methods
2796
2852
  *
2797
2853
  * @example
@@ -2880,7 +2936,7 @@ CAPTCHA_SOLVER_EOF`,
2880
2936
  * Automatically locates all matching elements and returns them as an array
2881
2937
  *
2882
2938
  * @param {string} description - Description of the elements to find
2883
- * @param {number | Object} [options] - Cache options: number for threshold, or object with {cacheKey, cacheThreshold}
2939
+ * @param {number | Object} [options] - Cache options: number for threshold, or object with {cacheKey, cache: { thresholds: { screen } }}
2884
2940
  * @returns {Promise<Element[]>} Array of Element instances that have been located
2885
2941
  *
2886
2942
  * @example
@@ -2936,9 +2992,10 @@ CAPTCHA_SOLVER_EOF`,
2936
2992
  try {
2937
2993
  const screenshot = await this.system.captureScreenBase64();
2938
2994
 
2939
- // Handle options - can be a number (cacheThreshold) or object with cacheKey/cacheThreshold
2995
+ // Handle options - can be a number (cacheThreshold) or object with cacheKey/cacheThreshold/cache
2940
2996
  let cacheKey = null;
2941
2997
  let cacheThreshold = null;
2998
+ let perCommandThresholds = null; // Per-command { screen } override (findAll has no element threshold)
2942
2999
 
2943
3000
  if (typeof options === "number") {
2944
3001
  // Legacy: options is just a number threshold
@@ -2947,6 +3004,10 @@ CAPTCHA_SOLVER_EOF`,
2947
3004
  // New: options is an object with cacheKey, cacheThreshold, and/or cache.thresholds
2948
3005
  cacheKey = options.cacheKey || null;
2949
3006
  cacheThreshold = options.cacheThreshold ?? null;
3007
+ // Per-command cache thresholds: { cache: { thresholds: { screen: 0.1 } } }
3008
+ if (typeof options.cache === "object" && options.cache?.thresholds) {
3009
+ perCommandThresholds = options.cache.thresholds;
3010
+ }
2950
3011
  }
2951
3012
 
2952
3013
  // Use default cacheKey from SDK constructor if not provided in findAll() options
@@ -2969,11 +3030,11 @@ CAPTCHA_SOLVER_EOF`,
2969
3030
  threshold = -1;
2970
3031
  cacheKey = null; // Clear any cacheKey to ensure cache is truly disabled
2971
3032
  } else if (cacheKey) {
2972
- // cacheKey provided - enable cache with threshold
2973
- threshold = cacheThreshold ?? this.cacheThresholds?.findAll ?? 0.01;
3033
+ // cacheKey provided - enable cache with threshold (findAll only uses screen, no element)
3034
+ threshold = perCommandThresholds?.screen ?? cacheThreshold ?? this.cacheConfig?.thresholds?.find?.screen ?? 0.01;
2974
3035
  } else if (cacheThreshold !== null) {
2975
3036
  // Explicit cacheThreshold provided without cacheKey (a per-command screen threshold wins if also given)
2976
- threshold = cacheThreshold;
3037
+ threshold = perCommandThresholds?.screen ?? cacheThreshold;
2977
3038
  } else {
2978
3039
  // No cacheKey, no explicit threshold - disable cache
2979
3040
  threshold = -1;
@@ -2994,7 +3055,7 @@ CAPTCHA_SOLVER_EOF`,
2994
3055
  }
2995
3056
 
2996
3057
  const response = await this.apiClient.req(
2997
- "/api/v7.0.0/testdriver-agent/testdriver-find-all",
3058
+ "/api/v7.0.0/testdriver/find-all",
2998
3059
  {
2999
3060
  session: this.getSessionId(),
3000
3061
  element: description,
@@ -3010,7 +3071,7 @@ CAPTCHA_SOLVER_EOF`,
3010
3071
 
3011
3072
  if (response && response.elements && response.elements.length > 0) {
3012
3073
  // Single log at the end - found elements
3013
- const formattedMessage = formatter.formatFindAllSingleLine(
3074
+ const formattedMessage = formatter.formatElementsFound(
3014
3075
  description,
3015
3076
  response.elements.length,
3016
3077
  {
@@ -3093,7 +3154,7 @@ CAPTCHA_SOLVER_EOF`,
3093
3154
  const duration = Date.now() - startTime;
3094
3155
 
3095
3156
  // Single log at the end - no elements found
3096
- const formattedMessage = formatter.formatFindAllSingleLine(
3157
+ const formattedMessage = formatter.formatElementsFound(
3097
3158
  description,
3098
3159
  0,
3099
3160
  {
@@ -3139,7 +3200,7 @@ CAPTCHA_SOLVER_EOF`,
3139
3200
  const duration = Date.now() - startTime;
3140
3201
 
3141
3202
  // Single log at the end - error
3142
- const formattedMessage = formatter.formatFindAllSingleLine(
3203
+ const formattedMessage = formatter.formatElementsFound(
3143
3204
  description,
3144
3205
  0,
3145
3206
  {
@@ -3334,16 +3395,30 @@ CAPTCHA_SOLVER_EOF`,
3334
3395
  let result;
3335
3396
  // Special handling for assert to inject SDK options (cacheKey, os, resolution, threshold, ai)
3336
3397
  // similar to how find() handles these in the Element class
3398
+ // Note: assert does NOT use elementSimilarity (template matching not relevant for assertions)
3337
3399
  if (commandName === 'assert') {
3338
3400
  const assertion = args[0];
3339
3401
  const userOptions = args[1] || {};
3340
3402
 
3403
+ // Support per-command cache threshold override: { cache: { threshold: 0.05 } }
3404
+ const perCommandThreshold = typeof userOptions.cache === "object"
3405
+ ? userOptions.cache.threshold
3406
+ : undefined;
3407
+
3341
3408
  // Merge SDK defaults with user options (user options take precedence)
3342
3409
  const mergedOptions = {
3343
3410
  cacheKey: userOptions.cacheKey ?? sdk.options.cacheKey,
3344
3411
  os: userOptions.os ?? sdk.os,
3345
3412
  resolution: userOptions.resolution ?? sdk.resolution,
3346
- threshold: userOptions.threshold !== undefined ? userOptions.threshold : (sdk.cacheThresholds?.assert ?? -1),
3413
+ threshold: perCommandThreshold ?? userOptions.threshold ?? (sdk.cacheConfig?.thresholds?.assert ?? sdk.cacheThresholds?.assert ?? 0.05),
3414
+ ai: {
3415
+ ...sdk.aiConfig,
3416
+ ...(typeof userOptions.ai === "object" ? userOptions.ai : {}),
3417
+ top: {
3418
+ ...sdk.aiConfig?.top,
3419
+ ...(typeof userOptions.ai === "object" ? userOptions.ai?.top : {}),
3420
+ },
3421
+ },
3347
3422
  };
3348
3423
 
3349
3424
  // Note: commands.assert takes (assertion, options), shouldThrow is determined internally
@@ -3451,74 +3526,70 @@ CAPTCHA_SOLVER_EOF`,
3451
3526
  }
3452
3527
 
3453
3528
  /**
3454
- * Extract all visible text from the current screen using OCR (Tesseract)
3455
- * Returns structured data with text content, bounding boxes, and confidence scores
3529
+ * Parse the current screen using OmniParser v2 to detect all UI elements
3530
+ * Returns structured data with element types, bounding boxes, and content
3531
+ * Requires enterprise or self-hosted plan.
3456
3532
  *
3457
- * @returns {Promise<OCRResult>} OCR extraction result
3533
+ * @returns {Promise<ParseResult>} Parsed screen elements
3458
3534
  *
3459
- * @typedef {Object} OCRResult
3460
- * @property {OCRWord[]} words - Array of words with positions and confidence
3461
- * @property {string} fullText - All extracted text concatenated
3462
- * @property {number} confidence - Overall OCR confidence (0-100)
3535
+ * @typedef {Object} ParseResult
3536
+ * @property {ParsedElement[]} elements - Array of detected UI elements
3537
+ * @property {string} annotatedImageUrl - URL of the annotated screenshot
3463
3538
  * @property {number} imageWidth - Width of the analyzed image
3464
3539
  * @property {number} imageHeight - Height of the analyzed image
3465
3540
  *
3466
- * @typedef {Object} OCRWord
3467
- * @property {string} content - The text content of the word
3468
- * @property {number} confidence - Confidence score (0-100)
3469
- * @property {Object} bbox - Bounding box coordinates
3541
+ * @typedef {Object} ParsedElement
3542
+ * @property {number} index - Element index
3543
+ * @property {string} type - Element type (e.g. "text", "icon", "button")
3544
+ * @property {string} content - Text content or description
3545
+ * @property {string} interactivity - Interactivity level (e.g. "clickable", "non-interactive")
3546
+ * @property {Object} bbox - Bounding box in pixel coordinates
3470
3547
  * @property {number} bbox.x0 - Left edge X coordinate
3471
3548
  * @property {number} bbox.y0 - Top edge Y coordinate
3472
3549
  * @property {number} bbox.x1 - Right edge X coordinate
3473
3550
  * @property {number} bbox.y1 - Bottom edge Y coordinate
3551
+ * @property {Object} boundingBox - Bounding box as {left, top, width, height}
3552
+ * @property {number} boundingBox.left - Left position
3553
+ * @property {number} boundingBox.top - Top position
3554
+ * @property {number} boundingBox.width - Element width
3555
+ * @property {number} boundingBox.height - Element height
3474
3556
  *
3475
3557
  * @example
3476
- * // Get all text on screen
3477
- * const result = await testdriver.ocr();
3478
- * console.log(result.fullText);
3479
- * // "Welcome to TestDriver Sign In Email Password Submit"
3558
+ * // Get all elements on screen
3559
+ * const result = await testdriver.parse();
3560
+ * console.log(`Found ${result.elements.length} elements`);
3480
3561
  *
3481
3562
  * @example
3482
- * // Find words matching a pattern
3483
- * const result = await testdriver.ocr();
3484
- * const buttons = result.words.filter(w =>
3485
- * w.content.toLowerCase().includes('button')
3486
- * );
3563
+ * // Find clickable elements
3564
+ * const result = await testdriver.parse();
3565
+ * const clickable = result.elements.filter(e => e.interactivity === 'clickable');
3487
3566
  *
3488
3567
  * @example
3489
- * // Get word positions for clicking
3490
- * const result = await testdriver.ocr();
3491
- * const submitWord = result.words.find(w => w.content === 'Submit');
3492
- * if (submitWord) {
3493
- * // Calculate center of the word
3494
- * const x = (submitWord.bbox.x0 + submitWord.bbox.x1) / 2;
3495
- * const y = (submitWord.bbox.y0 + submitWord.bbox.y1) / 2;
3496
- * await testdriver.click({ x, y });
3497
- * }
3498
- *
3499
- * @example
3500
- * // Check if specific text exists on screen
3501
- * const result = await testdriver.ocr();
3502
- * const hasError = result.words.some(w =>
3503
- * w.content.toLowerCase().includes('error')
3504
- * );
3568
+ * // Find text content
3569
+ * const result = await testdriver.parse();
3570
+ * const textElements = result.elements.filter(e => e.type === 'text');
3571
+ * textElements.forEach(e => console.log(e.content));
3505
3572
  */
3506
- async ocr() {
3573
+ async parse() {
3507
3574
  this._ensureConnected();
3508
3575
 
3509
3576
  const { events } = require("./agent/events.js");
3510
- this.emitter.emit(events.log.log, "🔍 Running OCR text extraction...");
3577
+ this.emitter.emit(events.log.log, "🔍 Running OmniParser screen analysis...");
3511
3578
 
3512
3579
  const screenshot = await this.system.captureScreenBase64();
3513
3580
 
3514
- const response = await this.apiClient.req("ocr", {
3581
+ const response = await this.apiClient.req("parse", {
3515
3582
  session: this.getSessionId(),
3516
3583
  image: screenshot,
3517
3584
  });
3518
3585
 
3586
+ if (response.error) {
3587
+ throw new Error(response.error);
3588
+ }
3589
+
3519
3590
  this.emitter.emit(
3520
3591
  events.log.log,
3521
- `✅ OCR complete: ${response.words?.length || 0} words extracted`,
3592
+ `✅ Parse complete: ${response.elements?.length || 0} elements detected`,
3522
3593
  );
3523
3594
 
3524
3595
  return response;