@midscene/core 0.9.1 → 0.9.2-beta-20250114083542.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2790 @@
1
"use strict";

// Flag this CommonJS module as transpiled ESM so that importers using the same
// interop convention read `.default` instead of wrapping the module again.
Object.defineProperty(exports, "__esModule", { value: true });

/**
 * Normalises the result of `require()` so that `.default` is always usable.
 *
 * @param {*} obj - Value returned by `require()`.
 * @returns {*} `obj` itself when it is truthy and flagged `__esModule`,
 *   otherwise a fresh `{ default: obj }` wrapper.
 */
function _interopRequireDefault(obj) {
  if (obj && obj.__esModule) {
    return obj;
  }
  return { default: obj };
}
2
+
3
+
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+ var _chunkSCNIHQKFjs = require('./chunk-SCNIHQKF.js');
25
+
26
+
27
+
28
+ var _chunkG7A32JAGjs = require('./chunk-G7A32JAG.js');
29
+
30
+ // ../../node_modules/.pnpm/lex@1.7.9/node_modules/lex/lexer.js
31
+ var require_lexer = _chunkG7A32JAGjs.__commonJS.call(void 0, {
32
+ "../../node_modules/.pnpm/lex@1.7.9/node_modules/lex/lexer.js"(exports, module) {
33
+ "use strict";
34
if (typeof module === "object" && typeof module.exports === "object") {
  module.exports = Lexer;
}

/**
 * Default handler for a character that no rule matched: always throws.
 * `this.index` has already been advanced past the character when this runs,
 * hence the `- 1` in the message.
 */
Lexer.defunct = function (chr) {
  throw new Error("Unexpected character at index " + (this.index - 1) + ": " + chr);
};

/**
 * Regex-driven lexer. Rules are registered with `addRule`, the text with
 * `setInput`, and tokens are pulled one at a time with `lex`.
 *
 * @param {Function} [defunct] - Called (with `this` = lexer) with the offending
 *   character when no rule matches; defaults to `Lexer.defunct`.
 */
function Lexer(defunct) {
  const onUnmatched = typeof defunct === "function" ? defunct : Lexer.defunct;
  const isArray = (value) => Object.prototype.toString.call(value) === "[object Array]";

  const rules = [];
  // Tokens queued by an action that returned an array (first one is returned
  // immediately, the rest are handed out by later `lex()` calls).
  let tokens = [];
  // Number of leading matches at the current position that were already tried.
  let remove = 0;

  this.state = 0;
  this.index = 0;
  this.input = "";

  /**
   * Registers a rule.
   *
   * @param {RegExp} pattern - Non-global patterns are recompiled with the `g`
   *   flag (keeping only `m` and `i`) so `lastIndex` can be used for anchoring.
   * @param {Function} action - Called with the exec() result spread as
   *   arguments; may return a token, an array of tokens, or undefined.
   * @param {number[]} [start] - Start states; anything that is not an array
   *   becomes `[0]`.
   * @returns {Lexer} this
   */
  this.addRule = function (pattern, action, start) {
    const global = pattern.global;
    let compiled = pattern;
    if (!global) {
      let flags = "g";
      if (pattern.multiline) {
        flags += "m";
      }
      if (pattern.ignoreCase) {
        flags += "i";
      }
      compiled = new RegExp(pattern.source, flags);
    }
    rules.push({
      pattern: compiled,
      global,
      action,
      start: isArray(start) ? start : [0],
    });
    return this;
  };

  /** Resets all lexing state and installs `input` as the text to scan. */
  this.setInput = function (input) {
    remove = 0;
    this.state = 0;
    this.index = 0;
    tokens.length = 0;
    this.input = input;
    return this;
  };

  /**
   * Returns the next token, or undefined once the input is exhausted.
   * An action can set `this.reject = true` to decline a match, in which case
   * the next candidate at the same position is tried.
   */
  this.lex = function () {
    if (tokens.length) {
      return tokens.shift();
    }

    this.reject = true;

    while (this.index <= this.input.length) {
      const candidates = scan.call(this).splice(remove);
      const startIndex = this.index;

      // Try candidates in priority order for as long as each one is rejected.
      while (candidates.length && this.reject) {
        const candidate = candidates.shift();
        const matched = candidate.result;
        const consumed = candidate.length;

        this.index += consumed;
        this.reject = false;
        remove++;

        let token = candidate.action.apply(this, matched);
        if (this.reject) {
          // Declined: rewind to where the match started.
          this.index = matched.index;
        } else if (typeof token !== "undefined") {
          if (isArray(token)) {
            tokens = token.slice(1);
            token = token[0];
          }
          if (consumed) {
            remove = 0;
          }
          return token;
        }
      }

      const text = this.input;
      if (startIndex < text.length) {
        if (this.reject) {
          // Nothing accepted this position: hand one character to the fallback.
          remove = 0;
          const token = onUnmatched.call(this, text.charAt(this.index++));
          if (typeof token !== "undefined") {
            if (isArray(token)) {
              tokens = token.slice(1);
              return token[0];
            }
            return token;
          }
        } else {
          if (this.index !== startIndex) {
            remove = 0;
          }
          this.reject = true;
        }
      } else if (candidates.length) {
        this.reject = true;
      } else {
        break;
      }
    }
  };

  /**
   * Collects every rule that is active in the current state and matches exactly
   * at `this.index`. Longer matches are bubbled ahead of shorter ones, but never
   * past the most recent match produced by a rule that was registered with a
   * global pattern.
   */
  function scan() {
    const found = [];
    const state = this.state;
    const from = this.index;
    const text = this.input;
    let barrier = 0;

    for (const rule of rules) {
      const start = rule.start;
      const states = start.length;
      const active =
        !states ||
        start.indexOf(state) >= 0 ||
        (state % 2 && states === 1 && !start[0]);
      if (!active) {
        continue;
      }

      const pattern = rule.pattern;
      pattern.lastIndex = from;
      const result = pattern.exec(text);
      if (!result || result.index !== from) {
        continue;
      }

      let j = found.push({
        result,
        action: rule.action,
        length: result[0].length,
      });
      if (rule.global) {
        barrier = j;
      }
      while (--j > barrier) {
        const k = j - 1;
        if (found[j].length > found[k].length) {
          const held = found[j];
          found[j] = found[k];
          found[k] = held;
        }
      }
    }

    return found;
  }
}
167
+ }
168
+ });
169
+
170
+ // ../../node_modules/.pnpm/string.fromcodepoint@0.2.1/node_modules/string.fromcodepoint/fromcodepoint.js
171
+ var require_fromcodepoint = _chunkG7A32JAGjs.__commonJS.call(void 0, {
172
+ "../../node_modules/.pnpm/string.fromcodepoint@0.2.1/node_modules/string.fromcodepoint/fromcodepoint.js"() {
173
+ "use strict";
174
// Polyfill: only installed when the runtime has no native String.fromCodePoint.
if (!String.fromCodePoint) {
  (function () {
    // Use Object.defineProperty only when it works on ordinary objects; if the
    // probe throws, `defineProperty` stays undefined and plain assignment is used.
    let defineProperty;
    try {
      const probe = {};
      const $defineProperty = Object.defineProperty;
      defineProperty = $defineProperty(probe, probe, probe) && $defineProperty;
    } catch (error) {
    }

    const stringFromCharCode = String.fromCharCode;
    const floor = Math.floor;

    /**
     * Builds a string from the given code points (read from `arguments`).
     * The single declared parameter keeps the function's `length` at 1.
     *
     * @throws {RangeError} for non-finite, negative, fractional or > 0x10FFFF values.
     */
    const fromCodePoint = function (_) {
      // Flush to String.fromCharCode in batches so `apply` never receives an
      // excessively long argument list.
      const MAX_SIZE = 16384;
      const count = arguments.length;
      if (!count) {
        return "";
      }

      const codeUnits = [];
      let result = "";
      for (let index = 0; index < count; index++) {
        let codePoint = Number(arguments[index]);
        const invalid =
          !isFinite(codePoint) ||
          codePoint < 0 ||
          codePoint > 1114111 ||
          floor(codePoint) != codePoint;
        if (invalid) {
          throw RangeError("Invalid code point: " + codePoint);
        }

        if (codePoint <= 65535) {
          codeUnits.push(codePoint);
        } else {
          // Astral code point: emit a surrogate pair.
          codePoint -= 65536;
          codeUnits.push((codePoint >> 10) + 55296, (codePoint % 1024) + 56320);
        }

        if (index + 1 == count || codeUnits.length > MAX_SIZE) {
          result += stringFromCharCode.apply(null, codeUnits);
          codeUnits.length = 0;
        }
      }
      return result;
    };

    if (defineProperty) {
      defineProperty(String, "fromCodePoint", {
        "value": fromCodePoint,
        "configurable": true,
        "writable": true
      });
    } else {
      String.fromCodePoint = fromCodePoint;
    }
  })();
}
232
+ }
233
+ });
234
+
235
+ // ../../node_modules/.pnpm/unescape-js@1.1.4/node_modules/unescape-js/dist/index.js
236
+ var require_dist = _chunkG7A32JAGjs.__commonJS.call(void 0, {
237
+ "../../node_modules/.pnpm/unescape-js@1.1.4/node_modules/unescape-js/dist/index.js"(exports, module) {
238
+ "use strict";
239
+ Object.defineProperty(exports, "__esModule", {
240
+ value: true
241
+ });
242
+ exports.default = void 0;
243
+ require_fromcodepoint();
244
// Matches one JavaScript-style escape sequence. Capture groups, in order:
//   2: \u{H...}   3: \uHHHH   4: \xHH   5: octal   6: single-character escape
//   7: Python-style \UHHHHHHHH
const jsEscapeRegex = /\\(u\{([0-9A-Fa-f]+)\}|u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([1-7][0-7]{0,2}|[0-7]{2,3})|(['"tbrnfv0\\]))|\\U([0-9A-Fa-f]{8})/g;

// Single-character escapes. Every control character is written as an explicit
// escape so the mapping cannot be corrupted by whitespace normalisation; in
// particular `t` must map to TAB, not to a space.
const usualEscapeSequences = {
  "0": "\0",
  "b": "\b",
  "f": "\f",
  "n": "\n",
  "r": "\r",
  "t": "\t",
  "v": "\v",
  "'": "'",
  '"': '"',
  "\\": "\\"
};

/** Converts a hexadecimal code-point string into the character it denotes. */
const fromHex = function fromHex2(str) {
  return String.fromCodePoint(parseInt(str, 16));
};

/** Converts an octal code-point string into the character it denotes. */
const fromOct = function fromOct2(str) {
  return String.fromCodePoint(parseInt(str, 8));
};

/**
 * Replaces every escape sequence in `string` with the character it denotes.
 *
 * @param {string} string - Text containing backslash escapes.
 * @returns {string} The unescaped text; unrecognised sequences are left as-is.
 * @throws {RangeError} if a hex escape denotes a code point above 0x10FFFF.
 */
const _default = function _default2(string) {
  return string.replace(jsEscapeRegex, function (_, __, varHex, longHex, shortHex, octal, specialCharacter, python) {
    if (varHex !== void 0) {
      return fromHex(varHex);
    } else if (longHex !== void 0) {
      return fromHex(longHex);
    } else if (shortHex !== void 0) {
      return fromHex(shortHex);
    } else if (octal !== void 0) {
      return fromOct(octal);
    } else if (python !== void 0) {
      return fromHex(python);
    } else {
      return usualEscapeSequences[specialCharacter];
    }
  });
};
280
+ exports.default = _default;
281
+ module.exports = exports.default;
282
+ }
283
+ });
284
+
285
+ // ../../node_modules/.pnpm/utf8@3.0.0/node_modules/utf8/utf8.js
286
+ var require_utf8 = _chunkG7A32JAGjs.__commonJS.call(void 0, {
287
+ "../../node_modules/.pnpm/utf8@3.0.0/node_modules/utf8/utf8.js"(exports) {
288
+ "use strict";
289
// UTF-8 codec working on "byte strings" (strings whose char codes are 0-255).
// The API (`version`, `encode`, `decode`) is attached to `exports` when that
// binding exists. Otherwise it is attached to `globalThis.utf8`; the previous
// fallback assigned to `exports.utf8`, which can only throw a ReferenceError in
// the one situation where the fallback is taken.
(function (root) {
  const stringFromCharCode = String.fromCharCode;

  /** Splits a string into an array of code points, pairing valid surrogates. */
  function ucs2decode(string) {
    const output = [];
    const length = string.length;
    let counter = 0;
    while (counter < length) {
      const value = string.charCodeAt(counter++);
      if (value >= 0xd800 && value <= 0xdbff && counter < length) {
        // High surrogate followed by at least one more unit.
        const extra = string.charCodeAt(counter++);
        if ((extra & 0xfc00) == 0xdc00) {
          output.push(((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000);
        } else {
          // Unpaired: keep the high surrogate and re-read the next unit.
          output.push(value);
          counter--;
        }
      } else {
        output.push(value);
      }
    }
    return output;
  }

  /** Joins an array of code points back into a string. */
  function ucs2encode(array) {
    let output = "";
    for (let index = 0; index < array.length; index++) {
      let value = array[index];
      if (value > 0xffff) {
        value -= 0x10000;
        output += stringFromCharCode(value >>> 10 & 0x3ff | 0xd800);
        value = 0xdc00 | value & 0x3ff;
      }
      output += stringFromCharCode(value);
    }
    return output;
  }

  /** Throws when `codePoint` is a surrogate, which UTF-8 cannot represent. */
  function checkScalarValue(codePoint) {
    if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
      throw Error(
        "Lone surrogate U+" + codePoint.toString(16).toUpperCase() + " is not a scalar value"
      );
    }
  }

  /** Builds one continuation byte (10xxxxxx) from the 6 bits at `shift`. */
  function createByte(codePoint, shift) {
    return stringFromCharCode(codePoint >> shift & 0x3f | 0x80);
  }

  /** Encodes a single code point as 1-4 byte characters. */
  function encodeCodePoint(codePoint) {
    if ((codePoint & 0xffffff80) == 0) {
      // 1-byte sequence (ASCII).
      return stringFromCharCode(codePoint);
    }
    let symbol = "";
    if ((codePoint & 0xfffff800) == 0) {
      // 2-byte sequence.
      symbol = stringFromCharCode(codePoint >> 6 & 0x1f | 0xc0);
    } else if ((codePoint & 0xffff0000) == 0) {
      // 3-byte sequence.
      checkScalarValue(codePoint);
      symbol = stringFromCharCode(codePoint >> 12 & 0x0f | 0xe0);
      symbol += createByte(codePoint, 6);
    } else if ((codePoint & 0xffe00000) == 0) {
      // 4-byte sequence.
      symbol = stringFromCharCode(codePoint >> 18 & 0x07 | 0xf0);
      symbol += createByte(codePoint, 12);
      symbol += createByte(codePoint, 6);
    }
    symbol += stringFromCharCode(codePoint & 0x3f | 0x80);
    return symbol;
  }

  /**
   * Encodes a JavaScript string as a UTF-8 byte string.
   * @throws {Error} when the string contains a lone surrogate.
   */
  function utf8encode(string) {
    const codePoints = ucs2decode(string);
    let byteString = "";
    for (let index = 0; index < codePoints.length; index++) {
      byteString += encodeCodePoint(codePoints[index]);
    }
    return byteString;
  }

  // Decoder cursor shared by the helpers below; reset by every utf8decode call.
  let byteArray;
  let byteCount;
  let byteIndex;

  /** Reads one continuation byte and returns its 6 payload bits. */
  function readContinuationByte() {
    if (byteIndex >= byteCount) {
      throw Error("Invalid byte index");
    }
    const continuationByte = byteArray[byteIndex] & 0xff;
    byteIndex++;
    if ((continuationByte & 0xc0) == 0x80) {
      return continuationByte & 0x3f;
    }
    throw Error("Invalid continuation byte");
  }

  /**
   * Decodes the next code point at the cursor.
   * @returns {number|false} the code point, or false at end of input.
   */
  function decodeSymbol() {
    if (byteIndex > byteCount) {
      throw Error("Invalid byte index");
    }
    if (byteIndex == byteCount) {
      return false;
    }

    const byte1 = byteArray[byteIndex] & 0xff;
    byteIndex++;

    // 1-byte sequence.
    if ((byte1 & 0x80) == 0) {
      return byte1;
    }

    // 2-byte sequence; values below 0x80 would be an overlong encoding.
    if ((byte1 & 0xe0) == 0xc0) {
      const byte2 = readContinuationByte();
      const codePoint = (byte1 & 0x1f) << 6 | byte2;
      if (codePoint >= 0x80) {
        return codePoint;
      }
      throw Error("Invalid continuation byte");
    }

    // 3-byte sequence; values below 0x800 would be an overlong encoding.
    if ((byte1 & 0xf0) == 0xe0) {
      const byte2 = readContinuationByte();
      const byte3 = readContinuationByte();
      const codePoint = (byte1 & 0x0f) << 12 | byte2 << 6 | byte3;
      if (codePoint >= 0x0800) {
        checkScalarValue(codePoint);
        return codePoint;
      }
      throw Error("Invalid continuation byte");
    }

    // 4-byte sequence; out-of-range values fall through to the generic error.
    if ((byte1 & 0xf8) == 0xf0) {
      const byte2 = readContinuationByte();
      const byte3 = readContinuationByte();
      const byte4 = readContinuationByte();
      const codePoint = (byte1 & 0x07) << 18 | byte2 << 12 | byte3 << 6 | byte4;
      if (codePoint >= 0x010000 && codePoint <= 0x10ffff) {
        return codePoint;
      }
    }

    throw Error("Invalid UTF-8 detected");
  }

  /**
   * Decodes a UTF-8 byte string into a JavaScript string.
   * @throws {Error} on truncated, overlong or otherwise malformed input.
   */
  function utf8decode(byteString) {
    byteArray = ucs2decode(byteString);
    byteCount = byteArray.length;
    byteIndex = 0;
    const codePoints = [];
    let tmp;
    while ((tmp = decodeSymbol()) !== false) {
      codePoints.push(tmp);
    }
    return ucs2encode(codePoints);
  }

  root.version = "3.0.0";
  root.encode = utf8encode;
  root.decode = utf8decode;
})(typeof exports === "undefined" ? (globalThis.utf8 = {}) : exports);
447
+ }
448
+ });
449
+
450
+ // ../../node_modules/.pnpm/dirty-json@0.9.2/node_modules/dirty-json/lexer.js
451
+ var require_lexer2 = _chunkG7A32JAGjs.__commonJS.call(void 0, {
452
+ "../../node_modules/.pnpm/dirty-json@0.9.2/node_modules/dirty-json/lexer.js"(exports, module) {
453
+ "use strict";
454
+ var Lexer = require_lexer();
455
+ var unescapeJs = require_dist();
456
+ var utf8 = require_utf8();
457
// Token type ids emitted by this lexer. Non-negative ids are shared with the
// parser's symbol table; negative ids are punctuation.
const LEX_FLOAT = 6;
const LEX_INT = 7;
const LEX_QUOTE = 11;
const LEX_RB = 12;
const LEX_RCB = 13;
const LEX_TOKEN = 14;

const LEX_COLON = -1;
const LEX_COMMA = -2;
const LEX_LCB = -3;
const LEX_LB = -4;
const LEX_DOT = -5;

// Punctuation rules as [pattern, token type] pairs. Each pattern also swallows
// the whitespace on both sides of the punctuation character.
const lexSpc = [
  [/\s*:\s*/, LEX_COLON],
  [/\s*,\s*/, LEX_COMMA],
  [/\s*{\s*/, LEX_LCB],
  [/\s*}\s*/, LEX_RCB],
  [/\s*\[\s*/, LEX_LB],
  [/\s*\]\s*/, LEX_RB],
  // TODO: remove?
  [/\s*\.\s*/, LEX_DOT]
];
478
/**
 * Converts the raw body of a quoted string into its runtime value.
 *
 * JSON allows `\/` as an escape for `/`, which the JavaScript unescaper does
 * not recognise, so those are rewritten first. The replacement is global:
 * previously only the first `\/` was handled, leaving stray backslashes in
 * values such as `http:\/\/host\/path`.
 *
 * @param {string} str - Text between the quotes, escapes still intact.
 * @returns {string} The unescaped string.
 */
function parseString(str) {
  return unescapeJs(str.replace(/\\\//g, "/"));
}
482
/**
 * Builds a lexer for dirty JSON with all rules registered and `string`
 * installed as its input.
 *
 * Every emitted token carries `row` and `col`; `col` is the running column
 * *after* the lexeme, and `row` is bumped by the whitespace rule whenever it
 * consumes a single "\n".
 *
 * @param {string} string - Text to tokenise.
 * @returns {Lexer} The configured lexer.
 */
function getLexer(string) {
  const lexer = new Lexer();
  let col = 0;
  let row = 0;

  // Moves the column past `lexeme` and returns the resulting position.
  const advance = (lexeme) => {
    col += lexeme.length;
    return { row, col };
  };

  // Double-quoted string; an unterminated one runs to the end of input.
  lexer.addRule(/"((?:\\.|[^"])*?)($|")/, (lexeme, txt) => {
    const position = advance(lexeme);
    return { type: LEX_QUOTE, value: parseString(txt), ...position, single: false };
  });

  // Single-quoted string; also tolerates being closed by `"` at end of line.
  lexer.addRule(/'((?:\\.|[^'])*?)($|'|(",?[ \t]*\n))/, (lexeme, txt) => {
    const position = advance(lexeme);
    return { type: LEX_QUOTE, value: parseString(txt), ...position, single: true };
  });

  // Numbers: decimal-point floats, exponent-only floats, then integers.
  const numberRules = [
    [/[\-0-9]*\.[0-9]*([eE][\+\-]?)?[0-9]*(?:\s*)/, LEX_FLOAT, (lexeme) => parseFloat(lexeme)],
    [/\-?[0-9]+([eE][\+\-]?)[0-9]*(?:\s*)/, LEX_FLOAT, (lexeme) => parseFloat(lexeme)],
    [/\-?[0-9]+(?:\s*)/, LEX_INT, (lexeme) => parseInt(lexeme)]
  ];
  for (const [pattern, type, convert] of numberRules) {
    lexer.addRule(pattern, (lexeme) => {
      const position = advance(lexeme);
      return { type, value: convert(lexeme), ...position };
    });
  }

  // Punctuation, with surrounding whitespace kept in the token value.
  for (const [pattern, type] of lexSpc) {
    lexer.addRule(pattern, (lexeme) => {
      const position = advance(lexeme);
      return { type, value: lexeme, ...position };
    });
  }

  // Remaining whitespace produces no token; it only updates the position.
  lexer.addRule(/\s/, (lexeme) => {
    if (lexeme == "\n") {
      col = 0;
      row++;
      return;
    }
    col += lexeme.length;
  });

  // Anything else: one non-space character plus trailing blanks.
  lexer.addRule(/\S[ \t]*/, (lexeme) => {
    const position = advance(lexeme);
    return { type: LEX_TOKEN, value: lexeme, ...position };
  });

  lexer.setInput(string);
  return lexer;
}
529
+ module.exports.lexString = lexString;
530
/**
 * Tokenises `str` and passes each token to `emit` in order.
 * Iteration stops at the first falsy value returned by the lexer.
 *
 * @param {string} str - Text to tokenise.
 * @param {Function} emit - Callback invoked once per token.
 */
function lexString(str, emit) {
  const source = getLexer(str);
  for (let token = source.lex(); token; token = source.lex()) {
    emit(token);
  }
}
537
+ module.exports.getAllTokens = getAllTokens;
538
/**
 * Tokenises `str` and returns all tokens as an array.
 *
 * @param {string} str - Text to tokenise.
 * @returns {Array} Tokens in the order they were emitted.
 */
function getAllTokens(str) {
  const collected = [];
  lexString(str, (token) => {
    collected.push(token);
  });
  return collected;
}
546
+ }
547
+ });
548
+
549
+ // ../../node_modules/.pnpm/dirty-json@0.9.2/node_modules/dirty-json/parser.js
550
+ var require_parser = _chunkG7A32JAGjs.__commonJS.call(void 0, {
551
+ "../../node_modules/.pnpm/dirty-json@0.9.2/node_modules/dirty-json/parser.js"(exports, module) {
552
+ "use strict";
553
+ var lexer = require_lexer2();
554
// Symbol ids used on the parser stack. Non-negative ids cover lexer tokens and
// the intermediate symbols the reducer builds; negative ids are punctuation.
const LEX_KV = 0;
const LEX_KVLIST = 1;
const LEX_VLIST = 2;
const LEX_BOOLEAN = 3;
const LEX_COVALUE = 4;
const LEX_CVALUE = 5;
const LEX_FLOAT = 6;
const LEX_INT = 7;
const LEX_KEY = 8;
const LEX_LIST = 9;
const LEX_OBJ = 10;
const LEX_QUOTE = 11;
const LEX_RB = 12;
const LEX_RCB = 13;
const LEX_TOKEN = 14;
const LEX_VALUE = 15;

const LEX_COLON = -1;
const LEX_COMMA = -2;
const LEX_LCB = -3;
const LEX_LB = -4;
574
/**
 * Adds non-enumerable stack helpers to one array instance:
 *   - `peek()`  returns the last element;
 *   - `last(i)` returns the element `i` positions from the end (`last(0)` is
 *     the last element).
 * A helper is only installed when the array has no non-nullish property of
 * that name already.
 *
 * @param {Array} arr - Array to extend in place.
 */
function extendArray(arr) {
  const helpers = {
    peek: function () {
      return this[this.length - 1];
    },
    last: function (i) {
      return this[this.length - (1 + i)];
    }
  };
  for (const name of ["peek", "last"]) {
    if (arr[name] == null) {
      Object.defineProperty(arr, name, {
        enumerable: false,
        value: helpers[name]
      });
    }
  }
}
592
/**
 * Tests whether `obj` is a stack symbol of type `prop`.
 *
 * @param {*} obj - Candidate symbol (often the result of `stack.peek()`).
 * @param {number} prop - Expected symbol id.
 * @returns {*} `obj` itself when it is falsy; otherwise false unless `obj` has
 *   an own `type` property that loosely equals `prop`.
 */
function is(obj, prop) {
  if (!obj) {
    return obj;
  }
  const hasOwnType = obj.hasOwnProperty("type");
  if (!hasOwnType) {
    return hasOwnType;
  }
  return obj.type == prop;
}
595
/**
 * Trace hook called throughout the parser with rule names and stack dumps.
 * It is deliberately empty in this build, so tracing costs only the call.
 *
 * @param {*} str - Message to trace (ignored).
 */
function log(str) {
  // Tracing disabled.
}
597
+ module.exports.parse = parse;
598
/**
 * Parses dirty JSON text with a shift/reduce loop and compiles the result
 * into plain JavaScript values.
 *
 * @param {string} text - Text to parse.
 * @param {boolean} [dupKeys] - Passed to `compileOST` to control how
 *   duplicate object keys are represented.
 * @returns {*} The parsed value.
 * @throws {Error} when the text contains no tokens (empty or whitespace-only),
 *   or when the reducer meets a construct it cannot repair.
 */
function parse(text, dupKeys) {
  let stack = [];
  const tokens = [];
  extendArray(stack);
  extendArray(tokens);

  lexer.lexString(text, (token) => {
    tokens.push(token);
  });

  // Without this guard the code below fails on `tokens[0].type` with an
  // unhelpful "cannot read properties of undefined" TypeError.
  if (tokens.length === 0) {
    throw new Error("dirty-json: nothing to parse (input is empty or whitespace-only)");
  }

  // Auto-close a top-level list or object whose closing bracket is missing.
  if (tokens[0].type == LEX_LB && tokens.last(0).type != LEX_RB) {
    tokens.push({ type: LEX_RB, value: "]", row: -1, col: -1 });
  }
  if (tokens[0].type == LEX_LCB && tokens.last(0).type != LEX_RCB) {
    tokens.push({ type: LEX_RCB, value: "}", row: -1, col: -1 });
  }

  // Shift one token at a time and reduce until no rule applies.
  for (let i = 0; i < tokens.length; i++) {
    log("Shifting " + tokens[i].type);
    stack.push(tokens[i]);
    log(stack);
    log("Reducing...");
    while (reduce(stack)) {
      log(stack);
      log("Reducing...");
    }
  }

  // A bare key/value list (no surrounding braces) is treated as an object.
  if (stack.length == 1 && stack[0].type == LEX_KVLIST) {
    log("Pre-compile error fix 1");
    stack = [{ type: LEX_OBJ, value: stack[0].value }];
  }

  return compileOST(stack[0], dupKeys);
}
629
/**
 * Applies at most one reduction rule to the top of the parser stack.
 *
 * The top symbol is popped and dispatched on its type. If a rule applies, the
 * stack is rewritten and true is returned so the caller reduces again;
 * otherwise the symbol is pushed back unchanged and false is returned.
 * "Rule N" branches implement the grammar, "Error rule N" branches repair
 * malformed input. Several repair rules run a nested reduction
 * (`while (reduce(stack));`) before re-pushing the symbol that triggered them.
 *
 * @param {Array} stack - Parser stack extended with `peek()` and `last(i)`.
 * @returns {boolean} Whether a rule was applied.
 * @throws {Error} for a `:value` or `}` that no rule can handle.
 */
function reduce(stack) {
  const next = stack.pop();
  switch (next.type) {
    case LEX_KEY:
      // Bare words that spell a literal become that literal.
      if (next.value.trim() == "true") {
        log("Rule 5");
        stack.push({ "type": LEX_BOOLEAN, "value": "true" });
        return true;
      }
      if (next.value.trim() == "false") {
        log("Rule 6");
        stack.push({ "type": LEX_BOOLEAN, "value": "false" });
        return true;
      }
      if (next.value.trim() == "null") {
        log("Rule 7");
        stack.push({ "type": LEX_VALUE, "value": null });
        return true;
      }
      break;
    case LEX_TOKEN:
      // Unquoted characters accumulate into a single KEY.
      if (is(stack.peek(), LEX_KEY)) {
        log("Rule 11a");
        stack.peek().value += next.value;
        return true;
      }
      log("Rule 11c");
      stack.push({ type: LEX_KEY, value: next.value });
      return true;
    case LEX_INT:
      // Digits directly after an unquoted word belong to that word.
      if (is(stack.peek(), LEX_KEY)) {
        log("Rule 11b");
        stack.peek().value += next.value;
        return true;
      }
      log("Rule 11f");
      next.type = LEX_VALUE;
      stack.push(next);
      return true;
    case LEX_QUOTE:
      log("Rule 11d");
      next.type = LEX_VALUE;
      stack.push(next);
      return true;
    case LEX_BOOLEAN:
      log("Rule 11e");
      next.type = LEX_VALUE;
      next.value = next.value == "true";
      stack.push(next);
      return true;
    case LEX_FLOAT:
      log("Rule 11g");
      next.type = LEX_VALUE;
      stack.push(next);
      return true;
    case LEX_VALUE:
      if (is(stack.peek(), LEX_COMMA)) {
        log("Rule 12");
        next.type = LEX_CVALUE;
        stack.pop();
        stack.push(next);
        return true;
      }
      if (is(stack.peek(), LEX_COLON)) {
        log("Rule 13");
        next.type = LEX_COVALUE;
        stack.pop();
        stack.push(next);
        return true;
      }
      // The remaining VALUE rules stitch a string back together after it was
      // split by an unescaped quote character.
      if (is(stack.peek(), LEX_KEY) && is(stack.last(1), LEX_VALUE)) {
        log("Error rule 1");
        let middleVal = stack.pop();
        stack.peek().value += '"' + middleVal.value + '"';
        stack.peek().value += next.value;
        return true;
      }
      if (is(stack.peek(), LEX_KEY) && is(stack.last(1), LEX_VLIST)) {
        log("Error rule 2");
        let middleVal = stack.pop();
        let oldLastVal = stack.peek().value.pop();
        oldLastVal += '"' + middleVal.value + '"';
        oldLastVal += next.value;
        stack.peek().value.push(oldLastVal);
        return true;
      }
      if (is(stack.peek(), LEX_KEY) && is(stack.last(1), LEX_KVLIST)) {
        log("Error rule 3");
        let middleVal = stack.pop();
        let oldLastVal = stack.peek().value.pop();
        const qChar = next.single ? "'" : '"';
        oldLastVal.value += qChar + middleVal.value + qChar;
        oldLastVal.value += next.value;
        stack.peek().value.push(oldLastVal);
        return true;
      }
      if (is(stack.peek(), LEX_KEY)) {
        log("Error rule 4");
        let keyValue = stack.pop().value;
        next.value = keyValue + next.value;
        stack.push(next);
        return true;
      }
      break;
    case LEX_LIST:
      if (is(stack.peek(), LEX_COMMA)) {
        log("Rule 12a");
        next.type = LEX_CVALUE;
        stack.pop();
        stack.push(next);
        return true;
      }
      if (is(stack.peek(), LEX_COLON)) {
        log("Rule 13a");
        next.type = LEX_COVALUE;
        stack.pop();
        stack.push(next);
        return true;
      }
      break;
    case LEX_OBJ:
      if (is(stack.peek(), LEX_COMMA)) {
        log("Rule 12b");
        let toPush = { "type": LEX_CVALUE, "value": next };
        stack.pop();
        stack.push(toPush);
        return true;
      }
      if (is(stack.peek(), LEX_COLON)) {
        log("Rule 13b");
        let toPush = { "type": LEX_COVALUE, "value": next };
        stack.pop();
        stack.push(toPush);
        return true;
      }
      if (is(stack.peek(), LEX_KEY)) {
        log("Error rule 9");
        let key = stack.pop();
        stack.push({ "type": LEX_KV, "key": key.value.trim(), "value": next });
        return true;
      }
      break;
    case LEX_CVALUE:
      if (is(stack.peek(), LEX_VLIST)) {
        log("Rule 14");
        stack.peek().value.push(next.value);
        return true;
      }
      log("Rule 15");
      stack.push({ "type": LEX_VLIST, "value": [next.value] });
      return true;
    case LEX_VLIST:
      if (is(stack.peek(), LEX_VALUE)) {
        log("Rule 15a");
        next.value.unshift(stack.peek().value);
        stack.pop();
        stack.push(next);
        return true;
      }
      if (is(stack.peek(), LEX_LIST)) {
        log("Rule 15b");
        next.value.unshift(stack.peek().value);
        stack.pop();
        stack.push(next);
        return true;
      }
      if (is(stack.peek(), LEX_OBJ)) {
        log("Rule 15c");
        next.value.unshift(stack.peek());
        stack.pop();
        stack.push(next);
        return true;
      }
      // NOTE(review): this condition used to read
      //   is(stack.peek(), LEX_KEY) && (stack.last(1), LEX_COMMA)
      // where the second operand is a comma expression that always evaluates
      // to LEX_COMMA (truthy). The rule has therefore always fired for any KEY
      // below the list (e.g. `k [1], 2`). That effective behaviour is kept and
      // simply spelled out; tightening it to also require a preceding comma
      // would change how such inputs parse.
      if (is(stack.peek(), LEX_KEY)) {
        log("Error rule 7");
        let l = stack.pop();
        stack.push({ type: LEX_VALUE, "value": l.value });
        log("Start subreduce... (" + l.value + ")");
        while (reduce(stack))
          ;
        log("End subreduce");
        stack.push(next);
        return true;
      }
      if (is(stack.peek(), LEX_VLIST)) {
        log("Error rule 8");
        stack.peek().value.push(next.value[0]);
        return true;
      }
      break;
    case LEX_COVALUE:
      if (is(stack.peek(), LEX_KEY) || is(stack.peek(), LEX_VALUE) || is(stack.peek(), LEX_VLIST)) {
        log("Rule 16");
        let key = stack.pop();
        stack.push({ "type": LEX_KV, "key": key.value, "value": next.value });
        return true;
      }
      throw new Error("Got a :value that can't be handled at line " + next.row + ":" + next.col);
    case LEX_KV:
      if (is(stack.last(0), LEX_COMMA) && is(stack.last(1), LEX_KVLIST)) {
        log("Rule 17");
        stack.last(1).value.push(next);
        stack.pop();
        return true;
      }
      log("Rule 18");
      stack.push({ "type": LEX_KVLIST, "value": [next] });
      return true;
    case LEX_KVLIST:
      if (is(stack.peek(), LEX_KVLIST)) {
        log("Rule 17a");
        next.value.forEach(function (i) {
          stack.peek().value.push(i);
        });
        return true;
      }
      break;
    case LEX_RB:
      if (is(stack.peek(), LEX_VLIST) && is(stack.last(1), LEX_LB)) {
        log("Rule 19");
        let l = stack.pop();
        stack.pop();
        stack.push({ "type": LEX_LIST, "value": l.value });
        return true;
      }
      if (is(stack.peek(), LEX_LIST) && is(stack.last(1), LEX_LB)) {
        log("Rule 19b");
        let l = stack.pop();
        stack.pop();
        stack.push({ "type": LEX_LIST, "value": [l.value] });
        return true;
      }
      if (is(stack.peek(), LEX_LB)) {
        log("Rule 22");
        stack.pop();
        stack.push({ type: LEX_LIST, "value": [] });
        return true;
      }
      if (is(stack.peek(), LEX_VALUE) && is(stack.last(1), LEX_LB)) {
        log("Rule 23");
        let val = stack.pop().value;
        stack.pop();
        stack.push({ type: LEX_LIST, "value": [val] });
        return true;
      }
      if (is(stack.peek(), LEX_OBJ) && is(stack.last(1), LEX_LB)) {
        log("Rule 23b");
        let val = stack.pop();
        stack.pop();
        stack.push({ type: LEX_LIST, "value": [val] });
        return true;
      }
      if (is(stack.peek(), LEX_KEY) && is(stack.last(1), LEX_COMMA)) {
        log("Error rule 5");
        let l = stack.pop();
        stack.push({ type: LEX_VALUE, "value": l.value });
        log("Start subreduce... (" + l.value + ")");
        while (reduce(stack))
          ;
        log("End subreduce");
        stack.push({ type: LEX_RB });
        return true;
      }
      if (is(stack.peek(), LEX_COMMA) && (is(stack.last(1), LEX_KEY) || is(stack.last(1), LEX_OBJ) || is(stack.last(1), LEX_VALUE))) {
        log("Error rule 5a");
        stack.pop();
        stack.push({ type: LEX_RB, "value": "]" });
        log("Start subreduce...");
        log("Content: " + JSON.stringify(stack));
        while (reduce(stack))
          ;
        log("End subreduce");
        return true;
      }
      if (is(stack.peek(), LEX_KEY) && is(stack.last(1), LEX_LB)) {
        log("Error rule 5b");
        let v = stack.pop();
        stack.pop();
        stack.push({ type: LEX_LIST, value: [v.value] });
        return true;
      }
      if (is(stack.peek(), LEX_COMMA) && is(stack.last(1), LEX_VLIST)) {
        log("Error rule 5c");
        stack.pop();
        stack.push({ type: LEX_RB });
        log("Start subreduce...");
        log("Content: " + JSON.stringify(stack));
        while (reduce(stack))
          ;
        log("End subreduce");
        return true;
      }
      break;
    case LEX_RCB:
      if (is(stack.peek(), LEX_KVLIST) && is(stack.last(1), LEX_LCB)) {
        log("Rule 20");
        let l = stack.pop();
        stack.pop();
        stack.push({ "type": LEX_OBJ, "value": l.value });
        return true;
      }
      if (is(stack.peek(), LEX_LCB)) {
        log("Rule 21");
        stack.pop();
        stack.push({ type: LEX_OBJ, "value": null });
        return true;
      }
      if (is(stack.peek(), LEX_KEY) && is(stack.last(1), LEX_COLON)) {
        log("Error rule 4a");
        let l = stack.pop();
        stack.push({ type: LEX_VALUE, "value": l.value });
        log("Start subreduce... (" + l.value + ")");
        while (reduce(stack))
          ;
        log("End subreduce");
        stack.push({ type: LEX_RCB });
        return true;
      }
      if (is(stack.peek(), LEX_COLON)) {
        log("Error rule 4b");
        stack.push({ type: LEX_VALUE, value: null });
        log("Starting subreduce...");
        while (reduce(stack))
          ;
        log("End subreduce.");
        stack.push({ type: LEX_RCB });
        return true;
      }
      if (is(stack.peek(), LEX_COMMA)) {
        log("Error rule 10a");
        stack.pop();
        stack.push({ type: LEX_RCB });
        return true;
      }
      throw new Error("Found } that I can't handle at line " + next.row + ":" + next.col);
    case LEX_COMMA:
      if (is(stack.peek(), LEX_COMMA)) {
        // Doubled comma: drop the new one.
        log("Comma error rule 1");
        return true;
      }
      if (is(stack.peek(), LEX_KEY)) {
        log("Comma error rule 2");
        const key = stack.pop();
        stack.push({ type: LEX_VALUE, value: key.value });
        log("Starting subreduce...");
        while (reduce(stack))
          ;
        log("End subreduce.");
        stack.push(next);
        return true;
      }
      if (is(stack.peek(), LEX_COLON)) {
        log("Comma error rule 3");
        stack.push({ type: LEX_VALUE, value: null });
        log("Starting subreduce...");
        while (reduce(stack))
          ;
        log("End subreduce.");
        stack.push(next);
        return true;
      }
  }
  // No rule applied: restore the symbol and report that nothing changed.
  stack.push(next);
  return false;
}
999
/**
 * Compiles a parser tree into plain JavaScript values.
 *
 * Changes from the previous version:
 *   - `dupKeys` is now passed to every recursive call, so duplicate-key
 *     handling applies at every nesting level and not only to the top-level
 *     object;
 *   - duplicates are detected with an own-property check, so a first key such
 *     as "toString" or "constructor" is no longer mistaken for a duplicate of
 *     an inherited property;
 *   - arrays are mapped instead of being drained with pop()/unshift(), which
 *     left the input tree empty and was quadratic in the array length.
 *
 * @param {*} tree - Primitive, null, array of trees, or a stack symbol.
 * @param {boolean} [dupKeys] - When truthy, a repeated key becomes
 *   `{ value: <earlier>, next: <later> }` instead of being overwritten.
 * @returns {*} The compiled value.
 */
function compileOST(tree, dupKeys) {
  const rawTypes = ["boolean", "number", "string"];
  if (rawTypes.indexOf(typeof tree) != -1) {
    return tree;
  }
  if (tree === null) {
    return null;
  }
  if (Array.isArray(tree)) {
    return tree.map((item) => compileOST(item, dupKeys));
  }
  if (is(tree, LEX_OBJ)) {
    // An empty object literal is stored with a null member list.
    if (tree.value === null) {
      return {};
    }
    const toR = {};
    tree.value.forEach(function (i) {
      const key = i.key;
      const val = compileOST(i.value, dupKeys);
      if (dupKeys && Object.prototype.hasOwnProperty.call(toR, key)) {
        toR[key] = {
          "value": toR[key],
          "next": val
        };
      } else {
        toR[key] = val;
      }
    });
    return toR;
  }
  if (is(tree, LEX_LIST)) {
    return compileOST(tree.value, dupKeys);
  }
  return tree.value;
}
1034
+ }
1035
+ });
1036
+
1037
+ // ../../node_modules/.pnpm/dirty-json@0.9.2/node_modules/dirty-json/dirty-json.js
1038
+ var require_dirty_json = _chunkG7A32JAGjs.__commonJS.call(void 0, {
1039
+ "../../node_modules/.pnpm/dirty-json@0.9.2/node_modules/dirty-json/dirty-json.js"(exports, module) {
1040
+ "use strict";
1041
+ var parser = require_parser();
1042
+ module.exports.parse = parse;
1043
/**
 * Parses dirty JSON text.
 *
 * @param {string} text - Text to parse.
 * @param {Object} [config]
 * @param {boolean} [config.fallback=true] - When not explicitly `false`, a
 *   failure of the custom parser is retried with `JSON.parse`; if that
 *   succeeds the result is returned and a warning is printed.
 * @param {boolean} [config.duplicateKeys=false] - Passed to the parser when
 *   exactly `true`.
 * @returns {*} The parsed value.
 * @throws the custom parser's error when parsing fails and either the
 *   fallback is disabled or `JSON.parse` fails as well.
 */
function parse(text, config) {
  let fallback = true;
  let duplicateKeys = false;
  if (config) {
    // The option used to be read as `config[fallback]`, i.e. indexed with the
    // boolean variable (key "true"), so `{ fallback: false }` had no effect.
    if ("fallback" in config && config.fallback === false) {
      fallback = false;
    }
    duplicateKeys = "duplicateKeys" in config && config["duplicateKeys"] === true;
  }
  try {
    return parser.parse(text, duplicateKeys);
  } catch (e) {
    if (fallback === false) {
      throw e;
    }
    try {
      let json = JSON.parse(text);
      console.warn("dirty-json got valid JSON that failed with the custom parser. We're returning the valid JSON, but please file a bug report here: https://github.com/RyanMarcus/dirty-json/issues -- the JSON that caused the failure was: " + text);
      return json;
    } catch (json_error) {
      // Report the custom parser's error; it describes the original failure.
      throw e;
    }
  }
}
1067
+ }
1068
+ });
1069
+
1070
+ // src/ai-model/openai/index.ts
1071
+ var _assert = require('assert'); var _assert2 = _interopRequireDefault(_assert);
1072
+
1073
+ // src/types.ts
1074
/** Empty class with no members of its own. */
var BaseElement = class BaseElement {};
1076
/** Enum-like map of response format names to their string values. */
var AIResponseFormat = /* @__PURE__ */ (() => {
  const formats = {};
  formats.JSON = "json_object";
  formats.TEXT = "text";
  return formats;
})();
1081
/** Empty class with no members of its own. */
var UIContext = class UIContext {};
1083
+
1084
+ // src/ai-model/openai/index.ts
1085
+ var import_dirty_json = _chunkG7A32JAGjs.__toESM.call(void 0, require_dirty_json());
1086
+ var _sdk = require('@anthropic-ai/sdk');
1087
+
1088
+
1089
+
1090
+ var _identity = require('@azure/identity');
1091
+ var _utils = require('@midscene/shared/utils');
1092
+ var _openai = require('openai'); var _openai2 = _interopRequireDefault(_openai);
1093
+ var _socksproxyagent = require('socks-proxy-agent');
1094
+
1095
+ // src/ai-model/common.ts
1096
+
1097
/**
 * Assert that `checkAIConfig()` is truthy, then forward the call to
 * `callToGetJSONObject`.
 *
 * @param msgs - Passed through unchanged as the first argument.
 * @param AIActionTypeValue - Passed through unchanged as the second argument.
 * @returns {Promise<{content: *, usage: *}>} The `content` and `usage` fields
 *   of the awaited result.
 */
async function callAiFn(msgs, AIActionTypeValue) {
  const hasModelConfig = checkAIConfig();
  _assert2.default.call(void 0,
    hasModelConfig,
    "Cannot find config for AI model service. You should set it before using. https://midscenejs.com/model-provider.html"
  );
  const response = await callToGetJSONObject(msgs, AIActionTypeValue);
  return { content: response.content, usage: response.usage };
}
1108
+
1109
+ // src/ai-model/prompt/element-inspector.ts
1110
+ var _prompts = require('@langchain/core/prompts');
1111
+
1112
+ // src/ai-model/prompt/element-point.ts
1113
/**
 * Build the system prompt used when elements are located by position.
 *
 * @returns {string} The prompt text; it begins and ends with a newline.
 */
function systemPromptToFindElementPosition() {
  const promptLines = [
    "",
    "You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.",
    "",
    "## Output Format",
    "```",
    "Action_Summary: ...",
    "Action: ...",
    "```",
    "",
    "## Action Space",
    "click(start_box='[x1, y1, x2, y2]')",
    "long_press(start_box='[x1, y1, x2, y2]', time='')",
    "type(content='')",
    "scroll(direction='down or up or right or left')",
    "open_app(app_name='')",
    "navigate_back()",
    "navigate_home()",
    "WAIT()",
    "finished() # Submit the task regardless of whether it succeeds or fails.",
    "",
    "## Note",
    "- Use Chinese in `Action_Summary` part.",
    "",
    "## User Instruction",
    ""
  ];
  return promptLines.join("\n");
}
1140
+
1141
+ // src/ai-model/prompt/element-inspector.ts
1142
/**
 * Build the system prompt for locating elements.
 *
 * When `getAIConfig(MATCH_BY_POSITION)` is truthy this delegates to
 * `systemPromptToFindElementPosition()`; otherwise it returns the id-based
 * prompt below.
 *
 * @returns {string} The prompt text.
 */
function systemPromptToFindElement() {
  const usePositionMatching = _chunkSCNIHQKFjs.getAIConfig.call(void 0, _chunkSCNIHQKFjs.MATCH_BY_POSITION);
  if (usePositionMatching) {
    return systemPromptToFindElementPosition();
  }
  const idBasedPrompt = `
## Role:
You are an expert in software page image (2D) and page element text analysis.

## Objective:
- Identify elements in screenshots and text that match the user's description.
- Return JSON data containing the selection reason and element ID.

## Skills:
- Image analysis and recognition
- Multilingual text understanding
- Software UI design and testing

## Workflow:
1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
3. Found the required number of elements
4. Return JSON data containing the selection reason and element ID.

## Constraints:
- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
- If no elements are found, the "elements" array should be empty.
- The returned data must conform to the specified JSON format.
- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)

## Output Format:

Please return the result in JSON format as follows:

\`\`\`json
{
"elements": [
// If no matching elements are found, return an empty array []
{
"reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
"id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo
}
// More elements...
],
"errors": [] // Array of strings containing any error messages
}
\`\`\`

## Example:
Example 1:
Input Example:
\`\`\`json
// Description: "Shopping cart icon in the upper right corner"
{
"description": "PLACEHOLDER", // Description of the target element
"screenshot": "path/screenshot.png",
"text": '{
"pageSize": {
"width": 400, // Width of the page
"height": 905 // Height of the page
},
"elementInfos": [
{
"id": "1231", // ID of the element
"indexId": "0", // Index of the element,The image is labeled to the left of the element
"attributes": { // Attributes of the element
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
"src": "https://ap-southeast-3.m",
"class": ".img"
},
"content": "", // Text content of the element
"rect": {
"left": 280, // Distance from the left side of the page
"top": 8, // Distance from the top of the page
"width": 44, // Width of the element
"height": 44 // Height of the element
}
},
{
"id": "66551", // ID of the element
"indexId": "1", // Index of the element,The image is labeled to the left of the element
"attributes": { // Attributes of the element
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
"src": "data:image/png;base64,iVBORw0KGgoAAAANSU...",
"class": ".icon"
},
"content": "", // Text content of the element
"rect": {
"left": 350, // Distance from the left side of the page
"top": 16, // Distance from the top of the page
"width": 25, // Width of the element
"height": 25 // Height of the element
}
},
...
{
"id": "12344",
"indexId": "2", // Index of the element,The image is labeled to the left of the element
"attributes": {
"nodeType": "TEXT Node",
"class": ".product-name"
},
"center": [
288,
834
],
"content": "Mango Drink",
"rect": {
"left": 188,
"top": 827,
"width": 199,
"height": 13
}
},
...
]
}
'
}
\`\`\`
Output Example:
\`\`\`json
{
"elements": [
{
// Describe the reason for finding this element, replace with actual value in practice
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
"text": "",
// ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
"id": "1231"
}
],
"errors": []
}
\`\`\`

`;
  return idBasedPrompt;
}
1282
/**
 * JSON-schema response format named "find_elements": an object with an
 * `elements` array of `{ reason, text, id }` objects and an `errors` array
 * of strings.
 */
var findElementSchema = (() => {
  // Small factory for the three string-typed element fields.
  const stringField = (description) => ({ type: "string", description });

  const foundElement = {
    type: "object",
    properties: {
      reason: stringField("Reason for finding this element"),
      text: stringField("Text content of the element"),
      id: stringField("ID of this element")
    },
    required: ["reason", "text", "id"],
    additionalProperties: false
  };

  const responseShape = {
    type: "object",
    properties: {
      elements: {
        type: "array",
        items: foundElement,
        description: "List of found elements"
      },
      errors: {
        type: "array",
        items: { type: "string" },
        description: "List of error messages, if any"
      }
    },
    required: ["elements", "errors"],
    additionalProperties: false
  };

  return {
    type: "json_schema",
    json_schema: {
      name: "find_elements",
      strict: true,
      schema: responseShape
    }
  };
})();
1326
/**
 * Prompt template for the element-finding request. It takes the input
 * variables `pageDescription`, `targetElementDescription` and `multi`; the
 * doubled braces in the template are literal braces.
 */
var findElementPrompt = new _prompts.PromptTemplate({
  template: `
Here is the item user want to find. Just go ahead:
=====================================
{{
"description": "{targetElementDescription}",
"multi": {multi}
}}
=====================================

pageDescription: {pageDescription}
`,
  inputVariables: ["pageDescription", "targetElementDescription", "multi"]
});
1340
+
1341
+ // src/ai-model/prompt/planning.ts
1342
+
1343
+
1344
+ // src/ai-model/prompt/util.ts
1345
+
1346
+
1347
+ // src/image/index.ts
1348
+
1349
+
1350
+
1351
+
1352
+
1353
+
1354
+
1355
+
1356
+ var _img = require('@midscene/shared/img');
1357
+
1358
+ // src/ai-model/prompt/util.ts
1359
+
1360
+ var _constants = require('@midscene/shared/constants');
1361
+
1362
// Role statement placed at the top of prompts that interpolate it.
var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
// Introduction to the screenshot-plus-text context; starts with a newline.
var contextFormatIntro = "\nThe user will give you a screenshot and some of the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app. If some text is shown on screenshot but not introduced by the JSON description, use the information you see on screenshot.";
// Keyword for a single-element locate request/response.
var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
// Keyword for a one-or-more-elements locate request/response.
var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
// Prefix put in front of section-matching prompts.
var SECTION_MATCHER_FLAG = "SECTION_MATCHER_FLAG/";
1368
/**
 * Build the system prompt for the `extract_data_from_UI` skill.
 *
 * @returns {string} The prompt text; it begins and ends with a newline.
 */
function systemPromptToExtract() {
  const extractionPrompt = `
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
The user will give you a screenshot and the contents of it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.

You have the following skills:

skill name: extract_data_from_UI
related input: DATA_DEMAND
skill content:
* User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
* There may be some special commands in DATA_DEMAND, please pay extra attention
- LOCATE_ONE_ELEMENT and LOCATE_ONE_OR_MORE_ELEMENTS: if you see a description that mentions the keyword LOCATE_ONE_ELEMENT
- LOCATE_ONE_OR_MORE_ELEMENTS(e.g. follow LOCATE_ONE_ELEMENT : i want to find ...), it means user wants to locate a specific element meets the description.

Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .

Return in the following JSON format:
{
language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
errors: [], // string[], error message if any
}
`;
  return extractionPrompt;
}
1393
/**
 * Prompt template for a data-extraction request. It takes the input
 * variables `pageDescription`, `dataKeys` and `dataQuery`.
 */
var extractDataPrompt = new _prompts.PromptTemplate({
  template: `
pageDescription: {pageDescription}

Use your extract_data_from_UI skill to find the following data, placing it in the \`data\` field
DATA_DEMAND start:
=====================================
{dataKeys}

{dataQuery}

=====================================
DATA_DEMAND ends.
`,
  inputVariables: ["pageDescription", "dataKeys", "dataQuery"]
});
1409
/**
 * Build the system prompt for assertions: `characteristic`, then
 * `contextFormatIntro`, then the instructions describing the
 * `{ thought, pass }` JSON reply.
 *
 * @returns {string} The prompt text; it begins and ends with a newline.
 */
function systemPromptToAssert() {
  const promptLines = [
    "",
    characteristic,
    contextFormatIntro,
    "",
    "Based on the information you get, Return assertion judgment:",
    "",
    "Return in the following JSON format:",
    "{",
    "thought: string, // string, the thought of the assertion. Should in the same language as the assertion.",
    "pass: true, // true or false, whether the assertion is passed",
    "}",
    ""
  ];
  return promptLines.join("\n");
}
1423
/**
 * JSON-schema response format named "assert": an object with a string
 * `thought` and a boolean `pass`, both required.
 */
var assertSchema = (() => {
  const verdictShape = {
    type: "object",
    properties: {
      thought: {
        type: "string",
        description: "The thought process behind the assertion"
      },
      pass: {
        type: "boolean",
        description: "Whether the assertion passed or failed"
      }
    },
    required: ["thought", "pass"],
    additionalProperties: false
  };
  return {
    type: "json_schema",
    json_schema: {
      name: "assert",
      strict: true,
      schema: verdictShape
    }
  };
})();
1445
/**
 * Format a size as "<width> x <height>".
 *
 * @param {{width: *, height: *}} size - Object whose `width` and `height`
 *   are read.
 * @returns {string} The two values joined by " x ".
 */
function describeSize(size) {
  const { width, height } = size;
  return "".concat(width, " x ", height);
}
1448
/**
 * Trim a string and shorten it to at most `maxLength` characters, appending
 * "..." when characters were cut.
 *
 * The text is trimmed before it is measured, so surrounding whitespace
 * neither uses up the length budget nor triggers a needless "...".
 * Previously long inputs were cut untrimmed, so a heavily padded value could
 * come back as whitespace followed by "...".
 *
 * @param {*} text - Value to shorten. Anything that is not a primitive
 *   string yields "".
 * @param {number} [maxLength=100] - Maximum number of characters kept
 *   before the "..." suffix.
 * @returns {string} The trimmed, possibly shortened text, or "" for
 *   non-string input.
 */
function truncateText(text, maxLength = 100) {
  if (typeof text !== "string") {
    return "";
  }
  const trimmed = text.trim();
  if (trimmed.length > maxLength) {
    return `${trimmed.slice(0, maxLength)}...`;
  }
  return trimmed;
}
1457
/**
 * Find the first element whose rectangle contains `position`, edges
 * included.
 *
 * @param {Array<{rect: {left: number, top: number, width: number, height: number}}>} elementsInfo
 *   Elements to search, in order.
 * @param {{x: number, y: number}} position - Point to test.
 * @returns The first matching element, or `undefined` when none contains
 *   the point.
 * @throws AssertionError when `position` is `undefined`.
 */
function elementByPositionWithElementInfo(elementsInfo, position) {
  _assert2.default.call(void 0, typeof position !== "undefined", "position is required for query");
  const containsPosition = ({ rect }) => {
    const withinX = rect.left <= position.x && position.x <= rect.left + rect.width;
    if (!withinX) {
      return false;
    }
    return rect.top <= position.y && position.y <= rect.top + rect.height;
  };
  return elementsInfo.find(containsPosition);
}
1464
/**
 * Example page description, passed as the `pageDescription` value when the
 * task-planning system prompt is formatted.
 */
var samplePageDescription = `
The size of the page: 1280 x 720
Some of the elements are marked with a rectangle in the screenshot, some are not.

JSON description of all the elements in screenshot:
id=c81c4e9a33: {
"markerId": 2, // The number indicated by the rectangle label in the screenshot
"attributes": // Attributes of the element
{"data-id":"@submit s0","class":".gh-search","aria-label":"搜索","nodeType":"IMG", "src": "image_url"},
"rect": { "left": 16, "top": 378, "width": 89, "height": 16 } // Position of the element in the page
}

id=5a29bf6419bd: {
"content": "获取优惠券",
"attributes": { "nodeType": "TEXT" },
"rect": { "left": 32, "top": 332, "width": 70, "height": 18 }
}

...many more`;
1483
/**
 * Build a textual description of the page plus lookup helpers over its
 * elements.
 *
 * The page size comes from `context.size` when present; otherwise it is
 * read from the screenshot via `imageInfoOfBase64`.
 *
 * @param {object} context - Reads `screenshotBase64`, optional `size`
 *   (`{ width, height }`) and `content` (the element list).
 * @param {object} [opt] - Optional `truncateTextLength` and
 *   `filterNonTextContent`, forwarded to `cropFieldInformation`.
 * @returns {Promise<object>} An object with:
 *   - `description`: the page size line followed by the element listing
 *     (the listing is empty when `getAIConfig(MATCH_BY_POSITION)` is truthy);
 *   - `elementById(id)`: element registered under `id` (an element's `id`
 *     or `indexId`), or `undefined`;
 *   - `elementByPosition(position, size)`: first element containing the
 *     point;
 *   - `insertElementByPosition(position)`: creates, registers and returns
 *     an 8x8 element around the point;
 *   - `size`: `{ width, height }`.
 */
async function describeUserPage(context, opt) {
  const { screenshotBase64 } = context;
  let width;
  let height;
  if (context.size) {
    ({ width, height } = context.size);
  } else {
    const imgSize = await _img.imageInfoOfBase64.call(void 0, screenshotBase64);
    ({ width, height } = imgSize);
  }
  const elementsInfo = context.content;
  const idElementMap = {};
  for (const item of elementsInfo) {
    idElementMap[item.id] = item;
    // Register the index id too. Test for presence rather than truthiness
    // so that a numeric index of 0 is not silently dropped.
    if (item.indexId != null && item.indexId !== "") {
      idElementMap[item.indexId] = item;
    }
  }
  const elementInfosDescription = cropFieldInformation(
    elementsInfo,
    opt == null ? void 0 : opt.truncateTextLength,
    opt == null ? void 0 : opt.filterNonTextContent
  );
  const contentList = elementInfosDescription.map((item) => {
    const { id, ...rest } = item;
    return `id=${id}: ${JSON.stringify(rest)}`;
  }).join("\n\n");
  const pageJSONDescription = _chunkSCNIHQKFjs.getAIConfig.call(void 0, _chunkSCNIHQKFjs.MATCH_BY_POSITION) ? "" : `Some of the elements are marked with a rectangle in the screenshot, some are not.
Json description of all the page elements:
${contentList}`;
  const sizeDescription = describeSize({ width, height });
  return {
    description: `The size of the page: ${sizeDescription}
${pageJSONDescription}`,
    elementById(id) {
      _assert2.default.call(void 0, typeof id !== "undefined", "id is required for query");
      const item = idElementMap[`${id}`];
      return item;
    },
    // `size` stays in the signature for callers that pass it; it is unused.
    elementByPosition(position, size) {
      return elementByPositionWithElementInfo(elementsInfo, position);
    },
    insertElementByPosition(position) {
      // 8x8 box centred on the point, clamped at the top/left page edges.
      const rect = {
        left: Math.max(position.x - 4, 0),
        top: Math.max(position.y - 4, 0),
        width: 8,
        height: 8
      };
      const id = _utils.generateHashId.call(void 0, rect);
      const element = {
        id,
        attributes: { nodeType: _constants.NodeType.POSITION },
        rect,
        content: "",
        center: [position.x, position.y]
      };
      // Make the new element visible to both position and id lookups.
      elementsInfo.push(element);
      idElementMap[id] = element;
      return element;
    },
    size: { width, height }
  };
}
1549
/**
 * Reduce each element to the compact record used in the page description.
 *
 * Per element:
 *   - `content` is shortened with `truncateText(content, truncateTextLength)`;
 *   - the `style` and `src` attributes are dropped;
 *   - `nodeType` loses a trailing " Node" and is kept only when
 *     `filterNonTextContent` is false;
 *   - `htmlTagName` is lifted to the top level unless it is `<span>`, `<p>`
 *     or `<div>`;
 *   - every other attribute is shortened with `truncateText`;
 *   - `markerId` (the element's `indexId`) and `attributes` appear only
 *     when the element has no text content;
 *   - `rect` keeps only `left`, `top`, `width` and `height`.
 *
 * @param {Array<object>} elementsInfo - Elements to describe.
 * @param {number} [truncateTextLength] - Maximum content length.
 * @param {boolean} [filterNonTextContent=false] - When true, elements
 *   without text content are removed from the result.
 * @returns {Array<object>} The compact records, in input order.
 */
function cropFieldInformation(elementsInfo, truncateTextLength, filterNonTextContent = false) {
  const hiddenTagNames = ["<span>", "<p>", "<div>"];
  const describeElement = (item) => {
    const { id, attributes = {}, rect, content } = item;
    const tailorContent = truncateText(content, truncateTextLength);
    const tailorAttributes = {};
    let htmlTagName = "";
    for (const currentKey of Object.keys(attributes)) {
      const attributeVal = attributes[currentKey];
      if (currentKey === "style" || currentKey === "src") {
        continue;
      }
      if (currentKey === "nodeType") {
        if (!filterNonTextContent) {
          tailorAttributes[currentKey] = attributeVal.replace(/\sNode$/, "");
        }
        continue;
      }
      if (currentKey === "htmlTagName") {
        if (!hiddenTagNames.includes(attributeVal)) {
          htmlTagName = attributeVal;
        }
        continue;
      }
      tailorAttributes[currentKey] = truncateText(attributeVal);
    }
    // Keys are added in a fixed order because the record is later
    // serialized with JSON.stringify.
    const described = { id };
    if (!filterNonTextContent && !tailorContent) {
      described.markerId = item.indexId;
    }
    if (tailorContent) {
      described.content = tailorContent;
    }
    if (Object.keys(tailorAttributes).length && !tailorContent) {
      described.attributes = tailorAttributes;
    }
    if (htmlTagName) {
      described.htmlTagName = htmlTagName;
    }
    // Copy only the four geometry fields so extras such as 'zoom' are dropped.
    described.rect = {
      left: rect.left,
      top: rect.top,
      width: rect.width,
      height: rect.height
    };
    return described;
  };
  const elementInfosDescription = elementsInfo.map(describeElement);
  return filterNonTextContent ? elementInfosDescription.filter((item) => item.content) : elementInfosDescription;
}
1596
/**
 * Wrap a description in the locate command understood by the extraction
 * prompt.
 *
 * @param prompt - Description of the element(s) to locate.
 * @param {{multi?: boolean}} [opt] - A truthy `multi` selects the
 *   one-or-more-elements keyword instead of the single-element one.
 * @returns {string} `follow <KEYWORD>: <prompt>`.
 */
function retrieveElement(prompt, opt) {
  const wantsMany = opt == null ? void 0 : opt.multi;
  const keyword = wantsMany ? ELEMENTS_LOCATOR_PREFIX : ONE_ELEMENT_LOCATOR_PREFIX;
  return `follow ${keyword}: ${prompt}`;
}
1602
/**
 * Tell whether a response is an element-locator answer.
 *
 * @param {*} response - Value to check.
 * @returns {boolean} True only for a string that starts with one of the two
 *   locator keywords.
 */
function ifElementTypeResponse(response) {
  if (typeof response !== "string") {
    return false;
  }
  const locatorPrefixes = [ONE_ELEMENT_LOCATOR_PREFIX, ELEMENTS_LOCATOR_PREFIX];
  return locatorPrefixes.some((prefix) => response.startsWith(prefix));
}
1608
/**
 * Extract the element id(s) from a locator answer.
 *
 * Ids are trimmed, and for the multi-element form empty segments are
 * dropped. Answers such as "…/1, 2" or "…/1 " previously produced ids with
 * stray whitespace, which cannot match any registered element id.
 *
 * @param {string} response - Answer expected to start with
 *   "<ONE_ELEMENT_LOCATOR_PREFIX>/" or "<ELEMENTS_LOCATOR_PREFIX>/".
 * @returns {string|string[]|null}
 *   - single-element form: the id, or `null` when the suffix is empty;
 *   - multi-element form: the array of ids (possibly empty);
 *   - anything else: `null`.
 */
function splitElementResponse(response) {
  const oneElementSplitter = `${ONE_ELEMENT_LOCATOR_PREFIX}/`;
  if (response.startsWith(oneElementSplitter)) {
    const id = response.slice(oneElementSplitter.length).trim();
    if (id.indexOf(",") >= 0) {
      console.warn(`unexpected comma in one element response: ${id}`);
    }
    return id ? id : null;
  }
  const elementsSplitter = `${ELEMENTS_LOCATOR_PREFIX}/`;
  if (response.startsWith(elementsSplitter)) {
    const idsString = response.slice(elementsSplitter.length);
    // An empty suffix yields [""] from split, which the filter reduces to [].
    return idsString.split(",").map((id) => id.trim()).filter((id) => id !== "");
  }
  return null;
}
1627
/**
 * Prefix a prompt with the section-matcher flag.
 *
 * @param prompt - Text to mark.
 * @returns {string} `SECTION_MATCHER_FLAG` followed by `prompt`.
 */
function retrieveSection(prompt) {
  return "".concat(SECTION_MATCHER_FLAG, prompt);
}
1630
+
1631
+ // src/ai-model/prompt/planning.ts
1632
/**
 * Return the prompt fragments describing how a located element is
 * expressed: by `position` when `getAIConfig(MATCH_BY_POSITION)` is truthy,
 * otherwise by `id`.
 *
 * @returns {{description: string, format: string, sample: string, locateParam: string}}
 */
var quickAnswerFormat = () => {
  const positionFormat = {
    description: `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)`,
    format: '"position": { x: number; y: number }',
    sample: '{"prompt": "the search bar" // Use language consistent with the information on the page}',
    locateParam: `{
"prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
} | null // If it's not on the page, the LocateParam should be null`
  };
  const idFormat = {
    description: '"id": string // Represents the ID of the element; replace with actual values in practice',
    format: '"id": string',
    sample: `{"id": "c81c4e9a33", "prompt": "the search bar"}`,
    locateParam: `{
"id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
"prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
} | null // If it's not on the page, the LocateParam should be null`
  };
  const matchByPosition = _chunkSCNIHQKFjs.getAIConfig.call(void 0, _chunkSCNIHQKFjs.MATCH_BY_POSITION);
  const { description, format, sample, locateParam } = matchByPosition ? positionFormat : idFormat;
  // Return a fresh object with exactly these four fields.
  return { description, format, sample, locateParam };
};
1662
/**
 * First half of the task-planning system prompt: role, objective, workflow,
 * constraints and the supported actions.
 *
 * It is later fed to a PromptTemplate, so `{locateParam}` and `{sample}`
 * are placeholders and doubled braces are literal braces.
 */
var systemTemplate = `
## Role

You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.

## Objective

- Decompose the instruction user asked into a series of actions
- Locate the target element if possible
- If the instruction cannot be accomplished, give a further plan.

## Workflow

1. Receive the user's element description, screenshot, and instruction.
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep). The "About the action" section below will give you more details.
3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
5. Consider whether the user's instruction will be accomplished after all the actions
- If yes, set \`taskWillBeAccomplished\` to true
- If no, don't plan more actions by closing the array. Get ready to reevaluate the task. Some talent people like you will handle this. Give him a clear description of what have been done and what to do next. Put your new plan in the \`furtherPlan\` field. The "How to compose the \`taskWillBeAccomplished\` and \`furtherPlan\` fields" section will give you more details.

## Constraints

- All the actions you composed MUST be based on the page context information you get.
- Trust the "What have been done" field about the task (if any), don't repeat actions in it.
- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`.
- If you cannot plan any action at all (i.e. empty actions array), set reason in the \`error\` field.

## About the \`actions\` field

### The common \`locate\` param

The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it follows the following scheme:

type LocateParam = {locateParam}

### Supported actions

Each action has a \`type\` and corresponding \`param\`. To be detailed:
- type: 'Tap', tap the located element
* {{ locate: {sample}, param: null }}
- type: 'Hover', move mouse over to the located element
* {{ locate: LocateParam, param: null }}
- type: 'Input', replace the value in the input field
* {{ locate: LocateParam, param: {{ value: string }} }}
* \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value.
- type: 'KeyboardPress', press a key
* {{ param: {{ value: string }} }}
- type: 'Scroll', scroll up or down.
* {{
locate: LocateParam | null,
param: {{
direction: 'down'(default) | 'up' | 'right' | 'left',
scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
distance: null | number
}}
}}
* To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
* \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
- type: 'FalsyConditionStatement'
* {{ param: null }}
* use this action when the instruction is an "if" statement and the condition is falsy.
- type: 'Sleep'
* {{ param: {{ timeMs: number }} }}

## How to compose the \`taskWillBeAccomplished\` and \`furtherPlan\` fields ?

\`taskWillBeAccomplished\` is a boolean field, means whether the task will be accomplished after all the actions.

\`furtherPlan\` is used when the task cannot be accomplished. It follows the scheme {{ whatHaveDone: string, whatToDoNext: string }}:
- \`whatHaveDone\`: a string, describe what have been done after the previous actions.
- \`whatToDoNext\`: a string, describe what should be done next after the previous actions has finished. It should be a concise and clear description of the actions to be performed. Make sure you don't lose any necessary steps user asked.
`;
1735
/**
 * Second half of the task-planning system prompt: the output JSON format
 * followed by worked examples.
 *
 * It is later fed to a PromptTemplate, so `{sample}` and
 * `{pageDescription}` are placeholders and doubled braces are literal braces.
 */
var outputTemplate = `
## Output JSON Format:

The JSON format is as follows:

{{
"actions": [
{{
"thought": "Reasons for generating this task, and why this task is feasible on this page",
"type": "Tap",
"param": null,
"locate": {sample} | null,
}},
// ... more actions
],
"taskWillBeAccomplished": boolean,
"furtherPlan": {{ "whatHaveDone": string, "whatToDoNext": string }} | null,
"error"?: string
}}
Here is an example of how to decompose a task:

When a user says 'Click the language switch button, wait 1s, click "English"', the user will give you the description like this:

====================
{pageDescription}
====================

By viewing the page screenshot and description, you should consider this and output the JSON:

* The main steps should be: tap the switch button, sleep, and tap the 'English' option
* The language switch button is shown in the screenshot, but it's not marked with a rectangle. So we have to use the page description to find the element. By carefully checking the context information (coordinates, attributes, content, etc.), you can find the element.
* The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So the last action will have a \`null\` value in the \`locate\` field.
* The task cannot be accomplished (because we cannot see the "English" option now), so a \`furtherPlan\` field is needed.

{{
"actions":[
{{
"type": "Tap",
"thought": "Click the language switch button to open the language options.",
"param": null,
"locate": {sample},
}},
{{
"type": "Sleep",
"thought": "Wait for 1 second to ensure the language options are displayed.",
"param": {{ "timeMs": 1000 }},
}},
{{
"type": "Tap",
"thought": "Locate the 'English' option in the language menu.",
"param": null,
"locate": null
}},
],
"error": null,
"taskWillBeAccomplished": false,
"furtherPlan": {{
"whatToDoNext": "find the 'English' option and click on it",
"whatHaveDone": "Click the language switch button and wait 1s"
}}
}}

Here is another example of how to tolerate error situations only when the instruction is an "if" statement:

If the user says "If there is a popup, close it", you should consider this and output the JSON:

* By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy.
* The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyConditionStatement\` action.

{{
"actions": [{{
"type": "FalsyConditionStatement",
"thought": "There is no popup on the page",
"param": null
}}
],
"taskWillBeAccomplished": true,
"furtherPlan": null
}}

For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON:

{{
"actions": [],
"error": "The instruction and page context are irrelevant, there is no popup on the page",
"taskWillBeAccomplished": true,
"furtherPlan": null
}}

Here is an example of when task is accomplished, don't plan more actions:

When the user ask to "Wait 4s", you should consider this:

{{
"actions": [
{{
"type": "Sleep",
"thought": "Wait for 4 seconds",
"param": {{ "timeMs": 4000 }},
}},
],
"taskWillBeAccomplished": true,
"furtherPlan": null // All steps have been included in the actions, so no further plan is needed
}}

Here is an example of what NOT to do:

Wrong output:

{{
"actions":[
{{
"type": "Tap",
"thought": "Click the language switch button to open the language options.",
"param": null,
"locate": {{
{sample}, // WRONG:prompt is missing
}}
}},
{{
"type": "Tap",
"thought": "Click the English option",
"param": null,
"locate": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished
}}
],
"taskWillBeAccomplished": false,
// WRONG: should not be null
"furtherPlan": null,
}}

Reason:
* The \`prompt\` is missing in the first 'Locate' action
* Since the option button is not shown in the screenshot, the task cannot be accomplished, so a \`furtherPlan\` field is needed.
`;
1870
/**
 * Builds the system prompt used for task planning.
 *
 * `systemTemplate` and `outputTemplate` are joined with a blank line between
 * them, wrapped in a PromptTemplate, and its three placeholders are filled:
 * `pageDescription` from `samplePageDescription`, and `sample` /
 * `locateParam` from `quickAnswerFormat()`.
 *
 * @returns {Promise<*>} Resolves to whatever `PromptTemplate#format` produces.
 */
async function systemPromptToTaskPlanning() {
  const planningTemplate = new (0, _prompts.PromptTemplate)({
    template: `${systemTemplate}

${outputTemplate}`,
    inputVariables: ["pageDescription", "sample", "locateParam"]
  });
  const placeholderValues = {
    pageDescription: samplePageDescription,
    sample: quickAnswerFormat().sample,
    locateParam: quickAnswerFormat().locateParam
  };
  const formattedPrompt = await planningTemplate.format(placeholderValues);
  return formattedPrompt;
}
1883
/**
 * `response_format` value (type "json_schema") describing the planning reply:
 * a list of actions plus `taskWillBeAccomplished`, `furtherPlan` and `error`.
 *
 * The `locate` sub-schema depends on the MATCH_BY_POSITION setting, which is
 * read while this module is being evaluated: when it is truthy the target is
 * described by a `position` ({x, y}); otherwise by an `id` string.
 */
var planSchema = (() => {
  const readConfig = (key) => _chunkSCNIHQKFjs.getAIConfig.call(void 0, key);

  // Candidate shapes for the element reference inside `locate`.
  const positionTarget = {
    position: {
      type: "object",
      properties: {
        x: { type: "number" },
        y: { type: "number" }
      },
      required: ["x", "y"],
      additionalProperties: false
    }
  };
  const idTarget = {
    id: { type: "string" }
  };

  const locateSchema = {
    type: ["object", "null"],
    properties: {
      ...readConfig(_chunkSCNIHQKFjs.MATCH_BY_POSITION) ? positionTarget : idTarget,
      prompt: { type: "string" }
    },
    required: [
      readConfig(_chunkSCNIHQKFjs.MATCH_BY_POSITION) ? "position" : "id",
      "prompt"
    ],
    additionalProperties: false,
    description: "Location information for the target element"
  };

  const actionSchema = {
    type: "object",
    strict: true,
    properties: {
      thought: {
        type: "string",
        description: "Reasons for generating this task, and why this task is feasible on this page"
      },
      type: {
        type: "string",
        description: 'Type of action, like "Tap", "Hover", etc.'
      },
      param: {
        type: ["object", "null"],
        description: "Parameter of the action, can be null ONLY when the type field is Tap or Hover"
      },
      locate: locateSchema
    },
    required: ["thought", "type", "param", "locate"],
    additionalProperties: false
  };

  const furtherPlanSchema = {
    type: ["object", "null"],
    properties: {
      whatHaveDone: { type: "string" },
      whatToDoNext: { type: "string" }
    },
    required: ["whatHaveDone", "whatToDoNext"],
    additionalProperties: false,
    description: "Plan the task when the task cannot be accomplished"
  };

  return {
    type: "json_schema",
    json_schema: {
      name: "action_items",
      strict: true,
      schema: {
        type: "object",
        strict: true,
        properties: {
          actions: {
            type: "array",
            items: actionSchema,
            description: "List of actions to be performed"
          },
          taskWillBeAccomplished: {
            type: "boolean",
            description: "Whether the task will be accomplished after the actions"
          },
          furtherPlan: furtherPlanSchema,
          error: {
            type: ["string", "null"],
            description: "Overall error messages"
          }
        },
        required: ["actions", "taskWillBeAccomplished", "furtherPlan", "error"],
        additionalProperties: false
      }
    }
  };
})();
1965
/**
 * Renders the instruction part of the planning user prompt.
 *
 * The instruction section is always produced. A second section carrying the
 * original task description and the progress so far is appended only when
 * BOTH `originalPrompt` and `whatHaveDone` are truthy.
 *
 * @param {string} userPrompt - Instruction for the current planning round.
 * @param {string} [originalPrompt] - The task as originally stated.
 * @param {string} [whatHaveDone] - Summary of what earlier actions achieved.
 * @returns {string} Prompt fragment.
 */
var generateTaskBackgroundContext = (userPrompt, originalPrompt, whatHaveDone) => {
  const instructionSection = `
Here is the instruction:
=====================================
${userPrompt}
=====================================
`;
  const hasHistory = Boolean(originalPrompt && whatHaveDone);
  if (!hasHistory) {
    return instructionSection;
  }
  const historySection = `
For your information, this is a task that some important person handed to you. Here is the original task description and what have been done after the previous actions:
=====================================
Original task description: ${originalPrompt}
=====================================
What have been done: ${whatHaveDone}
=====================================
`;
  return instructionSection + historySection;
};
1988
/**
 * PromptTemplate for the planning user message. It has two placeholders:
 * `{pageDescription}` (framed by separator lines) and
 * `{taskBackgroundContext}`.
 */
var automationUserPrompt = new (0, _prompts.PromptTemplate)({
  // Joined with "\n"; the empty first and last entries yield the leading and
  // trailing newline of the template text.
  template: [
    "",
    "pageDescription:",
    "=====================================",
    "{pageDescription}",
    "=====================================",
    "",
    "{taskBackgroundContext}",
    ""
  ].join("\n"),
  inputVariables: ["pageDescription", "taskBackgroundContext"]
});
1999
+
2000
+ // src/ai-model/openai/index.ts
2001
/**
 * Reports whether any model configuration is present.
 *
 * @param {string} [preferVendor] - When given and different from "openAI",
 *   the check fails immediately.
 * @returns {boolean} `true` when at least one of OPENAI_API_KEY,
 *   MIDSCENE_USE_AZURE_OPENAI, ANTHROPIC_API_KEY or
 *   MIDSCENE_OPENAI_INIT_CONFIG_JSON has a truthy value.
 */
function checkAIConfig(preferVendor) {
  const vendorRejected = Boolean(preferVendor) && preferVendor !== "openAI";
  if (vendorRejected) {
    return false;
  }
  // Checked in this order; `some` stops at the first truthy setting.
  const enablingKeys = [
    _chunkSCNIHQKFjs.OPENAI_API_KEY,
    _chunkSCNIHQKFjs.MIDSCENE_USE_AZURE_OPENAI,
    _chunkSCNIHQKFjs.ANTHROPIC_API_KEY,
    _chunkSCNIHQKFjs.MIDSCENE_OPENAI_INIT_CONFIG_JSON
  ];
  return enablingKeys.some(
    (key) => Boolean(_chunkSCNIHQKFjs.getAIConfig.call(void 0, key))
  );
}
2012
// Model used when MIDSCENE_MODEL_NAME is not configured.
var defaultModel = "gpt-4o-2024-08-06";

/**
 * Resolves the model name for AI calls.
 *
 * @returns {string} The MIDSCENE_MODEL_NAME setting when it is truthy,
 *   otherwise `defaultModel`.
 */
function getModelName() {
  const configuredName = _chunkSCNIHQKFjs.getAIConfig.call(void 0, _chunkSCNIHQKFjs.MIDSCENE_MODEL_NAME);
  return configuredName ? configuredName : defaultModel;
}
2021
/**
 * Creates the SDK client selected by configuration and returns its completion
 * endpoint together with a tag naming the API style.
 *
 * Selection order:
 *  1. OPENAI_USE_AZURE          -> AzureOpenAI built from the OpenAI base URL / key.
 *  2. MIDSCENE_USE_AZURE_OPENAI -> AzureOpenAI built from the Azure settings,
 *     authenticated by a bearer-token provider when MIDSCENE_AZURE_OPENAI_SCOPE
 *     is set, otherwise by AZURE_OPENAI_KEY.
 *  3. MIDSCENE_USE_ANTHROPIC_SDK not set -> plain OpenAI client.
 *  4. MIDSCENE_USE_ANTHROPIC_SDK set     -> Anthropic client.
 *
 * When MIDSCENE_LANGSMITH_DEBUG is set, an OpenAI-style client is passed
 * through `wrapOpenAI` (rejected when running in a browser).
 *
 * @param {{AIActionTypeValue: *}} param0 - `AIActionTypeValue.toString()` is
 *   sent as the MIDSCENE_API_TYPE default header by the plain OpenAI client.
 * @returns {Promise<{completion: *, style: ("openai"|"anthropic")}>}
 * @throws {Error} When no client could be created.
 */
async function createChatClient({
  AIActionTypeValue
}) {
  const cfg = _chunkSCNIHQKFjs;
  const readConfig = (key) => cfg.getAIConfig.call(void 0, key);
  const readJsonConfig = (key) => cfg.getAIConfigInJson.call(void 0, key);

  const extraConfig = readJsonConfig(cfg.MIDSCENE_OPENAI_INIT_CONFIG_JSON);
  const socksProxy = readConfig(cfg.MIDSCENE_OPENAI_SOCKS_PROXY);
  let socksAgent = void 0;
  if (socksProxy) {
    socksAgent = new (0, _socksproxyagent.SocksProxyAgent)(socksProxy);
  }

  let client;
  if (readConfig(cfg.OPENAI_USE_AZURE)) {
    client = new (0, _openai.AzureOpenAI)({
      baseURL: readConfig(cfg.OPENAI_BASE_URL),
      apiKey: readConfig(cfg.OPENAI_API_KEY),
      httpAgent: socksAgent,
      ...extraConfig,
      dangerouslyAllowBrowser: true
    });
  } else if (readConfig(cfg.MIDSCENE_USE_AZURE_OPENAI)) {
    const extraAzureConfig = readJsonConfig(cfg.MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON);
    const scope = readConfig(cfg.MIDSCENE_AZURE_OPENAI_SCOPE);
    // Settings shared by both authentication variants.
    const azureTarget = () => ({
      endpoint: readConfig(cfg.AZURE_OPENAI_ENDPOINT),
      apiVersion: readConfig(cfg.AZURE_OPENAI_API_VERSION),
      deployment: readConfig(cfg.AZURE_OPENAI_DEPLOYMENT)
    });
    if (scope) {
      _assert2.default.call(void 0,
        !_utils.ifInBrowser,
        "Azure OpenAI is not supported in browser with Midscene."
      );
      const credential = new (0, _identity.DefaultAzureCredential)();
      _assert2.default.call(void 0, scope, "MIDSCENE_AZURE_OPENAI_SCOPE is required");
      const tokenProvider = _identity.getBearerTokenProvider.call(void 0, credential, scope);
      client = new (0, _openai.AzureOpenAI)({
        azureADTokenProvider: tokenProvider,
        ...azureTarget(),
        ...extraConfig,
        ...extraAzureConfig
      });
    } else {
      client = new (0, _openai.AzureOpenAI)({
        apiKey: readConfig(cfg.AZURE_OPENAI_KEY),
        ...azureTarget(),
        dangerouslyAllowBrowser: true,
        ...extraConfig,
        ...extraAzureConfig
      });
    }
  } else if (!readConfig(cfg.MIDSCENE_USE_ANTHROPIC_SDK)) {
    const inheritedHeaders = (extraConfig == null ? void 0 : extraConfig.defaultHeaders) || {};
    client = new (0, _openai2.default)({
      baseURL: readConfig(cfg.OPENAI_BASE_URL),
      apiKey: readConfig(cfg.OPENAI_API_KEY),
      httpAgent: socksAgent,
      ...extraConfig,
      defaultHeaders: {
        ...inheritedHeaders,
        [cfg.MIDSCENE_API_TYPE]: AIActionTypeValue.toString()
      },
      dangerouslyAllowBrowser: true
    });
  }

  if (client && readConfig(cfg.MIDSCENE_LANGSMITH_DEBUG)) {
    if (_utils.ifInBrowser) {
      throw new Error("langsmith is not supported in browser");
    }
    console.log("DEBUGGING MODE: langsmith wrapper enabled");
    const { wrapOpenAI } = await Promise.resolve().then(() => require("./wrappers-KKGZQXJL.js"));
    client = wrapOpenAI(client);
  }

  if (typeof client !== "undefined") {
    return {
      completion: client.chat.completions,
      style: "openai"
    };
  }

  // No OpenAI-style client was created above; try the Anthropic SDK.
  if (readConfig(cfg.MIDSCENE_USE_ANTHROPIC_SDK)) {
    const apiKey = readConfig(cfg.ANTHROPIC_API_KEY);
    _assert2.default.call(void 0, apiKey, "ANTHROPIC_API_KEY is required");
    client = new (0, _sdk.Anthropic)({
      apiKey,
      dangerouslyAllowBrowser: true
    });
  }
  if (typeof client !== "undefined" && client.messages) {
    return {
      completion: client.messages,
      style: "anthropic"
    };
  }
  throw new Error("Openai SDK or Anthropic SDK is not initialized");
}
2112
/**
 * Sends one chat request through the configured client and returns the text
 * of the reply.
 *
 * Both API styles share `temperature: 0.1`, `stream: false` and a
 * `max_tokens` taken from OPENAI_MAX_TOKENS (used as-is when it is a number,
 * otherwise parsed as a base-10 integer with "2048" as the fallback text).
 *
 * @param {Array<object>} messages - Chat messages in the OpenAI message shape.
 * @param {*} AIActionTypeValue - Forwarded to `createChatClient`.
 * @param {*} [responseFormat] - Sent as `response_format`.
 * @returns {Promise<{content: string, usage: *}>}
 */
async function call(messages, AIActionTypeValue, responseFormat) {
  const { completion, style } = await createChatClient({
    AIActionTypeValue
  });
  const profileSetting = _chunkSCNIHQKFjs.getAIConfig.call(void 0, _chunkSCNIHQKFjs.MIDSCENE_DEBUG_AI_PROFILE);
  const shouldPrintTiming = typeof profileSetting === "string";
  const maxTokens = _chunkSCNIHQKFjs.getAIConfig.call(void 0, _chunkSCNIHQKFjs.OPENAI_MAX_TOKENS);
  const startTime = Date.now();
  const model = getModelName();
  const commonConfig = {
    temperature: 0.1,
    stream: false,
    max_tokens: typeof maxTokens === "number" ? maxTokens : Number.parseInt(maxTokens || "2048", 10)
  };

  let content;
  let usage;
  switch (style) {
    case "openai": {
      const result = await completion.create({
        model,
        messages,
        response_format: responseFormat,
        ...commonConfig
      });
      if (shouldPrintTiming) {
        console.log(
          "Midscene - AI call",
          model,
          result.usage,
          `${Date.now() - startTime}ms`
        );
      }
      content = result.choices[0].message.content;
      _assert2.default.call(void 0, content, "empty content");
      usage = result.usage;
      break;
    }
    case "anthropic": {
      // Rewrites an OpenAI "image_url" part as a base64 "image" part; every
      // other part is passed through unchanged.
      const toAnthropicPart = (part) => {
        if (part.type !== "image_url") {
          return part;
        }
        const imgBase64 = part.image_url.url;
        _assert2.default.call(void 0, imgBase64, "image_url is required");
        return {
          source: {
            type: "base64",
            media_type: imgBase64.includes("data:image/png;base64,") ? "image/png" : "image/jpeg",
            data: imgBase64.split(",")[1]
          },
          type: "image"
        };
      };
      const result = await completion.create({
        model,
        system: "You are a versatile professional in software UI automation",
        // Every message is sent with the "user" role.
        messages: messages.map((m) => ({
          role: "user",
          content: Array.isArray(m.content) ? m.content.map(toAnthropicPart) : m.content
        })),
        response_format: responseFormat,
        ...commonConfig
      });
      content = result.content[0].text;
      _assert2.default.call(void 0, content, "empty content");
      usage = result.usage;
      break;
    }
  }
  return { content: content || "", usage };
}
2176
/**
 * Calls the model and parses its reply as JSON.
 *
 * A `response_format` is chosen only when the model name contains "gpt-4o":
 * the schema matching the action type, or the generic `json_object` format
 * for data extraction, for unrecognised action types, and for the
 * "gpt-4o-2024-05-13" model. Other models get no `response_format`.
 *
 * @param {Array<object>} messages - Chat messages.
 * @param {number} AIActionTypeValue - 0 ASSERT, 1 INSPECT_ELEMENT,
 *   2 EXTRACT_DATA, 3 PLAN.
 * @returns {Promise<{content: *, usage: *}>} Parsed reply plus usage data.
 */
async function callToGetJSONObject(messages, AIActionTypeValue) {
  const model = getModelName();
  let responseFormat;
  if (model.includes("gpt-4o")) {
    if (AIActionTypeValue === 0 /* ASSERT */) {
      responseFormat = assertSchema;
    } else if (AIActionTypeValue === 1 /* INSPECT_ELEMENT */) {
      responseFormat = findElementSchema;
    } else if (AIActionTypeValue === 3 /* PLAN */) {
      responseFormat = planSchema;
    } else {
      // EXTRACT_DATA and anything unrecognised.
      responseFormat = { type: "json_object" /* JSON */ };
    }
    if (model === "gpt-4o-2024-05-13" || !responseFormat) {
      responseFormat = { type: "json_object" /* JSON */ };
    }
  }
  const response = await call(messages, AIActionTypeValue, responseFormat);
  _assert2.default.call(void 0, response, "empty response");
  return {
    content: safeParseJson(response.content),
    usage: response.usage
  };
}
2203
/**
 * Pulls the JSON-looking part out of a model reply.
 *
 * Tried in order: the whole reply being a `{...}` object (surrounding
 * whitespace allowed), a fenced code block containing an object, then the
 * span from the first `{` to the last `}`. If none applies the reply is
 * returned unchanged.
 *
 * @param {string} response - Raw model reply.
 * @returns {string} Candidate JSON text, or `response` itself.
 */
function extractJSONFromCodeBlock(response) {
  try {
    const wholeReply = response.match(/^\s*(\{[\s\S]*\})\s*$/);
    if (wholeReply) {
      return wholeReply[1];
    }
    const fencedBlock = response.match(
      /```(?:json)?\s*(\{[\s\S]*?\})\s*```/
    );
    if (fencedBlock) {
      return fencedBlock[1];
    }
    const looseObject = response.match(/\{[\s\S]*\}/);
    if (looseObject) {
      return looseObject[0];
    }
  } catch (e) {
    // Deliberate best effort: anything that cannot be matched is handed back
    // as-is and left for the caller to deal with.
  }
  return response;
}

/**
 * Parses a model reply into a value.
 *
 * Order of attempts:
 *  1. Strict `JSON.parse`; an object or array result is returned right away.
 *  2. A `(x,y)` integer pair anywhere in the text, returned as `[x, y]`.
 *  3. Any other strict-JSON result (number, string, ...).
 *  4. The lenient dirty-json parser.
 *
 * Strict JSON is tried before the coordinate shortcut so that a valid JSON
 * object whose string fields merely mention a coordinate, such as
 * `"thought": "tap the icon at (100,200)"`, is not replaced by `[100, 200]`.
 *
 * @param {string} input - Raw model reply.
 * @returns {*} Parsed value, or `[x, y]` for a coordinate reply.
 * @throws {Error} When every attempt fails.
 */
function safeParseJson(input) {
  const cleanJsonString = extractJSONFromCodeBlock(input);

  let strictResult;
  let strictOk = false;
  try {
    strictResult = JSON.parse(cleanJsonString);
    strictOk = true;
  } catch (e) {
    // Not strict JSON; the fallbacks below take over.
  }
  if (strictOk && typeof strictResult === "object" && strictResult !== null) {
    return strictResult;
  }

  const pointMatch = cleanJsonString.match(/\((\d+),(\d+)\)/);
  if (pointMatch) {
    return pointMatch.slice(1).map(Number);
  }
  if (strictOk) {
    return strictResult;
  }

  try {
    return import_dirty_json.default.parse(cleanJsonString);
  } catch (e) {
    console.log("e:", e);
  }
  throw Error(`failed to parse json response: ${input}`);
}
2240
+
2241
+ // src/ai-model/inspect.ts
2242
+
2243
// Options handed to `describeUserPage` by the data-extraction and assertion
// flows in this file.
var liteContextConfig = { filterNonTextContent: true, truncateTextLength: 200 };
2247
/**
 * Scales a position given in thousandths of the page size to page units.
 *
 * @param {{x: number, y: number}} relativePosition - Each axis is divided by
 *   1000 before being multiplied by the page dimension.
 * @param {{width: number, height: number}} size - Page dimensions.
 * @returns {{x: number, y: number}} Scaled position, rounded to 3 decimals.
 */
function transformToAbsoluteCoords(relativePosition, size) {
  const scaleAxis = (relative, extent) => {
    const scaled = relative / 1e3 * extent;
    return Number(scaled.toFixed(3));
  };
  return {
    x: scaleAxis(relativePosition.x, size.width),
    y: scaleAxis(relativePosition.y, size.height)
  };
}
2253
/**
 * Normalises an element-inspection reply into `{errors, elements}`.
 *
 * An array reply is read as a relative `[x, y]` position: it is scaled with
 * `transformToAbsoluteCoords` and resolved to an element through
 * `elementByPositionWithElementInfo`. Any other reply is expected to already
 * carry `errors` and `elements`, which are passed through.
 *
 * @param {Array<number>|{errors: *, elements: *}} aiResult - Parsed reply.
 * @param {*} elementsInfo - Element list searched by position.
 * @param {{width: number, height: number}} size - Page dimensions.
 * @param {string} screenshotBase64 - Not used by this function.
 * @returns {Promise<{errors: *, elements: *}>}
 */
async function transformElementPositionToId(aiResult, elementsInfo, size, screenshotBase64) {
  if (!Array.isArray(aiResult)) {
    return {
      errors: aiResult.errors,
      elements: aiResult.elements
    };
  }
  const absolutePosition = transformToAbsoluteCoords(
    { x: aiResult[0], y: aiResult[1] },
    size
  );
  const matchedElement = elementByPositionWithElementInfo(
    elementsInfo,
    absolutePosition
  );
  _assert2.default.call(void 0,
    matchedElement,
    `inspect: no id found with position: ${JSON.stringify({ absolutePosition })}`
  );
  return {
    errors: [],
    elements: [{ id: matchedElement.id }]
  };
}
2285
/**
 * Turns a pre-supplied locate answer into an inspection result, so that no
 * model call is needed.
 *
 * @param {object} [quickAnswer] - May carry an `id` and/or a `position`.
 * @param {*} elementsInfo - Element list searched by position.
 * @param {Function} elementById - Lookup used to confirm that `id` exists.
 * @param {Function} insertElementByPosition - Called with the position when
 *   no existing element is found there.
 * @returns {object|undefined} `{parseResult, rawResponse, elementById}`, or
 *   `undefined` when the answer is missing or cannot be used.
 */
function getQuickAnswer(quickAnswer, elementsInfo, elementById, insertElementByPosition) {
  if (!quickAnswer) {
    return void 0;
  }
  const toResult = (element) => ({
    parseResult: {
      elements: [element],
      errors: []
    },
    rawResponse: quickAnswer,
    elementById
  });

  const idIsKnown = "id" in quickAnswer && quickAnswer.id && elementById(quickAnswer.id);
  if (idIsKnown) {
    return toResult(quickAnswer);
  }

  const hasPosition = "position" in quickAnswer && quickAnswer.position;
  if (hasPosition) {
    const existing = elementByPositionWithElementInfo(
      elementsInfo,
      quickAnswer.position
    );
    return toResult(existing || insertElementByPosition(quickAnswer.position));
  }
  return void 0;
}
2317
/**
 * Locates element(s) on the page that match a description.
 *
 * A usable `options.quickAnswer` is returned directly. Otherwise the model is
 * sent the screenshot (the variant with element markers when available) plus
 * the formatted find-element prompt, and its reply is normalised with
 * `transformElementPositionToId`.
 *
 * @param {object} options
 * @param {object} options.context - Page context (screenshots, `content`).
 * @param {*} options.multi - Forwarded to the find-element prompt.
 * @param {string} options.targetElementDescription - Required unless a quick
 *   answer applies.
 * @param {Function} [options.callAI] - Replaces `callToGetJSONObject`.
 * @param {object} [options.quickAnswer] - Pre-supplied locate answer.
 * @returns {Promise<object>} `{parseResult, rawResponse, elementById}` plus
 *   `usage` when the model was called.
 */
async function AiInspectElement(options) {
  const { context, multi, targetElementDescription, callAI } = options;
  const { screenshotBase64, screenshotBase64WithElementMarker } = context;
  const {
    description: pageDescription,
    elementById,
    insertElementByPosition,
    size: pageSize
  } = await describeUserPage(context);

  const shortcut = getQuickAnswer(
    options.quickAnswer,
    context.content,
    elementById,
    insertElementByPosition
  );
  if (shortcut) {
    return shortcut;
  }

  _assert2.default.call(void 0,
    targetElementDescription,
    "cannot find the target element description"
  );
  const userInstructionPrompt = await findElementPrompt.format({
    pageDescription,
    targetElementDescription,
    multi
  });
  const systemPrompt = systemPromptToFindElement();

  const screenshotPart = {
    type: "image_url",
    image_url: {
      url: screenshotBase64WithElementMarker || screenshotBase64,
      detail: "high"
    }
  };
  const instructionPart = {
    type: "text",
    text: userInstructionPrompt
  };
  const msgs = [
    { role: "system", content: systemPrompt },
    { role: "user", content: [screenshotPart, instructionPart] }
  ];

  const askModel = callAI || callToGetJSONObject;
  const res = await askModel(msgs, 1 /* INSPECT_ELEMENT */);
  const parseResult = await transformElementPositionToId(
    res.content,
    context.content,
    pageSize,
    screenshotBase64
  );
  return {
    parseResult,
    rawResponse: res.content,
    elementById,
    usage: res.usage
  };
}
2373
/**
 * Asks the model to extract data from the page.
 *
 * @param {object} options
 * @param {string|object} options.dataQuery - A free-text query, or an object
 *   whose keys name the fields to return (it is sent pretty-printed as JSON
 *   together with a hint listing those keys).
 * @param {object} options.context - Page context providing the screenshot.
 * @returns {Promise<{parseResult: *, elementById: Function, usage: *}>}
 */
async function AiExtractElementInfo(options) {
  const { dataQuery, context } = options;
  const systemPrompt = systemPromptToExtract();
  const { screenshotBase64 } = context;
  const { description: pageDescription, elementById } = await describeUserPage(
    context,
    liteContextConfig
  );

  const queryIsText = typeof dataQuery === "string";
  const dataKeys = queryIsText ? "" : `return in key-value style object, keys are ${Object.keys(dataQuery).join(",")}`;
  const dataQueryText = queryIsText ? dataQuery : JSON.stringify(dataQuery, null, 2);

  const extractDataPromptText = await extractDataPrompt.format({
    pageDescription,
    dataKeys,
    dataQuery: dataQueryText
  });

  const screenshotPart = {
    type: "image_url",
    image_url: {
      url: screenshotBase64,
      detail: "high"
    }
  };
  const msgs = [
    { role: "system", content: systemPrompt },
    {
      role: "user",
      content: [screenshotPart, { type: "text", text: extractDataPromptText }]
    }
  ];

  const result = await callAiFn(
    msgs,
    2 /* EXTRACT_DATA */
  );
  return {
    parseResult: result.content,
    elementById,
    usage: result.usage
  };
}
2424
/**
 * Asks the model to judge an assertion about the current page.
 *
 * @param {object} options
 * @param {string} options.assertion - Statement to check; must be truthy.
 * @param {object} options.context - Page context providing the screenshot.
 * @returns {Promise<{content: *, usage: *}>} The model's parsed verdict and
 *   usage data.
 */
async function AiAssert(options) {
  const { assertion, context } = options;
  _assert2.default.call(void 0, assertion, "assertion should be a string");
  const { screenshotBase64 } = context;
  const { description } = await describeUserPage(context, liteContextConfig);
  const systemPrompt = systemPromptToAssert();

  const assertionText = `
pageDescription:

${description}
Here is the description of the assertion. Just go ahead:
=====================================
${assertion}
=====================================
`;
  const screenshotPart = {
    type: "image_url",
    image_url: {
      url: screenshotBase64,
      detail: "high"
    }
  };
  const msgs = [
    { role: "system", content: systemPrompt },
    {
      role: "user",
      content: [screenshotPart, { type: "text", text: assertionText }]
    }
  ];

  const reply = await callAiFn(
    msgs,
    0 /* ASSERT */
  );
  return {
    content: reply.content,
    usage: reply.usage
  };
}
2466
+
2467
+ // src/ai-model/automation.ts
2468
+
2469
/**
 * Asks the model for an action plan that carries out `userPrompt` on the
 * current page.
 *
 * @param {string} userPrompt - Instruction for this planning round.
 * @param {object} opts
 * @param {object} opts.context - Page context (screenshots).
 * @param {Function} [opts.callAI] - Replaces `callAiFn`.
 * @param {string} [opts.originalPrompt] - Original task, for follow-up rounds.
 * @param {string} [opts.whatHaveDone] - Progress so far, for follow-up rounds.
 * @returns {Promise<object>} The plan object returned by the model.
 * Fails an assertion when the reply is empty or contains no actions.
 */
async function plan(userPrompt, opts) {
  const { callAI, context } = opts || {};
  const { screenshotBase64, screenshotBase64WithElementMarker } = context;
  const { description: pageDescription } = await describeUserPage(context);
  const systemPrompt = await systemPromptToTaskPlanning();

  const taskBackgroundContext = generateTaskBackgroundContext(
    userPrompt,
    opts.originalPrompt,
    opts.whatHaveDone
  );
  const userInstructionPrompt = await automationUserPrompt.format({
    pageDescription,
    taskBackgroundContext
  });

  const screenshotPart = {
    type: "image_url",
    image_url: {
      url: screenshotBase64WithElementMarker || screenshotBase64,
      detail: "high"
    }
  };
  const msgs = [
    { role: "system", content: systemPrompt },
    {
      role: "user",
      content: [screenshotPart, { type: "text", text: userInstructionPrompt }]
    }
  ];

  const askModel = callAI || callAiFn;
  const { content: planFromAI } = await askModel(msgs, 3 /* PLAN */);
  const plannedActions = (planFromAI == null ? void 0 : planFromAI.actions) || [];
  _assert2.default.call(void 0, planFromAI, "can't get plans from AI");
  _assert2.default.call(void 0,
    plannedActions.length > 0,
    `Failed to plan actions: ${planFromAI.error || "(no error details)"}`
  );
  return planFromAI;
}
2513
+
2514
+ // src/ai-model/prompt/ui-tars-planning.ts
2515
/**
 * Prompt prefix for the VLM planning flow. `vlmPlanning` appends the user's
 * instruction directly after the final "## User Instruction" heading.
 *
 * The "Action Space" section lists the calls that `parseActionFromVlm` is
 * expected to parse out of the model's `Action:` line.
 */
var uiTarsPlanningPrompt = `
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format

\`\`\`
Thought: ...
Action: ...
\`\`\`

## Action Space
click(start_box='[x1, y1, x2, y2]')
left_double(start_box='[x1, y1, x2, y2]')
right_single(start_box='[x1, y1, x2, y2]')
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
hotkey(key='')
type(content='') #If you want to submit your input, use "\\n" at the end of \`content\`.
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished()
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.

## Note
- Use Chinese in \`Thought\` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.

## User Instruction
`;
2543
/**
 * Removes every "Reflection: ..." section from a model reply (each one runs
 * up to the next "Action_Summary:" or "Action:" marker, or to the end of the
 * text) and trims the remainder.
 *
 * @param {string} prediction - Raw model reply.
 * @returns {string} Reply without its reflection sections.
 */
var getSummary = (prediction) => {
  const reflectionSection = /Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g;
  const withoutReflection = prediction.replace(reflectionSection, "");
  return withoutReflection.trim();
};
2544
/**
 * Parses a VLM reply into a list of structured actions.
 *
 * In "bc" mode the reply is expected to open with `Thought:`, `Reflection:`
 * (followed by `Action_Summary:`) or `Action_Summary:`, and to carry the call
 * text after the last `Action:` marker. In "o1" mode the thought is read from
 * `<Thought>...</Thought>` and the call from `Action: ... </Output>`.
 *
 * Several calls may be separated by blank lines. For every `start_box` /
 * `end_box` argument the numbers are divided by `factor` and stored as a JSON
 * array string; a two-number point is duplicated into a four-number box.
 * Both `(x,y)` and `[x1, y1, x2, y2]` notations are accepted; the latter is
 * the form advertised in the planning prompt.
 *
 * Thought / reflection text may span several lines.
 *
 * @param {string} text - Raw model reply.
 * @param {number} [factor=1000] - Divisor applied to box coordinates.
 * @param {("bc"|"o1")} [mode="bc"] - Reply layout.
 * @returns {Array<{reflection: (string|null), thought: (string|null),
 *   action_type: string, action_inputs: object}>} Parsed actions; calls that
 *   cannot be parsed are logged and skipped.
 */
function parseActionFromVlm(text, factor = 1e3, mode = "bc") {
  let reflection = null;
  let thought = null;
  let actionStr = "";
  const reply = text.trim();

  if (mode === "bc") {
    // `[\s\S]` rather than `.` so that multi-line sections are captured.
    if (reply.startsWith("Thought:")) {
      const thoughtMatch = reply.match(/Thought: ([\s\S]+?)(?=\s*Action:|$)/);
      if (thoughtMatch) {
        thought = thoughtMatch[1].trim();
      }
    } else if (reply.startsWith("Reflection:")) {
      const reflectionMatch = reply.match(
        /Reflection: ([\s\S]+?)Action_Summary: ([\s\S]+?)(?=\s*Action:|$)/
      );
      if (reflectionMatch) {
        thought = reflectionMatch[2].trim();
        reflection = reflectionMatch[1].trim();
      }
    } else if (reply.startsWith("Action_Summary:")) {
      const summaryMatch = reply.match(/Action_Summary: ([\s\S]+?)(?=\s*Action:|$)/);
      if (summaryMatch) {
        thought = summaryMatch[1].trim();
      }
    }
    if (!reply.includes("Action:")) {
      actionStr = reply;
    } else {
      // Only the text after the LAST marker is treated as the call.
      const actionParts = reply.split("Action:");
      actionStr = actionParts[actionParts.length - 1];
    }
  } else if (mode === "o1") {
    const thoughtMatch = reply.match(/<Thought>\s*(.*?)\s*<\/Thought>/);
    const actionSummaryMatch = reply.match(
      /\nAction_Summary:\s*(.*?)\s*Action:/
    );
    const actionMatch = reply.match(/\nAction:\s*(.*?)\s*<\/Output>/);
    const thoughtContent = thoughtMatch ? thoughtMatch[1] : null;
    const actionSummaryContent = actionSummaryMatch ? actionSummaryMatch[1] : null;
    const actionContent = actionMatch ? actionMatch[1] : null;
    thought = `${thoughtContent}
<Action_Summary>
${actionSummaryContent}`;
    actionStr = actionContent || "";
  }

  const actions = [];
  for (const rawStr of actionStr.split("\n\n")) {
    // Remaining newlines are turned into a literal "\n" so the call fits the
    // single-line pattern used by `parseAction`.
    const actionInstance = parseAction(rawStr.replace(/\n/g, "\\n").trim());
    if (!actionInstance) {
      console.log(`Action can't parse: ${rawStr}`);
      continue;
    }
    const actionType = actionInstance.function;
    const actionInputs = {};
    for (const [paramName, param] of Object.entries(actionInstance.args)) {
      if (!param)
        continue;
      const inputName = paramName.trim();
      const trimmedParam = param.trim();
      actionInputs[inputName] = trimmedParam;
      if (paramName.includes("start_box") || paramName.includes("end_box")) {
        // Strip both "(...)" and "[...]" wrappers before splitting; leaving
        // the brackets in place turns the first and last number into NaN.
        const floatNumbers = trimmedParam.replace(/[()[\]]/g, "").split(",").map(
          (num) => Number.parseFloat(num) / factor
        );
        if (floatNumbers.length === 2) {
          floatNumbers.push(floatNumbers[0], floatNumbers[1]);
        }
        actionInputs[inputName] = JSON.stringify(floatNumbers);
      }
    }
    actions.push({
      reflection,
      thought,
      action_type: actionType,
      // "finished" never carries inputs.
      action_inputs: actionType === "finished" ? {} : actionInputs
    });
  }
  return actions;
}

/**
 * Parses one call of the form `name(key='value', other='value')`.
 *
 * Arguments are split on commas that are not inside single quotes; each is
 * split at its first `=`, and one leading and one trailing quote character is
 * removed from the value.
 *
 * @param {string} actionStr - Call text on a single line.
 * @returns {{function: string, args: Object<string, string>}|null} The parsed
 *   call, or `null` (after logging) when the text is not a call.
 */
function parseAction(actionStr) {
  try {
    const functionPattern = /^(\w+)\((.*)\)$/;
    const match = actionStr.trim().match(functionPattern);
    if (!match) {
      throw new Error("Not a function call");
    }
    const functionName = match[1];
    const argsStr = match[2];
    const kwargs = {};
    if (argsStr.trim()) {
      const argPairs = argsStr.match(/([^,']|'[^']*')+/g) || [];
      for (const pair of argPairs) {
        const [key, ...valueParts] = pair.split("=");
        if (!key)
          continue;
        // Re-join so that "=" characters inside the value survive.
        const value = valueParts.join("=").trim().replace(/^['"]|['"]$/g, "");
        kwargs[key.trim()] = value;
      }
    }
    return {
      function: functionName,
      args: kwargs
    };
  } catch (e) {
    console.error(`Failed to parse action '${actionStr}': ${e}`);
    return null;
  }
}
2663
+
2664
+ // src/ai-model/vlm-planning.ts
2665
/**
 * Upper-cases the first character of a string and leaves the rest untouched.
 *
 * @param {string} str - Input text.
 * @returns {string} Text with its first character in upper case.
 */
function capitalize(str) {
  const firstChar = str.charAt(0);
  const remainder = str.slice(1);
  return firstChar.toUpperCase() + remainder;
}
2668
/**
 * Runs one planning round against a VLM and converts the parsed reply into
 * this library's action objects.
 *
 * Conversions:
 *  - click    -> a "Locate" followed by a "Tap" at the box's first point
 *  - type     -> "Input"
 *  - scroll   -> "Scroll"
 *  - finished -> "Finished"
 *  - hotkey   -> one "KeyboardPress" per comma-separated key
 *  - wait     -> "Sleep"
 * Any other action type is left out of `actions`; it is still present in
 * `realActions`.
 *
 * @param {object} options
 * @param {Array<object>} options.conversationHistory - Earlier chat messages,
 *   sent after the prompt message.
 * @param {string} options.userInstruction - Appended to `uiTarsPlanningPrompt`.
 * @param {{width: number, height: number}} options.size - Page dimensions used
 *   to scale box coordinates.
 * @returns {Promise<{actions: Array<object>, realActions: Array<object>,
 *   action_summary: string}>}
 */
async function vlmPlanning(options) {
  // The prompt documents `wait()` as a 5 second sleep and gives it no
  // arguments, so this is the duration used when the model supplies none.
  const DEFAULT_WAIT_MS = 5e3;

  const { conversationHistory, userInstruction, size } = options;
  const systemPrompt = uiTarsPlanningPrompt + userInstruction;
  const res = await call(
    [
      {
        role: "user",
        content: systemPrompt
      },
      ...conversationHistory
    ],
    1 /* INSPECT_ELEMENT */
  );
  const actions = parseActionFromVlm(res.content);
  const transformActions = [];

  for (const action of actions) {
    const thought = action.thought || "";
    switch (action.action_type) {
      case "click": {
        const point = getPoint(action.action_inputs.start_box, size);
        transformActions.push({
          type: "Locate",
          locate: {
            prompt: thought,
            position: { x: point[0], y: point[1] }
          },
          param: {}
        });
        transformActions.push({
          type: "Tap",
          locate: {
            prompt: thought,
            position: { x: point[0], y: point[1] }
          },
          param: thought
        });
        break;
      }
      case "type":
        transformActions.push({
          type: "Input",
          param: {
            value: action.action_inputs.content
          },
          locate: null,
          thought
        });
        break;
      case "scroll":
        transformActions.push({
          type: "Scroll",
          param: {
            direction: action.action_inputs.direction
          },
          locate: null,
          thought
        });
        break;
      case "finished":
        transformActions.push({
          type: "Finished",
          param: {},
          locate: null,
          thought
        });
        break;
      case "hotkey": {
        // Tolerate a missing key and whitespace around separators
        // ("ctrl, c"), which would otherwise yield key names like " c".
        const keys = (action.action_inputs.key || "").split(",").map((key) => key.trim()).filter(Boolean);
        for (const key of keys) {
          transformActions.push({
            type: "KeyboardPress",
            param: {
              value: capitalize(key)
            },
            locate: null,
            thought
          });
        }
        break;
      }
      case "wait": {
        const requestedTime = action.action_inputs.time;
        transformActions.push({
          type: "Sleep",
          param: {
            // A model-supplied value is passed through unchanged.
            timeMs: requestedTime != null ? requestedTime : DEFAULT_WAIT_MS
          },
          locate: null,
          thought
        });
        break;
      }
      default:
        break;
    }
  }

  return {
    actions: transformActions,
    realActions: actions,
    action_summary: getSummary(res.content)
  };
}
2759
/**
 * Converts the first two numbers of a JSON-encoded box into page units.
 *
 * @param {string} startBox - JSON array text; only its first two entries are
 *   used.
 * @param {{width: number, height: number}} size - Page dimensions.
 * @returns {[number, number]} `[first * width, second * height]`.
 */
function getPoint(startBox, size) {
  const [relativeX, relativeY] = JSON.parse(startBox);
  const pageX = relativeX * size.width;
  const pageY = relativeY * size.height;
  return [pageX, pageY];
}
2763
+
2764
+
2765
+
2766
+
2767
+
2768
+
2769
+
2770
+
2771
+
2772
+
2773
+
2774
+
2775
+
2776
+
2777
+
2778
+
2779
+
2780
+
2781
+
2782
+ exports.BaseElement = BaseElement; exports.AIResponseFormat = AIResponseFormat; exports.UIContext = UIContext; exports.systemPromptToFindElement = systemPromptToFindElement; exports.describeUserPage = describeUserPage; exports.retrieveElement = retrieveElement; exports.ifElementTypeResponse = ifElementTypeResponse; exports.splitElementResponse = splitElementResponse; exports.retrieveSection = retrieveSection; exports.callToGetJSONObject = callToGetJSONObject; exports.callAiFn = callAiFn; exports.transformElementPositionToId = transformElementPositionToId; exports.AiInspectElement = AiInspectElement; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning;
2783
+ /*! Bundled license information:
2784
+
2785
+ string.fromcodepoint/fromcodepoint.js:
2786
+ (*! http://mths.be/fromcodepoint v0.2.1 by @mathias *)
2787
+
2788
+ utf8/utf8.js:
2789
+ (*! https://mths.be/utf8js v3.0.0 by @mathias *)
2790
+ */