@midscene/core 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1083 @@
1
// Bundler-emitted runtime helpers that emulate object spread / rest and class
// field initialisation for older syntax targets.
var __defProp = Object.defineProperty;
var __defProps = Object.defineProperties;
var __getOwnPropDescs = Object.getOwnPropertyDescriptors;
var __getOwnPropSymbols = Object.getOwnPropertySymbols;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __propIsEnum = Object.prototype.propertyIsEnumerable;

// Set `obj[key]`: when the key is already visible on `obj` (own or inherited)
// define a plain own data property, otherwise use ordinary assignment.
var __defNormalProp = (obj, key, value) => {
  if (key in obj) {
    return __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value });
  }
  return obj[key] = value;
};

// Copy own enumerable string- and symbol-keyed properties of `b` onto `a`
// (the `{ ...a, ...b }` emulation). A falsy `b` is treated as `{}`.
var __spreadValues = (a, b) => {
  if (!b) {
    b = {};
  }
  for (var name in b) {
    if (__hasOwnProp.call(b, name)) {
      __defNormalProp(a, name, b[name]);
    }
  }
  if (__getOwnPropSymbols) {
    for (var sym of __getOwnPropSymbols(b)) {
      if (__propIsEnum.call(b, sym)) {
        __defNormalProp(a, sym, b[sym]);
      }
    }
  }
  return a;
};

// Define every own property descriptor of `b` on `a`.
var __spreadProps = (a, b) => {
  return __defProps(a, __getOwnPropDescs(b));
};

// Return a new object holding the own enumerable properties of `source`
// whose keys are not listed in `exclude` (the `{ x, ...rest }` emulation).
var __objRest = (source, exclude) => {
  var rest = {};
  for (var name in source) {
    if (__hasOwnProp.call(source, name) && exclude.indexOf(name) < 0) {
      rest[name] = source[name];
    }
  }
  if (source != null && __getOwnPropSymbols) {
    for (var sym of __getOwnPropSymbols(source)) {
      if (exclude.indexOf(sym) < 0 && __propIsEnum.call(source, sym)) {
        rest[sym] = source[sym];
      }
    }
  }
  return rest;
};

// Initialise a class field; non-symbol keys are coerced to strings.
// Returns the assigned value.
var __publicField = (obj, key, value) => {
  var normalizedKey = typeof key !== "symbol" ? key + "" : key;
  __defNormalProp(obj, normalizedKey, value);
  return value;
};
36
+
37
+ // src/insight/index.ts
38
+ import assert6 from "assert";
39
+
40
+ // src/ai-model/prompt/util.ts
41
+ import assert3 from "assert";
42
+
43
+ // src/image/info.ts
44
+ import assert from "assert";
45
+ import { Buffer as Buffer2 } from "buffer";
46
+ import { readFileSync } from "fs";
47
+ import Sharp from "sharp";
48
/**
 * Read the pixel dimensions of an image through Sharp.
 *
 * @param image anything accepted by `Sharp(...)`
 * @returns {Promise<{width: number, height: number}>}
 * Fails the assertion when Sharp reports no (or zero) width or height.
 */
async function imageInfo(image) {
  const metadata = await Sharp(image).metadata();
  const width = metadata.width;
  const height = metadata.height;
  assert(width && height, `invalid image: ${image}`);
  return { width, height };
}
53
/**
 * Read the dimensions of a base64-encoded image.
 *
 * A leading `data:image/<type>;base64,` prefix is removed before decoding.
 *
 * @param {string} imageBase64 base64 payload, with or without the data-URL prefix
 * @returns the promise produced by `imageInfo` for the decoded bytes
 */
async function imageInfoOfBase64(imageBase64) {
  const dataUrlPrefix = /^data:image\/\w+;base64,/;
  const payload = imageBase64.replace(dataUrlPrefix, "");
  const bytes = Buffer2.from(payload, "base64");
  return imageInfo(bytes);
}
57
+
58
+ // src/image/transform.ts
59
+ import { Buffer as Buffer3 } from "buffer";
60
+ import Sharp2 from "sharp";
61
+
62
+ // src/image/visualization.ts
63
+ import { Buffer as Buffer4 } from "buffer";
64
+ import Sharp3 from "sharp";
65
+
66
+ // src/utils.ts
67
+ import { tmpdir } from "os";
68
+ import { basename, join } from "path";
69
+ import { copyFileSync, existsSync, mkdirSync, readFileSync as readFileSync2, writeFileSync } from "fs";
70
+ import { randomUUID } from "crypto";
71
+ import assert2 from "assert";
72
// Cached `{ name, version }` once a package.json has been read successfully.
var pkg;
/**
 * Resolve the name and version of the package this bundle ships in.
 *
 * Looks for `../package.json` and then `../../../package.json` relative to
 * `__dirname`, caching the first hit. When no file is found a placeholder
 * identity is returned (and not cached).
 *
 * `__dirname` only exists in CommonJS scope; this file uses `import`
 * statements, and in a native ES module a bare `__dirname` reference throws a
 * ReferenceError. The lookup is therefore guarded and degrades to the
 * placeholder instead of crashing the caller.
 *
 * @returns {{name: string, version: string}}
 */
function getPkgInfo() {
  if (pkg) {
    return pkg;
  }
  const baseDir = typeof __dirname === "string" ? __dirname : "";
  const candidates = baseDir ? [join(baseDir, "../package.json"), join(baseDir, "../../../package.json")] : [];
  const pkgJsonFile = candidates.find((file) => existsSync(file));
  if (!pkgJsonFile) {
    return {
      name: "midscene-unknown-page-name",
      version: "0.0.0"
    };
  }
  const { name, version } = JSON.parse(readFileSync2(pkgJsonFile, "utf-8"));
  pkg = { name, version };
  return pkg;
}
94
// Directory that receives dump files; defaults to `<cwd>/midscene_run/`.
var logDir = join(process.cwd(), "./midscene_run/");
// True once the current `logDir` has been prepared by `writeDumpFile`.
var logEnvReady = false;
var insightDumpFileExt = "insight-dump.json";
/**
 * @returns {string} the directory dump files are currently written to
 */
function getDumpDir() {
  return logDir;
}
/**
 * Point dump output at another directory.
 *
 * Changing the directory clears `logEnvReady`, so the next `writeDumpFile`
 * call prepares the new location (directory creation, .gitignore entry).
 * Previously the flag stayed set after the first write and a directory
 * chosen later was never created.
 *
 * @param {string} dir new dump directory
 */
function setDumpDir(dir) {
  if (dir !== logDir) {
    logEnvReady = false;
  }
  logDir = dir;
}
103
/**
 * Write a dump file into the dump directory and mirror it to `latest.<ext>`.
 *
 * On the first call (while `logEnvReady` is false) the dump directory is
 * created if missing, and the `.gitignore` next to it gets an entry for the
 * directory unless one is already present.
 *
 * @param {string} fileName file name without extension
 * @param {string} fileExt extension without the leading dot
 * @param fileContent data handed to `writeFileSync`
 * @returns {string} path of the file that was written
 */
function writeDumpFile(fileName, fileExt, fileContent) {
  if (!logEnvReady) {
    assert2(logDir, "logDir should be set before writing dump file");
    if (!existsSync(logDir)) {
      mkdirSync(logDir, { recursive: true });
    }
    const gitIgnorePath = join(logDir, "../.gitignore");
    const gitIgnoreContent = existsSync(gitIgnorePath) ? readFileSync2(gitIgnorePath, "utf-8") : "";
    const logDirName = basename(logDir);
    const alreadyIgnored = gitIgnoreContent.includes(`${logDirName}/`);
    if (!alreadyIgnored) {
      const updatedContent = `${gitIgnoreContent}\n# MidScene.js dump files\n${logDirName}/\n`;
      writeFileSync(gitIgnorePath, updatedContent, "utf-8");
    }
    logEnvReady = true;
  }
  const dumpDir = getDumpDir();
  const filePath = join(dumpDir, `${fileName}.${fileExt}`);
  writeFileSync(filePath, fileContent);
  // Keep a stable alias for the most recent dump of this kind.
  copyFileSync(filePath, join(getDumpDir(), `latest.${fileExt}`));
  return filePath;
}
132
+
133
+ // src/ai-model/prompt/util.ts
134
// Persona line placed at the top of the extraction system prompt.
const characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
// Explains the shape of the user message (screenshot + texts).
const contextFormatIntro = `
The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.`;
// Markers used in data demands and in AI answers to denote element lookups.
const ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
const ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
// Marker that tags a data-demand value as a section query.
const SECTION_MATCHER_FLAG = "SECTION_MATCHER_FLAG/";
// Prompt fragment describing the page-segmentation skill.
const skillSegment = `skill name: segment_a_web_page
skill content:
Based on the functions and content of various elements on the page, segment the screenshot into different sections like navigation bar, product list, news area, etc.
Some general rules for segmentation:
* Each section should NOT overlap with each other.
* Each text should only belong to one section.
* [IMPORTANT] Whether the content visually appears to belong to different sections is a significant factor in segmenting the page.
* Analyze the page in a top-to-bottom and left-to-right order.
* The evidence indicates a separate section, for example
- The background color of certain parts of the page changes.
- A section of a page includes a title.
* Provide the following data for each of the UI section you found.
{
"name": "name of the section",
"description": "briefly summarize the key content or usage of this section.",
"sectionCharacteristics": "In view of the need to distinguish this section from the surrounding sections, explain the characteristics and how to define boundaries and what precautions to take.",
"textIds": ["5", "6", "7"], // ids of all text elements in this section
}
`;
// Prompt fragment describing the data-extraction skill and the locator prefixes.
const skillExtractData = `skill name: extract_data_from_UI
related input: DATA_DEMAND
skill content:
* User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
* There may be some special commands in DATA_DEMAND, please pay extra attention
- ${ONE_ELEMENT_LOCATOR_PREFIX} and ${ELEMENTS_LOCATOR_PREFIX}: if you see a description that mentions the keyword ${ONE_ELEMENT_LOCATOR_PREFIX} or ${ELEMENTS_LOCATOR_PREFIX}(e.g. follow ${ONE_ELEMENT_LOCATOR_PREFIX} : i want to find ...), it means user wants to locate a specific element meets the description. Return in this way: prefix + the id / comma-separated ids, for example: ${ONE_ELEMENT_LOCATOR_PREFIX}/1 , ${ELEMENTS_LOCATOR_PREFIX}/1,2,3 . If not found, keep the prefix and leave the suffix empty, like ${ONE_ELEMENT_LOCATOR_PREFIX}/ .`;
165
/**
 * Build the prompt paragraph that asks the model to find the given sections.
 *
 * @param constraints list of `{ name?, description? }` section constraints
 * @returns {string} "" when the list is empty, otherwise an instruction line
 *   followed by one bullet per constraint
 * Fails the assertion when a constraint has neither `name` nor `description`.
 */
function promptsOfSectionQuery(constraints) {
  if (!constraints.length) {
    return "";
  }
  const bullets = constraints.map((constraint) => {
    assert3(
      constraint.name || constraint.description,
      "either `name` or `description` is required to define a section constraint"
    );
    const namePart = constraint.name ? `named \`${constraint.name}\`` : "";
    const descriptionPart = constraint.description ? `, usage or criteria : ${constraint.description}` : "";
    return `* One section ${namePart}${descriptionPart}`;
  });
  const header = "Use your segment_a_web_page skill to find the following section(s)";
  return [header, bullets.join("\n")].join("\n");
}
184
/**
 * Compose the system prompt for a data-extraction request.
 *
 * The segmentation skill and the `sections` return field are only included
 * when at least one section constraint carries a name.
 *
 * @param dataQuery a string demand, or an object whose keys name the wanted fields
 * @param sections optional list of section constraints
 * @returns {string} the full system prompt
 */
function systemPromptToExtract(dataQuery, sections) {
  const allSectionNames = sections == null ? [] : sections.filter((c) => c.name).map((c) => c.name || "");
  const hasNamedSections = allSectionNames.length > 0;
  const sectionFindingPrompt = promptsOfSectionQuery(sections || []);
  const sectionReturnFormat = hasNamedSections ? " sections: [], // detailed information of each section from segment_a_web_page skill" : "";
  const segmentSkill = hasNamedSections ? skillSegment : "";
  const keysHint = typeof dataQuery === "object" ? `return in key-value style object, keys are ${Object.keys(dataQuery).join(",")}` : "";
  const demandBody = typeof dataQuery === "string" ? dataQuery : JSON.stringify(dataQuery, null, 2);
  return `
${characteristic}
${contextFormatIntro}

You have the following skills:
${segmentSkill}
${skillExtractData}

Now, do the following jobs:
${sectionFindingPrompt}
Use your extract_data_from_UI skill to find the following data, placing it in the \`data\` field
DATA_DEMAND start:
${keysHint};
${demandBody}
DATA_DEMAND ends.

Return in the following JSON format:
{
language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
${sectionReturnFormat}
data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
errors?: [], // string[], error message if any
}
`;
}
213
/**
 * Render a size as "<width> x <height>".
 *
 * @param size object with `width` and `height`
 * @returns {string}
 */
function describeSize(size) {
  const { width, height } = size;
  return [width, height].map((side) => `${side}`).join(" x ");
}
216
/**
 * Shorten text to at most 50 characters, appending "..." when it was cut.
 * Falsy values and values that are not longer than the limit are returned
 * unchanged.
 *
 * @param text value to shorten
 * @returns the shortened string, or the input itself
 */
function truncateText(text) {
  const maxLength = 50;
  const exceedsLimit = Boolean(text) && text.length > maxLength;
  return exceedsLimit ? `${text.slice(0, maxLength)}...` : text;
}
223
/**
 * Produce the textual page description sent to the model, plus a lookup
 * function that maps an element id back to the element.
 *
 * Ids come back from the model, so the lookup table is a `Map` rather than a
 * plain object: with an object, an id such as "constructor" or "toString"
 * resolved to an inherited `Object.prototype` member instead of `undefined`.
 *
 * @param context UI context carrying `screenshotBase64` and `content` (the element list)
 * @returns {Promise<{description: string, elementById: (id: any) => any}>}
 */
async function describeUserPage(context) {
  const { screenshotBase64 } = context;
  const { width, height } = await imageInfoOfBase64(screenshotBase64);
  const elementsInfo = context.content;
  const idElementMap = new Map();
  elementsInfo.forEach((item) => {
    idElementMap.set(String(item.id), item);
  });
  // NOTE(review): cropfieldInformation already returns a JSON string, so the
  // JSON.stringify below encodes it a second time. Left as-is because changing
  // it alters the prompt text the model receives — confirm intent first.
  const elementInfosDescription = cropfieldInformation(elementsInfo);
  return {
    description: `
{
// The size of the page
"pageSize": ${describeSize({ width, height })},


// json description of the element
"elementInfos": ${JSON.stringify(elementInfosDescription)}
}`,
    elementById(id) {
      assert3(typeof id !== "undefined", "id is required for query");
      return idElementMap.get(`${id}`);
    }
  };
}
250
/**
 * Reduce each element to `{ id, attributes, rect, content }`, truncating the
 * content and every attribute value with `truncateText`, and serialise the
 * resulting list.
 *
 * @param elementsInfo array of page elements
 * @returns {string} JSON text of the reduced element list
 */
function cropfieldInformation(elementsInfo) {
  const croppedElements = elementsInfo.map((element) => {
    const { id, attributes = {}, rect, content } = element;
    const shortContent = truncateText(content);
    const shortAttributes = {};
    for (const attributeName of Object.keys(attributes)) {
      shortAttributes[attributeName] = truncateText(attributes[attributeName]);
    }
    return {
      id,
      attributes: shortAttributes,
      rect,
      content: shortContent
    };
  });
  return JSON.stringify(croppedElements);
}
268
/**
 * Wrap an element description in the locator command understood by the
 * extraction prompt.
 *
 * @param {string} prompt description of the element(s) to locate
 * @param opt optional; a truthy `opt.multi` selects the multi-element prefix
 * @returns {string} `follow <prefix>: <prompt>`
 */
function retrieveElement(prompt, opt) {
  const wantsMany = opt == null ? void 0 : opt.multi;
  const prefix = wantsMany ? ELEMENTS_LOCATOR_PREFIX : ONE_ELEMENT_LOCATOR_PREFIX;
  return `follow ${prefix}: ${prompt}`;
}
274
/**
 * Tell whether a response value is a string starting with one of the element
 * locator prefixes.
 *
 * @param response any value taken from the AI response
 * @returns {boolean}
 */
function ifElementTypeResponse(response) {
  const isText = typeof response === "string";
  return isText && (response.startsWith(ONE_ELEMENT_LOCATOR_PREFIX) || response.startsWith(ELEMENTS_LOCATOR_PREFIX));
}
280
/**
 * Parse a locator response into element id(s).
 *
 * - `LOCATE_ONE_ELEMENT/<id>` yields the id string, or `null` when empty.
 * - `LOCATE_ONE_OR_MORE_ELEMENTS/<id>,<id>,...` yields an array of ids
 *   (empty when nothing follows the prefix).
 * - Anything else yields `null`.
 *
 * Ids are trimmed and empty entries dropped: the prompt's own examples put
 * spaces around the values, and an id such as " 2" would never match an
 * element.
 *
 * @param {string} response raw locator string from the AI response
 * @returns {string | string[] | null}
 */
function splitElementResponse(response) {
  const oneElementSplitter = `${ONE_ELEMENT_LOCATOR_PREFIX}/`;
  if (response.startsWith(oneElementSplitter)) {
    const id = response.slice(oneElementSplitter.length).trim();
    if (id.indexOf(",") >= 0) {
      console.warn(`unexpected comma in one element response: ${id}`);
    }
    return id ? id : null;
  }
  const elementsSplitter = `${ELEMENTS_LOCATOR_PREFIX}/`;
  if (response.startsWith(elementsSplitter)) {
    const idsString = response.slice(elementsSplitter.length);
    return idsString.split(",").map((id) => id.trim()).filter((id) => id.length > 0);
  }
  return null;
}
299
/**
 * Tag a prompt as a section query by prepending the section-matcher flag.
 *
 * @param {string} prompt description of the section
 * @returns {string}
 */
function retrieveSection(prompt) {
  return [SECTION_MATCHER_FLAG, prompt].map((part) => `${part}`).join("");
}
302
/**
 * Recover the section prompt from a value produced by `retrieveSection`.
 *
 * @param input any data-demand value
 * @returns {string | false} the text after the section-matcher flag, or
 *   `false` when the input is not a flagged string
 */
function extractSectionQuery(input) {
  const isFlagged = typeof input === "string" && input.startsWith(SECTION_MATCHER_FLAG);
  return isFlagged ? input.slice(SECTION_MATCHER_FLAG.length) : false;
}
308
+
309
+ // src/insight/utils.ts
310
+ import { existsSync as existsSync2 } from "fs";
311
+ import { join as join2 } from "path";
312
+ import { randomUUID as randomUUID2 } from "crypto";
313
+ import assert4 from "assert";
314
// Name (without extension) of this process's dump file; chosen on first write.
var logFileName = "";
// Serialised dump entries, in insertion order.
var logContent = [];
// Maps a log id to its position in `logContent` so an entry can be replaced.
var logIdIndexMap = {};
var { pid } = process;
var logFileExt = insightDumpFileExt;
/**
 * Record one insight dump entry and rewrite the dump file.
 *
 * A new `logId` appends an entry; a known `logId` replaces its earlier entry.
 * The whole list is rewritten as a JSON array on every call.
 *
 * The file name is `pid_<pid>_<time>`; if that file already exists a random
 * suffix is appended. The fallback used to drop the `pid_` prefix, so dumps
 * from one run could follow two naming schemes; it now keeps the same base.
 *
 * @param data dump payload, merged over `{ logId, sdkVersion, logTime }`
 * @param {string} [logId] id of an existing entry to replace
 * @param {Function} [dumpSubscriber] called with the merged entry before writing
 * @returns {string} the id of the written entry
 */
function writeInsightDump(data, logId, dumpSubscriber) {
  const logDir2 = getDumpDir();
  assert4(logDir2, "logDir should be set before writing dump file");
  const id = logId || randomUUID2();
  const baseData = {
    sdkVersion: getPkgInfo().version,
    logTime: Date.now()
  };
  const finalData = __spreadValues(__spreadValues({
    logId: id
  }, baseData), data);
  dumpSubscriber == null ? void 0 : dumpSubscriber(finalData);
  if (!logFileName) {
    const baseName = `pid_${pid}_${baseData.logTime}`;
    logFileName = baseName;
    while (existsSync2(join2(logDir2, `${logFileName}.${logFileExt}`))) {
      logFileName = `${baseName}-${Math.random()}`;
    }
  }
  const dataString = JSON.stringify(finalData, null, 2);
  if (typeof logIdIndexMap[id] === "number") {
    logContent[logIdIndexMap[id]] = dataString;
  } else {
    const length = logContent.push(dataString);
    logIdIndexMap[id] = length - 1;
  }
  writeDumpFile(logFileName, logFileExt, `[
${logContent.join(",\n")}
]`);
  return id;
}
349
/**
 * Resolve a list of ids to elements, skipping (and warning about) ids the
 * lookup cannot resolve.
 *
 * @param ids array of element ids
 * @param {Function} elementById lookup returning the element or a falsy value
 * @returns {Array} the elements that were found, in id order
 */
function idsIntoElements(ids, elementById) {
  const found = [];
  ids.forEach((id) => {
    const element = elementById(id);
    if (!element) {
      console.warn(`element not found by id: ${id}`);
      return;
    }
    found.push(element);
  });
  return found;
}
360
/**
 * Replace id references on the top level of `data`, in place.
 *
 * A string value accepted by `ifMeet` is replaced by `elementsById(value)`.
 * For an array value, each entry accepted by `ifMeet` is replaced the same
 * way and the other entries are kept.
 *
 * @param data object to update (mutated)
 * @param {Function} ifMeet predicate deciding whether a value is an id reference
 * @param {Function} elementsById resolver for an id reference
 * @returns the same `data` object
 */
function shallowExpandIds(data, ifMeet, elementsById) {
  for (const key of Object.keys(data)) {
    const value = data[key];
    if (typeof value === "string" && ifMeet(value)) {
      data[key] = elementsById(value);
      continue;
    }
    if (Array.isArray(value)) {
      data[key] = value.map((entry) => {
        return ifMeet(entry) ? elementsById(entry) : entry;
      });
    }
  }
  return data;
}
373
/**
 * Turn a section as returned by the model (carrying `textIds`) into a full
 * section whose `content` holds the resolved elements and whose `rect` is the
 * bounding box around them.
 *
 * Changes from the previous implementation:
 * - a section without `textIds` no longer throws; it is treated as having no texts;
 * - the bounding box no longer relies on `-1` sentinels, which mis-handled
 *   elements with negative coordinates.
 * A section with no resolvable texts still gets the rect
 * `{ left: -1, top: -1, width: 0, height: 0 }`, as before.
 *
 * @param liteSection section object from the AI response
 * @param {Function} elementById lookup from id to element
 * @returns the section fields (minus `textIds`) plus `content` and `rect`
 */
function expandLiteSection(liteSection, elementById) {
  const _a = liteSection, { textIds } = _a, remainingFields = __objRest(_a, ["textIds"]);
  const texts = idsIntoElements(textIds || [], elementById);
  let leftMost = Infinity;
  let topMost = Infinity;
  let rightMost = -Infinity;
  let bottomMost = -Infinity;
  texts.forEach((text) => {
    leftMost = Math.min(leftMost, text.rect.left);
    topMost = Math.min(topMost, text.rect.top);
    rightMost = Math.max(rightMost, text.rect.left + text.rect.width);
    bottomMost = Math.max(bottomMost, text.rect.top + text.rect.height);
  });
  const sectionRect = texts.length > 0 ? {
    left: leftMost,
    top: topMost,
    width: rightMost - leftMost,
    height: bottomMost - topMost
  } : {
    left: -1,
    top: -1,
    width: 0,
    height: 0
  };
  const section = __spreadProps(__spreadValues({}, remainingFields), {
    content: texts,
    rect: sectionRect
  });
  return section;
}
398
+
399
+ // src/ai-model/openai.ts
400
+ import assert5 from "assert";
401
+ import OpenAI from "openai";
402
+ import wrapper from "langsmith/wrappers";
403
+
404
+ // src/types.ts
405
// Runtime placeholder for the element base type; it has no members here.
var BaseElement = class {
};
// String enum of the response formats passed as `response_format.type`.
var AIResponseFormat = /* @__PURE__ */ (() => {
  const formats = AIResponseFormat || {};
  formats["JSON"] = "json_object";
  formats["TEXT"] = "text";
  return formats;
})();
// Runtime placeholder for the UI context type; it has no members here.
var UIContext = class {
};
414
+
415
+ // src/ai-model/openai.ts
416
// Environment variable names read by the OpenAI integration.
var envConfigKey = "MIDSCENE_OPENAI_INIT_CONFIG_JSON";
var envModelKey = "MIDSCENE_MODEL_NAME";
var envSmithDebug = "MIDSCENE_LANGSMITH_DEBUG";
// Options passed to the OpenAI client constructor; taken from the JSON in
// MIDSCENE_OPENAI_INIT_CONFIG_JSON when that variable is set.
var extraConfig = {};
const rawOpenAIConfig = process.env[envConfigKey];
if (typeof rawOpenAIConfig === "string") {
  console.log("will use env config for openai");
  extraConfig = JSON.parse(rawOpenAIConfig);
}
// Model name; MIDSCENE_MODEL_NAME overrides the default.
var model = "gpt-4o";
const modelFromEnv = process.env[envModelKey];
if (typeof modelFromEnv === "string") {
  console.log(`will use model: ${modelFromEnv}`);
  model = modelFromEnv;
}
429
/**
 * Create the OpenAI client used for model calls.
 *
 * When MIDSCENE_LANGSMITH_DEBUG is set the client is wrapped with the
 * LangSmith tracing wrapper. The wrapper now receives the client built from
 * `extraConfig`; previously the debug branch constructed a second, bare
 * `new OpenAI()` and so dropped the user's configuration.
 *
 * @returns {Promise<object>} the (possibly wrapped) OpenAI client
 */
async function createOpenAI() {
  const openai = new OpenAI(extraConfig);
  if (process.env[envSmithDebug]) {
    console.log("DEBUGGING MODE: using langsmith wrapper");
    return wrapper.wrapOpenAI(openai);
  }
  return openai;
}
438
/**
 * Send chat messages to the configured model and return the text of the
 * first choice.
 *
 * @param messages chat messages for the completion request
 * @param {string} responseFormat value used as `response_format.type`
 * @returns {Promise<string>} content of the first choice
 * Fails the assertion when that content is empty.
 */
async function call(messages, responseFormat) {
  const client = await createOpenAI();
  const request = {
    model,
    messages,
    response_format: { type: responseFormat }
  };
  const completion = await client.chat.completions.create(request);
  const content = completion.choices[0].message.content;
  assert5(content, "empty content");
  return content;
}
449
/**
 * Call the model in JSON mode and parse the reply.
 *
 * @param messages chat messages for the completion request
 * @returns {Promise<any>} the parsed JSON value
 */
async function callToGetJSONObject(messages) {
  const jsonFormat = "json_object" /* JSON */;
  const rawReply = await call(messages, jsonFormat);
  assert5(rawReply, "empty response");
  return JSON.parse(rawReply);
}
454
+
455
+ // src/ai-model/prompt/element_inspector.ts
456
/**
 * Build the system prompt used to locate element(s) matching a description.
 *
 * @param {string} description the user's description of the target element
 * @param multi truthy to ask for several matches, falsy for the closest one
 * @returns {string} the full system prompt
 */
function systemPromptToFindElement(description, multi) {
  const expectedCount = multi ? "multiple elements matching the description (two or more)" : "The element closest to the description (only one)";
  return `
## Role:
You are an expert in software page image (2D) and page element text analysis.

## Objective:
- Identify elements in screenshots and text that match the user's description.
- Return JSON data containing the selection reason and element ID.

## Skills:
- Image analysis and recognition
- Multilingual text understanding
- Software UI design and testing

## Workflow:
1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
2. Based on the description (${description}), locate the target element ID in the list of element descriptions and the screenshot.
3. Return the number of elements: ${expectedCount}.
4. Return JSON data containing the selection reason and element ID.

## Constraints:
- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
- If no elements are found, the "elements" array should be empty.
- The returned data must conform to the specified JSON format.

## Output Format:
\`\`\`json
{
"elements": [
// If no matching elements are found, return an empty array []
{
"reason": "xxx", // The thought process for finding the element, replace xxx with your thought process
"text": "xxx", // Replace xxx with the text of elementInfo, if none, leave empty
"id": "xxx" // Replace xxx with the ID of elementInfo
}
// More elements...
],
"errors": [] // Array of strings containing any error messages
}
\`\`\`

## Example:
Example 1:
Input Example:
\`\`\`json
// Description: "Shopping cart icon in the upper right corner"
{
"screenshot": "path/screenshot.png",
"text": '{
"pageSize": {
"width": 400, // Width of the page
"height": 905 // Height of the page
},
"elementInfos": [
{
"id": "3", // ID of the element
"attributes": { // Attributes of the element
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
"src": "https://ap-southeast-3.m",
"class": ".img"
},
"content": "", // Text content of the element
"rect": {
"left": 280, // Distance from the left side of the page
"top": 8, // Distance from the top of the page
"width": 44, // Width of the element
"height": 44 // Height of the element
}
},
{
"id": "4", // ID of the element
"attributes": { // Attributes of the element
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
"src": "data:image/png;base64,iVBORw0KGgoAAAANSU...",
"class": ".icon"
},
"content": "", // Text content of the element
"rect": {
"left": 350, // Distance from the left side of the page
"top": 16, // Distance from the top of the page
"width": 25, // Width of the element
"height": 25 // Height of the element
}
},
...
{
"id": "27",
"attributes": {
"nodeType": "TEXT Node",
"class": ".product-name"
},
"center": [
288,
834
],
"content": "Mango Drink",
"rect": {
"left": 188,
"top": 827,
"width": 199,
"height": 13
}
},
...
]
}
'
}
\`\`\`
Output Example:
\`\`\`json
{
"elements": [
{
// Describe the reason for finding this element, replace with actual value in practice
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
"text": "",
// ID of this element, replace with actual value in practice
"id": "4"
}
],
"errors": []
}
\`\`\`

`;
}
585
+
586
+ // src/ai-model/inspect.ts
587
/**
 * Ask the model to find element(s) on the page that match a description.
 *
 * @param options `{ context, multi, findElementDescription, callAI }`;
 *   `callAI` defaults to `callToGetJSONObject`
 * @returns {Promise<{parseResult: any, elementById: Function, systemPrompt: string}>}
 *   the AI reply, the id lookup for the described page, and the system prompt used
 */
async function AiInspectElement(options) {
  const { context, multi, findElementDescription, callAI = callToGetJSONObject } = options;
  const { screenshotBase64 } = context;
  const { description, elementById } = await describeUserPage(context);
  const systemPrompt = systemPromptToFindElement(findElementDescription, multi);
  const screenshotPart = {
    type: "image_url",
    image_url: {
      url: screenshotBase64,
      detail: "high"
    }
  };
  const pageTextPart = {
    type: "text",
    text: description
  };
  const msgs = [
    { role: "system", content: systemPrompt },
    { role: "user", content: [screenshotPart, pageTextPart] }
  ];
  const parseResult = await callAI(msgs);
  return { parseResult, elementById, systemPrompt };
}
618
/**
 * Ask the model to extract data (and optionally sections) from the page.
 *
 * @param options `{ dataQuery, sectionConstraints, context, callAI }`;
 *   `callAI` defaults to `callToGetJSONObject`
 * @returns {Promise<{parseResult: any, elementById: Function, systemPrompt: string}>}
 *   the AI reply, the id lookup for the described page, and the system prompt used
 */
async function AiExtractElementInfo(options) {
  const { dataQuery, sectionConstraints, context, callAI = callToGetJSONObject } = options;
  const systemPrompt = systemPromptToExtract(dataQuery, sectionConstraints);
  const { screenshotBase64 } = context;
  const { description, elementById } = await describeUserPage(context);
  const screenshotPart = {
    type: "image_url",
    image_url: {
      url: screenshotBase64
    }
  };
  const pageTextPart = {
    type: "text",
    text: description
  };
  const msgs = [
    { role: "system", content: systemPrompt },
    { role: "user", content: [screenshotPart, pageTextPart] }
  ];
  const parseResult = await callAI(msgs);
  return { parseResult, elementById, systemPrompt };
}
648
+
649
+ // src/insight/index.ts
650
/**
 * Comparator ordering items by `rect.top`, then by `rect.left`.
 *
 * @returns {number} negative when `a` comes first, positive when `b` does
 */
var sortByOrder = (a, b) => {
  const verticalGap = a.rect.top - b.rect.top;
  return verticalGap !== 0 ? verticalGap : a.rect.left - b.rect.left;
};
657
/**
 * Entry point for AI-backed page understanding: `locate` finds elements by
 * description and `extract` pulls structured data (and sections) out of the
 * page. Every call records a dump entry through `writeInsightDump`.
 */
var Insight = class {
  /**
   * @param context a UI context object, or a function returning a promise of one
   * @param opt optional; `aiVendorFn` replaces the default AI caller and
   *   `taskInfo` is merged into the task info of every dump
   */
  constructor(context, opt) {
    __publicField(this, "contextRetrieverFn");
    __publicField(this, "aiVendorFn", callToGetJSONObject);
    __publicField(this, "onceDumpUpdatedFn");
    __publicField(this, "taskInfo");
    assert6(context, "context is required for Insight");
    if (typeof context === "function") {
      this.contextRetrieverFn = context;
    } else {
      this.contextRetrieverFn = () => Promise.resolve(context);
    }
    if (typeof (opt == null ? void 0 : opt.aiVendorFn) !== "undefined") {
      this.aiVendorFn = opt.aiVendorFn;
    }
    if (typeof (opt == null ? void 0 : opt.taskInfo) !== "undefined") {
      this.taskInfo = opt.taskInfo;
    }
  }
  /**
   * Locate element(s) matching a natural-language description.
   *
   * @param {string} queryPrompt description of the element
   * @param opt optional; a truthy `opt.multi` returns every match
   * @returns with `multi`: an array of elements; otherwise the first matched
   *   element or `null`
   * @throws {Error} when the AI response carries a non-empty `errors` list
   */
  async locate(queryPrompt, opt) {
    var _a;
    assert6(queryPrompt, "query is required for located");
    // The subscriber is consumed by this call and cleared for the next one.
    const dumpSubscriber = this.onceDumpUpdatedFn;
    this.onceDumpUpdatedFn = void 0;
    const context = await this.contextRetrieverFn();
    const startTime = Date.now();
    const { parseResult, systemPrompt, elementById } = await AiInspectElement({
      callAI: this.aiVendorFn,
      context,
      multi: Boolean(opt == null ? void 0 : opt.multi),
      findElementDescription: queryPrompt
    });
    const timeCost = Date.now() - startTime;
    const taskInfo = __spreadProps(__spreadValues({}, this.taskInfo ? this.taskInfo : {}), {
      durationMs: timeCost,
      rawResponse: JSON.stringify(parseResult),
      systemPrompt
    });
    let errorLog;
    if ((_a = parseResult.errors) == null ? void 0 : _a.length) {
      errorLog = `locate - AI response error:
${parseResult.errors.join("\n")}`;
    }
    const dumpData = {
      type: "locate",
      context,
      userQuery: {
        element: queryPrompt
      },
      matchedSection: [],
      matchedElement: [],
      data: null,
      taskInfo,
      error: errorLog
    };
    // First dump records the raw outcome; it is updated below once the
    // returned ids have been resolved to elements.
    const logId = writeInsightDump(dumpData, void 0, dumpSubscriber);
    if (errorLog) {
      console.error(errorLog);
      throw new Error(errorLog);
    }
    const elements = [];
    // The response is model output: tolerate a missing or malformed
    // `elements` field instead of throwing a TypeError.
    const reportedElements = Array.isArray(parseResult.elements) ? parseResult.elements : [];
    reportedElements.forEach((item) => {
      const element = elementById(item.id);
      if (!element) {
        console.warn(`locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`);
        return;
      }
      elements.push(element);
    });
    writeInsightDump(
      __spreadProps(__spreadValues({}, dumpData), {
        matchedElement: elements
      }),
      logId,
      dumpSubscriber
    );
    if (opt == null ? void 0 : opt.multi) {
      return elements;
    } else if (elements.length >= 2) {
      console.warn(`locate: multiple elements found, return the first one. (query: ${queryPrompt})`);
      return elements[0];
    } else if (elements.length === 1) {
      return elements[0];
    } else {
      return null;
    }
  }
  /**
   * Extract data from the page according to a demand.
   *
   * @param dataDemand a string, or an object mapping result keys to queries;
   *   values produced by `retrieveSection` are treated as section queries
   * @returns the `data` of the AI response; when it is a plain object, locator
   *   strings on its top level are replaced by elements and the matched
   *   sections are merged in by name
   * @throws {Error} when the AI response carries a non-empty `errors` list
   */
  async extract(dataDemand) {
    var _a;
    let dataQuery = {};
    const sectionQueryMap = {};
    assert6(
      typeof dataDemand === "object" || typeof dataDemand === "string",
      `dataDemand should be object or string, but get ${typeof dataDemand}`
    );
    const dumpSubscriber = this.onceDumpUpdatedFn;
    this.onceDumpUpdatedFn = void 0;
    if (typeof dataDemand === "string") {
      dataQuery = dataDemand;
    } else {
      for (const key in dataDemand) {
        const query = dataDemand[key];
        const sectionQuery = extractSectionQuery(query);
        if (sectionQuery) {
          sectionQueryMap[key] = sectionQuery;
        } else {
          dataQuery[key] = query;
        }
      }
      // NOTE(review): this assignment discards the filtered `dataQuery` built
      // above, so section queries are also sent as data demands. Kept as-is
      // because changing it alters the prompt — confirm the intent.
      dataQuery = dataDemand;
    }
    const sectionConstraints = Object.keys(sectionQueryMap).map((name) => {
      const sectionQueryPrompt = sectionQueryMap[name];
      return {
        name,
        description: sectionQueryPrompt || ""
      };
    });
    const context = await this.contextRetrieverFn();
    const startTime = Date.now();
    const { parseResult, systemPrompt, elementById } = await AiExtractElementInfo({
      context,
      dataQuery,
      sectionConstraints,
      callAI: this.aiVendorFn
    });
    const timeCost = Date.now() - startTime;
    const taskInfo = __spreadProps(__spreadValues({}, this.taskInfo ? this.taskInfo : {}), {
      durationMs: timeCost,
      rawResponse: JSON.stringify(parseResult),
      systemPrompt
    });
    let errorLog;
    if ((_a = parseResult.errors) == null ? void 0 : _a.length) {
      errorLog = `segment - AI response error:
${parseResult.errors.join("\n")}`;
    }
    const dumpData = {
      type: "extract",
      context,
      userQuery: {
        dataDemand
      },
      matchedSection: [],
      matchedElement: [],
      data: null,
      taskInfo,
      error: errorLog
    };
    const logId = writeInsightDump(dumpData, void 0, dumpSubscriber);
    if (errorLog) {
      console.error(errorLog);
      throw new Error(errorLog);
    }
    const sectionsArr = (parseResult.sections || []).map((liteSection) => {
      const section = expandLiteSection(liteSection, (id) => elementById(id));
      return section;
    }).sort(sortByOrder);
    // Index sections by name; a repeated name gets a numeric suffix.
    const sectionMap = sectionsArr.reduce((acc, section) => {
      const { name } = section;
      if (acc[name]) {
        let i = 1;
        while (acc[`${name}_${i}`]) {
          i++;
        }
        console.warn(`section name conflict: ${name}, rename to ${name}_${i}`);
        acc[`${name}_${i}`] = section;
      } else {
        acc[name] = section;
      }
      return acc;
    }, {});
    const { data } = parseResult;
    let mergedData = data;
    if (data && typeof data === "object" && !Array.isArray(data)) {
      shallowExpandIds(data, ifElementTypeResponse, (id) => {
        const idList = splitElementResponse(id);
        if (typeof idList === "string") {
          return elementById(idList);
        } else if (Array.isArray(idList)) {
          return idsIntoElements(idList, elementById);
        }
        return idList;
      });
      mergedData = __spreadValues(__spreadValues({}, data), sectionMap);
    }
    writeInsightDump(
      __spreadProps(__spreadValues({}, dumpData), {
        matchedSection: Object.values(sectionMap),
        data: mergedData
      }),
      logId,
      dumpSubscriber
    );
    return mergedData;
  }
};
854
+
855
+ // src/action/executor.ts
856
+ import assert7 from "assert";
857
/**
 * Convert whatever a task executor threw into the text stored on the task.
 *
 * Error-like values (anything exposing `message` or `stack`) keep the
 * historical "<message>\n<stack>" layout. Every other value (a string, a
 * number, a plain object, null, ...) is stringified so the real cause is not
 * reported as "undefined\nundefined".
 *
 * @param {*} thrown - the value caught from a failing task
 * @returns {string} human readable description of the failure
 */
function describeThrownValue(thrown) {
  if (thrown != null && (thrown.message !== undefined || thrown.stack !== undefined)) {
    return `${thrown.message}\n${thrown.stack}`;
  }
  try {
    return String(thrown);
  } catch (conversionError) {
    // e.g. an object created with Object.create(null) has no toString.
    return Object.prototype.toString.call(thrown);
  }
}

/**
 * Runs a list of tasks strictly one after another and records, on each task
 * object, its status, timing and whatever its executor returned.
 *
 * Executor status values used here: "init" (no tasks yet), "pending",
 * "running", "completed" and "error".
 * Task status values used here: "pending", "running", "success", "fail" and
 * "cancelled".
 */
var Executor = class {
  /**
   * @param {string} name - stored as-is and echoed by dump()
   * @param {string} description - stored as-is and echoed by dump()
   * @param {Array<object>} [tasks] - initial tasks; each one is copied and
   *   defaulted to status "pending"
   */
  constructor(name, description, tasks) {
    // Assigned in this order so the own-property order of the instance stays
    // name, description, tasks, status, errorMsg, dumpFileName.
    this.name = name;
    this.description = description;
    this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
    // status of executor
    this.status = tasks && tasks.length > 0 ? "pending" : "init";
    this.errorMsg = void 0;
    this.dumpFileName = void 0;
  }

  /**
   * Return a shallow copy of `task` whose status defaults to "pending".
   * A `status` already present on `task` wins over the default.
   *
   * @param {object} task
   * @returns {object} a new task object
   */
  markTaskAsPending(task) {
    return Object.assign({ status: "pending" }, task);
  }

  /**
   * Queue one task or an array of tasks. Unless the executor is currently
   * running, its status becomes "pending" so flush() can be called (again).
   *
   * @param {object|Array<object>} task
   * @returns {Promise<void>} rejects with an AssertionError when the executor
   *   is in the "error" state
   */
  async append(task) {
    assert7(this.status !== "error", "executor is in error state, cannot append task");
    if (Array.isArray(task)) {
      this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
    } else {
      this.tasks.push(this.markTaskAsPending(task));
    }
    if (this.status !== "running") {
      this.status = "pending";
    }
  }

  /**
   * Execute every task from the first pending one to the end of the list.
   *
   * Each task's `executor(param, context)` is awaited in turn; `context`
   * carries the task itself and the `element` of the most recent successful
   * Insight/Locate output. The returned object is merged into the task.
   * On the first failure the task is marked "fail", all later tasks are marked
   * "cancelled", the executor moves to "error" and an Error is thrown.
   *
   * @returns {Promise<void>}
   * @throws {Error} "executor failed: ..." when a task fails
   * @throws {AssertionError} when called while running / completed / in error,
   *   or when a task to run is not in the "pending" state
   */
  async flush() {
    if (this.status === "init" && this.tasks.length > 0) {
      console.warn("illegal state for executor, status is init but tasks are not empty");
    }
    assert7(this.status !== "running", "executor is already running");
    assert7(this.status !== "completed", "executor is already completed");
    assert7(this.status !== "error", "executor is in error state");

    const nextPendingIndex = this.tasks.findIndex((task) => task.status === "pending");
    if (nextPendingIndex < 0) {
      return;
    }

    this.status = "running";
    let taskIndex = nextPendingIndex;
    let successfullyCompleted = true;
    let errorMsg = "";
    let previousFindOutput;

    while (taskIndex < this.tasks.length) {
      const task = this.tasks[taskIndex];
      const isPending = task.status === "pending";
      const notPendingMsg = `task status should be pending, but got: ${task.status}`;
      if (!isPending) {
        // The assertion below aborts flush(); record a terminal state first so
        // the executor is not left claiming to be "running" forever.
        this.status = "error";
        this.errorMsg = notPendingMsg;
      }
      assert7(isPending, notPendingMsg);

      task.timing = {
        start: Date.now()
      };
      try {
        task.status = "running";
        assert7(
          ["Insight", "Action", "Planning"].indexOf(task.type) >= 0,
          `unsupported task type: ${task.type}`
        );
        const { executor, param } = task;
        assert7(executor, `executor is required for task type: ${task.type}`);
        const executorContext = {
          task,
          element: previousFindOutput == null ? void 0 : previousFindOutput.element
        };
        if (task.type === "Insight") {
          assert7(
            task.subType === "Locate" || task.subType === "Query",
            `unsupported insight subType: ${task.subType}`
          );
        }
        // Called as a method of the task, exactly like before.
        const returnValue = await task.executor(param, executorContext);
        if (task.type === "Insight" && task.subType === "Locate") {
          // Remember the located output so following tasks can act on it.
          previousFindOutput = returnValue == null ? void 0 : returnValue.output;
        }
        Object.assign(task, returnValue);
        task.status = "success";
        task.timing.end = Date.now();
        task.timing.cost = task.timing.end - task.timing.start;
        taskIndex++;
      } catch (e) {
        successfullyCompleted = false;
        task.status = "fail";
        errorMsg = describeThrownValue(e);
        task.error = errorMsg;
        task.timing.end = Date.now();
        task.timing.cost = task.timing.end - task.timing.start;
        this.errorMsg = errorMsg;
        break;
      }
    }

    // After a failure taskIndex still points at the failed task; everything
    // behind it will never run. After a clean run this loop is a no-op.
    for (let i = taskIndex + 1; i < this.tasks.length; i++) {
      this.tasks[i].status = "cancelled";
    }

    if (successfullyCompleted) {
      this.status = "completed";
    } else {
      this.status = "error";
      throw new Error(`executor failed: ${errorMsg}`);
    }
  }

  /**
   * Build a snapshot describing this executor.
   *
   * @returns {{sdkVersion: *, logTime: number, name: *, description: *, tasks: Array<object>}}
   *   `tasks` is the live task array, not a copy.
   */
  dump() {
    const dumpData = {
      sdkVersion: getPkgInfo().version,
      logTime: Date.now(),
      name: this.name,
      description: this.description,
      tasks: this.tasks
    };
    return dumpData;
  }
};
975
+
976
+ // src/query/index.ts
977
// Public query helpers: plain aliases of retrieveElement / retrieveSection,
// which are defined outside this chunk. Both alias names appear in the
// export list at the end of the file.
+ var getElement = retrieveElement;
977
+ var getSection = retrieveSection;
979
+
980
+ // src/automation/planning.ts
981
// Persona sentence placed at the very top of the task-planning system prompt.
var characteristic2 = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";

/**
 * Build the system prompt that asks the model to break a user task down into
 * a list of UI actions and to answer in a fixed JSON shape.
 *
 * The wording uses the action name 'Locate' and the result field `error`
 * consistently, so the worked example and the failure instructions match the
 * action list and the JSON format described in the same prompt.
 *
 * @param {string} query - the user's task description; interpolated verbatim
 *   between the two separator lines at the end of the prompt
 * @returns {string} the complete system prompt
 */
function systemPromptToTaskPlanning(query) {
  return `
${characteristic2}

Based on the page context information (screenshot and description) you get, decompose the task user asked into a series of actions.
Actions are executed in the order listed in the list. After executing the actions, the task should be completed.

Each action has a type and corresponding param. To be detailed:
* type: 'Locate', it means to locate one element
* param: { prompt: string }, the prompt describes 'which element to focus on page'. Our AI engine will use this prompt to locate the element, so it should clearly describe the obvious features of the element, such as its content, color, size, shape, and position. For example, 'The biggest Download Button on the left side of the page.'
* type: 'Tap', tap the previous element found
* param: null
* type: 'Hover', hover the previous element found
* param: null
* type: 'Input', 'KeyboardPress', input something or press a key
* param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
* type: 'Scroll'
* param: { scrollType: 'ScrollUntilBottom', 'ScrollUntilTop', 'ScrollDown', 'ScrollUp' }
* type: 'Error'
* param: { message: string }, the error message

Here is an example of how to decompose a task.
When a user says 'Input "Weather in Shanghai" into the search bar, hit enter', by viewing the page screenshot and description, you my decompose this task into something like this:
* Locate: 'The search bar'
* Input: 'Weather in Shanghai'
* KeyboardPress: 'Enter'

Remember:
1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Locate one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.

If any error occurs during the task planning (like the page content and task are irrelevant, or the element mentioned does not exist at all), please return the error message with explanation in the error field. The thoughts、prompts、error messages should all in the same language as the user query.

Return in the following JSON format:
{
queryLanguage: '', // language of the description of the task
actions: [ // always return in Array
{
"thought": "Reasons for generating this task, and why this task is feasible on this page",
"type": "Tap", // Type of action, like 'Tap' 'Hover' ...
"param": any, // Parameter towards the task type
},
// ... more actions
],
error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here
}

Here is the description of the task. Just go ahead:
=====================================
${query}
=====================================
`;
}
1035
/**
 * Ask the model to turn a user instruction into a list of UI actions.
 *
 * Sends the task-planning system prompt together with the page screenshot and
 * the page description obtained from describeUserPage(context), then
 * validates the JSON answer.
 *
 * @param {object} context - page context; `screenshotBase64` is read here and
 *   the whole object is handed to describeUserPage
 * @param {string} userPrompt - the task described by the user
 * @returns {Promise<{plans: Array<object>}>} the planned actions, in order
 * @throws {Error} when the answer carries a top-level `error`, when it has no
 *   `actions` array, or when any action has type 'Error'
 */
async function plan(context, userPrompt) {
  const { screenshotBase64 } = context;
  const { description } = await describeUserPage(context);
  const systemPrompt = systemPromptToTaskPlanning(userPrompt);
  const msgs = [
    { role: "system", content: systemPrompt },
    {
      role: "user",
      content: [
        {
          type: "image_url",
          image_url: {
            url: screenshotBase64,
            detail: "high"
          }
        },
        {
          type: "text",
          text: description
        }
      ]
    }
  ];
  const planFromAI = await callToGetJSONObject(msgs);

  // Fail with an explicit message instead of a TypeError on malformed answers.
  if (planFromAI == null || typeof planFromAI !== "object") {
    throw new Error(`invalid planning response from AI: ${String(planFromAI)}`);
  }
  if (planFromAI.error) {
    throw new Error(planFromAI.error);
  }
  const { actions } = planFromAI;
  if (!Array.isArray(actions)) {
    throw new Error("invalid planning response from AI: `actions` should be an array");
  }

  for (const task of actions) {
    if (task && task.type === "Error") {
      // The planning prompt defines the Error action's text as param.message;
      // fall back to the thought, which is what used to be reported.
      const reported = task.param && typeof task.param.message === "string" ? task.param.message : "";
      throw new Error(reported || task.thought || "task planning failed without an explanation");
    }
  }
  return { plans: actions };
}
1070
+
1071
+ // src/index.ts
1072
+ var src_default = Insight;
1073
+ export {
1074
+ AIResponseFormat,
1075
+ BaseElement,
1076
+ Executor,
1077
+ UIContext,
1078
+ src_default as default,
1079
+ getElement,
1080
+ getSection,
1081
+ plan,
1082
+ setDumpDir
1083
+ };