@midscene/core 0.26.2 → 0.26.3-beta-20250813075706.0

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (47)
  1. package/dist/es/ai-model.mjs +2502 -0
  2. package/dist/es/ai-model.mjs.map +1 -0
  3. package/dist/es/index.mjs +2362 -0
  4. package/dist/es/index.mjs.map +1 -0
  5. package/dist/es/tree.mjs +2 -0
  6. package/dist/es/utils.mjs +188 -0
  7. package/dist/es/{chunk-O3KUKF2A.js.map → utils.mjs.map} +1 -1
  8. package/dist/lib/ai-model.js +2581 -3
  9. package/dist/lib/ai-model.js.map +1 -0
  10. package/dist/lib/index.js +2375 -493
  11. package/dist/lib/index.js.map +1 -1
  12. package/dist/lib/tree.js +42 -11
  13. package/dist/lib/tree.js.map +1 -1
  14. package/dist/lib/utils.js +257 -29
  15. package/dist/lib/utils.js.map +1 -0
  16. package/dist/types/ai-model.d.ts +505 -99
  17. package/dist/types/index.d.ts +1299 -53
  18. package/dist/types/tree.d.ts +11 -1
  19. package/dist/types/utils.d.ts +47 -33
  20. package/package.json +28 -13
  21. package/dist/es/ai-model.d.ts +0 -99
  22. package/dist/es/ai-model.js +0 -44
  23. package/dist/es/chunk-DDYIQHOA.js +0 -2883
  24. package/dist/es/chunk-DDYIQHOA.js.map +0 -1
  25. package/dist/es/chunk-O3KUKF2A.js +0 -265
  26. package/dist/es/index.d.ts +0 -53
  27. package/dist/es/index.js +0 -570
  28. package/dist/es/index.js.map +0 -1
  29. package/dist/es/llm-planning-4e0c16fe.d.ts +0 -106
  30. package/dist/es/tree.d.ts +0 -1
  31. package/dist/es/tree.js +0 -13
  32. package/dist/es/tree.js.map +0 -1
  33. package/dist/es/types-8a6be57c.d.ts +0 -577
  34. package/dist/es/utils.d.ts +0 -33
  35. package/dist/es/utils.js +0 -30
  36. package/dist/lib/ai-model.d.ts +0 -99
  37. package/dist/lib/chunk-DDYIQHOA.js +0 -2883
  38. package/dist/lib/chunk-DDYIQHOA.js.map +0 -1
  39. package/dist/lib/chunk-O3KUKF2A.js +0 -265
  40. package/dist/lib/chunk-O3KUKF2A.js.map +0 -1
  41. package/dist/lib/index.d.ts +0 -53
  42. package/dist/lib/llm-planning-4e0c16fe.d.ts +0 -106
  43. package/dist/lib/tree.d.ts +0 -1
  44. package/dist/lib/types-8a6be57c.d.ts +0 -577
  45. package/dist/lib/utils.d.ts +0 -33
  46. package/dist/types/llm-planning-4e0c16fe.d.ts +0 -106
  47. package/dist/types/types-8a6be57c.d.ts +0 -577
@@ -1,2883 +0,0 @@
1
- // src/ai-model/service-caller/index.ts
2
- import { Anthropic } from "@anthropic-ai/sdk";
3
- import {
4
- DefaultAzureCredential,
5
- getBearerTokenProvider
6
- } from "@azure/identity";
7
- import {
8
- ANTHROPIC_API_KEY,
9
- AZURE_OPENAI_API_VERSION,
10
- AZURE_OPENAI_DEPLOYMENT,
11
- AZURE_OPENAI_ENDPOINT,
12
- AZURE_OPENAI_KEY,
13
- MIDSCENE_API_TYPE,
14
- MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
15
- MIDSCENE_AZURE_OPENAI_SCOPE,
16
- MIDSCENE_DEBUG_AI_PROFILE,
17
- MIDSCENE_DEBUG_AI_RESPONSE,
18
- MIDSCENE_LANGSMITH_DEBUG,
19
- MIDSCENE_MODEL_NAME,
20
- MIDSCENE_OPENAI_HTTP_PROXY,
21
- MIDSCENE_OPENAI_INIT_CONFIG_JSON,
22
- MIDSCENE_OPENAI_SOCKS_PROXY,
23
- MIDSCENE_USE_ANTHROPIC_SDK,
24
- MIDSCENE_USE_AZURE_OPENAI,
25
- OPENAI_API_KEY,
26
- OPENAI_BASE_URL,
27
- OPENAI_MAX_TOKENS,
28
- OPENAI_USE_AZURE,
29
- getAIConfig,
30
- getAIConfigInBoolean,
31
- getAIConfigInJson,
32
- uiTarsModelVersion,
33
- vlLocateMode as vlLocateMode2
34
- } from "@midscene/shared/env";
35
- import { enableDebug, getDebug as getDebug2 } from "@midscene/shared/logger";
36
- import { assert as assert3 } from "@midscene/shared/utils";
37
- import { ifInBrowser } from "@midscene/shared/utils";
38
- import { HttpsProxyAgent } from "https-proxy-agent";
39
- import { jsonrepair } from "jsonrepair";
40
- import OpenAI, { AzureOpenAI } from "openai";
41
- import { SocksProxyAgent } from "socks-proxy-agent";
42
-
43
- // src/ai-model/common.ts
44
- import { assert } from "@midscene/shared/utils";
45
- import { NodeType } from "@midscene/shared/constants";
46
- import { vlLocateMode } from "@midscene/shared/env";
47
- import { treeToList } from "@midscene/shared/extractor";
48
- import { compositeElementInfoImg } from "@midscene/shared/img";
49
- import { getDebug } from "@midscene/shared/logger";
50
- var AIActionType = /* @__PURE__ */ ((AIActionType2) => {
51
- AIActionType2[AIActionType2["ASSERT"] = 0] = "ASSERT";
52
- AIActionType2[AIActionType2["INSPECT_ELEMENT"] = 1] = "INSPECT_ELEMENT";
53
- AIActionType2[AIActionType2["EXTRACT_DATA"] = 2] = "EXTRACT_DATA";
54
- AIActionType2[AIActionType2["PLAN"] = 3] = "PLAN";
55
- AIActionType2[AIActionType2["DESCRIBE_ELEMENT"] = 4] = "DESCRIBE_ELEMENT";
56
- return AIActionType2;
57
- })(AIActionType || {});
58
- async function callAiFn(msgs, AIActionTypeValue) {
59
- const { content, usage } = await callToGetJSONObject(
60
- msgs,
61
- AIActionTypeValue
62
- );
63
- return { content, usage };
64
- }
65
- var defaultBboxSize = 20;
66
- var debugInspectUtils = getDebug("ai:common");
67
- function fillBboxParam(locate, width, height) {
68
- if (locate.bbox_2d && !locate?.bbox) {
69
- locate.bbox = locate.bbox_2d;
70
- delete locate.bbox_2d;
71
- }
72
- if (locate?.bbox) {
73
- locate.bbox = adaptBbox(locate.bbox, width, height);
74
- }
75
- return locate;
76
- }
77
- function adaptQwenBbox(bbox) {
78
- if (bbox.length < 2) {
79
- const msg = `invalid bbox data for qwen-vl mode: ${JSON.stringify(bbox)} `;
80
- throw new Error(msg);
81
- }
82
- const result = [
83
- Math.round(bbox[0]),
84
- Math.round(bbox[1]),
85
- typeof bbox[2] === "number" ? Math.round(bbox[2]) : Math.round(bbox[0] + defaultBboxSize),
86
- typeof bbox[3] === "number" ? Math.round(bbox[3]) : Math.round(bbox[1] + defaultBboxSize)
87
- ];
88
- return result;
89
- }
90
- function adaptDoubaoBbox(bbox, width, height) {
91
- assert(
92
- width > 0 && height > 0,
93
- "width and height must be greater than 0 in doubao mode"
94
- );
95
- if (typeof bbox === "string") {
96
- assert(
97
- /^(\d+)\s(\d+)\s(\d+)\s(\d+)$/.test(bbox.trim()),
98
- `invalid bbox data string for doubao-vision mode: ${bbox}`
99
- );
100
- const splitted = bbox.split(" ");
101
- if (splitted.length === 4) {
102
- return [
103
- Math.round(Number(splitted[0]) * width / 1e3),
104
- Math.round(Number(splitted[1]) * height / 1e3),
105
- Math.round(Number(splitted[2]) * width / 1e3),
106
- Math.round(Number(splitted[3]) * height / 1e3)
107
- ];
108
- }
109
- throw new Error(`invalid bbox data string for doubao-vision mode: ${bbox}`);
110
- }
111
- if (Array.isArray(bbox) && Array.isArray(bbox[0])) {
112
- bbox = bbox[0];
113
- }
114
- let bboxList = [];
115
- if (Array.isArray(bbox) && typeof bbox[0] === "string") {
116
- bbox.forEach((item) => {
117
- if (typeof item === "string" && item.includes(",")) {
118
- const [x, y] = item.split(",");
119
- bboxList.push(Number(x.trim()), Number(y.trim()));
120
- } else if (typeof item === "string" && item.includes(" ")) {
121
- const [x, y] = item.split(" ");
122
- bboxList.push(Number(x.trim()), Number(y.trim()));
123
- } else {
124
- bboxList.push(Number(item));
125
- }
126
- });
127
- } else {
128
- bboxList = bbox;
129
- }
130
- if (bboxList.length === 4 || bboxList.length === 5) {
131
- return [
132
- Math.round(bboxList[0] * width / 1e3),
133
- Math.round(bboxList[1] * height / 1e3),
134
- Math.round(bboxList[2] * width / 1e3),
135
- Math.round(bboxList[3] * height / 1e3)
136
- ];
137
- }
138
- if (bboxList.length === 6 || bboxList.length === 2 || bboxList.length === 3 || bboxList.length === 7) {
139
- return [
140
- Math.max(
141
- 0,
142
- Math.round(bboxList[0] * width / 1e3) - defaultBboxSize / 2
143
- ),
144
- Math.max(
145
- 0,
146
- Math.round(bboxList[1] * height / 1e3) - defaultBboxSize / 2
147
- ),
148
- Math.min(
149
- width,
150
- Math.round(bboxList[0] * width / 1e3) + defaultBboxSize / 2
151
- ),
152
- Math.min(
153
- height,
154
- Math.round(bboxList[1] * height / 1e3) + defaultBboxSize / 2
155
- )
156
- ];
157
- }
158
- if (bbox.length === 8) {
159
- return [
160
- Math.round(bboxList[0] * width / 1e3),
161
- Math.round(bboxList[1] * height / 1e3),
162
- Math.round(bboxList[4] * width / 1e3),
163
- Math.round(bboxList[5] * height / 1e3)
164
- ];
165
- }
166
- const msg = `invalid bbox data for doubao-vision mode: ${JSON.stringify(bbox)} `;
167
- throw new Error(msg);
168
- }
169
- function adaptBbox(bbox, width, height) {
170
- if (vlLocateMode() === "doubao-vision" || vlLocateMode() === "vlm-ui-tars") {
171
- return adaptDoubaoBbox(bbox, width, height);
172
- }
173
- if (vlLocateMode() === "gemini") {
174
- return adaptGeminiBbox(bbox, width, height);
175
- }
176
- return adaptQwenBbox(bbox);
177
- }
178
- function adaptGeminiBbox(bbox, width, height) {
179
- const left = Math.round(bbox[1] * width / 1e3);
180
- const top = Math.round(bbox[0] * height / 1e3);
181
- const right = Math.round(bbox[3] * width / 1e3);
182
- const bottom = Math.round(bbox[2] * height / 1e3);
183
- return [left, top, right, bottom];
184
- }
185
- function adaptBboxToRect(bbox, width, height, offsetX = 0, offsetY = 0) {
186
- debugInspectUtils("adaptBboxToRect", bbox, width, height, offsetX, offsetY);
187
- const [left, top, right, bottom] = adaptBbox(bbox, width, height);
188
- const rect = {
189
- left: left + offsetX,
190
- top: top + offsetY,
191
- width: right - left,
192
- height: bottom - top
193
- };
194
- debugInspectUtils("adaptBboxToRect, result=", rect);
195
- return rect;
196
- }
197
- var warned = false;
198
- function warnGPT4oSizeLimit(size) {
199
- if (warned)
200
- return;
201
- if (getModelName()?.toLowerCase().includes("gpt-4o")) {
202
- const warningMsg = `GPT-4o has a maximum image input size of 2000x768 or 768x2000, but got ${size.width}x${size.height}. Please set your page to a smaller resolution. Otherwise, the result may be inaccurate.`;
203
- if (Math.max(size.width, size.height) > 2e3 || Math.min(size.width, size.height) > 768) {
204
- console.warn(warningMsg);
205
- warned = true;
206
- }
207
- } else if (size.width > 1800 || size.height > 1800) {
208
- console.warn(
209
- `The image size seems too large (${size.width}x${size.height}). It may lead to more token usage, slower response, and inaccurate result.`
210
- );
211
- warned = true;
212
- }
213
- }
214
- function mergeRects(rects) {
215
- const minLeft = Math.min(...rects.map((r) => r.left));
216
- const minTop = Math.min(...rects.map((r) => r.top));
217
- const maxRight = Math.max(...rects.map((r) => r.left + r.width));
218
- const maxBottom = Math.max(...rects.map((r) => r.top + r.height));
219
- return {
220
- left: minLeft,
221
- top: minTop,
222
- width: maxRight - minLeft,
223
- height: maxBottom - minTop
224
- };
225
- }
226
- function expandSearchArea(rect, screenSize) {
227
- const minEdgeSize = vlLocateMode() === "doubao-vision" ? 500 : 300;
228
- const defaultPadding = 160;
229
- const paddingSizeHorizontal = rect.width < minEdgeSize ? Math.ceil((minEdgeSize - rect.width) / 2) : defaultPadding;
230
- const paddingSizeVertical = rect.height < minEdgeSize ? Math.ceil((minEdgeSize - rect.height) / 2) : defaultPadding;
231
- rect.left = Math.max(0, rect.left - paddingSizeHorizontal);
232
- rect.width = Math.min(
233
- rect.width + paddingSizeHorizontal * 2,
234
- screenSize.width - rect.left
235
- );
236
- rect.top = Math.max(0, rect.top - paddingSizeVertical);
237
- rect.height = Math.min(
238
- rect.height + paddingSizeVertical * 2,
239
- screenSize.height - rect.top
240
- );
241
- return rect;
242
- }
243
- async function markupImageForLLM(screenshotBase64, tree, size) {
244
- const elementsInfo = treeToList(tree);
245
- const elementsPositionInfoWithoutText = elementsInfo.filter(
246
- (elementInfo) => {
247
- if (elementInfo.attributes.nodeType === NodeType.TEXT) {
248
- return false;
249
- }
250
- return true;
251
- }
252
- );
253
- const imagePayload = await compositeElementInfoImg({
254
- inputImgBase64: screenshotBase64,
255
- elementsPositionInfo: elementsPositionInfoWithoutText,
256
- size
257
- });
258
- return imagePayload;
259
- }
260
- function buildYamlFlowFromPlans(plans, sleep) {
261
- const flow = [];
262
- for (const plan2 of plans) {
263
- const type = plan2.type;
264
- const locate = plan2.locate?.prompt;
265
- if (type === "Tap") {
266
- flow.push({
267
- aiTap: locate
268
- });
269
- } else if (type === "Hover") {
270
- flow.push({
271
- aiHover: locate
272
- });
273
- } else if (type === "Input") {
274
- const param = plan2.param;
275
- flow.push({
276
- aiInput: param.value,
277
- locate
278
- });
279
- } else if (type === "KeyboardPress") {
280
- const param = plan2.param;
281
- flow.push({
282
- aiKeyboardPress: param.value,
283
- locate
284
- });
285
- } else if (type === "Scroll") {
286
- const param = plan2.param;
287
- flow.push({
288
- aiScroll: null,
289
- locate,
290
- direction: param.direction,
291
- scrollType: param.scrollType,
292
- distance: param.distance
293
- });
294
- } else if (type === "Sleep") {
295
- const param = plan2.param;
296
- flow.push({
297
- sleep: param.timeMs
298
- });
299
- } else if (type === "AndroidBackButton" || type === "AndroidHomeButton" || type === "AndroidRecentAppsButton" || type === "AndroidLongPress" || type === "AndroidPull") {
300
- } else if (type === "Error" || type === "Assert" || type === "AssertWithoutThrow" || type === "Finished") {
301
- } else {
302
- console.warn(
303
- `Cannot convert action ${type} to yaml flow. This should be a bug of Midscene.`
304
- );
305
- }
306
- }
307
- if (sleep) {
308
- flow.push({
309
- sleep
310
- });
311
- }
312
- return flow;
313
- }
314
-
315
- // src/ai-model/prompt/assertion.ts
316
- import { getPreferredLanguage } from "@midscene/shared/env";
317
- var defaultAssertionPrompt = "You are a senior testing engineer. User will give an assertion and a screenshot of a page. By carefully viewing the screenshot, please tell whether the assertion is truthy.";
318
- var defaultAssertionResponseJsonFormat = `Return in the following JSON format:
319
- {
320
- pass: boolean, // whether the assertion is truthy
321
- thought: string | null, // string, if the result is falsy, give the reason why it is falsy. Otherwise, put null.
322
- }`;
323
- var getUiTarsAssertionResponseJsonFormat = () => `## Output Json String Format
324
- \`\`\`
325
- "{
326
- "pass": <<is a boolean value from the enum [true, false], true means the assertion is truthy>>,
327
- "thought": "<<is a string, give the reason why the assertion is falsy or truthy. Otherwise.>>"
328
- }"
329
- \`\`\`
330
-
331
- ## Rules **MUST** follow
332
- - Make sure to return **only** the JSON, with **no additional** text or explanations.
333
- - Use ${getPreferredLanguage()} in \`thought\` part.
334
- - You **MUST** strictly follow up the **Output Json String Format**.`;
335
- function systemPromptToAssert(model) {
336
- return `${defaultAssertionPrompt}
337
-
338
- ${model.isUITars ? getUiTarsAssertionResponseJsonFormat() : defaultAssertionResponseJsonFormat}`;
339
- }
340
- var assertSchema = {
341
- type: "json_schema",
342
- json_schema: {
343
- name: "assert",
344
- strict: true,
345
- schema: {
346
- type: "object",
347
- properties: {
348
- pass: {
349
- type: "boolean",
350
- description: "Whether the assertion passed or failed"
351
- },
352
- thought: {
353
- type: ["string", "null"],
354
- description: "The thought process behind the assertion"
355
- }
356
- },
357
- required: ["pass", "thought"],
358
- additionalProperties: false
359
- }
360
- }
361
- };
362
-
363
- // src/ai-model/prompt/llm-locator.ts
364
- import { PromptTemplate } from "@langchain/core/prompts";
365
-
366
- // src/ai-model/prompt/common.ts
367
- function bboxDescription(vlMode) {
368
- if (vlMode === "gemini") {
369
- return "2d bounding box as [ymin, xmin, ymax, xmax]";
370
- }
371
- return "2d bounding box as [xmin, ymin, xmax, ymax]";
372
- }
373
-
374
- // src/ai-model/prompt/llm-locator.ts
375
- function systemPromptToLocateElement(vlMode) {
376
- if (vlMode) {
377
- const bboxComment = bboxDescription(vlMode);
378
- return `
379
- ## Role:
380
- You are an expert in software testing.
381
-
382
- ## Objective:
383
- - Identify elements in screenshots and text that match the user's description.
384
- - Give the coordinates of the element that matches the user's description best in the screenshot.
385
- - Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
386
-
387
- ## Output Format:
388
- \`\`\`json
389
- {
390
- "bbox": [number, number, number, number], // ${bboxComment}
391
- "errors"?: string[],
392
- "isOrderSensitive": boolean // Whether the targetElementDescription is order-sensitive (true/false)
393
- }
394
- \`\`\`
395
-
396
- Fields:
397
- * \`bbox\` is the bounding box of the element that matches the user's description best in the screenshot
398
- * \`isOrderSensitive\` is a boolean indicating whether the user's description is order-sensitive (true/false)
399
- * \`errors\` is an optional array of error messages (if any)
400
-
401
- Order-sensitive means the description contains phrases like:
402
- - "the third item in the list"
403
- - "the last button"
404
- - "the first input box"
405
- - "the second row"
406
-
407
- Not order-sensitive means the description is like:
408
- - "confirm button"
409
- - "search box"
410
- - "password input"
411
-
412
- For example, when an element is found and the description is order-sensitive:
413
- \`\`\`json
414
- {
415
- "bbox": [100, 100, 200, 200],
416
- "isOrderSensitive": true,
417
- "errors": []
418
- }
419
- \`\`\`
420
-
421
- When no element is found and the description is not order-sensitive:
422
- \`\`\`json
423
- {
424
- "bbox": [],
425
- "isOrderSensitive": false,
426
- "errors": ["I can see ..., but {some element} is not found"]
427
- }
428
- \`\`\`
429
- `;
430
- }
431
- return `
432
- ## Role:
433
- You are an expert in software page image (2D) and page element text analysis.
434
-
435
- ## Objective:
436
- - Identify elements in screenshots and text that match the user's description.
437
- - Return JSON data containing the selection reason and element ID.
438
- - Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
439
-
440
- ## Skills:
441
- - Image analysis and recognition
442
- - Multilingual text understanding
443
- - Software UI design and testing
444
-
445
- ## Workflow:
446
- 1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
447
- 2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
448
- 3. Found the required number of elements
449
- 4. Return JSON data containing the selection reason and element ID.
450
- 5. Judge whether the user's description is order-sensitive (see below for definition and examples).
451
-
452
- ## Constraints:
453
- - Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
454
- - Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
455
- - Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
456
- - If no elements are found, the "elements" array should be empty.
457
- - The returned data must conform to the specified JSON format.
458
- - The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
459
-
460
- ## Order-Sensitive Definition:
461
- - If the description contains phrases like "the third item in the list", "the last button", "the first input box", "the second row", etc., it is order-sensitive (isOrderSensitive = true).
462
- - If the description is like "confirm button", "search box", "password input", etc., it is not order-sensitive (isOrderSensitive = false).
463
-
464
- ## Output Format:
465
-
466
- Please return the result in JSON format as follows:
467
-
468
- \`\`\`json
469
- {
470
- "elements": [
471
- // If no matching elements are found, return an empty array []
472
- {
473
- "reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
474
- "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
475
- "id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo
476
- }
477
- // More elements...
478
- ],
479
- "isOrderSensitive": true, // or false, depending on the user's description
480
- "errors": [] // Array of strings containing any error messages
481
- }
482
- \`\`\`
483
-
484
- ## Example:
485
- Example 1:
486
- Input Example:
487
- \`\`\`json
488
- // Description: "Shopping cart icon in the upper right corner"
489
- {
490
- "description": "PLACEHOLDER", // Description of the target element
491
- "screenshot": "path/screenshot.png",
492
- "text": '{
493
- "pageSize": {
494
- "width": 400, // Width of the page
495
- "height": 905 // Height of the page
496
- },
497
- "elementInfos": [
498
- {
499
- "id": "1231", // ID of the element
500
- "indexId": "0", // Index of the element,The image is labeled to the left of the element
501
- "attributes": { // Attributes of the element
502
- "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
503
- "src": "https://ap-southeast-3.m",
504
- "class": ".img"
505
- },
506
- "content": "", // Text content of the element
507
- "rect": {
508
- "left": 280, // Distance from the left side of the page
509
- "top": 8, // Distance from the top of the page
510
- "width": 44, // Width of the element
511
- "height": 44 // Height of the element
512
- }
513
- },
514
- {
515
- "id": "66551", // ID of the element
516
- "indexId": "1", // Index of the element,The image is labeled to the left of the element
517
- "attributes": { // Attributes of the element
518
- "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
519
- "src": "data:image/png;base64,iVBORw0KGgoAAAANSU...",
520
- "class": ".icon"
521
- },
522
- "content": "", // Text content of the element
523
- "rect": {
524
- "left": 350, // Distance from the left side of the page
525
- "top": 16, // Distance from the top of the page
526
- "width": 25, // Width of the element
527
- "height": 25 // Height of the element
528
- }
529
- },
530
- ...
531
- {
532
- "id": "12344",
533
- "indexId": "2", // Index of the element,The image is labeled to the left of the element
534
- "attributes": {
535
- "nodeType": "TEXT Node",
536
- "class": ".product-name"
537
- },
538
- "center": [
539
- 288,
540
- 834
541
- ],
542
- "content": "Mango Drink",
543
- "rect": {
544
- "left": 188,
545
- "top": 827,
546
- "width": 199,
547
- "height": 13
548
- }
549
- },
550
- ...
551
- ]
552
- }
553
- '
554
- }
555
- \`\`\`
556
- Output Example:
557
- \`\`\`json
558
- {
559
- "elements": [
560
- {
561
- // Describe the reason for finding this element, replace with actual value in practice
562
- "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
563
- "text": "",
564
- // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
565
- "id": "1231"
566
- }
567
- ],
568
- "isOrderSensitive": true,
569
- "errors": []
570
- }
571
- \`\`\`
572
-
573
- `;
574
- }
575
- var locatorSchema = {
576
- type: "json_schema",
577
- json_schema: {
578
- name: "find_elements",
579
- strict: true,
580
- schema: {
581
- type: "object",
582
- properties: {
583
- elements: {
584
- type: "array",
585
- items: {
586
- type: "object",
587
- properties: {
588
- reason: {
589
- type: "string",
590
- description: "Reason for finding this element"
591
- },
592
- text: {
593
- type: "string",
594
- description: "Text content of the element"
595
- },
596
- id: {
597
- type: "string",
598
- description: "ID of this element"
599
- }
600
- },
601
- required: ["reason", "text", "id"],
602
- additionalProperties: false
603
- },
604
- description: "List of found elements"
605
- },
606
- isOrderSensitive: {
607
- type: "boolean",
608
- description: "Whether the targetElementDescription is order-sensitive (true/false)"
609
- },
610
- errors: {
611
- type: "array",
612
- items: {
613
- type: "string"
614
- },
615
- description: "List of error messages, if any"
616
- }
617
- },
618
- required: ["elements", "isOrderSensitive", "errors"],
619
- additionalProperties: false
620
- }
621
- }
622
- };
623
- var findElementPrompt = new PromptTemplate({
624
- template: `
625
- Here is the item user want to find:
626
- =====================================
627
- {targetElementDescription}
628
- =====================================
629
-
630
- {pageDescription}
631
- `,
632
- inputVariables: ["pageDescription", "targetElementDescription"]
633
- });
634
-
635
- // src/ai-model/prompt/llm-planning.ts
636
- import assert2 from "assert";
637
- import { PromptTemplate as PromptTemplate2 } from "@langchain/core/prompts";
638
- var vlCoTLog = `"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. `;
639
- var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
640
- var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
641
- var commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
642
- "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
643
- var vlLocateParam = (required) => `locate${required ? "" : "?"}: {bbox: [number, number, number, number], prompt: string }`;
644
- var llmLocateParam = (required) => `locate${required ? "" : "?"}: {"id": string, "prompt": string}`;
645
- var descriptionForAction = (action, locatorScheme) => {
646
- const tab = " ";
647
- let locateParam = "";
648
- if (action.location === "required") {
649
- locateParam = locatorScheme;
650
- } else if (action.location === "optional") {
651
- locateParam = `${locatorScheme} | null`;
652
- } else if (action.location === false) {
653
- locateParam = "";
654
- }
655
- const locatorParam = locateParam ? `- ${locateParam}` : "";
656
- if (action.whatToLocate) {
657
- if (!locateParam) {
658
- console.warn(
659
- `whatToLocate is provided for action ${action.name}, but location is not required or optional. The whatToLocate will be ignored.`
660
- );
661
- } else {
662
- locateParam += ` // ${action.whatToLocate}`;
663
- }
664
- }
665
- let paramSchema = "";
666
- if (action.paramSchema) {
667
- paramSchema = `- param: ${action.paramSchema}`;
668
- }
669
- if (action.paramDescription) {
670
- assert2(
671
- paramSchema,
672
- `paramSchema is required when paramDescription is provided for action ${action.name}, but got ${action.paramSchema}`
673
- );
674
- paramSchema += ` // ${action.paramDescription}`;
675
- }
676
- const fields = [paramSchema, locatorParam].filter(Boolean);
677
- return `- ${action.name}, ${action.description}
678
- ${tab}- type: "${action.name}"
679
- ${tab}${fields.join(`
680
- ${tab}`)}
681
- `.trim();
682
- };
683
- var systemTemplateOfVLPlanning = ({
684
- actionSpace,
685
- vlMode
686
- }) => {
687
- const actionNameList = actionSpace.map((action) => action.name).join(", ");
688
- const actionDescriptionList = actionSpace.map(
689
- (action) => descriptionForAction(action, vlLocateParam(action.location === "required"))
690
- );
691
- const actionList = actionDescriptionList.join("\n");
692
- return `
693
- Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
694
-
695
- Restriction:
696
- - Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
697
- - Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.
698
- - Don't repeat actions in the previous logs.
699
- - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.
700
-
701
- Supporting actions:
702
- ${actionList}
703
-
704
- Field description:
705
- * The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
706
-
707
- Return in JSON format:
708
- {
709
- ${vlCoTLog}
710
- ${vlCurrentLog}
711
- ${commonOutputFields}
712
- "action":
713
- {
714
- // one of the supporting actions
715
- } | null,
716
- ,
717
- "sleep"?: number, // The sleep time after the action, in milliseconds.
718
- }
719
-
720
- For example, when the instruction is "click 'Confirm' button, and click 'Yes' in popup" and the log is "I will use action Tap to click 'Confirm' button", by viewing the screenshot and previous logs, you should consider: We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup.
721
-
722
- this and output the JSON:
723
-
724
- {
725
- "what_the_user_wants_to_do_next_by_instruction": "We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup",
726
- "log": "I will use action Tap to click 'Yes' in popup",
727
- "more_actions_needed_by_instruction": false,
728
- "action": {
729
- "type": "Tap",
730
- "locate": {
731
- "bbox": [100, 100, 200, 200],
732
- "prompt": "The 'Yes' button in popup"
733
- }
734
- }
735
- }
736
- `;
737
- };
738
- var systemTemplateOfLLM = ({
739
- actionSpace
740
- }) => {
741
- const actionNameList = actionSpace.map((action) => action.name).join(" / ");
742
- const actionDescriptionList = actionSpace.map(
743
- (action) => descriptionForAction(
744
- action,
745
- llmLocateParam(action.location === "required")
746
- )
747
- );
748
- const actionList = actionDescriptionList.join("\n");
749
- return `
750
- ## Role
751
-
752
- You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.
753
-
754
- ## Objective
755
-
756
- - Decompose the instruction user asked into a series of actions
757
- - Locate the target element if possible
758
- - If the instruction cannot be accomplished, give a further plan.
759
-
760
- ## Workflow
761
-
762
- 1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
763
- 2. Decompose the user's task into a sequence of feasible actions, and place it in the \`actions\` field. There are different types of actions (${actionNameList}). The "About the action" section below will give you more details.
764
- 3. Consider whether the user's instruction will be accomplished after the actions you composed.
765
- - If the instruction is accomplished, set \`more_actions_needed_by_instruction\` to false.
766
- - If more actions are needed, set \`more_actions_needed_by_instruction\` to true. Get ready to hand over to the next talent people like you. Carefully log what have been done in the \`log\` field, he or she will continue the task according to your logs.
767
- 4. If the task is not feasible on this page, set \`error\` field to the reason.
768
-
769
- ## Constraints
770
-
771
- - All the actions you composed MUST be feasible, which means all the action fields can be filled with the page context information you get. If not, don't plan this action.
772
- - Trust the "What have been done" field about the task (if any), don't repeat actions in it.
773
- - Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`\`\`.
774
- - If the screenshot and the instruction are totally irrelevant, set reason in the \`error\` field.
775
-
776
- ## About the \`actions\` field
777
-
778
- The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:
779
-
780
- type LocateParam = {
781
- "id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
782
- "prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
783
- } | null // If it's not on the page, the LocateParam should be null
784
-
785
- ## Supported actions
786
-
787
- Each action has a \`type\` and corresponding \`param\`. To be detailed:
788
- ${actionList}
789
-
790
- `.trim();
791
- };
792
- var outputTemplate = `
793
- ## Output JSON Format:
794
-
795
- The JSON format is as follows:
796
-
797
- {
798
- "actions": [
799
- // ... some actions
800
- ],
801
- ${llmCurrentLog}
802
- ${commonOutputFields}
803
- }
804
-
805
- ## Examples
806
-
807
- ### Example: Decompose a task
808
-
809
- When you received the following information:
810
-
811
- * Instruction: 'Click the language switch button, wait 1s, click "English"'
812
- * Logs: null
813
- * Page Context (screenshot and description) shows: There is a language switch button, and the "English" option is not shown in the screenshot now.
814
-
815
- By viewing the page screenshot and description, you should consider this and output the JSON:
816
-
817
- * The user intent is: tap the switch button, sleep, and tap the 'English' option
818
- * The language switch button is shown in the screenshot, and can be located by the page description or the id marked with a rectangle. So we can plan a Tap action to do this.
819
- * Plan a Sleep action to wait for 1 second to ensure the language options are displayed.
820
- * The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this.
821
- * Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
822
- * The task cannot be accomplished (because the last tapping action is not finished yet), so the \`more_actions_needed_by_instruction\` field is true. The \`error\` field is null.
823
-
824
- {
825
- "actions":[
826
- {
827
- "thought": "Click the language switch button to open the language options.",
828
- "type": "Tap",
829
- "param": null,
830
- "locate": { id: "c81c4e9a33", prompt: "The language switch button" }},
831
- },
832
- {
833
- "thought": "Wait for 1 second to ensure the language options are displayed.",
834
- "type": "Sleep",
835
- "param": { "timeMs": 1000 },
836
- }
837
- ],
838
- "error": null,
839
- "more_actions_needed_by_instruction": true,
840
- "log": "Click the language switch button to open the language options. Wait for 1 second",
841
- }
842
-
843
- ### Example: What NOT to do
844
- Wrong output:
845
- {
846
- "actions":[
847
- {
848
- "thought": "Click the language switch button to open the language options.",
849
- "type": "Tap",
850
- "param": null,
851
- "locate": {
852
- { "id": "c81c4e9a33" }, // WRONG: prompt is missing, this is not a valid LocateParam
853
- }
854
- },
855
- {
856
- "thought": "Click the English option",
857
- "type": "Tap",
858
- "param": null,
859
- "locate": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished
860
- }
861
- ],
862
- "more_actions_needed_by_instruction": false, // WRONG: should be true
863
- "log": "Click the language switch button to open the language options",
864
- }
865
- `;
866
- async function systemPromptToTaskPlanning({
867
- actionSpace,
868
- vlMode
869
- }) {
870
- if (vlMode) {
871
- return systemTemplateOfVLPlanning({ actionSpace, vlMode });
872
- }
873
- return `${systemTemplateOfLLM({ actionSpace })}
874
-
875
- ${outputTemplate}`;
876
- }
877
- var planSchema = {
878
- type: "json_schema",
879
- json_schema: {
880
- name: "action_items",
881
- strict: false,
882
- schema: {
883
- type: "object",
884
- strict: false,
885
- properties: {
886
- actions: {
887
- type: "array",
888
- items: {
889
- type: "object",
890
- strict: false,
891
- properties: {
892
- thought: {
893
- type: "string",
894
- description: "Reasons for generating this task, and why this task is feasible on this page"
895
- },
896
- type: {
897
- type: "string",
898
- description: "Type of action"
899
- },
900
- param: {
901
- anyOf: [
902
- { type: "null" },
903
- {
904
- type: "object",
905
- additionalProperties: true
906
- }
907
- ],
908
- description: "Parameter of the action"
909
- },
910
- locate: {
911
- type: ["object", "null"],
912
- properties: {
913
- id: { type: "string" },
914
- prompt: { type: "string" }
915
- },
916
- required: ["id", "prompt"],
917
- additionalProperties: false,
918
- description: "Location information for the target element"
919
- }
920
- },
921
- required: ["thought", "type", "param", "locate"],
922
- additionalProperties: false
923
- },
924
- description: "List of actions to be performed"
925
- },
926
- more_actions_needed_by_instruction: {
927
- type: "boolean",
928
- description: "If all the actions described in the instruction have been covered by this action and logs, set this field to false."
929
- },
930
- log: {
931
- type: "string",
932
- description: "Log what these planned actions do. Do not include further actions that have not been planned."
933
- },
934
- error: {
935
- type: ["string", "null"],
936
- description: "Error messages about unexpected situations"
937
- }
938
- },
939
- required: [
940
- "actions",
941
- "more_actions_needed_by_instruction",
942
- "log",
943
- "error"
944
- ],
945
- additionalProperties: false
946
- }
947
- }
948
- };
949
- var generateTaskBackgroundContext = (userInstruction, log, userActionContext) => {
950
- if (log) {
951
- return `
952
- Here is the user's instruction:
953
-
954
- <instruction>
955
- <high_priority_knowledge>
956
- ${userActionContext}
957
- </high_priority_knowledge>
958
-
959
- ${userInstruction}
960
- </instruction>
961
-
962
- These are the logs from previous executions, which indicate what was done in the previous actions.
963
- Do NOT repeat these actions.
964
- <previous_logs>
965
- ${log}
966
- </previous_logs>
967
- `;
968
- }
969
- return `
970
- Here is the user's instruction:
971
- <instruction>
972
- <high_priority_knowledge>
973
- ${userActionContext}
974
- </high_priority_knowledge>
975
-
976
- ${userInstruction}
977
- </instruction>
978
- `;
979
- };
980
- var automationUserPrompt = (vlMode) => {
981
- if (vlMode) {
982
- return new PromptTemplate2({
983
- template: "{taskBackgroundContext}",
984
- inputVariables: ["taskBackgroundContext"]
985
- });
986
- }
987
- return new PromptTemplate2({
988
- template: `
989
- pageDescription:
990
- =====================================
991
- {pageDescription}
992
- =====================================
993
-
994
- {taskBackgroundContext}`,
995
- inputVariables: ["pageDescription", "taskBackgroundContext"]
996
- });
997
- };
998
-
999
- // src/ai-model/service-caller/index.ts
1000
- function checkAIConfig() {
1001
- const openaiKey = getAIConfig(OPENAI_API_KEY);
1002
- const azureConfig = getAIConfig(MIDSCENE_USE_AZURE_OPENAI);
1003
- const anthropicKey = getAIConfig(ANTHROPIC_API_KEY);
1004
- const initConfigJson = getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
1005
- if (openaiKey)
1006
- return true;
1007
- if (azureConfig)
1008
- return true;
1009
- if (anthropicKey)
1010
- return true;
1011
- return Boolean(initConfigJson);
1012
- }
1013
- var debugConfigInitialized = false;
1014
- function initDebugConfig() {
1015
- if (debugConfigInitialized)
1016
- return;
1017
- const shouldPrintTiming = getAIConfigInBoolean(MIDSCENE_DEBUG_AI_PROFILE);
1018
- let debugConfig = "";
1019
- if (shouldPrintTiming) {
1020
- console.warn(
1021
- "MIDSCENE_DEBUG_AI_PROFILE is deprecated, use DEBUG=midscene:ai:profile instead"
1022
- );
1023
- debugConfig = "ai:profile";
1024
- }
1025
- const shouldPrintAIResponse = getAIConfigInBoolean(
1026
- MIDSCENE_DEBUG_AI_RESPONSE
1027
- );
1028
- if (shouldPrintAIResponse) {
1029
- console.warn(
1030
- "MIDSCENE_DEBUG_AI_RESPONSE is deprecated, use DEBUG=midscene:ai:response instead"
1031
- );
1032
- if (debugConfig) {
1033
- debugConfig = "ai:*";
1034
- } else {
1035
- debugConfig = "ai:call";
1036
- }
1037
- }
1038
- if (debugConfig) {
1039
- enableDebug(debugConfig);
1040
- }
1041
- debugConfigInitialized = true;
1042
- }
1043
- var defaultModel = "gpt-4o";
1044
- function getModelName() {
1045
- let modelName = defaultModel;
1046
- const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
1047
- if (nameInConfig) {
1048
- modelName = nameInConfig;
1049
- }
1050
- return modelName;
1051
- }
1052
- async function createChatClient({
1053
- AIActionTypeValue
1054
- }) {
1055
- initDebugConfig();
1056
- let openai;
1057
- const extraConfig = getAIConfigInJson(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
1058
- const socksProxy = getAIConfig(MIDSCENE_OPENAI_SOCKS_PROXY);
1059
- const httpProxy = getAIConfig(MIDSCENE_OPENAI_HTTP_PROXY);
1060
- let proxyAgent = void 0;
1061
- const debugProxy = getDebug2("ai:call:proxy");
1062
- if (httpProxy) {
1063
- debugProxy("using http proxy", httpProxy);
1064
- proxyAgent = new HttpsProxyAgent(httpProxy);
1065
- } else if (socksProxy) {
1066
- debugProxy("using socks proxy", socksProxy);
1067
- proxyAgent = new SocksProxyAgent(socksProxy);
1068
- }
1069
- if (getAIConfig(OPENAI_USE_AZURE)) {
1070
- openai = new AzureOpenAI({
1071
- baseURL: getAIConfig(OPENAI_BASE_URL),
1072
- apiKey: getAIConfig(OPENAI_API_KEY),
1073
- httpAgent: proxyAgent,
1074
- ...extraConfig,
1075
- dangerouslyAllowBrowser: true
1076
- });
1077
- } else if (getAIConfig(MIDSCENE_USE_AZURE_OPENAI)) {
1078
- const extraAzureConfig = getAIConfigInJson(
1079
- MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON
1080
- );
1081
- const scope = getAIConfig(MIDSCENE_AZURE_OPENAI_SCOPE);
1082
- let tokenProvider = void 0;
1083
- if (scope) {
1084
- assert3(
1085
- !ifInBrowser,
1086
- "Azure OpenAI is not supported in browser with Midscene."
1087
- );
1088
- const credential = new DefaultAzureCredential();
1089
- assert3(scope, "MIDSCENE_AZURE_OPENAI_SCOPE is required");
1090
- tokenProvider = getBearerTokenProvider(credential, scope);
1091
- openai = new AzureOpenAI({
1092
- azureADTokenProvider: tokenProvider,
1093
- endpoint: getAIConfig(AZURE_OPENAI_ENDPOINT),
1094
- apiVersion: getAIConfig(AZURE_OPENAI_API_VERSION),
1095
- deployment: getAIConfig(AZURE_OPENAI_DEPLOYMENT),
1096
- ...extraConfig,
1097
- ...extraAzureConfig
1098
- });
1099
- } else {
1100
- openai = new AzureOpenAI({
1101
- apiKey: getAIConfig(AZURE_OPENAI_KEY),
1102
- endpoint: getAIConfig(AZURE_OPENAI_ENDPOINT),
1103
- apiVersion: getAIConfig(AZURE_OPENAI_API_VERSION),
1104
- deployment: getAIConfig(AZURE_OPENAI_DEPLOYMENT),
1105
- dangerouslyAllowBrowser: true,
1106
- ...extraConfig,
1107
- ...extraAzureConfig
1108
- });
1109
- }
1110
- } else if (!getAIConfig(MIDSCENE_USE_ANTHROPIC_SDK)) {
1111
- const baseURL = getAIConfig(OPENAI_BASE_URL);
1112
- if (typeof baseURL === "string") {
1113
- if (!/^https?:\/\//.test(baseURL)) {
1114
- throw new Error(
1115
- `OPENAI_BASE_URL must be a valid URL starting with http:// or https://, but got: ${baseURL}
1116
- Please check your config.`
1117
- );
1118
- }
1119
- }
1120
- openai = new OpenAI({
1121
- baseURL: getAIConfig(OPENAI_BASE_URL),
1122
- apiKey: getAIConfig(OPENAI_API_KEY),
1123
- httpAgent: proxyAgent,
1124
- ...extraConfig,
1125
- defaultHeaders: {
1126
- ...extraConfig?.defaultHeaders || {},
1127
- [MIDSCENE_API_TYPE]: AIActionTypeValue.toString()
1128
- },
1129
- dangerouslyAllowBrowser: true
1130
- });
1131
- }
1132
- if (openai && getAIConfigInBoolean(MIDSCENE_LANGSMITH_DEBUG)) {
1133
- if (ifInBrowser) {
1134
- throw new Error("langsmith is not supported in browser");
1135
- }
1136
- console.log("DEBUGGING MODE: langsmith wrapper enabled");
1137
- const { wrapOpenAI } = await import("langsmith/wrappers");
1138
- openai = wrapOpenAI(openai);
1139
- }
1140
- if (typeof openai !== "undefined") {
1141
- return {
1142
- completion: openai.chat.completions,
1143
- style: "openai"
1144
- };
1145
- }
1146
- if (getAIConfig(MIDSCENE_USE_ANTHROPIC_SDK)) {
1147
- const apiKey = getAIConfig(ANTHROPIC_API_KEY);
1148
- assert3(apiKey, "ANTHROPIC_API_KEY is required");
1149
- openai = new Anthropic({
1150
- apiKey,
1151
- httpAgent: proxyAgent,
1152
- dangerouslyAllowBrowser: true
1153
- });
1154
- }
1155
- if (typeof openai !== "undefined" && openai.messages) {
1156
- return {
1157
- completion: openai.messages,
1158
- style: "anthropic"
1159
- };
1160
- }
1161
- throw new Error("Openai SDK or Anthropic SDK is not initialized");
1162
- }
1163
- async function call2(messages, AIActionTypeValue, responseFormat, options) {
1164
- assert3(
1165
- checkAIConfig(),
1166
- "Cannot find config for AI model service. If you are using a self-hosted model without validating the API key, please set `OPENAI_API_KEY` to any non-null value. https://midscenejs.com/model-provider.html"
1167
- );
1168
- const { completion, style } = await createChatClient({
1169
- AIActionTypeValue
1170
- });
1171
- const maxTokens = getAIConfig(OPENAI_MAX_TOKENS);
1172
- const debugCall = getDebug2("ai:call");
1173
- const debugProfileStats = getDebug2("ai:profile:stats");
1174
- const debugProfileDetail = getDebug2("ai:profile:detail");
1175
- const startTime = Date.now();
1176
- const model = getModelName();
1177
- const isStreaming = options?.stream && options?.onChunk;
1178
- let content;
1179
- let accumulated = "";
1180
- let usage;
1181
- let timeCost;
1182
- const commonConfig = {
1183
- temperature: vlLocateMode2() === "vlm-ui-tars" ? 0 : 0.1,
1184
- stream: !!isStreaming,
1185
- max_tokens: typeof maxTokens === "number" ? maxTokens : Number.parseInt(maxTokens || "2048", 10),
1186
- ...vlLocateMode2() === "qwen-vl" ? {
1187
- vl_high_resolution_images: true
1188
- } : {}
1189
- };
1190
- try {
1191
- if (style === "openai") {
1192
- debugCall(
1193
- `sending ${isStreaming ? "streaming " : ""}request to ${model}`
1194
- );
1195
- if (isStreaming) {
1196
- const stream = await completion.create(
1197
- {
1198
- model,
1199
- messages,
1200
- response_format: responseFormat,
1201
- ...commonConfig
1202
- },
1203
- {
1204
- stream: true
1205
- }
1206
- );
1207
- for await (const chunk of stream) {
1208
- const content2 = chunk.choices?.[0]?.delta?.content || "";
1209
- const reasoning_content = chunk.choices?.[0]?.delta?.reasoning_content || "";
1210
- if (chunk.usage) {
1211
- usage = chunk.usage;
1212
- }
1213
- if (content2 || reasoning_content) {
1214
- accumulated += content2;
1215
- const chunkData = {
1216
- content: content2,
1217
- reasoning_content,
1218
- accumulated,
1219
- isComplete: false,
1220
- usage: void 0
1221
- };
1222
- options.onChunk(chunkData);
1223
- }
1224
- if (chunk.choices?.[0]?.finish_reason) {
1225
- timeCost = Date.now() - startTime;
1226
- if (!usage) {
1227
- const estimatedTokens = Math.max(
1228
- 1,
1229
- Math.floor(accumulated.length / 4)
1230
- );
1231
- usage = {
1232
- prompt_tokens: estimatedTokens,
1233
- completion_tokens: estimatedTokens,
1234
- total_tokens: estimatedTokens * 2
1235
- };
1236
- }
1237
- const finalChunk = {
1238
- content: "",
1239
- accumulated,
1240
- reasoning_content: "",
1241
- isComplete: true,
1242
- usage: {
1243
- prompt_tokens: usage.prompt_tokens ?? 0,
1244
- completion_tokens: usage.completion_tokens ?? 0,
1245
- total_tokens: usage.total_tokens ?? 0,
1246
- time_cost: timeCost ?? 0
1247
- }
1248
- };
1249
- options.onChunk(finalChunk);
1250
- break;
1251
- }
1252
- }
1253
- content = accumulated;
1254
- debugProfileStats(
1255
- `streaming model, ${model}, mode, ${vlLocateMode2() || "default"}, cost-ms, ${timeCost}`
1256
- );
1257
- } else {
1258
- const result = await completion.create({
1259
- model,
1260
- messages,
1261
- response_format: responseFormat,
1262
- ...commonConfig
1263
- });
1264
- timeCost = Date.now() - startTime;
1265
- debugProfileStats(
1266
- `model, ${model}, mode, ${vlLocateMode2() || "default"}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ""}, completion-tokens, ${result.usage?.completion_tokens || ""}, total-tokens, ${result.usage?.total_tokens || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
1267
- );
1268
- debugProfileDetail(
1269
- `model usage detail: ${JSON.stringify(result.usage)}`
1270
- );
1271
- assert3(
1272
- result.choices,
1273
- `invalid response from LLM service: ${JSON.stringify(result)}`
1274
- );
1275
- content = result.choices[0].message.content;
1276
- usage = result.usage;
1277
- }
1278
- debugCall(`response: ${content}`);
1279
- assert3(content, "empty content");
1280
- } else if (style === "anthropic") {
1281
- const convertImageContent = (content2) => {
1282
- if (content2.type === "image_url") {
1283
- const imgBase64 = content2.image_url.url;
1284
- assert3(imgBase64, "image_url is required");
1285
- return {
1286
- source: {
1287
- type: "base64",
1288
- media_type: imgBase64.includes("data:image/png;base64,") ? "image/png" : "image/jpeg",
1289
- data: imgBase64.split(",")[1]
1290
- },
1291
- type: "image"
1292
- };
1293
- }
1294
- return content2;
1295
- };
1296
- if (isStreaming) {
1297
- const stream = await completion.create({
1298
- model,
1299
- system: "You are a versatile professional in software UI automation",
1300
- messages: messages.map((m) => ({
1301
- role: "user",
1302
- content: Array.isArray(m.content) ? m.content.map(convertImageContent) : m.content
1303
- })),
1304
- response_format: responseFormat,
1305
- ...commonConfig
1306
- });
1307
- for await (const chunk of stream) {
1308
- const content2 = chunk.delta?.text || "";
1309
- if (content2) {
1310
- accumulated += content2;
1311
- const chunkData = {
1312
- content: content2,
1313
- accumulated,
1314
- reasoning_content: "",
1315
- isComplete: false,
1316
- usage: void 0
1317
- };
1318
- options.onChunk(chunkData);
1319
- }
1320
- if (chunk.type === "message_stop") {
1321
- timeCost = Date.now() - startTime;
1322
- const anthropicUsage = chunk.usage;
1323
- const finalChunk = {
1324
- content: "",
1325
- accumulated,
1326
- reasoning_content: "",
1327
- isComplete: true,
1328
- usage: anthropicUsage ? {
1329
- prompt_tokens: anthropicUsage.input_tokens ?? 0,
1330
- completion_tokens: anthropicUsage.output_tokens ?? 0,
1331
- total_tokens: (anthropicUsage.input_tokens ?? 0) + (anthropicUsage.output_tokens ?? 0),
1332
- time_cost: timeCost ?? 0
1333
- } : void 0
1334
- };
1335
- options.onChunk(finalChunk);
1336
- break;
1337
- }
1338
- }
1339
- content = accumulated;
1340
- } else {
1341
- const result = await completion.create({
1342
- model,
1343
- system: "You are a versatile professional in software UI automation",
1344
- messages: messages.map((m) => ({
1345
- role: "user",
1346
- content: Array.isArray(m.content) ? m.content.map(convertImageContent) : m.content
1347
- })),
1348
- response_format: responseFormat,
1349
- ...commonConfig
1350
- });
1351
- timeCost = Date.now() - startTime;
1352
- content = result.content[0].text;
1353
- usage = result.usage;
1354
- }
1355
- assert3(content, "empty content");
1356
- }
1357
- if (isStreaming && !usage) {
1358
- const estimatedTokens = Math.max(
1359
- 1,
1360
- Math.floor((content || "").length / 4)
1361
- );
1362
- usage = {
1363
- prompt_tokens: estimatedTokens,
1364
- completion_tokens: estimatedTokens,
1365
- total_tokens: estimatedTokens * 2
1366
- };
1367
- }
1368
- return {
1369
- content: content || "",
1370
- usage: usage ? {
1371
- prompt_tokens: usage.prompt_tokens ?? 0,
1372
- completion_tokens: usage.completion_tokens ?? 0,
1373
- total_tokens: usage.total_tokens ?? 0,
1374
- time_cost: timeCost ?? 0
1375
- } : void 0,
1376
- isStreamed: !!isStreaming
1377
- };
1378
- } catch (e) {
1379
- console.error(" call AI error", e);
1380
- const newError = new Error(
1381
- `failed to call ${isStreaming ? "streaming " : ""}AI model service: ${e.message}. Trouble shooting: https://midscenejs.com/model-provider.html`,
1382
- {
1383
- cause: e
1384
- }
1385
- );
1386
- throw newError;
1387
- }
1388
- }
1389
- async function callToGetJSONObject(messages, AIActionTypeValue) {
1390
- let responseFormat;
1391
- const model = getModelName();
1392
- if (model.includes("gpt-4")) {
1393
- switch (AIActionTypeValue) {
1394
- case 0 /* ASSERT */:
1395
- responseFormat = assertSchema;
1396
- break;
1397
- case 1 /* INSPECT_ELEMENT */:
1398
- responseFormat = locatorSchema;
1399
- break;
1400
- case 3 /* PLAN */:
1401
- responseFormat = planSchema;
1402
- break;
1403
- case 2 /* EXTRACT_DATA */:
1404
- case 4 /* DESCRIBE_ELEMENT */:
1405
- responseFormat = { type: "json_object" /* JSON */ };
1406
- break;
1407
- }
1408
- }
1409
- if (model === "gpt-4o-2024-05-13") {
1410
- responseFormat = { type: "json_object" /* JSON */ };
1411
- }
1412
- const response = await call2(messages, AIActionTypeValue, responseFormat);
1413
- assert3(response, "empty response");
1414
- const jsonContent = safeParseJson(response.content);
1415
- return { content: jsonContent, usage: response.usage };
1416
- }
1417
- async function callAiFnWithStringResponse(msgs, AIActionTypeValue) {
1418
- const { content, usage } = await call2(msgs, AIActionTypeValue);
1419
- return { content, usage };
1420
- }
1421
- function extractJSONFromCodeBlock(response) {
1422
- try {
1423
- const jsonMatch = response.match(/^\s*(\{[\s\S]*\})\s*$/);
1424
- if (jsonMatch) {
1425
- return jsonMatch[1];
1426
- }
1427
- const codeBlockMatch = response.match(
1428
- /```(?:json)?\s*(\{[\s\S]*?\})\s*```/
1429
- );
1430
- if (codeBlockMatch) {
1431
- return codeBlockMatch[1];
1432
- }
1433
- const jsonLikeMatch = response.match(/\{[\s\S]*\}/);
1434
- if (jsonLikeMatch) {
1435
- return jsonLikeMatch[0];
1436
- }
1437
- } catch {
1438
- }
1439
- return response;
1440
- }
1441
- function preprocessDoubaoBboxJson(input) {
1442
- if (input.includes("bbox")) {
1443
- while (/\d+\s+\d+/.test(input)) {
1444
- input = input.replace(/(\d+)\s+(\d+)/g, "$1,$2");
1445
- }
1446
- }
1447
- return input;
1448
- }
1449
- function safeParseJson(input) {
1450
- const cleanJsonString = extractJSONFromCodeBlock(input);
1451
- if (cleanJsonString?.match(/\((\d+),(\d+)\)/)) {
1452
- return cleanJsonString.match(/\((\d+),(\d+)\)/)?.slice(1).map(Number);
1453
- }
1454
- try {
1455
- return JSON.parse(cleanJsonString);
1456
- } catch {
1457
- }
1458
- try {
1459
- return JSON.parse(jsonrepair(cleanJsonString));
1460
- } catch (e) {
1461
- }
1462
- if (vlLocateMode2() === "doubao-vision" || vlLocateMode2() === "vlm-ui-tars") {
1463
- const jsonString = preprocessDoubaoBboxJson(cleanJsonString);
1464
- return JSON.parse(jsonrepair(jsonString));
1465
- }
1466
- throw Error(`failed to parse json response: ${input}`);
1467
- }
1468
-
1469
- // src/image/index.ts
1470
- import {
1471
- imageInfo,
1472
- imageInfoOfBase64,
1473
- localImg2Base64,
1474
- httpImg2Base64,
1475
- resizeImg,
1476
- saveBase64Image,
1477
- zoomForGPT4o
1478
- } from "@midscene/shared/img";
1479
-
1480
- // src/ai-model/prompt/util.ts
1481
- import { NodeType as NodeType2 } from "@midscene/shared/constants";
1482
- import { vlLocateMode as vlLocateMode3 } from "@midscene/shared/env";
1483
- import {
1484
- descriptionOfTree,
1485
- generateElementByPosition,
1486
- treeToList as treeToList2
1487
- } from "@midscene/shared/extractor";
1488
- import { assert as assert4 } from "@midscene/shared/utils";
1489
- function describeSize(size) {
1490
- return `${size.width} x ${size.height}`;
1491
- }
1492
- var distanceThreshold = 16;
1493
- function elementByPositionWithElementInfo(treeRoot, position, options) {
1494
- const requireStrictDistance = options?.requireStrictDistance ?? true;
1495
- const filterPositionElements = options?.filterPositionElements ?? false;
1496
- assert4(typeof position !== "undefined", "position is required for query");
1497
- const matchingElements = [];
1498
- function dfs(node) {
1499
- if (node?.node) {
1500
- const item = node.node;
1501
- if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
1502
- if (!(filterPositionElements && item.attributes?.nodeType === NodeType2.POSITION) && item.isVisible) {
1503
- matchingElements.push(item);
1504
- }
1505
- }
1506
- }
1507
- for (const child of node.children) {
1508
- dfs(child);
1509
- }
1510
- }
1511
- dfs(treeRoot);
1512
- if (matchingElements.length === 0) {
1513
- return void 0;
1514
- }
1515
- const element = matchingElements.reduce((smallest, current) => {
1516
- const smallestArea = smallest.rect.width * smallest.rect.height;
1517
- const currentArea = current.rect.width * current.rect.height;
1518
- return currentArea < smallestArea ? current : smallest;
1519
- });
1520
- const distanceToCenter = distance(
1521
- { x: element.center[0], y: element.center[1] },
1522
- position
1523
- );
1524
- if (requireStrictDistance) {
1525
- return distanceToCenter <= distanceThreshold ? element : void 0;
1526
- }
1527
- return element;
1528
- }
1529
- function distance(point1, point2) {
1530
- return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
1531
- }
1532
- async function describeUserPage(context, opt) {
1533
- const { screenshotBase64 } = context;
1534
- let width;
1535
- let height;
1536
- if (context.size) {
1537
- ({ width, height } = context.size);
1538
- } else {
1539
- const imgSize = await imageInfoOfBase64(screenshotBase64);
1540
- ({ width, height } = imgSize);
1541
- }
1542
- const treeRoot = context.tree;
1543
- const idElementMap = {};
1544
- const flatElements = treeToList2(treeRoot);
1545
- if (opt?.domIncluded === true && flatElements.length >= 5e3) {
1546
- console.warn(
1547
- 'The number of elements is too large and may make the prompt too long. Please use domIncluded: "visible-only" to reduce the number of elements.'
1548
- );
1549
- }
1550
- flatElements.forEach((element) => {
1551
- idElementMap[element.id] = element;
1552
- if (typeof element.indexId !== "undefined") {
1553
- idElementMap[`${element.indexId}`] = element;
1554
- }
1555
- });
1556
- let pageDescription = "";
1557
- const visibleOnly = opt?.visibleOnly ?? opt?.domIncluded === "visible-only";
1558
- if (opt?.domIncluded || !vlLocateMode3()) {
1559
- const contentTree = await descriptionOfTree(
1560
- treeRoot,
1561
- opt?.truncateTextLength,
1562
- opt?.filterNonTextContent,
1563
- visibleOnly
1564
- );
1565
- const sizeDescription = describeSize({ width, height });
1566
- pageDescription = `The size of the page: ${sizeDescription}
1567
- The page elements tree:
1568
- ${contentTree}`;
1569
- }
1570
- return {
1571
- description: pageDescription,
1572
- elementById(idOrIndexId) {
1573
- assert4(typeof idOrIndexId !== "undefined", "id is required for query");
1574
- const item = idElementMap[`${idOrIndexId}`];
1575
- return item;
1576
- },
1577
- elementByPosition(position, size) {
1578
- return elementByPositionWithElementInfo(treeRoot, position);
1579
- },
1580
- insertElementByPosition(position) {
1581
- const element = generateElementByPosition(position);
1582
- treeRoot.children.push({
1583
- node: element,
1584
- children: []
1585
- });
1586
- flatElements.push(element);
1587
- idElementMap[element.id] = element;
1588
- return element;
1589
- },
1590
- size: { width, height }
1591
- };
1592
- }
1593
-
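`elementByPositionWithElementInfo` resolves a point to an element by collecting every visible node whose rect contains the point, keeping the one with the smallest area, and then (when strict distance is required) rejecting it if its center is more than 16px away. Below is a self-contained sketch of that selection rule with simplified, hypothetical types (a flat list instead of a tree).

```ts
// Sketch of the point-to-element selection rule used above: take the
// smallest containing rect, then enforce a center-distance threshold.
interface Rect { left: number; top: number; width: number; height: number }
interface Elem { id: string; rect: Rect }

const DISTANCE_THRESHOLD = 16; // px, same constant as above

const center = (r: Rect) => ({ x: r.left + r.width / 2, y: r.top + r.height / 2 });
const distance = (a: { x: number; y: number }, b: { x: number; y: number }) =>
  Math.sqrt((a.x - b.x) ** 2 + (a.y - b.y) ** 2);

function elementAt(elements: Elem[], p: { x: number; y: number }): Elem | undefined {
  const containing = elements.filter(
    (e) =>
      e.rect.left <= p.x && p.x <= e.rect.left + e.rect.width &&
      e.rect.top <= p.y && p.y <= e.rect.top + e.rect.height,
  );
  if (containing.length === 0) return undefined;
  const smallest = containing.reduce((a, b) =>
    b.rect.width * b.rect.height < a.rect.width * a.rect.height ? b : a,
  );
  return distance(center(smallest.rect), p) <= DISTANCE_THRESHOLD ? smallest : undefined;
}

// Hypothetical elements for illustration:
const els: Elem[] = [
  { id: "panel", rect: { left: 0, top: 0, width: 400, height: 300 } },
  { id: "button", rect: { left: 180, top: 140, width: 40, height: 20 } },
];
console.log(elementAt(els, { x: 200, y: 150 })?.id); // "button"
```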
1594
- // src/ai-model/prompt/playwright-generator.ts
1595
- import { PLAYWRIGHT_EXAMPLE_CODE } from "@midscene/shared/constants";
1596
-
1597
- // src/ai-model/prompt/yaml-generator.ts
1598
- import { YAML_EXAMPLE_CODE } from "@midscene/shared/constants";
1599
- var getScreenshotsForLLM = (events, maxScreenshots = 1) => {
1600
- const eventsWithScreenshots = events.filter(
1601
- (event) => event.screenshotBefore || event.screenshotAfter || event.screenshotWithBox
1602
- );
1603
- const sortedEvents = [...eventsWithScreenshots].sort((a, b) => {
1604
- if (a.type === "navigation" && b.type !== "navigation")
1605
- return -1;
1606
- if (a.type !== "navigation" && b.type === "navigation")
1607
- return 1;
1608
- if (a.type === "click" && b.type !== "click")
1609
- return -1;
1610
- if (a.type !== "click" && b.type === "click")
1611
- return 1;
1612
- return 0;
1613
- });
1614
- const screenshots = [];
1615
- for (const event of sortedEvents) {
1616
- const screenshot = event.screenshotWithBox || event.screenshotAfter || event.screenshotBefore;
1617
- if (screenshot && !screenshots.includes(screenshot)) {
1618
- screenshots.push(screenshot);
1619
- if (screenshots.length >= maxScreenshots)
1620
- break;
1621
- }
1622
- }
1623
- return screenshots;
1624
- };
1625
- var filterEventsByType = (events) => {
1626
- return {
1627
- navigationEvents: events.filter((event) => event.type === "navigation"),
1628
- clickEvents: events.filter((event) => event.type === "click"),
1629
- inputEvents: events.filter((event) => event.type === "input"),
1630
- scrollEvents: events.filter((event) => event.type === "scroll")
1631
- };
1632
- };
1633
- var createEventCounts = (filteredEvents, totalEvents) => {
1634
- return {
1635
- navigation: filteredEvents.navigationEvents.length,
1636
- click: filteredEvents.clickEvents.length,
1637
- input: filteredEvents.inputEvents.length,
1638
- scroll: filteredEvents.scrollEvents.length,
1639
- total: totalEvents
1640
- };
1641
- };
1642
- var extractInputDescriptions = (inputEvents) => {
1643
- return inputEvents.map((event) => ({
1644
- description: event.elementDescription || "",
1645
- value: event.value || ""
1646
- })).filter((item) => item.description && item.value);
1647
- };
1648
- var processEventsForLLM = (events) => {
1649
- return events.map((event) => ({
1650
- type: event.type,
1651
- timestamp: event.timestamp,
1652
- url: event.url,
1653
- title: event.title,
1654
- elementDescription: event.elementDescription,
1655
- value: event.value,
1656
- pageInfo: event.pageInfo,
1657
- elementRect: event.elementRect
1658
- }));
1659
- };
1660
- var prepareEventSummary = (events, options = {}) => {
1661
- const filteredEvents = filterEventsByType(events);
1662
- const eventCounts = createEventCounts(filteredEvents, events.length);
1663
- const startUrl = filteredEvents.navigationEvents.length > 0 ? filteredEvents.navigationEvents[0].url || "" : "";
1664
- const clickDescriptions = filteredEvents.clickEvents.map((event) => event.elementDescription).filter((desc) => Boolean(desc)).slice(0, 10);
1665
- const inputDescriptions = extractInputDescriptions(
1666
- filteredEvents.inputEvents
1667
- ).slice(0, 10);
1668
- const urls = filteredEvents.navigationEvents.map((e) => e.url).filter((url) => Boolean(url)).slice(0, 5);
1669
- const processedEvents = processEventsForLLM(events);
1670
- return {
1671
- testName: options.testName || "Automated test from recorded events",
1672
- startUrl,
1673
- eventCounts,
1674
- urls,
1675
- clickDescriptions,
1676
- inputDescriptions,
1677
- events: processedEvents
1678
- };
1679
- };
1680
- var createMessageContent = (promptText, screenshots = [], includeScreenshots = true) => {
1681
- const messageContent = [
1682
- {
1683
- type: "text",
1684
- text: promptText
1685
- }
1686
- ];
1687
- if (includeScreenshots && screenshots.length > 0) {
1688
- messageContent.unshift({
1689
- type: "text",
1690
- text: "Here are screenshots from the recording session to help you understand the context:"
1691
- });
1692
- screenshots.forEach((screenshot) => {
1693
- messageContent.push({
1694
- type: "image_url",
1695
- image_url: {
1696
- url: screenshot
1697
- }
1698
- });
1699
- });
1700
- }
1701
- return messageContent;
1702
- };
1703
- var validateEvents = (events) => {
1704
- if (!events.length) {
1705
- throw new Error("No events provided for test generation");
1706
- }
1707
- };
1708
- var generateYamlTest = async (events, options = {}) => {
1709
- try {
1710
- validateEvents(events);
1711
- const summary = prepareEventSummary(events, {
1712
- testName: options.testName,
1713
- maxScreenshots: options.maxScreenshots || 3
1714
- });
1715
- const yamlSummary = {
1716
- ...summary,
1717
- includeTimestamps: options.includeTimestamps || false
1718
- };
1719
- const screenshots = getScreenshotsForLLM(
1720
- events,
1721
- options.maxScreenshots || 3
1722
- );
1723
- const prompt = [
1724
- {
1725
- role: "system",
1726
- content: `You are an expert in Midscene.js YAML test generation. Generate clean, accurate YAML following these rules: ${YAML_EXAMPLE_CODE}`
1727
- },
1728
- {
1729
- role: "user",
1730
- content: `Generate YAML test for Midscene.js automation from recorded browser events.
1731
-
1732
- Event Summary:
1733
- ${JSON.stringify(yamlSummary, null, 2)}
1734
-
1735
- Convert events:
1736
- - navigation → target.url
1737
- - click → aiTap with element description
1738
- - input → aiInput with value and locate
1739
- - scroll → aiScroll with appropriate direction
1740
- - Add aiAssert for important state changes
1741
-
1742
- Respond with YAML only, no explanations.`
1743
- }
1744
- ];
1745
- if (screenshots.length > 0) {
1746
- prompt.push({
1747
- role: "user",
1748
- content: "Here are screenshots from the recording session to help you understand the context:"
1749
- });
1750
- prompt.push({
1751
- role: "user",
1752
- content: screenshots.map((screenshot) => ({
1753
- type: "image_url",
1754
- image_url: {
1755
- url: screenshot
1756
- }
1757
- }))
1758
- });
1759
- }
1760
- const response = await call2(prompt, 2 /* EXTRACT_DATA */);
1761
- if (response?.content && typeof response.content === "string") {
1762
- return response.content;
1763
- }
1764
- throw new Error("Failed to generate YAML test configuration");
1765
- } catch (error) {
1766
- throw new Error(`Failed to generate YAML test: ${error}`);
1767
- }
1768
- };
1769
- var generateYamlTestStream = async (events, options = {}) => {
1770
- try {
1771
- validateEvents(events);
1772
- const summary = prepareEventSummary(events, {
1773
- testName: options.testName,
1774
- maxScreenshots: options.maxScreenshots || 3
1775
- });
1776
- const yamlSummary = {
1777
- ...summary,
1778
- includeTimestamps: options.includeTimestamps || false
1779
- };
1780
- const screenshots = getScreenshotsForLLM(
1781
- events,
1782
- options.maxScreenshots || 3
1783
- );
1784
- const prompt = [
1785
- {
1786
- role: "system",
1787
- content: `You are an expert in Midscene.js YAML test generation. Generate clean, accurate YAML following these rules: ${YAML_EXAMPLE_CODE}`
1788
- },
1789
- {
1790
- role: "user",
1791
- content: `Generate YAML test for Midscene.js automation from recorded browser events.
1792
-
1793
- Event Summary:
1794
- ${JSON.stringify(yamlSummary, null, 2)}
1795
-
1796
- Convert events:
1797
- - navigation → target.url
1798
- - click → aiTap with element description
1799
- - input → aiInput with value and locate
1800
- - scroll → aiScroll with appropriate direction
1801
- - Add aiAssert for important state changes
1802
-
1803
- Respond with YAML only, no explanations.`
1804
- }
1805
- ];
1806
- if (screenshots.length > 0) {
1807
- prompt.push({
1808
- role: "user",
1809
- content: "Here are screenshots from the recording session to help you understand the context:"
1810
- });
1811
- prompt.push({
1812
- role: "user",
1813
- content: screenshots.map((screenshot) => ({
1814
- type: "image_url",
1815
- image_url: {
1816
- url: screenshot
1817
- }
1818
- }))
1819
- });
1820
- }
1821
- if (options.stream && options.onChunk) {
1822
- return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
1823
- stream: true,
1824
- onChunk: options.onChunk
1825
- });
1826
- } else {
1827
- const response = await call2(prompt, 2 /* EXTRACT_DATA */);
1828
- if (response?.content && typeof response.content === "string") {
1829
- return {
1830
- content: response.content,
1831
- usage: response.usage,
1832
- isStreamed: false
1833
- };
1834
- }
1835
- throw new Error("Failed to generate YAML test configuration");
1836
- }
1837
- } catch (error) {
1838
- throw new Error(`Failed to generate YAML test: ${error}`);
1839
- }
1840
- };
1841
-
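Both test generators send the model a compact summary of the recording rather than the raw events. The sketch below uses a hypothetical minimal event type to show the kind of summary `prepareEventSummary` assembles: per-type counts, the starting URL, and the first few click descriptions.

```ts
// Sketch: summarizing recorded events before sending them to the model,
// in the spirit of filterEventsByType / prepareEventSummary above.
type EventType = "navigation" | "click" | "input" | "scroll";
interface RecordedEvent {
  type: EventType;
  url?: string;
  elementDescription?: string;
  value?: string;
}

function summarize(events: RecordedEvent[]) {
  const byType = (t: EventType) => events.filter((e) => e.type === t);
  const navigation = byType("navigation");
  return {
    eventCounts: {
      navigation: navigation.length,
      click: byType("click").length,
      input: byType("input").length,
      scroll: byType("scroll").length,
      total: events.length,
    },
    startUrl: navigation[0]?.url ?? "",
    clickDescriptions: byType("click")
      .map((e) => e.elementDescription)
      .filter((d): d is string => Boolean(d))
      .slice(0, 10),
  };
}

// Hypothetical recording for illustration:
console.log(
  summarize([
    { type: "navigation", url: "https://example.com" },
    { type: "click", elementDescription: "the login button" },
    { type: "input", elementDescription: "username field", value: "alice" },
  ]),
);
```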
1842
- // src/ai-model/prompt/playwright-generator.ts
1843
- var generatePlaywrightTest = async (events, options = {}) => {
1844
- validateEvents(events);
1845
- const summary = prepareEventSummary(events, {
1846
- testName: options.testName,
1847
- maxScreenshots: options.maxScreenshots || 3
1848
- });
1849
- const playwrightSummary = {
1850
- ...summary,
1851
- waitForNetworkIdle: options.waitForNetworkIdle !== false,
1852
- waitForNetworkIdleTimeout: options.waitForNetworkIdleTimeout || 2e3,
1853
- viewportSize: options.viewportSize || { width: 1280, height: 800 }
1854
- };
1855
- const screenshots = getScreenshotsForLLM(events, options.maxScreenshots || 3);
1856
- const promptText = `Generate a Playwright test using @midscene/web/playwright that reproduces this recorded browser session. The test should be based on the following events and follow the structure of the example provided. Make the test descriptive with appropriate assertions and validations.
1857
-
1858
- Event Summary:
1859
- ${JSON.stringify(playwrightSummary, null, 2)}
1860
-
1861
- Generated code should:
1862
- 1. Import required dependencies
1863
- 2. Set up the test with proper configuration
1864
- 3. Include a beforeEach hook to navigate to the starting URL
1865
- 4. Implement a test that uses Midscene AI methods (aiTap, aiInput, aiAssert, etc.)
1866
- 5. Include appropriate assertions and validations
1867
- 6. Follow best practices for Playwright tests
1868
- 7. Be ready to execute without further modification
1869
-
1870
- Respond ONLY with the complete Playwright test code, no explanations.`;
1871
- const messageContent = createMessageContent(
1872
- promptText,
1873
- screenshots,
1874
- options.includeScreenshots !== false
1875
- );
1876
- const systemPrompt = `You are an expert test automation engineer specializing in Playwright and Midscene.
1877
- Your task is to generate a complete, executable Playwright test using @midscene/web/playwright that reproduces a recorded browser session.
1878
-
1879
- ${PLAYWRIGHT_EXAMPLE_CODE}`;
1880
- const prompt = [
1881
- {
1882
- role: "system",
1883
- content: systemPrompt
1884
- },
1885
- {
1886
- role: "user",
1887
- content: messageContent
1888
- }
1889
- ];
1890
- const response = await call2(prompt, 2 /* EXTRACT_DATA */);
1891
- if (response?.content && typeof response.content === "string") {
1892
- return response.content;
1893
- }
1894
- throw new Error("Failed to generate Playwright test code");
1895
- };
1896
- var generatePlaywrightTestStream = async (events, options = {}) => {
1897
- validateEvents(events);
1898
- const summary = prepareEventSummary(events, {
1899
- testName: options.testName,
1900
- maxScreenshots: options.maxScreenshots || 3
1901
- });
1902
- const playwrightSummary = {
1903
- ...summary,
1904
- waitForNetworkIdle: options.waitForNetworkIdle !== false,
1905
- waitForNetworkIdleTimeout: options.waitForNetworkIdleTimeout || 2e3,
1906
- viewportSize: options.viewportSize || { width: 1280, height: 800 }
1907
- };
1908
- const screenshots = getScreenshotsForLLM(events, options.maxScreenshots || 3);
1909
- const promptText = `Generate a Playwright test using @midscene/web/playwright that reproduces this recorded browser session. The test should be based on the following events and follow the structure of the example provided. Make the test descriptive with appropriate assertions and validations.
1910
-
1911
- Event Summary:
1912
- ${JSON.stringify(playwrightSummary, null, 2)}
1913
-
1914
- Generated code should:
1915
- 1. Import required dependencies
1916
- 2. Set up the test with proper configuration
1917
- 3. Include a beforeEach hook to navigate to the starting URL
1918
- 4. Implement a test that uses Midscene AI methods (aiTap, aiInput, aiAssert, etc.)
1919
- 5. Include appropriate assertions and validations
1920
- 6. Follow best practices for Playwright tests
1921
- 7. Be ready to execute without further modification
1922
- 8. Do not wrap the test code in a markdown code block
1923
-
1924
- Respond ONLY with the complete Playwright test code, no explanations.`;
1925
- const messageContent = createMessageContent(
1926
- promptText,
1927
- screenshots,
1928
- options.includeScreenshots !== false
1929
- );
1930
- const systemPrompt = `You are an expert test automation engineer specializing in Playwright and Midscene.
1931
- Your task is to generate a complete, executable Playwright test using @midscene/web/playwright that reproduces a recorded browser session.
1932
-
1933
- ${PLAYWRIGHT_EXAMPLE_CODE}`;
1934
- const prompt = [
1935
- {
1936
- role: "system",
1937
- content: systemPrompt
1938
- },
1939
- {
1940
- role: "user",
1941
- content: messageContent
1942
- }
1943
- ];
1944
- if (options.stream && options.onChunk) {
1945
- return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
1946
- stream: true,
1947
- onChunk: options.onChunk
1948
- });
1949
- } else {
1950
- const response = await call2(prompt, 2 /* EXTRACT_DATA */);
1951
- if (response?.content && typeof response.content === "string") {
1952
- return {
1953
- content: response.content,
1954
- usage: response.usage,
1955
- isStreamed: false
1956
- };
1957
- }
1958
- throw new Error("Failed to generate Playwright test code");
1959
- }
1960
- };
1961
-
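`generateYamlTestStream` and `generatePlaywrightTestStream` share the same branching: when both `stream` and `onChunk` are set, the model call streams and forwards chunks to the callback; otherwise the single response is wrapped with `isStreamed: false`. A package-agnostic sketch of that consumer-facing pattern, with a hypothetical `callModel` stand-in, is shown here.

```ts
// Sketch: the stream-vs-single-shot branching used by both *Stream generators.
// `callModel` is a hypothetical stand-in for the internal AI call.
interface GenOptions {
  stream?: boolean;
  onChunk?: (chunk: string) => void;
}
interface GenResult {
  content: string;
  isStreamed: boolean;
}

async function callModel(opts?: { onChunk?: (c: string) => void }): Promise<string> {
  // Pretend the model answers in two chunks.
  const chunks = ["test('generated', async () => {", " /* ... */ });"];
  for (const c of chunks) opts?.onChunk?.(c);
  return chunks.join("");
}

async function generate(options: GenOptions = {}): Promise<GenResult> {
  if (options.stream && options.onChunk) {
    const content = await callModel({ onChunk: options.onChunk });
    return { content, isStreamed: true };
  }
  const content = await callModel();
  return { content, isStreamed: false };
}

// Usage: observe chunks as they arrive, then inspect the final result.
generate({ stream: true, onChunk: (c) => console.log("chunk:", c) }).then((r) =>
  console.log("streamed:", r.isStreamed),
);
```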
1962
- // src/ai-model/inspect.ts
1963
- import {
1964
- MIDSCENE_USE_QWEN_VL,
1965
- MIDSCENE_USE_VLM_UI_TARS,
1966
- getAIConfigInBoolean as getAIConfigInBoolean2,
1967
- vlLocateMode as vlLocateMode4
1968
- } from "@midscene/shared/env";
1969
- import {
1970
- cropByRect,
1971
- paddingToMatchBlockByBase64,
1972
- preProcessImageUrl
1973
- } from "@midscene/shared/img";
1974
- import { getDebug as getDebug3 } from "@midscene/shared/logger";
1975
- import { assert as assert5 } from "@midscene/shared/utils";
1976
-
1977
- // src/ai-model/prompt/extraction.ts
1978
- import { PromptTemplate as PromptTemplate3 } from "@langchain/core/prompts";
1979
- function systemPromptToExtract() {
1980
- return `
1981
- You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
1982
-
1983
- The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to extract the data according to the <DATA_DEMAND>.
1984
-
1985
- If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
1986
-
1987
- If the user provides multiple reference images, please carefully compare the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
1988
-
1989
- If the user requests reasons to be provided, please provide the thought field in the response, less than 100 words.
1990
-
1991
- Return in the following JSON format:
1992
- {
1993
- thought: string, // the thought process of the extraction, less than 100 words, not required by default.
1994
- data: any, // the extracted data. Make sure both the value and schema meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
1995
- errors: [], // string[], error message if any
1996
- }
1997
-
1998
- # Example 1
1999
- For example, if the DATA_DEMAND is:
2000
-
2001
- <DATA_DEMAND>
2002
- {
2003
- "name": "name shows on the left panel, string",
2004
- "age": "age shows on the right panel, number",
2005
- "isAdmin": "if the user is admin, boolean"
2006
- }
2007
- </DATA_DEMAND>
2008
-
2009
- By viewing the screenshot and page contents, you can extract the following data:
2010
-
2011
- {
2012
- data: {
2013
- name: "John",
2014
- age: 30,
2015
- isAdmin: true
2016
- },
2017
- }
2018
-
2019
- # Example 2
2020
- If the DATA_DEMAND is:
2021
-
2022
- <DATA_DEMAND>
2023
- the todo items list, string[]
2024
- </DATA_DEMAND>
2025
-
2026
- By viewing the screenshot and page contents, you can extract the following data:
2027
-
2028
- {
2029
- data: ["todo 1", "todo 2", "todo 3"],
2030
- }
2031
-
2032
- # Example 3
2033
- If the DATA_DEMAND is:
2034
-
2035
- <DATA_DEMAND>
2036
- the page title, string
2037
- </DATA_DEMAND>
2038
-
2039
- By viewing the screenshot and page contents, you can extract the following data:
2040
-
2041
- {
2042
- data: "todo list",
2043
- }
2044
-
2045
- # Example 4
2046
- If the DATA_DEMAND is:
2047
-
2048
- <DATA_DEMAND>
2049
- {
2050
- "result": "Boolean, is it currently the SMS page?"
2051
- }
2052
- </DATA_DEMAND>
2053
-
2054
- By viewing the screenshot and page contents, you can extract the following data:
2055
-
2056
- {
2057
- data: { result: true },
2058
- }
2059
- `;
2060
- }
2061
- var extractDataQueryPrompt = async (pageDescription, dataQuery) => {
2062
- let dataQueryText = "";
2063
- if (typeof dataQuery === "string") {
2064
- dataQueryText = dataQuery;
2065
- } else {
2066
- dataQueryText = JSON.stringify(dataQuery, null, 2);
2067
- }
2068
- const extractDataPrompt = new PromptTemplate3({
2069
- template: `
2070
- <PageDescription>
2071
- {pageDescription}
2072
- </PageDescription>
2073
-
2074
- <DATA_DEMAND>
2075
- {dataQuery}
2076
- </DATA_DEMAND>
2077
- `,
2078
- inputVariables: ["pageDescription", "dataQuery"]
2079
- });
2080
- return await extractDataPrompt.format({
2081
- pageDescription,
2082
- dataQuery: dataQueryText
2083
- });
2084
- };
2085
-
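`extractDataQueryPrompt` builds its user prompt with LangChain's `PromptTemplate`, which substitutes `{placeholders}` when `format()` is awaited. A small standalone sketch of the same mechanism follows; the template mirrors the one above, while the sample inputs are made up.

```ts
// Sketch: building a prompt with @langchain/core's PromptTemplate,
// as extractDataQueryPrompt does above.
import { PromptTemplate } from "@langchain/core/prompts";

const demandPrompt = new PromptTemplate({
  template: [
    "<PageDescription>",
    "{pageDescription}",
    "</PageDescription>",
    "",
    "<DATA_DEMAND>",
    "{dataQuery}",
    "</DATA_DEMAND>",
  ].join("\n"),
  inputVariables: ["pageDescription", "dataQuery"],
});

async function buildExtractionPrompt(pageDescription: string, dataQuery: string | object) {
  const dataQueryText =
    typeof dataQuery === "string" ? dataQuery : JSON.stringify(dataQuery, null, 2);
  return demandPrompt.format({ pageDescription, dataQuery: dataQueryText });
}

// Hypothetical inputs for illustration:
buildExtractionPrompt("The size of the page: 1280 x 800", {
  title: "the page title, string",
}).then(console.log);
```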
2086
- // src/ai-model/prompt/llm-section-locator.ts
2087
- import { PromptTemplate as PromptTemplate4 } from "@langchain/core/prompts";
2088
- function systemPromptToLocateSection(vlMode) {
2089
- return `
2090
- Your goal is to find one section containing the target element in the screenshot and put it in the \`bbox\` field. If the user describes the target element with some reference elements, you should also find the section containing the reference elements and put it in the \`references_bbox\` field.
2091
-
2092
- Usually, it should be an area of approximately 300x300px or less. A larger size is allowed if there are many elements to cover.
2093
-
2094
- return in this JSON format:
2095
- \`\`\`json
2096
- {
2097
- "bbox": [number, number, number, number],
2098
- "references_bbox"?: [
2099
- [number, number, number, number],
2100
- [number, number, number, number],
2101
- ...
2102
- ],
2103
- "error"?: string
2104
- }
2105
- \`\`\`
2106
-
2107
- All the numbers in the \`bbox\` and \`references_bbox\` fields represent ${bboxDescription(vlMode)}.
2108
-
2109
- For example, if the user describes the target element as "the delete button on the second row with title 'Peter'", you should put the bounding box of the delete button in the \`bbox\` field, and the bounding box of the second row in the \`references_bbox\` field.
2110
-
2111
- The return value should look like this:
2112
- \`\`\`json
2113
- {
2114
- "bbox": [100, 100, 200, 200],
2115
- "references_bbox": [[100, 100, 200, 200]]
2116
- }
2117
- \`\`\`
2118
- `;
2119
- }
2120
- var sectionLocatorInstruction = new PromptTemplate4({
2121
- template: `Here is the target element the user is interested in:
2122
- <targetDescription>
2123
- {sectionDescription}
2124
- </targetDescription>
2125
- `,
2126
- inputVariables: ["sectionDescription"]
2127
- });
2128
-
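`AiLocateSection` below merges the returned `bbox` with any `references_bbox` entries into a single rect and expands it before cropping the screenshot. A simplified, self-contained sketch of that merge-and-expand step follows; the padding size and clamping behavior here are assumptions for illustration, not the library's exact values.

```ts
// Sketch: merging the target bbox with reference bboxes into one search
// rect and expanding it, in the spirit of AiLocateSection below.
interface Rect { left: number; top: number; width: number; height: number }
type Bbox = [number, number, number, number]; // [x1, y1, x2, y2]

const bboxToRect = ([x1, y1, x2, y2]: Bbox): Rect => ({
  left: x1,
  top: y1,
  width: x2 - x1,
  height: y2 - y1,
});

function mergeRects(rects: Rect[]): Rect {
  const left = Math.min(...rects.map((r) => r.left));
  const top = Math.min(...rects.map((r) => r.top));
  const right = Math.max(...rects.map((r) => r.left + r.width));
  const bottom = Math.max(...rects.map((r) => r.top + r.height));
  return { left, top, width: right - left, height: bottom - top };
}

// pad = 50 is an assumed value for this sketch.
function expand(rect: Rect, screen: { width: number; height: number }, pad = 50): Rect {
  const left = Math.max(0, rect.left - pad);
  const top = Math.max(0, rect.top - pad);
  return {
    left,
    top,
    width: Math.min(screen.width - left, rect.width + 2 * pad),
    height: Math.min(screen.height - top, rect.height + 2 * pad),
  };
}

// Hypothetical bboxes for illustration:
const target = bboxToRect([320, 200, 420, 240]);
const reference = bboxToRect([300, 180, 360, 260]);
console.log(expand(mergeRects([target, reference]), { width: 1280, height: 800 }));
```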
2129
- // src/ai-model/inspect.ts
2130
- var debugInspect = getDebug3("ai:inspect");
2131
- var debugSection = getDebug3("ai:section");
2132
- var extraTextFromUserPrompt = (prompt) => {
2133
- if (typeof prompt === "string") {
2134
- return prompt;
2135
- } else {
2136
- return prompt.prompt;
2137
- }
2138
- };
2139
- var promptsToChatParam = async (multimodalPrompt) => {
2140
- const msgs = [];
2141
- if (multimodalPrompt?.images?.length) {
2142
- msgs.push({
2143
- role: "user",
2144
- content: [
2145
- {
2146
- type: "text",
2147
- text: "Next, I will provide all the reference images."
2148
- }
2149
- ]
2150
- });
2151
- for (const item of multimodalPrompt.images) {
2152
- const base64 = await preProcessImageUrl(
2153
- item.url,
2154
- !!multimodalPrompt.convertHttpImage2Base64
2155
- );
2156
- msgs.push({
2157
- role: "user",
2158
- content: [
2159
- {
2160
- type: "text",
2161
- text: `reference image ${item.name}:`
2162
- }
2163
- ]
2164
- });
2165
- msgs.push({
2166
- role: "user",
2167
- content: [
2168
- {
2169
- type: "image_url",
2170
- image_url: {
2171
- url: base64,
2172
- detail: "high"
2173
- }
2174
- }
2175
- ]
2176
- });
2177
- }
2178
- }
2179
- return msgs;
2180
- };
2181
- async function AiLocateElement(options) {
2182
- const { context, targetElementDescription, callAI } = options;
2183
- const { screenshotBase64 } = context;
2184
- const { description, elementById, insertElementByPosition } = await describeUserPage(context);
2185
- assert5(
2186
- targetElementDescription,
2187
- "cannot find the target element description"
2188
- );
2189
- const userInstructionPrompt = await findElementPrompt.format({
2190
- pageDescription: description,
2191
- targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
2192
- });
2193
- const systemPrompt = systemPromptToLocateElement(vlLocateMode4());
2194
- let imagePayload = screenshotBase64;
2195
- if (options.searchConfig) {
2196
- assert5(
2197
- options.searchConfig.rect,
2198
- "searchArea is provided but its rect cannot be found. Failed to locate element"
2199
- );
2200
- assert5(
2201
- options.searchConfig.imageBase64,
2202
- "searchArea is provided but its imageBase64 cannot be found. Failed to locate element"
2203
- );
2204
- imagePayload = options.searchConfig.imageBase64;
2205
- } else if (vlLocateMode4() === "qwen-vl") {
2206
- imagePayload = await paddingToMatchBlockByBase64(imagePayload);
2207
- } else if (!vlLocateMode4()) {
2208
- imagePayload = await markupImageForLLM(
2209
- screenshotBase64,
2210
- context.tree,
2211
- context.size
2212
- );
2213
- }
2214
- const msgs = [
2215
- { role: "system", content: systemPrompt },
2216
- {
2217
- role: "user",
2218
- content: [
2219
- {
2220
- type: "image_url",
2221
- image_url: {
2222
- url: imagePayload,
2223
- detail: "high"
2224
- }
2225
- },
2226
- {
2227
- type: "text",
2228
- text: userInstructionPrompt
2229
- }
2230
- ]
2231
- }
2232
- ];
2233
- if (typeof targetElementDescription !== "string") {
2234
- const addOns = await promptsToChatParam({
2235
- images: targetElementDescription.images,
2236
- convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64
2237
- });
2238
- msgs.push(...addOns);
2239
- }
2240
- const callAIFn = callAI || callToGetJSONObject;
2241
- const res = await callAIFn(msgs, 1 /* INSPECT_ELEMENT */);
2242
- const rawResponse = JSON.stringify(res.content);
2243
- let resRect;
2244
- let matchedElements = "elements" in res.content ? res.content.elements : [];
2245
- let errors = "errors" in res.content ? res.content.errors : [];
2246
- try {
2247
- if ("bbox" in res.content && Array.isArray(res.content.bbox)) {
2248
- resRect = adaptBboxToRect(
2249
- res.content.bbox,
2250
- options.searchConfig?.rect?.width || context.size.width,
2251
- options.searchConfig?.rect?.height || context.size.height,
2252
- options.searchConfig?.rect?.left,
2253
- options.searchConfig?.rect?.top
2254
- );
2255
- debugInspect("resRect", resRect);
2256
- const rectCenter = {
2257
- x: resRect.left + resRect.width / 2,
2258
- y: resRect.top + resRect.height / 2
2259
- };
2260
- let element = elementByPositionWithElementInfo(context.tree, rectCenter);
2261
- const distanceToCenter = element ? distance({ x: element.center[0], y: element.center[1] }, rectCenter) : 0;
2262
- if (!element || distanceToCenter > distanceThreshold) {
2263
- element = insertElementByPosition(rectCenter);
2264
- }
2265
- if (element) {
2266
- matchedElements = [element];
2267
- errors = [];
2268
- }
2269
- }
2270
- } catch (e) {
2271
- const msg = e instanceof Error ? `Failed to parse bbox: ${e.message}` : "unknown error in locate";
2272
- if (!errors || errors?.length === 0) {
2273
- errors = [msg];
2274
- } else {
2275
- errors.push(`(${msg})`);
2276
- }
2277
- }
2278
- return {
2279
- rect: resRect,
2280
- parseResult: {
2281
- elements: matchedElements,
2282
- errors
2283
- },
2284
- rawResponse,
2285
- elementById,
2286
- usage: res.usage,
2287
- isOrderSensitive: typeof res.content === "object" && res.content !== null && "isOrderSensitive" in res.content ? res.content.isOrderSensitive : void 0
2288
- };
2289
- }
2290
- async function AiLocateSection(options) {
2291
- const { context, sectionDescription } = options;
2292
- const { screenshotBase64 } = context;
2293
- const systemPrompt = systemPromptToLocateSection(vlLocateMode4());
2294
- const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
2295
- sectionDescription: extraTextFromUserPrompt(sectionDescription)
2296
- });
2297
- const msgs = [
2298
- { role: "system", content: systemPrompt },
2299
- {
2300
- role: "user",
2301
- content: [
2302
- {
2303
- type: "image_url",
2304
- image_url: {
2305
- url: screenshotBase64,
2306
- detail: "high"
2307
- }
2308
- },
2309
- {
2310
- type: "text",
2311
- text: sectionLocatorInstructionText
2312
- }
2313
- ]
2314
- }
2315
- ];
2316
- if (typeof sectionDescription !== "string") {
2317
- const addOns = await promptsToChatParam({
2318
- images: sectionDescription.images,
2319
- convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64
2320
- });
2321
- msgs.push(...addOns);
2322
- }
2323
- const result = await callAiFn(
2324
- msgs,
2325
- 2 /* EXTRACT_DATA */
2326
- );
2327
- let sectionRect;
2328
- const sectionBbox = result.content.bbox;
2329
- if (sectionBbox) {
2330
- const targetRect = adaptBboxToRect(
2331
- sectionBbox,
2332
- context.size.width,
2333
- context.size.height
2334
- );
2335
- debugSection("original targetRect %j", targetRect);
2336
- const referenceBboxList = result.content.references_bbox || [];
2337
- debugSection("referenceBboxList %j", referenceBboxList);
2338
- const referenceRects = referenceBboxList.filter((bbox) => Array.isArray(bbox)).map((bbox) => {
2339
- return adaptBboxToRect(bbox, context.size.width, context.size.height);
2340
- });
2341
- debugSection("referenceRects %j", referenceRects);
2342
- const mergedRect = mergeRects([targetRect, ...referenceRects]);
2343
- debugSection("mergedRect %j", mergedRect);
2344
- sectionRect = expandSearchArea(mergedRect, context.size);
2345
- debugSection("expanded sectionRect %j", sectionRect);
2346
- }
2347
- let imageBase64 = screenshotBase64;
2348
- if (sectionRect) {
2349
- imageBase64 = await cropByRect(
2350
- screenshotBase64,
2351
- sectionRect,
2352
- getAIConfigInBoolean2(MIDSCENE_USE_QWEN_VL)
2353
- );
2354
- }
2355
- return {
2356
- rect: sectionRect,
2357
- imageBase64,
2358
- error: result.content.error,
2359
- rawResponse: JSON.stringify(result.content),
2360
- usage: result.usage
2361
- };
2362
- }
2363
- async function AiExtractElementInfo(options) {
2364
- const { dataQuery, context, extractOption, multimodalPrompt } = options;
2365
- const systemPrompt = systemPromptToExtract();
2366
- const { screenshotBase64 } = context;
2367
- const { description, elementById } = await describeUserPage(context, {
2368
- truncateTextLength: 200,
2369
- filterNonTextContent: false,
2370
- visibleOnly: false,
2371
- domIncluded: extractOption?.domIncluded
2372
- });
2373
- const extractDataPromptText = await extractDataQueryPrompt(
2374
- description,
2375
- dataQuery
2376
- );
2377
- const userContent = [];
2378
- if (extractOption?.screenshotIncluded !== false) {
2379
- userContent.push({
2380
- type: "image_url",
2381
- image_url: {
2382
- url: screenshotBase64,
2383
- detail: "high"
2384
- }
2385
- });
2386
- }
2387
- userContent.push({
2388
- type: "text",
2389
- text: extractDataPromptText
2390
- });
2391
- const msgs = [
2392
- { role: "system", content: systemPrompt },
2393
- {
2394
- role: "user",
2395
- content: userContent
2396
- }
2397
- ];
2398
- if (options.extractOption?.returnThought) {
2399
- msgs.push({
2400
- role: "user",
2401
- content: "Please provide reasons."
2402
- });
2403
- }
2404
- if (multimodalPrompt) {
2405
- const addOns = await promptsToChatParam({
2406
- images: multimodalPrompt.images,
2407
- convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64
2408
- });
2409
- msgs.push(...addOns);
2410
- }
2411
- const result = await callAiFn(
2412
- msgs,
2413
- 2 /* EXTRACT_DATA */
2414
- );
2415
- return {
2416
- parseResult: result.content,
2417
- elementById,
2418
- usage: result.usage
2419
- };
2420
- }
2421
- async function AiAssert(options) {
2422
- const { assertion, context } = options;
2423
- assert5(assertion, "assertion should not be empty");
2424
- const { screenshotBase64 } = context;
2425
- const systemPrompt = systemPromptToAssert({
2426
- isUITars: getAIConfigInBoolean2(MIDSCENE_USE_VLM_UI_TARS)
2427
- });
2428
- const assertionText = extraTextFromUserPrompt(assertion);
2429
- const msgs = [
2430
- { role: "system", content: systemPrompt },
2431
- {
2432
- role: "user",
2433
- content: [
2434
- {
2435
- type: "image_url",
2436
- image_url: {
2437
- url: screenshotBase64,
2438
- detail: "high"
2439
- }
2440
- },
2441
- {
2442
- type: "text",
2443
- text: `
2444
- Here is the assertion. Please tell whether it is truthy according to the screenshot.
2445
- =====================================
2446
- ${assertionText}
2447
- =====================================
2448
- `
2449
- }
2450
- ]
2451
- }
2452
- ];
2453
- if (typeof assertion !== "string") {
2454
- const addOns = await promptsToChatParam({
2455
- images: assertion.images,
2456
- convertHttpImage2Base64: assertion.convertHttpImage2Base64
2457
- });
2458
- msgs.push(...addOns);
2459
- }
2460
- const { content: assertResult, usage } = await callAiFn(
2461
- msgs,
2462
- 0 /* ASSERT */
2463
- );
2464
- return {
2465
- content: assertResult,
2466
- usage
2467
- };
2468
- }
2469
-
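All of the inspect helpers above assemble OpenAI-style multimodal messages: a system prompt followed by a user message whose `content` is an array of `image_url` and `text` parts, plus extra user messages for any reference images. A typed sketch of that message shape, with an abbreviated system prompt and a hypothetical base64 payload:

```ts
// Sketch: the multimodal message layout assembled by
// AiLocateElement / AiAssert / promptsToChatParam above.
type ContentPart =
  | { type: "text"; text: string }
  | { type: "image_url"; image_url: { url: string; detail?: "high" | "low" } };

interface ChatMessage {
  role: "system" | "user";
  content: string | ContentPart[];
}

function buildAssertMessages(screenshotBase64: string, assertion: string): ChatMessage[] {
  return [
    // The real system prompt comes from systemPromptToAssert; abbreviated here.
    { role: "system", content: "You are an expert in UI assertions..." },
    {
      role: "user",
      content: [
        { type: "image_url", image_url: { url: screenshotBase64, detail: "high" } },
        { type: "text", text: `Here is the assertion:\n${assertion}` },
      ],
    },
  ];
}

// Hypothetical payload for illustration:
const msgs = buildAssertMessages(
  "data:image/png;base64,iVBORw0...",
  "the login button is visible",
);
console.log(JSON.stringify(msgs[1], null, 2));
```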
2470
- // src/ai-model/llm-planning.ts
2471
- import { vlLocateMode as vlLocateMode5 } from "@midscene/shared/env";
2472
- import { paddingToMatchBlockByBase64 as paddingToMatchBlockByBase642 } from "@midscene/shared/img";
2473
- import { assert as assert6 } from "@midscene/shared/utils";
2474
- async function plan(userInstruction, opts) {
2475
- const { callAI, context } = opts || {};
2476
- const { screenshotBase64, size } = context;
2477
- const { description: pageDescription, elementById } = await describeUserPage(context);
2478
- const systemPrompt = await systemPromptToTaskPlanning({
2479
- actionSpace: opts.actionSpace,
2480
- vlMode: vlLocateMode5()
2481
- });
2482
- const taskBackgroundContextText = generateTaskBackgroundContext(
2483
- userInstruction,
2484
- opts.log,
2485
- opts.actionContext
2486
- );
2487
- const userInstructionPrompt = await automationUserPrompt(
2488
- vlLocateMode5()
2489
- ).format({
2490
- pageDescription,
2491
- taskBackgroundContext: taskBackgroundContextText
2492
- });
2493
- let imagePayload = screenshotBase64;
2494
- if (vlLocateMode5() === "qwen-vl") {
2495
- imagePayload = await paddingToMatchBlockByBase642(imagePayload);
2496
- } else if (!vlLocateMode5()) {
2497
- imagePayload = await markupImageForLLM(
2498
- screenshotBase64,
2499
- context.tree,
2500
- context.size
2501
- );
2502
- }
2503
- warnGPT4oSizeLimit(size);
2504
- const msgs = [
2505
- { role: "system", content: systemPrompt },
2506
- {
2507
- role: "user",
2508
- content: [
2509
- {
2510
- type: "image_url",
2511
- image_url: {
2512
- url: imagePayload,
2513
- detail: "high"
2514
- }
2515
- },
2516
- {
2517
- type: "text",
2518
- text: userInstructionPrompt
2519
- }
2520
- ]
2521
- }
2522
- ];
2523
- const call3 = callAI || callAiFn;
2524
- const { content, usage } = await call3(msgs, 3 /* PLAN */);
2525
- const rawResponse = JSON.stringify(content, void 0, 2);
2526
- const planFromAI = content;
2527
- const actions = (planFromAI.action?.type ? [planFromAI.action] : planFromAI.actions) || [];
2528
- const returnValue = {
2529
- ...planFromAI,
2530
- actions,
2531
- rawResponse,
2532
- usage,
2533
- yamlFlow: buildYamlFlowFromPlans(actions, planFromAI.sleep)
2534
- };
2535
- assert6(planFromAI, "can't get plans from AI");
2536
- if (vlLocateMode5()) {
2537
- actions.forEach((action) => {
2538
- if (action.locate) {
2539
- try {
2540
- action.locate = fillBboxParam(action.locate, size.width, size.height);
2541
- } catch (e) {
2542
- throw new Error(
2543
- `Failed to fill locate param: ${planFromAI.error} (${e instanceof Error ? e.message : "unknown error"})`,
2544
- {
2545
- cause: e
2546
- }
2547
- );
2548
- }
2549
- }
2550
- });
2551
- assert6(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
2552
- } else {
2553
- actions.forEach((action) => {
2554
- if (action.locate?.id) {
2555
- const element = elementById(action.locate.id);
2556
- if (element) {
2557
- action.locate.id = element.id;
2558
- }
2559
- }
2560
- });
2561
- }
2562
- if (actions.length === 0 && returnValue.more_actions_needed_by_instruction && !returnValue.sleep) {
2563
- console.warn(
2564
- "No actions planned for the prompt, but model said more actions are needed:",
2565
- userInstruction
2566
- );
2567
- }
2568
- return returnValue;
2569
- }
2570
-
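`plan` accepts either a single `action` object or an `actions` array from the model and normalizes both into one list before any post-processing. A tiny sketch of that normalization with a hypothetical response type:

```ts
// Sketch: normalizing the planner response, which may carry either a single
// `action` or an `actions` array, into one list (as plan() does above).
interface PlannedAction {
  type: string;
  param?: unknown;
  locate?: unknown;
}
interface PlanResponse {
  action?: PlannedAction;
  actions?: PlannedAction[];
  error?: string;
}

function normalizeActions(plan: PlanResponse): PlannedAction[] {
  return (plan.action?.type ? [plan.action] : plan.actions) ?? [];
}

// Both shapes yield the same result:
console.log(normalizeActions({ action: { type: "Tap" } }));
console.log(normalizeActions({ actions: [{ type: "Tap" }] }));
```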
2571
- // src/ai-model/ui-tars-planning.ts
2572
- import {
2573
- UITarsModelVersion,
2574
- uiTarsModelVersion as uiTarsModelVersion2,
2575
- vlLocateMode as vlLocateMode6
2576
- } from "@midscene/shared/env";
2577
- import { resizeImgBase64 } from "@midscene/shared/img";
2578
- import { transformHotkeyInput } from "@midscene/shared/keyboard-layout";
2579
- import { getDebug as getDebug4 } from "@midscene/shared/logger";
2580
- import { assert as assert7 } from "@midscene/shared/utils";
2581
- import { actionParser } from "@ui-tars/action-parser";
2582
-
2583
- // src/ai-model/prompt/ui-tars-planning.ts
2584
- import { getPreferredLanguage as getPreferredLanguage2 } from "@midscene/shared/env";
2585
- function getUiTarsPlanningPrompt() {
2586
- const preferredLanguage = getPreferredLanguage2();
2587
- return `
2588
- You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
2589
-
2590
- ## Output Format
2591
- \`\`\`
2592
- Thought: ...
2593
- Action: ...
2594
- \`\`\`
2595
-
2596
- ## Action Space
2597
-
2598
- click(start_box='[x1, y1, x2, y2]')
2599
- left_double(start_box='[x1, y1, x2, y2]')
2600
- right_single(start_box='[x1, y1, x2, y2]')
2601
- drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
2602
- hotkey(key='')
2603
- type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
2604
- scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
2605
- wait() #Sleep for 5s and take a screenshot to check for any changes.
2606
- finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
2607
-
2608
-
2609
- ## Note
2610
- - Use ${preferredLanguage} in \`Thought\` part.
2611
- - Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
2612
-
2613
- ## User Instruction
2614
- `;
2615
- }
2616
- var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();
2617
-
2618
- // src/ai-model/ui-tars-planning.ts
2619
- var debug = getDebug4("ui-tars-planning");
2620
- var bboxSize = 10;
2621
- var pointToBbox = (point, width, height) => {
2622
- return [
2623
- Math.round(Math.max(point.x - bboxSize / 2, 0)),
2624
- Math.round(Math.max(point.y - bboxSize / 2, 0)),
2625
- Math.round(Math.min(point.x + bboxSize / 2, width)),
2626
- Math.round(Math.min(point.y + bboxSize / 2, height))
2627
- ];
2628
- };
2629
- async function vlmPlanning(options) {
2630
- const { conversationHistory, userInstruction, size } = options;
2631
- const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;
2632
- const res = await call2(
2633
- [
2634
- {
2635
- role: "user",
2636
- content: systemPrompt
2637
- },
2638
- ...conversationHistory
2639
- ],
2640
- 1 /* INSPECT_ELEMENT */
2641
- );
2642
- const convertedText = convertBboxToCoordinates(res.content);
2643
- const modelVer = uiTarsModelVersion2();
2644
- const { parsed } = actionParser({
2645
- prediction: convertedText,
2646
- factor: [1e3, 1e3],
2647
- screenContext: {
2648
- width: size.width,
2649
- height: size.height
2650
- },
2651
- modelVer: modelVer || void 0
2652
- });
2653
- debug("modelVer", modelVer, "parsed", JSON.stringify(parsed));
2654
- const transformActions = [];
2655
- parsed.forEach((action) => {
2656
- if (action.action_type === "click") {
2657
- assert7(action.action_inputs.start_box, "start_box is required");
2658
- const point = getPoint(action.action_inputs.start_box, size);
2659
- transformActions.push({
2660
- type: "Locate",
2661
- param: {},
2662
- locate: {
2663
- prompt: action.thought || "",
2664
- bbox: pointToBbox(
2665
- { x: point[0], y: point[1] },
2666
- size.width,
2667
- size.height
2668
- )
2669
- }
2670
- });
2671
- transformActions.push({
2672
- type: "Tap",
2673
- locate: {
2674
- prompt: action.thought || "",
2675
- bbox: pointToBbox(
2676
- { x: point[0], y: point[1] },
2677
- size.width,
2678
- size.height
2679
- )
2680
- },
2681
- param: action.thought || ""
2682
- });
2683
- } else if (action.action_type === "drag") {
2684
- assert7(action.action_inputs.start_box, "start_box is required");
2685
- assert7(action.action_inputs.end_box, "end_box is required");
2686
- const startPoint = getPoint(action.action_inputs.start_box, size);
2687
- const endPoint = getPoint(action.action_inputs.end_box, size);
2688
- transformActions.push({
2689
- type: "Drag",
2690
- param: {
2691
- start_box: { x: startPoint[0], y: startPoint[1] },
2692
- end_box: { x: endPoint[0], y: endPoint[1] }
2693
- },
2694
- locate: null,
2695
- thought: action.thought || ""
2696
- });
2697
- } else if (action.action_type === "type") {
2698
- transformActions.push({
2699
- type: "Input",
2700
- param: {
2701
- value: action.action_inputs.content
2702
- },
2703
- locate: null,
2704
- thought: action.thought || ""
2705
- });
2706
- } else if (action.action_type === "scroll") {
2707
- transformActions.push({
2708
- type: "Scroll",
2709
- param: {
2710
- direction: action.action_inputs.direction
2711
- },
2712
- locate: null,
2713
- thought: action.thought || ""
2714
- });
2715
- } else if (action.action_type === "finished") {
2716
- transformActions.push({
2717
- type: "Finished",
2718
- param: {},
2719
- locate: null,
2720
- thought: action.thought || ""
2721
- });
2722
- } else if (action.action_type === "hotkey") {
2723
- if (!action.action_inputs.key) {
2724
- console.warn(
2725
- "No key found in action: hotkey. Will not perform action."
2726
- );
2727
- } else {
2728
- const keys = transformHotkeyInput(action.action_inputs.key);
2729
- transformActions.push({
2730
- type: "KeyboardPress",
2731
- param: {
2732
- value: keys
2733
- },
2734
- locate: null,
2735
- thought: action.thought || ""
2736
- });
2737
- }
2738
- } else if (action.action_type === "wait") {
2739
- transformActions.push({
2740
- type: "Sleep",
2741
- param: {
2742
- timeMs: 1e3
2743
- },
2744
- locate: null,
2745
- thought: action.thought || ""
2746
- });
2747
- } else if (action.action_type === "androidBackButton") {
2748
- transformActions.push({
2749
- type: "AndroidBackButton",
2750
- param: {},
2751
- locate: null,
2752
- thought: action.thought || ""
2753
- });
2754
- } else if (action.action_type === "androidHomeButton") {
2755
- transformActions.push({
2756
- type: "AndroidHomeButton",
2757
- param: {},
2758
- locate: null,
2759
- thought: action.thought || ""
2760
- });
2761
- } else if (action.action_type === "androidRecentAppsButton") {
2762
- transformActions.push({
2763
- type: "AndroidRecentAppsButton",
2764
- param: {}
2765
- });
2766
- } else if (action.action_type === "androidLongPress") {
2767
- assert7(
2768
- action.action_inputs.start_coords,
2769
- "start_coords is required for androidLongPress"
2770
- );
2771
- const point = action.action_inputs.start_coords;
2772
- transformActions.push({
2773
- type: "AndroidLongPress",
2774
- param: {
2775
- x: point[0],
2776
- y: point[1],
2777
- duration: 1e3
2778
- },
2779
- locate: null,
2780
- thought: action.thought || ""
2781
- });
2782
- } else if (action.action_type === "androidPull") {
2783
- const pullDirection = action.action_inputs.direction || "down";
2784
- const startPoint = action.action_inputs.start_coords ? {
2785
- x: action.action_inputs.start_coords[0],
2786
- y: action.action_inputs.start_coords[1]
2787
- } : void 0;
2788
- transformActions.push({
2789
- type: "AndroidPull",
2790
- param: {
2791
- direction: pullDirection,
2792
- startPoint,
2793
- distance: action.action_inputs.distance,
2794
- duration: action.action_inputs.duration || 500
2795
- },
2796
- locate: null,
2797
- thought: action.thought || ""
2798
- });
2799
- }
2800
- });
2801
- if (transformActions.length === 0) {
2802
- throw new Error(`No actions found, response: ${res.content}`, {
2803
- cause: {
2804
- prediction: res.content,
2805
- parsed
2806
- }
2807
- });
2808
- }
2809
- return {
2810
- actions: transformActions,
2811
- actionsFromModel: parsed,
2812
- action_summary: getSummary(res.content),
2813
- usage: res.usage,
2814
- rawResponse: JSON.stringify(res.content, void 0, 2)
2815
- };
2816
- }
2817
- function convertBboxToCoordinates(text) {
2818
- const pattern = /<bbox>(\d+)\s+(\d+)\s+(\d+)\s+(\d+)<\/bbox>/g;
2819
- function replaceMatch(match, x1, y1, x2, y2) {
2820
- const x1Num = Number.parseInt(x1, 10);
2821
- const y1Num = Number.parseInt(y1, 10);
2822
- const x2Num = Number.parseInt(x2, 10);
2823
- const y2Num = Number.parseInt(y2, 10);
2824
- const x = Math.floor((x1Num + x2Num) / 2);
2825
- const y = Math.floor((y1Num + y2Num) / 2);
2826
- return `(${x},${y})`;
2827
- }
2828
- const cleanedText = text.replace(/\[EOS\]/g, "");
2829
- return cleanedText.replace(pattern, replaceMatch).trim();
2830
- }
2831
- function getPoint(startBox, size) {
2832
- const [x, y] = JSON.parse(startBox);
2833
- return [x * size.width, y * size.height];
2834
- }
2835
- async function resizeImageForUiTars(imageBase64, size) {
2836
- if (vlLocateMode6() === "vlm-ui-tars" && uiTarsModelVersion2() === UITarsModelVersion.V1_5) {
2837
- debug("ui-tars-v1.5, will check image size", size);
2838
- const currentPixels = size.width * size.height;
2839
- const maxPixels = 16384 * 28 * 28;
2840
- if (currentPixels > maxPixels) {
2841
- const resizeFactor = Math.sqrt(maxPixels / currentPixels);
2842
- const newWidth = Math.floor(size.width * resizeFactor);
2843
- const newHeight = Math.floor(size.height * resizeFactor);
2844
- debug(
2845
- "resize image for ui-tars, new width: %s, new height: %s",
2846
- newWidth,
2847
- newHeight
2848
- );
2849
- const resizedImage = await resizeImgBase64(imageBase64, {
2850
- width: newWidth,
2851
- height: newHeight
2852
- });
2853
- return resizedImage;
2854
- }
2855
- }
2856
- return imageBase64;
2857
- }
2858
-
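Two pieces of arithmetic above are easy to miss: `<bbox>x1 y1 x2 y2</bbox>` tags are collapsed to their integer center point, and for UI-TARS v1.5 an oversized screenshot is scaled so that width × height stays within 16384 × 28 × 28 pixels, using a factor of `sqrt(maxPixels / currentPixels)`. A standalone sketch of both calculations:

```ts
// Sketch: the bbox-to-center conversion and the UI-TARS v1.5 resize factor
// computed above (max pixels = 16384 * 28 * 28).
function bboxTagToCenter(text: string): string {
  return text
    .replace(/\[EOS\]/g, "")
    .replace(/<bbox>(\d+)\s+(\d+)\s+(\d+)\s+(\d+)<\/bbox>/g, (_m, x1, y1, x2, y2) => {
      const x = Math.floor((Number(x1) + Number(x2)) / 2);
      const y = Math.floor((Number(y1) + Number(y2)) / 2);
      return `(${x},${y})`;
    })
    .trim();
}

function targetSize(width: number, height: number): { width: number; height: number } {
  const maxPixels = 16384 * 28 * 28; // ≈ 12.8M pixels
  const currentPixels = width * height;
  if (currentPixels <= maxPixels) return { width, height };
  const factor = Math.sqrt(maxPixels / currentPixels);
  return { width: Math.floor(width * factor), height: Math.floor(height * factor) };
}

console.log(bboxTagToCenter("click(start_box='<bbox>100 200 300 400</bbox>')"));
// -> click(start_box='(200,300)')
console.log(targetSize(5120, 2880)); // downscaled to stay under the pixel budget
```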
2859
- export {
2860
- systemPromptToLocateElement,
2861
- call2 as call,
2862
- callToGetJSONObject,
2863
- callAiFnWithStringResponse,
2864
- AIActionType,
2865
- callAiFn,
2866
- adaptBboxToRect,
2867
- expandSearchArea,
2868
- elementByPositionWithElementInfo,
2869
- describeUserPage,
2870
- generateYamlTest,
2871
- generateYamlTestStream,
2872
- generatePlaywrightTest,
2873
- generatePlaywrightTestStream,
2874
- AiLocateElement,
2875
- AiLocateSection,
2876
- AiExtractElementInfo,
2877
- AiAssert,
2878
- plan,
2879
- vlmPlanning,
2880
- resizeImageForUiTars
2881
- };
2882
-
2883
- //# sourceMappingURL=chunk-DDYIQHOA.js.map