npm - @midscene/core - Versions diffs - 0.17.2-beta-20250521031635.0 → 0.17.2-beta-20250521131112.0 - Mend

@midscene/core 0.17.2-beta-20250521031635.0 → 0.17.2-beta-20250521131112.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

package/dist/es/ai-model.d.ts +3 -3
package/dist/es/ai-model.js +1 -1
package/dist/es/{chunk-ZKT2DWJO.js → chunk-GHP3FR4O.js} +3 -3
package/dist/es/{chunk-OINLEVDF.js → chunk-K2IXQ5O2.js} +132 -82
package/dist/es/chunk-K2IXQ5O2.js.map +1 -0
package/dist/es/index.d.ts +8 -5
package/dist/es/index.js +85 -2
package/dist/es/index.js.map +1 -1
package/dist/es/{llm-planning-26db5c81.d.ts → llm-planning-3bdabecb.d.ts} +4 -2
package/dist/es/tree.d.ts +1 -1
package/dist/es/{types-cbcbeb4e.d.ts → types-01381369.d.ts} +26 -2
package/dist/es/utils.d.ts +1 -1
package/dist/es/utils.js +1 -1
package/dist/lib/ai-model.d.ts +3 -3
package/dist/lib/ai-model.js +2 -2
package/dist/lib/{chunk-ZKT2DWJO.js → chunk-GHP3FR4O.js} +3 -3
package/dist/lib/{chunk-OINLEVDF.js → chunk-K2IXQ5O2.js} +140 -90
package/dist/lib/chunk-K2IXQ5O2.js.map +1 -0
package/dist/lib/index.d.ts +8 -5
package/dist/lib/index.js +93 -10
package/dist/lib/index.js.map +1 -1
package/dist/lib/{llm-planning-26db5c81.d.ts → llm-planning-3bdabecb.d.ts} +4 -2
package/dist/lib/tree.d.ts +1 -1
package/dist/{types/types-cbcbeb4e.d.ts → lib/types-01381369.d.ts} +26 -2
package/dist/lib/utils.d.ts +1 -1
package/dist/lib/utils.js +2 -2
package/dist/types/ai-model.d.ts +3 -3
package/dist/types/index.d.ts +8 -5
package/dist/types/{llm-planning-26db5c81.d.ts → llm-planning-3bdabecb.d.ts} +4 -2
package/dist/types/tree.d.ts +1 -1
package/dist/{lib/types-cbcbeb4e.d.ts → types/types-01381369.d.ts} +26 -2
package/dist/types/utils.d.ts +1 -1
package/package.json +2 -2
package/dist/es/chunk-OINLEVDF.js.map +0 -1
package/dist/lib/chunk-OINLEVDF.js.map +0 -1
/package/dist/es/{chunk-ZKT2DWJO.js.map → chunk-GHP3FR4O.js.map} +0 -0
/package/dist/lib/{chunk-ZKT2DWJO.js.map → chunk-GHP3FR4O.js.map} +0 -0

package/dist/lib/{chunk-OINLEVDF.js → chunk-K2IXQ5O2.js} RENAMED

@@ -28,8 +28,6 @@ var _identity = require('@azure/identity');
@@ -310,53 +308,9 @@ function buildYamlFlowFromPlans(plans, sleep) {
   return flow;
 }
-// src/ai-model/prompt/ui-tars-planning.ts
-function getTimeZoneInfo() {
-  const timeZone = Intl.DateTimeFormat().resolvedOptions().timeZone;
-  const offset = -(/* @__PURE__ */ new Date()).getTimezoneOffset() / 60;
-  return {
-    timezone: `UTC${offset >= 0 ? "+" : ""}${offset}`,
-    isChina: timeZone === "Asia/Shanghai"
-  };
-}
-function getLanguage() {
-  return getTimeZoneInfo().isChina ? "Chinese" : "English";
-}
-function getUiTarsPlanningPrompt() {
-  const language2 = getLanguage();
-  return `
-You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
-## Output Format
-\`\`\`
-Thought: ...
-Action: ...
-\`\`\`
-## Action Space
-click(start_box='[x1, y1, x2, y2]')
-left_double(start_box='[x1, y1, x2, y2]')
-right_single(start_box='[x1, y1, x2, y2]')
-drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
-hotkey(key='')
-type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
-scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
-wait() #Sleep for 5s and take a screenshot to check for any changes.
-finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
-## Note
-- Use ${language2} in \`Thought\` part.
-- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
-## User Instruction
-`;
-}
-var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();
 // src/ai-model/prompt/assertion.ts
-var language = getTimeZoneInfo().isChina ? "Chinese" : "English";
+var preferredLanguage = _env.getPreferredLanguage.call(void 0, );
 var defaultAssertionPrompt = "You are a senior testing engineer. User will give an assertion and a screenshot of a page. By carefully viewing the screenshot, please tell whether the assertion is truthy.";
 var defaultAssertionResponseJsonFormat = `Return in the following JSON format:
 {
@@ -373,7 +327,7 @@ var uiTarsAssertionResponseJsonFormat = `## Output Json String Format
 ## Rules **MUST** follow
 - Make sure to return **only** the JSON, with **no additional** text or explanations.
-- Use ${language} in \`thought\` part.
+- Use ${preferredLanguage} in \`thought\` part.
 - You **MUST** strictly follow up the **Output Json String Format**.`;
 function systemPromptToAssert(model) {
   return `${defaultAssertionPrompt}
@@ -1331,10 +1285,10 @@ async function call(messages, AIActionTypeValue, responseFormat) {
   let content;
   let usage;
   const commonConfig = {
-    temperature: _env.getAIConfigInBoolean.call(void 0, _env.MIDSCENE_USE_VLM_UI_TARS) ? 0 : 0.1,
+    temperature: _env.vlLocateMode.call(void 0, ) === "vlm-ui-tars" ? 0 : 0.1,
     stream: false,
     max_tokens: typeof maxTokens === "number" ? maxTokens : Number.parseInt(maxTokens || "2048", 10),
-    ..._env.getAIConfigInBoolean.call(void 0, _env.MIDSCENE_USE_QWEN_VL) ? {
+    ..._env.vlLocateMode.call(void 0, ) === "qwen-vl" ? {
       vl_high_resolution_images: true
     } : {}
   };
@@ -1412,12 +1366,13 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
       case 1 /* INSPECT_ELEMENT */:
         responseFormat = locatorSchema;
         break;
-      case 2 /* EXTRACT_DATA */:
-        responseFormat = { type: "json_object" /* JSON */ };
-        break;
       case 3 /* PLAN */:
         responseFormat = planSchema;
         break;
+      case 2 /* EXTRACT_DATA */:
+      case 4 /* DESCRIBE_ELEMENT */:
+        responseFormat = { type: "json_object" /* JSON */ };
+        break;
     }
   }
   if (model === "gpt-4o-2024-05-13") {
@@ -1493,30 +1448,89 @@ function systemPromptToExtract() {
   return `
 You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
-The user will give you a screenshot, the contents of it (optional), and some data requirements in DATA_DEMAND. You need to extract the data according to the DATA_DEMAND.
+The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to extract the data according to the <DATA_DEMAND>.
+If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
 Return in the following JSON format:
 {
   data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
   errors: [], // string[], error message if any
 }
-`;
+# Example 1
+For example, if the DATA_DEMAND is:
+<DATA_DEMAND>
+{
+  "name": "name shows on the left panel, string",
+  "age": "age shows on the right panel, number",
+  "isAdmin": "if the user is admin, boolean"
 }
-var extractDataPrompt = new (0, _prompts.PromptTemplate)({
-  template: `
-pageDescription: {pageDescription}
+</DATA_DEMAND>
-Extract the following data and place it in the \`data\` field. If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
-DATA_DEMAND start:
-=====================================
-{dataKeys}
+By viewing the screenshot and page contents, you can extract the following data:
+{
+  data: {
+    name: "John",
+    age: 30,
+    isAdmin: true
+  },
+}
+# Example 2
+If the DATA_DEMAND is:
+<DATA_DEMAND>
+the todo items list, string[]
+</DATA_DEMAND>
+By viewing the screenshot and page contents, you can extract the following data:
+{
+  data: ["todo 1", "todo 2", "todo 3"],
+}
+# Example 3
+If the DATA_DEMAND is:
+<DATA_DEMAND>
+the page title, string
+</DATA_DEMAND>
+By viewing the screenshot and page contents, you can extract the following data:
+{
+  data: "todo list",
+}
+`;
+}
+var extractDataQueryPrompt = async (pageDescription, dataQuery) => {
+  let dataQueryText = "";
+  if (typeof dataQuery === "string") {
+    dataQueryText = dataQuery;
+  } else {
+    dataQueryText = JSON.stringify(dataQuery, null, 2);
+  }
+  const extractDataPrompt = new (0, _prompts.PromptTemplate)({
+    template: `
+<PageDescription>
+{pageDescription}
+</PageDescription>
+<DATA_DEMAND>
 {dataQuery}
-=====================================
-DATA_DEMAND ends.
+</DATA_DEMAND>
   `,
-  inputVariables: ["pageDescription", "dataKeys", "dataQuery"]
-});
+    inputVariables: ["pageDescription", "dataQuery"]
+  });
+  return await extractDataPrompt.format({
+    pageDescription,
+    dataQuery: dataQueryText
+  });
+};
 // src/ai-model/prompt/llm-section-locator.ts
@@ -1601,6 +1615,14 @@ async function AiLocateElement(options) {
       context.size
     );
   }
+  let referenceImagePayload;
+  if (_optionalChain([options, 'access', _32 => _32.referenceImage, 'optionalAccess', _33 => _33.rect]) && options.referenceImage.base64) {
+    referenceImagePayload = await _img.cropByRect.call(void 0,
+      options.referenceImage.base64,
+      options.referenceImage.rect,
+      _env.getAIConfigInBoolean.call(void 0, _env.MIDSCENE_USE_QWEN_VL)
+    );
+  }
   const msgs = [
     { role: "system", content: systemPrompt },
     {
@@ -1630,10 +1652,10 @@ async function AiLocateElement(options) {
     if ("bbox" in res.content && Array.isArray(res.content.bbox)) {
       resRect = adaptBboxToRect(
         res.content.bbox,
-        _optionalChain([options, 'access', _32 => _32.searchConfig, 'optionalAccess', _33 => _33.rect, 'optionalAccess', _34 => _34.width]) || context.size.width,
-        _optionalChain([options, 'access', _35 => _35.searchConfig, 'optionalAccess', _36 => _36.rect, 'optionalAccess', _37 => _37.height]) || context.size.height,
-        _optionalChain([options, 'access', _38 => _38.searchConfig, 'optionalAccess', _39 => _39.rect, 'optionalAccess', _40 => _40.left]),
-        _optionalChain([options, 'access', _41 => _41.searchConfig, 'optionalAccess', _42 => _42.rect, 'optionalAccess', _43 => _43.top])
+        _optionalChain([options, 'access', _34 => _34.searchConfig, 'optionalAccess', _35 => _35.rect, 'optionalAccess', _36 => _36.width]) || context.size.width,
+        _optionalChain([options, 'access', _37 => _37.searchConfig, 'optionalAccess', _38 => _38.rect, 'optionalAccess', _39 => _39.height]) || context.size.height,
+        _optionalChain([options, 'access', _40 => _40.searchConfig, 'optionalAccess', _41 => _41.rect, 'optionalAccess', _42 => _42.left]),
+        _optionalChain([options, 'access', _43 => _43.searchConfig, 'optionalAccess', _44 => _44.rect, 'optionalAccess', _45 => _45.top])
       );
       debugInspect("resRect", resRect);
       const rectCenter = {
@@ -1652,7 +1674,7 @@ async function AiLocateElement(options) {
     }
   } catch (e) {
     const msg = e instanceof Error ? `Failed to parse bbox: ${e.message}` : "unknown error in locate";
-    if (!errors || _optionalChain([errors, 'optionalAccess', _44 => _44.length]) === 0) {
+    if (!errors || _optionalChain([errors, 'optionalAccess', _46 => _46.length]) === 0) {
       errors = [msg];
     } else {
       errors.push(`(${msg})`);
@@ -1743,20 +1765,10 @@ async function AiExtractElementInfo(options) {
     context,
     liteContextConfig
   );
-  let dataKeys = "";
-  let dataQueryText = "";
-  if (typeof dataQuery === "string") {
-    dataKeys = "";
-    dataQueryText = dataQuery;
-  } else {
-    dataKeys = `return in key-value style object, keys are ${Object.keys(dataQuery).join(",")}`;
-    dataQueryText = JSON.stringify(dataQuery, null, 2);
-  }
-  const extractDataPromptText = await extractDataPrompt.format({
-    pageDescription: description,
-    dataKeys,
-    dataQuery: dataQueryText
-  });
+  const extractDataPromptText = await extractDataQueryPrompt(
+    description,
+    dataQuery
+  );
   const msgs = [
     { role: "system", content: systemPrompt },
     {
@@ -1884,7 +1896,7 @@ async function plan(userInstruction, opts) {
   const { content, usage } = await call2(msgs, 3 /* PLAN */);
   const rawResponse = JSON.stringify(content, void 0, 2);
   const planFromAI = content;
-  const actions = (_optionalChain([planFromAI, 'access', _45 => _45.action, 'optionalAccess', _46 => _46.type]) ? [planFromAI.action] : planFromAI.actions) || [];
+  const actions = (_optionalChain([planFromAI, 'access', _47 => _47.action, 'optionalAccess', _48 => _48.type]) ? [planFromAI.action] : planFromAI.actions) || [];
   const returnValue = {
     ...planFromAI,
     actions,
@@ -1911,7 +1923,7 @@ async function plan(userInstruction, opts) {
     _utils.assert.call(void 0, !planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
   } else {
     actions.forEach((action) => {
-      if (_optionalChain([action, 'access', _47 => _47.locate, 'optionalAccess', _48 => _48.id])) {
+      if (_optionalChain([action, 'access', _49 => _49.locate, 'optionalAccess', _50 => _50.id])) {
         const element = elementById(action.locate.id);
         if (element) {
           action.locate.id = element.id;
@@ -1939,6 +1951,43 @@ var _keyboardlayout = require('@midscene/shared/keyboard-layout');
 var _actionparser = require('@ui-tars/action-parser');
+// src/ai-model/prompt/ui-tars-planning.ts
+function getUiTarsPlanningPrompt() {
+  const preferredLanguage2 = _env.getPreferredLanguage.call(void 0, );
+  return `
+You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
+## Output Format
+\`\`\`
+Thought: ...
+Action: ...
+\`\`\`
+## Action Space
+click(start_box='[x1, y1, x2, y2]')
+left_double(start_box='[x1, y1, x2, y2]')
+right_single(start_box='[x1, y1, x2, y2]')
+drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
+hotkey(key='')
+type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
+scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
+wait() #Sleep for 5s and take a screenshot to check for any changes.
+finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
+## Note
+- Use ${preferredLanguage2} in \`Thought\` part.
+- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
+## User Instruction
+`;
+}
+var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();
+// src/ai-model/ui-tars-planning.ts
 var debug = _logger.getDebug.call(void 0, "ui-tars-planning");
 var bboxSize = 10;
 var pointToBbox = (point, width, height) => {
@@ -2159,6 +2208,7 @@ async function resizeImageForUiTars(imageBase64, size) {
-exports.systemPromptToLocateElement = systemPromptToLocateElement; exports.elementByPositionWithElementInfo = elementByPositionWithElementInfo; exports.describeUserPage = describeUserPage; exports.callToGetJSONObject = callToGetJSONObject; exports.callAiFn = callAiFn; exports.adaptBboxToRect = adaptBboxToRect; exports.AiLocateElement = AiLocateElement; exports.AiLocateSection = AiLocateSection; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning; exports.resizeImageForUiTars = resizeImageForUiTars;
-//# sourceMappingURL=chunk-OINLEVDF.js.map
+exports.systemPromptToLocateElement = systemPromptToLocateElement; exports.elementByPositionWithElementInfo = elementByPositionWithElementInfo; exports.describeUserPage = describeUserPage; exports.callToGetJSONObject = callToGetJSONObject; exports.callAiFn = callAiFn; exports.adaptBboxToRect = adaptBboxToRect; exports.expandSearchArea = expandSearchArea; exports.AiLocateElement = AiLocateElement; exports.AiLocateSection = AiLocateSection; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning; exports.resizeImageForUiTars = resizeImageForUiTars;
+//# sourceMappingURL=chunk-K2IXQ5O2.js.map