@midscene/core 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,7 +9,7 @@
9
9
 
10
10
 
11
11
 
12
- var _chunkCERQVVPJjs = require('./chunk-CERQVVPJ.js');
12
+ var _chunk2SOSTOJYjs = require('./chunk-2SOSTOJY.js');
13
13
  require('./chunk-JP3JBDZS.js');
14
14
  require('./chunk-YSQDPG26.js');
15
15
 
@@ -23,4 +23,4 @@ require('./chunk-YSQDPG26.js');
23
23
 
24
24
 
25
25
 
26
- exports.AiAssert = _chunkCERQVVPJjs.AiAssert; exports.AiExtractElementInfo = _chunkCERQVVPJjs.AiExtractElementInfo; exports.AiInspectElement = _chunkCERQVVPJjs.AiInspectElement; exports.callAiFn = _chunkCERQVVPJjs.callAiFn; exports.callToGetJSONObject = _chunkCERQVVPJjs.callToGetJSONObject; exports.describeUserPage = _chunkCERQVVPJjs.describeUserPage; exports.plan = _chunkCERQVVPJjs.plan; exports.systemPromptToLocateElement = _chunkCERQVVPJjs.systemPromptToLocateElement; exports.transformElementPositionToId = _chunkCERQVVPJjs.transformElementPositionToId; exports.vlmPlanning = _chunkCERQVVPJjs.vlmPlanning;
26
+ exports.AiAssert = _chunk2SOSTOJYjs.AiAssert; exports.AiExtractElementInfo = _chunk2SOSTOJYjs.AiExtractElementInfo; exports.AiInspectElement = _chunk2SOSTOJYjs.AiInspectElement; exports.callAiFn = _chunk2SOSTOJYjs.callAiFn; exports.callToGetJSONObject = _chunk2SOSTOJYjs.callToGetJSONObject; exports.describeUserPage = _chunk2SOSTOJYjs.describeUserPage; exports.plan = _chunk2SOSTOJYjs.plan; exports.systemPromptToLocateElement = _chunk2SOSTOJYjs.systemPromptToLocateElement; exports.transformElementPositionToId = _chunk2SOSTOJYjs.transformElementPositionToId; exports.vlmPlanning = _chunk2SOSTOJYjs.vlmPlanning;
@@ -1147,125 +1147,6 @@ call_user() # Submit the task and call the user when the task is unsolvable, or
1147
1147
  ## User Instruction
1148
1148
  `;
1149
1149
  var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();
1150
- function parseActionFromVlm(text, factor = 1e3, mode = "bc") {
1151
- let reflection = null;
1152
- let thought = null;
1153
- let actionStr = "";
1154
- text = text.trim();
1155
- if (mode === "bc") {
1156
- if (text.startsWith("Thought:")) {
1157
- const thoughtMatch = text.match(/Thought: (.+?)(?=\s*Action:|$)/s);
1158
- if (thoughtMatch) {
1159
- thought = thoughtMatch[1].trim();
1160
- }
1161
- } else if (text.startsWith("Reflection:")) {
1162
- const reflectionMatch = text.match(
1163
- /Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action:|$)/
1164
- );
1165
- if (reflectionMatch) {
1166
- thought = reflectionMatch[2].trim();
1167
- reflection = reflectionMatch[1].trim();
1168
- }
1169
- } else if (text.startsWith("Action_Summary:")) {
1170
- const summaryMatch = text.match(/Action_Summary: (.+?)(?=\s*Action:|$)/);
1171
- if (summaryMatch) {
1172
- thought = summaryMatch[1].trim();
1173
- }
1174
- }
1175
- if (!text.includes("Action:")) {
1176
- actionStr = text;
1177
- } else {
1178
- const actionParts = text.split("Action:");
1179
- actionStr = actionParts[actionParts.length - 1];
1180
- }
1181
- } else if (mode === "o1") {
1182
- const thoughtMatch = text.match(/<Thought>\s*(.*?)\s*<\/Thought>/);
1183
- const actionSummaryMatch = text.match(
1184
- /\nAction_Summary:\s*(.*?)\s*Action:/
1185
- );
1186
- const actionMatch = text.match(/\nAction:\s*(.*?)\s*<\/Output>/);
1187
- const thoughtContent = thoughtMatch ? thoughtMatch[1] : null;
1188
- const actionSummaryContent = actionSummaryMatch ? actionSummaryMatch[1] : null;
1189
- const actionContent = actionMatch ? actionMatch[1] : null;
1190
- thought = `${thoughtContent}
1191
- <Action_Summary>
1192
- ${actionSummaryContent}`;
1193
- actionStr = actionContent || "";
1194
- }
1195
- const allActions = actionStr.split("\n\n");
1196
- const actions = [];
1197
- for (const rawStr of allActions) {
1198
- const actionInstance = parseAction(rawStr.replace(/\n/g, "\\n").trim());
1199
- if (!actionInstance) {
1200
- console.log(`Action can't parse: ${rawStr}`);
1201
- continue;
1202
- }
1203
- const actionType = actionInstance.function;
1204
- const params = actionInstance.args;
1205
- const actionInputs = {};
1206
- for (const [paramName, param] of Object.entries(params)) {
1207
- if (!param)
1208
- continue;
1209
- const trimmedParam = param.trim();
1210
- actionInputs[paramName.trim()] = trimmedParam;
1211
- if (paramName.includes("start_box") || paramName.includes("end_box")) {
1212
- const oriBox = trimmedParam;
1213
- const numbers = oriBox.replace(/[()]/g, "").split(",");
1214
- const floatNumbers = numbers.map(
1215
- (num) => Number.parseFloat(num) / factor
1216
- );
1217
- if (floatNumbers.length === 2) {
1218
- floatNumbers.push(floatNumbers[0], floatNumbers[1]);
1219
- }
1220
- actionInputs[paramName.trim()] = JSON.stringify(floatNumbers);
1221
- }
1222
- }
1223
- if (actionType === "finished") {
1224
- actions.push({
1225
- reflection,
1226
- thought,
1227
- action_type: "finished",
1228
- action_inputs: {}
1229
- });
1230
- } else {
1231
- actions.push({
1232
- reflection,
1233
- thought,
1234
- action_type: actionType,
1235
- action_inputs: actionInputs
1236
- });
1237
- }
1238
- }
1239
- return actions;
1240
- }
1241
- function parseAction(actionStr) {
1242
- try {
1243
- const functionPattern = /^(\w+)\((.*)\)$/;
1244
- const match = actionStr.trim().match(functionPattern);
1245
- if (!match) {
1246
- throw new Error("Not a function call");
1247
- }
1248
- const [_, functionName, argsStr] = match;
1249
- const kwargs = {};
1250
- if (argsStr.trim()) {
1251
- const argPairs = argsStr.match(/([^,']|'[^']*')+/g) || [];
1252
- for (const pair of argPairs) {
1253
- const [key, ...valueParts] = pair.split("=");
1254
- if (!key)
1255
- continue;
1256
- const value = valueParts.join("=").trim().replace(/^['"]|['"]$/g, "");
1257
- kwargs[key.trim()] = value;
1258
- }
1259
- }
1260
- return {
1261
- function: functionName,
1262
- args: kwargs
1263
- };
1264
- } catch (e) {
1265
- console.error(`Failed to parse action '${actionStr}': ${e}`);
1266
- return null;
1267
- }
1268
- }
1269
1150
 
1270
1151
  // src/ai-model/prompt/ui-tars-locator.ts
1271
1152
  function systemPromptToLocateElementPosition() {
@@ -2638,9 +2519,9 @@ async function plan(userPrompt, opts) {
2638
2519
  }
2639
2520
 
2640
2521
  // src/ai-model/ui-tars-planning.ts
2641
- function capitalize(str) {
2642
- return str.charAt(0).toUpperCase() + str.slice(1);
2643
- }
2522
+
2523
+ var _keyboardlayout = require('@midscene/shared/keyboard-layout');
2524
+ var _actionparser = require('@ui-tars/action-parser');
2644
2525
  async function vlmPlanning(options) {
2645
2526
  const { conversationHistory, userInstruction, size } = options;
2646
2527
  const systemPrompt = uiTarsPlanningPrompt + userInstruction;
@@ -2654,10 +2535,14 @@ async function vlmPlanning(options) {
2654
2535
  ],
2655
2536
  1 /* INSPECT_ELEMENT */
2656
2537
  );
2657
- const actions = parseActionFromVlm(res.content);
2538
+ const { parsed } = _actionparser.actionParser.call(void 0, {
2539
+ prediction: res.content,
2540
+ factor: 1e3
2541
+ });
2658
2542
  const transformActions = [];
2659
- actions.forEach((action) => {
2543
+ parsed.forEach((action) => {
2660
2544
  if (action.action_type === "click") {
2545
+ _assert2.default.call(void 0, action.action_inputs.start_box, "start_box is required");
2661
2546
  const point = getPoint(action.action_inputs.start_box, size);
2662
2547
  transformActions.push({
2663
2548
  type: "Locate",
@@ -2678,6 +2563,20 @@ async function vlmPlanning(options) {
2678
2563
  },
2679
2564
  param: action.thought || ""
2680
2565
  });
2566
+ } else if (action.action_type === "drag") {
2567
+ _assert2.default.call(void 0, action.action_inputs.start_box, "start_box is required");
2568
+ _assert2.default.call(void 0, action.action_inputs.end_box, "end_box is required");
2569
+ const startPoint = getPoint(action.action_inputs.start_box, size);
2570
+ const endPoint = getPoint(action.action_inputs.end_box, size);
2571
+ transformActions.push({
2572
+ type: "Drag",
2573
+ param: {
2574
+ start_box: { x: startPoint[0], y: startPoint[1] },
2575
+ end_box: { x: endPoint[0], y: endPoint[1] }
2576
+ },
2577
+ locate: null,
2578
+ thought: action.thought || ""
2579
+ });
2681
2580
  } else if (action.action_type === "type") {
2682
2581
  transformActions.push({
2683
2582
  type: "Input",
@@ -2704,31 +2603,38 @@ async function vlmPlanning(options) {
2704
2603
  thought: action.thought || ""
2705
2604
  });
2706
2605
  } else if (action.action_type === "hotkey") {
2707
- const keys = action.action_inputs.key.split(",");
2708
- for (const key of keys) {
2709
- transformActions.push({
2710
- type: "KeyboardPress",
2711
- param: {
2712
- value: capitalize(key)
2713
- },
2714
- locate: null,
2715
- thought: action.thought || ""
2716
- });
2717
- }
2606
+ _assert2.default.call(void 0, action.action_inputs.key, "key is required");
2607
+ const keys = _keyboardlayout.transformHotkeyInput.call(void 0, action.action_inputs.key);
2608
+ transformActions.push({
2609
+ type: "KeyboardPress",
2610
+ param: {
2611
+ value: keys
2612
+ },
2613
+ locate: null,
2614
+ thought: action.thought || ""
2615
+ });
2718
2616
  } else if (action.action_type === "wait") {
2719
2617
  transformActions.push({
2720
2618
  type: "Sleep",
2721
2619
  param: {
2722
- timeMs: action.action_inputs.time
2620
+ timeMs: 1e3
2723
2621
  },
2724
2622
  locate: null,
2725
2623
  thought: action.thought || ""
2726
2624
  });
2727
2625
  }
2728
2626
  });
2627
+ if (transformActions.length === 0) {
2628
+ throw new Error("No actions found", {
2629
+ cause: {
2630
+ prediction: res.content,
2631
+ parsed
2632
+ }
2633
+ });
2634
+ }
2729
2635
  return {
2730
2636
  actions: transformActions,
2731
- realActions: actions,
2637
+ realActions: parsed,
2732
2638
  action_summary: getSummary(res.content)
2733
2639
  };
2734
2640
  }
@@ -188,7 +188,7 @@ function stringifyDumpData(data, indents) {
188
188
  return JSON.stringify(data, replacerForPageObject, indents);
189
189
  }
190
190
  function getVersion() {
191
- return "0.10.1";
191
+ return "0.10.2";
192
192
  }
193
193
  function debugLog(...message) {
194
194
  const debugMode = _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_DEBUG_MODE);
package/dist/lib/index.js CHANGED
@@ -6,7 +6,7 @@
6
6
 
7
7
 
8
8
 
9
- var _chunkVPW777ADjs = require('./chunk-VPW777AD.js');
9
+ var _chunkMTBFUT2Hjs = require('./chunk-MTBFUT2H.js');
10
10
 
11
11
 
12
12
 
@@ -17,7 +17,7 @@ var _chunkVPW777ADjs = require('./chunk-VPW777AD.js');
17
17
 
18
18
 
19
19
 
20
- var _chunkCERQVVPJjs = require('./chunk-CERQVVPJ.js');
20
+ var _chunk2SOSTOJYjs = require('./chunk-2SOSTOJY.js');
21
21
 
22
22
 
23
23
 
@@ -169,7 +169,7 @@ ${(_b = this.latestErrorTask()) == null ? void 0 : _b.errorStack}`
169
169
  }
170
170
  dump() {
171
171
  const dumpData = {
172
- sdkVersion: _chunkVPW777ADjs.getVersion.call(void 0, ),
172
+ sdkVersion: _chunkMTBFUT2Hjs.getVersion.call(void 0, ),
173
173
  model_name: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_MODEL_NAME) || "",
174
174
  logTime: Date.now(),
175
175
  name: this.name,
@@ -192,14 +192,14 @@ var logFileName = "";
192
192
  var logContent = [];
193
193
  var logIdIndexMap = {};
194
194
  var { pid } = process;
195
- var logFileExt = _chunkVPW777ADjs.insightDumpFileExt;
195
+ var logFileExt = _chunkMTBFUT2Hjs.insightDumpFileExt;
196
196
  var ifInBrowser = typeof window !== "undefined";
197
197
  function writeInsightDump(data, logId, dumpSubscriber) {
198
- const logDir = _chunkVPW777ADjs.getLogDir.call(void 0, );
198
+ const logDir = _chunkMTBFUT2Hjs.getLogDir.call(void 0, );
199
199
  _assert2.default.call(void 0, logDir, "logDir should be set before writing dump file");
200
200
  const id = logId || _utils.uuid.call(void 0, );
201
201
  const baseData = {
202
- sdkVersion: _chunkVPW777ADjs.getVersion.call(void 0, ),
202
+ sdkVersion: _chunkMTBFUT2Hjs.getVersion.call(void 0, ),
203
203
  logTime: Date.now(),
204
204
  model_name: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_MODEL_NAME) || "",
205
205
  model_description: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_USE_VLM_UI_TARS) ? "vlm-ui-tars enabled" : ""
@@ -210,7 +210,7 @@ function writeInsightDump(data, logId, dumpSubscriber) {
210
210
  ...data
211
211
  };
212
212
  dumpSubscriber == null ? void 0 : dumpSubscriber(finalData);
213
- const dataString = _chunkVPW777ADjs.stringifyDumpData.call(void 0, finalData, 2);
213
+ const dataString = _chunkMTBFUT2Hjs.stringifyDumpData.call(void 0, finalData, 2);
214
214
  if (typeof logIdIndexMap[id] === "number") {
215
215
  logContent[logIdIndexMap[id]] = dataString;
216
216
  } else {
@@ -224,7 +224,7 @@ function writeInsightDump(data, logId, dumpSubscriber) {
224
224
  logFileName = `${pid}_${baseData.logTime}-${Math.random()}`;
225
225
  }
226
226
  }
227
- _chunkVPW777ADjs.writeLogFile.call(void 0, {
227
+ _chunkMTBFUT2Hjs.writeLogFile.call(void 0, {
228
228
  fileName: logFileName,
229
229
  fileExt: logFileExt,
230
230
  fileContent: `[
@@ -239,7 +239,7 @@ ${logContent.join(",\n")}
239
239
  // src/insight/index.ts
240
240
  var Insight = class {
241
241
  constructor(context, opt) {
242
- this.aiVendorFn = _chunkCERQVVPJjs.callAiFn;
242
+ this.aiVendorFn = _chunk2SOSTOJYjs.callAiFn;
243
243
  _assert2.default.call(void 0, context, "context is required for Insight");
244
244
  if (typeof context === "function") {
245
245
  this.contextRetrieverFn = context;
@@ -265,7 +265,7 @@ var Insight = class {
265
265
  this.onceDumpUpdatedFn = void 0;
266
266
  const context = await this.contextRetrieverFn("locate");
267
267
  const startTime = Date.now();
268
- const { parseResult, elementById, rawResponse, usage } = await _chunkCERQVVPJjs.AiInspectElement.call(void 0, {
268
+ const { parseResult, elementById, rawResponse, usage } = await _chunk2SOSTOJYjs.AiInspectElement.call(void 0, {
269
269
  callAI: callAI || this.aiVendorFn,
270
270
  context,
271
271
  multi: Boolean(multi),
@@ -348,7 +348,7 @@ ${parseResult.errors.join("\n")}`;
348
348
  this.onceDumpUpdatedFn = void 0;
349
349
  const context = await this.contextRetrieverFn("extract");
350
350
  const startTime = Date.now();
351
- const { parseResult, elementById } = await _chunkCERQVVPJjs.AiExtractElementInfo.call(void 0, {
351
+ const { parseResult, elementById } = await _chunk2SOSTOJYjs.AiExtractElementInfo.call(void 0, {
352
352
  context,
353
353
  dataQuery: dataDemand
354
354
  });
@@ -402,7 +402,7 @@ ${parseResult.errors.join("\n")}`;
402
402
  this.onceDumpUpdatedFn = void 0;
403
403
  const context = await this.contextRetrieverFn("assert");
404
404
  const startTime = Date.now();
405
- const assertResult = await _chunkCERQVVPJjs.AiAssert.call(void 0, {
405
+ const assertResult = await _chunk2SOSTOJYjs.AiAssert.call(void 0, {
406
406
  assertion,
407
407
  context
408
408
  });
@@ -450,4 +450,4 @@ var src_default = Insight;
450
450
 
451
451
 
452
452
 
453
- exports.AIResponseFormat = _chunkCERQVVPJjs.AIResponseFormat; exports.BaseElement = _chunkCERQVVPJjs.BaseElement; exports.Executor = Executor; exports.Insight = Insight; exports.UIContext = _chunkCERQVVPJjs.UIContext; exports.default = src_default; exports.getLogDirByType = _chunkVPW777ADjs.getLogDirByType; exports.getVersion = _chunkVPW777ADjs.getVersion; exports.plan = _chunkCERQVVPJjs.plan; exports.setLogDir = _chunkVPW777ADjs.setLogDir; exports.transformElementPositionToId = _chunkCERQVVPJjs.transformElementPositionToId;
453
+ exports.AIResponseFormat = _chunk2SOSTOJYjs.AIResponseFormat; exports.BaseElement = _chunk2SOSTOJYjs.BaseElement; exports.Executor = Executor; exports.Insight = Insight; exports.UIContext = _chunk2SOSTOJYjs.UIContext; exports.default = src_default; exports.getLogDirByType = _chunkMTBFUT2Hjs.getLogDirByType; exports.getVersion = _chunkMTBFUT2Hjs.getVersion; exports.plan = _chunk2SOSTOJYjs.plan; exports.setLogDir = _chunkMTBFUT2Hjs.setLogDir; exports.transformElementPositionToId = _chunk2SOSTOJYjs.transformElementPositionToId;
@@ -1,8 +1,9 @@
1
- import { g as AIUsageInfo, B as BaseElement, U as UIContext, y as PlanningAction } from './types-64c4d87b.js';
1
+ import { g as AIUsageInfo, B as BaseElement, U as UIContext, y as PlanningAction } from './types-7fe32cfe.js';
2
2
  import { ChatCompletionMessageParam } from 'openai/resources';
3
3
  export { ChatCompletionMessageParam } from 'openai/resources';
4
- import { A as AIActionType } from './llm-planning-ca109221.js';
5
- export { d as AiAssert, b as AiExtractElementInfo, a as AiInspectElement, c as callAiFn, p as plan, t as transformElementPositionToId } from './llm-planning-ca109221.js';
4
+ import { A as AIActionType } from './llm-planning-373f78e9.js';
5
+ export { d as AiAssert, b as AiExtractElementInfo, a as AiInspectElement, c as callAiFn, p as plan, t as transformElementPositionToId } from './llm-planning-373f78e9.js';
6
+ import { actionParser } from '@ui-tars/action-parser';
6
7
  import '@midscene/shared/constants';
7
8
 
8
9
  declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType): Promise<{
@@ -35,7 +36,6 @@ declare function describeUserPage<ElementType extends BaseElement = BaseElement>
35
36
  };
36
37
  }>;
37
38
 
38
- type ActionType = 'click' | 'type' | 'hotkey' | 'finished' | 'scroll' | 'wait';
39
39
  declare function vlmPlanning(options: {
40
40
  userInstruction: string;
41
41
  conversationHistory: ChatCompletionMessageParam[];
@@ -45,49 +45,8 @@ declare function vlmPlanning(options: {
45
45
  };
46
46
  }): Promise<{
47
47
  actions: PlanningAction<any>[];
48
- realActions: Array<Action>;
48
+ realActions: ReturnType<typeof actionParser>['parsed'];
49
49
  action_summary: string;
50
50
  }>;
51
- interface BaseAction {
52
- action_type: ActionType;
53
- action_inputs: Record<string, any>;
54
- reflection: string | null;
55
- thought: string | null;
56
- }
57
- interface ClickAction extends BaseAction {
58
- action_type: 'click';
59
- action_inputs: {
60
- start_box: string;
61
- };
62
- }
63
- interface WaitAction extends BaseAction {
64
- action_type: 'wait';
65
- action_inputs: {
66
- time: string;
67
- };
68
- }
69
- interface TypeAction extends BaseAction {
70
- action_type: 'type';
71
- action_inputs: {
72
- content: string;
73
- };
74
- }
75
- interface HotkeyAction extends BaseAction {
76
- action_type: 'hotkey';
77
- action_inputs: {
78
- key: string;
79
- };
80
- }
81
- interface ScrollAction extends BaseAction {
82
- action_type: 'scroll';
83
- action_inputs: {
84
- direction: 'up' | 'down';
85
- };
86
- }
87
- interface FinishedAction extends BaseAction {
88
- action_type: 'finished';
89
- action_inputs: Record<string, never>;
90
- }
91
- type Action = ClickAction | TypeAction | HotkeyAction | ScrollAction | FinishedAction | WaitAction;
92
51
 
93
52
  export { callToGetJSONObject, describeUserPage, systemPromptToLocateElement, vlmPlanning };
@@ -1,7 +1,7 @@
1
- import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, f as InsightAssertionResponse, A as AISingleElementResponse } from './types-64c4d87b.js';
2
- export { n as AIAssertionResponse, k as AIElementIdResponse, l as AIElementResponse, h as AIResponseFormat, m as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, w as AgentAssertOpt, v as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, q as DumpMeta, u as ElementById, o as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, O as OnTaskStartTip, t as PartialInsightDumpFromSDK, z as PlanningAIResponse, y as PlanningAction, N as PlanningActionParamAssert, T as PlanningActionParamError, J as PlanningActionParamHover, K as PlanningActionParamInputOrKeyPress, G as PlanningActionParamPlan, M as PlanningActionParamScroll, Q as PlanningActionParamSleep, H as PlanningActionParamTap, V as PlanningActionParamWaitFor, F as PlanningFurtherPlan, x as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-64c4d87b.js';
3
- import { c as callAiFn } from './llm-planning-ca109221.js';
4
- export { p as plan, t as transformElementPositionToId } from './llm-planning-ca109221.js';
1
+ import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, f as InsightAssertionResponse, A as AISingleElementResponse } from './types-7fe32cfe.js';
2
+ export { n as AIAssertionResponse, k as AIElementIdResponse, l as AIElementResponse, h as AIResponseFormat, m as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, w as AgentAssertOpt, v as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, q as DumpMeta, u as ElementById, o as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, O as OnTaskStartTip, t as PartialInsightDumpFromSDK, z as PlanningAIResponse, y as PlanningAction, N as PlanningActionParamAssert, T as PlanningActionParamError, J as PlanningActionParamHover, K as PlanningActionParamInputOrKeyPress, G as PlanningActionParamPlan, M as PlanningActionParamScroll, Q as PlanningActionParamSleep, H as PlanningActionParamTap, V as PlanningActionParamWaitFor, F as PlanningFurtherPlan, x as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-7fe32cfe.js';
3
+ import { c as callAiFn } from './llm-planning-373f78e9.js';
4
+ export { p as plan, t as transformElementPositionToId } from './llm-planning-373f78e9.js';
5
5
  export { getLogDirByType, getVersion, setLogDir } from './utils.js';
6
6
  import '@midscene/shared/constants';
7
7
  import 'openai/resources';
@@ -32,6 +32,7 @@ interface MidsceneYamlScriptEnv {
32
32
  };
33
33
  cookie?: string;
34
34
  output?: string;
35
+ trackingActiveTab?: boolean; // if track the newly opened tab, true for default in yaml script
35
36
 
36
37
  // bridge mode config
37
38
  bridgeMode?: false | 'newTabWithUrl' | 'currentTab';
@@ -1,4 +1,4 @@
1
- import { g as AIUsageInfo, l as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, j as AISingleElementResponseByPosition, u as ElementById, m as AISectionParseResponse, n as AIAssertionResponse, z as PlanningAIResponse } from './types-64c4d87b.js';
1
+ import { g as AIUsageInfo, l as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, j as AISingleElementResponseByPosition, u as ElementById, m as AISectionParseResponse, n as AIAssertionResponse, z as PlanningAIResponse } from './types-7fe32cfe.js';
2
2
  import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam } from 'openai/resources';
3
3
 
4
4
  type AIArgs = [
@@ -161,7 +161,7 @@ interface PlanningLocateParam {
161
161
  }
162
162
  interface PlanningAction<ParamType = any> {
163
163
  thought?: string;
164
- type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'FalsyConditionStatement' | 'Assert' | 'AssertWithoutThrow' | 'Sleep' | 'Finished';
164
+ type: 'Locate' | 'Tap' | 'Drag' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'FalsyConditionStatement' | 'Assert' | 'AssertWithoutThrow' | 'Sleep' | 'Finished';
165
165
  param: ParamType;
166
166
  locate: PlanningLocateParam | null;
167
167
  }
@@ -1,4 +1,4 @@
1
- import { r as ReportDumpWithAttributes, R as Rect } from './types-64c4d87b.js';
1
+ import { r as ReportDumpWithAttributes, R as Rect } from './types-7fe32cfe.js';
2
2
  import '@midscene/shared/constants';
3
3
  import 'openai/resources';
4
4
 
package/dist/lib/utils.js CHANGED
@@ -16,7 +16,7 @@
16
16
 
17
17
 
18
18
 
19
- var _chunkVPW777ADjs = require('./chunk-VPW777AD.js');
19
+ var _chunkMTBFUT2Hjs = require('./chunk-MTBFUT2H.js');
20
20
  require('./chunk-JP3JBDZS.js');
21
21
  require('./chunk-YSQDPG26.js');
22
22
 
@@ -37,4 +37,4 @@ require('./chunk-YSQDPG26.js');
37
37
 
38
38
 
39
39
 
40
- exports.getLogDir = _chunkVPW777ADjs.getLogDir; exports.getLogDirByType = _chunkVPW777ADjs.getLogDirByType; exports.getTmpDir = _chunkVPW777ADjs.getTmpDir; exports.getTmpFile = _chunkVPW777ADjs.getTmpFile; exports.getVersion = _chunkVPW777ADjs.getVersion; exports.groupedActionDumpFileExt = _chunkVPW777ADjs.groupedActionDumpFileExt; exports.insightDumpFileExt = _chunkVPW777ADjs.insightDumpFileExt; exports.overlapped = _chunkVPW777ADjs.overlapped; exports.replaceStringWithFirstAppearance = _chunkVPW777ADjs.replaceStringWithFirstAppearance; exports.replacerForPageObject = _chunkVPW777ADjs.replacerForPageObject; exports.reportHTMLContent = _chunkVPW777ADjs.reportHTMLContent; exports.setLogDir = _chunkVPW777ADjs.setLogDir; exports.sleep = _chunkVPW777ADjs.sleep; exports.stringifyDumpData = _chunkVPW777ADjs.stringifyDumpData; exports.uploadTestInfoToServer = _chunkVPW777ADjs.uploadTestInfoToServer; exports.writeDumpReport = _chunkVPW777ADjs.writeDumpReport; exports.writeLogFile = _chunkVPW777ADjs.writeLogFile;
40
+ exports.getLogDir = _chunkMTBFUT2Hjs.getLogDir; exports.getLogDirByType = _chunkMTBFUT2Hjs.getLogDirByType; exports.getTmpDir = _chunkMTBFUT2Hjs.getTmpDir; exports.getTmpFile = _chunkMTBFUT2Hjs.getTmpFile; exports.getVersion = _chunkMTBFUT2Hjs.getVersion; exports.groupedActionDumpFileExt = _chunkMTBFUT2Hjs.groupedActionDumpFileExt; exports.insightDumpFileExt = _chunkMTBFUT2Hjs.insightDumpFileExt; exports.overlapped = _chunkMTBFUT2Hjs.overlapped; exports.replaceStringWithFirstAppearance = _chunkMTBFUT2Hjs.replaceStringWithFirstAppearance; exports.replacerForPageObject = _chunkMTBFUT2Hjs.replacerForPageObject; exports.reportHTMLContent = _chunkMTBFUT2Hjs.reportHTMLContent; exports.setLogDir = _chunkMTBFUT2Hjs.setLogDir; exports.sleep = _chunkMTBFUT2Hjs.sleep; exports.stringifyDumpData = _chunkMTBFUT2Hjs.stringifyDumpData; exports.uploadTestInfoToServer = _chunkMTBFUT2Hjs.uploadTestInfoToServer; exports.writeDumpReport = _chunkMTBFUT2Hjs.writeDumpReport; exports.writeLogFile = _chunkMTBFUT2Hjs.writeLogFile;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@midscene/core",
3
3
  "description": "Automate browser actions, extract data, and perform assertions using AI. It offers JavaScript SDK, Chrome extension, and support for scripting in YAML. See https://midscenejs.com/ for details.",
4
- "version": "0.10.1",
4
+ "version": "0.10.2",
5
5
  "repository": "https://github.com/web-infra-dev/midscene",
6
6
  "homepage": "https://midscenejs.com/",
7
7
  "jsnext:source": "./src/index.ts",
@@ -37,11 +37,12 @@
37
37
  },
38
38
  "dependencies": {
39
39
  "@azure/identity": "4.5.0",
40
+ "@ui-tars/action-parser": "1.0.1",
40
41
  "@anthropic-ai/sdk": "0.33.1",
41
42
  "@langchain/core": "0.3.26",
42
43
  "socks-proxy-agent": "8.0.4",
43
44
  "openai": "4.57.1",
44
- "@midscene/shared": "0.10.1"
45
+ "@midscene/shared": "0.10.2"
45
46
  },
46
47
  "devDependencies": {
47
48
  "@modern-js/module-tools": "2.60.6",
@@ -67,12 +68,14 @@
67
68
  "build:watch": "modern build -w",
68
69
  "new": "modern new",
69
70
  "upgrade": "modern upgrade",
70
- "test": "vitest --run -u",
71
+ "test": "vitest --run",
71
72
  "test:ai": "AITEST=true npm run test",
72
73
  "computer": "TEST_COMPUTER=true npm run test:ai -- tests/ai/evaluate/computer.test.ts",
74
+ "test:parse-action": "npm run test:ai -- tests/ai/parse-action.test.ts",
73
75
  "evaluate": "npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
74
76
  "evaluate:assertion": "npm run test:ai -- tests/ai/evaluate/assertion.test.ts",
75
- "prompt": "npm run test:ai -- tests/ai/parse-action.test.ts",
76
- "evaluate:update": "UPDATE_AI_DATA=true npm run test:ai -- tests/ai/evaluate/inspect.test.ts"
77
+ "evaluate:plan": "npm run test:ai -- tests/ai/evaluate/plan/planning.test.ts",
78
+ "evaluate:update": "UPDATE_AI_DATA=true npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
79
+ "prompt": "npm run test:ai -- tests/ai/parse-action.test.ts"
77
80
  }
78
81
  }