@midscene/core 0.10.1 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/ai-model.js +2 -2
- package/dist/lib/{chunk-CERQVVPJ.js → chunk-2SOSTOJY.js} +43 -137
- package/dist/lib/{chunk-VPW777AD.js → chunk-MTBFUT2H.js} +1 -1
- package/dist/lib/index.js +13 -13
- package/dist/lib/types/ai-model.d.ts +5 -46
- package/dist/lib/types/index.d.ts +5 -4
- package/dist/lib/types/{llm-planning-ca109221.d.ts → llm-planning-373f78e9.d.ts} +1 -1
- package/dist/lib/types/{types-64c4d87b.d.ts → types-7fe32cfe.d.ts} +1 -1
- package/dist/lib/types/utils.d.ts +1 -1
- package/dist/lib/utils.js +2 -2
- package/package.json +8 -5
- package/report/index.html +1 -1
package/dist/lib/ai-model.js
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
|
|
12
|
-
var _chunkCERQVVPJjs = require('./chunk-CERQVVPJ.js');
|
|
12
|
+
var _chunk2SOSTOJYjs = require('./chunk-2SOSTOJY.js');
|
|
13
13
|
require('./chunk-JP3JBDZS.js');
|
|
14
14
|
require('./chunk-YSQDPG26.js');
|
|
15
15
|
|
|
@@ -23,4 +23,4 @@ require('./chunk-YSQDPG26.js');
|
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
|
|
26
|
-
exports.AiAssert =
|
|
26
|
+
exports.AiAssert = _chunk2SOSTOJYjs.AiAssert; exports.AiExtractElementInfo = _chunk2SOSTOJYjs.AiExtractElementInfo; exports.AiInspectElement = _chunk2SOSTOJYjs.AiInspectElement; exports.callAiFn = _chunk2SOSTOJYjs.callAiFn; exports.callToGetJSONObject = _chunk2SOSTOJYjs.callToGetJSONObject; exports.describeUserPage = _chunk2SOSTOJYjs.describeUserPage; exports.plan = _chunk2SOSTOJYjs.plan; exports.systemPromptToLocateElement = _chunk2SOSTOJYjs.systemPromptToLocateElement; exports.transformElementPositionToId = _chunk2SOSTOJYjs.transformElementPositionToId; exports.vlmPlanning = _chunk2SOSTOJYjs.vlmPlanning;
|
|
@@ -1147,125 +1147,6 @@ call_user() # Submit the task and call the user when the task is unsolvable, or
|
|
|
1147
1147
|
## User Instruction
|
|
1148
1148
|
`;
|
|
1149
1149
|
var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();
|
|
1150
|
-
function parseActionFromVlm(text, factor = 1e3, mode = "bc") {
|
|
1151
|
-
let reflection = null;
|
|
1152
|
-
let thought = null;
|
|
1153
|
-
let actionStr = "";
|
|
1154
|
-
text = text.trim();
|
|
1155
|
-
if (mode === "bc") {
|
|
1156
|
-
if (text.startsWith("Thought:")) {
|
|
1157
|
-
const thoughtMatch = text.match(/Thought: (.+?)(?=\s*Action:|$)/s);
|
|
1158
|
-
if (thoughtMatch) {
|
|
1159
|
-
thought = thoughtMatch[1].trim();
|
|
1160
|
-
}
|
|
1161
|
-
} else if (text.startsWith("Reflection:")) {
|
|
1162
|
-
const reflectionMatch = text.match(
|
|
1163
|
-
/Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action:|$)/
|
|
1164
|
-
);
|
|
1165
|
-
if (reflectionMatch) {
|
|
1166
|
-
thought = reflectionMatch[2].trim();
|
|
1167
|
-
reflection = reflectionMatch[1].trim();
|
|
1168
|
-
}
|
|
1169
|
-
} else if (text.startsWith("Action_Summary:")) {
|
|
1170
|
-
const summaryMatch = text.match(/Action_Summary: (.+?)(?=\s*Action:|$)/);
|
|
1171
|
-
if (summaryMatch) {
|
|
1172
|
-
thought = summaryMatch[1].trim();
|
|
1173
|
-
}
|
|
1174
|
-
}
|
|
1175
|
-
if (!text.includes("Action:")) {
|
|
1176
|
-
actionStr = text;
|
|
1177
|
-
} else {
|
|
1178
|
-
const actionParts = text.split("Action:");
|
|
1179
|
-
actionStr = actionParts[actionParts.length - 1];
|
|
1180
|
-
}
|
|
1181
|
-
} else if (mode === "o1") {
|
|
1182
|
-
const thoughtMatch = text.match(/<Thought>\s*(.*?)\s*<\/Thought>/);
|
|
1183
|
-
const actionSummaryMatch = text.match(
|
|
1184
|
-
/\nAction_Summary:\s*(.*?)\s*Action:/
|
|
1185
|
-
);
|
|
1186
|
-
const actionMatch = text.match(/\nAction:\s*(.*?)\s*<\/Output>/);
|
|
1187
|
-
const thoughtContent = thoughtMatch ? thoughtMatch[1] : null;
|
|
1188
|
-
const actionSummaryContent = actionSummaryMatch ? actionSummaryMatch[1] : null;
|
|
1189
|
-
const actionContent = actionMatch ? actionMatch[1] : null;
|
|
1190
|
-
thought = `${thoughtContent}
|
|
1191
|
-
<Action_Summary>
|
|
1192
|
-
${actionSummaryContent}`;
|
|
1193
|
-
actionStr = actionContent || "";
|
|
1194
|
-
}
|
|
1195
|
-
const allActions = actionStr.split("\n\n");
|
|
1196
|
-
const actions = [];
|
|
1197
|
-
for (const rawStr of allActions) {
|
|
1198
|
-
const actionInstance = parseAction(rawStr.replace(/\n/g, "\\n").trim());
|
|
1199
|
-
if (!actionInstance) {
|
|
1200
|
-
console.log(`Action can't parse: ${rawStr}`);
|
|
1201
|
-
continue;
|
|
1202
|
-
}
|
|
1203
|
-
const actionType = actionInstance.function;
|
|
1204
|
-
const params = actionInstance.args;
|
|
1205
|
-
const actionInputs = {};
|
|
1206
|
-
for (const [paramName, param] of Object.entries(params)) {
|
|
1207
|
-
if (!param)
|
|
1208
|
-
continue;
|
|
1209
|
-
const trimmedParam = param.trim();
|
|
1210
|
-
actionInputs[paramName.trim()] = trimmedParam;
|
|
1211
|
-
if (paramName.includes("start_box") || paramName.includes("end_box")) {
|
|
1212
|
-
const oriBox = trimmedParam;
|
|
1213
|
-
const numbers = oriBox.replace(/[()]/g, "").split(",");
|
|
1214
|
-
const floatNumbers = numbers.map(
|
|
1215
|
-
(num) => Number.parseFloat(num) / factor
|
|
1216
|
-
);
|
|
1217
|
-
if (floatNumbers.length === 2) {
|
|
1218
|
-
floatNumbers.push(floatNumbers[0], floatNumbers[1]);
|
|
1219
|
-
}
|
|
1220
|
-
actionInputs[paramName.trim()] = JSON.stringify(floatNumbers);
|
|
1221
|
-
}
|
|
1222
|
-
}
|
|
1223
|
-
if (actionType === "finished") {
|
|
1224
|
-
actions.push({
|
|
1225
|
-
reflection,
|
|
1226
|
-
thought,
|
|
1227
|
-
action_type: "finished",
|
|
1228
|
-
action_inputs: {}
|
|
1229
|
-
});
|
|
1230
|
-
} else {
|
|
1231
|
-
actions.push({
|
|
1232
|
-
reflection,
|
|
1233
|
-
thought,
|
|
1234
|
-
action_type: actionType,
|
|
1235
|
-
action_inputs: actionInputs
|
|
1236
|
-
});
|
|
1237
|
-
}
|
|
1238
|
-
}
|
|
1239
|
-
return actions;
|
|
1240
|
-
}
|
|
1241
|
-
function parseAction(actionStr) {
|
|
1242
|
-
try {
|
|
1243
|
-
const functionPattern = /^(\w+)\((.*)\)$/;
|
|
1244
|
-
const match = actionStr.trim().match(functionPattern);
|
|
1245
|
-
if (!match) {
|
|
1246
|
-
throw new Error("Not a function call");
|
|
1247
|
-
}
|
|
1248
|
-
const [_, functionName, argsStr] = match;
|
|
1249
|
-
const kwargs = {};
|
|
1250
|
-
if (argsStr.trim()) {
|
|
1251
|
-
const argPairs = argsStr.match(/([^,']|'[^']*')+/g) || [];
|
|
1252
|
-
for (const pair of argPairs) {
|
|
1253
|
-
const [key, ...valueParts] = pair.split("=");
|
|
1254
|
-
if (!key)
|
|
1255
|
-
continue;
|
|
1256
|
-
const value = valueParts.join("=").trim().replace(/^['"]|['"]$/g, "");
|
|
1257
|
-
kwargs[key.trim()] = value;
|
|
1258
|
-
}
|
|
1259
|
-
}
|
|
1260
|
-
return {
|
|
1261
|
-
function: functionName,
|
|
1262
|
-
args: kwargs
|
|
1263
|
-
};
|
|
1264
|
-
} catch (e) {
|
|
1265
|
-
console.error(`Failed to parse action '${actionStr}': ${e}`);
|
|
1266
|
-
return null;
|
|
1267
|
-
}
|
|
1268
|
-
}
|
|
1269
1150
|
|
|
1270
1151
|
// src/ai-model/prompt/ui-tars-locator.ts
|
|
1271
1152
|
function systemPromptToLocateElementPosition() {
|
|
@@ -2638,9 +2519,9 @@ async function plan(userPrompt, opts) {
|
|
|
2638
2519
|
}
|
|
2639
2520
|
|
|
2640
2521
|
// src/ai-model/ui-tars-planning.ts
|
|
2641
|
-
|
|
2642
|
-
|
|
2643
|
-
|
|
2522
|
+
|
|
2523
|
+
var _keyboardlayout = require('@midscene/shared/keyboard-layout');
|
|
2524
|
+
var _actionparser = require('@ui-tars/action-parser');
|
|
2644
2525
|
async function vlmPlanning(options) {
|
|
2645
2526
|
const { conversationHistory, userInstruction, size } = options;
|
|
2646
2527
|
const systemPrompt = uiTarsPlanningPrompt + userInstruction;
|
|
@@ -2654,10 +2535,14 @@ async function vlmPlanning(options) {
|
|
|
2654
2535
|
],
|
|
2655
2536
|
1 /* INSPECT_ELEMENT */
|
|
2656
2537
|
);
|
|
2657
|
-
const
|
|
2538
|
+
const { parsed } = _actionparser.actionParser.call(void 0, {
|
|
2539
|
+
prediction: res.content,
|
|
2540
|
+
factor: 1e3
|
|
2541
|
+
});
|
|
2658
2542
|
const transformActions = [];
|
|
2659
|
-
|
|
2543
|
+
parsed.forEach((action) => {
|
|
2660
2544
|
if (action.action_type === "click") {
|
|
2545
|
+
_assert2.default.call(void 0, action.action_inputs.start_box, "start_box is required");
|
|
2661
2546
|
const point = getPoint(action.action_inputs.start_box, size);
|
|
2662
2547
|
transformActions.push({
|
|
2663
2548
|
type: "Locate",
|
|
@@ -2678,6 +2563,20 @@ async function vlmPlanning(options) {
|
|
|
2678
2563
|
},
|
|
2679
2564
|
param: action.thought || ""
|
|
2680
2565
|
});
|
|
2566
|
+
} else if (action.action_type === "drag") {
|
|
2567
|
+
_assert2.default.call(void 0, action.action_inputs.start_box, "start_box is required");
|
|
2568
|
+
_assert2.default.call(void 0, action.action_inputs.end_box, "end_box is required");
|
|
2569
|
+
const startPoint = getPoint(action.action_inputs.start_box, size);
|
|
2570
|
+
const endPoint = getPoint(action.action_inputs.end_box, size);
|
|
2571
|
+
transformActions.push({
|
|
2572
|
+
type: "Drag",
|
|
2573
|
+
param: {
|
|
2574
|
+
start_box: { x: startPoint[0], y: startPoint[1] },
|
|
2575
|
+
end_box: { x: endPoint[0], y: endPoint[1] }
|
|
2576
|
+
},
|
|
2577
|
+
locate: null,
|
|
2578
|
+
thought: action.thought || ""
|
|
2579
|
+
});
|
|
2681
2580
|
} else if (action.action_type === "type") {
|
|
2682
2581
|
transformActions.push({
|
|
2683
2582
|
type: "Input",
|
|
@@ -2704,31 +2603,38 @@ async function vlmPlanning(options) {
|
|
|
2704
2603
|
thought: action.thought || ""
|
|
2705
2604
|
});
|
|
2706
2605
|
} else if (action.action_type === "hotkey") {
|
|
2707
|
-
|
|
2708
|
-
|
|
2709
|
-
|
|
2710
|
-
|
|
2711
|
-
|
|
2712
|
-
|
|
2713
|
-
|
|
2714
|
-
|
|
2715
|
-
|
|
2716
|
-
|
|
2717
|
-
}
|
|
2606
|
+
_assert2.default.call(void 0, action.action_inputs.key, "key is required");
|
|
2607
|
+
const keys = _keyboardlayout.transformHotkeyInput.call(void 0, action.action_inputs.key);
|
|
2608
|
+
transformActions.push({
|
|
2609
|
+
type: "KeyboardPress",
|
|
2610
|
+
param: {
|
|
2611
|
+
value: keys
|
|
2612
|
+
},
|
|
2613
|
+
locate: null,
|
|
2614
|
+
thought: action.thought || ""
|
|
2615
|
+
});
|
|
2718
2616
|
} else if (action.action_type === "wait") {
|
|
2719
2617
|
transformActions.push({
|
|
2720
2618
|
type: "Sleep",
|
|
2721
2619
|
param: {
|
|
2722
|
-
timeMs:
|
|
2620
|
+
timeMs: 1e3
|
|
2723
2621
|
},
|
|
2724
2622
|
locate: null,
|
|
2725
2623
|
thought: action.thought || ""
|
|
2726
2624
|
});
|
|
2727
2625
|
}
|
|
2728
2626
|
});
|
|
2627
|
+
if (transformActions.length === 0) {
|
|
2628
|
+
throw new Error("No actions found", {
|
|
2629
|
+
cause: {
|
|
2630
|
+
prediction: res.content,
|
|
2631
|
+
parsed
|
|
2632
|
+
}
|
|
2633
|
+
});
|
|
2634
|
+
}
|
|
2729
2635
|
return {
|
|
2730
2636
|
actions: transformActions,
|
|
2731
|
-
realActions:
|
|
2637
|
+
realActions: parsed,
|
|
2732
2638
|
action_summary: getSummary(res.content)
|
|
2733
2639
|
};
|
|
2734
2640
|
}
|
|
@@ -188,7 +188,7 @@ function stringifyDumpData(data, indents) {
|
|
|
188
188
|
return JSON.stringify(data, replacerForPageObject, indents);
|
|
189
189
|
}
|
|
190
190
|
function getVersion() {
|
|
191
|
-
return "0.10.1";
|
|
191
|
+
return "0.10.2";
|
|
192
192
|
}
|
|
193
193
|
function debugLog(...message) {
|
|
194
194
|
const debugMode = _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_DEBUG_MODE);
|
package/dist/lib/index.js
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
var _chunkVPW777ADjs = require('./chunk-VPW777AD.js');
|
|
9
|
+
var _chunkMTBFUT2Hjs = require('./chunk-MTBFUT2H.js');
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
|
|
@@ -17,7 +17,7 @@ var _chunkVPW777ADjs = require('./chunk-VPW777AD.js');
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
var _chunkCERQVVPJjs = require('./chunk-CERQVVPJ.js');
|
|
20
|
+
var _chunk2SOSTOJYjs = require('./chunk-2SOSTOJY.js');
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
|
|
@@ -169,7 +169,7 @@ ${(_b = this.latestErrorTask()) == null ? void 0 : _b.errorStack}`
|
|
|
169
169
|
}
|
|
170
170
|
dump() {
|
|
171
171
|
const dumpData = {
|
|
172
|
-
sdkVersion:
|
|
172
|
+
sdkVersion: _chunkMTBFUT2Hjs.getVersion.call(void 0, ),
|
|
173
173
|
model_name: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_MODEL_NAME) || "",
|
|
174
174
|
logTime: Date.now(),
|
|
175
175
|
name: this.name,
|
|
@@ -192,14 +192,14 @@ var logFileName = "";
|
|
|
192
192
|
var logContent = [];
|
|
193
193
|
var logIdIndexMap = {};
|
|
194
194
|
var { pid } = process;
|
|
195
|
-
var logFileExt =
|
|
195
|
+
var logFileExt = _chunkMTBFUT2Hjs.insightDumpFileExt;
|
|
196
196
|
var ifInBrowser = typeof window !== "undefined";
|
|
197
197
|
function writeInsightDump(data, logId, dumpSubscriber) {
|
|
198
|
-
const logDir =
|
|
198
|
+
const logDir = _chunkMTBFUT2Hjs.getLogDir.call(void 0, );
|
|
199
199
|
_assert2.default.call(void 0, logDir, "logDir should be set before writing dump file");
|
|
200
200
|
const id = logId || _utils.uuid.call(void 0, );
|
|
201
201
|
const baseData = {
|
|
202
|
-
sdkVersion:
|
|
202
|
+
sdkVersion: _chunkMTBFUT2Hjs.getVersion.call(void 0, ),
|
|
203
203
|
logTime: Date.now(),
|
|
204
204
|
model_name: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_MODEL_NAME) || "",
|
|
205
205
|
model_description: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_USE_VLM_UI_TARS) ? "vlm-ui-tars enabled" : ""
|
|
@@ -210,7 +210,7 @@ function writeInsightDump(data, logId, dumpSubscriber) {
|
|
|
210
210
|
...data
|
|
211
211
|
};
|
|
212
212
|
dumpSubscriber == null ? void 0 : dumpSubscriber(finalData);
|
|
213
|
-
const dataString =
|
|
213
|
+
const dataString = _chunkMTBFUT2Hjs.stringifyDumpData.call(void 0, finalData, 2);
|
|
214
214
|
if (typeof logIdIndexMap[id] === "number") {
|
|
215
215
|
logContent[logIdIndexMap[id]] = dataString;
|
|
216
216
|
} else {
|
|
@@ -224,7 +224,7 @@ function writeInsightDump(data, logId, dumpSubscriber) {
|
|
|
224
224
|
logFileName = `${pid}_${baseData.logTime}-${Math.random()}`;
|
|
225
225
|
}
|
|
226
226
|
}
|
|
227
|
-
|
|
227
|
+
_chunkMTBFUT2Hjs.writeLogFile.call(void 0, {
|
|
228
228
|
fileName: logFileName,
|
|
229
229
|
fileExt: logFileExt,
|
|
230
230
|
fileContent: `[
|
|
@@ -239,7 +239,7 @@ ${logContent.join(",\n")}
|
|
|
239
239
|
// src/insight/index.ts
|
|
240
240
|
var Insight = class {
|
|
241
241
|
constructor(context, opt) {
|
|
242
|
-
this.aiVendorFn =
|
|
242
|
+
this.aiVendorFn = _chunk2SOSTOJYjs.callAiFn;
|
|
243
243
|
_assert2.default.call(void 0, context, "context is required for Insight");
|
|
244
244
|
if (typeof context === "function") {
|
|
245
245
|
this.contextRetrieverFn = context;
|
|
@@ -265,7 +265,7 @@ var Insight = class {
|
|
|
265
265
|
this.onceDumpUpdatedFn = void 0;
|
|
266
266
|
const context = await this.contextRetrieverFn("locate");
|
|
267
267
|
const startTime = Date.now();
|
|
268
|
-
const { parseResult, elementById, rawResponse, usage } = await
|
|
268
|
+
const { parseResult, elementById, rawResponse, usage } = await _chunk2SOSTOJYjs.AiInspectElement.call(void 0, {
|
|
269
269
|
callAI: callAI || this.aiVendorFn,
|
|
270
270
|
context,
|
|
271
271
|
multi: Boolean(multi),
|
|
@@ -348,7 +348,7 @@ ${parseResult.errors.join("\n")}`;
|
|
|
348
348
|
this.onceDumpUpdatedFn = void 0;
|
|
349
349
|
const context = await this.contextRetrieverFn("extract");
|
|
350
350
|
const startTime = Date.now();
|
|
351
|
-
const { parseResult, elementById } = await
|
|
351
|
+
const { parseResult, elementById } = await _chunk2SOSTOJYjs.AiExtractElementInfo.call(void 0, {
|
|
352
352
|
context,
|
|
353
353
|
dataQuery: dataDemand
|
|
354
354
|
});
|
|
@@ -402,7 +402,7 @@ ${parseResult.errors.join("\n")}`;
|
|
|
402
402
|
this.onceDumpUpdatedFn = void 0;
|
|
403
403
|
const context = await this.contextRetrieverFn("assert");
|
|
404
404
|
const startTime = Date.now();
|
|
405
|
-
const assertResult = await
|
|
405
|
+
const assertResult = await _chunk2SOSTOJYjs.AiAssert.call(void 0, {
|
|
406
406
|
assertion,
|
|
407
407
|
context
|
|
408
408
|
});
|
|
@@ -450,4 +450,4 @@ var src_default = Insight;
|
|
|
450
450
|
|
|
451
451
|
|
|
452
452
|
|
|
453
|
-
exports.AIResponseFormat =
|
|
453
|
+
exports.AIResponseFormat = _chunk2SOSTOJYjs.AIResponseFormat; exports.BaseElement = _chunk2SOSTOJYjs.BaseElement; exports.Executor = Executor; exports.Insight = Insight; exports.UIContext = _chunk2SOSTOJYjs.UIContext; exports.default = src_default; exports.getLogDirByType = _chunkMTBFUT2Hjs.getLogDirByType; exports.getVersion = _chunkMTBFUT2Hjs.getVersion; exports.plan = _chunk2SOSTOJYjs.plan; exports.setLogDir = _chunkMTBFUT2Hjs.setLogDir; exports.transformElementPositionToId = _chunk2SOSTOJYjs.transformElementPositionToId;
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import { g as AIUsageInfo, B as BaseElement, U as UIContext, y as PlanningAction } from './types-64c4d87b.js';
|
|
1
|
+
import { g as AIUsageInfo, B as BaseElement, U as UIContext, y as PlanningAction } from './types-7fe32cfe.js';
|
|
2
2
|
import { ChatCompletionMessageParam } from 'openai/resources';
|
|
3
3
|
export { ChatCompletionMessageParam } from 'openai/resources';
|
|
4
|
-
import { A as AIActionType } from './llm-planning-ca109221.js';
|
|
5
|
-
export { d as AiAssert, b as AiExtractElementInfo, a as AiInspectElement, c as callAiFn, p as plan, t as transformElementPositionToId } from './llm-planning-ca109221.js';
|
|
4
|
+
import { A as AIActionType } from './llm-planning-373f78e9.js';
|
|
5
|
+
export { d as AiAssert, b as AiExtractElementInfo, a as AiInspectElement, c as callAiFn, p as plan, t as transformElementPositionToId } from './llm-planning-373f78e9.js';
|
|
6
|
+
import { actionParser } from '@ui-tars/action-parser';
|
|
6
7
|
import '@midscene/shared/constants';
|
|
7
8
|
|
|
8
9
|
declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType): Promise<{
|
|
@@ -35,7 +36,6 @@ declare function describeUserPage<ElementType extends BaseElement = BaseElement>
|
|
|
35
36
|
};
|
|
36
37
|
}>;
|
|
37
38
|
|
|
38
|
-
type ActionType = 'click' | 'type' | 'hotkey' | 'finished' | 'scroll' | 'wait';
|
|
39
39
|
declare function vlmPlanning(options: {
|
|
40
40
|
userInstruction: string;
|
|
41
41
|
conversationHistory: ChatCompletionMessageParam[];
|
|
@@ -45,49 +45,8 @@ declare function vlmPlanning(options: {
|
|
|
45
45
|
};
|
|
46
46
|
}): Promise<{
|
|
47
47
|
actions: PlanningAction<any>[];
|
|
48
|
-
realActions:
|
|
48
|
+
realActions: ReturnType<typeof actionParser>['parsed'];
|
|
49
49
|
action_summary: string;
|
|
50
50
|
}>;
|
|
51
|
-
interface BaseAction {
|
|
52
|
-
action_type: ActionType;
|
|
53
|
-
action_inputs: Record<string, any>;
|
|
54
|
-
reflection: string | null;
|
|
55
|
-
thought: string | null;
|
|
56
|
-
}
|
|
57
|
-
interface ClickAction extends BaseAction {
|
|
58
|
-
action_type: 'click';
|
|
59
|
-
action_inputs: {
|
|
60
|
-
start_box: string;
|
|
61
|
-
};
|
|
62
|
-
}
|
|
63
|
-
interface WaitAction extends BaseAction {
|
|
64
|
-
action_type: 'wait';
|
|
65
|
-
action_inputs: {
|
|
66
|
-
time: string;
|
|
67
|
-
};
|
|
68
|
-
}
|
|
69
|
-
interface TypeAction extends BaseAction {
|
|
70
|
-
action_type: 'type';
|
|
71
|
-
action_inputs: {
|
|
72
|
-
content: string;
|
|
73
|
-
};
|
|
74
|
-
}
|
|
75
|
-
interface HotkeyAction extends BaseAction {
|
|
76
|
-
action_type: 'hotkey';
|
|
77
|
-
action_inputs: {
|
|
78
|
-
key: string;
|
|
79
|
-
};
|
|
80
|
-
}
|
|
81
|
-
interface ScrollAction extends BaseAction {
|
|
82
|
-
action_type: 'scroll';
|
|
83
|
-
action_inputs: {
|
|
84
|
-
direction: 'up' | 'down';
|
|
85
|
-
};
|
|
86
|
-
}
|
|
87
|
-
interface FinishedAction extends BaseAction {
|
|
88
|
-
action_type: 'finished';
|
|
89
|
-
action_inputs: Record<string, never>;
|
|
90
|
-
}
|
|
91
|
-
type Action = ClickAction | TypeAction | HotkeyAction | ScrollAction | FinishedAction | WaitAction;
|
|
92
51
|
|
|
93
52
|
export { callToGetJSONObject, describeUserPage, systemPromptToLocateElement, vlmPlanning };
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, f as InsightAssertionResponse, A as AISingleElementResponse } from './types-64c4d87b.js';
|
|
2
|
-
export { n as AIAssertionResponse, k as AIElementIdResponse, l as AIElementResponse, h as AIResponseFormat, m as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, w as AgentAssertOpt, v as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, q as DumpMeta, u as ElementById, o as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, O as OnTaskStartTip, t as PartialInsightDumpFromSDK, z as PlanningAIResponse, y as PlanningAction, N as PlanningActionParamAssert, T as PlanningActionParamError, J as PlanningActionParamHover, K as PlanningActionParamInputOrKeyPress, G as PlanningActionParamPlan, M as PlanningActionParamScroll, Q as PlanningActionParamSleep, H as PlanningActionParamTap, V as PlanningActionParamWaitFor, F as PlanningFurtherPlan, x as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-64c4d87b.js';
|
|
3
|
-
import { c as callAiFn } from './llm-planning-ca109221.js';
|
|
4
|
-
export { p as plan, t as transformElementPositionToId } from './llm-planning-ca109221.js';
|
|
1
|
+
import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, f as InsightAssertionResponse, A as AISingleElementResponse } from './types-7fe32cfe.js';
|
|
2
|
+
export { n as AIAssertionResponse, k as AIElementIdResponse, l as AIElementResponse, h as AIResponseFormat, m as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, w as AgentAssertOpt, v as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, q as DumpMeta, u as ElementById, o as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, O as OnTaskStartTip, t as PartialInsightDumpFromSDK, z as PlanningAIResponse, y as PlanningAction, N as PlanningActionParamAssert, T as PlanningActionParamError, J as PlanningActionParamHover, K as PlanningActionParamInputOrKeyPress, G as PlanningActionParamPlan, M as PlanningActionParamScroll, Q as PlanningActionParamSleep, H as PlanningActionParamTap, V as PlanningActionParamWaitFor, F as PlanningFurtherPlan, x as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-7fe32cfe.js';
|
|
3
|
+
import { c as callAiFn } from './llm-planning-373f78e9.js';
|
|
4
|
+
export { p as plan, t as transformElementPositionToId } from './llm-planning-373f78e9.js';
|
|
5
5
|
export { getLogDirByType, getVersion, setLogDir } from './utils.js';
|
|
6
6
|
import '@midscene/shared/constants';
|
|
7
7
|
import 'openai/resources';
|
|
@@ -32,6 +32,7 @@ interface MidsceneYamlScriptEnv {
|
|
|
32
32
|
};
|
|
33
33
|
cookie?: string;
|
|
34
34
|
output?: string;
|
|
35
|
+
trackingActiveTab?: boolean; // if track the newly opened tab, true for default in yaml script
|
|
35
36
|
|
|
36
37
|
// bridge mode config
|
|
37
38
|
bridgeMode?: false | 'newTabWithUrl' | 'currentTab';
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { g as AIUsageInfo, l as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, j as AISingleElementResponseByPosition, u as ElementById, m as AISectionParseResponse, n as AIAssertionResponse, z as PlanningAIResponse } from './types-64c4d87b.js';
|
|
1
|
+
import { g as AIUsageInfo, l as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, j as AISingleElementResponseByPosition, u as ElementById, m as AISectionParseResponse, n as AIAssertionResponse, z as PlanningAIResponse } from './types-7fe32cfe.js';
|
|
2
2
|
import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam } from 'openai/resources';
|
|
3
3
|
|
|
4
4
|
type AIArgs = [
|
|
@@ -161,7 +161,7 @@ interface PlanningLocateParam {
|
|
|
161
161
|
}
|
|
162
162
|
interface PlanningAction<ParamType = any> {
|
|
163
163
|
thought?: string;
|
|
164
|
-
type: 'Locate' | 'Tap' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'FalsyConditionStatement' | 'Assert' | 'AssertWithoutThrow' | 'Sleep' | 'Finished';
|
|
164
|
+
type: 'Locate' | 'Tap' | 'Drag' | 'Hover' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'FalsyConditionStatement' | 'Assert' | 'AssertWithoutThrow' | 'Sleep' | 'Finished';
|
|
165
165
|
param: ParamType;
|
|
166
166
|
locate: PlanningLocateParam | null;
|
|
167
167
|
}
|
package/dist/lib/utils.js
CHANGED
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
var _chunkVPW777ADjs = require('./chunk-VPW777AD.js');
|
|
19
|
+
var _chunkMTBFUT2Hjs = require('./chunk-MTBFUT2H.js');
|
|
20
20
|
require('./chunk-JP3JBDZS.js');
|
|
21
21
|
require('./chunk-YSQDPG26.js');
|
|
22
22
|
|
|
@@ -37,4 +37,4 @@ require('./chunk-YSQDPG26.js');
|
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
|
|
40
|
-
exports.getLogDir =
|
|
40
|
+
exports.getLogDir = _chunkMTBFUT2Hjs.getLogDir; exports.getLogDirByType = _chunkMTBFUT2Hjs.getLogDirByType; exports.getTmpDir = _chunkMTBFUT2Hjs.getTmpDir; exports.getTmpFile = _chunkMTBFUT2Hjs.getTmpFile; exports.getVersion = _chunkMTBFUT2Hjs.getVersion; exports.groupedActionDumpFileExt = _chunkMTBFUT2Hjs.groupedActionDumpFileExt; exports.insightDumpFileExt = _chunkMTBFUT2Hjs.insightDumpFileExt; exports.overlapped = _chunkMTBFUT2Hjs.overlapped; exports.replaceStringWithFirstAppearance = _chunkMTBFUT2Hjs.replaceStringWithFirstAppearance; exports.replacerForPageObject = _chunkMTBFUT2Hjs.replacerForPageObject; exports.reportHTMLContent = _chunkMTBFUT2Hjs.reportHTMLContent; exports.setLogDir = _chunkMTBFUT2Hjs.setLogDir; exports.sleep = _chunkMTBFUT2Hjs.sleep; exports.stringifyDumpData = _chunkMTBFUT2Hjs.stringifyDumpData; exports.uploadTestInfoToServer = _chunkMTBFUT2Hjs.uploadTestInfoToServer; exports.writeDumpReport = _chunkMTBFUT2Hjs.writeDumpReport; exports.writeLogFile = _chunkMTBFUT2Hjs.writeLogFile;
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@midscene/core",
|
|
3
3
|
"description": "Automate browser actions, extract data, and perform assertions using AI. It offers JavaScript SDK, Chrome extension, and support for scripting in YAML. See https://midscenejs.com/ for details.",
|
|
4
|
-
"version": "0.10.
|
|
4
|
+
"version": "0.10.2",
|
|
5
5
|
"repository": "https://github.com/web-infra-dev/midscene",
|
|
6
6
|
"homepage": "https://midscenejs.com/",
|
|
7
7
|
"jsnext:source": "./src/index.ts",
|
|
@@ -37,11 +37,12 @@
|
|
|
37
37
|
},
|
|
38
38
|
"dependencies": {
|
|
39
39
|
"@azure/identity": "4.5.0",
|
|
40
|
+
"@ui-tars/action-parser": "1.0.1",
|
|
40
41
|
"@anthropic-ai/sdk": "0.33.1",
|
|
41
42
|
"@langchain/core": "0.3.26",
|
|
42
43
|
"socks-proxy-agent": "8.0.4",
|
|
43
44
|
"openai": "4.57.1",
|
|
44
|
-
"@midscene/shared": "0.10.
|
|
45
|
+
"@midscene/shared": "0.10.2"
|
|
45
46
|
},
|
|
46
47
|
"devDependencies": {
|
|
47
48
|
"@modern-js/module-tools": "2.60.6",
|
|
@@ -67,12 +68,14 @@
|
|
|
67
68
|
"build:watch": "modern build -w",
|
|
68
69
|
"new": "modern new",
|
|
69
70
|
"upgrade": "modern upgrade",
|
|
70
|
-
"test": "vitest --run
|
|
71
|
+
"test": "vitest --run",
|
|
71
72
|
"test:ai": "AITEST=true npm run test",
|
|
72
73
|
"computer": "TEST_COMPUTER=true npm run test:ai -- tests/ai/evaluate/computer.test.ts",
|
|
74
|
+
"test:parse-action": "npm run test:ai -- tests/ai/parse-action.test.ts",
|
|
73
75
|
"evaluate": "npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
|
|
74
76
|
"evaluate:assertion": "npm run test:ai -- tests/ai/evaluate/assertion.test.ts",
|
|
75
|
-
"
|
|
76
|
-
"evaluate:update": "UPDATE_AI_DATA=true npm run test:ai -- tests/ai/evaluate/inspect.test.ts"
|
|
77
|
+
"evaluate:plan": "npm run test:ai -- tests/ai/evaluate/plan/planning.test.ts",
|
|
78
|
+
"evaluate:update": "UPDATE_AI_DATA=true npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
|
|
79
|
+
"prompt": "npm run test:ai -- tests/ai/parse-action.test.ts"
|
|
77
80
|
}
|
|
78
81
|
}
|