@midscene/core 0.24.2-beta-20250731151311.0 → 0.24.2-beta-20250801111909.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/ai-model.d.ts +3 -3
- package/dist/es/ai-model.js +1 -1
- package/dist/es/{chunk-5HT7CBNE.js → chunk-2RCMQS5O.js} +3 -3
- package/dist/es/{chunk-FKQMUAXP.js → chunk-KFA65L55.js} +94 -10
- package/dist/es/chunk-KFA65L55.js.map +1 -0
- package/dist/es/index.d.ts +7 -9
- package/dist/es/index.js +4 -8
- package/dist/es/index.js.map +1 -1
- package/dist/es/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
- package/dist/es/{types-d836fa73.d.ts → types-7b64b80b.d.ts} +33 -13
- package/dist/es/utils.d.ts +1 -1
- package/dist/es/utils.js +1 -1
- package/dist/lib/ai-model.d.ts +3 -3
- package/dist/lib/ai-model.js +2 -2
- package/dist/lib/{chunk-5HT7CBNE.js → chunk-2RCMQS5O.js} +3 -3
- package/dist/lib/{chunk-FKQMUAXP.js → chunk-KFA65L55.js} +100 -16
- package/dist/lib/chunk-KFA65L55.js.map +1 -0
- package/dist/lib/index.d.ts +7 -9
- package/dist/lib/index.js +14 -18
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
- package/dist/{types/types-d836fa73.d.ts → lib/types-7b64b80b.d.ts} +33 -13
- package/dist/lib/utils.d.ts +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/types/ai-model.d.ts +3 -3
- package/dist/types/index.d.ts +7 -9
- package/dist/types/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
- package/dist/{lib/types-d836fa73.d.ts → types/types-7b64b80b.d.ts} +33 -13
- package/dist/types/utils.d.ts +1 -1
- package/package.json +3 -3
- package/dist/es/chunk-FKQMUAXP.js.map +0 -1
- package/dist/lib/chunk-FKQMUAXP.js.map +0 -1
- /package/dist/es/{chunk-5HT7CBNE.js.map → chunk-2RCMQS5O.js.map} +0 -0
- /package/dist/lib/{chunk-5HT7CBNE.js.map → chunk-2RCMQS5O.js.map} +0 -0
|
@@ -228,7 +228,7 @@ function mergeRects(rects) {
|
|
|
228
228
|
};
|
|
229
229
|
}
|
|
230
230
|
function expandSearchArea(rect, screenSize) {
|
|
231
|
-
const minEdgeSize = 300;
|
|
231
|
+
const minEdgeSize = _env.vlLocateMode.call(void 0, ) === "doubao-vision" ? 500 : 300;
|
|
232
232
|
const defaultPadding = 160;
|
|
233
233
|
const paddingSizeHorizontal = rect.width < minEdgeSize ? Math.ceil((minEdgeSize - rect.width) / 2) : defaultPadding;
|
|
234
234
|
const paddingSizeVertical = rect.height < minEdgeSize ? Math.ceil((minEdgeSize - rect.height) / 2) : defaultPadding;
|
|
@@ -2008,6 +2008,10 @@ ${_constants.PLAYWRIGHT_EXAMPLE_CODE}`;
|
|
|
2008
2008
|
|
|
2009
2009
|
|
|
2010
2010
|
|
|
2011
|
+
|
|
2012
|
+
|
|
2013
|
+
|
|
2014
|
+
|
|
2011
2015
|
// src/ai-model/prompt/extraction.ts
|
|
2012
2016
|
|
|
2013
2017
|
function systemPromptToExtract() {
|
|
@@ -2018,6 +2022,8 @@ The user will give you a screenshot, the contents of it (optional), and some dat
|
|
|
2018
2022
|
|
|
2019
2023
|
If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
|
|
2020
2024
|
|
|
2025
|
+
If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
|
|
2026
|
+
|
|
2021
2027
|
Return in the following JSON format:
|
|
2022
2028
|
{
|
|
2023
2029
|
data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
|
|
@@ -2158,6 +2164,55 @@ var sectionLocatorInstruction = new (0, _prompts.PromptTemplate)({
|
|
|
2158
2164
|
// src/ai-model/inspect.ts
|
|
2159
2165
|
var debugInspect = _logger.getDebug.call(void 0, "ai:inspect");
|
|
2160
2166
|
var debugSection = _logger.getDebug.call(void 0, "ai:section");
|
|
2167
|
+
var extraTextFromUserPrompt = (prompt) => {
|
|
2168
|
+
if (typeof prompt === "string") {
|
|
2169
|
+
return prompt;
|
|
2170
|
+
} else {
|
|
2171
|
+
return prompt.prompt;
|
|
2172
|
+
}
|
|
2173
|
+
};
|
|
2174
|
+
var promptsToChatParam = async (multimodalPrompt) => {
|
|
2175
|
+
const msgs = [];
|
|
2176
|
+
if (_optionalChain([multimodalPrompt, 'optionalAccess', _55 => _55.images, 'optionalAccess', _56 => _56.length])) {
|
|
2177
|
+
msgs.push({
|
|
2178
|
+
role: "user",
|
|
2179
|
+
content: [
|
|
2180
|
+
{
|
|
2181
|
+
type: "text",
|
|
2182
|
+
text: "Next, I will provide all the reference images."
|
|
2183
|
+
}
|
|
2184
|
+
]
|
|
2185
|
+
});
|
|
2186
|
+
for (const item of multimodalPrompt.images) {
|
|
2187
|
+
const base64 = await _img.preProcessImageUrl.call(void 0,
|
|
2188
|
+
item.url,
|
|
2189
|
+
!!multimodalPrompt.convertHttpImage2Base64
|
|
2190
|
+
);
|
|
2191
|
+
msgs.push({
|
|
2192
|
+
role: "user",
|
|
2193
|
+
content: [
|
|
2194
|
+
{
|
|
2195
|
+
type: "text",
|
|
2196
|
+
text: `reference image ${item.name}:`
|
|
2197
|
+
}
|
|
2198
|
+
]
|
|
2199
|
+
});
|
|
2200
|
+
msgs.push({
|
|
2201
|
+
role: "user",
|
|
2202
|
+
content: [
|
|
2203
|
+
{
|
|
2204
|
+
type: "image_url",
|
|
2205
|
+
image_url: {
|
|
2206
|
+
url: base64,
|
|
2207
|
+
detail: "high"
|
|
2208
|
+
}
|
|
2209
|
+
}
|
|
2210
|
+
]
|
|
2211
|
+
});
|
|
2212
|
+
}
|
|
2213
|
+
}
|
|
2214
|
+
return msgs;
|
|
2215
|
+
};
|
|
2161
2216
|
async function AiLocateElement(options) {
|
|
2162
2217
|
const { context, targetElementDescription, callAI } = options;
|
|
2163
2218
|
const { screenshotBase64 } = context;
|
|
@@ -2168,7 +2223,7 @@ async function AiLocateElement(options) {
|
|
|
2168
2223
|
);
|
|
2169
2224
|
const userInstructionPrompt = await findElementPrompt.format({
|
|
2170
2225
|
pageDescription: description,
|
|
2171
|
-
targetElementDescription
|
|
2226
|
+
targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
|
|
2172
2227
|
});
|
|
2173
2228
|
const systemPrompt = systemPromptToLocateElement(_env.vlLocateMode.call(void 0, ));
|
|
2174
2229
|
let imagePayload = screenshotBase64;
|
|
@@ -2210,6 +2265,13 @@ async function AiLocateElement(options) {
|
|
|
2210
2265
|
]
|
|
2211
2266
|
}
|
|
2212
2267
|
];
|
|
2268
|
+
if (typeof targetElementDescription !== "string") {
|
|
2269
|
+
const addOns = await promptsToChatParam({
|
|
2270
|
+
images: targetElementDescription.images,
|
|
2271
|
+
convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64
|
|
2272
|
+
});
|
|
2273
|
+
msgs.push(...addOns);
|
|
2274
|
+
}
|
|
2213
2275
|
const callAIFn = callAI || callToGetJSONObject;
|
|
2214
2276
|
const res = await callAIFn(msgs, 1 /* INSPECT_ELEMENT */);
|
|
2215
2277
|
const rawResponse = JSON.stringify(res.content);
|
|
@@ -2220,10 +2282,10 @@ async function AiLocateElement(options) {
|
|
|
2220
2282
|
if ("bbox" in res.content && Array.isArray(res.content.bbox)) {
|
|
2221
2283
|
resRect = adaptBboxToRect(
|
|
2222
2284
|
res.content.bbox,
|
|
2223
|
-
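_optionalChain([options, 'access', _55 => _55.searchConfig, 'optionalAccess', _56 => _56.rect, 'optionalAccess', _57 => _57.width]) || context.size.width,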
|
|
2224
|
-
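_optionalChain([options, 'access', _58 => _58.searchConfig, 'optionalAccess', _59 => _59.rect, 'optionalAccess', _60 => _60.height]) || context.size.height,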
|
|
2225
|
-
_optionalChain([options, 'access', _61 => _61.searchConfig, 'optionalAccess', _62 => _62.rect, 'optionalAccess', _63 => _63.left]),
|
|
2226
|
-
_optionalChain([options, 'access', _64 => _64.searchConfig, 'optionalAccess', _65 => _65.rect, 'optionalAccess', _66 => _66.top])
|
|
2285
|
+
_optionalChain([options, 'access', _57 => _57.searchConfig, 'optionalAccess', _58 => _58.rect, 'optionalAccess', _59 => _59.width]) || context.size.width,
|
|
2286
|
+
_optionalChain([options, 'access', _60 => _60.searchConfig, 'optionalAccess', _61 => _61.rect, 'optionalAccess', _62 => _62.height]) || context.size.height,
|
|
2287
|
+
_optionalChain([options, 'access', _63 => _63.searchConfig, 'optionalAccess', _64 => _64.rect, 'optionalAccess', _65 => _65.left]),
|
|
2288
|
+
_optionalChain([options, 'access', _66 => _66.searchConfig, 'optionalAccess', _67 => _67.rect, 'optionalAccess', _68 => _68.top])
|
|
2227
2289
|
);
|
|
2228
2290
|
debugInspect("resRect", resRect);
|
|
2229
2291
|
const rectCenter = {
|
|
@@ -2242,7 +2304,7 @@ async function AiLocateElement(options) {
|
|
|
2242
2304
|
}
|
|
2243
2305
|
} catch (e) {
|
|
2244
2306
|
const msg = e instanceof Error ? `Failed to parse bbox: ${e.message}` : "unknown error in locate";
|
|
2245
|
-
if (!errors || _optionalChain([errors, 'optionalAccess', _67 => _67.length]) === 0) {
|
|
2307
|
+
if (!errors || _optionalChain([errors, 'optionalAccess', _69 => _69.length]) === 0) {
|
|
2246
2308
|
errors = [msg];
|
|
2247
2309
|
} else {
|
|
2248
2310
|
errors.push(`(${msg})`);
|
|
@@ -2265,7 +2327,7 @@ async function AiLocateSection(options) {
|
|
|
2265
2327
|
const { screenshotBase64 } = context;
|
|
2266
2328
|
const systemPrompt = systemPromptToLocateSection(_env.vlLocateMode.call(void 0, ));
|
|
2267
2329
|
const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
|
|
2268
|
-
sectionDescription
|
|
2330
|
+
sectionDescription: extraTextFromUserPrompt(sectionDescription)
|
|
2269
2331
|
});
|
|
2270
2332
|
const msgs = [
|
|
2271
2333
|
{ role: "system", content: systemPrompt },
|
|
@@ -2286,6 +2348,13 @@ async function AiLocateSection(options) {
|
|
|
2286
2348
|
]
|
|
2287
2349
|
}
|
|
2288
2350
|
];
|
|
2351
|
+
if (typeof sectionDescription !== "string") {
|
|
2352
|
+
const addOns = await promptsToChatParam({
|
|
2353
|
+
images: sectionDescription.images,
|
|
2354
|
+
convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64
|
|
2355
|
+
});
|
|
2356
|
+
msgs.push(...addOns);
|
|
2357
|
+
}
|
|
2289
2358
|
const result = await callAiFn(
|
|
2290
2359
|
msgs,
|
|
2291
2360
|
2 /* EXTRACT_DATA */
|
|
@@ -2327,21 +2396,21 @@ async function AiLocateSection(options) {
|
|
|
2327
2396
|
};
|
|
2328
2397
|
}
|
|
2329
2398
|
async function AiExtractElementInfo(options) {
|
|
2330
|
-
const { dataQuery, context, extractOption } = options;
|
|
2399
|
+
const { dataQuery, context, extractOption, multimodalPrompt } = options;
|
|
2331
2400
|
const systemPrompt = systemPromptToExtract();
|
|
2332
2401
|
const { screenshotBase64 } = context;
|
|
2333
2402
|
const { description, elementById } = await describeUserPage(context, {
|
|
2334
2403
|
truncateTextLength: 200,
|
|
2335
2404
|
filterNonTextContent: false,
|
|
2336
2405
|
visibleOnly: false,
|
|
2337
|
-
domIncluded: _optionalChain([extractOption, 'optionalAccess', _68 => _68.domIncluded])
|
|
2406
|
+
domIncluded: _optionalChain([extractOption, 'optionalAccess', _70 => _70.domIncluded])
|
|
2338
2407
|
});
|
|
2339
2408
|
const extractDataPromptText = await extractDataQueryPrompt(
|
|
2340
2409
|
description,
|
|
2341
2410
|
dataQuery
|
|
2342
2411
|
);
|
|
2343
2412
|
const userContent = [];
|
|
2344
|
-
if (_optionalChain([extractOption, 'optionalAccess', _69 => _69.screenshotIncluded]) !== false) {
|
|
2413
|
+
if (_optionalChain([extractOption, 'optionalAccess', _71 => _71.screenshotIncluded]) !== false) {
|
|
2345
2414
|
userContent.push({
|
|
2346
2415
|
type: "image_url",
|
|
2347
2416
|
image_url: {
|
|
@@ -2361,6 +2430,13 @@ async function AiExtractElementInfo(options) {
|
|
|
2361
2430
|
content: userContent
|
|
2362
2431
|
}
|
|
2363
2432
|
];
|
|
2433
|
+
if (multimodalPrompt) {
|
|
2434
|
+
const addOns = await promptsToChatParam({
|
|
2435
|
+
images: multimodalPrompt.images,
|
|
2436
|
+
convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64
|
|
2437
|
+
});
|
|
2438
|
+
msgs.push(...addOns);
|
|
2439
|
+
}
|
|
2364
2440
|
const result = await callAiFn(
|
|
2365
2441
|
msgs,
|
|
2366
2442
|
2 /* EXTRACT_DATA */
|
|
@@ -2373,11 +2449,12 @@ async function AiExtractElementInfo(options) {
|
|
|
2373
2449
|
}
|
|
2374
2450
|
async function AiAssert(options) {
|
|
2375
2451
|
const { assertion, context } = options;
|
|
2376
|
-
_utils.assert.call(void 0, assertion, "assertion should be a string");
|
|
2452
|
+
_utils.assert.call(void 0, assertion, "assertion should not be empty");
|
|
2377
2453
|
const { screenshotBase64 } = context;
|
|
2378
2454
|
const systemPrompt = systemPromptToAssert({
|
|
2379
2455
|
isUITars: _env.getAIConfigInBoolean.call(void 0, _env.MIDSCENE_USE_VLM_UI_TARS)
|
|
2380
2456
|
});
|
|
2457
|
+
const assertionText = extraTextFromUserPrompt(assertion);
|
|
2381
2458
|
const msgs = [
|
|
2382
2459
|
{ role: "system", content: systemPrompt },
|
|
2383
2460
|
{
|
|
@@ -2395,13 +2472,20 @@ async function AiAssert(options) {
|
|
|
2395
2472
|
text: `
|
|
2396
2473
|
Here is the assertion. Please tell whether it is truthy according to the screenshot.
|
|
2397
2474
|
=====================================
|
|
2398
|
-
${assertion}
|
|
2475
|
+
${assertionText}
|
|
2399
2476
|
=====================================
|
|
2400
2477
|
`
|
|
2401
2478
|
}
|
|
2402
2479
|
]
|
|
2403
2480
|
}
|
|
2404
2481
|
];
|
|
2482
|
+
if (typeof assertion !== "string") {
|
|
2483
|
+
const addOns = await promptsToChatParam({
|
|
2484
|
+
images: assertion.images,
|
|
2485
|
+
convertHttpImage2Base64: assertion.convertHttpImage2Base64
|
|
2486
|
+
});
|
|
2487
|
+
msgs.push(...addOns);
|
|
2488
|
+
}
|
|
2405
2489
|
const { content: assertResult, usage } = await callAiFn(
|
|
2406
2490
|
msgs,
|
|
2407
2491
|
0 /* ASSERT */
|
|
@@ -2469,7 +2553,7 @@ async function plan(userInstruction, opts) {
|
|
|
2469
2553
|
const { content, usage } = await call2(msgs, 3 /* PLAN */);
|
|
2470
2554
|
const rawResponse = JSON.stringify(content, void 0, 2);
|
|
2471
2555
|
const planFromAI = content;
|
|
2472
|
-
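const actions = (_optionalChain([planFromAI, 'access', _70 => _70.action, 'optionalAccess', _71 => _71.type]) ? [planFromAI.action] : planFromAI.actions) || [];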
|
|
2556
|
+
const actions = (_optionalChain([planFromAI, 'access', _72 => _72.action, 'optionalAccess', _73 => _73.type]) ? [planFromAI.action] : planFromAI.actions) || [];
|
|
2473
2557
|
const returnValue = {
|
|
2474
2558
|
...planFromAI,
|
|
2475
2559
|
actions,
|
|
@@ -2496,7 +2580,7 @@ async function plan(userInstruction, opts) {
|
|
|
2496
2580
|
_utils.assert.call(void 0, !planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
|
|
2497
2581
|
} else {
|
|
2498
2582
|
actions.forEach((action) => {
|
|
2499
|
-
if (_optionalChain([action, 'access', _72 => _72.locate, 'optionalAccess', _73 => _73.id])) {
|
|
2583
|
+
if (_optionalChain([action, 'access', _74 => _74.locate, 'optionalAccess', _75 => _75.id])) {
|
|
2500
2584
|
const element = elementById(action.locate.id);
|
|
2501
2585
|
if (element) {
|
|
2502
2586
|
action.locate.id = element.id;
|
|
@@ -2824,4 +2908,4 @@ async function resizeImageForUiTars(imageBase64, size) {
|
|
|
2824
2908
|
|
|
2825
2909
|
exports.systemPromptToLocateElement = systemPromptToLocateElement; exports.elementByPositionWithElementInfo = elementByPositionWithElementInfo; exports.describeUserPage = describeUserPage; exports.call = call; exports.callToGetJSONObject = callToGetJSONObject; exports.AIActionType = AIActionType; exports.callAiFn = callAiFn; exports.adaptBboxToRect = adaptBboxToRect; exports.expandSearchArea = expandSearchArea; exports.generateYamlTest = generateYamlTest; exports.generateYamlTestStream = generateYamlTestStream; exports.generatePlaywrightTest = generatePlaywrightTest; exports.generatePlaywrightTestStream = generatePlaywrightTestStream; exports.AiLocateElement = AiLocateElement; exports.AiLocateSection = AiLocateSection; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning; exports.resizeImageForUiTars = resizeImageForUiTars;
|
|
2826
2910
|
|
|
2827
|
-
//# sourceMappingURL=chunk-FKQMUAXP.js.map
|
|
2911
|
+
//# sourceMappingURL=chunk-KFA65L55.js.map
|