@midscene/core 0.24.1 → 0.24.2-beta-20250730123854.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/ai-model.d.ts +3 -3
- package/dist/es/ai-model.js +1 -1
- package/dist/es/{chunk-TV47VKQ7.js → chunk-3LFZDNMQ.js} +80 -12
- package/dist/es/chunk-3LFZDNMQ.js.map +1 -0
- package/dist/es/{chunk-HUEQURIG.js → chunk-M4OYNEBY.js} +3 -3
- package/dist/es/index.d.ts +4 -4
- package/dist/es/index.js +12 -4
- package/dist/es/index.js.map +1 -1
- package/dist/es/{llm-planning-3407bd42.d.ts → llm-planning-ae19dc49.d.ts} +2 -1
- package/dist/es/{types-4d9c9a85.d.ts → types-d187efec.d.ts} +19 -2
- package/dist/es/utils.d.ts +1 -1
- package/dist/es/utils.js +1 -1
- package/dist/lib/ai-model.d.ts +3 -3
- package/dist/lib/ai-model.js +2 -2
- package/dist/lib/{chunk-TV47VKQ7.js → chunk-3LFZDNMQ.js} +80 -12
- package/dist/lib/chunk-3LFZDNMQ.js.map +1 -0
- package/dist/lib/{chunk-HUEQURIG.js → chunk-M4OYNEBY.js} +3 -3
- package/dist/lib/index.d.ts +4 -4
- package/dist/lib/index.js +21 -13
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/{llm-planning-3407bd42.d.ts → llm-planning-ae19dc49.d.ts} +2 -1
- package/dist/{types/types-4d9c9a85.d.ts → lib/types-d187efec.d.ts} +19 -2
- package/dist/lib/utils.d.ts +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/types/ai-model.d.ts +3 -3
- package/dist/types/index.d.ts +4 -4
- package/dist/types/{llm-planning-3407bd42.d.ts → llm-planning-ae19dc49.d.ts} +2 -1
- package/dist/{lib/types-4d9c9a85.d.ts → types/types-d187efec.d.ts} +19 -2
- package/dist/types/utils.d.ts +1 -1
- package/package.json +3 -3
- package/dist/es/chunk-TV47VKQ7.js.map +0 -1
- package/dist/lib/chunk-TV47VKQ7.js.map +0 -1
- /package/dist/es/{chunk-HUEQURIG.js.map → chunk-M4OYNEBY.js.map} +0 -0
- /package/dist/lib/{chunk-HUEQURIG.js.map → chunk-M4OYNEBY.js.map} +0 -0
package/dist/es/ai-model.d.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { aA as StreamingCallback, m as AIUsageInfo, az as StreamingCodeGenerationOptions, aC as StreamingAIResponse, V as PlanningAction, j as MidsceneYamlFlowItem } from './types-d187efec.js';
|
|
2
2
|
import OpenAI from 'openai';
|
|
3
3
|
import { ChatCompletionMessageParam } from 'openai/resources';
|
|
4
4
|
export { ChatCompletionMessageParam } from 'openai/resources';
|
|
5
|
-
import { b as AIActionType } from './llm-planning-3407bd42.js';
|
|
6
|
-
export { a as AiAssert, f as AiExtractElementInfo, A as AiLocateElement, g as AiLocateSection, h as adaptBboxToRect, c as callAiFn, d as describeUserPage, e as elementByPositionWithElementInfo, p as plan } from './llm-planning-3407bd42.js';
|
|
5
|
+
import { b as AIActionType } from './llm-planning-ae19dc49.js';
|
|
6
|
+
export { a as AiAssert, f as AiExtractElementInfo, A as AiLocateElement, g as AiLocateSection, h as adaptBboxToRect, c as callAiFn, d as describeUserPage, e as elementByPositionWithElementInfo, p as plan } from './llm-planning-ae19dc49.js';
|
|
7
7
|
import { vlLocateMode } from '@midscene/shared/env';
|
|
8
8
|
import { actionParser } from '@ui-tars/action-parser';
|
|
9
9
|
import { Size } from '@midscene/shared/types';
|
package/dist/es/ai-model.js
CHANGED
|
@@ -300,7 +300,7 @@ function buildYamlFlowFromPlans(plans, sleep) {
|
|
|
300
300
|
flow.push({
|
|
301
301
|
sleep: param.timeMs
|
|
302
302
|
});
|
|
303
|
-
} else if (type === "AndroidBackButton" || type === "AndroidHomeButton" || type === "AndroidRecentAppsButton") {
|
|
303
|
+
} else if (type === "AndroidBackButton" || type === "AndroidHomeButton" || type === "AndroidRecentAppsButton" || type === "AndroidLongPress" || type === "AndroidPull") {
|
|
304
304
|
} else if (type === "Error" || type === "ExpectedFalsyCondition" || type === "Assert" || type === "AssertWithoutThrow" || type === "Finished") {
|
|
305
305
|
} else {
|
|
306
306
|
console.warn(
|
|
@@ -386,31 +386,47 @@ You are an expert in software testing.
|
|
|
386
386
|
## Objective:
|
|
387
387
|
- Identify elements in screenshots and text that match the user's description.
|
|
388
388
|
- Give the coordinates of the element that matches the user's description best in the screenshot.
|
|
389
|
+
- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
|
|
389
390
|
|
|
390
391
|
## Output Format:
|
|
391
392
|
\`\`\`json
|
|
392
393
|
{
|
|
393
394
|
"bbox": [number, number, number, number], // ${bboxComment}
|
|
394
|
-
"errors"?: string[]
|
|
395
|
+
"errors"?: string[],
|
|
396
|
+
"isOrderSensitive": boolean // Whether the targetElementDescription is order-sensitive (true/false)
|
|
395
397
|
}
|
|
396
398
|
\`\`\`
|
|
397
399
|
|
|
398
400
|
Fields:
|
|
399
401
|
* \`bbox\` is the bounding box of the element that matches the user's description best in the screenshot
|
|
402
|
+
* \`isOrderSensitive\` is a boolean indicating whether the user's description is order-sensitive (true/false)
|
|
400
403
|
* \`errors\` is an optional array of error messages (if any)
|
|
401
404
|
|
|
402
|
-
|
|
405
|
+
Order-sensitive means the description contains phrases like:
|
|
406
|
+
- "the third item in the list"
|
|
407
|
+
- "the last button"
|
|
408
|
+
- "the first input box"
|
|
409
|
+
- "the second row"
|
|
410
|
+
|
|
411
|
+
Not order-sensitive means the description is like:
|
|
412
|
+
- "confirm button"
|
|
413
|
+
- "search box"
|
|
414
|
+
- "password input"
|
|
415
|
+
|
|
416
|
+
For example, when an element is found and the description is order-sensitive:
|
|
403
417
|
\`\`\`json
|
|
404
418
|
{
|
|
405
419
|
"bbox": [100, 100, 200, 200],
|
|
420
|
+
"isOrderSensitive": true,
|
|
406
421
|
"errors": []
|
|
407
422
|
}
|
|
408
423
|
\`\`\`
|
|
409
424
|
|
|
410
|
-
When no element is found:
|
|
425
|
+
When no element is found and the description is not order-sensitive:
|
|
411
426
|
\`\`\`json
|
|
412
427
|
{
|
|
413
428
|
"bbox": [],
|
|
429
|
+
"isOrderSensitive": false,
|
|
414
430
|
"errors": ["I can see ..., but {some element} is not found"]
|
|
415
431
|
}
|
|
416
432
|
\`\`\`
|
|
@@ -423,6 +439,7 @@ You are an expert in software page image (2D) and page element text analysis.
|
|
|
423
439
|
## Objective:
|
|
424
440
|
- Identify elements in screenshots and text that match the user's description.
|
|
425
441
|
- Return JSON data containing the selection reason and element ID.
|
|
442
|
+
- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
|
|
426
443
|
|
|
427
444
|
## Skills:
|
|
428
445
|
- Image analysis and recognition
|
|
@@ -434,6 +451,7 @@ You are an expert in software page image (2D) and page element text analysis.
|
|
|
434
451
|
2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
|
|
435
452
|
3. Found the required number of elements
|
|
436
453
|
4. Return JSON data containing the selection reason and element ID.
|
|
454
|
+
5. Judge whether the user's description is order-sensitive (see below for definition and examples).
|
|
437
455
|
|
|
438
456
|
## Constraints:
|
|
439
457
|
- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
|
|
@@ -443,6 +461,10 @@ You are an expert in software page image (2D) and page element text analysis.
|
|
|
443
461
|
- The returned data must conform to the specified JSON format.
|
|
444
462
|
- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
|
|
445
463
|
|
|
464
|
+
## Order-Sensitive Definition:
|
|
465
|
+
- If the description contains phrases like "the third item in the list", "the last button", "the first input box", "the second row", etc., it is order-sensitive (isOrderSensitive = true).
|
|
466
|
+
- If the description is like "confirm button", "search box", "password input", etc., it is not order-sensitive (isOrderSensitive = false).
|
|
467
|
+
|
|
446
468
|
## Output Format:
|
|
447
469
|
|
|
448
470
|
Please return the result in JSON format as follows:
|
|
@@ -458,6 +480,7 @@ Please return the result in JSON format as follows:
|
|
|
458
480
|
}
|
|
459
481
|
// More elements...
|
|
460
482
|
],
|
|
483
|
+
"isOrderSensitive": true, // or false, depending on the user's description
|
|
461
484
|
"errors": [] // Array of strings containing any error messages
|
|
462
485
|
}
|
|
463
486
|
\`\`\`
|
|
@@ -546,6 +569,7 @@ Output Example:
|
|
|
546
569
|
"id": "1231"
|
|
547
570
|
}
|
|
548
571
|
],
|
|
572
|
+
"isOrderSensitive": true,
|
|
549
573
|
"errors": []
|
|
550
574
|
}
|
|
551
575
|
\`\`\`
|
|
@@ -583,6 +607,10 @@ var locatorSchema = {
|
|
|
583
607
|
},
|
|
584
608
|
description: "List of found elements"
|
|
585
609
|
},
|
|
610
|
+
isOrderSensitive: {
|
|
611
|
+
type: "boolean",
|
|
612
|
+
description: "Whether the targetElementDescription is order-sensitive (true/false)"
|
|
613
|
+
},
|
|
586
614
|
errors: {
|
|
587
615
|
type: "array",
|
|
588
616
|
items: {
|
|
@@ -591,7 +619,7 @@ var locatorSchema = {
|
|
|
591
619
|
description: "List of error messages, if any"
|
|
592
620
|
}
|
|
593
621
|
},
|
|
594
|
-
required: ["elements", "errors"],
|
|
622
|
+
required: ["elements", "isOrderSensitive", "errors"],
|
|
595
623
|
additionalProperties: false
|
|
596
624
|
}
|
|
597
625
|
}
|
|
@@ -766,7 +794,7 @@ Target: User will give you a screenshot, an instruction and some previous logs i
|
|
|
766
794
|
|
|
767
795
|
Restriction:
|
|
768
796
|
- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
|
|
769
|
-
- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll${pageType === "android" ? ", AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton." : "."}
|
|
797
|
+
- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll${pageType === "android" ? ", AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton, AndroidLongPress, AndroidPull." : "."}
|
|
770
798
|
- Don't repeat actions in the previous logs.
|
|
771
799
|
- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.
|
|
772
800
|
|
|
@@ -779,7 +807,9 @@ Supporting actions:
|
|
|
779
807
|
- Scroll: { type: "Scroll", ${vlLocateParam} | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.
|
|
780
808
|
${pageType === "android" ? `- AndroidBackButton: { type: "AndroidBackButton", param: {} }
|
|
781
809
|
- AndroidHomeButton: { type: "AndroidHomeButton", param: {} }
|
|
782
|
-
- AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }
|
|
810
|
+
- AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }
|
|
811
|
+
- AndroidLongPress: { type: "AndroidLongPress", param: { x: number, y: number, duration?: number } }
|
|
812
|
+
- AndroidPull: { type: "AndroidPull", param: { direction: 'up' | 'down', startPoint?: { x: number, y: number }, distance?: number, duration?: number } } // Pull down to refresh (direction: 'down') or pull up to load more (direction: 'up')` : ""}
|
|
783
813
|
|
|
784
814
|
Field description:
|
|
785
815
|
* The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
|
|
@@ -829,7 +859,7 @@ You are a versatile professional in software UI automation. Your outstanding con
|
|
|
829
859
|
## Workflow
|
|
830
860
|
|
|
831
861
|
1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
|
|
832
|
-
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ${pageType === "android" ? "/ AndroidBackButton / AndroidHomeButton / AndroidRecentAppsButton" : ""}). The "About the action" section below will give you more details.
|
|
862
|
+
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ${pageType === "android" ? "/ AndroidBackButton / AndroidHomeButton / AndroidRecentAppsButton / AndroidLongPress / AndroidPull" : ""}). The "About the action" section below will give you more details.
|
|
833
863
|
3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
|
|
834
864
|
4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
|
|
835
865
|
5. Consider whether the user's instruction will be accomplished after all the actions
|
|
@@ -888,7 +918,11 @@ ${pageType === "android" ? `- type: 'AndroidBackButton', trigger the system "bac
|
|
|
888
918
|
- type: 'AndroidHomeButton', trigger the system "home" operation on Android devices
|
|
889
919
|
* {{ param: {{}} }}
|
|
890
920
|
- type: 'AndroidRecentAppsButton', trigger the system "recent apps" operation on Android devices
|
|
891
|
-
* {{ param: {{}} }}
|
|
921
|
+
* {{ param: {{}} }}
|
|
922
|
+
- type: 'AndroidLongPress', trigger a long press on the screen at specified coordinates on Android devices
|
|
923
|
+
* {{ param: {{ x: number, y: number, duration?: number }} }}
|
|
924
|
+
- type: 'AndroidPull', trigger pull down to refresh or pull up actions on Android devices
|
|
925
|
+
* {{ param: {{ direction: 'up' | 'down', startPoint?: {{ x: number, y: number }}, distance?: number, duration?: number }} }}` : ""}
|
|
892
926
|
`;
|
|
893
927
|
var outputTemplate = `
|
|
894
928
|
## Output JSON Format:
|
|
@@ -1002,7 +1036,7 @@ var planSchema = {
|
|
|
1002
1036
|
},
|
|
1003
1037
|
type: {
|
|
1004
1038
|
type: "string",
|
|
1005
|
-
description: 'Type of action, one of "Tap", "RightClick", "Hover" , "Input", "KeyboardPress", "Scroll", "ExpectedFalsyCondition", "Sleep", "AndroidBackButton", "AndroidHomeButton", "AndroidRecentAppsButton"'
|
|
1039
|
+
description: 'Type of action, one of "Tap", "RightClick", "Hover" , "Input", "KeyboardPress", "Scroll", "ExpectedFalsyCondition", "Sleep", "AndroidBackButton", "AndroidHomeButton", "AndroidRecentAppsButton", "AndroidLongPress"'
|
|
1006
1040
|
},
|
|
1007
1041
|
param: {
|
|
1008
1042
|
anyOf: [
|
|
@@ -2230,7 +2264,8 @@ async function AiLocateElement(options) {
|
|
|
2230
2264
|
},
|
|
2231
2265
|
rawResponse,
|
|
2232
2266
|
elementById,
|
|
2233
|
-
usage: res.usage
|
|
2267
|
+
usage: res.usage,
|
|
2268
|
+
isOrderSensitive: typeof res.content === "object" && res.content !== null && "isOrderSensitive" in res.content ? res.content.isOrderSensitive : void 0
|
|
2234
2269
|
};
|
|
2235
2270
|
}
|
|
2236
2271
|
async function AiLocateSection(options) {
|
|
@@ -2681,6 +2716,39 @@ async function vlmPlanning(options) {
|
|
|
2681
2716
|
type: "AndroidRecentAppsButton",
|
|
2682
2717
|
param: {}
|
|
2683
2718
|
});
|
|
2719
|
+
} else if (action.action_type === "androidLongPress") {
|
|
2720
|
+
assert6(
|
|
2721
|
+
action.action_inputs.start_coords,
|
|
2722
|
+
"start_coords is required for androidLongPress"
|
|
2723
|
+
);
|
|
2724
|
+
const point = action.action_inputs.start_coords;
|
|
2725
|
+
transformActions.push({
|
|
2726
|
+
type: "AndroidLongPress",
|
|
2727
|
+
param: {
|
|
2728
|
+
x: point[0],
|
|
2729
|
+
y: point[1],
|
|
2730
|
+
duration: 1e3
|
|
2731
|
+
},
|
|
2732
|
+
locate: null,
|
|
2733
|
+
thought: action.thought || ""
|
|
2734
|
+
});
|
|
2735
|
+
} else if (action.action_type === "androidPull") {
|
|
2736
|
+
const pullDirection = action.action_inputs.direction || "down";
|
|
2737
|
+
const startPoint = action.action_inputs.start_coords ? {
|
|
2738
|
+
x: action.action_inputs.start_coords[0],
|
|
2739
|
+
y: action.action_inputs.start_coords[1]
|
|
2740
|
+
} : void 0;
|
|
2741
|
+
transformActions.push({
|
|
2742
|
+
type: "AndroidPull",
|
|
2743
|
+
param: {
|
|
2744
|
+
direction: pullDirection,
|
|
2745
|
+
startPoint,
|
|
2746
|
+
distance: action.action_inputs.distance,
|
|
2747
|
+
duration: action.action_inputs.duration || 500
|
|
2748
|
+
},
|
|
2749
|
+
locate: null,
|
|
2750
|
+
thought: action.thought || ""
|
|
2751
|
+
});
|
|
2684
2752
|
}
|
|
2685
2753
|
});
|
|
2686
2754
|
if (transformActions.length === 0) {
|
|
@@ -2764,4 +2832,4 @@ export {
|
|
|
2764
2832
|
resizeImageForUiTars
|
|
2765
2833
|
};
|
|
2766
2834
|
|
|
2767
|
-
//# sourceMappingURL=chunk-TV47VKQ7.js.map
|
|
2835
|
+
//# sourceMappingURL=chunk-3LFZDNMQ.js.map
|