@midscene/core 0.26.2-beta-20250812091127.0 → 0.26.3-beta-20250813021342.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/ai-model.mjs +2502 -0
- package/dist/es/ai-model.mjs.map +1 -0
- package/dist/es/index.mjs +2362 -0
- package/dist/es/index.mjs.map +1 -0
- package/dist/es/tree.mjs +2 -0
- package/dist/es/utils.mjs +188 -0
- package/dist/es/{chunk-WXNDYUNG.js.map → utils.mjs.map} +1 -1
- package/dist/lib/ai-model.js +2581 -3
- package/dist/lib/ai-model.js.map +1 -0
- package/dist/lib/index.js +2375 -493
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/tree.js +42 -11
- package/dist/lib/tree.js.map +1 -1
- package/dist/lib/utils.js +257 -29
- package/dist/lib/utils.js.map +1 -0
- package/dist/types/ai-model.d.ts +505 -99
- package/dist/types/index.d.ts +1299 -53
- package/dist/types/tree.d.ts +11 -1
- package/dist/types/utils.d.ts +47 -33
- package/package.json +28 -12
- package/dist/es/ai-model.d.ts +0 -99
- package/dist/es/ai-model.js +0 -44
- package/dist/es/chunk-DDYIQHOA.js +0 -2883
- package/dist/es/chunk-DDYIQHOA.js.map +0 -1
- package/dist/es/chunk-WXNDYUNG.js +0 -265
- package/dist/es/index.d.ts +0 -53
- package/dist/es/index.js +0 -570
- package/dist/es/index.js.map +0 -1
- package/dist/es/llm-planning-4e0c16fe.d.ts +0 -106
- package/dist/es/tree.d.ts +0 -1
- package/dist/es/tree.js +0 -13
- package/dist/es/tree.js.map +0 -1
- package/dist/es/types-8a6be57c.d.ts +0 -577
- package/dist/es/utils.d.ts +0 -33
- package/dist/es/utils.js +0 -30
- package/dist/lib/ai-model.d.ts +0 -99
- package/dist/lib/chunk-DDYIQHOA.js +0 -2883
- package/dist/lib/chunk-DDYIQHOA.js.map +0 -1
- package/dist/lib/chunk-WXNDYUNG.js +0 -265
- package/dist/lib/chunk-WXNDYUNG.js.map +0 -1
- package/dist/lib/index.d.ts +0 -53
- package/dist/lib/llm-planning-4e0c16fe.d.ts +0 -106
- package/dist/lib/tree.d.ts +0 -1
- package/dist/lib/types-8a6be57c.d.ts +0 -577
- package/dist/lib/utils.d.ts +0 -33
- package/dist/types/llm-planning-4e0c16fe.d.ts +0 -106
- package/dist/types/types-8a6be57c.d.ts +0 -577
package/dist/es/index.d.ts
DELETED
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightTaskInfo, e as InsightOptions, f as DetailedLocateParam, L as LocateResult, g as InsightExtractParam, h as InsightExtractOption, T as TMultimodalPrompt, A as AIUsageInfo, i as TUserPrompt, j as InsightAssertionResponse, k as AIDescribeElementResponse } from './types-8a6be57c.js';
|
|
2
|
-
export { z as AIAssertionResponse, x as AIDataExtractionResponse, v as AIElementCoordinatesResponse, u as AIElementLocatorResponse, w as AIElementResponse, q as AIResponseFormat, y as AISectionLocatorResponse, t as AISingleElementResponse, r as AISingleElementResponseById, s as AISingleElementResponseByPosition, W as AgentAssertOpt, F as AgentDescribeElementAtPointResult, V as AgentWaitForOpt, aa as BaseAgentParserOpt, G as CallAIFn, aE as CodeGenerationChunk, a9 as Color, aG as DeviceAction, K as DumpMeta, Q as ElementById, H as EnsureObject, ad as ExecutionRecorderItem, av as ExecutionTaskAction, au as ExecutionTaskActionApply, ag as ExecutionTaskHitBy, at as ExecutionTaskInsightAssertion, as as ExecutionTaskInsightAssertionApply, ar as ExecutionTaskInsightAssertionParam, ak as ExecutionTaskInsightDumpLog, am as ExecutionTaskInsightLocate, al as ExecutionTaskInsightLocateApply, aj as ExecutionTaskInsightLocateOutput, ai as ExecutionTaskInsightLocateParam, aq as ExecutionTaskInsightQuery, ap as ExecutionTaskInsightQueryApply, ao as ExecutionTaskInsightQueryOutput, an as ExecutionTaskInsightQueryParam, ax as ExecutionTaskLog, aw as ExecutionTaskLogApply, az as ExecutionTaskPlanning, ay as ExecutionTaskPlanningApply, ah as ExecutionTaskReturn, ae as ExecutionTaskType, af as ExecutorContext, b2 as FreeFn, aA as GroupedActionDump, N as InsightDump, O as LiteUISection, p as LocateOption, J as LocateResultElement, C as LocateValidatorResult, B as LocatorValidatorOption, b5 as MidsceneYamlConfig, b6 as MidsceneYamlConfigOutput, o as MidsceneYamlConfigResult, m as MidsceneYamlFlowItem, aN as MidsceneYamlFlowItemAIAction, aS as MidsceneYamlFlowItemAIAsk, aO as MidsceneYamlFlowItemAIAssert, aT as MidsceneYamlFlowItemAIBoolean, aX as MidsceneYamlFlowItemAIHover, aY as MidsceneYamlFlowItemAIInput, aZ as MidsceneYamlFlowItemAIKeyboardPress, aU as MidsceneYamlFlowItemAILocate, aQ as MidsceneYamlFlowItemAINumber, aP as MidsceneYamlFlowItemAIQuery, n as 
MidsceneYamlFlowItemAIRightClick, a_ as MidsceneYamlFlowItemAIScroll, aR as MidsceneYamlFlowItemAIString, aW as MidsceneYamlFlowItemAITap, aV as MidsceneYamlFlowItemAIWaitFor, a$ as MidsceneYamlFlowItemEvaluateJavaScript, b1 as MidsceneYamlFlowItemLogScreenshot, b0 as MidsceneYamlFlowItemSleep, M as MidsceneYamlScript, aL as MidsceneYamlScriptAndroidEnv, aM as MidsceneYamlScriptEnv, aJ as MidsceneYamlScriptEnvBase, aK as MidsceneYamlScriptWebEnv, l as MidsceneYamlTask, S as OnTaskStartTip, aB as PageType, P as PartialInsightDumpFromSDK, Z as PlanningAIResponse, Y as PlanningAction, a7 as PlanningActionParamAndroidLongPress, a8 as PlanningActionParamAndroidPull, a3 as PlanningActionParamAssert, a5 as PlanningActionParamError, $ as PlanningActionParamHover, a1 as PlanningActionParamInputOrKeyPress, a0 as PlanningActionParamRightClick, a2 as PlanningActionParamScroll, a4 as PlanningActionParamSleep, _ as PlanningActionParamTap, a6 as PlanningActionParamWaitFor, X as PlanningLocateParam, ac as PlaywrightParserOpt, ab as PuppeteerParserOpt, aH as ReferenceImage, R as ReportDumpWithAttributes, b4 as ScriptPlayerStatusValue, b3 as ScriptPlayerTaskStatus, aF as StreamingAIResponse, aD as StreamingCallback, aC as StreamingCodeGenerationOptions, aI as scrollParam } from './types-8a6be57c.js';
|
|
3
|
-
import { c as callAiFn } from './llm-planning-4e0c16fe.js';
|
|
4
|
-
export { a as AiAssert, A as AiLocateElement, d as describeUserPage, p as plan } from './llm-planning-4e0c16fe.js';
|
|
5
|
-
import { BaseElement, Rect } from '@midscene/shared/types';
|
|
6
|
-
export { BaseElement, ElementTreeNode, Point, Rect, Size } from '@midscene/shared/types';
|
|
7
|
-
export { getVersion } from './utils.js';
|
|
8
|
-
export { MIDSCENE_MODEL_NAME, getAIConfig } from '@midscene/shared/env';
|
|
9
|
-
import '@midscene/shared/constants';
|
|
10
|
-
import 'openai/resources';
|
|
11
|
-
|
|
12
|
-
/**
 * Sequential task runner: keeps an ordered task list and executes the
 * pending tasks one after another when `flush()` is called.
 */
declare class Executor {
    /** Label supplied at construction time; echoed back by `dump()`. */
    name: string;
    /** All tasks handed to this executor, in the order they will run. */
    tasks: ExecutionTask[];
    /** Lifecycle state of the executor as a whole. */
    status: 'init' | 'pending' | 'running' | 'completed' | 'error';
    /** Optional hook that is awaited right before each task is executed. */
    onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];
    /**
     * @param name    Label for this executor.
     * @param options Optional start hook plus tasks to queue immediately.
     */
    constructor(name: string, options?: ExecutionTaskProgressOptions & {
        tasks?: ExecutionTaskApply[];
    });
    private markTaskAsPending;
    /** Queue one task or a list of tasks. */
    append(task: ExecutionTaskApply[] | ExecutionTaskApply): Promise<void>;
    /**
     * Run the queued tasks in order. Resolves with the `output` / `thought`
     * of the last task reached, or `undefined` when nothing was pending.
     */
    flush(): Promise<{
        output: any;
        thought?: string;
    } | undefined>;
    /** `true` when `status` is `'error'`. */
    isInErrorState(): boolean;
    /** The failed task while in the error state, otherwise `null`. */
    latestErrorTask(): ExecutionTask | null;
    /** Snapshot of this executor (name, tasks, SDK/model metadata). */
    dump(): ExecutionDump;
}
|
|
30
|
-
|
|
31
|
-
/** Per-call overrides accepted by `Insight.locate`. */
interface LocateOpts {
    /** UI context to use for this call instead of asking the context retriever. */
    context?: UIContext<BaseElement>;
    /** AI caller to use for this call instead of the instance-level one. */
    callAI?: typeof callAiFn<AIElementResponse>;
}
|
|
35
|
-
/**
 * AI-backed inspection of a UI context: locating elements, extracting data,
 * checking assertions and describing what sits at a given spot.
 */
declare class Insight<ElementType extends BaseElement = BaseElement, ContextType extends UIContext<ElementType> = UIContext<ElementType>> {
    /** Produces the UI context for a given action (may be async). */
    contextRetrieverFn: (action: InsightAction) => Promise<ContextType> | ContextType;
    /** Function used to call the AI model. */
    aiVendorFn: (...args: Array<any>) => Promise<any>;
    /** One-shot dump listener; cleared by the call that picks it up. */
    onceDumpUpdatedFn?: DumpSubscriber;
    /** Extra task metadata merged into every dump's `taskInfo`. */
    taskInfo?: Omit<InsightTaskInfo, 'durationMs'>;
    /**
     * @param context Either a ready context or a function that retrieves one.
     * @param opt     Optional AI caller and task metadata overrides.
     */
    constructor(context: ContextType | ((action: InsightAction) => Promise<ContextType> | ContextType), opt?: InsightOptions);
    /** Find the single element matching `query`. */
    locate(query: DetailedLocateParam, opt?: LocateOpts): Promise<LocateResult>;
    /** Extract data described by `dataDemand` from the current context. */
    extract<T>(dataDemand: InsightExtractParam, opt?: InsightExtractOption, multimodalPrompt?: TMultimodalPrompt): Promise<{
        data: T;
        thought?: string;
        usage?: AIUsageInfo;
    }>;
    /** Ask the model whether `assertion` holds for the current context. */
    assert(assertion: TUserPrompt): Promise<InsightAssertionResponse>;
    /** Describe the element inside `target` (a rect, or an `[x, y]` point). */
    describe(target: Rect | [number, number], opt?: {
        deepThink?: boolean;
    }): Promise<Pick<AIDescribeElementResponse, 'description'>>;
}
|
|
52
|
-
|
|
53
|
-
export { AIDescribeElementResponse, AIUsageInfo, DetailedLocateParam, DumpSubscriber, ExecutionDump, ExecutionTask, ExecutionTaskApply, ExecutionTaskProgressOptions, Executor, Insight, InsightAction, InsightAssertionResponse, InsightExtractOption, InsightExtractParam, InsightOptions, InsightTaskInfo, LocateResult, TMultimodalPrompt, TUserPrompt, UIContext, Insight as default };
|
package/dist/es/index.js
DELETED
|
@@ -1,570 +0,0 @@
|
|
|
1
|
-
import {
|
|
2
|
-
getVersion
|
|
3
|
-
} from "./chunk-WXNDYUNG.js";
|
|
4
|
-
import {
|
|
5
|
-
AiAssert,
|
|
6
|
-
AiExtractElementInfo,
|
|
7
|
-
AiLocateElement,
|
|
8
|
-
AiLocateSection,
|
|
9
|
-
callAiFn,
|
|
10
|
-
callToGetJSONObject,
|
|
11
|
-
describeUserPage,
|
|
12
|
-
expandSearchArea,
|
|
13
|
-
plan
|
|
14
|
-
} from "./chunk-DDYIQHOA.js";
|
|
15
|
-
|
|
16
|
-
// src/ai-model/action-executor.ts
|
|
17
|
-
import {
|
|
18
|
-
MIDSCENE_MODEL_NAME,
|
|
19
|
-
getAIConfig,
|
|
20
|
-
uiTarsModelVersion,
|
|
21
|
-
vlLocateMode
|
|
22
|
-
} from "@midscene/shared/env";
|
|
23
|
-
import { assert } from "@midscene/shared/utils";
|
|
24
|
-
/**
 * Sequential task runner.
 *
 * Tasks are kept in order and executed one at a time by `flush()`. The first
 * task that throws stops the run: it is marked "failed", every task after it
 * is marked "cancelled", and the executor moves to the "error" state.
 */
var Executor = class {
  /**
   * @param {string} name - Label reported by `dump()`.
   * @param {object} [options]
   * @param {object[]} [options.tasks] - Tasks to queue right away.
   * @param {Function} [options.onTaskStart] - Hook awaited before each task runs.
   */
  constructor(name, options) {
    const initialTasks = options?.tasks || [];
    this.status = initialTasks.length > 0 ? "pending" : "init";
    this.name = name;
    this.tasks = initialTasks.map((item) => this.markTaskAsPending(item));
    this.onTaskStart = options?.onTaskStart;
  }

  /**
   * Return a shallow copy of `task` with a default status of "pending".
   * A `status` already present on `task` takes precedence because the task's
   * own properties are spread last.
   */
  markTaskAsPending(task) {
    return {
      status: "pending",
      ...task
    };
  }

  /**
   * Queue one task or an array of tasks.
   *
   * @param {object|object[]} task
   * @throws when the executor is in the "error" state.
   */
  async append(task) {
    const failedTask = this.latestErrorTask();
    assert(
      this.status !== "error",
      `executor is in error state, cannot append task\nerror=${failedTask?.error}\n${failedTask?.errorStack}`
    );
    const incoming = Array.isArray(task) ? task : [task];
    this.tasks.push(...incoming.map((item) => this.markTaskAsPending(item)));
    // Appending while a flush is in progress must not reset the running state.
    if (this.status !== "running") {
      this.status = "pending";
    }
  }

  /**
   * Execute every pending task in order, starting at the first pending one.
   *
   * @returns {Promise<{thought: *, output: *}|undefined>} `thought`/`output`
   *   of the last task reached (the failed one on error), or `undefined` when
   *   there was nothing pending.
   * @throws when the executor is already running, completed or in error.
   */
  async flush() {
    if (this.status === "init" && this.tasks.length > 0) {
      console.warn(
        "illegal state for executor, status is init but tasks are not empty"
      );
    }
    assert(this.status !== "running", "executor is already running");
    assert(this.status !== "completed", "executor is already completed");
    assert(this.status !== "error", "executor is in error state");

    const nextPendingIndex = this.tasks.findIndex(
      (task) => task.status === "pending"
    );
    if (nextPendingIndex < 0) {
      return;
    }

    const supportedTypes = ["Insight", "Action", "Planning"];
    const supportedInsightSubTypes = [
      "Locate",
      "Query",
      "Assert",
      "Boolean",
      "Number",
      "String"
    ];

    this.status = "running";
    let taskIndex = nextPendingIndex;
    let successfullyCompleted = true;
    // Output of the most recent Insight/Locate task; its `element` is handed
    // to every following task through the executor context.
    let previousFindOutput;

    while (taskIndex < this.tasks.length) {
      const task = this.tasks[taskIndex];
      assert(
        task.status === "pending",
        `task status should be pending, but got: ${task.status}`
      );
      task.timing = {
        start: Date.now()
      };
      try {
        task.status = "running";
        // A failing start hook is logged but never fails the task itself.
        try {
          if (this.onTaskStart) {
            await this.onTaskStart(task);
          }
        } catch (e) {
          console.error("error in onTaskStart", e);
        }

        assert(
          supportedTypes.includes(task.type),
          `unsupported task type: ${task.type}`
        );
        const { executor, param } = task;
        assert(executor, `executor is required for task type: ${task.type}`);

        const isInsight = task.type === "Insight";
        if (isInsight) {
          assert(
            supportedInsightSubTypes.includes(task.subType),
            `unsupported insight subType: ${task.subType}`
          );
        }

        const executorContext = {
          task,
          element: previousFindOutput?.element
        };
        // Called as a method so the task object stays the receiver.
        const returnValue = await task.executor(param, executorContext);
        if (isInsight && task.subType === "Locate") {
          previousFindOutput = returnValue?.output;
        }

        Object.assign(task, returnValue);
        task.status = "finished";
        task.timing.end = Date.now();
        task.timing.cost = task.timing.end - task.timing.start;
        taskIndex++;
      } catch (e) {
        successfullyCompleted = false;
        task.error = e;
        task.errorMessage = e?.message || (typeof e === "string" ? e : "error-without-message");
        // `e` may be null/undefined (`throw undefined`); reading `.stack`
        // unguarded would throw here and leave the executor stuck in "running".
        task.errorStack = e?.stack;
        task.status = "failed";
        task.timing.end = Date.now();
        task.timing.cost = task.timing.end - task.timing.start;
        break;
      }
    }

    // After a failure `taskIndex` points at the failed task; everything behind
    // it is cancelled. After success this loop has nothing to do.
    for (let i = taskIndex + 1; i < this.tasks.length; i++) {
      this.tasks[i].status = "cancelled";
    }
    this.status = successfullyCompleted ? "completed" : "error";

    if (this.tasks.length) {
      const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
      const { thought, output } = this.tasks[outputIndex];
      return {
        thought,
        output
      };
    }
  }

  /** @returns {boolean} `true` when the executor is in the "error" state. */
  isInErrorState() {
    return this.status === "error";
  }

  /**
   * @returns {object|null} The first task with status "failed" while the
   *   executor is in the "error" state; `null` otherwise.
   */
  latestErrorTask() {
    if (this.status !== "error") {
      return null;
    }
    const failed = this.tasks.find((task) => task.status === "failed");
    return failed === undefined ? null : failed;
  }

  /**
   * Build a snapshot of this executor: SDK version, model name/description,
   * timestamp, name and the task list (by reference, not copied).
   */
  dump() {
    let modelDescription = "";
    const locateMode = vlLocateMode();
    if (locateMode) {
      const uiTarsModelVer = uiTarsModelVersion();
      modelDescription = uiTarsModelVer ? `UI-TARS=${uiTarsModelVer}` : `${locateMode} mode`;
    }
    return {
      sdkVersion: getVersion(),
      model_name: getAIConfig(MIDSCENE_MODEL_NAME) || "",
      model_description: modelDescription,
      logTime: Date.now(),
      name: this.name,
      tasks: this.tasks
    };
  }
};
|
|
189
|
-
|
|
190
|
-
// src/ai-model/prompt/describe.ts
|
|
191
|
-
import { getPreferredLanguage } from "@midscene/shared/env";
|
|
192
|
-
/**
 * Build the system prompt that asks the model to describe the element marked
 * by the red rectangle, answering in the user's preferred language.
 *
 * @returns {string} The prompt text.
 */
var elementDescriberInstruction = () => {
  const language = getPreferredLanguage();
  const prompt = `
Describe the element in the red rectangle for precise identification. Use ${language}.

CRITICAL REQUIREMENTS:
1. UNIQUENESS: The description must uniquely identify this element on the current page
2. UNIVERSALITY: Use generic, reusable selectors that work across different contexts
3. PRECISION: Be specific enough to distinguish from similar elements

DESCRIPTION STRUCTURE:
1. Element type (button, input, link, div, etc.)
2. Primary identifier (in order of preference):
- Unique text content: "with text 'Login'"
- Unique attribute: "with aria-label 'Search'"
- Unique class/ID: "with class 'primary-button'"
- Unique position: "in header navigation"
3. Secondary identifiers (if needed for uniqueness):
- Visual features: "blue background", "with icon"
- Relative position: "below search bar", "in sidebar"
- Parent context: "in login form", "in main menu"

GUIDELINES:
- Keep description under 25 words
- Prioritize semantic identifiers over visual ones
- Use consistent terminology across similar elements
- Avoid page-specific or temporary content
- Don't mention the red rectangle or selection box
- Focus on stable, reusable characteristics

EXAMPLES:
- "Login button with text 'Sign In'"
- "Search input with placeholder 'Enter keywords'"
- "Navigation link with text 'Home' in header"
- "Submit button in contact form"
- "Menu icon with aria-label 'Open menu'"

Return JSON:
{
"description": "unique element identifier",
"error"?: "error message if any"
}`;
  return prompt;
};
|
|
234
|
-
|
|
235
|
-
// src/insight/index.ts
|
|
236
|
-
import {
|
|
237
|
-
MIDSCENE_FORCE_DEEP_THINK,
|
|
238
|
-
MIDSCENE_USE_QWEN_VL,
|
|
239
|
-
getAIConfigInBoolean,
|
|
240
|
-
vlLocateMode as vlLocateMode2
|
|
241
|
-
} from "@midscene/shared/env";
|
|
242
|
-
import { compositeElementInfoImg, cropByRect } from "@midscene/shared/img";
|
|
243
|
-
import { getDebug } from "@midscene/shared/logger";
|
|
244
|
-
import { assert as assert2 } from "@midscene/shared/utils";
|
|
245
|
-
|
|
246
|
-
// src/insight/utils.ts
|
|
247
|
-
import { MIDSCENE_MODEL_NAME as MIDSCENE_MODEL_NAME2, getAIConfig as getAIConfig2 } from "@midscene/shared/env";
|
|
248
|
-
import { uuid } from "@midscene/shared/utils";
|
|
249
|
-
/**
 * Stamp an insight dump with bookkeeping fields and hand it to the subscriber.
 *
 * The record carries a fresh `logId`, the SDK version, the current time and
 * the configured model name; fields of `data` with the same key override them.
 *
 * @param {object} data - Dump payload produced by an Insight call.
 * @param {Function} [dumpSubscriber] - Receiver of the final record; skipped when absent.
 */
function emitInsightDump(data, dumpSubscriber) {
  const sdkVersion = getVersion();
  const logTime = Date.now();
  const model_name = getAIConfig2(MIDSCENE_MODEL_NAME2) || "";
  const record = {
    logId: uuid(),
    sdkVersion,
    logTime,
    model_name,
    ...data
  };
  dumpSubscriber?.(record);
}
|
|
262
|
-
|
|
263
|
-
// src/insight/index.ts
|
|
264
|
-
var debug = getDebug("ai:insight");
|
|
265
|
-
/**
 * AI-backed inspection of a UI context: locate an element, extract data,
 * check an assertion, or describe what sits at a given position.
 *
 * `locate`, `extract` and `assert` pick up `onceDumpUpdatedFn` (if set), clear
 * it, and report a dump record to it through `emitInsightDump`.
 */
var Insight = class {
  /**
   * @param {object|Function} context - A ready UI context, or a function
   *   `(action) => context | Promise<context>` that retrieves one.
   * @param {object} [opt]
   * @param {Function} [opt.aiVendorFn] - Replaces the default AI caller.
   * @param {object} [opt.taskInfo] - Extra fields merged into each dump's task info.
   */
  constructor(context, opt) {
    this.aiVendorFn = callAiFn;
    assert2(context, "context is required for Insight");
    this.contextRetrieverFn = typeof context === "function" ? context : () => Promise.resolve(context);
    if (typeof opt?.aiVendorFn !== "undefined") {
      this.aiVendorFn = opt.aiVendorFn;
    }
    if (typeof opt?.taskInfo !== "undefined") {
      this.taskInfo = opt.taskInfo;
    }
  }

  /**
   * Locate the single element matching `query`.
   *
   * @param {object} query - Must be an object with a non-empty `prompt`;
   *   `deepThink` requests a section search before the element search.
   * @param {object} [opt]
   * @param {object} [opt.context] - Context to use instead of retrieving one.
   * @param {Function} [opt.callAI] - AI caller for this call only.
   * @returns {Promise<{element: object|null, rect: *}>}
   * @throws when the query is invalid, the model reports errors, the search
   *   area cannot be found, or more than one element matches.
   */
  async locate(query, opt) {
    const { callAI } = opt || {};
    // `?.` so a null/undefined query reaches the assertion below instead of
    // failing with an opaque TypeError on `.prompt`.
    const queryPrompt = typeof query === "string" ? query : query?.prompt;
    assert2(queryPrompt, "query is required for locate");
    const dumpSubscriber = this.onceDumpUpdatedFn;
    this.onceDumpUpdatedFn = void 0;
    assert2(typeof query === "object", "query should be an object for locate");

    const globalDeepThinkSwitch = getAIConfigInBoolean(
      MIDSCENE_FORCE_DEEP_THINK
    );
    if (globalDeepThinkSwitch) {
      debug("globalDeepThinkSwitch", globalDeepThinkSwitch);
    }

    let searchAreaPrompt;
    if (query.deepThink || globalDeepThinkSwitch) {
      searchAreaPrompt = query.prompt;
    }
    if (searchAreaPrompt && !vlLocateMode2()) {
      console.warn(
        'The "deepThink" feature is not supported with multimodal LLM. Please config VL model for Midscene. https://midscenejs.com/choose-a-model'
      );
      searchAreaPrompt = void 0;
    }

    const context = opt?.context || await this.contextRetrieverFn("locate");

    let searchArea = void 0;
    let searchAreaRawResponse = void 0;
    let searchAreaUsage = void 0;
    let searchAreaResponse = void 0;
    if (searchAreaPrompt) {
      searchAreaResponse = await AiLocateSection({
        context,
        sectionDescription: searchAreaPrompt
      });
      assert2(
        searchAreaResponse.rect,
        `cannot find search area for "${searchAreaPrompt}"${searchAreaResponse.error ? `: ${searchAreaResponse.error}` : ""}`
      );
      searchAreaRawResponse = searchAreaResponse.rawResponse;
      searchAreaUsage = searchAreaResponse.usage;
      searchArea = searchAreaResponse.rect;
    }

    // Only the element search itself is timed, not the section search above.
    const startTime = Date.now();
    const {
      parseResult,
      rect,
      elementById,
      rawResponse,
      usage,
      isOrderSensitive
    } = await AiLocateElement({
      callAI: callAI || this.aiVendorFn,
      context,
      targetElementDescription: queryPrompt,
      searchConfig: searchAreaResponse
    });
    const timeCost = Date.now() - startTime;

    const taskInfo = {
      ...this.taskInfo ? this.taskInfo : {},
      durationMs: timeCost,
      rawResponse: JSON.stringify(rawResponse),
      formatResponse: JSON.stringify(parseResult),
      usage,
      searchArea,
      searchAreaRawResponse,
      searchAreaUsage
    };

    let errorLog;
    if (parseResult.errors?.length) {
      errorLog = `AI model failed to locate:\n${parseResult.errors.join("\n")}`;
    }

    // Resolve the ids returned by the model; ids that cannot be resolved are
    // skipped with a warning rather than treated as an error.
    const elements = [];
    for (const item of parseResult.elements || []) {
      if (!("id" in item)) {
        continue;
      }
      const element = elementById(item.id);
      if (!element) {
        console.warn(
          `locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`
        );
        continue;
      }
      elements.push(element);
    }

    // The dump is emitted before any failure is raised so the subscriber
    // still receives the record of a failed locate.
    emitInsightDump(
      {
        type: "locate",
        userQuery: {
          element: queryPrompt
        },
        matchedElement: elements,
        matchedRect: rect,
        data: null,
        taskInfo,
        deepThink: !!searchArea,
        error: errorLog
      },
      dumpSubscriber
    );

    if (errorLog) {
      throw new Error(errorLog);
    }
    assert2(
      elements.length <= 1,
      `locate: multiple elements found, length = ${elements.length}`
    );
    if (elements.length === 1) {
      const [match] = elements;
      return {
        element: {
          id: match.id,
          indexId: match.indexId,
          center: match.center,
          rect: match.rect,
          xpaths: match.xpaths || [],
          attributes: match.attributes,
          isOrderSensitive
        },
        rect
      };
    }
    return {
      element: null,
      rect
    };
  }

  /**
   * Extract data described by `dataDemand` from the current context.
   *
   * @param {object|string} dataDemand - What to extract.
   * @param {object} [opt] - Forwarded as `extractOption`.
   * @param {*} [multimodalPrompt] - Forwarded unchanged to the extractor.
   * @returns {Promise<{data: *, thought: *, usage: *}>}
   * @throws when the model reports errors and returned no data.
   */
  async extract(dataDemand, opt, multimodalPrompt) {
    assert2(
      typeof dataDemand === "object" || typeof dataDemand === "string",
      `dataDemand should be object or string, but get ${typeof dataDemand}`
    );
    const dumpSubscriber = this.onceDumpUpdatedFn;
    this.onceDumpUpdatedFn = void 0;

    const context = await this.contextRetrieverFn("extract");
    const startTime = Date.now();
    const { parseResult, usage } = await AiExtractElementInfo({
      context,
      dataQuery: dataDemand,
      multimodalPrompt,
      extractOption: opt
    });
    const timeCost = Date.now() - startTime;

    const taskInfo = {
      ...this.taskInfo ? this.taskInfo : {},
      durationMs: timeCost,
      rawResponse: JSON.stringify(parseResult)
    };

    let errorLog;
    if (parseResult.errors?.length) {
      errorLog = `AI response error:\n${parseResult.errors.join("\n")}`;
    }

    const { data, thought } = parseResult || {};
    emitInsightDump(
      {
        type: "extract",
        userQuery: {
          dataDemand
        },
        matchedElement: [],
        data,
        taskInfo,
        error: errorLog
      },
      dumpSubscriber
    );

    // Reported errors are tolerated as long as some data came back.
    if (errorLog && !data) {
      throw new Error(errorLog);
    }
    return {
      data,
      thought,
      usage
    };
  }

  /**
   * Ask the model whether `assertion` holds for the current context.
   *
   * @param {*} assertion - The statement to check.
   * @returns {Promise<{pass: *, thought: *, usage: *}>}
   */
  async assert(assertion) {
    const dumpSubscriber = this.onceDumpUpdatedFn;
    this.onceDumpUpdatedFn = void 0;

    const context = await this.contextRetrieverFn("assert");
    const startTime = Date.now();
    const assertResult = await AiAssert({
      assertion,
      context
    });
    const timeCost = Date.now() - startTime;

    const taskInfo = {
      ...this.taskInfo ? this.taskInfo : {},
      durationMs: timeCost,
      rawResponse: JSON.stringify(assertResult.content)
    };
    const { thought, pass } = assertResult.content;

    emitInsightDump(
      {
        type: "assert",
        userQuery: {
          assertion
        },
        matchedElement: [],
        data: null,
        taskInfo,
        assertionPass: pass,
        assertionThought: thought,
        // A failed assertion reports the model's reasoning as the error.
        error: pass ? void 0 : thought
      },
      dumpSubscriber
    );
    return {
      pass,
      thought,
      usage: assertResult.usage
    };
  }

  /**
   * Describe the element inside `target` on the current screenshot.
   *
   * @param {object|number[]} target - A rect, or an `[x, y]` point which is
   *   turned into a 30x30 rect centred on that point.
   * @param {object} [opt]
   * @param {boolean} [opt.deepThink] - Crop the marked screenshot to an
   *   expanded area around the target before sending it to the model.
   * @returns {Promise<object>} The model's response content (has `description`).
   * @throws when no screenshot is available or the model fails to describe.
   */
  async describe(target, opt) {
    assert2(target, "target is required for insight.describe");
    const context = await this.contextRetrieverFn("describe");
    const { screenshotBase64, size } = context;
    assert2(screenshotBase64, "screenshot is required for insight.describe");

    const systemPrompt = elementDescriberInstruction();
    const defaultRectSize = 30;
    let targetRect = target;
    if (Array.isArray(target)) {
      const [x, y] = target;
      targetRect = {
        left: Math.floor(x - defaultRectSize / 2),
        top: Math.floor(y - defaultRectSize / 2),
        width: defaultRectSize,
        height: defaultRectSize
      };
    }

    // Draw the marker rectangle onto the screenshot.
    let imagePayload = await compositeElementInfoImg({
      inputImgBase64: screenshotBase64,
      size,
      elementsPositionInfo: [
        {
          rect: targetRect
        }
      ],
      borderThickness: 3
    });

    if (opt?.deepThink) {
      const searchArea = expandSearchArea(targetRect, context.size);
      debug("describe: set searchArea", searchArea);
      imagePayload = await cropByRect(
        imagePayload,
        searchArea,
        getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL)
      );
    }

    const msgs = [
      { role: "system", content: systemPrompt },
      {
        role: "user",
        content: [
          {
            type: "image_url",
            image_url: {
              url: imagePayload,
              detail: "high"
            }
          }
        ]
      }
    ];
    const callAIFn = this.aiVendorFn || callToGetJSONObject;
    const res = await callAIFn(msgs, 4 /* DESCRIBE_ELEMENT */);
    const { content } = res;
    assert2(!content.error, `describe failed: ${content.error}`);
    assert2(content.description, "failed to describe the element");
    return content;
  }
};
|
|
553
|
-
|
|
554
|
-
// src/index.ts
|
|
555
|
-
import { getAIConfig as getAIConfig3, MIDSCENE_MODEL_NAME as MIDSCENE_MODEL_NAME3 } from "@midscene/shared/env";
|
|
556
|
-
var src_default = Insight;
|
|
557
|
-
export {
|
|
558
|
-
AiAssert,
|
|
559
|
-
AiLocateElement,
|
|
560
|
-
Executor,
|
|
561
|
-
Insight,
|
|
562
|
-
MIDSCENE_MODEL_NAME3 as MIDSCENE_MODEL_NAME,
|
|
563
|
-
src_default as default,
|
|
564
|
-
describeUserPage,
|
|
565
|
-
getAIConfig3 as getAIConfig,
|
|
566
|
-
getVersion,
|
|
567
|
-
plan
|
|
568
|
-
};
|
|
569
|
-
|
|
570
|
-
//# sourceMappingURL=index.js.map
|