@midscene/core 0.26.2 → 0.26.3-beta-20250813021342.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/dist/es/ai-model.mjs +2502 -0
  2. package/dist/es/ai-model.mjs.map +1 -0
  3. package/dist/es/index.mjs +2362 -0
  4. package/dist/es/index.mjs.map +1 -0
  5. package/dist/es/tree.mjs +2 -0
  6. package/dist/es/utils.mjs +188 -0
  7. package/dist/es/{chunk-O3KUKF2A.js.map → utils.mjs.map} +1 -1
  8. package/dist/lib/ai-model.js +2581 -3
  9. package/dist/lib/ai-model.js.map +1 -0
  10. package/dist/lib/index.js +2375 -493
  11. package/dist/lib/index.js.map +1 -1
  12. package/dist/lib/tree.js +42 -11
  13. package/dist/lib/tree.js.map +1 -1
  14. package/dist/lib/utils.js +257 -29
  15. package/dist/lib/utils.js.map +1 -0
  16. package/dist/types/ai-model.d.ts +505 -99
  17. package/dist/types/index.d.ts +1299 -53
  18. package/dist/types/tree.d.ts +11 -1
  19. package/dist/types/utils.d.ts +47 -33
  20. package/package.json +28 -12
  21. package/dist/es/ai-model.d.ts +0 -99
  22. package/dist/es/ai-model.js +0 -44
  23. package/dist/es/chunk-DDYIQHOA.js +0 -2883
  24. package/dist/es/chunk-DDYIQHOA.js.map +0 -1
  25. package/dist/es/chunk-O3KUKF2A.js +0 -265
  26. package/dist/es/index.d.ts +0 -53
  27. package/dist/es/index.js +0 -570
  28. package/dist/es/index.js.map +0 -1
  29. package/dist/es/llm-planning-4e0c16fe.d.ts +0 -106
  30. package/dist/es/tree.d.ts +0 -1
  31. package/dist/es/tree.js +0 -13
  32. package/dist/es/tree.js.map +0 -1
  33. package/dist/es/types-8a6be57c.d.ts +0 -577
  34. package/dist/es/utils.d.ts +0 -33
  35. package/dist/es/utils.js +0 -30
  36. package/dist/lib/ai-model.d.ts +0 -99
  37. package/dist/lib/chunk-DDYIQHOA.js +0 -2883
  38. package/dist/lib/chunk-DDYIQHOA.js.map +0 -1
  39. package/dist/lib/chunk-O3KUKF2A.js +0 -265
  40. package/dist/lib/chunk-O3KUKF2A.js.map +0 -1
  41. package/dist/lib/index.d.ts +0 -53
  42. package/dist/lib/llm-planning-4e0c16fe.d.ts +0 -106
  43. package/dist/lib/tree.d.ts +0 -1
  44. package/dist/lib/types-8a6be57c.d.ts +0 -577
  45. package/dist/lib/utils.d.ts +0 -33
  46. package/dist/types/llm-planning-4e0c16fe.d.ts +0 -106
  47. package/dist/types/types-8a6be57c.d.ts +0 -577
@@ -1,53 +0,0 @@
1
- import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightTaskInfo, e as InsightOptions, f as DetailedLocateParam, L as LocateResult, g as InsightExtractParam, h as InsightExtractOption, T as TMultimodalPrompt, A as AIUsageInfo, i as TUserPrompt, j as InsightAssertionResponse, k as AIDescribeElementResponse } from './types-8a6be57c.js';
2
- export { z as AIAssertionResponse, x as AIDataExtractionResponse, v as AIElementCoordinatesResponse, u as AIElementLocatorResponse, w as AIElementResponse, q as AIResponseFormat, y as AISectionLocatorResponse, t as AISingleElementResponse, r as AISingleElementResponseById, s as AISingleElementResponseByPosition, W as AgentAssertOpt, F as AgentDescribeElementAtPointResult, V as AgentWaitForOpt, aa as BaseAgentParserOpt, G as CallAIFn, aE as CodeGenerationChunk, a9 as Color, aG as DeviceAction, K as DumpMeta, Q as ElementById, H as EnsureObject, ad as ExecutionRecorderItem, av as ExecutionTaskAction, au as ExecutionTaskActionApply, ag as ExecutionTaskHitBy, at as ExecutionTaskInsightAssertion, as as ExecutionTaskInsightAssertionApply, ar as ExecutionTaskInsightAssertionParam, ak as ExecutionTaskInsightDumpLog, am as ExecutionTaskInsightLocate, al as ExecutionTaskInsightLocateApply, aj as ExecutionTaskInsightLocateOutput, ai as ExecutionTaskInsightLocateParam, aq as ExecutionTaskInsightQuery, ap as ExecutionTaskInsightQueryApply, ao as ExecutionTaskInsightQueryOutput, an as ExecutionTaskInsightQueryParam, ax as ExecutionTaskLog, aw as ExecutionTaskLogApply, az as ExecutionTaskPlanning, ay as ExecutionTaskPlanningApply, ah as ExecutionTaskReturn, ae as ExecutionTaskType, af as ExecutorContext, b2 as FreeFn, aA as GroupedActionDump, N as InsightDump, O as LiteUISection, p as LocateOption, J as LocateResultElement, C as LocateValidatorResult, B as LocatorValidatorOption, b5 as MidsceneYamlConfig, b6 as MidsceneYamlConfigOutput, o as MidsceneYamlConfigResult, m as MidsceneYamlFlowItem, aN as MidsceneYamlFlowItemAIAction, aS as MidsceneYamlFlowItemAIAsk, aO as MidsceneYamlFlowItemAIAssert, aT as MidsceneYamlFlowItemAIBoolean, aX as MidsceneYamlFlowItemAIHover, aY as MidsceneYamlFlowItemAIInput, aZ as MidsceneYamlFlowItemAIKeyboardPress, aU as MidsceneYamlFlowItemAILocate, aQ as MidsceneYamlFlowItemAINumber, aP as MidsceneYamlFlowItemAIQuery, n as MidsceneYamlFlowItemAIRightClick, a_ as MidsceneYamlFlowItemAIScroll, aR as MidsceneYamlFlowItemAIString, aW as MidsceneYamlFlowItemAITap, aV as MidsceneYamlFlowItemAIWaitFor, a$ as MidsceneYamlFlowItemEvaluateJavaScript, b1 as MidsceneYamlFlowItemLogScreenshot, b0 as MidsceneYamlFlowItemSleep, M as MidsceneYamlScript, aL as MidsceneYamlScriptAndroidEnv, aM as MidsceneYamlScriptEnv, aJ as MidsceneYamlScriptEnvBase, aK as MidsceneYamlScriptWebEnv, l as MidsceneYamlTask, S as OnTaskStartTip, aB as PageType, P as PartialInsightDumpFromSDK, Z as PlanningAIResponse, Y as PlanningAction, a7 as PlanningActionParamAndroidLongPress, a8 as PlanningActionParamAndroidPull, a3 as PlanningActionParamAssert, a5 as PlanningActionParamError, $ as PlanningActionParamHover, a1 as PlanningActionParamInputOrKeyPress, a0 as PlanningActionParamRightClick, a2 as PlanningActionParamScroll, a4 as PlanningActionParamSleep, _ as PlanningActionParamTap, a6 as PlanningActionParamWaitFor, X as PlanningLocateParam, ac as PlaywrightParserOpt, ab as PuppeteerParserOpt, aH as ReferenceImage, R as ReportDumpWithAttributes, b4 as ScriptPlayerStatusValue, b3 as ScriptPlayerTaskStatus, aF as StreamingAIResponse, aD as StreamingCallback, aC as StreamingCodeGenerationOptions, aI as scrollParam } from './types-8a6be57c.js';
3
- import { c as callAiFn } from './llm-planning-4e0c16fe.js';
4
- export { a as AiAssert, A as AiLocateElement, d as describeUserPage, p as plan } from './llm-planning-4e0c16fe.js';
5
- import { BaseElement, Rect } from '@midscene/shared/types';
6
- export { BaseElement, ElementTreeNode, Point, Rect, Size } from '@midscene/shared/types';
7
- export { getVersion } from './utils.js';
8
- export { MIDSCENE_MODEL_NAME, getAIConfig } from '@midscene/shared/env';
9
- import '@midscene/shared/constants';
10
- import 'openai/resources';
11
-
12
- declare class Executor {
13
- name: string;
14
- tasks: ExecutionTask[];
15
- status: 'init' | 'pending' | 'running' | 'completed' | 'error';
16
- onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];
17
- constructor(name: string, options?: ExecutionTaskProgressOptions & {
18
- tasks?: ExecutionTaskApply[];
19
- });
20
- private markTaskAsPending;
21
- append(task: ExecutionTaskApply[] | ExecutionTaskApply): Promise<void>;
22
- flush(): Promise<{
23
- output: any;
24
- thought?: string;
25
- } | undefined>;
26
- isInErrorState(): boolean;
27
- latestErrorTask(): ExecutionTask | null;
28
- dump(): ExecutionDump;
29
- }
30
-
31
- interface LocateOpts {
32
- context?: UIContext<BaseElement>;
33
- callAI?: typeof callAiFn<AIElementResponse>;
34
- }
35
- declare class Insight<ElementType extends BaseElement = BaseElement, ContextType extends UIContext<ElementType> = UIContext<ElementType>> {
36
- contextRetrieverFn: (action: InsightAction) => Promise<ContextType> | ContextType;
37
- aiVendorFn: (...args: Array<any>) => Promise<any>;
38
- onceDumpUpdatedFn?: DumpSubscriber;
39
- taskInfo?: Omit<InsightTaskInfo, 'durationMs'>;
40
- constructor(context: ContextType | ((action: InsightAction) => Promise<ContextType> | ContextType), opt?: InsightOptions);
41
- locate(query: DetailedLocateParam, opt?: LocateOpts): Promise<LocateResult>;
42
- extract<T>(dataDemand: InsightExtractParam, opt?: InsightExtractOption, multimodalPrompt?: TMultimodalPrompt): Promise<{
43
- data: T;
44
- thought?: string;
45
- usage?: AIUsageInfo;
46
- }>;
47
- assert(assertion: TUserPrompt): Promise<InsightAssertionResponse>;
48
- describe(target: Rect | [number, number], opt?: {
49
- deepThink?: boolean;
50
- }): Promise<Pick<AIDescribeElementResponse, 'description'>>;
51
- }
52
-
53
- export { AIDescribeElementResponse, AIUsageInfo, DetailedLocateParam, DumpSubscriber, ExecutionDump, ExecutionTask, ExecutionTaskApply, ExecutionTaskProgressOptions, Executor, Insight, InsightAction, InsightAssertionResponse, InsightExtractOption, InsightExtractParam, InsightOptions, InsightTaskInfo, LocateResult, TMultimodalPrompt, TUserPrompt, UIContext, Insight as default };
package/dist/es/index.js DELETED
@@ -1,570 +0,0 @@
1
- import {
2
- getVersion
3
- } from "./chunk-O3KUKF2A.js";
4
- import {
5
- AiAssert,
6
- AiExtractElementInfo,
7
- AiLocateElement,
8
- AiLocateSection,
9
- callAiFn,
10
- callToGetJSONObject,
11
- describeUserPage,
12
- expandSearchArea,
13
- plan
14
- } from "./chunk-DDYIQHOA.js";
15
-
16
- // src/ai-model/action-executor.ts
17
- import {
18
- MIDSCENE_MODEL_NAME,
19
- getAIConfig,
20
- uiTarsModelVersion,
21
- vlLocateMode
22
- } from "@midscene/shared/env";
23
- import { assert } from "@midscene/shared/utils";
24
- var Executor = class {
25
- constructor(name, options) {
26
- this.status = options?.tasks && options.tasks.length > 0 ? "pending" : "init";
27
- this.name = name;
28
- this.tasks = (options?.tasks || []).map(
29
- (item) => this.markTaskAsPending(item)
30
- );
31
- this.onTaskStart = options?.onTaskStart;
32
- }
33
- markTaskAsPending(task) {
34
- return {
35
- status: "pending",
36
- ...task
37
- };
38
- }
39
- async append(task) {
40
- assert(
41
- this.status !== "error",
42
- `executor is in error state, cannot append task
43
- error=${this.latestErrorTask()?.error}
44
- ${this.latestErrorTask()?.errorStack}`
45
- );
46
- if (Array.isArray(task)) {
47
- this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
48
- } else {
49
- this.tasks.push(this.markTaskAsPending(task));
50
- }
51
- if (this.status !== "running") {
52
- this.status = "pending";
53
- }
54
- }
55
- async flush() {
56
- if (this.status === "init" && this.tasks.length > 0) {
57
- console.warn(
58
- "illegal state for executor, status is init but tasks are not empty"
59
- );
60
- }
61
- assert(this.status !== "running", "executor is already running");
62
- assert(this.status !== "completed", "executor is already completed");
63
- assert(this.status !== "error", "executor is in error state");
64
- const nextPendingIndex = this.tasks.findIndex(
65
- (task) => task.status === "pending"
66
- );
67
- if (nextPendingIndex < 0) {
68
- return;
69
- }
70
- this.status = "running";
71
- let taskIndex = nextPendingIndex;
72
- let successfullyCompleted = true;
73
- let previousFindOutput;
74
- while (taskIndex < this.tasks.length) {
75
- const task = this.tasks[taskIndex];
76
- assert(
77
- task.status === "pending",
78
- `task status should be pending, but got: ${task.status}`
79
- );
80
- task.timing = {
81
- start: Date.now()
82
- };
83
- try {
84
- task.status = "running";
85
- try {
86
- if (this.onTaskStart) {
87
- await this.onTaskStart(task);
88
- }
89
- } catch (e) {
90
- console.error("error in onTaskStart", e);
91
- }
92
- assert(
93
- ["Insight", "Action", "Planning"].indexOf(task.type) >= 0,
94
- `unsupported task type: ${task.type}`
95
- );
96
- const { executor, param } = task;
97
- assert(executor, `executor is required for task type: ${task.type}`);
98
- let returnValue;
99
- const executorContext = {
100
- task,
101
- element: previousFindOutput?.element
102
- };
103
- if (task.type === "Insight") {
104
- assert(
105
- task.subType === "Locate" || task.subType === "Query" || task.subType === "Assert" || task.subType === "Boolean" || task.subType === "Number" || task.subType === "String",
106
- `unsupported insight subType: ${task.subType}`
107
- );
108
- returnValue = await task.executor(param, executorContext);
109
- if (task.subType === "Locate") {
110
- previousFindOutput = returnValue?.output;
111
- }
112
- } else if (task.type === "Action" || task.type === "Planning") {
113
- returnValue = await task.executor(param, executorContext);
114
- } else {
115
- console.warn(
116
- `unsupported task type: ${task.type}, will try to execute it directly`
117
- );
118
- returnValue = await task.executor(param, executorContext);
119
- }
120
- Object.assign(task, returnValue);
121
- task.status = "finished";
122
- task.timing.end = Date.now();
123
- task.timing.cost = task.timing.end - task.timing.start;
124
- taskIndex++;
125
- } catch (e) {
126
- successfullyCompleted = false;
127
- task.error = e;
128
- task.errorMessage = e?.message || (typeof e === "string" ? e : "error-without-message");
129
- task.errorStack = e.stack;
130
- task.status = "failed";
131
- task.timing.end = Date.now();
132
- task.timing.cost = task.timing.end - task.timing.start;
133
- break;
134
- }
135
- }
136
- for (let i = taskIndex + 1; i < this.tasks.length; i++) {
137
- this.tasks[i].status = "cancelled";
138
- }
139
- if (successfullyCompleted) {
140
- this.status = "completed";
141
- } else {
142
- this.status = "error";
143
- }
144
- if (this.tasks.length) {
145
- const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
146
- const { thought, output } = this.tasks[outputIndex];
147
- return {
148
- thought,
149
- output
150
- };
151
- }
152
- }
153
- isInErrorState() {
154
- return this.status === "error";
155
- }
156
- latestErrorTask() {
157
- if (this.status !== "error") {
158
- return null;
159
- }
160
- const errorTaskIndex = this.tasks.findIndex(
161
- (task) => task.status === "failed"
162
- );
163
- if (errorTaskIndex >= 0) {
164
- return this.tasks[errorTaskIndex];
165
- }
166
- return null;
167
- }
168
- dump() {
169
- let modelDescription = "";
170
- if (vlLocateMode()) {
171
- const uiTarsModelVer = uiTarsModelVersion();
172
- if (uiTarsModelVer) {
173
- modelDescription = `UI-TARS=${uiTarsModelVer}`;
174
- } else {
175
- modelDescription = `${vlLocateMode()} mode`;
176
- }
177
- }
178
- const dumpData = {
179
- sdkVersion: getVersion(),
180
- model_name: getAIConfig(MIDSCENE_MODEL_NAME) || "",
181
- model_description: modelDescription,
182
- logTime: Date.now(),
183
- name: this.name,
184
- tasks: this.tasks
185
- };
186
- return dumpData;
187
- }
188
- };
189
-
190
- // src/ai-model/prompt/describe.ts
191
- import { getPreferredLanguage } from "@midscene/shared/env";
192
- var elementDescriberInstruction = () => {
193
- return `
194
- Describe the element in the red rectangle for precise identification. Use ${getPreferredLanguage()}.
195
-
196
- CRITICAL REQUIREMENTS:
197
- 1. UNIQUENESS: The description must uniquely identify this element on the current page
198
- 2. UNIVERSALITY: Use generic, reusable selectors that work across different contexts
199
- 3. PRECISION: Be specific enough to distinguish from similar elements
200
-
201
- DESCRIPTION STRUCTURE:
202
- 1. Element type (button, input, link, div, etc.)
203
- 2. Primary identifier (in order of preference):
204
- - Unique text content: "with text 'Login'"
205
- - Unique attribute: "with aria-label 'Search'"
206
- - Unique class/ID: "with class 'primary-button'"
207
- - Unique position: "in header navigation"
208
- 3. Secondary identifiers (if needed for uniqueness):
209
- - Visual features: "blue background", "with icon"
210
- - Relative position: "below search bar", "in sidebar"
211
- - Parent context: "in login form", "in main menu"
212
-
213
- GUIDELINES:
214
- - Keep description under 25 words
215
- - Prioritize semantic identifiers over visual ones
216
- - Use consistent terminology across similar elements
217
- - Avoid page-specific or temporary content
218
- - Don't mention the red rectangle or selection box
219
- - Focus on stable, reusable characteristics
220
-
221
- EXAMPLES:
222
- - "Login button with text 'Sign In'"
223
- - "Search input with placeholder 'Enter keywords'"
224
- - "Navigation link with text 'Home' in header"
225
- - "Submit button in contact form"
226
- - "Menu icon with aria-label 'Open menu'"
227
-
228
- Return JSON:
229
- {
230
- "description": "unique element identifier",
231
- "error"?: "error message if any"
232
- }`;
233
- };
234
-
235
- // src/insight/index.ts
236
- import {
237
- MIDSCENE_FORCE_DEEP_THINK,
238
- MIDSCENE_USE_QWEN_VL,
239
- getAIConfigInBoolean,
240
- vlLocateMode as vlLocateMode2
241
- } from "@midscene/shared/env";
242
- import { compositeElementInfoImg, cropByRect } from "@midscene/shared/img";
243
- import { getDebug } from "@midscene/shared/logger";
244
- import { assert as assert2 } from "@midscene/shared/utils";
245
-
246
- // src/insight/utils.ts
247
- import { MIDSCENE_MODEL_NAME as MIDSCENE_MODEL_NAME2, getAIConfig as getAIConfig2 } from "@midscene/shared/env";
248
- import { uuid } from "@midscene/shared/utils";
249
- function emitInsightDump(data, dumpSubscriber) {
250
- const baseData = {
251
- sdkVersion: getVersion(),
252
- logTime: Date.now(),
253
- model_name: getAIConfig2(MIDSCENE_MODEL_NAME2) || ""
254
- };
255
- const finalData = {
256
- logId: uuid(),
257
- ...baseData,
258
- ...data
259
- };
260
- dumpSubscriber?.(finalData);
261
- }
262
-
263
- // src/insight/index.ts
264
- var debug = getDebug("ai:insight");
265
- var Insight = class {
266
- constructor(context, opt) {
267
- this.aiVendorFn = callAiFn;
268
- assert2(context, "context is required for Insight");
269
- if (typeof context === "function") {
270
- this.contextRetrieverFn = context;
271
- } else {
272
- this.contextRetrieverFn = () => Promise.resolve(context);
273
- }
274
- if (typeof opt?.aiVendorFn !== "undefined") {
275
- this.aiVendorFn = opt.aiVendorFn;
276
- }
277
- if (typeof opt?.taskInfo !== "undefined") {
278
- this.taskInfo = opt.taskInfo;
279
- }
280
- }
281
- async locate(query, opt) {
282
- const { callAI } = opt || {};
283
- const queryPrompt = typeof query === "string" ? query : query.prompt;
284
- assert2(queryPrompt, "query is required for locate");
285
- const dumpSubscriber = this.onceDumpUpdatedFn;
286
- this.onceDumpUpdatedFn = void 0;
287
- assert2(typeof query === "object", "query should be an object for locate");
288
- const globalDeepThinkSwitch = getAIConfigInBoolean(
289
- MIDSCENE_FORCE_DEEP_THINK
290
- );
291
- if (globalDeepThinkSwitch) {
292
- debug("globalDeepThinkSwitch", globalDeepThinkSwitch);
293
- }
294
- let searchAreaPrompt;
295
- if (query.deepThink || globalDeepThinkSwitch) {
296
- searchAreaPrompt = query.prompt;
297
- }
298
- if (searchAreaPrompt && !vlLocateMode2()) {
299
- console.warn(
300
- 'The "deepThink" feature is not supported with multimodal LLM. Please config VL model for Midscene. https://midscenejs.com/choose-a-model'
301
- );
302
- searchAreaPrompt = void 0;
303
- }
304
- const context = opt?.context || await this.contextRetrieverFn("locate");
305
- let searchArea = void 0;
306
- let searchAreaRawResponse = void 0;
307
- let searchAreaUsage = void 0;
308
- let searchAreaResponse = void 0;
309
- if (searchAreaPrompt) {
310
- searchAreaResponse = await AiLocateSection({
311
- context,
312
- sectionDescription: searchAreaPrompt
313
- });
314
- assert2(
315
- searchAreaResponse.rect,
316
- `cannot find search area for "${searchAreaPrompt}"${searchAreaResponse.error ? `: ${searchAreaResponse.error}` : ""}`
317
- );
318
- searchAreaRawResponse = searchAreaResponse.rawResponse;
319
- searchAreaUsage = searchAreaResponse.usage;
320
- searchArea = searchAreaResponse.rect;
321
- }
322
- const startTime = Date.now();
323
- const {
324
- parseResult,
325
- rect,
326
- elementById,
327
- rawResponse,
328
- usage,
329
- isOrderSensitive
330
- } = await AiLocateElement({
331
- callAI: callAI || this.aiVendorFn,
332
- context,
333
- targetElementDescription: queryPrompt,
334
- searchConfig: searchAreaResponse
335
- });
336
- const timeCost = Date.now() - startTime;
337
- const taskInfo = {
338
- ...this.taskInfo ? this.taskInfo : {},
339
- durationMs: timeCost,
340
- rawResponse: JSON.stringify(rawResponse),
341
- formatResponse: JSON.stringify(parseResult),
342
- usage,
343
- searchArea,
344
- searchAreaRawResponse,
345
- searchAreaUsage
346
- };
347
- let errorLog;
348
- if (parseResult.errors?.length) {
349
- errorLog = `AI model failed to locate:
350
- ${parseResult.errors.join("\n")}`;
351
- }
352
- const dumpData = {
353
- type: "locate",
354
- userQuery: {
355
- element: queryPrompt
356
- },
357
- matchedElement: [],
358
- matchedRect: rect,
359
- data: null,
360
- taskInfo,
361
- deepThink: !!searchArea,
362
- error: errorLog
363
- };
364
- const elements = [];
365
- (parseResult.elements || []).forEach((item) => {
366
- if ("id" in item) {
367
- const element = elementById(item?.id);
368
- if (!element) {
369
- console.warn(
370
- `locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`
371
- );
372
- return;
373
- }
374
- elements.push(element);
375
- }
376
- });
377
- emitInsightDump(
378
- {
379
- ...dumpData,
380
- matchedElement: elements
381
- },
382
- dumpSubscriber
383
- );
384
- if (errorLog) {
385
- throw new Error(errorLog);
386
- }
387
- assert2(
388
- elements.length <= 1,
389
- `locate: multiple elements found, length = ${elements.length}`
390
- );
391
- if (elements.length === 1) {
392
- return {
393
- element: {
394
- id: elements[0].id,
395
- indexId: elements[0].indexId,
396
- center: elements[0].center,
397
- rect: elements[0].rect,
398
- xpaths: elements[0].xpaths || [],
399
- attributes: elements[0].attributes,
400
- isOrderSensitive
401
- },
402
- rect
403
- };
404
- }
405
- return {
406
- element: null,
407
- rect
408
- };
409
- }
410
- async extract(dataDemand, opt, multimodalPrompt) {
411
- assert2(
412
- typeof dataDemand === "object" || typeof dataDemand === "string",
413
- `dataDemand should be object or string, but get ${typeof dataDemand}`
414
- );
415
- const dumpSubscriber = this.onceDumpUpdatedFn;
416
- this.onceDumpUpdatedFn = void 0;
417
- const context = await this.contextRetrieverFn("extract");
418
- const startTime = Date.now();
419
- const { parseResult, usage } = await AiExtractElementInfo({
420
- context,
421
- dataQuery: dataDemand,
422
- multimodalPrompt,
423
- extractOption: opt
424
- });
425
- const timeCost = Date.now() - startTime;
426
- const taskInfo = {
427
- ...this.taskInfo ? this.taskInfo : {},
428
- durationMs: timeCost,
429
- rawResponse: JSON.stringify(parseResult)
430
- };
431
- let errorLog;
432
- if (parseResult.errors?.length) {
433
- errorLog = `AI response error:
434
- ${parseResult.errors.join("\n")}`;
435
- }
436
- const dumpData = {
437
- type: "extract",
438
- userQuery: {
439
- dataDemand
440
- },
441
- matchedElement: [],
442
- data: null,
443
- taskInfo,
444
- error: errorLog
445
- };
446
- const { data, thought } = parseResult || {};
447
- emitInsightDump(
448
- {
449
- ...dumpData,
450
- data
451
- },
452
- dumpSubscriber
453
- );
454
- if (errorLog && !data) {
455
- throw new Error(errorLog);
456
- }
457
- return {
458
- data,
459
- thought,
460
- usage
461
- };
462
- }
463
- async assert(assertion) {
464
- const dumpSubscriber = this.onceDumpUpdatedFn;
465
- this.onceDumpUpdatedFn = void 0;
466
- const context = await this.contextRetrieverFn("assert");
467
- const startTime = Date.now();
468
- const assertResult = await AiAssert({
469
- assertion,
470
- context
471
- });
472
- const timeCost = Date.now() - startTime;
473
- const taskInfo = {
474
- ...this.taskInfo ? this.taskInfo : {},
475
- durationMs: timeCost,
476
- rawResponse: JSON.stringify(assertResult.content)
477
- };
478
- const { thought, pass } = assertResult.content;
479
- const dumpData = {
480
- type: "assert",
481
- userQuery: {
482
- assertion
483
- },
484
- matchedElement: [],
485
- data: null,
486
- taskInfo,
487
- assertionPass: pass,
488
- assertionThought: thought,
489
- error: pass ? void 0 : thought
490
- };
491
- emitInsightDump(dumpData, dumpSubscriber);
492
- return {
493
- pass,
494
- thought,
495
- usage: assertResult.usage
496
- };
497
- }
498
- async describe(target, opt) {
499
- assert2(target, "target is required for insight.describe");
500
- const context = await this.contextRetrieverFn("describe");
501
- const { screenshotBase64, size } = context;
502
- assert2(screenshotBase64, "screenshot is required for insight.describe");
503
- const systemPrompt = elementDescriberInstruction();
504
- const defaultRectSize = 30;
505
- const targetRect = Array.isArray(target) ? {
506
- left: Math.floor(target[0] - defaultRectSize / 2),
507
- top: Math.floor(target[1] - defaultRectSize / 2),
508
- width: defaultRectSize,
509
- height: defaultRectSize
510
- } : target;
511
- let imagePayload = await compositeElementInfoImg({
512
- inputImgBase64: screenshotBase64,
513
- size,
514
- elementsPositionInfo: [
515
- {
516
- rect: targetRect
517
- }
518
- ],
519
- borderThickness: 3
520
- });
521
- if (opt?.deepThink) {
522
- const searchArea = expandSearchArea(targetRect, context.size);
523
- debug("describe: set searchArea", searchArea);
524
- imagePayload = await cropByRect(
525
- imagePayload,
526
- searchArea,
527
- getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL)
528
- );
529
- }
530
- const msgs = [
531
- { role: "system", content: systemPrompt },
532
- {
533
- role: "user",
534
- content: [
535
- {
536
- type: "image_url",
537
- image_url: {
538
- url: imagePayload,
539
- detail: "high"
540
- }
541
- }
542
- ]
543
- }
544
- ];
545
- const callAIFn = this.aiVendorFn || callToGetJSONObject;
546
- const res = await callAIFn(msgs, 4 /* DESCRIBE_ELEMENT */);
547
- const { content } = res;
548
- assert2(!content.error, `describe failed: ${content.error}`);
549
- assert2(content.description, "failed to describe the element");
550
- return content;
551
- }
552
- };
553
-
554
- // src/index.ts
555
- import { getAIConfig as getAIConfig3, MIDSCENE_MODEL_NAME as MIDSCENE_MODEL_NAME3 } from "@midscene/shared/env";
556
- var src_default = Insight;
557
- export {
558
- AiAssert,
559
- AiLocateElement,
560
- Executor,
561
- Insight,
562
- MIDSCENE_MODEL_NAME3 as MIDSCENE_MODEL_NAME,
563
- src_default as default,
564
- describeUserPage,
565
- getAIConfig3 as getAIConfig,
566
- getVersion,
567
- plan
568
- };
569
-
570
- //# sourceMappingURL=index.js.map