@midscene/core 0.28.2 → 0.28.3-beta-20250910131004.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/dist/es/agent/agent.mjs +27 -17
  2. package/dist/es/agent/agent.mjs.map +1 -1
  3. package/dist/es/agent/tasks.mjs +43 -31
  4. package/dist/es/agent/tasks.mjs.map +1 -1
  5. package/dist/es/agent/utils.mjs +4 -3
  6. package/dist/es/agent/utils.mjs.map +1 -1
  7. package/dist/es/ai-model/common.mjs +12 -22
  8. package/dist/es/ai-model/common.mjs.map +1 -1
  9. package/dist/es/ai-model/index.mjs +3 -3
  10. package/dist/es/ai-model/inspect.mjs +26 -34
  11. package/dist/es/ai-model/inspect.mjs.map +1 -1
  12. package/dist/es/ai-model/llm-planning.mjs +14 -15
  13. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  14. package/dist/es/ai-model/prompt/common.mjs.map +1 -1
  15. package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
  16. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
  17. package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -1
  18. package/dist/es/ai-model/prompt/playwright-generator.mjs +6 -12
  19. package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
  20. package/dist/es/ai-model/prompt/util.mjs +3 -3
  21. package/dist/es/ai-model/prompt/util.mjs.map +1 -1
  22. package/dist/es/ai-model/prompt/yaml-generator.mjs +6 -12
  23. package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
  24. package/dist/es/ai-model/service-caller/index.mjs +28 -23
  25. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  26. package/dist/es/ai-model/ui-tars-planning.mjs +10 -10
  27. package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
  28. package/dist/es/insight/index.mjs +20 -26
  29. package/dist/es/insight/index.mjs.map +1 -1
  30. package/dist/es/types.mjs.map +1 -1
  31. package/dist/es/utils.mjs +4 -5
  32. package/dist/es/utils.mjs.map +1 -1
  33. package/dist/lib/agent/agent.js +26 -16
  34. package/dist/lib/agent/agent.js.map +1 -1
  35. package/dist/lib/agent/tasks.js +43 -31
  36. package/dist/lib/agent/tasks.js.map +1 -1
  37. package/dist/lib/agent/utils.js +4 -3
  38. package/dist/lib/agent/utils.js.map +1 -1
  39. package/dist/lib/ai-model/common.js +14 -27
  40. package/dist/lib/ai-model/common.js.map +1 -1
  41. package/dist/lib/ai-model/index.js +12 -15
  42. package/dist/lib/ai-model/inspect.js +24 -32
  43. package/dist/lib/ai-model/inspect.js.map +1 -1
  44. package/dist/lib/ai-model/llm-planning.js +13 -14
  45. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  46. package/dist/lib/ai-model/prompt/common.js.map +1 -1
  47. package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
  48. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
  49. package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -1
  50. package/dist/lib/ai-model/prompt/playwright-generator.js +5 -11
  51. package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
  52. package/dist/lib/ai-model/prompt/util.js +3 -3
  53. package/dist/lib/ai-model/prompt/util.js.map +1 -1
  54. package/dist/lib/ai-model/prompt/yaml-generator.js +5 -11
  55. package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
  56. package/dist/lib/ai-model/service-caller/index.js +36 -31
  57. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  58. package/dist/lib/ai-model/ui-tars-planning.js +8 -8
  59. package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
  60. package/dist/lib/insight/index.js +17 -23
  61. package/dist/lib/insight/index.js.map +1 -1
  62. package/dist/lib/types.js.map +1 -1
  63. package/dist/lib/utils.js +3 -4
  64. package/dist/lib/utils.js.map +1 -1
  65. package/dist/types/agent/agent.d.ts +2 -0
  66. package/dist/types/agent/tasks.d.ts +8 -7
  67. package/dist/types/agent/utils.d.ts +3 -1
  68. package/dist/types/ai-model/common.d.ts +9 -16
  69. package/dist/types/ai-model/index.d.ts +2 -2
  70. package/dist/types/ai-model/inspect.d.ts +7 -6
  71. package/dist/types/ai-model/llm-planning.d.ts +2 -2
  72. package/dist/types/ai-model/prompt/common.d.ts +2 -2
  73. package/dist/types/ai-model/prompt/llm-locator.d.ts +2 -2
  74. package/dist/types/ai-model/prompt/llm-planning.d.ts +3 -3
  75. package/dist/types/ai-model/prompt/llm-section-locator.d.ts +2 -2
  76. package/dist/types/ai-model/prompt/playwright-generator.d.ts +3 -2
  77. package/dist/types/ai-model/prompt/util.d.ts +3 -2
  78. package/dist/types/ai-model/prompt/yaml-generator.d.ts +3 -2
  79. package/dist/types/ai-model/service-caller/index.d.ts +5 -5
  80. package/dist/types/ai-model/ui-tars-planning.d.ts +3 -3
  81. package/dist/types/device/index.d.ts +2 -2
  82. package/dist/types/insight/index.d.ts +12 -7
  83. package/dist/types/types.d.ts +0 -9
  84. package/dist/types/utils.d.ts +2 -1
  85. package/package.json +3 -3
@@ -1,8 +1,8 @@
1
- import { AIActionType, callAiFn, expandSearchArea } from "../ai-model/common.mjs";
2
- import { AiExtractElementInfo, AiLocateElement, callToGetJSONObject } from "../ai-model/index.mjs";
1
+ import { AIActionType, expandSearchArea } from "../ai-model/common.mjs";
2
+ import { AiExtractElementInfo, AiLocateElement, callAIWithObjectResponse } from "../ai-model/index.mjs";
3
3
  import { AiLocateSection } from "../ai-model/inspect.mjs";
4
4
  import { elementDescriberInstruction } from "../ai-model/prompt/describe.mjs";
5
- import { MIDSCENE_FORCE_DEEP_THINK, getIsUseQwenVl, globalConfigManager, vlLocateMode } from "@midscene/shared/env";
5
+ import { MIDSCENE_FORCE_DEEP_THINK, globalConfigManager } from "@midscene/shared/env";
6
6
  import { compositeElementInfoImg, cropByRect } from "@midscene/shared/img";
7
7
  import { getDebug } from "@midscene/shared/logger";
8
8
  import { assert } from "@midscene/shared/utils";
@@ -19,9 +19,8 @@ function _define_property(obj, key, value) {
19
19
  }
20
20
  const debug = getDebug('ai:insight');
21
21
  class Insight {
22
- async locate(query, opt) {
22
+ async locate(query, opt, modelConfig) {
23
23
  var _parseResult_errors;
24
- const { callAI } = opt || {};
25
24
  const queryPrompt = 'string' == typeof query ? query : query.prompt;
26
25
  assert(queryPrompt, 'query is required for locate');
27
26
  const dumpSubscriber = this.onceDumpUpdatedFn;
@@ -31,10 +30,8 @@ class Insight {
31
30
  if (globalDeepThinkSwitch) debug('globalDeepThinkSwitch', globalDeepThinkSwitch);
32
31
  let searchAreaPrompt;
33
32
  if (query.deepThink || globalDeepThinkSwitch) searchAreaPrompt = query.prompt;
34
- const modelPreferences = {
35
- intent: 'grounding'
36
- };
37
- if (searchAreaPrompt && !vlLocateMode(modelPreferences)) {
33
+ const { vlMode } = modelConfig;
34
+ if (searchAreaPrompt && !vlMode) {
38
35
  console.warn('The "deepThink" feature is not supported with multimodal LLM. Please config VL model for Midscene. https://midscenejs.com/choose-a-model');
39
36
  searchAreaPrompt = void 0;
40
37
  }
@@ -46,7 +43,8 @@ class Insight {
46
43
  if (searchAreaPrompt) {
47
44
  searchAreaResponse = await AiLocateSection({
48
45
  context,
49
- sectionDescription: searchAreaPrompt
46
+ sectionDescription: searchAreaPrompt,
47
+ modelConfig
50
48
  });
51
49
  assert(searchAreaResponse.rect, `cannot find search area for "${searchAreaPrompt}"${searchAreaResponse.error ? `: ${searchAreaResponse.error}` : ''}`);
52
50
  searchAreaRawResponse = searchAreaResponse.rawResponse;
@@ -55,10 +53,11 @@ class Insight {
55
53
  }
56
54
  const startTime = Date.now();
57
55
  const { parseResult, rect, elementById, rawResponse, usage, isOrderSensitive } = await AiLocateElement({
58
- callAI: callAI || this.aiVendorFn,
56
+ callAIFn: this.aiVendorFn,
59
57
  context,
60
58
  targetElementDescription: queryPrompt,
61
- searchConfig: searchAreaResponse
59
+ searchConfig: searchAreaResponse,
60
+ modelConfig
62
61
  });
63
62
  const timeCost = Date.now() - startTime;
64
63
  const taskInfo = {
@@ -116,14 +115,11 @@ class Insight {
116
115
  rect
117
116
  };
118
117
  }
119
- async extract(dataDemand, opt, multimodalPrompt) {
118
+ async extract(dataDemand, modelConfig, opt, multimodalPrompt) {
120
119
  var _parseResult_errors;
121
120
  assert('object' == typeof dataDemand || 'string' == typeof dataDemand, `dataDemand should be object or string, but get ${typeof dataDemand}`);
122
121
  const dumpSubscriber = this.onceDumpUpdatedFn;
123
122
  this.onceDumpUpdatedFn = void 0;
124
- const modelPreferences = {
125
- intent: 'VQA'
126
- };
127
123
  const context = await this.contextRetrieverFn('extract');
128
124
  const startTime = Date.now();
129
125
  const { parseResult, usage } = await AiExtractElementInfo({
@@ -131,7 +127,7 @@ class Insight {
131
127
  dataQuery: dataDemand,
132
128
  multimodalPrompt,
133
129
  extractOption: opt,
134
- modelPreferences
130
+ modelConfig
135
131
  });
136
132
  const timeCost = Date.now() - startTime;
137
133
  const taskInfo = {
@@ -163,14 +159,12 @@ class Insight {
163
159
  usage
164
160
  };
165
161
  }
166
- async describe(target, opt) {
162
+ async describe(target, modelConfig, opt) {
167
163
  assert(target, 'target is required for insight.describe');
168
164
  const context = await this.contextRetrieverFn('describe');
169
165
  const { screenshotBase64, size } = context;
170
166
  assert(screenshotBase64, 'screenshot is required for insight.describe');
171
- const modelPreferences = {
172
- intent: 'grounding'
173
- };
167
+ const { vlMode } = modelConfig;
174
168
  const systemPrompt = elementDescriberInstruction();
175
169
  const defaultRectSize = 30;
176
170
  const targetRect = Array.isArray(target) ? {
@@ -190,9 +184,9 @@ class Insight {
190
184
  borderThickness: 3
191
185
  });
192
186
  if (null == opt ? void 0 : opt.deepThink) {
193
- const searchArea = expandSearchArea(targetRect, context.size, modelPreferences);
187
+ const searchArea = expandSearchArea(targetRect, context.size, vlMode);
194
188
  debug('describe: set searchArea', searchArea);
195
- imagePayload = await cropByRect(imagePayload, searchArea, getIsUseQwenVl(modelPreferences));
189
+ imagePayload = await cropByRect(imagePayload, searchArea, 'qwen-vl' === vlMode);
196
190
  }
197
191
  const msgs = [
198
192
  {
@@ -212,8 +206,8 @@ class Insight {
212
206
  ]
213
207
  }
214
208
  ];
215
- const callAIFn = this.aiVendorFn || callToGetJSONObject;
216
- const res = await callAIFn(msgs, AIActionType.DESCRIBE_ELEMENT, modelPreferences);
209
+ const callAIFn = this.aiVendorFn;
210
+ const res = await callAIFn(msgs, AIActionType.DESCRIBE_ELEMENT, modelConfig);
217
211
  const { content } = res;
218
212
  assert(!content.error, `describe failed: ${content.error}`);
219
213
  assert(content.description, 'failed to describe the element');
@@ -221,7 +215,7 @@ class Insight {
221
215
  }
222
216
  constructor(context, opt){
223
217
  _define_property(this, "contextRetrieverFn", void 0);
224
- _define_property(this, "aiVendorFn", callAiFn);
218
+ _define_property(this, "aiVendorFn", callAIWithObjectResponse);
225
219
  _define_property(this, "onceDumpUpdatedFn", void 0);
226
220
  _define_property(this, "taskInfo", void 0);
227
221
  assert(context, 'context is required for Insight');
@@ -1 +1 @@
1
- {"version":3,"file":"insight/index.mjs","sources":["webpack://@midscene/core/./src/insight/index.ts"],"sourcesContent":["import {\n AIActionType,\n type AIArgs,\n callAiFn,\n expandSearchArea,\n} from '@/ai-model/common';\nimport {\n AiExtractElementInfo,\n AiLocateElement,\n callToGetJSONObject,\n} from '@/ai-model/index';\nimport { AiLocateSection } from '@/ai-model/inspect';\nimport { elementDescriberInstruction } from '@/ai-model/prompt/describe';\nimport type {\n AIDescribeElementResponse,\n AIElementResponse,\n AIUsageInfo,\n BaseElement,\n DetailedLocateParam,\n DumpSubscriber,\n InsightAction,\n InsightExtractOption,\n InsightExtractParam,\n InsightOptions,\n InsightTaskInfo,\n LocateResult,\n PartialInsightDumpFromSDK,\n Rect,\n UIContext,\n} from '@/types';\nimport {\n type IModelPreferences,\n MIDSCENE_FORCE_DEEP_THINK,\n getIsUseQwenVl,\n globalConfigManager,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { compositeElementInfoImg, cropByRect } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { TMultimodalPrompt } from '../ai-model/common';\nimport { emitInsightDump } from './utils';\n\nexport interface LocateOpts {\n context?: UIContext<BaseElement>;\n callAI?: typeof callAiFn<AIElementResponse>;\n}\n\nexport type AnyValue<T> = {\n [K in keyof T]: unknown extends T[K] ? any : T[K];\n};\n\nconst debug = getDebug('ai:insight');\nexport default class Insight<\n ElementType extends BaseElement = BaseElement,\n ContextType extends UIContext<ElementType> = UIContext<ElementType>,\n> {\n contextRetrieverFn: (\n action: InsightAction,\n ) => Promise<ContextType> | ContextType;\n\n aiVendorFn: (...args: Array<any>) => Promise<any> = callAiFn;\n\n onceDumpUpdatedFn?: DumpSubscriber;\n\n taskInfo?: Omit<InsightTaskInfo, 'durationMs'>;\n\n constructor(\n context:\n | ContextType\n | ((action: InsightAction) => Promise<ContextType> | ContextType),\n opt?: InsightOptions,\n ) {\n assert(context, 'context is required for Insight');\n if (typeof context === 'function') {\n this.contextRetrieverFn = context;\n } else {\n this.contextRetrieverFn = () => Promise.resolve(context);\n }\n\n if (typeof opt?.aiVendorFn !== 'undefined') {\n this.aiVendorFn = opt.aiVendorFn;\n }\n if (typeof opt?.taskInfo !== 'undefined') {\n this.taskInfo = opt.taskInfo;\n }\n }\n\n async locate(\n query: DetailedLocateParam,\n opt?: LocateOpts,\n ): Promise<LocateResult> {\n const { callAI } = opt || {};\n const queryPrompt = typeof query === 'string' ? query : query.prompt;\n assert(queryPrompt, 'query is required for locate');\n const dumpSubscriber = this.onceDumpUpdatedFn;\n this.onceDumpUpdatedFn = undefined;\n\n assert(typeof query === 'object', 'query should be an object for locate');\n\n const globalDeepThinkSwitch = globalConfigManager.getEnvConfigInBoolean(\n MIDSCENE_FORCE_DEEP_THINK,\n );\n if (globalDeepThinkSwitch) {\n debug('globalDeepThinkSwitch', globalDeepThinkSwitch);\n }\n let searchAreaPrompt;\n if (query.deepThink || globalDeepThinkSwitch) {\n searchAreaPrompt = query.prompt;\n }\n const modelPreferences: IModelPreferences = {\n intent: 'grounding',\n };\n\n if (searchAreaPrompt && !vlLocateMode(modelPreferences)) {\n console.warn(\n 'The \"deepThink\" feature is not supported with multimodal LLM. Please config VL model for Midscene. https://midscenejs.com/choose-a-model',\n );\n searchAreaPrompt = undefined;\n }\n\n const context = opt?.context || (await this.contextRetrieverFn('locate'));\n\n let searchArea: Rect | undefined = undefined;\n let searchAreaRawResponse: string | undefined = undefined;\n let searchAreaUsage: AIUsageInfo | undefined = undefined;\n let searchAreaResponse:\n | Awaited<ReturnType<typeof AiLocateSection>>\n | undefined = undefined;\n if (searchAreaPrompt) {\n searchAreaResponse = await AiLocateSection({\n context,\n sectionDescription: searchAreaPrompt,\n });\n assert(\n searchAreaResponse.rect,\n `cannot find search area for \"${searchAreaPrompt}\"${\n searchAreaResponse.error ? `: ${searchAreaResponse.error}` : ''\n }`,\n );\n searchAreaRawResponse = searchAreaResponse.rawResponse;\n searchAreaUsage = searchAreaResponse.usage;\n searchArea = searchAreaResponse.rect;\n }\n\n const startTime = Date.now();\n const {\n parseResult,\n rect,\n elementById,\n rawResponse,\n usage,\n isOrderSensitive,\n } = await AiLocateElement({\n callAI: callAI || this.aiVendorFn,\n context,\n targetElementDescription: queryPrompt,\n searchConfig: searchAreaResponse,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: InsightTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(rawResponse),\n formatResponse: JSON.stringify(parseResult),\n usage,\n searchArea,\n searchAreaRawResponse,\n searchAreaUsage,\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI model failed to locate: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialInsightDumpFromSDK = {\n type: 'locate',\n userQuery: {\n element: queryPrompt,\n },\n matchedElement: [],\n matchedRect: rect,\n data: null,\n taskInfo,\n deepThink: !!searchArea,\n error: errorLog,\n };\n\n const elements: BaseElement[] = [];\n (parseResult.elements || []).forEach((item) => {\n if ('id' in item) {\n const element = elementById(item?.id);\n\n if (!element) {\n console.warn(\n `locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`,\n );\n return;\n }\n elements.push(element);\n }\n });\n\n emitInsightDump(\n {\n ...dumpData,\n matchedElement: elements,\n },\n dumpSubscriber,\n );\n\n if (errorLog) {\n throw new Error(errorLog);\n }\n\n assert(\n elements.length <= 1,\n `locate: multiple elements found, length = ${elements.length}`,\n );\n\n if (elements.length === 1) {\n return {\n element: {\n id: elements[0]!.id,\n indexId: elements[0]!.indexId,\n center: elements[0]!.center,\n rect: elements[0]!.rect,\n xpaths: elements[0]!.xpaths || [],\n attributes: elements[0]!.attributes,\n isOrderSensitive,\n },\n rect,\n };\n }\n return {\n element: null,\n rect,\n };\n }\n\n async extract<T>(\n dataDemand: InsightExtractParam,\n opt?: InsightExtractOption,\n multimodalPrompt?: TMultimodalPrompt,\n ): Promise<{\n data: T;\n thought?: string;\n usage?: AIUsageInfo;\n }> {\n assert(\n typeof dataDemand === 'object' || typeof dataDemand === 'string',\n `dataDemand should be object or string, but get ${typeof dataDemand}`,\n );\n const dumpSubscriber = this.onceDumpUpdatedFn;\n this.onceDumpUpdatedFn = undefined;\n\n const modelPreferences: IModelPreferences = {\n intent: 'VQA',\n };\n\n const context = await this.contextRetrieverFn('extract');\n\n const startTime = Date.now();\n const { parseResult, usage } = await AiExtractElementInfo<T>({\n context,\n dataQuery: dataDemand,\n multimodalPrompt,\n extractOption: opt,\n modelPreferences,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: InsightTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(parseResult),\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI response error: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialInsightDumpFromSDK = {\n type: 'extract',\n userQuery: {\n dataDemand,\n },\n matchedElement: [],\n data: null,\n taskInfo,\n error: errorLog,\n };\n\n const { data, thought } = parseResult || {};\n\n // 4\n emitInsightDump(\n {\n ...dumpData,\n data,\n },\n dumpSubscriber,\n );\n\n if (errorLog && !data && !opt?.doNotThrowError) {\n throw new Error(errorLog);\n }\n\n return {\n data,\n thought,\n usage,\n };\n }\n\n async describe(\n target: Rect | [number, number],\n opt?: {\n deepThink?: boolean;\n },\n ): Promise<Pick<AIDescribeElementResponse, 'description'>> {\n assert(target, 'target is required for insight.describe');\n const context = await this.contextRetrieverFn('describe');\n const { screenshotBase64, size } = context;\n assert(screenshotBase64, 'screenshot is required for insight.describe');\n // The result of the \"describe\" function will be used for positioning, so essentially it is a form of grounding.\n const modelPreferences: IModelPreferences = { intent: 'grounding' };\n const systemPrompt = elementDescriberInstruction();\n\n // Convert [x,y] center point to Rect if needed\n const defaultRectSize = 30;\n const targetRect: Rect = Array.isArray(target)\n ? {\n left: Math.floor(target[0] - defaultRectSize / 2),\n top: Math.floor(target[1] - defaultRectSize / 2),\n width: defaultRectSize,\n height: defaultRectSize,\n }\n : target;\n\n let imagePayload = await compositeElementInfoImg({\n inputImgBase64: screenshotBase64,\n size,\n elementsPositionInfo: [\n {\n rect: targetRect,\n },\n ],\n borderThickness: 3,\n });\n\n if (opt?.deepThink) {\n const searchArea = expandSearchArea(\n targetRect,\n context.size,\n modelPreferences,\n );\n debug('describe: set searchArea', searchArea);\n imagePayload = await cropByRect(\n imagePayload,\n searchArea,\n getIsUseQwenVl(modelPreferences),\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n },\n ];\n\n const callAIFn =\n this.aiVendorFn || callToGetJSONObject<AIDescribeElementResponse>;\n\n const res = await callAIFn(\n msgs,\n AIActionType.DESCRIBE_ELEMENT,\n modelPreferences,\n );\n\n const { content } = res;\n assert(!content.error, `describe failed: ${content.error}`);\n assert(content.description, 'failed to describe the element');\n return content;\n }\n}\n"],"names":["debug","getDebug","Insight","query","opt","_parseResult_errors","callAI","queryPrompt","assert","dumpSubscriber","undefined","globalDeepThinkSwitch","globalConfigManager","MIDSCENE_FORCE_DEEP_THINK","searchAreaPrompt","modelPreferences","vlLocateMode","console","context","searchArea","searchAreaRawResponse","searchAreaUsage","searchAreaResponse","AiLocateSection","startTime","Date","parseResult","rect","elementById","rawResponse","usage","isOrderSensitive","AiLocateElement","timeCost","taskInfo","JSON","errorLog","dumpData","elements","item","element","emitInsightDump","Error","dataDemand","multimodalPrompt","AiExtractElementInfo","data","thought","target","screenshotBase64","size","systemPrompt","elementDescriberInstruction","defaultRectSize","targetRect","Array","Math","imagePayload","compositeElementInfoImg","expandSearchArea","cropByRect","getIsUseQwenVl","msgs","callAIFn","callToGetJSONObject","res","AIActionType","content","callAiFn","Promise"],"mappings":";;;;;;;;;;;;;;;;;;;AAoDA,MAAMA,QAAQC,SAAS;AACR,MAAMC;IAmCnB,MAAM,OACJC,KAA0B,EAC1BC,GAAgB,EACO;YAkFnBC;QAjFJ,MAAM,EAAEC,MAAM,EAAE,GAAGF,OAAO,CAAC;QAC3B,MAAMG,cAAc,AAAiB,YAAjB,OAAOJ,QAAqBA,QAAQA,MAAM,MAAM;QACpEK,OAAOD,aAAa;QACpB,MAAME,iBAAiB,IAAI,CAAC,iBAAiB;QAC7C,IAAI,CAAC,iBAAiB,GAAGC;QAEzBF,OAAO,AAAiB,YAAjB,OAAOL,OAAoB;QAElC,MAAMQ,wBAAwBC,oBAAoB,qBAAqB,CACrEC;QAEF,IAAIF,uBACFX,MAAM,yBAAyBW;QAEjC,IAAIG;QACJ,IAAIX,MAAM,SAAS,IAAIQ,uBACrBG,mBAAmBX,MAAM,MAAM;QAEjC,MAAMY,mBAAsC;YAC1C,QAAQ;QACV;QAEA,IAAID,oBAAoB,CAACE,aAAaD,mBAAmB;YACvDE,QAAQ,IAAI,CACV;YAEFH,mBAAmBJ;QACrB;QAEA,MAAMQ,UAAUd,AAAAA,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,OAAO,AAAD,KAAM,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAE/D,IAAIe;QACJ,IAAIC;QACJ,IAAIC;QACJ,IAAIC;QAGJ,IAAIR,kBAAkB;YACpBQ,qBAAqB,MAAMC,gBAAgB;gBACzCL;gBACA,oBAAoBJ;YACtB;YACAN,OACEc,mBAAmB,IAAI,EACvB,CAAC,6BAA6B,EAAER,iBAAiB,CAAC,EAChDQ,mBAAmB,KAAK,GAAG,CAAC,EAAE,EAAEA,mBAAmB,KAAK,EAAE,GAAG,IAC7D;YAEJF,wBAAwBE,mBAAmB,WAAW;YACtDD,kBAAkBC,mBAAmB,KAAK;YAC1CH,aAAaG,mBAAmB,IAAI;QACtC;QAEA,MAAME,YAAYC,KAAK,GAAG;QAC1B,MAAM,EACJC,WAAW,EACXC,IAAI,EACJC,WAAW,EACXC,WAAW,EACXC,KAAK,EACLC,gBAAgB,EACjB,GAAG,MAAMC,gBAAgB;YACxB,QAAQ1B,UAAU,IAAI,CAAC,UAAU;YACjCY;YACA,0BAA0BX;YAC1B,cAAce;QAChB;QAEA,MAAMW,WAAWR,KAAK,GAAG,KAAKD;QAC9B,MAAMU,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACN;YAC5B,gBAAgBM,KAAK,SAAS,CAACT;YAC/BI;YACAX;YACAC;YACAC;QACF;QAEA,IAAIe;QACJ,IAAI,QAAA/B,CAAAA,sBAAAA,YAAY,MAAM,AAAD,IAAjBA,KAAAA,IAAAA,oBAAoB,MAAM,EAC5B+B,WAAW,CAAC,6BAA6B,EAAEV,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAG5E,MAAMW,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACT,SAAS9B;YACX;YACA,gBAAgB,EAAE;YAClB,aAAaoB;YACb,MAAM;YACNO;YACA,WAAW,CAAC,CAACf;YACb,OAAOiB;QACT;QAEA,MAAME,WAA0B,EAAE;QACjCZ,CAAAA,YAAY,QAAQ,IAAI,EAAC,EAAG,OAAO,CAAC,CAACa;YACpC,IAAI,QAAQA,MAAM;gBAChB,MAAMC,UAAUZ,YAAYW,QAAAA,OAAAA,KAAAA,IAAAA,KAAM,EAAE;gBAEpC,IAAI,CAACC,SAAS,YACZvB,QAAQ,IAAI,CACV,CAAC,+BAA+B,EAAEsB,KAAK,EAAE,CAAC,0CAA0C,CAAC;gBAIzFD,SAAS,IAAI,CAACE;YAChB;QACF;QAEAC,gBACE;YACE,GAAGJ,QAAQ;YACX,gBAAgBC;QAClB,GACA7B;QAGF,IAAI2B,UACF,MAAM,IAAIM,MAAMN;QAGlB5B,OACE8B,SAAS,MAAM,IAAI,GACnB,CAAC,0CAA0C,EAAEA,SAAS,MAAM,EAAE;QAGhE,IAAIA,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACL,SAAS;gBACP,IAAIA,QAAQ,CAAC,EAAE,CAAE,EAAE;gBACnB,SAASA,QAAQ,CAAC,EAAE,CAAE,OAAO;gBAC7B,QAAQA,QAAQ,CAAC,EAAE,CAAE,MAAM;gBAC3B,MAAMA,QAAQ,CAAC,EAAE,CAAE,IAAI;gBACvB,QAAQA,QAAQ,CAAC,EAAE,CAAE,MAAM,IAAI,EAAE;gBACjC,YAAYA,QAAQ,CAAC,EAAE,CAAE,UAAU;gBACnCP;YACF;YACAJ;QACF;QAEF,OAAO;YACL,SAAS;YACTA;QACF;IACF;IAEA,MAAM,QACJgB,UAA+B,EAC/BvC,GAA0B,EAC1BwC,gBAAoC,EAKnC;YA+BGvC;QA9BJG,OACE,AAAsB,YAAtB,OAAOmC,cAA2B,AAAsB,YAAtB,OAAOA,YACzC,CAAC,+CAA+C,EAAE,OAAOA,YAAY;QAEvE,MAAMlC,iBAAiB,IAAI,CAAC,iBAAiB;QAC7C,IAAI,CAAC,iBAAiB,GAAGC;QAEzB,MAAMK,mBAAsC;YAC1C,QAAQ;QACV;QAEA,MAAMG,UAAU,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAE9C,MAAMM,YAAYC,KAAK,GAAG;QAC1B,MAAM,EAAEC,WAAW,EAAEI,KAAK,EAAE,GAAG,MAAMe,qBAAwB;YAC3D3B;YACA,WAAWyB;YACXC;YACA,eAAexC;YACfW;QACF;QAEA,MAAMkB,WAAWR,KAAK,GAAG,KAAKD;QAC9B,MAAMU,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACT;QAC9B;QAEA,IAAIU;QACJ,IAAI,QAAA/B,CAAAA,sBAAAA,YAAY,MAAM,AAAD,IAAjBA,KAAAA,IAAAA,oBAAoB,MAAM,EAC5B+B,WAAW,CAAC,qBAAqB,EAAEV,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAGpE,MAAMW,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACTM;YACF;YACA,gBAAgB,EAAE;YAClB,MAAM;YACNT;YACA,OAAOE;QACT;QAEA,MAAM,EAAEU,IAAI,EAAEC,OAAO,EAAE,GAAGrB,eAAe,CAAC;QAG1Ce,gBACE;YACE,GAAGJ,QAAQ;YACXS;QACF,GACArC;QAGF,IAAI2B,YAAY,CAACU,QAAQ,CAAC1C,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,eAAe,AAAD,GAC3C,MAAM,IAAIsC,MAAMN;QAGlB,OAAO;YACLU;YACAC;YACAjB;QACF;IACF;IAEA,MAAM,SACJkB,MAA+B,EAC/B5C,GAEC,EACwD;QACzDI,OAAOwC,QAAQ;QACf,MAAM9B,UAAU,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAC9C,MAAM,EAAE+B,gBAAgB,EAAEC,IAAI,EAAE,GAAGhC;QACnCV,OAAOyC,kBAAkB;QAEzB,MAAMlC,mBAAsC;YAAE,QAAQ;QAAY;QAClE,MAAMoC,eAAeC;QAGrB,MAAMC,kBAAkB;QACxB,MAAMC,aAAmBC,MAAM,OAAO,CAACP,UACnC;YACE,MAAMQ,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC/C,KAAKG,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC9C,OAAOA;YACP,QAAQA;QACV,IACAL;QAEJ,IAAIS,eAAe,MAAMC,wBAAwB;YAC/C,gBAAgBT;YAChBC;YACA,sBAAsB;gBACpB;oBACE,MAAMI;gBACR;aACD;YACD,iBAAiB;QACnB;QAEA,IAAIlD,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,SAAS,EAAE;YAClB,MAAMe,aAAawC,iBACjBL,YACApC,QAAQ,IAAI,EACZH;YAEFf,MAAM,4BAA4BmB;YAClCsC,eAAe,MAAMG,WACnBH,cACAtC,YACA0C,eAAe9C;QAEnB;QAEA,MAAM+C,OAAe;YACnB;gBAAE,MAAM;gBAAU,SAASX;YAAa;YACxC;gBACE,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKM;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;SACD;QAED,MAAMM,WACJ,IAAI,CAAC,UAAU,IAAIC;QAErB,MAAMC,MAAM,MAAMF,SAChBD,MACAI,aAAa,gBAAgB,EAC7BnD;QAGF,MAAM,EAAEoD,OAAO,EAAE,GAAGF;QACpBzD,OAAO,CAAC2D,QAAQ,KAAK,EAAE,CAAC,iBAAiB,EAAEA,QAAQ,KAAK,EAAE;QAC1D3D,OAAO2D,QAAQ,WAAW,EAAE;QAC5B,OAAOA;IACT;IA1UA,YACEjD,OAEmE,EACnEd,GAAoB,CACpB;QAfF;QAIA,qCAAoDgE;QAEpD;QAEA;QAQE5D,OAAOU,SAAS;QAChB,IAAI,AAAmB,cAAnB,OAAOA,SACT,IAAI,CAAC,kBAAkB,GAAGA;aAE1B,IAAI,CAAC,kBAAkB,GAAG,IAAMmD,QAAQ,OAAO,CAACnD;QAGlD,IAAI,AAA2B,WAApBd,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,UAAU,AAAD,GACvB,IAAI,CAAC,UAAU,GAAGA,IAAI,UAAU;QAElC,IAAI,AAAyB,WAAlBA,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,QAAQ,AAAD,GACrB,IAAI,CAAC,QAAQ,GAAGA,IAAI,QAAQ;IAEhC;AAwTF"}
1
+ {"version":3,"file":"insight/index.mjs","sources":["webpack://@midscene/core/./src/insight/index.ts"],"sourcesContent":["import { AIActionType, type AIArgs, expandSearchArea } from '@/ai-model/common';\nimport {\n AiExtractElementInfo,\n AiLocateElement,\n callAIWithObjectResponse,\n} from '@/ai-model/index';\nimport { AiLocateSection } from '@/ai-model/inspect';\nimport { elementDescriberInstruction } from '@/ai-model/prompt/describe';\nimport type {\n AIDescribeElementResponse,\n AIUsageInfo,\n BaseElement,\n DetailedLocateParam,\n DumpSubscriber,\n InsightAction,\n InsightExtractOption,\n InsightExtractParam,\n InsightTaskInfo,\n LocateResult,\n PartialInsightDumpFromSDK,\n Rect,\n UIContext,\n} from '@/types';\nimport {\n type IModelConfig,\n MIDSCENE_FORCE_DEEP_THINK,\n globalConfigManager,\n} from '@midscene/shared/env';\nimport { compositeElementInfoImg, cropByRect } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { TMultimodalPrompt } from '../ai-model/common';\nimport { emitInsightDump } from './utils';\n\nexport interface LocateOpts {\n context?: UIContext<BaseElement>;\n}\n\nexport type AnyValue<T> = {\n [K in keyof T]: unknown extends T[K] ? any : T[K];\n};\n\ninterface InsightOptions {\n taskInfo?: Omit<InsightTaskInfo, 'durationMs'>;\n aiVendorFn?: typeof callAIWithObjectResponse;\n}\n\nconst debug = getDebug('ai:insight');\nexport default class Insight<\n ElementType extends BaseElement = BaseElement,\n ContextType extends UIContext<ElementType> = UIContext<ElementType>,\n> {\n contextRetrieverFn: (\n action: InsightAction,\n ) => Promise<ContextType> | ContextType;\n\n aiVendorFn: Exclude<InsightOptions['aiVendorFn'], undefined> =\n callAIWithObjectResponse;\n\n onceDumpUpdatedFn?: DumpSubscriber;\n\n taskInfo?: Omit<InsightTaskInfo, 'durationMs'>;\n\n constructor(\n context:\n | ContextType\n | ((action: InsightAction) => Promise<ContextType> | ContextType),\n opt?: InsightOptions,\n ) {\n assert(context, 'context is required for Insight');\n if (typeof context === 'function') {\n this.contextRetrieverFn = context;\n } else {\n this.contextRetrieverFn = () => Promise.resolve(context);\n }\n\n // just for unit test, aiVendorFn is callAIWithObjectResponse by default\n if (typeof opt?.aiVendorFn !== 'undefined') {\n this.aiVendorFn = opt.aiVendorFn;\n }\n if (typeof opt?.taskInfo !== 'undefined') {\n this.taskInfo = opt.taskInfo;\n }\n }\n\n async locate(\n query: DetailedLocateParam,\n opt: LocateOpts,\n modelConfig: IModelConfig,\n ): Promise<LocateResult> {\n const queryPrompt = typeof query === 'string' ? query : query.prompt;\n assert(queryPrompt, 'query is required for locate');\n const dumpSubscriber = this.onceDumpUpdatedFn;\n this.onceDumpUpdatedFn = undefined;\n\n assert(typeof query === 'object', 'query should be an object for locate');\n\n const globalDeepThinkSwitch = globalConfigManager.getEnvConfigInBoolean(\n MIDSCENE_FORCE_DEEP_THINK,\n );\n if (globalDeepThinkSwitch) {\n debug('globalDeepThinkSwitch', globalDeepThinkSwitch);\n }\n let searchAreaPrompt;\n if (query.deepThink || globalDeepThinkSwitch) {\n searchAreaPrompt = query.prompt;\n }\n\n const { vlMode } = modelConfig;\n\n if (searchAreaPrompt && !vlMode) {\n console.warn(\n 'The \"deepThink\" feature is not supported with multimodal LLM. Please config VL model for Midscene. https://midscenejs.com/choose-a-model',\n );\n searchAreaPrompt = undefined;\n }\n\n const context = opt?.context || (await this.contextRetrieverFn('locate'));\n\n let searchArea: Rect | undefined = undefined;\n let searchAreaRawResponse: string | undefined = undefined;\n let searchAreaUsage: AIUsageInfo | undefined = undefined;\n let searchAreaResponse:\n | Awaited<ReturnType<typeof AiLocateSection>>\n | undefined = undefined;\n if (searchAreaPrompt) {\n searchAreaResponse = await AiLocateSection({\n context,\n sectionDescription: searchAreaPrompt,\n modelConfig,\n });\n assert(\n searchAreaResponse.rect,\n `cannot find search area for \"${searchAreaPrompt}\"${\n searchAreaResponse.error ? `: ${searchAreaResponse.error}` : ''\n }`,\n );\n searchAreaRawResponse = searchAreaResponse.rawResponse;\n searchAreaUsage = searchAreaResponse.usage;\n searchArea = searchAreaResponse.rect;\n }\n\n const startTime = Date.now();\n const {\n parseResult,\n rect,\n elementById,\n rawResponse,\n usage,\n isOrderSensitive,\n } = await AiLocateElement({\n callAIFn: this.aiVendorFn,\n context,\n targetElementDescription: queryPrompt,\n searchConfig: searchAreaResponse,\n modelConfig,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: InsightTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(rawResponse),\n formatResponse: JSON.stringify(parseResult),\n usage,\n searchArea,\n searchAreaRawResponse,\n searchAreaUsage,\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI model failed to locate: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialInsightDumpFromSDK = {\n type: 'locate',\n userQuery: {\n element: queryPrompt,\n },\n matchedElement: [],\n matchedRect: rect,\n data: null,\n taskInfo,\n deepThink: !!searchArea,\n error: errorLog,\n };\n\n const elements: BaseElement[] = [];\n (parseResult.elements || []).forEach((item) => {\n if ('id' in item) {\n const element = elementById(item?.id);\n\n if (!element) {\n console.warn(\n `locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`,\n );\n return;\n }\n elements.push(element);\n }\n });\n\n emitInsightDump(\n {\n ...dumpData,\n matchedElement: elements,\n },\n dumpSubscriber,\n );\n\n if (errorLog) {\n throw new Error(errorLog);\n }\n\n assert(\n elements.length <= 1,\n `locate: multiple elements found, length = ${elements.length}`,\n );\n\n if (elements.length === 1) {\n return {\n element: {\n id: elements[0]!.id,\n indexId: elements[0]!.indexId,\n center: elements[0]!.center,\n rect: elements[0]!.rect,\n xpaths: elements[0]!.xpaths || [],\n attributes: elements[0]!.attributes,\n isOrderSensitive,\n },\n rect,\n };\n }\n return {\n element: null,\n rect,\n };\n }\n\n async extract<T>(\n dataDemand: InsightExtractParam,\n modelConfig: IModelConfig,\n opt?: InsightExtractOption,\n multimodalPrompt?: TMultimodalPrompt,\n ): Promise<{\n data: T;\n thought?: string;\n usage?: AIUsageInfo;\n }> {\n assert(\n typeof dataDemand === 'object' || typeof dataDemand === 'string',\n `dataDemand should be object or string, but get ${typeof dataDemand}`,\n );\n const dumpSubscriber = this.onceDumpUpdatedFn;\n this.onceDumpUpdatedFn = undefined;\n\n const context = await this.contextRetrieverFn('extract');\n\n const startTime = Date.now();\n\n const { parseResult, usage } = await AiExtractElementInfo<T>({\n context,\n dataQuery: dataDemand,\n multimodalPrompt,\n extractOption: opt,\n modelConfig,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: InsightTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(parseResult),\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI response error: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialInsightDumpFromSDK = {\n type: 'extract',\n userQuery: {\n dataDemand,\n },\n matchedElement: [],\n data: null,\n taskInfo,\n error: errorLog,\n };\n\n const { data, thought } = parseResult || {};\n\n // 4\n emitInsightDump(\n {\n ...dumpData,\n data,\n },\n dumpSubscriber,\n );\n\n if (errorLog && !data && !opt?.doNotThrowError) {\n throw new Error(errorLog);\n }\n\n return {\n data,\n thought,\n usage,\n };\n }\n\n async describe(\n target: Rect | [number, number],\n modelConfig: IModelConfig,\n opt?: {\n deepThink?: boolean;\n },\n ): Promise<Pick<AIDescribeElementResponse, 'description'>> {\n assert(target, 'target is required for insight.describe');\n const context = await this.contextRetrieverFn('describe');\n const { screenshotBase64, size } = context;\n assert(screenshotBase64, 'screenshot is required for insight.describe');\n // The result of the \"describe\" function will be used for positioning, so essentially it is a form of grounding.\n const { vlMode } = modelConfig;\n const systemPrompt = elementDescriberInstruction();\n\n // Convert [x,y] center point to Rect if needed\n const defaultRectSize = 30;\n const targetRect: Rect = Array.isArray(target)\n ? {\n left: Math.floor(target[0] - defaultRectSize / 2),\n top: Math.floor(target[1] - defaultRectSize / 2),\n width: defaultRectSize,\n height: defaultRectSize,\n }\n : target;\n\n let imagePayload = await compositeElementInfoImg({\n inputImgBase64: screenshotBase64,\n size,\n elementsPositionInfo: [\n {\n rect: targetRect,\n },\n ],\n borderThickness: 3,\n });\n\n if (opt?.deepThink) {\n const searchArea = expandSearchArea(targetRect, context.size, vlMode);\n debug('describe: set searchArea', searchArea);\n imagePayload = await cropByRect(\n imagePayload,\n searchArea,\n vlMode === 'qwen-vl',\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n },\n ];\n\n const callAIFn = this\n .aiVendorFn as typeof callAIWithObjectResponse<AIDescribeElementResponse>;\n\n const res = await callAIFn(\n msgs,\n AIActionType.DESCRIBE_ELEMENT,\n modelConfig,\n );\n\n const { content } = res;\n assert(!content.error, `describe failed: ${content.error}`);\n assert(content.description, 'failed to describe the element');\n return content;\n }\n}\n"],"names":["debug","getDebug","Insight","query","opt","modelConfig","_parseResult_errors","queryPrompt","assert","dumpSubscriber","undefined","globalDeepThinkSwitch","globalConfigManager","MIDSCENE_FORCE_DEEP_THINK","searchAreaPrompt","vlMode","console","context","searchArea","searchAreaRawResponse","searchAreaUsage","searchAreaResponse","AiLocateSection","startTime","Date","parseResult","rect","elementById","rawResponse","usage","isOrderSensitive","AiLocateElement","timeCost","taskInfo","JSON","errorLog","dumpData","elements","item","element","emitInsightDump","Error","dataDemand","multimodalPrompt","AiExtractElementInfo","data","thought","target","screenshotBase64","size","systemPrompt","elementDescriberInstruction","defaultRectSize","targetRect","Array","Math","imagePayload","compositeElementInfoImg","expandSearchArea","cropByRect","msgs","callAIFn","res","AIActionType","content","callAIWithObjectResponse","Promise"],"mappings":";;;;;;;;;;;;;;;;;;;AA+CA,MAAMA,QAAQC,SAAS;AACR,MAAMC;IAqCnB,MAAM,OACJC,KAA0B,EAC1BC,GAAe,EACfC,WAAyB,EACF;YAkFnBC;QAjFJ,MAAMC,cAAc,AAAiB,YAAjB,OAAOJ,QAAqBA,QAAQA,MAAM,MAAM;QACpEK,OAAOD,aAAa;QACpB,MAAME,iBAAiB,IAAI,CAAC,iBAAiB;QAC7C,IAAI,CAAC,iBAAiB,GAAGC;QAEzBF,OAAO,AAAiB,YAAjB,OAAOL,OAAoB;QAElC,MAAMQ,wBAAwBC,oBAAoB,qBAAqB,CACrEC;QAEF,IAAIF,uBACFX,MAAM,yBAAyBW;QAEjC,IAAIG;QACJ,IAAIX,MAAM,SAAS,IAAIQ,uBACrBG,mBAAmBX,MAAM,MAAM;QAGjC,MAAM,EAAEY,MAAM,EAAE,GAAGV;QAEnB,IAAIS,oBAAoB,CAACC,QAAQ;YAC/BC,QAAQ,IAAI,CACV;YAEFF,mBAAmBJ;QACrB;QAEA,MAAMO,UAAUb,AAAAA,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,OAAO,AAAD,KAAM,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAE/D,IAAIc;QACJ,IAAIC;QACJ,IAAIC;QACJ,IAAIC;QAGJ,IAAIP,kBAAkB;YACpBO,qBAAqB,MAAMC,gBAAgB;gBACzCL;gBACA,oBAAoBH;gBACpBT;YACF;YACAG,OACEa,mBAAmB,IAAI,EACvB,CAAC,6BAA6B,EAAEP,iBAAiB,CAAC,EAChDO,mBAAmB,KAAK,GAAG,CAAC,EAAE,EAAEA,mBAAmB,KAAK,EAAE,GAAG,IAC7D;YAEJF,wBAAwBE,mBAAmB,WAAW;YACtDD,kBAAkBC,mBAAmB,KAAK;YAC1CH,aAAaG,mBAAmB,IAAI;QACtC;QAEA,MAAME,YAAYC,KAAK,GAAG;QAC1B,MAAM,EACJC,WAAW,EACXC,IAAI,EACJC,WAAW,EACXC,WAAW,EACXC,KAAK,EACLC,gBAAgB,EACjB,GAAG,MAAMC,gBAAgB;YACxB,UAAU,IAAI,CAAC,UAAU;YACzBd;YACA,0BAA0BV;YAC1B,cAAcc;YACdhB;QACF;QAEA,MAAM2B,WAAWR,KAAK,GAAG,KAAKD;QAC9B,MAAMU,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACN;YAC5B,gBAAgBM,KAAK,SAAS,CAACT;YAC/BI;YACAX;YACAC;YACAC;QACF;QAEA,IAAIe;QACJ,IAAI,QAAA7B,CAAAA,sBAAAA,YAAY,MAAM,AAAD,IAAjBA,KAAAA,IAAAA,oBAAoB,MAAM,EAC5B6B,WAAW,CAAC,6BAA6B,EAAEV,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAG5E,MAAMW,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACT,SAAS7B;YACX;YACA,gBAAgB,EAAE;YAClB,aAAamB;YACb,MAAM;YACNO;YACA,WAAW,CAAC,CAACf;YACb,OAAOiB;QACT;QAEA,MAAME,WAA0B,EAAE;QACjCZ,CAAAA,YAAY,QAAQ,IAAI,EAAC,EAAG,OAAO,CAAC,CAACa;YACpC,IAAI,QAAQA,MAAM;gBAChB,MAAMC,UAAUZ,YAAYW,QAAAA,OAAAA,KAAAA,IAAAA,KAAM,EAAE;gBAEpC,IAAI,CAACC,SAAS,YACZvB,QAAQ,IAAI,CACV,CAAC,+BAA+B,EAAEsB,KAAK,EAAE,CAAC,0CAA0C,CAAC;gBAIzFD,SAAS,IAAI,CAACE;YAChB;QACF;QAEAC,gBACE;YACE,GAAGJ,QAAQ;YACX,gBAAgBC;QAClB,GACA5B;QAGF,IAAI0B,UACF,MAAM,IAAIM,MAAMN;QAGlB3B,OACE6B,SAAS,MAAM,IAAI,GACnB,CAAC,0CAA0C,EAAEA,SAAS,MAAM,EAAE;QAGhE,IAAIA,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACL,SAAS;gBACP,IAAIA,QAAQ,CAAC,EAAE,CAAE,EAAE;gBACnB,SAASA,QAAQ,CAAC,EAAE,CAAE,OAAO;gBAC7B,QAAQA,QAAQ,CAAC,EAAE,CAAE,MAAM;gBAC3B,MAAMA,QAAQ,CAAC,EAAE,CAAE,IAAI;gBACvB,QAAQA,QAAQ,CAAC,EAAE,CAAE,MAAM,IAAI,EAAE;gBACjC,YAAYA,QAAQ,CAAC,EAAE,CAAE,UAAU;gBACnCP;YACF;YACAJ;QACF;QAEF,OAAO;YACL,SAAS;YACTA;QACF;IACF;IAEA,MAAM,QACJgB,UAA+B,EAC/BrC,WAAyB,EACzBD,GAA0B,EAC1BuC,gBAAoC,EAKnC;YA4BGrC;QA3BJE,OACE,AAAsB,YAAtB,OAAOkC,cAA2B,AAAsB,YAAtB,OAAOA,YACzC,CAAC,+CAA+C,EAAE,OAAOA,YAAY;QAEvE,MAAMjC,iBAAiB,IAAI,CAAC,iBAAiB;QAC7C,IAAI,CAAC,iBAAiB,GAAGC;QAEzB,MAAMO,UAAU,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAE9C,MAAMM,YAAYC,KAAK,GAAG;QAE1B,MAAM,EAAEC,WAAW,EAAEI,KAAK,EAAE,GAAG,MAAMe,qBAAwB;YAC3D3B;YACA,WAAWyB;YACXC;YACA,eAAevC;YACfC;QACF;QAEA,MAAM2B,WAAWR,KAAK,GAAG,KAAKD;QAC9B,MAAMU,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACT;QAC9B;QAEA,IAAIU;QACJ,IAAI,QAAA7B,CAAAA,sBAAAA,YAAY,MAAM,AAAD,IAAjBA,KAAAA,IAAAA,oBAAoB,MAAM,EAC5B6B,WAAW,CAAC,qBAAqB,EAAEV,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAGpE,MAAMW,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACTM;YACF;YACA,gBAAgB,EAAE;YAClB,MAAM;YACNT;YACA,OAAOE;QACT;QAEA,MAAM,EAAEU,IAAI,EAAEC,OAAO,EAAE,GAAGrB,eAAe,CAAC;QAG1Ce,gBACE;YACE,GAAGJ,QAAQ;YACXS;QACF,GACApC;QAGF,IAAI0B,YAAY,CAACU,QAAQ,CAACzC,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,eAAe,AAAD,GAC3C,MAAM,IAAIqC,MAAMN;QAGlB,OAAO;YACLU;YACAC;YACAjB;QACF;IACF;IAEA,MAAM,SACJkB,MAA+B,EAC/B1C,WAAyB,EACzBD,GAEC,EACwD;QACzDI,OAAOuC,QAAQ;QACf,MAAM9B,UAAU,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAC9C,MAAM,EAAE+B,gBAAgB,EAAEC,IAAI,EAAE,GAAGhC;QACnCT,OAAOwC,kBAAkB;QAEzB,MAAM,EAAEjC,MAAM,EAAE,GAAGV;QACnB,MAAM6C,eAAeC;QAGrB,MAAMC,kBAAkB;QACxB,MAAMC,aAAmBC,MAAM,OAAO,CAACP,UACnC;YACE,MAAMQ,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC/C,KAAKG,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC9C,OAAOA;YACP,QAAQA;QACV,IACAL;QAEJ,IAAIS,eAAe,MAAMC,wBAAwB;YAC/C,gBAAgBT;YAChBC;YACA,sBAAsB;gBACpB;oBACE,MAAMI;gBACR;aACD;YACD,iBAAiB;QACnB;QAEA,IAAIjD,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,SAAS,EAAE;YAClB,MAAMc,aAAawC,iBAAiBL,YAAYpC,QAAQ,IAAI,EAAEF;YAC9Df,MAAM,4BAA4BkB;YAClCsC,eAAe,MAAMG,WACnBH,cACAtC,YACAH,AAAW,cAAXA;QAEJ;QAEA,MAAM6C,OAAe;YACnB;gBAAE,MAAM;gBAAU,SAASV;YAAa;YACxC;gBACE,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKM;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;SACD;QAED,MAAMK,WAAW,IAAI,CAClB,UAAU;QAEb,MAAMC,MAAM,MAAMD,SAChBD,MACAG,aAAa,gBAAgB,EAC7B1D;QAGF,MAAM,EAAE2D,OAAO,EAAE,GAAGF;QACpBtD,OAAO,CAACwD,QAAQ,KAAK,EAAE,CAAC,iBAAiB,EAAEA,QAAQ,KAAK,EAAE;QAC1DxD,OAAOwD,QAAQ,WAAW,EAAE;QAC5B,OAAOA;IACT;IAvUA,YACE/C,OAEmE,EACnEb,GAAoB,CACpB;QAhBF;QAIA,qCACE6D;QAEF;QAEA;QAQEzD,OAAOS,SAAS;QAChB,IAAI,AAAmB,cAAnB,OAAOA,SACT,IAAI,CAAC,kBAAkB,GAAGA;aAE1B,IAAI,CAAC,kBAAkB,GAAG,IAAMiD,QAAQ,OAAO,CAACjD;QAIlD,IAAI,AAA2B,WAApBb,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,UAAU,AAAD,GACvB,IAAI,CAAC,UAAU,GAAGA,IAAI,UAAU;QAElC,IAAI,AAAyB,WAAlBA,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,QAAQ,AAAD,GACrB,IAAI,CAAC,QAAQ,GAAGA,IAAI,QAAQ;IAEhC;AAoTF"}
@@ -1 +1 @@
1
- {"version":3,"file":"types.mjs","sources":["webpack://@midscene/core/./src/types.ts"],"sourcesContent":["/* eslint-disable @typescript-eslint/no-explicit-any */\n\nimport type { NodeType } from '@midscene/shared/constants';\nimport type { TModelConfigFn } from '@midscene/shared/env';\nimport type {\n BaseElement,\n ElementTreeNode,\n Rect,\n Size,\n} from '@midscene/shared/types';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport type { z } from 'zod';\nimport type { TUserPrompt } from './ai-model/common';\nimport type { DetailedLocateParam, MidsceneYamlFlowItem } from './yaml';\n\nexport type {\n ElementTreeNode,\n BaseElement,\n Rect,\n Size,\n Point,\n} from '@midscene/shared/types';\nexport * from './yaml';\n\nexport type AIUsageInfo = Record<string, any> & {\n prompt_tokens: number | undefined;\n completion_tokens: number | undefined;\n total_tokens: number | undefined;\n time_cost: number | undefined;\n model_name: string | undefined;\n model_description: string | undefined;\n intent: string | undefined;\n};\n\n/**\n * openai\n *\n */\nexport enum AIResponseFormat {\n JSON = 'json_object',\n TEXT = 'text',\n}\n\nexport type AISingleElementResponseById = {\n id: string;\n reason?: string;\n text?: string;\n xpaths?: string[];\n};\n\nexport type AISingleElementResponseByPosition = {\n position?: {\n x: number;\n y: number;\n };\n bbox?: [number, number, number, number];\n reason: string;\n text: string;\n};\n\nexport type AISingleElementResponse = AISingleElementResponseById;\nexport interface AIElementLocatorResponse {\n elements: {\n id: string;\n reason?: string;\n text?: string;\n xpaths?: string[];\n }[];\n bbox?: [number, number, number, number];\n isOrderSensitive?: boolean;\n errors?: string[];\n}\n\nexport interface AIElementCoordinatesResponse {\n bbox: [number, number, number, number];\n isOrderSensitive?: boolean;\n errors?: string[];\n}\n\nexport type AIElementResponse =\n | AIElementLocatorResponse\n | AIElementCoordinatesResponse;\n\nexport interface AIDataExtractionResponse<DataDemand> {\n data: DataDemand;\n errors?: string[];\n thought?: string;\n}\n\nexport interface AISectionLocatorResponse {\n bbox: [number, number, number, number];\n references_bbox?: [number, number, number, number][];\n error?: string;\n}\n\nexport interface AIAssertionResponse {\n pass: boolean;\n thought: string;\n}\n\nexport interface AIDescribeElementResponse {\n description: string;\n error?: string;\n}\n\nexport interface LocatorValidatorOption {\n centerDistanceThreshold?: number;\n}\n\nexport interface LocateValidatorResult {\n pass: boolean;\n rect: Rect;\n center: [number, number];\n centerDistance?: number;\n}\n\nexport interface AgentDescribeElementAtPointResult {\n prompt: string;\n deepThink: boolean;\n verifyResult?: LocateValidatorResult;\n}\n\n/**\n * context\n */\n\nexport abstract class UIContext<ElementType extends BaseElement = BaseElement> {\n abstract screenshotBase64: string;\n\n abstract tree: ElementTreeNode<ElementType>;\n\n abstract size: Size;\n\n abstract _isFrozen?: boolean;\n}\n\n/**\n * insight\n */\n\nexport type CallAIFn = <T>(\n messages: ChatCompletionMessageParam[],\n) => Promise<T>;\n\nexport interface InsightOptions {\n taskInfo?: Omit<InsightTaskInfo, 'durationMs'>;\n aiVendorFn?: CallAIFn;\n}\n\nexport type EnsureObject<T> = { [K in keyof T]: any };\n\nexport type InsightAction = 'locate' | 'extract' | 'assert' | 'describe';\n\nexport type InsightExtractParam = string | Record<string, string>;\n\nexport type LocateResultElement = {\n center: [number, number];\n rect: Rect;\n id: string;\n indexId?: number;\n xpaths: string[];\n attributes: {\n nodeType: NodeType;\n [key: string]: string;\n };\n isOrderSensitive?: boolean;\n};\n\nexport interface LocateResult {\n element: LocateResultElement | null;\n rect?: Rect;\n}\n\nexport interface InsightTaskInfo {\n durationMs: number;\n formatResponse?: string;\n rawResponse?: string;\n usage?: AIUsageInfo;\n searchArea?: Rect;\n searchAreaRawResponse?: string;\n searchAreaUsage?: AIUsageInfo;\n}\n\nexport interface DumpMeta {\n sdkVersion: string;\n logTime: number;\n}\n\nexport interface ReportDumpWithAttributes {\n dumpString: string;\n attributes?: Record<string, any>;\n}\n\nexport interface InsightDump extends DumpMeta {\n type: 'locate' | 'extract' | 'assert';\n logId: string;\n userQuery: {\n element?: TUserPrompt;\n dataDemand?: InsightExtractParam;\n assertion?: TUserPrompt;\n };\n matchedElement: BaseElement[];\n matchedRect?: Rect;\n deepThink?: boolean;\n data: any;\n assertionPass?: boolean;\n assertionThought?: string;\n taskInfo: InsightTaskInfo;\n error?: string;\n output?: any;\n}\n\nexport type PartialInsightDumpFromSDK = Omit<\n InsightDump,\n 'sdkVersion' | 'logTime' | 'logId' | 'model_name'\n>;\n\nexport type DumpSubscriber = (dump: InsightDump) => Promise<void> | void;\n\n// intermediate variables to optimize the return value by AI\nexport interface LiteUISection {\n name: string;\n description: string;\n sectionCharacteristics: string;\n textIds: string[];\n}\n\nexport type ElementById = (id: string) => BaseElement | null;\n\nexport type InsightAssertionResponse = AIAssertionResponse & {\n usage?: AIUsageInfo;\n};\n\n/**\n * agent\n */\n\nexport type OnTaskStartTip = (tip: string) => Promise<void> | void;\n\nexport interface AgentWaitForOpt {\n checkIntervalMs?: number;\n timeoutMs?: number;\n}\n\nexport interface AgentAssertOpt {\n keepRawResponse?: boolean;\n}\n\n/**\n * planning\n *\n */\n\nexport interface PlanningLocateParam extends DetailedLocateParam {\n id?: string;\n bbox?: [number, number, number, number];\n}\n\nexport interface PlanningAction<ParamType = any> {\n thought?: string;\n type: string;\n param: ParamType;\n locate?: PlanningLocateParam | null;\n}\n\nexport interface PlanningAIResponse {\n action?: PlanningAction; // this is the qwen mode\n actions?: PlanningAction[];\n more_actions_needed_by_instruction: boolean;\n log: string;\n sleep?: number;\n error?: string;\n usage?: AIUsageInfo;\n rawResponse?: string;\n yamlFlow?: MidsceneYamlFlowItem[];\n yamlString?: string;\n}\n\nexport type PlanningActionParamTap = null;\nexport type PlanningActionParamHover = null;\nexport type PlanningActionParamRightClick = null;\n\nexport interface PlanningActionParamInputOrKeyPress {\n value: string;\n autoDismissKeyboard?: boolean;\n}\n\nexport interface PlanningActionParamSleep {\n timeMs: number;\n}\n\nexport interface PlanningActionParamError {\n thought: string;\n}\n\nexport type PlanningActionParamWaitFor = AgentWaitForOpt & {};\n\nexport interface AndroidLongPressParam {\n duration?: number;\n}\n\nexport interface AndroidPullParam {\n direction: 'up' | 'down';\n distance?: number;\n duration?: number;\n}\n/**\n * misc\n */\n\nexport interface Color {\n name: string;\n hex: string;\n}\n\nexport interface BaseAgentParserOpt {\n selector?: string;\n}\n// eslint-disable-next-line @typescript-eslint/no-empty-interface\nexport interface PuppeteerParserOpt extends BaseAgentParserOpt {}\n\n// eslint-disable-next-line @typescript-eslint/no-empty-interface\nexport interface PlaywrightParserOpt extends BaseAgentParserOpt {}\n\n/*\naction\n*/\nexport interface ExecutionTaskProgressOptions {\n onTaskStart?: (task: ExecutionTask) => Promise<void> | void;\n}\n\nexport interface ExecutionRecorderItem {\n type: 'screenshot';\n ts: number;\n screenshot?: string;\n timing?: string;\n}\n\nexport type ExecutionTaskType =\n | 'Planning'\n | 'Insight'\n | 'Action'\n | 'Assertion'\n | 'Log';\n\nexport interface ExecutorContext {\n task: ExecutionTask;\n element?: LocateResultElement | null;\n}\n\nexport interface ExecutionTaskApply<\n Type extends ExecutionTaskType = any,\n TaskParam = any,\n TaskOutput = any,\n TaskLog = any,\n> {\n type: Type;\n subType?: string;\n param?: TaskParam;\n thought?: string;\n locate?: PlanningLocateParam | null;\n uiContext?: UIContext;\n executor: (\n param: TaskParam,\n context: ExecutorContext,\n ) => // biome-ignore lint/suspicious/noConfusingVoidType: <explanation>\n | Promise<ExecutionTaskReturn<TaskOutput, TaskLog> | undefined | void>\n | undefined\n | void;\n}\n\nexport interface ExecutionTaskHitBy {\n from: string;\n context: Record<string, any>;\n}\n\nexport interface ExecutionTaskReturn<TaskOutput = unknown, TaskLog = unknown> {\n output?: TaskOutput;\n log?: TaskLog;\n recorder?: ExecutionRecorderItem[];\n hitBy?: ExecutionTaskHitBy;\n}\n\nexport type ExecutionTask<\n E extends ExecutionTaskApply<any, any, any> = ExecutionTaskApply<\n any,\n any,\n any\n >,\n> = E &\n ExecutionTaskReturn<\n E extends ExecutionTaskApply<any, any, infer TaskOutput, any>\n ? TaskOutput\n : unknown,\n E extends ExecutionTaskApply<any, any, any, infer TaskLog>\n ? TaskLog\n : unknown\n > & {\n status: 'pending' | 'running' | 'finished' | 'failed' | 'cancelled';\n error?: Error;\n errorMessage?: string;\n errorStack?: string;\n timing?: {\n start: number;\n end?: number;\n cost?: number;\n };\n usage?: AIUsageInfo;\n };\n\nexport interface ExecutionDump extends DumpMeta {\n name: string;\n description?: string;\n tasks: ExecutionTask[];\n}\n\n/*\ntask - insight-locate\n*/\nexport type ExecutionTaskInsightLocateParam = PlanningLocateParam;\n\nexport interface ExecutionTaskInsightLocateOutput {\n element: LocateResultElement | null;\n}\n\nexport interface ExecutionTaskInsightDumpLog {\n dump?: InsightDump;\n}\n\nexport type ExecutionTaskInsightLocateApply = ExecutionTaskApply<\n 'Insight',\n ExecutionTaskInsightLocateParam,\n ExecutionTaskInsightLocateOutput,\n ExecutionTaskInsightDumpLog\n>;\n\nexport type ExecutionTaskInsightLocate =\n ExecutionTask<ExecutionTaskInsightLocateApply>;\n\n/*\ntask - insight-query\n*/\nexport interface ExecutionTaskInsightQueryParam {\n dataDemand: InsightExtractParam;\n}\n\nexport interface ExecutionTaskInsightQueryOutput {\n data: any;\n}\n\nexport type ExecutionTaskInsightQueryApply = ExecutionTaskApply<\n 'Insight',\n ExecutionTaskInsightQueryParam,\n any,\n ExecutionTaskInsightDumpLog\n>;\n\nexport type ExecutionTaskInsightQuery =\n ExecutionTask<ExecutionTaskInsightQueryApply>;\n\n/*\ntask - assertion\n*/\nexport interface ExecutionTaskInsightAssertionParam {\n assertion: string;\n}\n\nexport type ExecutionTaskInsightAssertionApply = ExecutionTaskApply<\n 'Insight',\n ExecutionTaskInsightAssertionParam,\n InsightAssertionResponse,\n ExecutionTaskInsightDumpLog\n>;\n\nexport type ExecutionTaskInsightAssertion =\n ExecutionTask<ExecutionTaskInsightAssertionApply>;\n\n/*\ntask - action (i.e. interact) \n*/\nexport type ExecutionTaskActionApply<ActionParam = any> = ExecutionTaskApply<\n 'Action',\n ActionParam,\n void,\n void\n>;\n\nexport type ExecutionTaskAction = ExecutionTask<ExecutionTaskActionApply>;\n\n/*\ntask - Log\n*/\n\nexport type ExecutionTaskLogApply<\n LogParam = {\n content: string;\n },\n> = ExecutionTaskApply<'Log', LogParam, void, void>;\n\nexport type ExecutionTaskLog = ExecutionTask<ExecutionTaskLogApply>;\n\n/*\ntask - planning\n*/\n\nexport type ExecutionTaskPlanningApply = ExecutionTaskApply<\n 'Planning',\n {\n userInstruction: string;\n log?: string;\n },\n PlanningAIResponse\n>;\n\nexport type ExecutionTaskPlanning = ExecutionTask<ExecutionTaskPlanningApply>;\n\n/*\nGrouped dump\n*/\nexport interface GroupedActionDump {\n groupName: string;\n groupDescription?: string;\n modelBriefs: string[];\n executions: ExecutionDump[];\n}\n\nexport type InterfaceType =\n | 'puppeteer'\n | 'playwright'\n | 'static'\n | 'chrome-extension-proxy'\n | 'android'\n | string;\n\nexport interface StreamingCodeGenerationOptions {\n /** Whether to enable streaming output */\n stream?: boolean;\n /** Callback function to handle streaming chunks */\n onChunk?: StreamingCallback;\n /** Callback function to handle streaming completion */\n onComplete?: (finalCode: string) => void;\n /** Callback function to handle streaming errors */\n onError?: (error: Error) => void;\n}\n\nexport type StreamingCallback = (chunk: CodeGenerationChunk) => void;\n\nexport interface CodeGenerationChunk {\n /** The incremental content chunk */\n content: string;\n /** The reasoning content */\n reasoning_content: string;\n /** The accumulated content so far */\n accumulated: string;\n /** Whether this is the final chunk */\n isComplete: boolean;\n /** Token usage information if available */\n usage?: AIUsageInfo;\n}\n\nexport interface StreamingAIResponse {\n /** The final accumulated content */\n content: string;\n /** Token usage information */\n usage?: AIUsageInfo;\n /** Whether the response was streamed */\n isStreamed: boolean;\n}\n\nexport interface DeviceAction<T = any> {\n name: string;\n description?: string;\n interfaceAlias?: string;\n paramSchema?: z.ZodType<T>;\n call: (param: T, context: ExecutorContext) => Promise<void> | void;\n}\n\n/**\n * Web-specific types\n */\nexport interface WebElementInfo extends BaseElement {\n id: string;\n attributes: {\n nodeType: NodeType;\n [key: string]: string;\n };\n}\n\nexport type WebUIContext = UIContext<WebElementInfo>;\n\n/**\n * Agent\n */\n\nexport interface AgentOpt {\n testId?: string;\n cacheId?: string;\n groupName?: string;\n groupDescription?: string;\n /* if auto generate report, default true */\n generateReport?: boolean;\n /* if auto print report msg, default true */\n autoPrintReportMsg?: boolean;\n onTaskStartTip?: OnTaskStartTip;\n aiActionContext?: string;\n /* custom report file name */\n reportFileName?: string;\n modelConfig?: TModelConfigFn;\n}\n"],"names":["AIResponseFormat","UIContext"],"mappings":";AAsCO,IAAKA,yBAAgBA,WAAAA,GAAAA,SAAhBA,gBAAgB;;;WAAhBA;;AAwFL,MAAeC;AAQtB"}
1
+ {"version":3,"file":"types.mjs","sources":["webpack://@midscene/core/./src/types.ts"],"sourcesContent":["/* eslint-disable @typescript-eslint/no-explicit-any */\n\nimport type { NodeType } from '@midscene/shared/constants';\nimport type { TModelConfigFn } from '@midscene/shared/env';\nimport type {\n BaseElement,\n ElementTreeNode,\n Rect,\n Size,\n} from '@midscene/shared/types';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport type { z } from 'zod';\nimport type { TUserPrompt } from './ai-model/common';\nimport type { DetailedLocateParam, MidsceneYamlFlowItem } from './yaml';\n\nexport type {\n ElementTreeNode,\n BaseElement,\n Rect,\n Size,\n Point,\n} from '@midscene/shared/types';\nexport * from './yaml';\n\nexport type AIUsageInfo = Record<string, any> & {\n prompt_tokens: number | undefined;\n completion_tokens: number | undefined;\n total_tokens: number | undefined;\n time_cost: number | undefined;\n model_name: string | undefined;\n model_description: string | undefined;\n intent: string | undefined;\n};\n\n/**\n * openai\n *\n */\nexport enum AIResponseFormat {\n JSON = 'json_object',\n TEXT = 'text',\n}\n\nexport type AISingleElementResponseById = {\n id: string;\n reason?: string;\n text?: string;\n xpaths?: string[];\n};\n\nexport type AISingleElementResponseByPosition = {\n position?: {\n x: number;\n y: number;\n };\n bbox?: [number, number, number, number];\n reason: string;\n text: string;\n};\n\nexport type AISingleElementResponse = AISingleElementResponseById;\nexport interface AIElementLocatorResponse {\n elements: {\n id: string;\n reason?: string;\n text?: string;\n xpaths?: string[];\n }[];\n bbox?: [number, number, number, number];\n isOrderSensitive?: boolean;\n errors?: string[];\n}\n\nexport interface AIElementCoordinatesResponse {\n bbox: [number, number, number, number];\n isOrderSensitive?: boolean;\n errors?: string[];\n}\n\nexport type AIElementResponse =\n | AIElementLocatorResponse\n | AIElementCoordinatesResponse;\n\nexport interface AIDataExtractionResponse<DataDemand> {\n data: DataDemand;\n errors?: string[];\n thought?: string;\n}\n\nexport interface AISectionLocatorResponse {\n bbox: [number, number, number, number];\n references_bbox?: [number, number, number, number][];\n error?: string;\n}\n\nexport interface AIAssertionResponse {\n pass: boolean;\n thought: string;\n}\n\nexport interface AIDescribeElementResponse {\n description: string;\n error?: string;\n}\n\nexport interface LocatorValidatorOption {\n centerDistanceThreshold?: number;\n}\n\nexport interface LocateValidatorResult {\n pass: boolean;\n rect: Rect;\n center: [number, number];\n centerDistance?: number;\n}\n\nexport interface AgentDescribeElementAtPointResult {\n prompt: string;\n deepThink: boolean;\n verifyResult?: LocateValidatorResult;\n}\n\n/**\n * context\n */\n\nexport abstract class UIContext<ElementType extends BaseElement = BaseElement> {\n abstract screenshotBase64: string;\n\n abstract tree: ElementTreeNode<ElementType>;\n\n abstract size: Size;\n\n abstract _isFrozen?: boolean;\n}\n\nexport type EnsureObject<T> = { [K in keyof T]: any };\n\nexport type InsightAction = 'locate' | 'extract' | 'assert' | 'describe';\n\nexport type InsightExtractParam = string | Record<string, string>;\n\nexport type LocateResultElement = {\n center: [number, number];\n rect: Rect;\n id: string;\n indexId?: number;\n xpaths: string[];\n attributes: {\n nodeType: NodeType;\n [key: string]: string;\n };\n isOrderSensitive?: boolean;\n};\n\nexport interface LocateResult {\n element: LocateResultElement | null;\n rect?: Rect;\n}\n\nexport interface InsightTaskInfo {\n durationMs: number;\n formatResponse?: string;\n rawResponse?: string;\n usage?: AIUsageInfo;\n searchArea?: Rect;\n searchAreaRawResponse?: string;\n searchAreaUsage?: AIUsageInfo;\n}\n\nexport interface DumpMeta {\n sdkVersion: string;\n logTime: number;\n}\n\nexport interface ReportDumpWithAttributes {\n dumpString: string;\n attributes?: Record<string, any>;\n}\n\nexport interface InsightDump extends DumpMeta {\n type: 'locate' | 'extract' | 'assert';\n logId: string;\n userQuery: {\n element?: TUserPrompt;\n dataDemand?: InsightExtractParam;\n assertion?: TUserPrompt;\n };\n matchedElement: BaseElement[];\n matchedRect?: Rect;\n deepThink?: boolean;\n data: any;\n assertionPass?: boolean;\n assertionThought?: string;\n taskInfo: InsightTaskInfo;\n error?: string;\n output?: any;\n}\n\nexport type PartialInsightDumpFromSDK = Omit<\n InsightDump,\n 'sdkVersion' | 'logTime' | 'logId' | 'model_name'\n>;\n\nexport type DumpSubscriber = (dump: InsightDump) => Promise<void> | void;\n\n// intermediate variables to optimize the return value by AI\nexport interface LiteUISection {\n name: string;\n description: string;\n sectionCharacteristics: string;\n textIds: string[];\n}\n\nexport type ElementById = (id: string) => BaseElement | null;\n\nexport type InsightAssertionResponse = AIAssertionResponse & {\n usage?: AIUsageInfo;\n};\n\n/**\n * agent\n */\n\nexport type OnTaskStartTip = (tip: string) => Promise<void> | void;\n\nexport interface AgentWaitForOpt {\n checkIntervalMs?: number;\n timeoutMs?: number;\n}\n\nexport interface AgentAssertOpt {\n keepRawResponse?: boolean;\n}\n\n/**\n * planning\n *\n */\n\nexport interface PlanningLocateParam extends DetailedLocateParam {\n id?: string;\n bbox?: [number, number, number, number];\n}\n\nexport interface PlanningAction<ParamType = any> {\n thought?: string;\n type: string;\n param: ParamType;\n locate?: PlanningLocateParam | null;\n}\n\nexport interface PlanningAIResponse {\n action?: PlanningAction; // this is the qwen mode\n actions?: PlanningAction[];\n more_actions_needed_by_instruction: boolean;\n log: string;\n sleep?: number;\n error?: string;\n usage?: AIUsageInfo;\n rawResponse?: string;\n yamlFlow?: MidsceneYamlFlowItem[];\n yamlString?: string;\n}\n\nexport type PlanningActionParamTap = null;\nexport type PlanningActionParamHover = null;\nexport type PlanningActionParamRightClick = null;\n\nexport interface PlanningActionParamInputOrKeyPress {\n value: string;\n autoDismissKeyboard?: boolean;\n}\n\nexport interface PlanningActionParamSleep {\n timeMs: number;\n}\n\nexport interface PlanningActionParamError {\n thought: string;\n}\n\nexport type PlanningActionParamWaitFor = AgentWaitForOpt & {};\n\nexport interface AndroidLongPressParam {\n duration?: number;\n}\n\nexport interface AndroidPullParam {\n direction: 'up' | 'down';\n distance?: number;\n duration?: number;\n}\n/**\n * misc\n */\n\nexport interface Color {\n name: string;\n hex: string;\n}\n\nexport interface BaseAgentParserOpt {\n selector?: string;\n}\n// eslint-disable-next-line @typescript-eslint/no-empty-interface\nexport interface PuppeteerParserOpt extends BaseAgentParserOpt {}\n\n// eslint-disable-next-line @typescript-eslint/no-empty-interface\nexport interface PlaywrightParserOpt extends BaseAgentParserOpt {}\n\n/*\naction\n*/\nexport interface ExecutionTaskProgressOptions {\n onTaskStart?: (task: ExecutionTask) => Promise<void> | void;\n}\n\nexport interface ExecutionRecorderItem {\n type: 'screenshot';\n ts: number;\n screenshot?: string;\n timing?: string;\n}\n\nexport type ExecutionTaskType =\n | 'Planning'\n | 'Insight'\n | 'Action'\n | 'Assertion'\n | 'Log';\n\nexport interface ExecutorContext {\n task: ExecutionTask;\n element?: LocateResultElement | null;\n}\n\nexport interface ExecutionTaskApply<\n Type extends ExecutionTaskType = any,\n TaskParam = any,\n TaskOutput = any,\n TaskLog = any,\n> {\n type: Type;\n subType?: string;\n param?: TaskParam;\n thought?: string;\n locate?: PlanningLocateParam | null;\n uiContext?: UIContext;\n executor: (\n param: TaskParam,\n context: ExecutorContext,\n ) => // biome-ignore lint/suspicious/noConfusingVoidType: <explanation>\n | Promise<ExecutionTaskReturn<TaskOutput, TaskLog> | undefined | void>\n | undefined\n | void;\n}\n\nexport interface ExecutionTaskHitBy {\n from: string;\n context: Record<string, any>;\n}\n\nexport interface ExecutionTaskReturn<TaskOutput = unknown, TaskLog = unknown> {\n output?: TaskOutput;\n log?: TaskLog;\n recorder?: ExecutionRecorderItem[];\n hitBy?: ExecutionTaskHitBy;\n}\n\nexport type ExecutionTask<\n E extends ExecutionTaskApply<any, any, any> = ExecutionTaskApply<\n any,\n any,\n any\n >,\n> = E &\n ExecutionTaskReturn<\n E extends ExecutionTaskApply<any, any, infer TaskOutput, any>\n ? TaskOutput\n : unknown,\n E extends ExecutionTaskApply<any, any, any, infer TaskLog>\n ? TaskLog\n : unknown\n > & {\n status: 'pending' | 'running' | 'finished' | 'failed' | 'cancelled';\n error?: Error;\n errorMessage?: string;\n errorStack?: string;\n timing?: {\n start: number;\n end?: number;\n cost?: number;\n };\n usage?: AIUsageInfo;\n };\n\nexport interface ExecutionDump extends DumpMeta {\n name: string;\n description?: string;\n tasks: ExecutionTask[];\n}\n\n/*\ntask - insight-locate\n*/\nexport type ExecutionTaskInsightLocateParam = PlanningLocateParam;\n\nexport interface ExecutionTaskInsightLocateOutput {\n element: LocateResultElement | null;\n}\n\nexport interface ExecutionTaskInsightDumpLog {\n dump?: InsightDump;\n}\n\nexport type ExecutionTaskInsightLocateApply = ExecutionTaskApply<\n 'Insight',\n ExecutionTaskInsightLocateParam,\n ExecutionTaskInsightLocateOutput,\n ExecutionTaskInsightDumpLog\n>;\n\nexport type ExecutionTaskInsightLocate =\n ExecutionTask<ExecutionTaskInsightLocateApply>;\n\n/*\ntask - insight-query\n*/\nexport interface ExecutionTaskInsightQueryParam {\n dataDemand: InsightExtractParam;\n}\n\nexport interface ExecutionTaskInsightQueryOutput {\n data: any;\n}\n\nexport type ExecutionTaskInsightQueryApply = ExecutionTaskApply<\n 'Insight',\n ExecutionTaskInsightQueryParam,\n any,\n ExecutionTaskInsightDumpLog\n>;\n\nexport type ExecutionTaskInsightQuery =\n ExecutionTask<ExecutionTaskInsightQueryApply>;\n\n/*\ntask - assertion\n*/\nexport interface ExecutionTaskInsightAssertionParam {\n assertion: string;\n}\n\nexport type ExecutionTaskInsightAssertionApply = ExecutionTaskApply<\n 'Insight',\n ExecutionTaskInsightAssertionParam,\n InsightAssertionResponse,\n ExecutionTaskInsightDumpLog\n>;\n\nexport type ExecutionTaskInsightAssertion =\n ExecutionTask<ExecutionTaskInsightAssertionApply>;\n\n/*\ntask - action (i.e. interact) \n*/\nexport type ExecutionTaskActionApply<ActionParam = any> = ExecutionTaskApply<\n 'Action',\n ActionParam,\n void,\n void\n>;\n\nexport type ExecutionTaskAction = ExecutionTask<ExecutionTaskActionApply>;\n\n/*\ntask - Log\n*/\n\nexport type ExecutionTaskLogApply<\n LogParam = {\n content: string;\n },\n> = ExecutionTaskApply<'Log', LogParam, void, void>;\n\nexport type ExecutionTaskLog = ExecutionTask<ExecutionTaskLogApply>;\n\n/*\ntask - planning\n*/\n\nexport type ExecutionTaskPlanningApply = ExecutionTaskApply<\n 'Planning',\n {\n userInstruction: string;\n log?: string;\n },\n PlanningAIResponse\n>;\n\nexport type ExecutionTaskPlanning = ExecutionTask<ExecutionTaskPlanningApply>;\n\n/*\nGrouped dump\n*/\nexport interface GroupedActionDump {\n groupName: string;\n groupDescription?: string;\n modelBriefs: string[];\n executions: ExecutionDump[];\n}\n\nexport type InterfaceType =\n | 'puppeteer'\n | 'playwright'\n | 'static'\n | 'chrome-extension-proxy'\n | 'android'\n | string;\n\nexport interface StreamingCodeGenerationOptions {\n /** Whether to enable streaming output */\n stream?: boolean;\n /** Callback function to handle streaming chunks */\n onChunk?: StreamingCallback;\n /** Callback function to handle streaming completion */\n onComplete?: (finalCode: string) => void;\n /** Callback function to handle streaming errors */\n onError?: (error: Error) => void;\n}\n\nexport type StreamingCallback = (chunk: CodeGenerationChunk) => void;\n\nexport interface CodeGenerationChunk {\n /** The incremental content chunk */\n content: string;\n /** The reasoning content */\n reasoning_content: string;\n /** The accumulated content so far */\n accumulated: string;\n /** Whether this is the final chunk */\n isComplete: boolean;\n /** Token usage information if available */\n usage?: AIUsageInfo;\n}\n\nexport interface StreamingAIResponse {\n /** The final accumulated content */\n content: string;\n /** Token usage information */\n usage?: AIUsageInfo;\n /** Whether the response was streamed */\n isStreamed: boolean;\n}\n\nexport interface DeviceAction<T = any> {\n name: string;\n description?: string;\n interfaceAlias?: string;\n paramSchema?: z.ZodType<T>;\n call: (param: T, context: ExecutorContext) => Promise<void> | void;\n}\n\n/**\n * Web-specific types\n */\nexport interface WebElementInfo extends BaseElement {\n id: string;\n attributes: {\n nodeType: NodeType;\n [key: string]: string;\n };\n}\n\nexport type WebUIContext = UIContext<WebElementInfo>;\n\n/**\n * Agent\n */\n\nexport interface AgentOpt {\n testId?: string;\n cacheId?: string;\n groupName?: string;\n groupDescription?: string;\n /* if auto generate report, default true */\n generateReport?: boolean;\n /* if auto print report msg, default true */\n autoPrintReportMsg?: boolean;\n onTaskStartTip?: OnTaskStartTip;\n aiActionContext?: string;\n /* custom report file name */\n reportFileName?: string;\n modelConfig?: TModelConfigFn;\n}\n"],"names":["AIResponseFormat","UIContext"],"mappings":";AAsCO,IAAKA,yBAAgBA,WAAAA,GAAAA,SAAhBA,gBAAgB;;;WAAhBA;;AAwFL,MAAeC;AAQtB"}