@midscene/mcp 0.19.1 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/API.mdx CHANGED
@@ -108,7 +108,8 @@ function aiTap(locate: string, options?: Object): Promise<void>;
108
108
 
109
109
  - `locate: string` - A natural language description of the element to tap.
110
110
  - `options?: Object` - Optional, a configuration object containing:
111
- - `deepThink?: boolean` - If true, Midscene will call AI model twice to precisely locate the element.
111
+ - `deepThink?: boolean` - If true, Midscene will call the AI model twice to precisely locate the element. False by default.
112
+ - `xpath?: string` - The XPath of the element to operate on. If provided, Midscene will first use this XPath to locate the element before using the cache and the AI model. Empty by default.
112
113
  - `cacheable?: boolean` - Whether the result can be cached when the [caching feature](./caching.mdx) is enabled. True by default.
113
114
 
114
115
  - Return Value:
@@ -140,7 +141,8 @@ function aiHover(locate: string, options?: Object): Promise<void>;
140
141
 
141
142
  - `locate: string` - A natural language description of the element to hover over.
142
143
  - `options?: Object` - Optional, a configuration object containing:
143
- - `deepThink?: boolean` - If true, Midscene will call AI model twice to precisely locate the element.
144
+ - `deepThink?: boolean` - If true, Midscene will call the AI model twice to precisely locate the element. False by default.
145
+ - `xpath?: string` - The XPath of the element to operate on. If provided, Midscene will first use this XPath to locate the element before using the cache and the AI model. Empty by default.
144
146
  - `cacheable?: boolean` - Whether the result can be cached when the [caching feature](./caching.mdx) is enabled. True by default.
145
147
 
146
148
  - Return Value:
@@ -168,7 +170,8 @@ function aiInput(text: string, locate: string, options?: Object): Promise<void>;
168
170
  - `text: string` - The final text content that should be placed in the input element. Use blank string to clear the input.
169
171
  - `locate: string` - A natural language description of the element to input text into.
170
172
  - `options?: Object` - Optional, a configuration object containing:
171
- - `deepThink?: boolean` - If true, Midscene will call AI model twice to precisely locate the element.
173
+ - `deepThink?: boolean` - If true, Midscene will call the AI model twice to precisely locate the element. False by default.
174
+ - `xpath?: string` - The XPath of the element to operate on. If provided, Midscene will first use this XPath to locate the element before using the cache and the AI model. Empty by default.
172
175
  - `cacheable?: boolean` - Whether the result can be cached when the [caching feature](./caching.mdx) is enabled. True by default.
173
176
  - `autoDismissKeyboard?: boolean` - If true, the keyboard will be dismissed after the text is input. Only available on Android. (Default: true)
174
177
 
@@ -201,7 +204,8 @@ function aiKeyboardPress(
201
204
  - `key: string` - The web key to press, e.g. 'Enter', 'Tab', 'Escape', etc. Key combinations are not supported.
202
205
  - `locate?: string` - Optional, a natural language description of the element to press the key on.
203
206
  - `options?: Object` - Optional, a configuration object containing:
204
- - `deepThink?: boolean` - If true, Midscene will call AI model twice to precisely locate the element.
207
+ - `deepThink?: boolean` - If true, Midscene will call the AI model twice to precisely locate the element. False by default.
208
+ - `xpath?: string` - The XPath of the element to operate on. If provided, Midscene will first use this XPath to locate the element before using the cache and the AI model. Empty by default.
205
209
  - `cacheable?: boolean` - Whether the result can be cached when the [caching feature](./caching.mdx) is enabled. True by default.
206
210
 
207
211
  - Return Value:
@@ -236,7 +240,8 @@ function aiScroll(
236
240
  - `distance: number` - Optional, the distance to scroll in px.
237
241
  - `locate?: string` - Optional, a natural language description of the element to scroll on. If not provided, Midscene will perform scroll on the current mouse position.
238
242
  - `options?: Object` - Optional, a configuration object containing:
239
- - `deepThink?: boolean` - If true, Midscene will call AI model twice to precisely locate the element.
243
+ - `deepThink?: boolean` - If true, Midscene will call the AI model twice to precisely locate the element. False by default.
244
+ - `xpath?: string` - The XPath of the element to operate on. If provided, Midscene will first use this XPath to locate the element before using the cache and the AI model. Empty by default.
240
245
  - `cacheable?: boolean` - Whether the result can be cached when the [caching feature](./caching.mdx) is enabled. True by default.
241
246
 
242
247
  - Return Value:
@@ -266,7 +271,8 @@ function aiRightClick(locate: string, options?: Object): Promise<void>;
266
271
 
267
272
  - `locate: string` - A natural language description of the element to right-click on.
268
273
  - `options?: Object` - Optional, a configuration object containing:
269
- - `deepThink?: boolean` - If true, Midscene will call AI model twice to precisely locate the element.
274
+ - `deepThink?: boolean` - If true, Midscene will call the AI model twice to precisely locate the element. False by default.
275
+ - `xpath?: string` - The XPath of the element to operate on. If provided, Midscene will first use this XPath to locate the element before using the cache and the AI model. Empty by default.
270
276
  - `cacheable?: boolean` - Whether the result can be cached when the [caching feature](./caching.mdx) is enabled. True by default.
271
277
 
272
278
  - Return Value:
@@ -286,15 +292,45 @@ await agent.aiRightClick('The file name at the top of the page', {
286
292
 
287
293
  :::tip About the `deepThink` feature
288
294
 
289
- The `deepThink` feature is a powerful feature that allows Midscene to call AI model twice to precisely locate the element. It is useful when the AI model find it hard to distinguish the element from its surroundings.
295
+ The `deepThink` feature is a powerful feature that allows Midscene to call the AI model twice to precisely locate the element. It is false by default. It is useful when the AI model finds it hard to distinguish the element from its surroundings.
290
296
 
291
297
  :::
292
298
 
293
299
  ## Data Extraction
294
300
 
301
+ ### `agent.aiAsk()`
302
+
303
+ Ask the AI model any question about the current page. It returns the AI model's answer as a string.
304
+
305
+ - Type
306
+
307
+ ```typescript
308
+ function aiAsk(prompt: string, options?: Object): Promise<string>;
309
+ ```
310
+
311
+ - Parameters:
312
+
313
+ - `prompt: string` - A natural language description of the question.
314
+ - `options?: Object` - Optional, a configuration object containing:
315
+ - `domIncluded?: boolean | 'visible-only'` - Whether to send simplified DOM information to the model, usually used for extracting invisible attributes like image links. If set to `'visible-only'`, only the visible elements will be sent. Default: False.
316
+ - `screenshotIncluded?: boolean` - Whether to send screenshot to the model. Default: True.
317
+
318
+ - Return Value:
319
+
320
+ - Returns a Promise that resolves to the answer from the AI model.
321
+
322
+ - Examples:
323
+
324
+ ```typescript
325
+ const result = await agent.aiAsk('What should I do to test this page?');
326
+ console.log(result); // Output the answer from the AI model
327
+ ```
328
+
329
+ Besides `aiAsk`, you can also use `aiQuery` to extract structured data from the UI.
330
+
295
331
  ### `agent.aiQuery()`
296
332
 
297
- This method allows you to extract data directly from the UI using multimodal AI reasoning capabilities. Simply define the expected format (e.g., string, number, JSON, or an array) in the `dataDemand`, and Midscene will return a result that matches the format.
333
+ This method allows you to extract structured data from the current page. Simply define the expected format (e.g., string, number, JSON, or an array) in the `dataDemand`, and Midscene will return a result that matches the format.
298
334
 
299
335
  - Type
300
336
 
@@ -501,7 +537,8 @@ function aiLocate(
501
537
 
502
538
  - `locate: string` - A natural language description of the element to locate.
503
539
  - `options?: Object` - Optional, a configuration object containing:
504
- - `deepThink?: boolean` - If true, Midscene will call AI model twice to precisely locate the element.
540
+ - `deepThink?: boolean` - If true, Midscene will call the AI model twice to precisely locate the element. False by default.
541
+ - `xpath?: string` - The XPath of the element to operate on. If provided, Midscene will first use this XPath to locate the element before using the cache and the AI model. Empty by default.
505
542
  - `cacheable?: boolean` - Whether the result can be cached when the [caching feature](./caching.mdx) is enabled. True by default.
506
543
 
507
544
  - Return Value: