@aiscene/android 1.8.0 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,25 +1,405 @@
1
- import { AbstractInterface } from '@midscene/core/device';
2
- import type { ActionParam } from '@midscene/core';
3
- import type { ActionReturn } from '@midscene/core';
4
1
  import { ADB } from 'appium-adb';
5
- import { Agent } from '@midscene/core/agent';
6
- import { AgentOpt } from '@midscene/core/agent';
7
- import { AndroidDeviceInputOpt } from '@midscene/core/device';
8
- import { AndroidDeviceOpt } from '@midscene/core/device';
9
- import { BaseMCPServer } from '@midscene/shared/mcp';
10
- import { BaseMidsceneTools } from '@midscene/shared/mcp';
11
- import { DeviceAction } from '@midscene/core';
12
- import type { ElementInfo } from '@midscene/shared/extractor';
13
- import { InterfaceType } from '@midscene/core';
14
- import { LaunchMCPServerOptions } from '@midscene/shared/mcp';
15
- import { LaunchMCPServerResult } from '@midscene/shared/mcp';
16
- import { Point } from '@midscene/core';
17
- import { Size } from '@midscene/core';
18
- import { Tool } from '@midscene/shared/mcp';
19
- import { ToolDefinition } from '@midscene/shared/mcp';
2
+ import type { CreateOpenAIClientFn } from '@midscene/shared/env';
3
+ import type { ElementNode } from '@midscene/shared/extractor';
4
+ import { IModelConfig } from '@midscene/shared/env';
5
+ import { LaunchMCPServerOptions } from '@aiscene/shared/mcp';
6
+ import { LaunchMCPServerResult } from '@aiscene/shared/mcp';
7
+ import type { LocateResultElement } from '@midscene/shared/types';
8
+ import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
9
+ import { ModelConfigManager } from '@midscene/shared/env';
10
+ import { Point } from '@midscene/shared/types';
11
+ import { Rect } from '@midscene/shared/types';
12
+ import { Size } from '@midscene/shared/types';
13
+ import type { TModelConfig } from '@midscene/shared/env';
14
+ import { z } from './lib';
15
+
16
+ declare abstract class AbstractInterface {
17
+ abstract interfaceType: string;
18
+ abstract screenshotBase64(): Promise<string>;
19
+ abstract size(): Promise<Size>;
20
+ abstract actionSpace(): DeviceAction[];
21
+ abstract cacheFeatureForPoint?(center: [number, number], options?: {
22
+ targetDescription?: string;
23
+ modelConfig?: IModelConfig;
24
+ }): Promise<ElementCacheFeature>;
25
+ abstract rectMatchesCacheFeature?(feature: ElementCacheFeature): Promise<Rect>;
26
+ abstract destroy?(): Promise<void>;
27
+ abstract describe?(): string;
28
+ abstract beforeInvokeAction?(actionName: string, param: any): Promise<void>;
29
+ abstract afterInvokeAction?(actionName: string, param: any): Promise<void>;
30
+ registerFileChooserListener?(handler: (chooser: FileChooserHandler) => Promise<void>): Promise<{
31
+ dispose: () => void;
32
+ getError: () => Error | undefined;
33
+ }>;
34
+ abstract getElementsNodeTree?: () => Promise<ElementNode>;
35
+ abstract url?: () => string | Promise<string>;
36
+ abstract evaluateJavaScript?<T = any>(script: string): Promise<T>;
37
+ /**
38
+ * Get the current time from the device.
39
+ * Returns the device's current timestamp in milliseconds.
40
+ * This is useful when the host system time and the device time are not synchronized.
41
+ */
42
+ getTimestamp?(): Promise<number>;
43
+ /** URL of native MJPEG stream for real-time screen preview (e.g. WDA MJPEG server) */
44
+ mjpegStreamUrl?: string;
45
+ }
20
46
 
21
47
  declare type ActionArgs<T extends DeviceAction> = [ActionParam<T>] extends [undefined] ? [] : [ActionParam<T>];
22
48
 
49
+ /**
50
+ * Type utilities for extracting types from DeviceAction definitions
51
+ */
52
+ /**
53
+ * Extract parameter type from a DeviceAction
54
+ */
55
+ declare type ActionParam<Action extends DeviceAction<any, any>> = Action extends DeviceAction<infer P, any> ? P : never;
56
+
57
+ /**
58
+ * Extract return type from a DeviceAction
59
+ */
60
+ declare type ActionReturn<Action extends DeviceAction<any, any>> = Action extends DeviceAction<any, infer R> ? R : never;
61
+
62
+ declare type ActionScrollParam = {
63
+ direction?: 'down' | 'up' | 'right' | 'left';
64
+ scrollType?: ScrollType;
65
+ distance?: number | null;
66
+ locate?: LocateResultElement;
67
+ };
68
+
69
+ /**
70
+ * Action space item definition
71
+ * Note: Intentionally no index signature to maintain compatibility with DeviceAction
72
+ */
73
+ declare interface ActionSpaceItem {
74
+ name: string;
75
+ description?: string;
76
+ args?: Record<string, unknown>;
77
+ paramSchema?: z.ZodTypeAny;
78
+ }
79
+
80
+ declare class Agent<InterfaceType extends AbstractInterface = AbstractInterface> {
81
+ interface: InterfaceType;
82
+ service: Service;
83
+ dump: GroupedActionDump;
84
+ reportFile?: string | null;
85
+ reportFileName?: string;
86
+ taskExecutor: TaskExecutor;
87
+ opts: AgentOpt;
88
+ /**
89
+ * If true, the agent will not perform any actions
90
+ */
91
+ dryMode: boolean;
92
+ onTaskStartTip?: OnTaskStartTip;
93
+ taskCache?: TaskCache;
94
+ private dumpUpdateListeners;
95
+ get onDumpUpdate(): ((dump: string, executionDump?: ExecutionDump) => void) | undefined;
96
+ set onDumpUpdate(callback: ((dump: string, executionDump?: ExecutionDump) => void) | undefined);
97
+ destroyed: boolean;
98
+ modelConfigManager: ModelConfigManager;
99
+ /**
100
+ * Frozen page context for consistent AI operations
101
+ */
102
+ private frozenUIContext?;
103
+ private get aiActContext();
104
+ /**
105
+ * Flag to track if VL model warning has been shown
106
+ */
107
+ private hasWarnedNonVLModel;
108
+ private executionDumpIndexByRunner;
109
+ private fullActionSpace;
110
+ private reportGenerator;
111
+ get page(): InterfaceType;
112
+ /**
113
+ * Ensures VL model warning is shown once when needed
114
+ */
115
+ private ensureVLModelWarning;
116
+ private resolveReplanningCycleLimit;
117
+ constructor(interfaceInstance: InterfaceType, opts?: AgentOpt);
118
+ getActionSpace(): Promise<DeviceAction[]>;
119
+ private static readonly CONTEXT_RETRY_MAX;
120
+ private static readonly CONTEXT_RETRY_DELAY_MS;
121
+ /**
122
+ * Override in subclasses to indicate which errors are transient and should
123
+ * trigger an automatic retry when building the UI context.
124
+ * Returns `false` by default (no retry).
125
+ */
126
+ protected isRetryableContextError(_error: unknown): boolean;
127
+ getUIContext(action?: ServiceAction): Promise<UIContext>;
128
+ _snapshotContext(): Promise<UIContext>;
129
+ /**
130
+ * @deprecated Use {@link setAIActContext} instead.
131
+ */
132
+ setAIActionContext(prompt: string): Promise<void>;
133
+ setAIActContext(prompt: string): Promise<void>;
134
+ resetDump(): GroupedActionDump;
135
+ appendExecutionDump(execution: ExecutionDump, runner?: TaskRunner): void;
136
+ dumpDataString(opt?: {
137
+ inlineScreenshots?: boolean;
138
+ }): string;
139
+ reportHTMLString(opt?: {
140
+ inlineScreenshots?: boolean;
141
+ }): string;
142
+ private lastExecutionDump?;
143
+ writeOutActionDumps(executionDump?: ExecutionDump): void;
144
+ private getGroupMeta;
145
+ private callbackOnTaskStartTip;
146
+ wrapActionInActionSpace<T extends DeviceAction>(name: string): (param: ActionParam<T>) => Promise<ActionReturn<T>>;
147
+ callActionInActionSpace<T = any>(type: string, opt?: T): Promise<any>;
148
+ aiTap(locatePrompt: TUserPrompt, opt?: LocateOption & {
149
+ fileChooserAccept?: string | string[];
150
+ }): Promise<any>;
151
+ aiRightClick(locatePrompt: TUserPrompt, opt?: LocateOption): Promise<any>;
152
+ aiDoubleClick(locatePrompt: TUserPrompt, opt?: LocateOption): Promise<any>;
153
+ aiHover(locatePrompt: TUserPrompt, opt?: LocateOption): Promise<any>;
154
+ aiInput(locatePrompt: TUserPrompt, opt: LocateOption & {
155
+ value: string | number;
156
+ } & {
157
+ autoDismissKeyboard?: boolean;
158
+ } & {
159
+ mode?: 'replace' | 'clear' | 'typeOnly' | 'append';
160
+ }): Promise<any>;
161
+ /**
162
+ * @deprecated Use aiInput(locatePrompt, opt) instead where opt contains the value
163
+ */
164
+ aiInput(value: string | number, locatePrompt: TUserPrompt, opt?: LocateOption & {
165
+ autoDismissKeyboard?: boolean;
166
+ } & {
167
+ mode?: 'replace' | 'clear' | 'typeOnly' | 'append';
168
+ }): Promise<any>;
169
+ aiKeyboardPress(locatePrompt: TUserPrompt, opt: LocateOption & {
170
+ keyName: string;
171
+ }): Promise<any>;
172
+ /**
173
+ * @deprecated Use aiKeyboardPress(locatePrompt, opt) instead where opt contains the keyName
174
+ */
175
+ aiKeyboardPress(keyName: string, locatePrompt?: TUserPrompt, opt?: LocateOption): Promise<any>;
176
+ aiScroll(locatePrompt: TUserPrompt | undefined, opt: LocateOption & ScrollParam): Promise<any>;
177
+ /**
178
+ * @deprecated Use aiScroll(locatePrompt, opt) instead where opt contains the scroll parameters
179
+ */
180
+ aiScroll(scrollParam: ScrollParam, locatePrompt?: TUserPrompt, opt?: LocateOption): Promise<any>;
181
+ aiPinch(locatePrompt: TUserPrompt | undefined, opt: LocateOption & {
182
+ direction: 'in' | 'out';
183
+ distance?: number;
184
+ duration?: number;
185
+ }): Promise<any>;
186
+ aiAct(taskPrompt: string, opt?: AiActOptions): Promise<string | undefined>;
187
+ /**
188
+ * @deprecated Use {@link Agent.aiAct} instead.
189
+ */
190
+ aiAction(taskPrompt: string, opt?: AiActOptions): Promise<string | undefined>;
191
+ aiQuery<ReturnType = any>(demand: ServiceExtractParam, opt?: ServiceExtractOption): Promise<ReturnType>;
192
+ aiBoolean(prompt: TUserPrompt, opt?: ServiceExtractOption): Promise<boolean>;
193
+ aiNumber(prompt: TUserPrompt, opt?: ServiceExtractOption): Promise<number>;
194
+ aiString(prompt: TUserPrompt, opt?: ServiceExtractOption): Promise<string>;
195
+ aiAsk(prompt: TUserPrompt, opt?: ServiceExtractOption): Promise<string>;
196
+ describeElementAtPoint(center: [number, number], opt?: {
197
+ verifyPrompt?: boolean;
198
+ retryLimit?: number;
199
+ deepLocate?: boolean;
200
+ } & LocatorValidatorOption): Promise<AgentDescribeElementAtPointResult>;
201
+ verifyLocator(prompt: string, locateOpt: LocateOption | undefined, expectCenter: [number, number], verifyLocateOption?: LocatorValidatorOption): Promise<LocateValidatorResult>;
202
+ aiLocate(prompt: TUserPrompt, opt?: LocateOption): Promise<Pick<LocateResultElement, "rect" | "center">>;
203
+ aiAssert(assertion: TUserPrompt, msg?: string, opt?: AgentAssertOpt & ServiceExtractOption): Promise<{
204
+ pass: boolean;
205
+ thought: string | undefined;
206
+ message: string | undefined;
207
+ } | undefined>;
208
+ aiWaitFor(assertion: TUserPrompt, opt?: AgentWaitForOpt): Promise<void>;
209
+ ai(...args: Parameters<typeof Agent.aiAct>): Promise<string | undefined>;
210
+ runYaml(yamlScriptContent: string): Promise<{
211
+ result: Record<string, any>;
212
+ }>;
213
+ evaluateJavaScript(script: string): Promise<any>;
214
+ /**
215
+ * Add a dump update listener
216
+ * @param listener Listener function
217
+ * @returns A remove function that can be called to remove this listener
218
+ */
219
+ addDumpUpdateListener(listener: (dump: string, executionDump?: ExecutionDump) => void): () => void;
220
+ /**
221
+ * Remove a dump update listener
222
+ * @param listener The listener function to remove
223
+ */
224
+ removeDumpUpdateListener(listener: (dump: string, executionDump?: ExecutionDump) => void): void;
225
+ /**
226
+ * Clear all dump update listeners
227
+ */
228
+ clearDumpUpdateListeners(): void;
229
+ destroy(): Promise<void>;
230
+ recordToReport(title?: string, opt?: {
231
+ content: string;
232
+ }): Promise<void>;
233
+ /**
234
+ * @deprecated Use {@link Agent.recordToReport} instead.
235
+ */
236
+ logScreenshot(title?: string, opt?: {
237
+ content: string;
238
+ }): Promise<void>;
239
+ _unstableLogContent(): {
240
+ groupName: string;
241
+ groupDescription: string | undefined;
242
+ executions: ExecutionDump[];
243
+ };
244
+ /**
245
+ * Freezes the current page context to be reused in subsequent AI operations
246
+ * This avoids recalculating page context for each operation
247
+ */
248
+ freezePageContext(): Promise<void>;
249
+ /**
250
+ * Unfreezes the page context, allowing AI operations to calculate context dynamically
251
+ */
252
+ unfreezePageContext(): Promise<void>;
253
+ /**
254
+ * Process cache configuration and return normalized cache settings
255
+ */
256
+ private processCacheConfig;
257
+ private normalizeFilePaths;
258
+ private normalizeFileInput;
259
+ /**
260
+ * Manually flush cache to file
261
+ * @param options - Optional configuration
262
+ * @param options.cleanUnused - If true, removes unused cache records before flushing
263
+ */
264
+ flushCache(options?: {
265
+ cleanUnused?: boolean;
266
+ }): Promise<void>;
267
+ }
268
+
269
+ declare interface AgentAssertOpt {
270
+ keepRawResponse?: boolean;
271
+ }
272
+
273
+ declare interface AgentDescribeElementAtPointResult {
274
+ prompt: string;
275
+ deepLocate: boolean;
276
+ verifyResult?: LocateValidatorResult;
277
+ }
278
+
279
+ declare interface AgentOpt {
280
+ testId?: string;
281
+ cacheId?: string;
282
+ groupName?: string;
283
+ groupDescription?: string;
284
+ generateReport?: boolean;
285
+ autoPrintReportMsg?: boolean;
286
+ /**
287
+ * Report output format. `'html-and-external-assets'` selects a directory-based report with separate image files.
288
+ *
289
+ * When set to `'html-and-external-assets'`:
290
+ * - Screenshots are saved as PNG files in a `screenshots/` subdirectory
291
+ * - Report is generated as `index.html` with relative image paths
292
+ * - Reduces memory usage and report file size
293
+ *
294
+ * IMPORTANT: 'html-and-external-assets' reports must be served via HTTP server
295
+ * (e.g., `npx serve ./report-dir`). The file:// protocol will not
296
+ * work due to browser CORS restrictions.
297
+ *
298
+ * @default 'single-html'
299
+ */
300
+ outputFormat?: 'single-html' | 'html-and-external-assets';
301
+ onTaskStartTip?: OnTaskStartTip;
302
+ aiActContext?: string;
303
+ aiActionContext?: string;
304
+ reportFileName?: string;
305
+ modelConfig?: TModelConfig;
306
+ cache?: Cache_2;
307
+ /**
308
+ * Maximum number of replanning cycles for aiAct.
309
+ * Defaults to 20 (40 for `vlm-ui-tars`) when not provided.
310
+ * If omitted, the agent will also read `MIDSCENE_REPLANNING_CYCLE_LIMIT` for backward compatibility.
311
+ */
312
+ replanningCycleLimit?: number;
313
+ /**
314
+ * Wait time in milliseconds after each action execution.
315
+ * This allows the UI to settle and stabilize before the next action.
316
+ * Defaults to 300ms when not provided.
317
+ */
318
+ waitAfterAction?: number;
319
+ /**
320
+ * When set to true, Midscene will use the target device's time (Android/iOS)
321
+ * instead of the system time. Useful when the device time differs from the
322
+ * host machine's time. Default: false
323
+ */
324
+ useDeviceTimestamp?: boolean;
325
+ /**
326
+ * Custom screenshot shrink factor to reduce AI token usage.
327
+ * When set, the screenshot will be scaled down by this factor from the physical resolution.
328
+ *
329
+ * Example:
330
+ * - Physical screen width: 3000px, dpr=6
331
+ * - Logical width: 500px
332
+ * - screenshotShrinkFactor: 2
333
+ * - Actual shrunk screenshot width: 3000 / 2 = 1500px
334
+ * - AI analyzes the 1500px screenshot
335
+ * - Coordinates are transformed back to logical (500px) before actions execute
336
+ *
337
+ * Benefits:
338
+ * - Reduces token usage for high-resolution screenshots
339
+ * - Maintains accuracy by scaling coordinates appropriately
340
+ *
341
+ * Must be >= 1 (shrinking only, enlarging is not supported).
342
+ *
343
+ * @default 1 (no shrinking, uses original physical screenshot)
344
+ */
345
+ screenshotShrinkFactor?: number;
346
+ /**
347
+ * Custom OpenAI client factory function
348
+ *
349
+ * If provided, this function will be called to create OpenAI client instances
350
+ * for each AI call, allowing you to:
351
+ * - Wrap clients with observability tools (langsmith, langfuse)
352
+ * - Use custom OpenAI-compatible clients
353
+ * - Apply different configurations based on intent
354
+ *
355
+ * @param config - Resolved model configuration
356
+ * @returns OpenAI client instance (original or wrapped)
357
+ *
358
+ * @example
359
+ * ```typescript
360
+ * createOpenAIClient: async (openai, opts) => {
361
+ * // Wrap with langsmith for planning tasks
362
+ * if (opts.baseURL?.includes('planning')) {
363
+ * return wrapOpenAI(openai, { metadata: { task: 'planning' } });
364
+ * }
365
+ *
366
+ * return openai;
367
+ * }
368
+ * ```
369
+ */
370
+ createOpenAIClient?: CreateOpenAIClientFn;
371
+ }
372
+
373
+ declare interface AgentWaitForOpt extends ServiceExtractOption {
374
+ checkIntervalMs?: number;
375
+ timeoutMs?: number;
376
+ }
377
+
378
+ declare type AiActOptions = {
379
+ cacheable?: boolean;
380
+ fileChooserAccept?: string | string[];
381
+ deepThink?: DeepThinkOption;
382
+ deepLocate?: boolean;
383
+ abortSignal?: AbortSignal;
384
+ };
385
+
386
+ declare interface AIDescribeElementResponse {
387
+ description: string;
388
+ error?: string;
389
+ }
390
+
391
+ declare type AIUsageInfo = Record<string, any> & {
392
+ prompt_tokens: number | undefined;
393
+ completion_tokens: number | undefined;
394
+ total_tokens: number | undefined;
395
+ cached_input: number | undefined;
396
+ time_cost: number | undefined;
397
+ model_name: string | undefined;
398
+ model_description: string | undefined;
399
+ intent: string | undefined;
400
+ request_id: string | undefined;
401
+ };
402
+
23
403
  declare class AndroidAgent extends Agent<AndroidDevice> {
24
404
  /**
25
405
  * Trigger the system back operation on Android devices
@@ -228,6 +608,107 @@ declare class AndroidDevice implements AbstractInterface {
228
608
  hideKeyboard(options?: AndroidDeviceInputOpt, timeoutMs?: number): Promise<boolean>;
229
609
  }
230
610
 
611
+ /**
612
+ * Android device input options
613
+ */
614
+ declare type AndroidDeviceInputOpt = {
615
+ /** Automatically dismiss the keyboard after input is completed */
616
+ autoDismissKeyboard?: boolean;
617
+ /** Strategy for dismissing the keyboard: 'esc-first' tries ESC before BACK, 'back-first' tries BACK before ESC */
618
+ keyboardDismissStrategy?: 'esc-first' | 'back-first';
619
+ };
620
+
621
+ /**
622
+ * Android device options
623
+ */
624
+ declare type AndroidDeviceOpt = {
625
+ /** Path to the ADB executable */
626
+ androidAdbPath?: string;
627
+ /** Remote ADB host address */
628
+ remoteAdbHost?: string;
629
+ /** Remote ADB port */
630
+ remoteAdbPort?: number;
631
+ /** Input method editor strategy: 'always-yadb' always uses yadb, 'yadb-for-non-ascii' uses yadb only for non-ASCII characters */
632
+ imeStrategy?: 'always-yadb' | 'yadb-for-non-ascii';
633
+ /** Display ID to use for this device */
634
+ displayId?: number;
635
+ /** Use physical display ID for screenshot operations */
636
+ usePhysicalDisplayIdForScreenshot?: boolean;
637
+ /** Use physical display ID when looking up display information */
638
+ usePhysicalDisplayIdForDisplayLookup?: boolean;
639
+ /** Custom device actions to register */
640
+ customActions?: DeviceAction<any>[];
641
+ /**
642
+ * @deprecated Use `screenshotShrinkFactor` in AgentOpt instead.
643
+ * This option no longer affects screenshot size sent to AI model.
644
+ */
645
+ screenshotResizeScale?: number;
646
+ /** Always fetch screen info on each call; if false, cache the first result */
647
+ alwaysRefreshScreenInfo?: boolean;
648
+ /** Minimum screenshot buffer size in bytes (default: 10240 = 10KB). Set to 0 to disable validation. */
649
+ minScreenshotBufferSize?: number;
650
+ /**
651
+ * Scrcpy screenshot configuration for high-performance screen capture.
652
+ *
653
+ * Scrcpy provides 6-8x faster screenshots by streaming H.264 video from the device.
654
+ * When enabled, scrcpy will:
655
+ * 1. Start a video stream from the device on first screenshot request
656
+ * 2. Keep the connection alive for subsequent screenshots (16-50ms each)
657
+ * 3. Automatically disconnect after idle timeout to save resources
658
+ * 4. Fall back to standard ADB mode if scrcpy is unavailable
659
+ *
660
+ * @example
661
+ * ```typescript
662
+ * // Enable scrcpy for high-performance screenshots
663
+ * const device = new AndroidDevice(deviceId, {
664
+ * scrcpyConfig: {
665
+ * enabled: true,
666
+ * },
667
+ * });
668
+ *
669
+ * // Custom configuration
670
+ * const device = new AndroidDevice(deviceId, {
671
+ * scrcpyConfig: {
672
+ * enabled: true,
673
+ * maxSize: 0, // 0 = no scaling
674
+ * idleTimeoutMs: 30000,
675
+ * videoBitRate: 8_000_000,
676
+ * },
677
+ * });
678
+ * ```
679
+ */
680
+ scrcpyConfig?: {
681
+ /**
682
+ * Enable scrcpy for high-performance screenshots.
683
+ * @default false
684
+ */
685
+ enabled?: boolean;
686
+ /**
687
+ * Maximum video dimension (width or height).
688
+ * Video stream will be scaled down if device resolution exceeds this value.
689
+ * Lower values reduce bandwidth but may affect image quality.
690
+ *
691
+ * @default 0 (no scaling, use original resolution)
692
+ * @example
693
+ * { maxSize: 1024 } // Scale down only when the device resolution exceeds 1024
694
+ */
695
+ maxSize?: number;
696
+ /**
697
+ * Idle timeout in milliseconds before disconnecting scrcpy.
698
+ * Connection auto-closes after this period of inactivity to save resources.
699
+ * Set to 0 to disable auto-disconnect.
700
+ * @default 30000 (30 seconds)
701
+ */
702
+ idleTimeoutMs?: number;
703
+ /**
704
+ * Video bit rate for H.264 encoding in bits per second.
705
+ * Higher values improve quality but increase bandwidth usage.
706
+ * @default 2000000 (2 Mbps)
707
+ */
708
+ videoBitRate?: number;
709
+ };
710
+ } & AndroidDeviceInputOpt;
711
+
231
712
  /**
232
713
  * Android MCP Server
233
714
  * Provides MCP tools for Android automation through ADB
@@ -250,12 +731,521 @@ declare class AndroidMidsceneTools extends BaseMidsceneTools<AndroidAgent> {
250
731
  protected preparePlatformTools(): ToolDefinition[];
251
732
  }
252
733
 
734
+ /**
735
+ * Base agent interface
736
+ * Represents a platform-specific agent (Android, iOS, Web)
737
+ * Note: Return types use `unknown` for compatibility with platform-specific implementations
738
+ */
739
+ declare interface BaseAgent {
740
+ getActionSpace(): Promise<ActionSpaceItem[]>;
741
+ destroy?(): Promise<void>;
742
+ page?: {
743
+ screenshotBase64(): Promise<string>;
744
+ };
745
+ aiAction?: (description: string, params?: Record<string, unknown>) => Promise<unknown>;
746
+ aiWaitFor?: (assertion: string, options: Record<string, unknown>) => Promise<unknown>;
747
+ }
748
+
749
+ /**
750
+ * Base device interface for temporary device instances
751
+ */
752
+ declare interface BaseDevice {
753
+ actionSpace(): ActionSpaceItem[];
754
+ destroy?(): Promise<void>;
755
+ }
756
+
757
+ /**
758
+ * Base MCP Server class with programmatic launch() API
759
+ * Each platform extends this to provide its own tools manager
760
+ */
761
+ declare abstract class BaseMCPServer {
762
+ protected mcpServer: McpServer;
763
+ protected toolsManager?: IMidsceneTools;
764
+ protected config: BaseMCPServerConfig;
765
+ protected providedToolsManager?: IMidsceneTools;
766
+ constructor(config: BaseMCPServerConfig, toolsManager?: IMidsceneTools);
767
+ /**
768
+ * Platform-specific: create tools manager instance
769
+ * This is only called if no tools manager was provided in constructor
770
+ */
771
+ protected abstract createToolsManager(): IMidsceneTools;
772
+ /**
773
+ * Initialize tools manager and attach to MCP server
774
+ */
775
+ private initializeToolsManager;
776
+ /**
777
+ * Perform cleanup on shutdown
778
+ */
779
+ private performCleanup;
780
+ /**
781
+ * Initialize and launch the MCP server with stdio transport
782
+ */
783
+ launch(): Promise<LaunchMCPServerResult_2>;
784
+ /**
785
+ * Launch MCP server with HTTP transport
786
+ * Supports stateful sessions for web applications and service integration
787
+ */
788
+ launchHttp(options: HttpLaunchOptions): Promise<LaunchMCPServerResult_2>;
789
+ /**
790
+ * Create a new HTTP session with transport
791
+ */
792
+ private createHttpSession;
793
+ /**
794
+ * Start periodic session cleanup for inactive sessions
795
+ */
796
+ private startSessionCleanup;
797
+ /**
798
+ * Setup shutdown handlers for HTTP server
799
+ */
800
+ private setupHttpShutdownHandlers;
801
+ /**
802
+ * Get the underlying MCP server instance
803
+ */
804
+ getServer(): McpServer;
805
+ /**
806
+ * Get the tools manager instance
807
+ */
808
+ getToolsManager(): IMidsceneTools | undefined;
809
+ }
810
+
811
+ declare interface BaseMCPServerConfig {
812
+ name: string;
813
+ version: string;
814
+ description: string;
815
+ }
816
+
817
+ /**
818
+ * Base class for platform-specific MCP tools
819
+ * Generic type TAgent allows subclasses to use their specific agent types
820
+ */
821
+ declare abstract class BaseMidsceneTools<TAgent extends BaseAgent = BaseAgent> implements IMidsceneTools {
822
+ protected mcpServer?: McpServer;
823
+ protected agent?: TAgent;
824
+ protected toolDefinitions: ToolDefinition[];
825
+ /**
826
+ * Ensure agent is initialized and ready for use.
827
+ * Must be implemented by subclasses to create platform-specific agent.
828
+ * @param initParam Optional initialization parameter (platform-specific, e.g., URL, device ID)
829
+ * @returns Promise resolving to initialized agent instance
830
+ * @throws Error if agent initialization fails
831
+ */
832
+ protected abstract ensureAgent(initParam?: string): Promise<TAgent>;
833
+ /**
834
+ * Optional: prepare platform-specific tools (e.g., device connection)
835
+ */
836
+ protected preparePlatformTools(): ToolDefinition[];
837
+ /**
838
+ * Must be implemented by subclasses to create a temporary device instance
839
+ * This allows getting real actionSpace without connecting to device
840
+ */
841
+ protected abstract createTemporaryDevice(): BaseDevice;
842
+ /**
843
+ * Initialize all tools by querying actionSpace
844
+ * Uses two-layer fallback strategy:
845
+ * 1. Try to get actionSpace from connected agent (if available)
846
+ * 2. Create temporary device instance to read actionSpace (always succeeds)
847
+ */
848
+ initTools(): Promise<void>;
849
+ /**
850
+ * Attach to MCP server and register all tools
851
+ */
852
+ attachToServer(server: McpServer): void;
853
+ /**
854
+ * Cleanup method - destroy agent and release resources
855
+ */
856
+ destroy(): Promise<void>;
857
+ /**
858
+ * Get tool definitions
859
+ */
860
+ getToolDefinitions(): ToolDefinition[];
861
+ /**
862
+ * Set agent for the tools manager
863
+ */
864
+ setAgent(agent: TAgent): void;
865
+ /**
866
+ * Helper: Convert base64 screenshot to image content array
867
+ */
868
+ protected buildScreenshotContent(screenshot: string): {
869
+ type: "image";
870
+ data: string;
871
+ mimeType: string;
872
+ }[];
873
+ /**
874
+ * Helper: Build a simple text result for tool responses
875
+ */
876
+ protected buildTextResult(text: string): {
877
+ content: {
878
+ type: "text";
879
+ text: string;
880
+ }[];
881
+ };
882
+ /**
883
+ * Create a disconnect handler for releasing platform resources
884
+ * @param platformName Human-readable platform name for the response message
885
+ * @returns Handler function that destroys the agent and returns appropriate response
886
+ */
887
+ protected createDisconnectHandler(platformName: string): () => Promise<{
888
+ content: {
889
+ type: "text";
890
+ text: string;
891
+ }[];
892
+ }>;
893
+ }
894
+
895
+ declare type Cache_2 = false | true | CacheConfig;
896
+
897
+ /**
898
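+ * Agent cache configuration: `id` identifies the cache and the optional `strategy` selects its read/write mode.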
899
+ */
900
+ declare type CacheConfig = {
901
+ strategy?: 'read-only' | 'read-write' | 'write-only';
902
+ id: string;
903
+ };
904
+
905
+ declare type CacheFileContent = {
906
+ midsceneVersion: string;
907
+ cacheId: string;
908
+ caches: Array<PlanningCache | LocateCache>;
909
+ };
910
+
911
+ declare type DeepThinkOption = 'unset' | true | false;
912
+
913
+ declare interface DetailedLocateParam extends Omit<LocateOption, 'deepThink' | keyof TMultimodalPrompt> {
914
+ prompt: TUserPrompt;
915
+ }
916
+
917
+ declare interface DeviceAction<TParam = any, TReturn = any> {
918
+ name: string;
919
+ description?: string;
920
+ interfaceAlias?: string;
921
+ paramSchema?: z.ZodType<TParam>;
922
+ call: (param: TParam, context: ExecutorContext) => Promise<TReturn> | TReturn;
923
+ delayAfterRunner?: number;
924
+ /**
925
+ * An example param object for this action.
926
+ * Locate fields with { prompt } will automatically get bbox injected when needed.
927
+ */
928
+ sample?: {
929
+ [K in keyof TParam]?: any;
930
+ };
931
+ }
932
+
253
933
  declare type DeviceActionAndroidBackButton = DeviceAction<undefined, void>;
254
934
 
255
935
  declare type DeviceActionAndroidHomeButton = DeviceAction<undefined, void>;
256
936
 
257
937
  declare type DeviceActionAndroidRecentAppsButton = DeviceAction<undefined, void>;
258
938
 
939
+ declare interface DumpMeta {
940
+ logTime: number;
941
+ }
942
+
943
+ declare type ElementCacheFeature = Record<string, unknown>;
944
+
945
+ declare interface ElementInfo {
946
+ id: string;
947
+ indexId: number;
948
+ nodeHashId: string;
949
+ xpaths?: string[];
950
+ attributes: {
951
+ nodeType: NodeType;
952
+ [key: string]: string;
953
+ };
954
+ nodeType: NodeType;
955
+ content: string;
956
+ rect: {
957
+ left: number;
958
+ top: number;
959
+ width: number;
960
+ height: number;
961
+ };
962
+ center: [number, number];
963
+ isVisible: boolean;
964
+ }
965
+
966
+ /**
967
+ * ExecutionDump class for serializing and deserializing execution dumps
968
+ */
969
+ declare class ExecutionDump implements IExecutionDump {
970
+ id?: string;
971
+ logTime: number;
972
+ name: string;
973
+ description?: string;
974
+ tasks: ExecutionTask[];
975
+ aiActContext?: string;
976
+ constructor(data: IExecutionDump);
977
+ /**
978
+ * Serialize the ExecutionDump to a JSON string
979
+ */
980
+ serialize(indents?: number): string;
981
+ /**
982
+ * Convert to a plain object for JSON serialization
983
+ */
984
+ toJSON(): IExecutionDump;
985
+ /**
986
+ * Create an ExecutionDump instance from a serialized JSON string
987
+ */
988
+ static fromSerializedString(serialized: string): ExecutionDump;
989
+ /**
990
+ * Create an ExecutionDump instance from a plain object
991
+ */
992
+ static fromJSON(data: IExecutionDump): ExecutionDump;
993
+ /**
994
+ * Collect all ScreenshotItem instances from tasks.
995
+ * Scans through uiContext and recorder items to find screenshots.
996
+ *
997
+ * @returns Array of ScreenshotItem instances
998
+ */
999
+ collectScreenshots(): ScreenshotItem[];
1000
+ }
1001
+
1002
+ declare interface ExecutionRecorderItem {
1003
+ type: 'screenshot';
1004
+ ts: number;
1005
+ screenshot?: ScreenshotItem;
1006
+ timing?: string;
1007
+ }
1008
+
1009
+ declare interface ExecutionResult<OutputType = any> {
1010
+ output: OutputType;
1011
+ thought?: string;
1012
+ runner: TaskRunner;
1013
+ }
1014
+
1015
+ declare type ExecutionTask<E extends ExecutionTaskApply<any, any, any> = ExecutionTaskApply<any, any, any>> = E & ExecutionTaskReturn<E extends ExecutionTaskApply<any, any, infer TaskOutput, any> ? TaskOutput : unknown, E extends ExecutionTaskApply<any, any, any, infer TaskLog> ? TaskLog : unknown> & {
1016
+ taskId: string;
1017
+ status: 'pending' | 'running' | 'finished' | 'failed' | 'cancelled';
1018
+ error?: Error;
1019
+ errorMessage?: string;
1020
+ errorStack?: string;
1021
+ timing?: {
1022
+ start: number;
1023
+ getUiContextStart?: number;
1024
+ getUiContextEnd?: number;
1025
+ callAiStart?: number;
1026
+ callAiEnd?: number;
1027
+ beforeInvokeActionHookStart?: number;
1028
+ beforeInvokeActionHookEnd?: number;
1029
+ callActionStart?: number;
1030
+ callActionEnd?: number;
1031
+ afterInvokeActionHookStart?: number;
1032
+ afterInvokeActionHookEnd?: number;
1033
+ captureAfterCallingSnapshotStart?: number;
1034
+ captureAfterCallingSnapshotEnd?: number;
1035
+ end?: number;
1036
+ cost?: number;
1037
+ };
1038
+ usage?: AIUsageInfo;
1039
+ searchAreaUsage?: AIUsageInfo;
1040
+ reasoning_content?: string;
1041
+ };
1042
+
1043
+ declare interface ExecutionTaskApply<Type extends ExecutionTaskType = any, TaskParam = any, TaskOutput = any, TaskLog = any> {
1044
+ type: Type;
1045
+ subType?: string;
1046
+ param?: TaskParam;
1047
+ thought?: string;
1048
+ uiContext?: UIContext;
1049
+ executor: (param: TaskParam, context: ExecutorContext) => Promise<ExecutionTaskReturn<TaskOutput, TaskLog> | undefined | void> | undefined | void;
1050
+ }
1051
+
1052
+ declare interface ExecutionTaskHitBy {
1053
+ from: string;
1054
+ context: Record<string, any>;
1055
+ }
1056
+
1057
+ declare interface ExecutionTaskProgressOptions {
1058
+ onTaskStart?: (task: ExecutionTask) => Promise<void> | void;
1059
+ }
1060
+
1061
+ declare interface ExecutionTaskReturn<TaskOutput = unknown, TaskLog = unknown> {
1062
+ output?: TaskOutput;
1063
+ log?: TaskLog;
1064
+ recorder?: ExecutionRecorderItem[];
1065
+ hitBy?: ExecutionTaskHitBy;
1066
+ }
1067
+
1068
+ declare type ExecutionTaskType = 'Planning' | 'Insight' | 'Action Space' | 'Log';
1069
+
1070
+ declare interface ExecutorContext {
1071
+ task: ExecutionTask;
1072
+ element?: LocateResultElement | null;
1073
+ uiContext?: UIContext;
1074
+ }
1075
+
1076
+ declare interface FileChooserHandler {
1077
+ accept(files: string[]): Promise<void>;
1078
+ }
1079
+
1080
+ /**
1081
+ * GroupedActionDump class for serializing and deserializing grouped action dumps
1082
+ */
1083
+ declare class GroupedActionDump implements IGroupedActionDump {
1084
+ sdkVersion: string;
1085
+ groupName: string;
1086
+ groupDescription?: string;
1087
+ modelBriefs: ModelBrief[];
1088
+ executions: ExecutionDump[];
1089
+ deviceType?: string;
1090
+ constructor(data: IGroupedActionDump);
1091
+ /**
1092
+ * Serialize the GroupedActionDump to a JSON string
1093
+ * Uses compact { $screenshot: id } format
1094
+ */
1095
+ serialize(indents?: number): string;
1096
+ /**
1097
+ * Serialize the GroupedActionDump with inline screenshots to a JSON string.
1098
+ * Each ScreenshotItem is replaced with { base64: "...", capturedAt }.
1099
+ */
1100
+ serializeWithInlineScreenshots(indents?: number): string;
1101
+ /**
1102
+ * Convert to a plain object for JSON serialization
1103
+ */
1104
+ toJSON(): IGroupedActionDump;
1105
+ /**
1106
+ * Create a GroupedActionDump instance from a serialized JSON string
1107
+ */
1108
+ static fromSerializedString(serialized: string): GroupedActionDump;
1109
+ /**
1110
+ * Create a GroupedActionDump instance from a plain object
1111
+ */
1112
+ static fromJSON(data: IGroupedActionDump): GroupedActionDump;
1113
+ /**
1114
+ * Collect all ScreenshotItem instances from all executions.
1115
+ *
1116
+ * @returns Array of all ScreenshotItem instances across all executions
1117
+ */
1118
+ collectAllScreenshots(): ScreenshotItem[];
1119
+ /**
1120
+ * Serialize the dump to files with screenshots as separate PNG files.
1121
+ * Creates:
1122
+ * - {basePath} - dump JSON with { $screenshot: id } references
1123
+ * - {basePath}.screenshots/ - PNG files
1124
+ * - {basePath}.screenshots.json - ID to path mapping
1125
+ *
1126
+ * @param basePath - Base path for the dump file
1127
+ */
1128
+ serializeToFiles(basePath: string): void;
1129
+ /**
1130
+ * Read dump from files and return JSON string with inline screenshots.
1131
+ * Reads the dump JSON and screenshot files, then inlines the base64 data.
1132
+ *
1133
+ * @param basePath - Base path for the dump file
1134
+ * @returns JSON string with inline screenshots ({ base64: "..." } format)
1135
+ */
1136
+ static fromFilesAsInlineJson(basePath: string): string;
1137
+ /**
1138
+ * Clean up all files associated with a serialized dump.
1139
+ *
1140
+ * @param basePath - Base path for the dump file
1141
+ */
1142
+ static cleanupFiles(basePath: string): void;
1143
+ /**
1144
+ * Get all file paths associated with a serialized dump.
1145
+ *
1146
+ * @param basePath - Base path for the dump file
1147
+ * @returns Array of all associated file paths
1148
+ */
1149
+ static getFilePaths(basePath: string): string[];
1150
+ }
1151
+
1152
+ declare interface HttpLaunchOptions {
1153
+ port: number;
1154
+ host?: string;
1155
+ }
1156
+
1157
+ declare interface IExecutionDump extends DumpMeta {
1158
+ /** Stable unique identifier for this execution run */
1159
+ id?: string;
1160
+ name: string;
1161
+ description?: string;
1162
+ tasks: ExecutionTask[];
1163
+ aiActContext?: string;
1164
+ }
1165
+
1166
+ declare interface IGroupedActionDump {
1167
+ sdkVersion: string;
1168
+ groupName: string;
1169
+ groupDescription?: string;
1170
+ modelBriefs: ModelBrief[];
1171
+ executions: IExecutionDump[];
1172
+ deviceType?: string;
1173
+ }
1174
+
1175
+ /**
1176
+ * Interface for platform-specific MCP tools manager
1177
+ */
1178
+ declare interface IMidsceneTools {
1179
+ attachToServer(server: McpServer): void;
1180
+ initTools(): Promise<void>;
1181
+ destroy?(): Promise<void>;
1182
+ }
1183
+
1184
+ declare type InterfaceType = 'puppeteer' | 'playwright' | 'static' | 'chrome-extension-proxy' | 'android' | string;
1185
+
1186
+ declare interface LaunchMCPServerResult_2 {
1187
+ /**
1188
+ * The MCP server port (for HTTP mode)
1189
+ */
1190
+ port?: number;
1191
+ /**
1192
+ * The server host (for HTTP mode)
1193
+ */
1194
+ host?: string;
1195
+ /**
1196
+ * Function to gracefully shutdown the MCP server
1197
+ */
1198
+ close: () => Promise<void>;
1199
+ }
1200
+
1201
+ declare interface LocateCache {
1202
+ type: 'locate';
1203
+ prompt: TUserPrompt;
1204
+ cache?: ElementCacheFeature;
1205
+ /** @deprecated kept for backward compatibility */
1206
+ xpaths?: string[];
1207
+ }
1208
+
1209
+ declare interface LocateOption extends Partial<TMultimodalPrompt> {
1210
+ prompt?: TUserPrompt;
1211
+ deepLocate?: boolean;
1212
+ /** @deprecated Use `deepLocate` instead. Kept for backward compatibility. */
1213
+ deepThink?: boolean;
1214
+ cacheable?: boolean;
1215
+ xpath?: string;
1216
+ uiContext?: UIContext;
1217
+ fileChooserAccept?: string | string[];
1218
+ }
1219
+
1220
+ declare interface LocateOpts {
1221
+ context?: UIContext;
1222
+ planLocatedElement?: LocateResultElement;
1223
+ }
1224
+
1225
+ declare interface LocateResult {
1226
+ element: LocateResultElement | null;
1227
+ rect?: Rect;
1228
+ }
1229
+
1230
+ declare type LocateResultWithDump = LocateResult & ServiceResultBase;
1231
+
1232
+ declare interface LocateValidatorResult {
1233
+ pass: boolean;
1234
+ rect: Rect;
1235
+ center: [number, number];
1236
+ centerDistance?: number;
1237
+ }
1238
+
1239
+ declare interface LocatorValidatorOption {
1240
+ centerDistanceThreshold?: number;
1241
+ }
1242
+
1243
+ declare interface MatchCacheResult<T extends PlanningCache | LocateCache> {
1244
+ cacheContent: T;
1245
+ cacheUsable: boolean;
1246
+ updateFn: (cb: (cache: T) => void) => void;
1247
+ }
1248
+
259
1249
  /**
260
1250
  * Create MCP kit for a specific Android Agent
261
1251
  */
@@ -274,6 +1264,518 @@ export declare function mcpServerForAgent(agent: Agent | AndroidAgent): {
274
1264
  launchHttp(options: LaunchMCPServerOptions): Promise<LaunchMCPServerResult>;
275
1265
  };
276
1266
 
1267
+ declare type MidsceneYamlFlowItem = MidsceneYamlFlowItemAIAction | MidsceneYamlFlowItemAIAssert | MidsceneYamlFlowItemAIWaitFor | MidsceneYamlFlowItemEvaluateJavaScript | MidsceneYamlFlowItemSleep | MidsceneYamlFlowItemLogScreenshot;
1268
+
1269
+ declare interface MidsceneYamlFlowItemAIAction {
1270
+ aiAction?: string;
1271
+ ai?: string;
1272
+ aiAct?: string;
1273
+ aiActionProgressTips?: string[];
1274
+ cacheable?: boolean;
1275
+ [key: string]: unknown;
1276
+ }
1277
+
1278
+ declare interface MidsceneYamlFlowItemAIAssert extends ServiceExtractOption {
1279
+ aiAssert: string;
1280
+ errorMessage?: string;
1281
+ name?: string;
1282
+ }
1283
+
1284
+ declare interface MidsceneYamlFlowItemAIWaitFor extends ServiceExtractOption {
1285
+ aiWaitFor: string;
1286
+ timeout?: number;
1287
+ }
1288
+
1289
+ declare interface MidsceneYamlFlowItemEvaluateJavaScript {
1290
+ javascript: string;
1291
+ name?: string;
1292
+ }
1293
+
1294
+ declare interface MidsceneYamlFlowItemLogScreenshot {
1295
+ logScreenshot?: string;
1296
+ recordToReport?: string;
1297
+ content?: string;
1298
+ }
1299
+
1300
+ declare interface MidsceneYamlFlowItemSleep {
1301
+ sleep: number;
1302
+ }
1303
+
1304
+ declare interface ModelBrief {
1305
+ /**
1306
+ * The intent/category of the model call, for example "planning" or "insight".
1307
+ */
1308
+ intent?: string;
1309
+ /**
1310
+ * The model name returned by usage metadata, for example "gpt-4o".
1311
+ */
1312
+ name?: string;
1313
+ /**
1314
+ * Optional human-readable model description, for example "qwen2.5-vl mode".
1315
+ */
1316
+ modelDescription?: string;
1317
+ }
1318
+
1319
+ declare enum NodeType {
1320
+ CONTAINER = "CONTAINER Node",
1321
+ FORM_ITEM = "FORM_ITEM Node",
1322
+ BUTTON = "BUTTON Node",
1323
+ A = "Anchor Node",
1324
+ IMG = "IMG Node",
1325
+ TEXT = "TEXT Node",
1326
+ POSITION = "POSITION Node"
1327
+ }
1328
+
1329
+ /**
1330
+ * Agent-level callback that receives a progress tip string and may return a promise or nothing. Presumably invoked when a task starts (inferred from the name only) - confirm against the caller.
1331
+ */
1332
+ declare type OnTaskStartTip = (tip: string) => Promise<void> | void;
1333
+
1334
+ declare interface PlanningAction<ParamType = any> {
1335
+ thought?: string;
1336
+ log?: string;
1337
+ type: string;
1338
+ param: ParamType;
1339
+ }
1340
+
1341
+ declare type PlanningActionParamWaitFor = AgentWaitForOpt & {};
1342
+
1343
+ declare interface PlanningCache {
1344
+ type: 'plan';
1345
+ prompt: string;
1346
+ yamlWorkflow: string;
1347
+ }
1348
+
1349
+ /**
1350
+ * Planning-stage locate parameter: everything in DetailedLocateParam plus an optional `bbox`.
1351
+ * `bbox` is a four-number tuple; its coordinate convention is not visible here - presumably a bounding box, confirm with the producer.
1352
+ */
1353
+ declare interface PlanningLocateParam extends DetailedLocateParam {
1354
+ bbox?: [number, number, number, number];
1355
+ }
1356
+
1357
+ /**
1358
+ * ScreenshotItem encapsulates screenshot data.
1359
+ *
1360
+ * Supports lazy loading after memory release:
1361
+ * - inline mode: reads from HTML file using streaming (extractImageByIdSync)
1362
+ * - directory mode: reads from file on disk
1363
+ *
1364
+ * After persistence, memory is released but the screenshot can be recovered
1365
+ * on-demand from disk, making it safe to release memory at any time.
1366
+ */
1367
+ declare class ScreenshotItem {
1368
+ private _id;
1369
+ private _base64;
1370
+ private _format;
1371
+ private _capturedAt;
1372
+ private _persistedAs;
1373
+ private _persistedPath;
1374
+ private _persistedHtmlPath;
1375
+ private constructor();
1376
+ /** Create a new ScreenshotItem from base64 data */
1377
+ static create(base64: string, capturedAt: number): ScreenshotItem;
1378
+ get id(): string;
1379
+ /** Get the image format (png or jpeg) */
1380
+ get format(): 'png' | 'jpeg';
1381
+ /** Get the file extension for this screenshot */
1382
+ get extension(): string;
1383
+ /** Get screenshot capture timestamp in milliseconds */
1384
+ get capturedAt(): number;
1385
+ get base64(): string;
1386
+ /** Check if base64 data is still available in memory (not yet released) */
1387
+ hasBase64(): boolean;
1388
+ /**
1389
+ * Mark as persisted to HTML (inline mode).
1390
+ * Releases base64 memory, but keeps HTML path for lazy loading recovery.
1391
+ * @param htmlPath - absolute path to the HTML file containing the image
1392
+ */
1393
+ markPersistedInline(htmlPath: string): void;
1394
+ /**
1395
+ * Mark as persisted to file (directory mode).
1396
+ * Releases base64 memory, but keeps file path for lazy loading recovery.
1397
+ * @param relativePath - relative path for serialization (e.g., "./screenshots/id.jpeg")
1398
+ * @param absolutePath - absolute path for lazy loading recovery
1399
+ */
1400
+ markPersistedToPath(relativePath: string, absolutePath: string): void;
1401
+ /** Serialize for JSON - format depends on persistence state */
1402
+ toSerializable(): ScreenshotSerializeFormat;
1403
+ /** Check if a value is a serialized ScreenshotItem reference (inline or directory mode) */
1404
+ static isSerialized(value: unknown): value is ScreenshotSerializeFormat;
1405
+ /**
1406
+ * Get base64 data without the data URI prefix.
1407
+ * Useful for writing raw binary data to files.
1408
+ */
1409
+ get rawBase64(): string;
1410
+ }
1411
+
1412
+ /**
1413
+ * Serialization format for ScreenshotItem. Both variants also carry a numeric `capturedAt`.
1414
+ * - { $screenshot: "id", capturedAt } - inline mode, references imageMap in HTML
1415
+ * - { base64: "path", capturedAt } - directory mode, references external file path
1416
+ */
1417
+ declare type ScreenshotSerializeFormat = {
1418
+ $screenshot: string;
1419
+ capturedAt: number;
1420
+ } | {
1421
+ base64: string;
1422
+ capturedAt: number;
1423
+ };
1424
+
1425
+ declare type ScrollParam = Omit<ActionScrollParam, 'locate'>;
1426
+
1427
+ declare type ScrollType = 'singleAction' | 'scrollToBottom' | 'scrollToTop' | 'scrollToRight' | 'scrollToLeft' | 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft';
1428
+
1429
+ declare class Service {
1430
+ contextRetrieverFn: () => Promise<UIContext> | UIContext;
1431
+ taskInfo?: Omit<ServiceTaskInfo, 'durationMs'>;
1432
+ constructor(context: UIContext | (() => Promise<UIContext> | UIContext), opt?: ServiceOptions);
1433
+ locate(query: PlanningLocateParam, opt: LocateOpts, modelConfig: IModelConfig, abortSignal?: AbortSignal): Promise<LocateResultWithDump>;
1434
+ extract<T>(dataDemand: ServiceExtractParam, modelConfig: IModelConfig, opt?: ServiceExtractOption, pageDescription?: string, multimodalPrompt?: TMultimodalPrompt, context?: UIContext): Promise<ServiceExtractResult<T>>;
1435
+ describe(target: Rect | [number, number], modelConfig: IModelConfig, opt?: {
1436
+ deepLocate?: boolean;
1437
+ }): Promise<Pick<AIDescribeElementResponse, 'description'>>;
1438
+ }
1439
+
1440
+ declare type ServiceAction = 'locate' | 'extract' | 'assert' | 'describe';
1441
+
1442
+ declare interface ServiceDump extends DumpMeta {
1443
+ type: 'locate' | 'extract' | 'assert';
1444
+ logId: string;
1445
+ userQuery: {
1446
+ element?: TUserPrompt;
1447
+ dataDemand?: ServiceExtractParam;
1448
+ assertion?: TUserPrompt;
1449
+ };
1450
+ matchedElement: LocateResultElement[];
1451
+ matchedRect?: Rect;
1452
+ deepLocate?: boolean;
1453
+ data: any;
1454
+ assertionPass?: boolean;
1455
+ assertionThought?: string;
1456
+ taskInfo: ServiceTaskInfo;
1457
+ error?: string;
1458
+ output?: any;
1459
+ }
1460
+
1461
+ declare interface ServiceExtractOption {
1462
+ domIncluded?: boolean | 'visible-only';
1463
+ screenshotIncluded?: boolean;
1464
+ [key: string]: unknown;
1465
+ }
1466
+
1467
+ declare type ServiceExtractParam = string | Record<string, string>;
1468
+
1469
+ declare interface ServiceExtractResult<T> extends ServiceResultBase {
1470
+ data: T;
1471
+ thought?: string;
1472
+ usage?: AIUsageInfo;
1473
+ reasoning_content?: string;
1474
+ }
1475
+
1476
+ declare interface ServiceOptions {
1477
+ taskInfo?: Omit<ServiceTaskInfo, 'durationMs'>;
1478
+ }
1479
+
1480
+ declare interface ServiceResultBase {
1481
+ dump: ServiceDump;
1482
+ }
1483
+
1484
+ declare interface ServiceTaskInfo {
1485
+ durationMs: number;
1486
+ formatResponse?: string;
1487
+ rawResponse?: string;
1488
+ usage?: AIUsageInfo;
1489
+ searchArea?: Rect;
1490
+ searchAreaRawResponse?: string;
1491
+ searchAreaUsage?: AIUsageInfo;
1492
+ reasoning_content?: string;
1493
+ }
1494
+
1495
+ declare class TaskCache {
1496
+ cacheId: string;
1497
+ cacheFilePath?: string;
1498
+ cache: CacheFileContent;
1499
+ isCacheResultUsed: boolean;
1500
+ cacheOriginalLength: number;
1501
+ readOnlyMode: boolean;
1502
+ writeOnlyMode: boolean;
1503
+ private matchedCacheIndices;
1504
+ constructor(cacheId: string, isCacheResultUsed: boolean, cacheFilePath?: string, options?: {
1505
+ readOnly?: boolean;
1506
+ writeOnly?: boolean;
1507
+ });
1508
+ matchCache(prompt: TUserPrompt, type: 'plan' | 'locate'): MatchCacheResult<PlanningCache | LocateCache> | undefined;
1509
+ matchPlanCache(prompt: string): MatchCacheResult<PlanningCache> | undefined;
1510
+ matchLocateCache(prompt: TUserPrompt): MatchCacheResult<LocateCache> | undefined;
1511
+ appendCache(cache: PlanningCache | LocateCache): void;
1512
+ loadCacheFromFile(): CacheFileContent | undefined;
1513
+ flushCacheToFile(options?: {
1514
+ cleanUnused?: boolean;
1515
+ }): void;
1516
+ updateOrAppendCacheRecord(newRecord: PlanningCache | LocateCache, cachedRecord?: MatchCacheResult<PlanningCache | LocateCache>): void;
1517
+ }
1518
+
1519
+ declare class TaskExecutionError extends Error {
1520
+ runner: TaskRunner;
1521
+ errorTask: ExecutionTask | null;
1522
+ constructor(message: string, runner: TaskRunner, errorTask: ExecutionTask | null, options?: {
1523
+ cause?: unknown;
1524
+ });
1525
+ }
1526
+
1527
+ declare class TaskExecutor {
1528
+ interface: AbstractInterface;
1529
+ service: Service;
1530
+ taskCache?: TaskCache;
1531
+ private readonly providedActionSpace;
1532
+ private readonly taskBuilder;
1533
+ onTaskStartCallback?: ExecutionTaskProgressOptions['onTaskStart'];
1534
+ private readonly hooks?;
1535
+ replanningCycleLimit?: number;
1536
+ waitAfterAction?: number;
1537
+ useDeviceTimestamp?: boolean;
1538
+ get page(): AbstractInterface;
1539
+ constructor(interfaceInstance: AbstractInterface, service: Service, opts: {
1540
+ taskCache?: TaskCache;
1541
+ onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];
1542
+ replanningCycleLimit?: number;
1543
+ waitAfterAction?: number;
1544
+ useDeviceTimestamp?: boolean;
1545
+ hooks?: TaskExecutorHooks;
1546
+ actionSpace: DeviceAction[];
1547
+ });
1548
+ private createExecutionSession;
1549
+ private getActionSpace;
1550
+ /**
1551
+ * Get a readable time string using device time when configured.
1552
+ * This method respects the useDeviceTimestamp configuration.
1553
+ * @param format - Optional format string
1554
+ * @returns A formatted time string
1555
+ */
1556
+ private getTimeString;
1557
+ convertPlanToExecutable(plans: PlanningAction[], modelConfigForPlanning: IModelConfig, modelConfigForDefaultIntent: IModelConfig, options?: {
1558
+ cacheable?: boolean;
1559
+ deepLocate?: boolean;
1560
+ abortSignal?: AbortSignal;
1561
+ }): Promise<{
1562
+ tasks: ExecutionTaskApply[];
1563
+ }>;
1564
+ loadYamlFlowAsPlanning(userInstruction: string, yamlString: string): Promise<{
1565
+ runner: TaskRunner;
1566
+ }>;
1567
+ runPlans(title: string, plans: PlanningAction[], modelConfigForPlanning: IModelConfig, modelConfigForDefaultIntent: IModelConfig): Promise<ExecutionResult>;
1568
+ action(userPrompt: string, modelConfigForPlanning: IModelConfig, modelConfigForDefaultIntent: IModelConfig, includeBboxInPlanning: boolean, aiActContext?: string, cacheable?: boolean, replanningCycleLimitOverride?: number, imagesIncludeCount?: number, deepThink?: DeepThinkOption, fileChooserAccept?: string[], deepLocate?: boolean, abortSignal?: AbortSignal): Promise<ExecutionResult<{
1569
+ yamlFlow?: MidsceneYamlFlowItem[];
1570
+ output?: string;
1571
+ } | undefined>>;
1572
+ private runAction;
1573
+ private createTypeQueryTask;
1574
+ createTypeQueryExecution<T>(type: 'Query' | 'Boolean' | 'Number' | 'String' | 'Assert', demand: ServiceExtractParam, modelConfig: IModelConfig, opt?: ServiceExtractOption, multimodalPrompt?: TMultimodalPrompt): Promise<ExecutionResult<T>>;
1575
+ waitFor(assertion: TUserPrompt, opt: PlanningActionParamWaitFor, modelConfig: IModelConfig): Promise<ExecutionResult<void>>;
1576
+ }
1577
+
1578
+ declare interface TaskExecutorHooks {
1579
+ onTaskUpdate?: (runner: TaskRunner, error?: TaskExecutionError) => Promise<void> | void;
1580
+ }
1581
+
1582
+ declare class TaskRunner {
1583
+ readonly id: string;
1584
+ name: string;
1585
+ tasks: ExecutionTask[];
1586
+ status: 'init' | 'pending' | 'running' | 'completed' | 'error';
1587
+ onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];
1588
+ private readonly uiContextBuilder;
1589
+ private readonly onTaskUpdate?;
1590
+ private readonly executionLogTime;
1591
+ constructor(name: string, uiContextBuilder: () => Promise<UIContext>, options?: TaskRunnerInitOptions);
1592
+ private emitOnTaskUpdate;
1593
+ private lastUiContext?;
1594
+ private getUiContext;
1595
+ private captureScreenshot;
1596
+ private attachRecorderItem;
1597
+ private markTaskAsPending;
1598
+ private normalizeStatusFromError;
1599
+ append(task: ExecutionTaskApply[] | ExecutionTaskApply, options?: TaskRunnerOperationOptions): Promise<void>;
1600
+ appendAndFlush(task: ExecutionTaskApply[] | ExecutionTaskApply, options?: TaskRunnerOperationOptions): Promise<{
1601
+ output: any;
1602
+ thought?: string;
1603
+ } | undefined>;
1604
+ flush(options?: TaskRunnerOperationOptions): Promise<{
1605
+ output: any;
1606
+ thought?: string;
1607
+ } | undefined>;
1608
+ isInErrorState(): boolean;
1609
+ latestErrorTask(): ExecutionTask | null;
1610
+ dump(): ExecutionDump;
1611
+ appendErrorPlan(errorMsg: string): Promise<{
1612
+ output: undefined;
1613
+ runner: TaskRunner;
1614
+ }>;
1615
+ }
1616
+
1617
+ declare type TaskRunnerInitOptions = ExecutionTaskProgressOptions & {
1618
+ tasks?: ExecutionTaskApply[];
1619
+ onTaskUpdate?: (runner: TaskRunner, error?: TaskExecutionError) => Promise<void> | void;
1620
+ };
1621
+
1622
+ declare type TaskRunnerOperationOptions = {
1623
+ allowWhenError?: boolean;
1624
+ };
1625
+
1626
+ declare type TMultimodalPrompt = z.infer<typeof TMultimodalPromptSchema>;
1627
+
1628
+ declare const TMultimodalPromptSchema: z.ZodObject<{
1629
+ images: z.ZodOptional<z.ZodArray<z.ZodObject<{
1630
+ name: z.ZodString;
1631
+ url: z.ZodString;
1632
+ }, "strip", z.ZodTypeAny, {
1633
+ name: string;
1634
+ url: string;
1635
+ }, {
1636
+ name: string;
1637
+ url: string;
1638
+ }>, "many">>;
1639
+ convertHttpImage2Base64: z.ZodOptional<z.ZodBoolean>;
1640
+ }, "strip", z.ZodTypeAny, {
1641
+ images?: {
1642
+ name: string;
1643
+ url: string;
1644
+ }[] | undefined;
1645
+ convertHttpImage2Base64?: boolean | undefined;
1646
+ }, {
1647
+ images?: {
1648
+ name: string;
1649
+ url: string;
1650
+ }[] | undefined;
1651
+ convertHttpImage2Base64?: boolean | undefined;
1652
+ }>;
1653
+
1654
+ /**
1655
+ * Tool type for mcpKitForAgent return value
1656
+ */
1657
+ declare type Tool = ToolDefinition;
1658
+
1659
+ /**
1660
+ * Tool definition for MCP server
1661
+ */
1662
+ declare interface ToolDefinition<T = Record<string, unknown>> {
1663
+ name: string;
1664
+ description: string;
1665
+ schema: ToolSchema;
1666
+ handler: ToolHandler<T>;
1667
+ }
1668
+
1669
+ /**
1670
+ * Tool handler function type
1671
+ * Takes parsed arguments and returns a tool result
1672
+ */
1673
+ declare type ToolHandler<T = Record<string, unknown>> = (args: T) => Promise<ToolResult>;
1674
+
1675
+ /**
1676
+ * Result type for tool execution (MCP compatible)
1677
+ */
1678
+ declare interface ToolResult {
1679
+ [x: string]: unknown;
1680
+ content: ToolResultContent[];
1681
+ isError?: boolean;
1682
+ _meta?: Record<string, unknown>;
1683
+ }
1684
+
1685
+ /**
1686
+ * Content item types for tool results (MCP compatible)
1687
+ */
1688
+ declare type ToolResultContent = {
1689
+ type: 'text';
1690
+ text: string;
1691
+ } | {
1692
+ type: 'image';
1693
+ data: string;
1694
+ mimeType: string;
1695
+ } | {
1696
+ type: 'audio';
1697
+ data: string;
1698
+ mimeType: string;
1699
+ } | {
1700
+ type: 'resource';
1701
+ resource: {
1702
+ text: string;
1703
+ uri: string;
1704
+ mimeType?: string;
1705
+ } | {
1706
+ uri: string;
1707
+ blob: string;
1708
+ mimeType?: string;
1709
+ };
1710
+ };
1711
+
1712
+ /**
1713
+ * Tool schema type using Zod
1714
+ */
1715
+ declare type ToolSchema = Record<string, z.ZodTypeAny>;
1716
+
1717
+ declare type TUserPrompt = z.infer<typeof TUserPromptSchema>;
1718
+
1719
+ declare const TUserPromptSchema: z.ZodUnion<[z.ZodString, z.ZodIntersection<z.ZodObject<{
1720
+ prompt: z.ZodString;
1721
+ }, "strip", z.ZodTypeAny, {
1722
+ prompt: string;
1723
+ }, {
1724
+ prompt: string;
1725
+ }>, z.ZodObject<{
1726
+ images: z.ZodOptional<z.ZodOptional<z.ZodArray<z.ZodObject<{
1727
+ name: z.ZodString;
1728
+ url: z.ZodString;
1729
+ }, "strip", z.ZodTypeAny, {
1730
+ name: string;
1731
+ url: string;
1732
+ }, {
1733
+ name: string;
1734
+ url: string;
1735
+ }>, "many">>>;
1736
+ convertHttpImage2Base64: z.ZodOptional<z.ZodOptional<z.ZodBoolean>>;
1737
+ }, "strip", z.ZodTypeAny, {
1738
+ images?: {
1739
+ name: string;
1740
+ url: string;
1741
+ }[] | undefined;
1742
+ convertHttpImage2Base64?: boolean | undefined;
1743
+ }, {
1744
+ images?: {
1745
+ name: string;
1746
+ url: string;
1747
+ }[] | undefined;
1748
+ convertHttpImage2Base64?: boolean | undefined;
1749
+ }>>]>;
1750
+
1751
+ /**
1752
+ * Abstract description of a captured UI state: the screenshot, its (shrunk) size, and the ratio for mapping screenshot coordinates back to logical coordinates.
1753
+ */
1754
+ declare abstract class UIContext {
1755
+ /**
1756
+ * Screenshot of the current UI state. Its size is `shotSize`, i.e. it has already been shrunk by screenshotShrinkFactor.
1757
+ */
1758
+ abstract screenshot: ScreenshotItem;
1759
+ /**
1760
+ * Size of the screenshot after shrinking.
1761
+ */
1762
+ abstract shotSize: Size;
1763
+ /**
1764
+ * The ratio for converting shrunk screenshot coordinates to logical coordinates.
1765
+ *
1766
+ * Example:
1767
+ * - Physical screen width: 3000px, dpr=6
1768
+ * - Logical width: 500px
1769
+ * - User-defined screenshotShrinkFactor: 2
1770
+ * - Actual shrunk screenshot width: 3000 / 2 = 1500px
1771
+ * - shrunkShotToLogicalRatio: dpr / screenshotShrinkFactor = 6 / 2 = 3
1772
+ * - To map back to logical coordinates: 1500 / shrunkShotToLogicalRatio = 500px
1773
+ */
1774
+ abstract shrunkShotToLogicalRatio: number;
1775
+ abstract _isFrozen?: boolean;
1776
+ abstract deprecatedDpr?: number;
1777
+ }
1778
+
277
1779
  /**
278
1780
  * Helper type to convert DeviceAction to wrapped method signature
279
1781
  */