npm - @aiscene/android - Versions diffs - 1.8.0 → 1.8.1 - Mend

@aiscene/android 1.8.0 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/dist/es/cli.mjs +17 -17
package/dist/es/index.mjs +15 -15
package/dist/es/mcp-server.mjs +17 -17
package/dist/lib/cli.js +16 -16
package/dist/lib/index.js +14 -14
package/dist/lib/mcp-server.js +16 -16
package/dist/types/index.d.ts +1448 -16
package/dist/types/mcp-server.d.ts +1520 -18
package/package.json +3 -3

package/dist/types/index.d.ts CHANGED

@@ -1,26 +1,407 @@
-import { AbstractInterface } from '@midscene/core/device';
-import type { ActionParam } from '@midscene/core';
-import type { ActionReturn } from '@midscene/core';
 import { ADB } from 'appium-adb';
 import type { Adb } from '@yume-chan/adb';
-import { Agent } from '@midscene/core/agent';
-import { AgentOpt } from '@midscene/core/agent';
-import { AndroidDeviceInputOpt } from '@midscene/core/device';
-import { AndroidDeviceOpt } from '@midscene/core/device';
-import { BaseMidsceneTools } from '@midscene/shared/mcp';
+import type { CreateOpenAIClientFn } from '@midscene/shared/env';
 import { Device } from 'appium-adb';
-import { DeviceAction } from '@midscene/core';
-import type { ElementInfo } from '@midscene/shared/extractor';
-import { InterfaceType } from '@midscene/core';
-import { overrideAIConfig } from '@midscene/shared/env';
-import { Point } from '@midscene/core';
-import { Size } from '@midscene/core';
-import { ToolDefinition } from '@midscene/shared/mcp';
+import type { ElementNode } from '@midscene/shared/extractor';
+import { IModelConfig } from '@midscene/shared/env';
+import type { LocateResultElement } from '@midscene/shared/types';
+import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
+import { ModelConfigManager } from '@midscene/shared/env';
+import { Point } from '@midscene/shared/types';
+import { Rect } from '@midscene/shared/types';
+import { Size } from '@midscene/shared/types';
+import type { TModelConfig } from '@midscene/shared/env';
+import { z } from './lib';
+declare abstract class AbstractInterface {
+    abstract interfaceType: string;
+    abstract screenshotBase64(): Promise<string>;
+    abstract size(): Promise<Size>;
+    abstract actionSpace(): DeviceAction[];
+    abstract cacheFeatureForPoint?(center: [number, number], options?: {
+        targetDescription?: string;
+        modelConfig?: IModelConfig;
+    }): Promise<ElementCacheFeature>;
+    abstract rectMatchesCacheFeature?(feature: ElementCacheFeature): Promise<Rect>;
+    abstract destroy?(): Promise<void>;
+    abstract describe?(): string;
+    abstract beforeInvokeAction?(actionName: string, param: any): Promise<void>;
+    abstract afterInvokeAction?(actionName: string, param: any): Promise<void>;
+    registerFileChooserListener?(handler: (chooser: FileChooserHandler) => Promise<void>): Promise<{
+        dispose: () => void;
+        getError: () => Error | undefined;
+    }>;
+    abstract getElementsNodeTree?: () => Promise<ElementNode>;
+    abstract url?: () => string | Promise<string>;
+    abstract evaluateJavaScript?<T = any>(script: string): Promise<T>;
+    /**
+     * Get the current time from the device.
+     * Returns the device's current timestamp in milliseconds.
+     * This is useful when the system time and device time are not synchronized.
+     */
+    getTimestamp?(): Promise<number>;
+    /** URL of native MJPEG stream for real-time screen preview (e.g. WDA MJPEG server) */
+    mjpegStreamUrl?: string;
+}
 declare type ActionArgs<T extends DeviceAction> = [ActionParam<T>] extends [undefined] ? [] : [ActionParam<T>];
+/**
+ * Type utilities for extracting types from DeviceAction definitions
+ */
+/**
+ * Extract parameter type from a DeviceAction
+ */
+declare type ActionParam<Action extends DeviceAction<any, any>> = Action extends DeviceAction<infer P, any> ? P : never;
+/**
+ * Extract return type from a DeviceAction
+ */
+declare type ActionReturn<Action extends DeviceAction<any, any>> = Action extends DeviceAction<any, infer R> ? R : never;
+declare type ActionScrollParam = {
+    direction?: 'down' | 'up' | 'right' | 'left';
+    scrollType?: ScrollType;
+    distance?: number | null;
+    locate?: LocateResultElement;
+};
+/**
+ * Action space item definition
+ * Note: Intentionally no index signature to maintain compatibility with DeviceAction
+ */
+declare interface ActionSpaceItem {
+    name: string;
+    description?: string;
+    args?: Record<string, unknown>;
+    paramSchema?: z.ZodTypeAny;
+}
+declare class Agent<InterfaceType extends AbstractInterface = AbstractInterface> {
+    interface: InterfaceType;
+    service: Service;
+    dump: GroupedActionDump;
+    reportFile?: string | null;
+    reportFileName?: string;
+    taskExecutor: TaskExecutor;
+    opts: AgentOpt;
+    /**
+     * If true, the agent will not perform any actions
+     */
+    dryMode: boolean;
+    onTaskStartTip?: OnTaskStartTip;
+    taskCache?: TaskCache;
+    private dumpUpdateListeners;
+    get onDumpUpdate(): ((dump: string, executionDump?: ExecutionDump) => void) | undefined;
+    set onDumpUpdate(callback: ((dump: string, executionDump?: ExecutionDump) => void) | undefined);
+    destroyed: boolean;
+    modelConfigManager: ModelConfigManager;
+    /**
+     * Frozen page context for consistent AI operations
+     */
+    private frozenUIContext?;
+    private get aiActContext();
+    /**
+     * Flag to track if VL model warning has been shown
+     */
+    private hasWarnedNonVLModel;
+    private executionDumpIndexByRunner;
+    private fullActionSpace;
+    private reportGenerator;
+    get page(): InterfaceType;
+    /**
+     * Ensures VL model warning is shown once when needed
+     */
+    private ensureVLModelWarning;
+    private resolveReplanningCycleLimit;
+    constructor(interfaceInstance: InterfaceType, opts?: AgentOpt);
+    getActionSpace(): Promise<DeviceAction[]>;
+    private static readonly CONTEXT_RETRY_MAX;
+    private static readonly CONTEXT_RETRY_DELAY_MS;
+    /**
+     * Override in subclasses to indicate which errors are transient and should
+     * trigger an automatic retry when building the UI context.
+     * Returns `false` by default (no retry).
+     */
+    protected isRetryableContextError(_error: unknown): boolean;
+    getUIContext(action?: ServiceAction): Promise<UIContext>;
+    _snapshotContext(): Promise<UIContext>;
+    /**
+     * @deprecated Use {@link setAIActContext} instead.
+     */
+    setAIActionContext(prompt: string): Promise<void>;
+    setAIActContext(prompt: string): Promise<void>;
+    resetDump(): GroupedActionDump;
+    appendExecutionDump(execution: ExecutionDump, runner?: TaskRunner): void;
+    dumpDataString(opt?: {
+        inlineScreenshots?: boolean;
+    }): string;
+    reportHTMLString(opt?: {
+        inlineScreenshots?: boolean;
+    }): string;
+    private lastExecutionDump?;
+    writeOutActionDumps(executionDump?: ExecutionDump): void;
+    private getGroupMeta;
+    private callbackOnTaskStartTip;
+    wrapActionInActionSpace<T extends DeviceAction>(name: string): (param: ActionParam<T>) => Promise<ActionReturn<T>>;
+    callActionInActionSpace<T = any>(type: string, opt?: T): Promise<any>;
+    aiTap(locatePrompt: TUserPrompt, opt?: LocateOption & {
+        fileChooserAccept?: string | string[];
+    }): Promise<any>;
+    aiRightClick(locatePrompt: TUserPrompt, opt?: LocateOption): Promise<any>;
+    aiDoubleClick(locatePrompt: TUserPrompt, opt?: LocateOption): Promise<any>;
+    aiHover(locatePrompt: TUserPrompt, opt?: LocateOption): Promise<any>;
+    aiInput(locatePrompt: TUserPrompt, opt: LocateOption & {
+        value: string | number;
+    } & {
+        autoDismissKeyboard?: boolean;
+    } & {
+        mode?: 'replace' | 'clear' | 'typeOnly' | 'append';
+    }): Promise<any>;
+    /**
+     * @deprecated Use aiInput(locatePrompt, opt) instead where opt contains the value
+     */
+    aiInput(value: string | number, locatePrompt: TUserPrompt, opt?: LocateOption & {
+        autoDismissKeyboard?: boolean;
+    } & {
+        mode?: 'replace' | 'clear' | 'typeOnly' | 'append';
+    }): Promise<any>;
+    aiKeyboardPress(locatePrompt: TUserPrompt, opt: LocateOption & {
+        keyName: string;
+    }): Promise<any>;
+    /**
+     * @deprecated Use aiKeyboardPress(locatePrompt, opt) instead where opt contains the keyName
+     */
+    aiKeyboardPress(keyName: string, locatePrompt?: TUserPrompt, opt?: LocateOption): Promise<any>;
+    aiScroll(locatePrompt: TUserPrompt | undefined, opt: LocateOption & ScrollParam): Promise<any>;
+    /**
+     * @deprecated Use aiScroll(locatePrompt, opt) instead where opt contains the scroll parameters
+     */
+    aiScroll(scrollParam: ScrollParam, locatePrompt?: TUserPrompt, opt?: LocateOption): Promise<any>;
+    aiPinch(locatePrompt: TUserPrompt | undefined, opt: LocateOption & {
+        direction: 'in' | 'out';
+        distance?: number;
+        duration?: number;
+    }): Promise<any>;
+    aiAct(taskPrompt: string, opt?: AiActOptions): Promise<string | undefined>;
+    /**
+     * @deprecated Use {@link Agent.aiAct} instead.
+     */
+    aiAction(taskPrompt: string, opt?: AiActOptions): Promise<string | undefined>;
+    aiQuery<ReturnType = any>(demand: ServiceExtractParam, opt?: ServiceExtractOption): Promise<ReturnType>;
+    aiBoolean(prompt: TUserPrompt, opt?: ServiceExtractOption): Promise<boolean>;
+    aiNumber(prompt: TUserPrompt, opt?: ServiceExtractOption): Promise<number>;
+    aiString(prompt: TUserPrompt, opt?: ServiceExtractOption): Promise<string>;
+    aiAsk(prompt: TUserPrompt, opt?: ServiceExtractOption): Promise<string>;
+    describeElementAtPoint(center: [number, number], opt?: {
+        verifyPrompt?: boolean;
+        retryLimit?: number;
+        deepLocate?: boolean;
+    } & LocatorValidatorOption): Promise<AgentDescribeElementAtPointResult>;
+    verifyLocator(prompt: string, locateOpt: LocateOption | undefined, expectCenter: [number, number], verifyLocateOption?: LocatorValidatorOption): Promise<LocateValidatorResult>;
+    aiLocate(prompt: TUserPrompt, opt?: LocateOption): Promise<Pick<LocateResultElement, "rect" | "center">>;
+    aiAssert(assertion: TUserPrompt, msg?: string, opt?: AgentAssertOpt & ServiceExtractOption): Promise<{
+        pass: boolean;
+        thought: string | undefined;
+        message: string | undefined;
+    } | undefined>;
+    aiWaitFor(assertion: TUserPrompt, opt?: AgentWaitForOpt): Promise<void>;
+    ai(...args: Parameters<typeof Agent.aiAct>): Promise<string | undefined>;
+    runYaml(yamlScriptContent: string): Promise<{
+        result: Record<string, any>;
+    }>;
+    evaluateJavaScript(script: string): Promise<any>;
+    /**
+     * Add a dump update listener
+     * @param listener Listener function
+     * @returns A remove function that can be called to remove this listener
+     */
+    addDumpUpdateListener(listener: (dump: string, executionDump?: ExecutionDump) => void): () => void;
+    /**
+     * Remove a dump update listener
+     * @param listener The listener function to remove
+     */
+    removeDumpUpdateListener(listener: (dump: string, executionDump?: ExecutionDump) => void): void;
+    /**
+     * Clear all dump update listeners
+     */
+    clearDumpUpdateListeners(): void;
+    destroy(): Promise<void>;
+    recordToReport(title?: string, opt?: {
+        content: string;
+    }): Promise<void>;
+    /**
+     * @deprecated Use {@link Agent.recordToReport} instead.
+     */
+    logScreenshot(title?: string, opt?: {
+        content: string;
+    }): Promise<void>;
+    _unstableLogContent(): {
+        groupName: string;
+        groupDescription: string | undefined;
+        executions: ExecutionDump[];
+    };
+    /**
+     * Freezes the current page context to be reused in subsequent AI operations
+     * This avoids recalculating page context for each operation
+     */
+    freezePageContext(): Promise<void>;
+    /**
+     * Unfreezes the page context, allowing AI operations to calculate context dynamically
+     */
+    unfreezePageContext(): Promise<void>;
+    /**
+     * Process cache configuration and return normalized cache settings
+     */
+    private processCacheConfig;
+    private normalizeFilePaths;
+    private normalizeFileInput;
+    /**
+     * Manually flush cache to file
+     * @param options - Optional configuration
+     * @param options.cleanUnused - If true, removes unused cache records before flushing
+     */
+    flushCache(options?: {
+        cleanUnused?: boolean;
+    }): Promise<void>;
+}
+declare interface AgentAssertOpt {
+    keepRawResponse?: boolean;
+}
+declare interface AgentDescribeElementAtPointResult {
+    prompt: string;
+    deepLocate: boolean;
+    verifyResult?: LocateValidatorResult;
+}
 export declare function agentFromAdbDevice(deviceId?: string, opts?: AndroidAgentOpt & AndroidDeviceOpt): Promise<AndroidAgent>;
+declare interface AgentOpt {
+    testId?: string;
+    cacheId?: string;
+    groupName?: string;
+    groupDescription?: string;
+    generateReport?: boolean;
+    autoPrintReportMsg?: boolean;
+    /**
+     * Use directory-based report format with separate image files.
+     *
+     * When enabled:
+     * - Screenshots are saved as PNG files in a `screenshots/` subdirectory
+     * - Report is generated as `index.html` with relative image paths
+     * - Reduces memory usage and report file size
+     *
+     * IMPORTANT: 'html-and-external-assets' reports must be served via HTTP server
+     * (e.g., `npx serve ./report-dir`). The file:// protocol will not
+     * work due to browser CORS restrictions.
+     *
+     * @default 'single-html'
+     */
+    outputFormat?: 'single-html' | 'html-and-external-assets';
+    onTaskStartTip?: OnTaskStartTip;
+    aiActContext?: string;
+    aiActionContext?: string;
+    reportFileName?: string;
+    modelConfig?: TModelConfig;
+    cache?: Cache_2;
+    /**
+     * Maximum number of replanning cycles for aiAct.
+     * Defaults to 20 (40 for `vlm-ui-tars`) when not provided.
+     * If omitted, the agent will also read `MIDSCENE_REPLANNING_CYCLE_LIMIT` for backward compatibility.
+     */
+    replanningCycleLimit?: number;
+    /**
+     * Wait time in milliseconds after each action execution.
+     * This allows the UI to settle and stabilize before the next action.
+     * Defaults to 300ms when not provided.
+     */
+    waitAfterAction?: number;
+    /**
+     * When set to true, Midscene will use the target device's time (Android/iOS)
+     * instead of the system time. Useful when the device time differs from the
+     * host machine. Default: false
+     */
+    useDeviceTimestamp?: boolean;
+    /**
+     * Custom screenshot shrink factor to reduce AI token usage.
+     * When set, the screenshot will be scaled down by this factor from the physical resolution.
+     *
+     * Example:
+     * - Physical screen width: 3000px, dpr=6
+     * - Logical width: 500px
+     * - screenshotShrinkFactor: 2
+     * - Actual shrunk screenshot width: 3000 / 2 = 1500px
+     * - AI analyzes the 1500px screenshot
+     * - Coordinates are transformed back to logical (500px) before actions execute
+     *
+     * Benefits:
+     * - Reduces token usage for high-resolution screenshots
+     * - Maintains accuracy by scaling coordinates appropriately
+     *
+     * Must be >= 1 (shrinking only, enlarging is not supported).
+     *
+     * @default 1 (no shrinking, uses original physical screenshot)
+     */
+    screenshotShrinkFactor?: number;
+    /**
+     * Custom OpenAI client factory function
+     *
+     * If provided, this function will be called to create OpenAI client instances
+     * for each AI call, allowing you to:
+     * - Wrap clients with observability tools (langsmith, langfuse)
+     * - Use custom OpenAI-compatible clients
+     * - Apply different configurations based on intent
+     *
+     * @param config - Resolved model configuration
+     * @returns OpenAI client instance (original or wrapped)
+     *
+     * @example
+     * ```typescript
+     * createOpenAIClient: async (openai, opts) => {
+     *   // Wrap with langsmith for planning tasks
+     *   if (opts.baseURL?.includes('planning')) {
+     *     return wrapOpenAI(openai, { metadata: { task: 'planning' } });
+     *   }
+     *
+     *   return openai;
+     * }
+     * ```
+     */
+    createOpenAIClient?: CreateOpenAIClientFn;
+}
+declare interface AgentWaitForOpt extends ServiceExtractOption {
+    checkIntervalMs?: number;
+    timeoutMs?: number;
+}
+declare type AiActOptions = {
+    cacheable?: boolean;
+    fileChooserAccept?: string | string[];
+    deepThink?: DeepThinkOption;
+    deepLocate?: boolean;
+    abortSignal?: AbortSignal;
+};
+declare interface AIDescribeElementResponse {
+    description: string;
+    error?: string;
+}
+declare type AIUsageInfo = Record<string, any> & {
+    prompt_tokens: number | undefined;
+    completion_tokens: number | undefined;
+    total_tokens: number | undefined;
+    cached_input: number | undefined;
+    time_cost: number | undefined;
+    model_name: string | undefined;
+    model_description: string | undefined;
+    intent: string | undefined;
+    request_id: string | undefined;
+};
 export declare class AndroidAgent extends Agent<AndroidDevice> {
     /**
      * Trigger the system back operation on Android devices
@@ -229,6 +610,107 @@ export declare class AndroidDevice implements AbstractInterface {
     hideKeyboard(options?: AndroidDeviceInputOpt, timeoutMs?: number): Promise<boolean>;
 }
+/**
+ * Android device input options
+ */
+declare type AndroidDeviceInputOpt = {
+    /** Automatically dismiss the keyboard after input is completed */
+    autoDismissKeyboard?: boolean;
+    /** Strategy for dismissing the keyboard: 'esc-first' tries ESC before BACK, 'back-first' tries BACK before ESC */
+    keyboardDismissStrategy?: 'esc-first' | 'back-first';
+};
+/**
+ * Android device options
+ */
+declare type AndroidDeviceOpt = {
+    /** Path to the ADB executable */
+    androidAdbPath?: string;
+    /** Remote ADB host address */
+    remoteAdbHost?: string;
+    /** Remote ADB port */
+    remoteAdbPort?: number;
+    /** Input method editor strategy: 'always-yadb' always uses yadb, 'yadb-for-non-ascii' uses yadb only for non-ASCII characters */
+    imeStrategy?: 'always-yadb' | 'yadb-for-non-ascii';
+    /** Display ID to use for this device */
+    displayId?: number;
+    /** Use physical display ID for screenshot operations */
+    usePhysicalDisplayIdForScreenshot?: boolean;
+    /** Use physical display ID when looking up display information */
+    usePhysicalDisplayIdForDisplayLookup?: boolean;
+    /** Custom device actions to register */
+    customActions?: DeviceAction<any>[];
+    /**
+     * @deprecated Use `screenshotShrinkFactor` in AgentOpt instead.
+     * This option no longer affects screenshot size sent to AI model.
+     */
+    screenshotResizeScale?: number;
+    /** Always fetch screen info on each call; if false, cache the first result */
+    alwaysRefreshScreenInfo?: boolean;
+    /** Minimum screenshot buffer size in bytes (default: 10240 = 10KB). Set to 0 to disable validation. */
+    minScreenshotBufferSize?: number;
+    /**
+     * Scrcpy screenshot configuration for high-performance screen capture.
+     *
+     * Scrcpy provides 6-8x faster screenshots by streaming H.264 video from the device.
+     * When enabled, scrcpy will:
+     * 1. Start a video stream from the device on first screenshot request
+     * 2. Keep the connection alive for subsequent screenshots (16-50ms each)
+     * 3. Automatically disconnect after idle timeout to save resources
+     * 4. Fallback to standard ADB mode if unavailable
+     *
+     * @example
+     * ```typescript
+     * // Enable scrcpy for high-performance screenshots
+     * const device = new AndroidDevice(deviceId, {
+     *   scrcpyConfig: {
+     *     enabled: true,
+     *   },
+     * });
+     *
+     * // Custom configuration
+     * const device = new AndroidDevice(deviceId, {
+     *   scrcpyConfig: {
+     *     enabled: true,
+     *     maxSize: 0,        // 0 = no scaling
+     *     idleTimeoutMs: 30000,
+     *     videoBitRate: 8_000_000,
+     *   },
+     * });
+     * ```
+     */
+    scrcpyConfig?: {
+        /**
+         * Enable scrcpy for high-performance screenshots.
+         * @default false
+         */
+        enabled?: boolean;
+        /**
+         * Maximum video dimension (width or height).
+         * Video stream will be scaled down if device resolution exceeds this value.
+         * Lower values reduce bandwidth but may affect image quality.
+         *
+         * @default 0 (no scaling, use original resolution)
+         * @example
+         * { maxSize: 1024 } // Always scale to 1024
+         */
+        maxSize?: number;
+        /**
+         * Idle timeout in milliseconds before disconnecting scrcpy.
+         * Connection auto-closes after this period of inactivity to save resources.
+         * Set to 0 to disable auto-disconnect.
+         * @default 30000 (30 seconds)
+         */
+        idleTimeoutMs?: number;
+        /**
+         * Video bit rate for H.264 encoding in bits per second.
+         * Higher values improve quality but increase bandwidth usage.
+         * @default 2000000 (2 Mbps)
+         */
+        videoBitRate?: number;
+    };
+} & AndroidDeviceInputOpt;
 /**
  * Android-specific tools manager
  * Extends BaseMidsceneTools to provide Android ADB device connection tools
@@ -242,6 +724,145 @@ export declare class AndroidMidsceneTools extends BaseMidsceneTools<AndroidAgent
     protected preparePlatformTools(): ToolDefinition[];
 }
+/**
+ * Base agent interface
+ * Represents a platform-specific agent (Android, iOS, Web)
+ * Note: Return types use `unknown` for compatibility with platform-specific implementations
+ */
+declare interface BaseAgent {
+    getActionSpace(): Promise<ActionSpaceItem[]>;
+    destroy?(): Promise<void>;
+    page?: {
+        screenshotBase64(): Promise<string>;
+    };
+    aiAction?: (description: string, params?: Record<string, unknown>) => Promise<unknown>;
+    aiWaitFor?: (assertion: string, options: Record<string, unknown>) => Promise<unknown>;
+}
+/**
+ * Base device interface for temporary device instances
+ */
+declare interface BaseDevice {
+    actionSpace(): ActionSpaceItem[];
+    destroy?(): Promise<void>;
+}
+/**
+ * Base class for platform-specific MCP tools
+ * Generic type TAgent allows subclasses to use their specific agent types
+ */
+declare abstract class BaseMidsceneTools<TAgent extends BaseAgent = BaseAgent> implements IMidsceneTools {
+    protected mcpServer?: McpServer;
+    protected agent?: TAgent;
+    protected toolDefinitions: ToolDefinition[];
+    /**
+     * Ensure agent is initialized and ready for use.
+     * Must be implemented by subclasses to create platform-specific agent.
+     * @param initParam Optional initialization parameter (platform-specific, e.g., URL, device ID)
+     * @returns Promise resolving to initialized agent instance
+     * @throws Error if agent initialization fails
+     */
+    protected abstract ensureAgent(initParam?: string): Promise<TAgent>;
+    /**
+     * Optional: prepare platform-specific tools (e.g., device connection)
+     */
+    protected preparePlatformTools(): ToolDefinition[];
+    /**
+     * Must be implemented by subclasses to create a temporary device instance
+     * This allows getting real actionSpace without connecting to device
+     */
+    protected abstract createTemporaryDevice(): BaseDevice;
+    /**
+     * Initialize all tools by querying actionSpace
+     * Uses two-layer fallback strategy:
+     * 1. Try to get actionSpace from connected agent (if available)
+     * 2. Create temporary device instance to read actionSpace (always succeeds)
+     */
+    initTools(): Promise<void>;
+    /**
+     * Attach to MCP server and register all tools
+     */
+    attachToServer(server: McpServer): void;
+    /**
+     * Cleanup method - destroy agent and release resources
+     */
+    destroy(): Promise<void>;
+    /**
+     * Get tool definitions
+     */
+    getToolDefinitions(): ToolDefinition[];
+    /**
+     * Set agent for the tools manager
+     */
+    setAgent(agent: TAgent): void;
+    /**
+     * Helper: Convert base64 screenshot to image content array
+     */
+    protected buildScreenshotContent(screenshot: string): {
+        type: "image";
+        data: string;
+        mimeType: string;
+    }[];
+    /**
+     * Helper: Build a simple text result for tool responses
+     */
+    protected buildTextResult(text: string): {
+        content: {
+            type: "text";
+            text: string;
+        }[];
+    };
+    /**
+     * Create a disconnect handler for releasing platform resources
+     * @param platformName Human-readable platform name for the response message
+     * @returns Handler function that destroys the agent and returns appropriate response
+     */
+    protected createDisconnectHandler(platformName: string): () => Promise<{
+        content: {
+            type: "text";
+            text: string;
+        }[];
+    }>;
+}
+declare type Cache_2 = false | true | CacheConfig;
+/**
+ * Agent
+ */
+declare type CacheConfig = {
+    strategy?: 'read-only' | 'read-write' | 'write-only';
+    id: string;
+};
+declare type CacheFileContent = {
+    midsceneVersion: string;
+    cacheId: string;
+    caches: Array<PlanningCache | LocateCache>;
+};
+declare type DeepThinkOption = 'unset' | true | false;
+declare interface DetailedLocateParam extends Omit<LocateOption, 'deepThink' | keyof TMultimodalPrompt> {
+    prompt: TUserPrompt;
+}
+declare interface DeviceAction<TParam = any, TReturn = any> {
+    name: string;
+    description?: string;
+    interfaceAlias?: string;
+    paramSchema?: z.ZodType<TParam>;
+    call: (param: TParam, context: ExecutorContext) => Promise<TReturn> | TReturn;
+    delayAfterRunner?: number;
+    /**
+     * An example param object for this action.
+     * Locate fields with { prompt } will automatically get bbox injected when needed.
+     */
+    sample?: {
+        [K in keyof TParam]?: any;
+    };
+}
 declare type DeviceActionAndroidBackButton = DeviceAction<undefined, void>;
 declare type DeviceActionAndroidHomeButton = DeviceAction<undefined, void>;
@@ -256,9 +877,403 @@ declare interface DevicePhysicalInfo {
     isCurrentOrientation?: boolean;
 }
+declare interface DumpMeta { // base shape extended by IExecutionDump and ServiceDump
+    logTime: number; // NOTE(review): presumably an epoch timestamp — unit is not visible here, confirm
+}
+declare type ElementCacheFeature = Record<string, unknown>; // opaque string-keyed bag; used as the type of LocateCache.cache
+declare interface ElementInfo {
+    id: string;
+    indexId: number;
+    nodeHashId: string;
+    xpaths?: string[];
+    attributes: {
+        nodeType: NodeType; // the one explicitly typed attribute key
+        [key: string]: string; // every other attribute is a plain string
+    };
+    nodeType: NodeType; // also declared inside `attributes`
+    content: string;
+    rect: { // box described by its left/top origin and width/height
+        left: number;
+        top: number;
+        width: number;
+        height: number;
+    };
+    center: [number, number]; // NOTE(review): presumably [x, y] — confirm
+    isVisible: boolean;
+}
+/**
+ * ExecutionDump class for serializing and deserializing execution dumps; it implements IExecutionDump, the plain-object form.
+ */
+declare class ExecutionDump implements IExecutionDump {
+    id?: string;
+    logTime: number;
+    name: string;
+    description?: string;
+    tasks: ExecutionTask[];
+    aiActContext?: string;
+    constructor(data: IExecutionDump);
+    /**
+     * Serialize the ExecutionDump to a JSON string
+     */
+    serialize(indents?: number): string; // NOTE(review): `indents` presumably controls JSON indentation — confirm
+    /**
+     * Convert to a plain object for JSON serialization
+     */
+    toJSON(): IExecutionDump;
+    /**
+     * Create an ExecutionDump instance from a serialized JSON string
+     */
+    static fromSerializedString(serialized: string): ExecutionDump;
+    /**
+     * Create an ExecutionDump instance from a plain object
+     */
+    static fromJSON(data: IExecutionDump): ExecutionDump;
+    /**
+     * Collect all ScreenshotItem instances from tasks.
+     * Scans through uiContext and recorder items to find screenshots.
+     *
+     * @returns Array of ScreenshotItem instances
+     */
+    collectScreenshots(): ScreenshotItem[];
+}
+declare interface ExecutionRecorderItem {
+    type: 'screenshot'; // the only recorder item kind declared here
+    ts: number; // NOTE(review): presumably a timestamp — unit not visible here, confirm
+    screenshot?: ScreenshotItem;
+    timing?: string;
+}
+declare interface ExecutionResult<OutputType = any> { // OutputType defaults to `any` when not supplied
+    output: OutputType;
+    thought?: string;
+    runner: TaskRunner; // required: every result carries a TaskRunner reference
+}
+declare type ExecutionTask<E extends ExecutionTaskApply<any, any, any> = ExecutionTaskApply<any, any, any>> = E & ExecutionTaskReturn<E extends ExecutionTaskApply<any, any, infer TaskOutput, any> ? TaskOutput : unknown, E extends ExecutionTaskApply<any, any, any, infer TaskLog> ? TaskLog : unknown> & { // intersection: the apply definition E, its inferred return fields, and the bookkeeping fields below
+    taskId: string;
+    status: 'pending' | 'running' | 'finished' | 'failed' | 'cancelled'; // task-level state (TaskRunner.status uses a different value set)
+    error?: Error;
+    errorMessage?: string; // NOTE(review): presumably string copies of `error` kept for serialization — confirm
+    errorStack?: string;
+    timing?: { // phase timestamps; every field except `start` is optional
+        start: number;
+        getUiContextStart?: number;
+        getUiContextEnd?: number;
+        callAiStart?: number;
+        callAiEnd?: number;
+        beforeInvokeActionHookStart?: number;
+        beforeInvokeActionHookEnd?: number;
+        callActionStart?: number;
+        callActionEnd?: number;
+        afterInvokeActionHookStart?: number;
+        afterInvokeActionHookEnd?: number;
+        captureAfterCallingSnapshotStart?: number;
+        captureAfterCallingSnapshotEnd?: number;
+        end?: number;
+        cost?: number; // NOTE(review): presumably derived from `end` and `start` — confirm
+    };
+    usage?: AIUsageInfo;
+    searchAreaUsage?: AIUsageInfo;
+    reasoning_content?: string;
+};
+declare interface ExecutionTaskApply<Type extends ExecutionTaskType = any, TaskParam = any, TaskOutput = any, TaskLog = any> { // all four type parameters default to `any`
+    type: Type;
+    subType?: string;
+    param?: TaskParam;
+    thought?: string;
+    uiContext?: UIContext;
+    executor: (param: TaskParam, context: ExecutorContext) => Promise<ExecutionTaskReturn<TaskOutput, TaskLog> | undefined | void> | undefined | void; // may return a result, nothing, or a promise of either
+}
+declare interface ExecutionTaskHitBy {
+    from: string; // NOTE(review): presumably names the source that satisfied the task (e.g. a cache) — confirm
+    context: Record<string, any>; // free-form, string-keyed details
+}
+declare interface ExecutionTaskProgressOptions {
+    onTaskStart?: (task: ExecutionTask) => Promise<void> | void; // optional callback receiving the task; may be sync or async
+}
+declare interface ExecutionTaskReturn<TaskOutput = unknown, TaskLog = unknown> { // every field is optional
+    output?: TaskOutput;
+    log?: TaskLog;
+    recorder?: ExecutionRecorderItem[];
+    hitBy?: ExecutionTaskHitBy;
+}
+declare type ExecutionTaskType = 'Planning' | 'Insight' | 'Action Space' | 'Log'; // the four task categories; constrains ExecutionTaskApply's `Type` parameter
+declare interface ExecutorContext { // second argument passed to DeviceAction.call and ExecutionTaskApply.executor
+    task: ExecutionTask;
+    element?: LocateResultElement | null; // may be absent or explicitly null
+    uiContext?: UIContext;
+}
+declare interface FileChooserHandler {
+    accept(files: string[]): Promise<void>; // NOTE(review): `files` are presumably file paths — confirm
+}
 export declare function getConnectedDevices(): Promise<Device[]>;
-export { overrideAIConfig }
+/**
+ * Non-model-related env keys, used for globally controlling the behavior of midscene.
+ * Cannot be overridden by agent.modelConfig but can be overridden by overrideAIConfig.
+ * Can be accessed at any time. NOTE(review): "MIDSCENE_MODEL_MAX_TOKENS" appears twice in the tuple below.
+ */
+declare const GLOBAL_ENV_KEYS: readonly ["MIDSCENE_CACHE", "MIDSCENE_MCP_USE_PUPPETEER_MODE", "MIDSCENE_MCP_ANDROID_MODE", "MIDSCENE_LANGSMITH_DEBUG", "MIDSCENE_LANGFUSE_DEBUG", "MIDSCENE_REPORT_QUIET", "MIDSCENE_MODEL_MAX_TOKENS", "MIDSCENE_CACHE_MAX_FILENAME_LENGTH", "MIDSCENE_REPLANNING_CYCLE_LIMIT", "MIDSCENE_MODEL_MAX_TOKENS", "OPENAI_MAX_TOKENS", "MIDSCENE_ADB_PATH", "MIDSCENE_ADB_REMOTE_HOST", "MIDSCENE_ADB_REMOTE_PORT", "MIDSCENE_ANDROID_IME_STRATEGY", "MIDSCENE_IOS_DEVICE_UDID", "MIDSCENE_IOS_SIMULATOR_UDID", "MIDSCENE_REPORT_TAG_NAME", "MIDSCENE_PREFERRED_LANGUAGE", "MATCH_BY_POSITION", "MIDSCENE_MCP_CHROME_PATH", "DOCKER_CONTAINER"];
+/**
+ * GroupedActionDump class for serializing and deserializing grouped action dumps; it implements IGroupedActionDump, the plain-object form.
+ */
+declare class GroupedActionDump implements IGroupedActionDump {
+    sdkVersion: string;
+    groupName: string;
+    groupDescription?: string;
+    modelBriefs: ModelBrief[];
+    executions: ExecutionDump[]; // class instances here, whereas IGroupedActionDump.executions is typed as IExecutionDump[]
+    deviceType?: string;
+    constructor(data: IGroupedActionDump);
+    /**
+     * Serialize the GroupedActionDump to a JSON string
+     * Uses compact { $screenshot: id } format
+     */
+    serialize(indents?: number): string;
+    /**
+     * Serialize the GroupedActionDump with inline screenshots to a JSON string.
+     * Each ScreenshotItem is replaced with { base64: "...", capturedAt }.
+     */
+    serializeWithInlineScreenshots(indents?: number): string;
+    /**
+     * Convert to a plain object for JSON serialization
+     */
+    toJSON(): IGroupedActionDump;
+    /**
+     * Create a GroupedActionDump instance from a serialized JSON string
+     */
+    static fromSerializedString(serialized: string): GroupedActionDump;
+    /**
+     * Create a GroupedActionDump instance from a plain object
+     */
+    static fromJSON(data: IGroupedActionDump): GroupedActionDump;
+    /**
+     * Collect all ScreenshotItem instances from all executions.
+     *
+     * @returns Array of all ScreenshotItem instances across all executions
+     */
+    collectAllScreenshots(): ScreenshotItem[];
+    /**
+     * Serialize the dump to files with screenshots as separate PNG files.
+     * Creates:
+     * - {basePath} - dump JSON with { $screenshot: id } references
+     * - {basePath}.screenshots/ - PNG files
+     * - {basePath}.screenshots.json - ID to path mapping
+     *
+     * @param basePath - Base path for the dump file
+     */
+    serializeToFiles(basePath: string): void; // declared synchronous: returns void, not a Promise
+    /**
+     * Read dump from files and return JSON string with inline screenshots.
+     * Reads the dump JSON and screenshot files, then inlines the base64 data.
+     *
+     * @param basePath - Base path for the dump file
+     * @returns JSON string with inline screenshots ({ base64: "..." } format)
+     */
+    static fromFilesAsInlineJson(basePath: string): string; // returns a JSON string, not a GroupedActionDump instance
+    /**
+     * Clean up all files associated with a serialized dump.
+     *
+     * @param basePath - Base path for the dump file
+     */
+    static cleanupFiles(basePath: string): void;
+    /**
+     * Get all file paths associated with a serialized dump.
+     *
+     * @param basePath - Base path for the dump file
+     * @returns Array of all associated file paths
+     */
+    static getFilePaths(basePath: string): string[];
+}
+declare interface IExecutionDump extends DumpMeta { // plain-object form implemented by the ExecutionDump class; inherits `logTime`
+    /** Stable unique identifier for this execution run */
+    id?: string;
+    name: string;
+    description?: string;
+    tasks: ExecutionTask[];
+    aiActContext?: string;
+}
+declare interface IGroupedActionDump { // plain-object form implemented by the GroupedActionDump class
+    sdkVersion: string;
+    groupName: string;
+    groupDescription?: string;
+    modelBriefs: ModelBrief[];
+    executions: IExecutionDump[]; // plain objects; the class narrows this to ExecutionDump[]
+    deviceType?: string;
+}
+/**
+ * Interface for platform-specific MCP tools manager
+ */
+declare interface IMidsceneTools {
+    attachToServer(server: McpServer): void;
+    initTools(): Promise<void>;
+    destroy?(): Promise<void>; // optional: implementations may omit it
+}
+declare type InterfaceType = 'puppeteer' | 'playwright' | 'static' | 'chrome-extension-proxy' | 'android' | string; // the trailing `| string` widens this to any string; the literals only list known values
+declare interface LocateCache {
+    type: 'locate'; // discriminant (PlanningCache uses 'plan')
+    prompt: TUserPrompt;
+    cache?: ElementCacheFeature;
+    /** @deprecated kept for backward compatibility */
+    xpaths?: string[];
+}
+declare interface LocateOption extends Partial<TMultimodalPrompt> { // every field, inherited or own, is optional
+    prompt?: TUserPrompt;
+    deepLocate?: boolean;
+    /** @deprecated Use `deepLocate` instead. Kept for backward compatibility. */
+    deepThink?: boolean;
+    cacheable?: boolean;
+    xpath?: string;
+    uiContext?: UIContext;
+    fileChooserAccept?: string | string[]; // a single value or a list of values
+}
+declare interface LocateOpts { // type of the `opt` argument of Service.locate
+    context?: UIContext;
+    planLocatedElement?: LocateResultElement; // NOTE(review): presumably an element already located during planning — confirm
+}
+declare interface LocateResult {
+    element: LocateResultElement | null; // always present; null is an allowed value
+    rect?: Rect;
+}
+declare type LocateResultWithDump = LocateResult & ServiceResultBase; // LocateResult plus the `dump` field; return type of Service.locate
+declare interface LocateValidatorResult {
+    pass: boolean;
+    rect: Rect;
+    center: [number, number]; // NOTE(review): presumably [x, y] — confirm
+    centerDistance?: number; // NOTE(review): presumably compared against LocatorValidatorOption.centerDistanceThreshold — confirm
+}
+declare interface LocatorValidatorOption {
+    centerDistanceThreshold?: number; // optional; unit and default are not visible in this declaration
+}
+declare interface MatchCacheResult<T extends PlanningCache | LocateCache> { // returned (or undefined) by the TaskCache.match* methods
+    cacheContent: T;
+    cacheUsable: boolean;
+    updateFn: (cb: (cache: T) => void) => void; // takes a callback that receives the cache record; NOTE(review): presumably for in-place updates — confirm
+}
+declare type MidsceneYamlFlowItem = MidsceneYamlFlowItemAIAction | MidsceneYamlFlowItemAIAssert | MidsceneYamlFlowItemAIWaitFor | MidsceneYamlFlowItemEvaluateJavaScript | MidsceneYamlFlowItemSleep | MidsceneYamlFlowItemLogScreenshot; // union of the six flow item shapes declared in this file
+declare interface MidsceneYamlFlowItemAIAction { // NOTE(review): `aiAction`, `ai` and `aiAct` are all optional strings — presumably aliases; confirm
+    aiAction?: string;
+    ai?: string;
+    aiAct?: string;
+    aiActionProgressTips?: string[];
+    cacheable?: boolean;
+    [key: string]: unknown; // open-ended: additional keys are accepted
+}
+declare interface MidsceneYamlFlowItemAIAssert extends ServiceExtractOption { // inherits the extract options, including their index signature
+    aiAssert: string; // required
+    errorMessage?: string;
+    name?: string;
+}
+declare interface MidsceneYamlFlowItemAIWaitFor extends ServiceExtractOption { // inherits the extract options, including their index signature
+    aiWaitFor: string; // required
+    timeout?: number; // NOTE(review): unit is not stated here — presumably milliseconds, confirm
+}
+declare interface MidsceneYamlFlowItemEvaluateJavaScript {
+    javascript: string; // required; NOTE(review): presumably the script source to evaluate — confirm
+    name?: string;
+}
+declare interface MidsceneYamlFlowItemLogScreenshot { // every field is optional
+    logScreenshot?: string;
+    recordToReport?: string;
+    content?: string;
+}
+declare interface MidsceneYamlFlowItemSleep {
+    sleep: number; // NOTE(review): unit is not stated here — presumably milliseconds, confirm
+}
+/**
+ * Model-related env keys, used to declare which model to use.
+ * Can be overridden by both agent.modelConfig and overrideAIConfig.
+ * Can only be accessed after agent.constructor.
+ */
+declare const MODEL_ENV_KEYS: readonly ["MIDSCENE_MODEL_NAME", "MIDSCENE_MODEL_INIT_CONFIG_JSON", "MIDSCENE_MODEL_EXTRA_BODY_JSON", "MIDSCENE_MODEL_API_KEY", "MIDSCENE_MODEL_BASE_URL", "MIDSCENE_MODEL_SOCKS_PROXY", "MIDSCENE_MODEL_HTTP_PROXY", "MIDSCENE_MODEL_TIMEOUT", "MIDSCENE_MODEL_TEMPERATURE", "MIDSCENE_MODEL_RETRY_COUNT", "MIDSCENE_MODEL_RETRY_INTERVAL", "MIDSCENE_MODEL_REASONING_EFFORT", "MIDSCENE_MODEL_REASONING_ENABLED", "MIDSCENE_MODEL_REASONING_BUDGET", "MIDSCENE_USE_VLM_UI_TARS", "MIDSCENE_USE_QWEN_VL", "MIDSCENE_USE_QWEN3_VL", "MIDSCENE_USE_DOUBAO_VISION", "MIDSCENE_USE_GEMINI", "MIDSCENE_USE_VL_MODEL", "OPENAI_API_KEY", "OPENAI_BASE_URL", "MIDSCENE_OPENAI_INIT_CONFIG_JSON", "MIDSCENE_OPENAI_HTTP_PROXY", "MIDSCENE_OPENAI_SOCKS_PROXY", "MIDSCENE_INSIGHT_MODEL_NAME", "MIDSCENE_INSIGHT_MODEL_SOCKS_PROXY", "MIDSCENE_INSIGHT_MODEL_HTTP_PROXY", "MIDSCENE_INSIGHT_MODEL_BASE_URL", "MIDSCENE_INSIGHT_MODEL_API_KEY", "MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON", "MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON", "MIDSCENE_INSIGHT_MODEL_TIMEOUT", "MIDSCENE_INSIGHT_MODEL_TEMPERATURE", "MIDSCENE_INSIGHT_MODEL_RETRY_COUNT", "MIDSCENE_INSIGHT_MODEL_RETRY_INTERVAL", "MIDSCENE_INSIGHT_MODEL_FAMILY", "MIDSCENE_INSIGHT_MODEL_REASONING_EFFORT", "MIDSCENE_INSIGHT_MODEL_REASONING_ENABLED", "MIDSCENE_INSIGHT_MODEL_REASONING_BUDGET", "MIDSCENE_PLANNING_MODEL_NAME", "MIDSCENE_PLANNING_MODEL_SOCKS_PROXY", "MIDSCENE_PLANNING_MODEL_HTTP_PROXY", "MIDSCENE_PLANNING_MODEL_BASE_URL", "MIDSCENE_PLANNING_MODEL_API_KEY", "MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON", "MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON", "MIDSCENE_PLANNING_MODEL_TIMEOUT", "MIDSCENE_PLANNING_MODEL_TEMPERATURE", "MIDSCENE_PLANNING_MODEL_RETRY_COUNT", "MIDSCENE_PLANNING_MODEL_RETRY_INTERVAL", "MIDSCENE_PLANNING_MODEL_FAMILY", "MIDSCENE_PLANNING_MODEL_REASONING_EFFORT", "MIDSCENE_PLANNING_MODEL_REASONING_ENABLED", "MIDSCENE_PLANNING_MODEL_REASONING_BUDGET", "MIDSCENE_MODEL_FAMILY"];
+declare interface ModelBrief { // every field is optional; used as the element type of `modelBriefs` on grouped dumps
+    /**
+     * The intent/category of the model call, for example "planning" or "insight".
+     */
+    intent?: string;
+    /**
+     * The model name returned by usage metadata, for example "gpt-4o".
+     */
+    name?: string;
+    /**
+     * Optional human-readable model description, for example "qwen2.5-vl mode".
+     */
+    modelDescription?: string;
+}
+declare enum NodeType { // string enum; used by ElementInfo.nodeType
+    CONTAINER = "CONTAINER Node",
+    FORM_ITEM = "FORM_ITEM Node",
+    BUTTON = "BUTTON Node",
+    A = "Anchor Node", // note: value does not follow the "<KEY> Node" pattern of the other members
+    IMG = "IMG Node",
+    TEXT = "TEXT Node",
+    POSITION = "POSITION Node"
+}
+/**
+ * Agent callback type: receives a tip string and may be synchronous or asynchronous.
+ */
+declare type OnTaskStartTip = (tip: string) => Promise<void> | void;
+export declare const overrideAIConfig: (newConfig: Partial<Record<(typeof GLOBAL_ENV_KEYS)[number] | (typeof MODEL_ENV_KEYS)[number], string>>, extendMode?: boolean) => void; // accepted keys are the members of GLOBAL_ENV_KEYS and MODEL_ENV_KEYS, all optional, with string values
+declare interface PlanningAction<ParamType = any> { // ParamType defaults to `any`
+    thought?: string;
+    log?: string;
+    type: string; // NOTE(review): presumably matches a DeviceAction name — confirm
+    param: ParamType;
+}
+declare type PlanningActionParamWaitFor = AgentWaitForOpt & {}; // `& {}` adds no members; type of the `opt` argument of TaskExecutor.waitFor
+declare interface PlanningCache {
+    type: 'plan'; // discriminant (LocateCache uses 'locate')
+    prompt: string; // plain string here, whereas LocateCache.prompt is a TUserPrompt
+    yamlWorkflow: string; // NOTE(review): presumably YAML text — confirm
+}
+/**
+ * planning
+ * Locate parameter used by Service.locate: DetailedLocateParam plus an optional `bbox`.
+ */
+declare interface PlanningLocateParam extends DetailedLocateParam {
+    bbox?: [number, number, number, number]; // NOTE(review): coordinate order is not stated here — confirm
+}
 declare interface ResolvedScrcpyConfig {
     enabled: boolean;
@@ -440,6 +1455,423 @@ declare interface ScrcpyScreenshotOptions {
     idleTimeoutMs?: number;
 }
+/**
+ * ScreenshotItem encapsulates screenshot data.
+ *
+ * Supports lazy loading after memory release:
+ * - inline mode: reads from HTML file using streaming (extractImageByIdSync)
+ * - directory mode: reads from file on disk
+ *
+ * After persistence, memory is released but the screenshot can be recovered
+ * on-demand from disk, making it safe to release memory at any time.
+ */
+declare class ScreenshotItem {
+    private _id;
+    private _base64;
+    private _format;
+    private _capturedAt;
+    private _persistedAs;
+    private _persistedPath;
+    private _persistedHtmlPath;
+    private constructor(); // not constructible from outside; instances come from the static create()
+    /** Create a new ScreenshotItem from base64 data */
+    static create(base64: string, capturedAt: number): ScreenshotItem;
+    get id(): string;
+    /** Get the image format (png or jpeg) */
+    get format(): 'png' | 'jpeg';
+    /** Get the file extension for this screenshot */
+    get extension(): string;
+    /** Get screenshot capture timestamp in milliseconds */
+    get capturedAt(): number;
+    get base64(): string; // NOTE(review): presumably performs the lazy recovery described in the class comment when memory was released — confirm
+    /** Check if base64 data is still available in memory (not yet released) */
+    hasBase64(): boolean;
+    /**
+     * Mark as persisted to HTML (inline mode).
+     * Releases base64 memory, but keeps HTML path for lazy loading recovery.
+     * @param htmlPath - absolute path to the HTML file containing the image
+     */
+    markPersistedInline(htmlPath: string): void;
+    /**
+     * Mark as persisted to file (directory mode).
+     * Releases base64 memory, but keeps file path for lazy loading recovery.
+     * @param relativePath - relative path for serialization (e.g., "./screenshots/id.jpeg")
+     * @param absolutePath - absolute path for lazy loading recovery
+     */
+    markPersistedToPath(relativePath: string, absolutePath: string): void;
+    /** Serialize for JSON - format depends on persistence state */
+    toSerializable(): ScreenshotSerializeFormat;
+    /** Check if a value is a serialized ScreenshotItem reference (inline or directory mode) */
+    static isSerialized(value: unknown): value is ScreenshotSerializeFormat; // type guard: narrows `value` when it returns true
+    /**
+     * Get base64 data without the data URI prefix.
+     * Useful for writing raw binary data to files.
+     */
+    get rawBase64(): string;
+}
+/**
+ * Serialization format for ScreenshotItem (both variants also carry `capturedAt`)
+ * - { $screenshot: "id" } - inline mode, references imageMap in HTML
+ * - { base64: "path" } - directory mode, references external file path
+ */
+declare type ScreenshotSerializeFormat = {
+    $screenshot: string;
+    capturedAt: number;
+} | {
+    base64: string;
+    capturedAt: number;
+};
+declare type ScrollParam = Omit<ActionScrollParam, 'locate'>; // ActionScrollParam without its `locate` key
+declare type ScrollType = 'singleAction' | 'scrollToBottom' | 'scrollToTop' | 'scrollToRight' | 'scrollToLeft' | 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft'; // NOTE(review): two naming families ('scrollTo*' and 'once'/'until*') — presumably old and new spellings of the same modes; confirm
+declare class Service {
+    contextRetrieverFn: () => Promise<UIContext> | UIContext; // may return the context synchronously or asynchronously
+    taskInfo?: Omit<ServiceTaskInfo, 'durationMs'>;
+    constructor(context: UIContext | (() => Promise<UIContext> | UIContext), opt?: ServiceOptions); // accepts either a UIContext value or a function producing one
+    locate(query: PlanningLocateParam, opt: LocateOpts, modelConfig: IModelConfig, abortSignal?: AbortSignal): Promise<LocateResultWithDump>;
+    extract<T>(dataDemand: ServiceExtractParam, modelConfig: IModelConfig, opt?: ServiceExtractOption, pageDescription?: string, multimodalPrompt?: TMultimodalPrompt, context?: UIContext): Promise<ServiceExtractResult<T>>;
+    describe(target: Rect | [number, number], modelConfig: IModelConfig, opt?: { // target: a Rect or a two-number tuple (NOTE(review): presumably a point — confirm)
+        deepLocate?: boolean;
+    }): Promise<Pick<AIDescribeElementResponse, 'description'>>; // resolves with only the `description` field
+}
+declare type ServiceAction = 'locate' | 'extract' | 'assert' | 'describe'; // one more member ('describe') than ServiceDump.type
+declare interface ServiceDump extends DumpMeta { // inherits `logTime`
+    type: 'locate' | 'extract' | 'assert'; // note: ServiceAction additionally includes 'describe'
+    logId: string;
+    userQuery: { // all three query forms are optional
+        element?: TUserPrompt;
+        dataDemand?: ServiceExtractParam;
+        assertion?: TUserPrompt;
+    };
+    matchedElement: LocateResultElement[];
+    matchedRect?: Rect;
+    deepLocate?: boolean;
+    data: any;
+    assertionPass?: boolean;
+    assertionThought?: string;
+    taskInfo: ServiceTaskInfo;
+    error?: string;
+    output?: any;
+}
+declare interface ServiceExtractOption {
+    domIncluded?: boolean | 'visible-only'; // a boolean or the literal 'visible-only'
+    screenshotIncluded?: boolean;
+    [key: string]: unknown; // open-ended: extra option keys are accepted
+}
+declare type ServiceExtractParam = string | Record<string, string>; // either a single string or a string-to-string record
+declare interface ServiceExtractResult<T> extends ServiceResultBase { // inherits `dump`; resolved value of Service.extract<T>
+    data: T;
+    thought?: string;
+    usage?: AIUsageInfo;
+    reasoning_content?: string;
+}
+declare interface ServiceOptions { // type of the optional second constructor argument of Service
+    taskInfo?: Omit<ServiceTaskInfo, 'durationMs'>; // same type as Service.taskInfo
+}
+declare interface ServiceResultBase { // shared by LocateResultWithDump and ServiceExtractResult
+    dump: ServiceDump;
+}
+declare interface ServiceTaskInfo {
+    durationMs: number; // the only required field; omitted from the `taskInfo` types on Service and ServiceOptions
+    formatResponse?: string;
+    rawResponse?: string;
+    usage?: AIUsageInfo;
+    searchArea?: Rect;
+    searchAreaRawResponse?: string;
+    searchAreaUsage?: AIUsageInfo;
+    reasoning_content?: string;
+}
+declare class TaskCache {
+    cacheId: string;
+    cacheFilePath?: string;
+    cache: CacheFileContent;
+    isCacheResultUsed: boolean;
+    cacheOriginalLength: number;
+    readOnlyMode: boolean; // NOTE(review): presumably set from options.readOnly — confirm
+    writeOnlyMode: boolean; // NOTE(review): presumably set from options.writeOnly — confirm
+    private matchedCacheIndices;
+    constructor(cacheId: string, isCacheResultUsed: boolean, cacheFilePath?: string, options?: {
+        readOnly?: boolean;
+        writeOnly?: boolean;
+    });
+    matchCache(prompt: TUserPrompt, type: 'plan' | 'locate'): MatchCacheResult<PlanningCache | LocateCache> | undefined; // undefined is a possible result, as for the two typed variants below
+    matchPlanCache(prompt: string): MatchCacheResult<PlanningCache> | undefined;
+    matchLocateCache(prompt: TUserPrompt): MatchCacheResult<LocateCache> | undefined;
+    appendCache(cache: PlanningCache | LocateCache): void;
+    loadCacheFromFile(): CacheFileContent | undefined;
+    flushCacheToFile(options?: {
+        cleanUnused?: boolean;
+    }): void;
+    updateOrAppendCacheRecord(newRecord: PlanningCache | LocateCache, cachedRecord?: MatchCacheResult<PlanningCache | LocateCache>): void;
+}
+declare class TaskExecutionError extends Error { // passed to the `onTaskUpdate` hooks as their optional `error` argument
+    runner: TaskRunner;
+    errorTask: ExecutionTask | null; // null is an allowed value
+    constructor(message: string, runner: TaskRunner, errorTask: ExecutionTask | null, options?: {
+        cause?: unknown;
+    });
+}
+declare class TaskExecutor {
+    interface: AbstractInterface;
+    service: Service;
+    taskCache?: TaskCache;
+    private readonly providedActionSpace;
+    private readonly taskBuilder;
+    onTaskStartCallback?: ExecutionTaskProgressOptions['onTaskStart'];
+    private readonly hooks?;
+    replanningCycleLimit?: number;
+    waitAfterAction?: number;
+    useDeviceTimestamp?: boolean;
+    get page(): AbstractInterface; // NOTE(review): same type as `interface` — presumably an accessor for it; confirm
+    constructor(interfaceInstance: AbstractInterface, service: Service, opts: {
+        taskCache?: TaskCache;
+        onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];
+        replanningCycleLimit?: number;
+        waitAfterAction?: number;
+        useDeviceTimestamp?: boolean;
+        hooks?: TaskExecutorHooks;
+        actionSpace: DeviceAction[]; // the only required option
+    });
+    private createExecutionSession;
+    private getActionSpace;
+    /**
+     * Get a readable time string using device time when configured.
+     * This method respects the useDeviceTimestamp configuration.
+     * @param format - Optional format string
+     * @returns A formatted time string
+     */
+    private getTimeString;
+    convertPlanToExecutable(plans: PlanningAction[], modelConfigForPlanning: IModelConfig, modelConfigForDefaultIntent: IModelConfig, options?: {
+        cacheable?: boolean;
+        deepLocate?: boolean;
+        abortSignal?: AbortSignal;
+    }): Promise<{
+        tasks: ExecutionTaskApply[];
+    }>;
+    loadYamlFlowAsPlanning(userInstruction: string, yamlString: string): Promise<{
+        runner: TaskRunner;
+    }>;
+    runPlans(title: string, plans: PlanningAction[], modelConfigForPlanning: IModelConfig, modelConfigForDefaultIntent: IModelConfig): Promise<ExecutionResult>;
+    action(userPrompt: string, modelConfigForPlanning: IModelConfig, modelConfigForDefaultIntent: IModelConfig, includeBboxInPlanning: boolean, aiActContext?: string, cacheable?: boolean, replanningCycleLimitOverride?: number, imagesIncludeCount?: number, deepThink?: DeepThinkOption, fileChooserAccept?: string[], deepLocate?: boolean, abortSignal?: AbortSignal): Promise<ExecutionResult<{
+        yamlFlow?: MidsceneYamlFlowItem[];
+        output?: string;
+    } | undefined>>; // the result's `output` may itself be undefined
+    private runAction;
+    private createTypeQueryTask;
+    createTypeQueryExecution<T>(type: 'Query' | 'Boolean' | 'Number' | 'String' | 'Assert', demand: ServiceExtractParam, modelConfig: IModelConfig, opt?: ServiceExtractOption, multimodalPrompt?: TMultimodalPrompt): Promise<ExecutionResult<T>>;
+    waitFor(assertion: TUserPrompt, opt: PlanningActionParamWaitFor, modelConfig: IModelConfig): Promise<ExecutionResult<void>>;
+}
+declare interface TaskExecutorHooks {
+    onTaskUpdate?: (runner: TaskRunner, error?: TaskExecutionError) => Promise<void> | void; // optional; same signature as TaskRunnerInitOptions.onTaskUpdate
+}
+declare class TaskRunner {
+    readonly id: string;
+    name: string;
+    tasks: ExecutionTask[];
+    status: 'init' | 'pending' | 'running' | 'completed' | 'error'; // runner-level state (a different value set from ExecutionTask.status)
+    onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];
+    private readonly uiContextBuilder;
+    private readonly onTaskUpdate?;
+    private readonly executionLogTime;
+    constructor(name: string, uiContextBuilder: () => Promise<UIContext>, options?: TaskRunnerInitOptions);
+    private emitOnTaskUpdate;
+    private lastUiContext?;
+    private getUiContext;
+    private captureScreenshot;
+    private attachRecorderItem;
+    private markTaskAsPending;
+    private normalizeStatusFromError;
+    append(task: ExecutionTaskApply[] | ExecutionTaskApply, options?: TaskRunnerOperationOptions): Promise<void>; // accepts one task or an array of tasks
+    appendAndFlush(task: ExecutionTaskApply[] | ExecutionTaskApply, options?: TaskRunnerOperationOptions): Promise<{
+        output: any;
+        thought?: string;
+    } | undefined>;
+    flush(options?: TaskRunnerOperationOptions): Promise<{
+        output: any;
+        thought?: string;
+    } | undefined>; // may resolve with undefined, like appendAndFlush
+    isInErrorState(): boolean;
+    latestErrorTask(): ExecutionTask | null;
+    dump(): ExecutionDump;
+    appendErrorPlan(errorMsg: string): Promise<{
+        output: undefined;
+        runner: TaskRunner;
+    }>;
+}
+declare type TaskRunnerInitOptions = ExecutionTaskProgressOptions & { // adds to the inherited optional `onTaskStart`
+    tasks?: ExecutionTaskApply[];
+    onTaskUpdate?: (runner: TaskRunner, error?: TaskExecutionError) => Promise<void> | void;
+};
+declare type TaskRunnerOperationOptions = { // options accepted by TaskRunner.append, appendAndFlush and flush
+    allowWhenError?: boolean; // NOTE(review): presumably permits the operation while the runner is in its error state — confirm
+};
+declare type TMultimodalPrompt = z.infer<typeof TMultimodalPromptSchema>; // static type inferred from the Zod schema TMultimodalPromptSchema
+declare const TMultimodalPromptSchema: z.ZodObject<{ // Zod object with two optional fields: `images` and `convertHttpImage2Base64`
+    images: z.ZodOptional<z.ZodArray<z.ZodObject<{ // optional array of { name, url } objects
+        name: z.ZodString;
+        url: z.ZodString;
+    }, "strip", z.ZodTypeAny, {
+        name: string;
+        url: string;
+    }, {
+        name: string;
+        url: string;
+    }>, "many">>;
+    convertHttpImage2Base64: z.ZodOptional<z.ZodBoolean>;
+}, "strip", z.ZodTypeAny, {
+    images?: {
+        name: string;
+        url: string;
+    }[] | undefined;
+    convertHttpImage2Base64?: boolean | undefined;
+}, {
+    images?: {
+        name: string;
+        url: string;
+    }[] | undefined;
+    convertHttpImage2Base64?: boolean | undefined;
+}>;
+/**
+ * Tool definition for MCP server
+ */
+declare interface ToolDefinition<T = Record<string, unknown>> { // T is the argument type received by `handler`
+    name: string;
+    description: string;
+    schema: ToolSchema;
+    handler: ToolHandler<T>;
+}
+/**
+ * Tool handler function type
+ * Takes parsed arguments and returns a promise of a tool result
+ */
+declare type ToolHandler<T = Record<string, unknown>> = (args: T) => Promise<ToolResult>;
+/**
+ * Result type for tool execution (MCP compatible)
+ */
+declare interface ToolResult {
+    [x: string]: unknown; // index signature: additional keys are permitted
+    content: ToolResultContent[]; // the only required field
+    isError?: boolean;
+    _meta?: Record<string, unknown>;
+}
+/**
+ * Content item types for tool results (MCP compatible); a union discriminated by `type`
+ */
+declare type ToolResultContent = {
+    type: 'text';
+    text: string;
+} | {
+    type: 'image';
+    data: string;
+    mimeType: string;
+} | {
+    type: 'audio';
+    data: string;
+    mimeType: string;
+} | {
+    type: 'resource';
+    resource: { // either a text resource or a blob resource; both require `uri`
+        text: string;
+        uri: string;
+        mimeType?: string;
+    } | {
+        uri: string;
+        blob: string;
+        mimeType?: string;
+    };
+};
+/**
+ * Tool schema type using Zod: a record mapping string keys to Zod types
+ */
+declare type ToolSchema = Record<string, z.ZodTypeAny>;
+declare type TUserPrompt = z.infer<typeof TUserPromptSchema>; // static type inferred from the Zod schema TUserPromptSchema
+declare const TUserPromptSchema: z.ZodUnion<[z.ZodString, z.ZodIntersection<z.ZodObject<{ // union: a plain string, or `{ prompt }` intersected with the optional multimodal fields
+    prompt: z.ZodString;
+}, "strip", z.ZodTypeAny, {
+    prompt: string;
+}, {
+    prompt: string;
+}>, z.ZodObject<{
+    images: z.ZodOptional<z.ZodOptional<z.ZodArray<z.ZodObject<{ // doubly wrapped in ZodOptional; the inferred type is still optional
+        name: z.ZodString;
+        url: z.ZodString;
+    }, "strip", z.ZodTypeAny, {
+        name: string;
+        url: string;
+    }, {
+        name: string;
+        url: string;
+    }>, "many">>>;
+    convertHttpImage2Base64: z.ZodOptional<z.ZodOptional<z.ZodBoolean>>;
+}, "strip", z.ZodTypeAny, {
+    images?: {
+        name: string;
+        url: string;
+    }[] | undefined;
+    convertHttpImage2Base64?: boolean | undefined;
+}, {
+    images?: {
+        name: string;
+        url: string;
+    }[] | undefined;
+    convertHttpImage2Base64?: boolean | undefined;
+}>>]>;
+/**
+ * UI context: abstract base describing a captured screenshot and its sizing metadata.
+ */
+declare abstract class UIContext {
+    /**
+     * Screenshot of the current UI state; its size is shotSize (already shrunk by screenshotShrinkFactor).
+     */
+    abstract screenshot: ScreenshotItem;
+    /**
+     * screenshot size after shrinking
+     */
+    abstract shotSize: Size;
+    /**
+     * The ratio for converting shrunk screenshot coordinates to logical coordinates.
+     *
+     * Example:
+     * - Physical screen width: 3000px, dpr=6
+     * - Logical width: 500px
+     * - User-defined screenshotShrinkFactor: 2
+     * - Actual shrunk screenshot width: 3000 / 2 = 1500px
+     * - shrunkShotToLogicalRatio: dpr / screenshotShrinkFactor = 6 / 2 = 3
+     * - To map back to logical coordinates: 1500 / shrunkShotToLogicalRatio = 500px
+     */
+    abstract shrunkShotToLogicalRatio: number;
+    abstract _isFrozen?: boolean; // optional; its meaning is not documented in this declaration
+    abstract deprecatedDpr?: number; // NOTE(review): name suggests a legacy device-pixel-ratio field — confirm before relying on it
+}
 /**
  * Helper type to convert DeviceAction to wrapped method signature
  */