npm - @midscene/core - Versions diffs - 1.8.10 → 1.9.0 - Mend

@midscene/core 1.8.10 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (304)

package/dist/types/ai-model/models/index.d.ts ADDED Viewed

@@ -0,0 +1,2 @@
+export { getModelAdapter, getModelRuntime, } from './registry';
+export type { ModelRuntime } from './types';

package/dist/types/ai-model/models/qwen.d.ts ADDED Viewed

@@ -0,0 +1,30 @@
+import { type LocateResultValue, type PixelBbox } from '../shared/model-locate-result';
+import type { ChatCompletionCallContext, ChatCompletionParamsResult, ModelAdapterDefinition } from './types';
+declare function parseQwen25RawLocateValue(input: unknown): LocateResultValue;
+declare function normalizeQwen25ResultToPixelBbox(result: LocateResultValue): PixelBbox;
+export declare const qwenAdapters: {
+    'qwen2.5-vl': {
+        chatCompletion: {
+            unsupportedUserConfig: ("reasoningEnabled" | "reasoningEffort" | "reasoningBudget")[];
+            buildChatCompletionParams: (input: ChatCompletionCallContext) => ChatCompletionParamsResult;
+        };
+        imagePreprocess: {
+            padBlockSize: number;
+        };
+        locate: {
+            resultAdapter: {
+                coordinates: {
+                    shape: "bbox";
+                    order: "xy";
+                };
+                parseRawLocateValue: typeof parseQwen25RawLocateValue;
+                mapLocateResultToPixelBbox: typeof normalizeQwen25ResultToPixelBbox;
+            };
+        };
+    };
+    'qwen3-vl': ModelAdapterDefinition;
+    qwen3: ModelAdapterDefinition;
+    'qwen3.5': ModelAdapterDefinition;
+    'qwen3.6': ModelAdapterDefinition;
+};
+export {};

package/dist/types/ai-model/models/registry.d.ts ADDED Viewed

@@ -0,0 +1,81 @@
+import type { IModelConfig, TModelFamily } from '@midscene/shared/env';
+import type { ModelAdapter, ModelAdapterDefinition, ModelRuntime } from './types';
+export declare const MODEL_ADAPTER_CONFIGS: {
+    'gpt-5': {
+        chatCompletion: {
+            unsupportedUserConfig: ("temperature" | "reasoningEnabled" | "reasoningEffort" | "reasoningBudget")[];
+            buildChatCompletionParams: () => import("./types").ChatCompletionParamsResult;
+            resolveImageDetail: (input: import("./types").ChatCompletionCallContext) => import("./types").ImageDetail | undefined;
+        };
+        locate: {
+            resultAdapter: {
+                coordinates: {
+                    shape: "bbox";
+                    order: "xy";
+                };
+            };
+        };
+    };
+    'auto-glm': ModelAdapterDefinition;
+    'auto-glm-multilingual': ModelAdapterDefinition;
+    'glm-v': {
+        chatCompletion: {
+            unsupportedUserConfig: ("reasoningEffort" | "reasoningBudget")[];
+            buildChatCompletionParams: (input: import("./types").ChatCompletionCallContext) => import("./types").ChatCompletionParamsResult;
+        };
+        locate: {
+            resultAdapter: {
+                coordinates: {
+                    shape: "bbox";
+                    order: "xy";
+                    normalizedBy: number;
+                };
+            };
+        };
+    };
+    'vlm-ui-tars': ModelAdapterDefinition;
+    'vlm-ui-tars-doubao': ModelAdapterDefinition;
+    'vlm-ui-tars-doubao-1.5': ModelAdapterDefinition;
+    gemini: {
+        chatCompletion: {
+            unsupportedUserConfig: ("reasoningEnabled" | "reasoningBudget")[];
+            buildChatCompletionParams: (input: import("./types").ChatCompletionCallContext) => import("./types").ChatCompletionParamsResult;
+        };
+        locate: {
+            resultAdapter: {
+                coordinates: {
+                    shape: "bbox";
+                    order: "yx";
+                    normalizedBy: number;
+                };
+            };
+        };
+    };
+    'doubao-vision': ModelAdapterDefinition;
+    'doubao-seed': ModelAdapterDefinition;
+    'qwen2.5-vl': {
+        chatCompletion: {
+            unsupportedUserConfig: ("reasoningEnabled" | "reasoningEffort" | "reasoningBudget")[];
+            buildChatCompletionParams: (input: import("./types").ChatCompletionCallContext) => import("./types").ChatCompletionParamsResult;
+        };
+        imagePreprocess: {
+            padBlockSize: number;
+        };
+        locate: {
+            resultAdapter: {
+                coordinates: {
+                    shape: "bbox";
+                    order: "xy";
+                };
+                parseRawLocateValue: (input: unknown) => import("../shared/model-locate-result").LocateResultValue;
+                mapLocateResultToPixelBbox: (result: import("../shared/model-locate-result").LocateResultValue) => import("../shared/model-locate-result").PixelBbox;
+            };
+        };
+    };
+    'qwen3-vl': ModelAdapterDefinition;
+    qwen3: ModelAdapterDefinition;
+    'qwen3.5': ModelAdapterDefinition;
+    'qwen3.6': ModelAdapterDefinition;
+};
+export declare function getModelAdapter(modelFamily?: TModelFamily): ModelAdapter;
+export declare function getModelRuntime(config: IModelConfig): ModelRuntime;

package/dist/types/ai-model/models/resolved.d.ts ADDED Viewed

@@ -0,0 +1,9 @@
+import type { ChatCompletionAdapter, ImagePreprocessPolicy, JsonParser, LocateAdapter, ModelAdapter, ModelAdapterDefinition, PlanningAdapter } from './types';
+export declare class ResolvedModelAdapter implements ModelAdapter {
+    readonly jsonParser: JsonParser;
+    readonly chatCompletion: ChatCompletionAdapter;
+    readonly imagePreprocess: ImagePreprocessPolicy;
+    readonly planning: PlanningAdapter;
+    readonly locate: LocateAdapter;
+    constructor(config: ModelAdapterDefinition, modelFamily: string);
+}

package/dist/types/ai-model/models/types.d.ts ADDED Viewed

@@ -0,0 +1,102 @@
+import type { IModelConfig, TIntent } from '@midscene/shared/env';
+import type { JsonParser, JsonParserContext, JsonParserSource } from '../service-caller/json';
+import type { LocateResultAdapter, LocateResultAdapterDefinition } from '../shared/model-locate-result/types';
+import type { ImagePreprocessPolicy } from '../workflows/image-preprocess';
+import type { LocateFn } from '../workflows/inspect/types';
+import type { PlanFn } from '../workflows/planning/types';
+export type { ImagePreprocessPolicy, JsonParser, JsonParserContext, JsonParserSource, };
+export type JsonParserPreset = 'lenient-json';
+export interface ReasoningInput {
+    reasoningEnabled?: boolean;
+    reasoningEffort?: string;
+    reasoningBudget?: number;
+}
+export interface ChatCompletionParamsResult {
+    config: Record<string, unknown>;
+}
+export interface MidsceneChatCompletionDefaults {
+    temperature: number;
+}
+export interface ChatCompletionCallUserConfig extends ReasoningInput {
+    temperature?: number;
+}
+export type ChatCompletionUnsupportedUserConfig = keyof ChatCompletionCallUserConfig;
+export interface ChatCompletionCallInput {
+    intent?: TIntent;
+    userConfig?: ChatCompletionCallUserConfig;
+    requiresOriginalImageDetail?: boolean;
+}
+export interface ChatCompletionCallContext {
+    intent?: TIntent;
+    userConfig: ChatCompletionCallUserConfig;
+    requiresOriginalImageDetail?: boolean;
+    midsceneDefaults: MidsceneChatCompletionDefaults;
+}
+export type ImageDetail = 'auto' | 'low' | 'high' | 'original';
+export interface ChatCompletionAdapter {
+    unsupportedUserConfig: ChatCompletionUnsupportedUserConfig[];
+    buildChatCompletionParams(input: ChatCompletionCallInput): ChatCompletionParamsResult;
+    resolveImageDetail(input: ChatCompletionCallInput): ImageDetail | undefined;
+}
+export interface ChatCompletionDefinition {
+    unsupportedUserConfig?: ChatCompletionUnsupportedUserConfig[];
+    buildChatCompletionParams?: (input: ChatCompletionCallContext) => ChatCompletionParamsResult;
+    resolveImageDetail?: (input: ChatCompletionCallContext) => ImageDetail | undefined;
+}
+export type ImagePreprocessDefinition = Partial<ImagePreprocessPolicy>;
+interface PlanningPolicy {
+    cacheEnabled: boolean;
+    defaultReplanningCycleLimit: number;
+    supportsActionDeepLocate: boolean;
+}
+export type PlanningAdapter = (PlanningPolicy & {
+    kind: 'standard';
+}) | (PlanningPolicy & {
+    kind: 'custom';
+    planFn: PlanFn;
+});
+export type PlanningDefinition = (Partial<PlanningPolicy> & {
+    kind?: 'standard';
+}) | (Partial<PlanningPolicy> & {
+    kind: 'custom';
+    planFn: PlanFn;
+});
+interface LocatePolicy {
+    supportsSearchArea: boolean;
+}
+type StandardLocateAdapter = LocatePolicy & {
+    kind: 'standard';
+    resultAdapter: LocateResultAdapter;
+};
+type CustomLocateAdapter = LocatePolicy & {
+    kind: 'custom';
+    locateFn: LocateFn;
+};
+export type LocateAdapter = StandardLocateAdapter | CustomLocateAdapter;
+type StandardLocateDefinition = Partial<LocatePolicy> & {
+    kind?: 'standard';
+    resultAdapter?: LocateResultAdapterDefinition;
+};
+type CustomLocateDefinition = Partial<LocatePolicy> & {
+    kind: 'custom';
+    locateFn: LocateFn;
+};
+export type LocateDefinition = StandardLocateDefinition | CustomLocateDefinition;
+export interface ModelAdapter {
+    jsonParser: JsonParser;
+    chatCompletion: ChatCompletionAdapter;
+    imagePreprocess: ImagePreprocessPolicy;
+    planning: PlanningAdapter;
+    locate: LocateAdapter;
+}
+export interface ModelRuntime {
+    config: IModelConfig;
+    adapter: ModelAdapter;
+}
+export interface ModelAdapterDefinition {
+    jsonParser?: JsonParserPreset | JsonParser;
+    chatCompletion?: ChatCompletionDefinition;
+    imagePreprocess?: ImagePreprocessDefinition;
+    planning?: PlanningDefinition;
+    locate?: LocateDefinition;
+}

package/dist/types/ai-model/models/ui-tars/adapter.d.ts ADDED Viewed

@@ -0,0 +1,6 @@
+import type { ModelAdapterDefinition } from '../types';
+export declare const uiTarsAdapters: {
+    'vlm-ui-tars': ModelAdapterDefinition;
+    'vlm-ui-tars-doubao': ModelAdapterDefinition;
+    'vlm-ui-tars-doubao-1.5': ModelAdapterDefinition;
+};

package/dist/types/ai-model/{ui-tars-planning.d.ts → models/ui-tars/planning.d.ts} RENAMED Viewed

@@ -1,14 +1,8 @@
-import type { PlanningAIResponse, UIContext } from '../types';
-import { type IModelConfig } from '@midscene/shared/env';
-import type { ConversationHistory } from './conversation-history';
+import type { PlanningAIResponse } from '../../../types';
+import type { UITarsModelVersion } from '@midscene/shared/env';
+import type { PlanOptions } from '../../workflows/planning/types';
 type ActionType = 'click' | 'left_double' | 'right_single' | 'drag' | 'type' | 'hotkey' | 'finished' | 'scroll' | 'wait';
-export declare function uiTarsPlanning(userInstruction: string, options: {
-    conversationHistory: ConversationHistory;
-    context: UIContext;
-    modelConfig: IModelConfig;
-    actionContext?: string;
-    abortSignal?: AbortSignal;
-}): Promise<PlanningAIResponse>;
+export declare function uiTarsPlanning(userInstruction: string, options: PlanOptions, uiTarsModelVersion: UITarsModelVersion): Promise<PlanningAIResponse>;
 interface BaseAction {
     action_type: ActionType;
     action_inputs: Record<string, any>;
@@ -66,7 +60,9 @@ interface ScrollAction extends BaseAction {
 }
 interface FinishedAction extends BaseAction {
     action_type: 'finished';
-    action_inputs: Record<string, never>;
+    action_inputs: {
+        content?: string;
+    };
 }
 export type Action = ClickAction | LeftDoubleAction | RightSingleAction | DragAction | TypeAction | HotkeyAction | ScrollAction | FinishedAction | WaitAction;
 export {};

package/dist/types/ai-model/prompt/llm-locator.d.ts CHANGED Viewed

@@ -1,3 +1,3 @@
-import type { TModelFamily } from '@midscene/shared/env';
-export declare function systemPromptToLocateElement(modelFamily: TModelFamily | undefined): string;
+import type { LocateResultPromptSpec } from '../shared/model-locate-result';
+export declare function systemPromptToLocateElement(promptSpec: LocateResultPromptSpec): string;
 export declare const findElementPrompt: (targetElementDescription: string) => string;

package/dist/types/ai-model/prompt/llm-planning.d.ts CHANGED Viewed

@@ -1,10 +1,10 @@
 import type { DeviceAction } from '../../types';
-import type { TModelFamily } from '@midscene/shared/env';
-export declare const descriptionForAction: (action: DeviceAction<any>, locatorSchemaTypeDescription: string, includeBbox?: boolean) => string;
-export declare function systemPromptToTaskPlanning({ actionSpace, modelFamily, includeBbox, includeThought, includeSubGoals, }: {
+import type { LocateResultPromptSpec } from '../shared/model-locate-result';
+export declare const descriptionForAction: (action: DeviceAction<any>, locateParamTypeDescription: string, includeLocateInPlanning?: boolean, locatePromptSpec?: LocateResultPromptSpec) => string;
+export declare function systemPromptToTaskPlanning({ actionSpace, locatePromptSpec, includeLocateInPlanning, includeThought, includeSubGoals, }: {
     actionSpace: DeviceAction<any>[];
-    modelFamily: TModelFamily | undefined;
-    includeBbox: boolean;
+    locatePromptSpec?: LocateResultPromptSpec;
+    includeLocateInPlanning: boolean;
     includeThought?: boolean;
     includeSubGoals?: boolean;
 }): Promise<string>;

package/dist/types/ai-model/prompt/llm-section-locator.d.ts CHANGED Viewed

@@ -1,3 +1,3 @@
-import type { TModelFamily } from '@midscene/shared/env';
-export declare function systemPromptToLocateSection(modelFamily: TModelFamily | undefined): string;
+import type { LocateResultPromptSpec } from '../shared/model-locate-result';
+export declare function systemPromptToLocateSection(promptSpec: LocateResultPromptSpec): string;
 export declare const sectionLocatorInstruction: (sectionDescription: string) => string;

package/dist/types/ai-model/prompt/locate-grounding-rules.d.ts ADDED Viewed

@@ -0,0 +1 @@
+export declare function locateGroundingRules(): string;

package/dist/types/ai-model/prompt/locate-param-example.d.ts ADDED Viewed

@@ -0,0 +1,3 @@
+import type { LocateResultPromptSpec } from '../shared/model-locate-result';
+export declare function formatLocateExampleValue(value: unknown): string;
+export declare function locateParamExample(prompt: string, promptSpec?: LocateResultPromptSpec, exampleValue?: unknown): string;

package/dist/types/ai-model/prompt/playwright-generator.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import type { StreamingAIResponse, StreamingCodeGenerationOptions } from '../../types';
-import type { IModelConfig } from '@midscene/shared/env';
+import type { ModelRuntime } from '../models';
 import { type ChromeRecordedEvent, type EventCounts, type EventSummary, type InputDescription, type ProcessedEvent, createEventCounts, createMessageContent, extractInputDescriptions, filterEventsByType, getScreenshotsForLLM, prepareEventSummary, processEventsForLLM, validateEvents } from './yaml-generator';
 export interface PlaywrightGenerationOptions {
     testName?: string;
@@ -19,8 +19,8 @@ export { getScreenshotsForLLM, filterEventsByType, createEventCounts, extractInp
 /**
  * Generates Playwright test code from recorded events
  */
-export declare const generatePlaywrightTest: (events: ChromeRecordedEvent[], options: PlaywrightGenerationOptions, modelConfig: IModelConfig) => Promise<string>;
+export declare const generatePlaywrightTest: (events: ChromeRecordedEvent[], options: PlaywrightGenerationOptions, modelRuntime: ModelRuntime) => Promise<string>;
 /**
  * Generates Playwright test code from recorded events with streaming support
  */
-export declare const generatePlaywrightTestStream: (events: ChromeRecordedEvent[], options: PlaywrightGenerationOptions & StreamingCodeGenerationOptions, modelConfig: IModelConfig) => Promise<StreamingAIResponse>;
+export declare const generatePlaywrightTestStream: (events: ChromeRecordedEvent[], options: PlaywrightGenerationOptions & StreamingCodeGenerationOptions, modelRuntime: ModelRuntime) => Promise<StreamingAIResponse>;

package/dist/types/ai-model/prompt/yaml-generator.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import type { StreamingAIResponse, StreamingCodeGenerationOptions } from '../../types';
-import type { IModelConfig } from '@midscene/shared/env';
+import type { ModelRuntime } from '../models';
 export interface EventCounts {
     navigation: number;
     click: number;
@@ -95,8 +95,8 @@ export declare const validateEvents: (events: ChromeRecordedEvent[]) => void;
 /**
  * Generates YAML test configuration from recorded events using AI
  */
-export declare const generateYamlTest: (events: ChromeRecordedEvent[], options: YamlGenerationOptions, modelConfig: IModelConfig) => Promise<string>;
+export declare const generateYamlTest: (events: ChromeRecordedEvent[], options: YamlGenerationOptions, modelRuntime: ModelRuntime) => Promise<string>;
 /**
  * Generates YAML test configuration from recorded events using AI with streaming support
  */
-export declare const generateYamlTestStream: (events: ChromeRecordedEvent[], options: YamlGenerationOptions & StreamingCodeGenerationOptions, modelConfig: IModelConfig) => Promise<StreamingAIResponse>;
+export declare const generateYamlTestStream: (events: ChromeRecordedEvent[], options: YamlGenerationOptions & StreamingCodeGenerationOptions, modelRuntime: ModelRuntime) => Promise<StreamingAIResponse>;

package/dist/types/ai-model/prompts/locate-result-coordinates.d.ts ADDED Viewed

@@ -0,0 +1,6 @@
+import type { LocateResultBbox, LocateResultPromptSpec } from '../shared/model-locate-result';
+import type { ResolvedLocateResultCoordinates } from '../shared/model-locate-result/types';
+export declare function describeLocateResultValueSchema({ shape, }: ResolvedLocateResultCoordinates): string;
+export declare function locateResultExampleValue(resolvedCoordinates: ResolvedLocateResultCoordinates, region: LocateResultBbox): number[];
+export declare const locateResultExampleRegions: LocateResultBbox[];
+export declare function createLocateResultPromptSpec(resolvedCoordinates: ResolvedLocateResultCoordinates): LocateResultPromptSpec;

package/dist/types/ai-model/service-caller/index.d.ts CHANGED Viewed

@@ -6,52 +6,44 @@ export declare class AIResponseParseError extends Error {
     constructor(message: string, rawResponse: string, usage?: AIUsageInfo);
 }
 import { type IModelConfig, type TModelFamily } from '@midscene/shared/env';
+import OpenAI from 'openai';
 import type { ChatCompletionMessageParam } from 'openai/resources/index';
-import type { AIArgs } from '../../common';
-export declare function callAI(messages: ChatCompletionMessageParam[], modelConfig: IModelConfig, options?: {
+import { type ModelRuntime } from '../models';
+import type { AIArgs } from '../types';
+import type { JsonParserSource } from './json';
+export { extractJSONFromCodeBlock, normalJsonParser, safeParseJson, } from './json';
+export type { JsonParser } from './json';
+export declare function createChatClient({ modelConfig, }: {
+    modelConfig: IModelConfig;
+}): Promise<{
+    completion: OpenAI.Chat.Completions;
+    modelName: string;
+    modelDescription: string;
+    modelFamily: TModelFamily | undefined;
+}>;
+export declare function callAI(messages: ChatCompletionMessageParam[], modelRuntime: ModelRuntime, options?: {
     stream?: boolean;
     onChunk?: StreamingCallback;
     abortSignal?: AbortSignal;
-    forceOriginalImageDetail?: boolean;
+    requiresOriginalImageDetail?: boolean;
 }): Promise<{
     content: string;
     reasoning_content?: string;
     usage?: AIUsageInfo;
     isStreamed: boolean;
 }>;
-export declare function callAIWithObjectResponse<T>(messages: ChatCompletionMessageParam[], modelConfig: IModelConfig, options?: {
+export declare function callAIWithObjectResponse<T>(messages: ChatCompletionMessageParam[], model: IModelConfig | ModelRuntime, options?: {
     abortSignal?: AbortSignal;
+    jsonParserSource?: JsonParserSource;
 }): Promise<{
     content: T;
     contentString: string;
     usage?: AIUsageInfo;
     reasoning_content?: string;
 }>;
-export declare function callAIWithStringResponse(msgs: AIArgs, modelConfig: IModelConfig, options?: {
+export declare function callAIWithStringResponse(msgs: AIArgs, modelRuntime: ModelRuntime, options?: {
     abortSignal?: AbortSignal;
 }): Promise<{
     content: string;
     usage?: AIUsageInfo;
 }>;
-export declare function extractJSONFromCodeBlock(response: string): string;
-export declare function preprocessDoubaoBboxJson(input: string): string;
-export declare function resolveReasoningConfig({ reasoningEnabled, reasoningEffort, reasoningBudget, modelFamily, }: {
-    reasoningEnabled?: boolean;
-    reasoningEffort?: string;
-    reasoningBudget?: number;
-    modelFamily?: TModelFamily;
-}): {
-    config: Record<string, unknown>;
-    debugMessage?: string;
-};
-/**
- * Normalize a parsed JSON object by trimming whitespace from:
- * 1. All object keys (e.g., " prompt " -> "prompt")
- * 2. String values unless the key is explicitly preserved
- * This handles LLM output that may include leading/trailing spaces.
- */
-interface ParseModelResponseJsonOptions {
-    preserveStringValueKeys?: string[];
-}
-export declare function parseModelResponseJson(input: string, modelFamily: TModelFamily | undefined, options?: ParseModelResponseJsonOptions): any;
-export {};

package/dist/types/ai-model/service-caller/json.d.ts ADDED Viewed

@@ -0,0 +1,9 @@
+export declare function extractJSONFromCodeBlock(response: string): string;
+export type JsonParserSource = 'generic-object' | 'planning-action-param' | 'locate' | 'section-locator';
+export interface JsonParserContext {
+    source: JsonParserSource;
+    preserveStringValueKeys?: string[];
+}
+export type JsonParser = (raw: string, context?: JsonParserContext) => unknown;
+export declare function safeParseJson(raw: string, context?: JsonParserContext): any;
+export declare const normalJsonParser: JsonParser;

package/dist/types/ai-model/shared/model-locate-result/bbox.d.ts ADDED Viewed

@@ -0,0 +1,7 @@
+import type { LocateResultBbox, LocateResultContext, PixelBbox, SectionLocatePixelBboxGroup } from './types';
+export declare function maxPixelIndex(size: number): number;
+export declare function normalizedCoordinateToPixelIndex(value: number, normalizedBy: number, size: number): number;
+export declare function mapNormalizedCoordinatesToPixelBbox(coordinates: LocateResultBbox, normalizedBy: number, width: number, height: number): PixelBbox;
+export declare function expandPointToBbox(x: number, y: number, maxX: number, maxY: number, halfSize: number): LocateResultBbox;
+export declare function finalizePixelBbox(pixelBbox: PixelBbox, rawResult: unknown, { preparedSize, contentSize }: LocateResultContext): PixelBbox;
+export declare function finalizeSectionLocatePixelBboxGroup(result: SectionLocatePixelBboxGroup, rawResult: unknown, ctx: LocateResultContext): SectionLocatePixelBboxGroup;

package/dist/types/ai-model/shared/model-locate-result/factory.d.ts ADDED Viewed

@@ -0,0 +1,2 @@
+import type { LocateResultAdapter, LocateResultAdapterDefinition } from './types';
+export declare function createLocateResultAdapter(config: LocateResultAdapterDefinition): LocateResultAdapter;

package/dist/types/ai-model/shared/model-locate-result/index.d.ts ADDED Viewed

@@ -0,0 +1,3 @@
+export { createLocateResultAdapter } from './factory';
+export { unwrapCoordinateListLikeInput } from './parse';
+export type { LocateResultBbox, PixelBbox, RawLocateValue, SectionLocatePixelBboxGroup, CustomLocateResultAdapterDefinition, LocateResultAdapter, LocateResultAdapterDefinition, LocateResultCoordinates, LocateResultContext, LocateResultShape, LocateResultPromptSpec, LocateResultValue, NonEmptyArray, StandardLocateResultAdapterDefinition, } from './types';

package/dist/types/ai-model/shared/model-locate-result/parse.d.ts ADDED Viewed

@@ -0,0 +1,5 @@
+import type { LocateResultValue, ResolvedLocateResultCoordinates } from './types';
+type CoordinateListLikeInput = number[] | string[] | string | (number[] | string[])[];
+export declare function unwrapCoordinateListLikeInput(coordinateList: CoordinateListLikeInput): number[] | string[] | string;
+export declare function parseNumericLocateResult(resolvedCoordinates: ResolvedLocateResultCoordinates, input: unknown): LocateResultValue;
+export {};

package/dist/types/ai-model/shared/model-locate-result/pixel-bbox-mapper.d.ts ADDED Viewed

@@ -0,0 +1,7 @@
+import type { LocateResultValue, PixelBbox, ResolvedLocateResultCoordinates } from './types';
+export declare function mapLocateResultToPixelBboxByCoordinates(result: LocateResultValue, { preparedSize }: {
+    preparedSize: {
+        width: number;
+        height: number;
+    };
+}, resolvedCoordinates: ResolvedLocateResultCoordinates): PixelBbox;

package/dist/types/ai-model/shared/model-locate-result/types.d.ts ADDED Viewed

@@ -0,0 +1,157 @@
+import type { Bbox } from '../../../types';
+export type { Bbox };
+export type LocateResultBbox = Bbox;
+export type PixelBbox = Bbox;
+export type NonEmptyArray<T> = [T, ...T[]];
+export type RawLocateValue = unknown;
+export type LocateResultValue = {
+    type: 'bbox';
+    coordinates: LocateResultBbox;
+} | {
+    type: 'point';
+    coordinates: [number, number];
+};
+export type LocateResultShape = 'bbox' | 'point';
+export interface LocateResultContext {
+    preparedSize: {
+        width: number;
+        height: number;
+    };
+    contentSize?: {
+        width: number;
+        height: number;
+    };
+}
+export interface LocateResultPromptSpec {
+    resultKey: string;
+    resultValueSchema: string;
+    resultValueDescription: string;
+    resultNoun: string;
+    resultNounPlural: string;
+    exampleValues: NonEmptyArray<unknown>;
+}
+export interface SectionLocatePixelBboxGroup {
+    target: PixelBbox;
+    references?: PixelBbox[];
+}
+export interface LocateResultAdapter {
+    kind: 'standard' | 'custom';
+    promptSpec: LocateResultPromptSpec;
+    /**
+     * Converts a locate payload to a pixel bbox. This adapter intentionally does
+     * not interpret model-level `error` / `errors` fields; callers decide whether
+     * those fields should stop the locate flow before invoking the adapter.
+     */
+    adaptElementLocateResultToPixelBbox(input: unknown, ctx: LocateResultContext): PixelBbox;
+    /**
+     * Converts a section locate payload to target/reference pixel bboxes. This
+     * adapter intentionally does not interpret model-level `error` / `errors`
+     * fields; callers own that policy before invoking the adapter.
+     */
+    adaptSectionLocateResultToPixelBboxGroup(input: unknown, ctx: LocateResultContext): SectionLocatePixelBboxGroup;
+    adaptPlanningParamToPixelBbox(planningParam: unknown, ctx: LocateResultContext): PixelBbox;
+}
+export interface LocateResultCoordinates {
+    shape: LocateResultShape;
+    order?: 'xy' | 'yx';
+    normalizedBy?: number;
+}
+export interface ResolvedLocateResultCoordinates {
+    shape: LocateResultShape;
+    order: 'xy' | 'yx';
+    normalizedBy?: number;
+}
+export type RawLocateValueParser = (input: RawLocateValue) => LocateResultValue;
+export type LocateResultPixelBboxMapper = (result: LocateResultValue, ctx: LocateResultContext) => PixelBbox;
+/**
+ * Declarative config for the standard locate workflow.
+ *
+ * The standard workflow has three steps:
+ * 1. `coordinates` is expanded into prompt wording, a default
+ *    raw result parser, and a default pixel bbox mapper.
+ * 2. `parseRawLocateValue` converts that raw result value into Midscene's
+ *    internal `LocateResultValue` shape:
+ *    `{ type: 'bbox' | 'point', coordinates: ... }`. Omit it when the model
+ *    returns a plain numeric bbox/point matching `coordinates`; provide it when the
+ *    model needs repair or fallback handling.
+ * 3. `mapLocateResultToPixelBbox` converts the parsed result into a pixel bbox
+ *    `[left, top, right, bottom]`. Omit it when `coordinates` is enough to describe
+ *    the coordinate system and order; provide it only for model-specific
+ *    conversion rules.
+ *
+ * Standard adapters intentionally use fixed result fields (`bbox` / `bbox_2d` /
+ * `point` and `references_*`). A previous design considered `pickRawLocateValue`
+ * for custom keys, but normal locate, search-area references, and future
+ * locateAll responses may return different shapes (single arrays, nested
+ * arrays, or object arrays), so a generic picker contract was unclear. A
+ * declarative `resultKeys` option is one possible future direction, but without
+ * a concrete need we avoid that over-design for now.
+ *
+ * Example 1: a GLM-like model that directly matches the standard coordinates.
+ *
+ * ```ts
+ * resultAdapter: {
+ *   coordinates: { shape: 'bbox', order: 'xy', normalizedBy: 1000 },
+ * }
+ * ```
+ *
+ * Example 2: Qwen 2.5 returns pixel coordinates, but may return a point-like
+ * value that needs custom parsing/fallback. The default pixel bbox mapper is
+ * bypassed only if custom fallback sizing is required.
+ *
+ * ```ts
+ * resultAdapter: {
+ *   coordinates: { shape: 'bbox', order: 'xy' },
+ *   parseRawLocateValue: parseQwen25RawLocateValue,
+ *   mapLocateResultToPixelBbox: normalizeQwen25ResultToPixelBbox,
+ * }
+ * ```
+ *
+ * Example 3: a model with a custom raw value shape can keep the standard
+ * workflow while replacing parsing and mapping.
+ *
+ * ```ts
+ * resultAdapter: {
+ *   coordinates: { shape: 'bbox', order: 'xy' },
+ *   parseRawLocateValue: (raw) => ({
+ *     type: 'bbox',
+ *     coordinates: [
+ *       Number((raw as any).left),
+ *       Number((raw as any).top),
+ *       Number((raw as any).right),
+ *       Number((raw as any).bottom),
+ *     ],
+ *   }),
+ *   mapLocateResultToPixelBbox: (result) => result.coordinates,
+ * }
+ * ```
+ */
+export type StandardLocateResultAdapterDefinition = {
+    kind?: 'standard';
+    /**
+     * Common locate result coordinates shorthand. This is the preferred config surface
+     * for normal models because it keeps result type, coordinate system, and
+     * coordinate order in one orthogonal field.
+     */
+    coordinates: LocateResultCoordinates;
+    /**
+     * Parses the picked raw value into a `LocateResultValue`. This function
+     * should handle response repair and bbox-vs-point fallback only;
+     * coordinate-system conversion should stay in `mapLocateResultToPixelBbox`.
+     */
+    parseRawLocateValue?: RawLocateValueParser;
+    /**
+     * Maps the parsed result into a pixel bbox. Most models should omit this
+     * and let `coordinates` drive the default conversion. Provide it only when point
+     * fallback size, clipping, or coordinate semantics are model-specific.
+     */
+    mapLocateResultToPixelBbox?: LocateResultPixelBboxMapper;
+};
+export type CustomLocateResultAdapterDefinition = {
+    kind: 'custom';
+    promptSpec: LocateResultPromptSpec;
+    adaptElementLocateResultToPixelBbox(input: unknown, ctx: LocateResultContext): PixelBbox;
+    adaptSectionLocateResultToPixelBboxGroup(input: unknown, ctx: LocateResultContext): SectionLocatePixelBboxGroup;
+    adaptPlanningParamToPixelBbox(planningParam: unknown, ctx: LocateResultContext): PixelBbox;
+};
+export type LocateResultAdapterDefinition = StandardLocateResultAdapterDefinition | CustomLocateResultAdapterDefinition;

package/dist/types/ai-model/types.d.ts ADDED Viewed

@@ -0,0 +1,2 @@
+import type { ChatCompletionMessageParam } from 'openai/resources/index';
+export type AIArgs = ChatCompletionMessageParam[];