npm - @midscene/core - Versions diffs - 0.26.7-beta-20250818081955.0 → 0.26.7-beta-20250820150415.0 - Mend

@midscene/core 0.26.7-beta-20250818081955.0 → 0.26.7-beta-20250820150415.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65)

package/dist/es/ai-model/common.mjs +130 -22
package/dist/es/ai-model/common.mjs.map +1 -1
package/dist/es/ai-model/index.mjs +3 -3
package/dist/es/ai-model/inspect.mjs +28 -16
package/dist/es/ai-model/inspect.mjs.map +1 -1
package/dist/es/ai-model/llm-planning.mjs +25 -23
package/dist/es/ai-model/llm-planning.mjs.map +1 -1
package/dist/es/ai-model/prompt/llm-planning.mjs +69 -23
package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
package/dist/es/ai-model/prompt/playwright-generator.mjs +9 -3
package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
package/dist/es/ai-model/prompt/util.mjs +2 -2
package/dist/es/ai-model/prompt/util.mjs.map +1 -1
package/dist/es/ai-model/prompt/yaml-generator.mjs +9 -3
package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
package/dist/es/ai-model/service-caller/index.mjs +72 -118
package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
package/dist/es/ai-model/ui-tars-planning.mjs +5 -5
package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
package/dist/es/index.mjs +3 -2
package/dist/es/index.mjs.map +1 -1
package/dist/es/insight/index.mjs +13 -61
package/dist/es/insight/index.mjs.map +1 -1
package/dist/es/types.mjs.map +1 -1
package/dist/es/utils.mjs +5 -6
package/dist/es/utils.mjs.map +1 -1
package/dist/lib/ai-model/common.js +166 -28
package/dist/lib/ai-model/common.js.map +1 -1
package/dist/lib/ai-model/index.js +31 -10
package/dist/lib/ai-model/inspect.js +27 -15
package/dist/lib/ai-model/inspect.js.map +1 -1
package/dist/lib/ai-model/llm-planning.js +24 -22
package/dist/lib/ai-model/llm-planning.js.map +1 -1
package/dist/lib/ai-model/prompt/llm-planning.js +71 -25
package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
package/dist/lib/ai-model/prompt/playwright-generator.js +9 -3
package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
package/dist/lib/ai-model/prompt/util.js +2 -2
package/dist/lib/ai-model/prompt/util.js.map +1 -1
package/dist/lib/ai-model/prompt/yaml-generator.js +9 -3
package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
package/dist/lib/ai-model/service-caller/index.js +75 -124
package/dist/lib/ai-model/service-caller/index.js.map +1 -1
package/dist/lib/ai-model/ui-tars-planning.js +5 -5
package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
package/dist/lib/index.js +25 -3
package/dist/lib/index.js.map +1 -1
package/dist/lib/insight/index.js +10 -58
package/dist/lib/insight/index.js.map +1 -1
package/dist/lib/types.js.map +1 -1
package/dist/lib/utils.js +4 -5
package/dist/lib/utils.js.map +1 -1
package/dist/types/ai-model/common.d.ts +517 -8
package/dist/types/ai-model/index.d.ts +2 -2
package/dist/types/ai-model/inspect.d.ts +4 -1
package/dist/types/ai-model/llm-planning.d.ts +1 -1
package/dist/types/ai-model/prompt/llm-planning.d.ts +2 -2
package/dist/types/ai-model/prompt/util.d.ts +2 -1
package/dist/types/ai-model/service-caller/index.d.ts +6 -6
package/dist/types/ai-model/ui-tars-planning.d.ts +3 -1
package/dist/types/index.d.ts +3 -1
package/dist/types/insight/index.d.ts +2 -4
package/dist/types/types.d.ts +9 -31
package/dist/types/yaml.d.ts +6 -2
package/package.json +4 -3

package/dist/types/ai-model/llm-planning.d.ts CHANGED Viewed

@@ -3,7 +3,7 @@ import { callAiFn } from './common';
 export declare function plan(userInstruction: string, opts: {
     context: UIContext;
     pageType: PageType;
-    actionSpace: DeviceAction[];
+    actionSpace: DeviceAction<any>[];
     callAI?: typeof callAiFn<PlanningAIResponse>;
     log?: string;
     actionContext?: string;

package/dist/types/ai-model/prompt/llm-planning.d.ts CHANGED Viewed

@@ -2,9 +2,9 @@ import type { DeviceAction } from '../../types';
 import { PromptTemplate } from '@langchain/core/prompts';
 import type { vlLocateMode } from '@midscene/shared/env';
 import type { ResponseFormatJSONSchema } from 'openai/resources/index';
-export declare const descriptionForAction: (action: DeviceAction, locatorScheme: string) => string;
+export declare const descriptionForAction: (action: DeviceAction<any>, locatorSchemaTypeDescription: string) => string;
 export declare function systemPromptToTaskPlanning({ actionSpace, vlMode, }: {
-    actionSpace: DeviceAction[];
+    actionSpace: DeviceAction<any>[];
     vlMode: ReturnType<typeof vlLocateMode>;
 }): Promise<string>;
 export declare const planSchema: ResponseFormatJSONSchema;

package/dist/types/ai-model/prompt/util.d.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import type { BaseElement, ElementTreeNode, Size, UIContext } from '../../types';
+import { type IModelPreferences } from '@midscene/shared/env';
 export declare function describeSize(size: Size): string;
 export declare function describeElement(elements: (Pick<BaseElement, 'rect' | 'content'> & {
     id: string;
@@ -19,7 +20,7 @@ export declare function distance(point1: {
     y: number;
 }): number;
 export declare const samplePageDescription = "\nAnd the page is described as follows:\n====================\nThe size of the page: 1280 x 720\nSome of the elements are marked with a rectangle in the screenshot corresponding to the markerId, some are not.\n\nDescription of all the elements in screenshot:\n<div id=\"969f1637\" markerId=\"1\" left=\"100\" top=\"100\" width=\"100\" height=\"100\"> // The markerId indicated by the rectangle label in the screenshot\n  <h4 id=\"b211ecb2\" markerId=\"5\" left=\"150\" top=\"150\" width=\"90\" height=\"60\">\n    The username is accepted\n  </h4>\n  ...many more\n</div>\n====================\n";
-export declare function describeUserPage<ElementType extends BaseElement = BaseElement>(context: Omit<UIContext<ElementType>, 'describer'>, opt?: {
+export declare function describeUserPage<ElementType extends BaseElement = BaseElement>(context: Omit<UIContext<ElementType>, 'describer'>, modelPreferences: IModelPreferences, opt?: {
     truncateTextLength?: number;
     filterNonTextContent?: boolean;
     domIncluded?: boolean | 'visible-only';

package/dist/types/ai-model/service-caller/index.d.ts CHANGED Viewed

@@ -1,11 +1,10 @@
 import { type AIUsageInfo } from '../../types';
 import type { StreamingCallback } from '../../types';
+import { type IModelPreferences } from '@midscene/shared/env';
 import OpenAI from 'openai';
 import type { ChatCompletionMessageParam } from 'openai/resources/index';
 import { AIActionType, type AIArgs } from '../common';
-export declare function checkAIConfig(): boolean;
-export declare function getModelName(): string;
-export declare function call(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType, responseFormat?: OpenAI.ChatCompletionCreateParams['response_format'] | OpenAI.ResponseFormatJSONObject, options?: {
+export declare function call(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType, modelPreferences: IModelPreferences, options?: {
     stream?: boolean;
     onChunk?: StreamingCallback;
 }): Promise<{
@@ -13,14 +12,15 @@ export declare function call(messages: ChatCompletionMessageParam[], AIActionTyp
     usage?: AIUsageInfo;
     isStreamed: boolean;
 }>;
-export declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType): Promise<{
+export declare const getResponseFormat: (modelName: string, AIActionTypeValue: AIActionType) => OpenAI.ChatCompletionCreateParams["response_format"] | OpenAI.ResponseFormatJSONObject;
+export declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType, modelPreferences: IModelPreferences): Promise<{
     content: T;
     usage?: AIUsageInfo;
 }>;
-export declare function callAiFnWithStringResponse<T>(msgs: AIArgs, AIActionTypeValue: AIActionType): Promise<{
+export declare function callAiFnWithStringResponse<T>(msgs: AIArgs, AIActionTypeValue: AIActionType, modelPreferences: IModelPreferences): Promise<{
     content: string;
     usage?: AIUsageInfo;
 }>;
 export declare function extractJSONFromCodeBlock(response: string): string;
 export declare function preprocessDoubaoBboxJson(input: string): string;
-export declare function safeParseJson(input: string): any;
+export declare function safeParseJson(input: string, modelPreferences: IModelPreferences): any;

package/dist/types/ai-model/ui-tars-planning.d.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import type { AIUsageInfo, MidsceneYamlFlowItem, PlanningAction, Size } from '../types';
+import { type IModelPreferences } from '@midscene/shared/env';
 import { actionParser } from '@ui-tars/action-parser';
 import type { ChatCompletionMessageParam } from 'openai/resources/index';
 type ActionType = 'click' | 'drag' | 'type' | 'hotkey' | 'finished' | 'scroll' | 'wait' | 'androidBackButton' | 'androidHomeButton' | 'androidRecentAppsButton' | 'androidLongPress' | 'androidPull';
@@ -9,6 +10,7 @@ export declare function vlmPlanning(options: {
         width: number;
         height: number;
     };
+    modelPreferences: IModelPreferences;
 }): Promise<{
     actions: PlanningAction<any>[];
     actionsFromModel: ReturnType<typeof actionParser>['parsed'];
@@ -72,5 +74,5 @@ interface AndroidLongPressAction extends BaseAction {
     };
 }
 export type Action = ClickAction | DragAction | TypeAction | HotkeyAction | ScrollAction | FinishedAction | WaitAction | AndroidLongPressAction;
-export declare function resizeImageForUiTars(imageBase64: string, size: Size): Promise<string>;
+export declare function resizeImageForUiTars(imageBase64: string, size: Size, modelPreferences: IModelPreferences): Promise<string>;
 export {};

package/dist/types/index.d.ts CHANGED Viewed

@@ -1,9 +1,11 @@
+import { z } from 'zod';
 import { Executor } from './ai-model/action-executor';
 import Insight from './insight/index';
 import { getVersion } from './utils';
-export { plan, describeUserPage, AiLocateElement, } from './ai-model/index';
+export { plan, describeUserPage, AiLocateElement, getMidsceneLocationSchema, type MidsceneLocationResultType, PointSchema, SizeSchema, RectSchema, TMultimodalPromptSchema, TUserPromptSchema, type TMultimodalPrompt, type TUserPrompt, } from './ai-model/index';
 export { getAIConfig, MIDSCENE_MODEL_NAME } from '@midscene/shared/env';
 export type * from './types';
+export { z };
 export default Insight;
 export { Executor, Insight, getVersion };
 export type { MidsceneYamlScript, MidsceneYamlTask, MidsceneYamlFlowItem, MidsceneYamlFlowItemAIRightClick, MidsceneYamlConfigResult, LocateOption, DetailedLocateParam, } from './yaml';

package/dist/types/insight/index.d.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 import { callAiFn } from '../ai-model/common';
-import type { AIDescribeElementResponse, AIElementResponse, AIUsageInfo, BaseElement, DetailedLocateParam, DumpSubscriber, InsightAction, InsightExtractOption, InsightExtractParam, InsightOptions, InsightTaskInfo, LocateResult, Rect, TMultimodalPrompt, UIContext } from '../types';
+import type { AIElementResponse, AIUsageInfo, BaseElement, DetailedLocateParam, DumpSubscriber, InsightAction, InsightExtractOption, InsightExtractParam, InsightOptions, InsightTaskInfo, LocateResult, UIContext } from '../types';
+import type { TMultimodalPrompt } from '../ai-model/common';
 export interface LocateOpts {
     context?: UIContext<BaseElement>;
     callAI?: typeof callAiFn<AIElementResponse>;
@@ -19,7 +20,4 @@ export default class Insight<ElementType extends BaseElement = BaseElement, Cont
         thought?: string;
         usage?: AIUsageInfo;
     }>;
-    describe(target: Rect | [number, number], opt?: {
-        deepThink?: boolean;
-    }): Promise<Pick<AIDescribeElementResponse, 'description'>>;
 }

package/dist/types/types.d.ts CHANGED Viewed

@@ -1,6 +1,8 @@
 import type { NodeType } from '@midscene/shared/constants';
 import type { BaseElement, ElementTreeNode, Rect, Size } from '@midscene/shared/types';
 import type { ChatCompletionMessageParam } from 'openai/resources/index';
+import type { z } from 'zod';
+import type { TUserPrompt } from './ai-model/common';
 import type { DetailedLocateParam, MidsceneYamlFlowItem } from './yaml';
 export type { ElementTreeNode, BaseElement, Rect, Size, Point, } from '@midscene/shared/types';
 export * from './yaml';
@@ -106,10 +108,10 @@ export type EnsureObject<T> = {
 export type InsightAction = 'locate' | 'extract' | 'assert' | 'describe';
 export type InsightExtractParam = string | Record<string, string>;
 export type LocateResultElement = {
-    id: string;
-    indexId?: number;
     center: [number, number];
     rect: Rect;
+    id: string;
+    indexId?: number;
     xpaths: string[];
     attributes: {
         nodeType: NodeType;
@@ -189,7 +191,7 @@ export interface PlanningLocateParam extends DetailedLocateParam {
 }
 export interface PlanningAction<ParamType = any> {
     thought?: string;
-    type: 'Locate' | 'Tap' | 'RightClick' | 'Hover' | 'Drag' | 'Input' | 'KeyboardPress' | 'Scroll' | 'Error' | 'Assert' | 'AssertWithoutThrow' | 'Sleep' | 'Finished' | 'AndroidBackButton' | 'AndroidHomeButton' | 'AndroidRecentAppsButton' | 'AndroidLongPress' | 'AndroidPull';
+    type: string;
     param: ParamType;
     locate?: PlanningLocateParam | null;
 }
@@ -241,7 +243,6 @@ export interface Color {
 }
 export interface BaseAgentParserOpt {
     selector?: string;
-    ignoreMarker?: boolean;
 }
 export interface PuppeteerParserOpt extends BaseAgentParserOpt {
 }
@@ -369,33 +370,10 @@ export interface StreamingAIResponse {
     /** Whether the response was streamed */
     isStreamed: boolean;
 }
-export type TMultimodalPrompt = {
-    /**
-     * Support use image to inspect elements.
-     * The "images" field is an object that uses image name as key and image url as value.
-     * The image url can be a local path, a http link , or a base64 string.
-     */
-    images?: {
-        name: string;
-        url: string;
-    }[];
-    /**
-     * By default, the image url in the "images" filed starts with `https://` or `http://` will be directly sent to the LLM.
-     * In case the images are not accessible to the LLM (One common case is that image url is internal network only.), you can enable this option.
-     * Then image will be download and convert to base64 format.
-     */
-    convertHttpImage2Base64?: boolean;
-};
-export type TUserPrompt = string | ({
-    prompt: string;
-} & Partial<TMultimodalPrompt>);
-export interface DeviceAction<ParamType = any> {
+export interface DeviceAction<T = {}> {
     name: string;
-    interfaceAlias?: string;
     description?: string;
-    paramSchema?: string;
-    paramDescription?: string;
-    location?: 'required' | 'optional' | false;
-    whatToLocate?: string;
-    call: (context: ExecutorContext, param: ParamType) => Promise<void> | void;
+    interfaceAlias?: string;
+    paramSchema?: z.ZodType<T>;
+    call: (param: T, context: ExecutorContext) => Promise<void> | void;
 }

package/dist/types/yaml.d.ts CHANGED Viewed

@@ -1,6 +1,8 @@
-import type { Rect, TUserPrompt } from './types';
+import type { TUserPrompt } from './ai-model/common';
+import type { Rect } from './types';
 import type { BaseElement, UIContext } from './types';
 export interface LocateOption {
+    prompt?: TUserPrompt;
     deepThink?: boolean;
     cacheable?: boolean;
     xpath?: string;
@@ -11,6 +13,7 @@ export interface InsightExtractOption {
     screenshotIncluded?: boolean;
     returnThought?: boolean;
     isWaitForAssert?: boolean;
+    doNotThrowError?: boolean;
 }
 export interface ReferenceImage {
     base64: string;
@@ -72,6 +75,7 @@ export interface MidsceneYamlFlowItemAIAction {
 export interface MidsceneYamlFlowItemAIAssert {
     aiAssert: string;
     errorMessage?: string;
+    name?: string;
 }
 export interface MidsceneYamlFlowItemAIQuery extends InsightExtractOption {
     aiQuery: string;
@@ -116,7 +120,7 @@ export interface MidsceneYamlFlowItemAIInput extends LocateOption {
 }
 export interface MidsceneYamlFlowItemAIKeyboardPress extends LocateOption {
     aiKeyboardPress: TUserPrompt | undefined;
-    key: string;
+    keyName: string;
 }
 export interface MidsceneYamlFlowItemAIScroll extends LocateOption, ScrollParam {
     aiScroll: TUserPrompt | undefined;

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@midscene/core",
   "description": "Automate browser actions, extract data, and perform assertions using AI. It offers JavaScript SDK, Chrome extension, and support for scripting in YAML. See https://midscenejs.com/ for details.",
-  "version": "0.26.7-beta-20250818081955.0",
+  "version": "0.26.7-beta-20250820150415.0",
   "repository": "https://github.com/web-infra-dev/midscene",
   "homepage": "https://midscenejs.com/",
   "main": "./dist/lib/index.js",
@@ -60,8 +60,9 @@
     "langsmith": "0.3.7",
     "openai": "4.81.0",
     "socks-proxy-agent": "8.0.4",
-    "@midscene/recorder": "0.26.7-beta-20250818081955.0",
-    "@midscene/shared": "0.26.7-beta-20250818081955.0"
+    "zod": "3.24.3",
+    "@midscene/recorder": "0.26.7-beta-20250820150415.0",
+    "@midscene/shared": "0.26.7-beta-20250820150415.0"
   },
   "devDependencies": {
     "@microsoft/api-extractor": "^7.52.10",