npm - @midscene/shared - Versions diffs - 1.8.5-beta-20260525033347.0 → 1.8.5 - Mend

@midscene/shared 1.8.5-beta-20260525033347.0 → 1.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

package/dist/es/cli/cli-args.mjs +41 -2
package/dist/es/cli/cli-runner.mjs +4 -3
package/dist/es/constants/example-code.mjs +1 -1
package/dist/es/env/constants.mjs +5 -1
package/dist/es/env/parse-model-config.mjs +11 -2
package/dist/es/env/types.mjs +7 -4
package/dist/es/mcp/tool-generator.mjs +14 -11
package/dist/es/mcp/user-prompt.mjs +66 -0
package/dist/lib/cli/cli-args.js +44 -2
package/dist/lib/cli/cli-runner.js +3 -2
package/dist/lib/constants/example-code.js +1 -1
package/dist/lib/env/constants.js +4 -0
package/dist/lib/env/parse-model-config.js +10 -1
package/dist/lib/env/types.js +12 -3
package/dist/lib/mcp/tool-generator.js +17 -11
package/dist/lib/mcp/user-prompt.js +103 -0
package/dist/types/cli/cli-args.d.ts +8 -0
package/dist/types/constants/example-code.d.ts +1 -1
package/dist/types/env/constants.d.ts +1 -0
package/dist/types/env/types.d.ts +15 -5
package/dist/types/mcp/tool-generator.d.ts +2 -0
package/dist/types/mcp/types.d.ts +21 -1
package/dist/types/mcp/user-prompt.d.ts +13 -0
package/package.json +1 -1
package/src/cli/cli-args.ts +65 -1
package/src/cli/cli-runner.ts +10 -2
package/src/constants/example-code.ts +1 -1
package/src/env/constants.ts +9 -0
package/src/env/parse-model-config.ts +12 -0
package/src/env/types.ts +19 -3
package/src/img/transform.ts +1 -1
package/src/mcp/tool-generator.ts +15 -11
package/src/mcp/types.ts +21 -1
package/src/mcp/user-prompt.ts +102 -0

package/src/constants/example-code.ts CHANGED Viewed

@@ -5,7 +5,7 @@ export const PLAYWRIGHT_EXAMPLE_CODE = `
 IMPORTANT: Follow these exact type signatures for AI functions:
 // Type signatures for AI functions:
-aiAct(prompt: string, options?: { cacheable?: boolean, deepThink?: 'unset' | true | false }): Promise<void>
+aiAct(prompt: string, options?: { cacheable?: boolean, deepThink?: 'unset' | true | false }): Promise<string | undefined>
 aiInput(text: string, locate: string, options?: { deepLocate?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
 aiTap(locate: string, options?: { deepLocate?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
 aiHover(locate: string, options?: { deepLocate?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>

package/src/env/constants.ts CHANGED Viewed

@@ -5,6 +5,7 @@ import {
   MIDSCENE_INSIGHT_MODEL_FAMILY,
   MIDSCENE_INSIGHT_MODEL_HTTP_PROXY,
   MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON,
+  MIDSCENE_INSIGHT_MODEL_MAX_TOKENS,
   MIDSCENE_INSIGHT_MODEL_NAME,
   MIDSCENE_INSIGHT_MODEL_REASONING_BUDGET,
   MIDSCENE_INSIGHT_MODEL_REASONING_EFFORT,
@@ -20,6 +21,7 @@ import {
   MIDSCENE_MODEL_FAMILY,
   MIDSCENE_MODEL_HTTP_PROXY,
   MIDSCENE_MODEL_INIT_CONFIG_JSON,
+  MIDSCENE_MODEL_MAX_TOKENS,
   MIDSCENE_MODEL_NAME,
   MIDSCENE_MODEL_REASONING_BUDGET,
   MIDSCENE_MODEL_REASONING_EFFORT,
@@ -38,6 +40,7 @@ import {
   MIDSCENE_PLANNING_MODEL_FAMILY,
   MIDSCENE_PLANNING_MODEL_HTTP_PROXY,
   MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON,
+  MIDSCENE_PLANNING_MODEL_MAX_TOKENS,
   MIDSCENE_PLANNING_MODEL_NAME,
   MIDSCENE_PLANNING_MODEL_REASONING_BUDGET,
   MIDSCENE_PLANNING_MODEL_REASONING_EFFORT,
@@ -49,6 +52,7 @@ import {
   MIDSCENE_PLANNING_MODEL_TIMEOUT,
   OPENAI_API_KEY,
   OPENAI_BASE_URL,
+  OPENAI_MAX_TOKENS,
 } from './types';
 interface IModelConfigKeys {
@@ -65,6 +69,7 @@ interface IModelConfigKeys {
   openaiApiKey: string;
   openaiExtraConfig: string;
   extraBody: string;
+  maxTokens: string;
   /**
    * Extra
    */
@@ -101,6 +106,7 @@ export const INSIGHT_MODEL_CONFIG_KEYS: IModelConfigKeys = {
   openaiApiKey: MIDSCENE_INSIGHT_MODEL_API_KEY,
   openaiExtraConfig: MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON,
   extraBody: MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON,
+  maxTokens: MIDSCENE_INSIGHT_MODEL_MAX_TOKENS,
   /**
    * Extra
    */
@@ -137,6 +143,7 @@ export const PLANNING_MODEL_CONFIG_KEYS: IModelConfigKeys = {
   openaiApiKey: MIDSCENE_PLANNING_MODEL_API_KEY,
   openaiExtraConfig: MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON,
   extraBody: MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON,
+  maxTokens: MIDSCENE_PLANNING_MODEL_MAX_TOKENS,
   /**
    * Extra
    */
@@ -174,6 +181,7 @@ export const DEFAULT_MODEL_CONFIG_KEYS: IModelConfigKeys = {
   openaiApiKey: MIDSCENE_MODEL_API_KEY,
   openaiExtraConfig: MIDSCENE_MODEL_INIT_CONFIG_JSON,
   extraBody: MIDSCENE_MODEL_EXTRA_BODY_JSON,
+  maxTokens: MIDSCENE_MODEL_MAX_TOKENS,
   /**
    * Extra
    */
@@ -211,6 +219,7 @@ export const DEFAULT_MODEL_CONFIG_KEYS_LEGACY: IModelConfigKeys = {
   openaiApiKey: OPENAI_API_KEY,
   openaiExtraConfig: MIDSCENE_OPENAI_INIT_CONFIG_JSON,
   extraBody: MIDSCENE_MODEL_EXTRA_BODY_JSON,
+  maxTokens: OPENAI_MAX_TOKENS,
   /**
    * Extra
    */

package/src/env/parse-model-config.ts CHANGED Viewed

@@ -18,6 +18,7 @@ import {
   MODEL_FAMILY_VALUES,
   OPENAI_API_KEY,
   OPENAI_BASE_URL,
+  OPENAI_MAX_TOKENS,
   type TIntent,
   type TModelFamily,
   UITarsModelVersion,
@@ -200,6 +201,9 @@ export const parseOpenaiSdkConfig = ({
   const legacyOpenaiExtraConfig = useLegacyLogic
     ? provider[MIDSCENE_OPENAI_INIT_CONFIG_JSON]
     : undefined;
+  const legacyMaxTokens = useLegacyLogic
+    ? provider[OPENAI_MAX_TOKENS]
+    : undefined;
   const legacyModelFamily = useLegacyLogic
     ? legacyConfigToModelFamily(provider)
     : undefined;
@@ -222,6 +226,13 @@ export const parseOpenaiSdkConfig = ({
   );
   const extraBodyStr: string | undefined = provider[keys.extraBody];
   const extraBody = parseJson(keys.extraBody, extraBodyStr);
+  const maxTokensStr = provider[keys.maxTokens] || legacyMaxTokens;
+  const maxTokens = (() => {
+    const val = maxTokensStr?.trim();
+    if (!val) return undefined;
+    const num = Number(val);
+    return Number.isFinite(num) ? num : undefined;
+  })();
   const temperature = provider[keys.temperature]
     ? Number(provider[keys.temperature])
     : 0;
@@ -239,6 +250,7 @@ export const parseOpenaiSdkConfig = ({
     openaiApiKey,
     openaiExtraConfig: normalizeOpenaiExtraConfig(openaiExtraConfig),
     extraBody,
+    maxTokens,
     modelFamily,
     uiTarsModelVersion,
     modelName: modelName!,

package/src/env/types.ts CHANGED Viewed

@@ -106,6 +106,8 @@ export const MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON =
   'MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON';
 export const MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON =
   'MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON';
+export const MIDSCENE_INSIGHT_MODEL_MAX_TOKENS =
+  'MIDSCENE_INSIGHT_MODEL_MAX_TOKENS';
 export const MIDSCENE_INSIGHT_MODEL_TIMEOUT = 'MIDSCENE_INSIGHT_MODEL_TIMEOUT';
 export const MIDSCENE_INSIGHT_MODEL_TEMPERATURE =
   'MIDSCENE_INSIGHT_MODEL_TEMPERATURE';
@@ -135,6 +137,8 @@ export const MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON =
   'MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON';
 export const MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON =
   'MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON';
+export const MIDSCENE_PLANNING_MODEL_MAX_TOKENS =
+  'MIDSCENE_PLANNING_MODEL_MAX_TOKENS';
 export const MIDSCENE_PLANNING_MODEL_TIMEOUT =
   'MIDSCENE_PLANNING_MODEL_TIMEOUT';
 export const MIDSCENE_PLANNING_MODEL_TEMPERATURE =
@@ -178,14 +182,11 @@ export const BOOLEAN_ENV_KEYS = [
 ] as const;
 export const NUMBER_ENV_KEYS = [
-  MIDSCENE_MODEL_MAX_TOKENS,
   MIDSCENE_CACHE_MAX_FILENAME_LENGTH,
   MIDSCENE_REPLANNING_CYCLE_LIMIT,
 ] as const;
 export const STRING_ENV_KEYS = [
-  MIDSCENE_MODEL_MAX_TOKENS,
-  OPENAI_MAX_TOKENS,
   MIDSCENE_ADB_PATH,
   MIDSCENE_ADB_REMOTE_HOST,
   MIDSCENE_ADB_REMOTE_PORT,
@@ -224,6 +225,7 @@ export const MODEL_ENV_KEYS = [
   MIDSCENE_MODEL_BASE_URL,
   MIDSCENE_MODEL_SOCKS_PROXY,
   MIDSCENE_MODEL_HTTP_PROXY,
+  MIDSCENE_MODEL_MAX_TOKENS,
   MIDSCENE_MODEL_TIMEOUT,
   MIDSCENE_MODEL_TEMPERATURE,
   MIDSCENE_MODEL_RETRY_COUNT,
@@ -240,6 +242,7 @@ export const MODEL_ENV_KEYS = [
   // model default legacy
   OPENAI_API_KEY,
   OPENAI_BASE_URL,
+  OPENAI_MAX_TOKENS,
   MIDSCENE_OPENAI_INIT_CONFIG_JSON,
   MIDSCENE_OPENAI_HTTP_PROXY,
   MIDSCENE_OPENAI_SOCKS_PROXY,
@@ -251,6 +254,7 @@ export const MODEL_ENV_KEYS = [
   MIDSCENE_INSIGHT_MODEL_API_KEY,
   MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON,
   MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON,
+  MIDSCENE_INSIGHT_MODEL_MAX_TOKENS,
   MIDSCENE_INSIGHT_MODEL_TIMEOUT,
   MIDSCENE_INSIGHT_MODEL_TEMPERATURE,
   MIDSCENE_INSIGHT_MODEL_RETRY_COUNT,
@@ -267,6 +271,7 @@ export const MODEL_ENV_KEYS = [
   MIDSCENE_PLANNING_MODEL_API_KEY,
   MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON,
   MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON,
+  MIDSCENE_PLANNING_MODEL_MAX_TOKENS,
   MIDSCENE_PLANNING_MODEL_TIMEOUT,
   MIDSCENE_PLANNING_MODEL_TEMPERATURE,
   MIDSCENE_PLANNING_MODEL_RETRY_COUNT,
@@ -335,6 +340,8 @@ export interface IModelConfigForInsight {
   [MIDSCENE_INSIGHT_MODEL_API_KEY]?: string;
   [MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON]?: string;
   [MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON]?: string;
+  // max tokens
+  [MIDSCENE_INSIGHT_MODEL_MAX_TOKENS]?: string;
   // timeout
   [MIDSCENE_INSIGHT_MODEL_TIMEOUT]?: string;
   // temperature
@@ -354,6 +361,8 @@ export interface IModelConfigForPlanning {
   [MIDSCENE_PLANNING_MODEL_API_KEY]?: string;
   [MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON]?: string;
   [MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON]?: string;
+  // max tokens
+  [MIDSCENE_PLANNING_MODEL_MAX_TOKENS]?: string;
   // timeout
   [MIDSCENE_PLANNING_MODEL_TIMEOUT]?: string;
   // temperature
@@ -381,6 +390,8 @@ export interface IModelConfigForDefault {
   [MIDSCENE_MODEL_API_KEY]?: string;
   [MIDSCENE_MODEL_INIT_CONFIG_JSON]?: string;
   [MIDSCENE_MODEL_EXTRA_BODY_JSON]?: string;
+  // max tokens
+  [MIDSCENE_MODEL_MAX_TOKENS]?: string;
   // extra
   [MIDSCENE_MODEL_FAMILY]?: TModelFamily;
   // temperature
@@ -403,6 +414,7 @@ export interface IModelConfigForDefaultLegacy {
   [OPENAI_BASE_URL]?: string;
   [OPENAI_API_KEY]?: string;
   [MIDSCENE_OPENAI_INIT_CONFIG_JSON]?: string;
+  [OPENAI_MAX_TOKENS]?: string;
 }
 /**
@@ -479,6 +491,10 @@ export interface IModelConfig {
    * Example: { "chat_template_kwargs": { "enable_thinking": true } }
    */
   extraBody?: Record<string, unknown>;
+  /**
+   * max_tokens for model responses.
+   */
+  maxTokens?: number;
   /**
    * Timeout for API calls in milliseconds.
    * If not set, uses OpenAI SDK default (10 minutes).

package/src/img/transform.ts CHANGED Viewed

@@ -4,8 +4,8 @@ import { readFileSync } from 'node:fs';
 import { writeFile } from 'node:fs/promises';
 import path from 'node:path';
 import type { PhotonImage as PhotonImageType } from '@silvia-odwyer/photon-node';
-import type { Rect } from 'src/types';
 import { getDebug } from '../logger';
+import type { Rect } from '../types';
 import { ifInNode } from '../utils';
 import getPhoton from './get-photon';
 import getSharp from './get-sharp';

package/src/mcp/tool-generator.ts CHANGED Viewed

@@ -15,6 +15,9 @@ import type {
   ToolResult,
   ToolSchema,
 } from './types';
+import { composeUserPrompt, promptInputExtraSchema } from './user-prompt';
+export { composeUserPrompt };
 /**
  * Generate MCP tool description from ActionSpaceItem
@@ -568,6 +571,9 @@ export function generateCommonTools(
           if (!screenshot) {
             return createErrorResult('Screenshot not available');
           }
+          await agent.recordToReport?.('take_screenshot', {
+            screenshotBase64: screenshot,
+          });
           const { mimeType, body } = parseBase64(screenshot);
           return {
             content: [{ type: 'image', data: body, mimeType }],
@@ -604,16 +610,7 @@ export function generateCommonTools(
             return createErrorResult('act is not supported by this agent');
           }
           const result = await agent.aiAction(prompt, { deepThink: false });
-          const screenshotResult = await captureScreenshotResult(agent, 'act');
-          if (result) {
-            const message =
-              typeof result === 'string' ? result : JSON.stringify(result);
-            screenshotResult.content.unshift({
-              type: 'text',
-              text: `Task finished, message: ${message}`,
-            });
-          }
-          return screenshotResult;
+          return await captureScreenshotResult(agent, 'act', result);
         } catch (error: unknown) {
           const errorMessage = getErrorMessage(error);
           console.error('Error executing act:', errorMessage);
@@ -631,6 +628,7 @@ export function generateCommonTools(
           .describe(
             'Natural language assertion to verify, e.g. "there is a login button visible"',
           ),
+        ...promptInputExtraSchema,
         ...initArgSchema,
       },
       cli: mergeToolCliMetadata(undefined, initArgCliMetadata),
@@ -643,7 +641,13 @@ export function generateCommonTools(
           if (!agent.aiAssert) {
             return createErrorResult('assert is not supported by this agent');
           }
-          await agent.aiAssert(prompt);
+          const userPrompt = composeUserPrompt({
+            prompt,
+            image: args.image,
+            imageName: args.imageName,
+            convertHttpImage2Base64: args.convertHttpImage2Base64,
+          });
+          await agent.aiAssert(userPrompt);
           return {
             content: [{ type: 'text', text: 'Assertion passed.' }],
           };

package/src/mcp/types.ts CHANGED Viewed

@@ -83,6 +83,22 @@ export interface ActionSpaceItem {
   paramSchema?: z.ZodTypeAny;
 }
+/**
+ * Structural shape compatible with @midscene/core `TUserPrompt`.
+ * Declared locally to avoid a circular dep on `@midscene/core` from `@midscene/shared`.
+ *
+ * Currently consumed only by the `assert` tool in `generateCommonTools`.
+ * `aiAction` and `aiWaitFor` stay string-only at the CLI surface because the
+ * tools generator does not yet expose multimodal entry points for them.
+ */
+export type UserPromptLike =
+  // Plain text prompt with no attachments.
+  | string
+  // Structured prompt: the text plus optional reference images and a flag.
+  | {
+      // The prompt text itself (always required in the object form).
+      prompt: string;
+      // Optional reference images; each entry pairs a `name` with a `url`.
+      images?: Array<{ name: string; url: string }>;
+      // NOTE(review): presumably asks the consumer to inline http(s) image
+      // URLs as base64 before use — confirm against @midscene/core.
+      convertHttpImage2Base64?: boolean;
+    };
 /**
  * Base agent interface
  * Represents a platform-specific agent (Android, iOS, Web)
@@ -94,6 +110,10 @@ export interface BaseAgent {
   page?: {
     screenshotBase64(): Promise<string>;
   };
+  recordToReport?: (
+    title?: string,
+    opt?: { content?: string; screenshotBase64?: string },
+  ) => Promise<void>;
   callActionInActionSpace?: (
     actionName: string,
     params?: unknown,
@@ -107,7 +127,7 @@ export interface BaseAgent {
     options: Record<string, unknown>,
   ) => Promise<unknown>;
   aiAssert?: (
-    assertion: string,
+    assertion: UserPromptLike,
     msg?: string,
     options?: Record<string, unknown>,
   ) => Promise<unknown>;

package/src/mcp/user-prompt.ts ADDED Viewed

@@ -0,0 +1,102 @@
+import { z } from 'zod';
+import type { UserPromptLike } from './types';
+type PromptReferenceImage = { name: string; url: string };
+/**
+ * Coerces a loosely-typed argument into a list of trimmed strings.
+ *
+ * - `undefined` / `null` produce an empty list.
+ * - A single string is trimmed; if nothing remains the list is empty,
+ *   otherwise it contains that one trimmed string.
+ * - An array must contain only strings; every item is trimmed and returned
+ *   in order, so the result has the same length as the input array.
+ *
+ * @param raw - The value to normalize.
+ * @param fieldName - Field name used as the prefix of thrown error messages.
+ * @returns The normalized list of strings.
+ * @throws Error when an array item is not a string, or when `raw` is not
+ *   nullish, a string, or an array.
+ */
+function normalizeStringList(raw: unknown, fieldName: string): string[] {
+  if (raw === undefined || raw === null) return [];
+  if (typeof raw === 'string') {
+    const trimmed = raw.trim();
+    // A blank single string is treated the same as "not provided".
+    return trimmed ? [trimmed] : [];
+  }
+  if (Array.isArray(raw)) {
+    return raw.map((item, index) => {
+      if (typeof item !== 'string') {
+        throw new Error(`${fieldName}[${index}]: expected a string.`);
+      }
+      // Blank items are trimmed to '' but not dropped, unlike the
+      // single-string case above.
+      return item.trim();
+    });
+  }
+  throw new Error(
+    `${fieldName}: expected a string or string array, got ${typeof raw}.`,
+  );
+}
+/**
+ * Builds the list of reference images by pairing image URLs with image names
+ * position by position.
+ *
+ * Both `image` and `imageName` are normalized with `normalizeStringList`, so
+ * each may be nullish, a single string, or an array of strings.
+ *
+ * @param input - Raw `image` and `imageName` values.
+ * @returns One `{ name, url }` entry per image, in input order; an empty
+ *   array when neither value yields any entries.
+ * @throws Error when the number of images and image names differ, or when
+ *   either value fails normalization.
+ */
+function composeImages(input: {
+  image?: unknown;
+  imageName?: unknown;
+}): PromptReferenceImage[] {
+  const urls = normalizeStringList(input.image, 'image');
+  const names = normalizeStringList(input.imageName, 'imageName');
+  // Names are matched to URLs by index, so the two lists must be equally long.
+  if (urls.length !== names.length) {
+    throw new Error(
+      `image/imageName: expected the same number of --image and --image-name values, got ${urls.length} image(s) and ${names.length} image name(s).`,
+    );
+  }
+  return urls.map((url, index) => ({ name: names[index], url }));
+}
+/**
+ * Parses the `convertHttpImage2Base64` argument into an optional boolean.
+ *
+ * - `undefined` / `null` and blank strings return `undefined` (flag not set).
+ * - Booleans are returned unchanged.
+ * - Strings are trimmed and compared case-insensitively: `"true"` / `"1"`
+ *   return `true`, `"false"` / `"0"` return `false`.
+ *
+ * @param value - The raw flag value.
+ * @returns The parsed boolean, or `undefined` when the flag was not provided.
+ * @throws Error when a non-blank string is not one of the accepted literals,
+ *   or when `value` is neither nullish, a boolean, nor a string. The error
+ *   messages name the `convertHttpImage2Base64` field specifically.
+ */
+function coerceBoolean(value: unknown): boolean | undefined {
+  if (value === undefined || value === null) return undefined;
+  if (typeof value === 'boolean') return value;
+  if (typeof value === 'string') {
+    const trimmed = value.trim();
+    // Blank string counts as "not provided" rather than as an invalid value.
+    if (!trimmed) return undefined;
+    const v = trimmed.toLowerCase();
+    if (v === 'true' || v === '1') return true;
+    if (v === 'false' || v === '0') return false;
+    throw new Error(
+      `convertHttpImage2Base64: expected "true", "false", "1", or "0"; got ${JSON.stringify(value)}.`,
+    );
+  }
+  throw new Error(
+    `convertHttpImage2Base64: expected a boolean, got ${typeof value}.`,
+  );
+}
+/**
+ * Combines a text prompt with optional reference-image arguments into a
+ * `UserPromptLike` value.
+ *
+ * When no images are supplied and the `convertHttpImage2Base64` flag is not
+ * set, the prompt string is returned as-is. Otherwise an object is returned
+ * that always carries `prompt`, plus `images` only when at least one image
+ * was given and `convertHttpImage2Base64` only when the flag was provided.
+ *
+ * @param input - The prompt text and the raw `image`, `imageName` and
+ *   `convertHttpImage2Base64` arguments.
+ * @returns The plain prompt string or the structured prompt object.
+ * @throws Error when the image arguments or the flag fail validation in
+ *   `composeImages` / `coerceBoolean`.
+ */
+export function composeUserPrompt(input: {
+  prompt: string;
+  image?: unknown;
+  imageName?: unknown;
+  convertHttpImage2Base64?: unknown;
+}): UserPromptLike {
+  const images = composeImages({
+    image: input.image,
+    imageName: input.imageName,
+  });
+  const convertFlag = coerceBoolean(input.convertHttpImage2Base64);
+  // Nothing beyond the text was supplied: keep the simple string form.
+  if (images.length === 0 && convertFlag === undefined) {
+    return input.prompt;
+  }
+  const payload: Exclude<UserPromptLike, string> = { prompt: input.prompt };
+  // Optional fields are only set when present, so absent ones stay undefined.
+  if (images.length > 0) {
+    payload.images = images;
+  }
+  if (convertFlag !== undefined) {
+    payload.convertHttpImage2Base64 = convertFlag;
+  }
+  return payload;
+}
+/**
+ * Extra zod schema fields describing the optional reference-image inputs
+ * that accompany a prompt. Exported as a plain object of fields so it can be
+ * spread into a tool's argument schema.
+ *
+ * The accepted shapes match what `composeUserPrompt` takes: `image` and
+ * `imageName` may each be a single string or an array of strings, and
+ * `convertHttpImage2Base64` may be a boolean or a string.
+ */
+export const promptInputExtraSchema = {
+  // One image or several; a single string or an array of strings.
+  image: z
+    .union([z.string(), z.array(z.string())])
+    .optional()
+    .describe('Reference image URL/path. Repeat --image for multiple images.'),
+  // Names are given in the same order as the images they label.
+  imageName: z
+    .union([z.string(), z.array(z.string())])
+    .optional()
+    .describe(
+      'Reference image name. Repeat --image-name; must align with --image order.',
+    ),
+  // Accepts a string as well as a boolean.
+  convertHttpImage2Base64: z
+    .union([z.boolean(), z.string()])
+    .optional()
+    .describe(
+      'If true, convert http(s) image URLs to base64 before sending to the model.',
+    ),
+};