@midscene/shared 1.8.5-beta-20260525033347.0 → 1.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,7 @@ export const PLAYWRIGHT_EXAMPLE_CODE = `
5
5
  IMPORTANT: Follow these exact type signatures for AI functions:
6
6
 
7
7
  // Type signatures for AI functions:
8
- aiAct(prompt: string, options?: { cacheable?: boolean, deepThink?: 'unset' | true | false }): Promise<void>
8
+ aiAct(prompt: string, options?: { cacheable?: boolean, deepThink?: 'unset' | true | false }): Promise<string | undefined>
9
9
  aiInput(text: string, locate: string, options?: { deepLocate?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
10
10
  aiTap(locate: string, options?: { deepLocate?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
11
11
  aiHover(locate: string, options?: { deepLocate?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
@@ -5,6 +5,7 @@ import {
5
5
  MIDSCENE_INSIGHT_MODEL_FAMILY,
6
6
  MIDSCENE_INSIGHT_MODEL_HTTP_PROXY,
7
7
  MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON,
8
+ MIDSCENE_INSIGHT_MODEL_MAX_TOKENS,
8
9
  MIDSCENE_INSIGHT_MODEL_NAME,
9
10
  MIDSCENE_INSIGHT_MODEL_REASONING_BUDGET,
10
11
  MIDSCENE_INSIGHT_MODEL_REASONING_EFFORT,
@@ -20,6 +21,7 @@ import {
20
21
  MIDSCENE_MODEL_FAMILY,
21
22
  MIDSCENE_MODEL_HTTP_PROXY,
22
23
  MIDSCENE_MODEL_INIT_CONFIG_JSON,
24
+ MIDSCENE_MODEL_MAX_TOKENS,
23
25
  MIDSCENE_MODEL_NAME,
24
26
  MIDSCENE_MODEL_REASONING_BUDGET,
25
27
  MIDSCENE_MODEL_REASONING_EFFORT,
@@ -38,6 +40,7 @@ import {
38
40
  MIDSCENE_PLANNING_MODEL_FAMILY,
39
41
  MIDSCENE_PLANNING_MODEL_HTTP_PROXY,
40
42
  MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON,
43
+ MIDSCENE_PLANNING_MODEL_MAX_TOKENS,
41
44
  MIDSCENE_PLANNING_MODEL_NAME,
42
45
  MIDSCENE_PLANNING_MODEL_REASONING_BUDGET,
43
46
  MIDSCENE_PLANNING_MODEL_REASONING_EFFORT,
@@ -49,6 +52,7 @@ import {
49
52
  MIDSCENE_PLANNING_MODEL_TIMEOUT,
50
53
  OPENAI_API_KEY,
51
54
  OPENAI_BASE_URL,
55
+ OPENAI_MAX_TOKENS,
52
56
  } from './types';
53
57
 
54
58
  interface IModelConfigKeys {
@@ -65,6 +69,7 @@ interface IModelConfigKeys {
65
69
  openaiApiKey: string;
66
70
  openaiExtraConfig: string;
67
71
  extraBody: string;
72
+ maxTokens: string;
68
73
  /**
69
74
  * Extra
70
75
  */
@@ -101,6 +106,7 @@ export const INSIGHT_MODEL_CONFIG_KEYS: IModelConfigKeys = {
101
106
  openaiApiKey: MIDSCENE_INSIGHT_MODEL_API_KEY,
102
107
  openaiExtraConfig: MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON,
103
108
  extraBody: MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON,
109
+ maxTokens: MIDSCENE_INSIGHT_MODEL_MAX_TOKENS,
104
110
  /**
105
111
  * Extra
106
112
  */
@@ -137,6 +143,7 @@ export const PLANNING_MODEL_CONFIG_KEYS: IModelConfigKeys = {
137
143
  openaiApiKey: MIDSCENE_PLANNING_MODEL_API_KEY,
138
144
  openaiExtraConfig: MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON,
139
145
  extraBody: MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON,
146
+ maxTokens: MIDSCENE_PLANNING_MODEL_MAX_TOKENS,
140
147
  /**
141
148
  * Extra
142
149
  */
@@ -174,6 +181,7 @@ export const DEFAULT_MODEL_CONFIG_KEYS: IModelConfigKeys = {
174
181
  openaiApiKey: MIDSCENE_MODEL_API_KEY,
175
182
  openaiExtraConfig: MIDSCENE_MODEL_INIT_CONFIG_JSON,
176
183
  extraBody: MIDSCENE_MODEL_EXTRA_BODY_JSON,
184
+ maxTokens: MIDSCENE_MODEL_MAX_TOKENS,
177
185
  /**
178
186
  * Extra
179
187
  */
@@ -211,6 +219,7 @@ export const DEFAULT_MODEL_CONFIG_KEYS_LEGACY: IModelConfigKeys = {
211
219
  openaiApiKey: OPENAI_API_KEY,
212
220
  openaiExtraConfig: MIDSCENE_OPENAI_INIT_CONFIG_JSON,
213
221
  extraBody: MIDSCENE_MODEL_EXTRA_BODY_JSON,
222
+ maxTokens: OPENAI_MAX_TOKENS,
214
223
  /**
215
224
  * Extra
216
225
  */
@@ -18,6 +18,7 @@ import {
18
18
  MODEL_FAMILY_VALUES,
19
19
  OPENAI_API_KEY,
20
20
  OPENAI_BASE_URL,
21
+ OPENAI_MAX_TOKENS,
21
22
  type TIntent,
22
23
  type TModelFamily,
23
24
  UITarsModelVersion,
@@ -200,6 +201,9 @@ export const parseOpenaiSdkConfig = ({
200
201
  const legacyOpenaiExtraConfig = useLegacyLogic
201
202
  ? provider[MIDSCENE_OPENAI_INIT_CONFIG_JSON]
202
203
  : undefined;
204
+ const legacyMaxTokens = useLegacyLogic
205
+ ? provider[OPENAI_MAX_TOKENS]
206
+ : undefined;
203
207
  const legacyModelFamily = useLegacyLogic
204
208
  ? legacyConfigToModelFamily(provider)
205
209
  : undefined;
@@ -222,6 +226,13 @@ export const parseOpenaiSdkConfig = ({
222
226
  );
223
227
  const extraBodyStr: string | undefined = provider[keys.extraBody];
224
228
  const extraBody = parseJson(keys.extraBody, extraBodyStr);
229
+ const maxTokensStr = provider[keys.maxTokens] || legacyMaxTokens;
230
+ const maxTokens = (() => {
231
+ const val = maxTokensStr?.trim();
232
+ if (!val) return undefined;
233
+ const num = Number(val);
234
+ return Number.isFinite(num) ? num : undefined;
235
+ })();
225
236
  const temperature = provider[keys.temperature]
226
237
  ? Number(provider[keys.temperature])
227
238
  : 0;
@@ -239,6 +250,7 @@ export const parseOpenaiSdkConfig = ({
239
250
  openaiApiKey,
240
251
  openaiExtraConfig: normalizeOpenaiExtraConfig(openaiExtraConfig),
241
252
  extraBody,
253
+ maxTokens,
242
254
  modelFamily,
243
255
  uiTarsModelVersion,
244
256
  modelName: modelName!,
package/src/env/types.ts CHANGED
@@ -106,6 +106,8 @@ export const MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON =
106
106
  'MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON';
107
107
  export const MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON =
108
108
  'MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON';
109
+ export const MIDSCENE_INSIGHT_MODEL_MAX_TOKENS =
110
+ 'MIDSCENE_INSIGHT_MODEL_MAX_TOKENS';
109
111
  export const MIDSCENE_INSIGHT_MODEL_TIMEOUT = 'MIDSCENE_INSIGHT_MODEL_TIMEOUT';
110
112
  export const MIDSCENE_INSIGHT_MODEL_TEMPERATURE =
111
113
  'MIDSCENE_INSIGHT_MODEL_TEMPERATURE';
@@ -135,6 +137,8 @@ export const MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON =
135
137
  'MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON';
136
138
  export const MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON =
137
139
  'MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON';
140
+ export const MIDSCENE_PLANNING_MODEL_MAX_TOKENS =
141
+ 'MIDSCENE_PLANNING_MODEL_MAX_TOKENS';
138
142
  export const MIDSCENE_PLANNING_MODEL_TIMEOUT =
139
143
  'MIDSCENE_PLANNING_MODEL_TIMEOUT';
140
144
  export const MIDSCENE_PLANNING_MODEL_TEMPERATURE =
@@ -178,14 +182,11 @@ export const BOOLEAN_ENV_KEYS = [
178
182
  ] as const;
179
183
 
180
184
  export const NUMBER_ENV_KEYS = [
181
- MIDSCENE_MODEL_MAX_TOKENS,
182
185
  MIDSCENE_CACHE_MAX_FILENAME_LENGTH,
183
186
  MIDSCENE_REPLANNING_CYCLE_LIMIT,
184
187
  ] as const;
185
188
 
186
189
  export const STRING_ENV_KEYS = [
187
- MIDSCENE_MODEL_MAX_TOKENS,
188
- OPENAI_MAX_TOKENS,
189
190
  MIDSCENE_ADB_PATH,
190
191
  MIDSCENE_ADB_REMOTE_HOST,
191
192
  MIDSCENE_ADB_REMOTE_PORT,
@@ -224,6 +225,7 @@ export const MODEL_ENV_KEYS = [
224
225
  MIDSCENE_MODEL_BASE_URL,
225
226
  MIDSCENE_MODEL_SOCKS_PROXY,
226
227
  MIDSCENE_MODEL_HTTP_PROXY,
228
+ MIDSCENE_MODEL_MAX_TOKENS,
227
229
  MIDSCENE_MODEL_TIMEOUT,
228
230
  MIDSCENE_MODEL_TEMPERATURE,
229
231
  MIDSCENE_MODEL_RETRY_COUNT,
@@ -240,6 +242,7 @@ export const MODEL_ENV_KEYS = [
240
242
  // model default legacy
241
243
  OPENAI_API_KEY,
242
244
  OPENAI_BASE_URL,
245
+ OPENAI_MAX_TOKENS,
243
246
  MIDSCENE_OPENAI_INIT_CONFIG_JSON,
244
247
  MIDSCENE_OPENAI_HTTP_PROXY,
245
248
  MIDSCENE_OPENAI_SOCKS_PROXY,
@@ -251,6 +254,7 @@ export const MODEL_ENV_KEYS = [
251
254
  MIDSCENE_INSIGHT_MODEL_API_KEY,
252
255
  MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON,
253
256
  MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON,
257
+ MIDSCENE_INSIGHT_MODEL_MAX_TOKENS,
254
258
  MIDSCENE_INSIGHT_MODEL_TIMEOUT,
255
259
  MIDSCENE_INSIGHT_MODEL_TEMPERATURE,
256
260
  MIDSCENE_INSIGHT_MODEL_RETRY_COUNT,
@@ -267,6 +271,7 @@ export const MODEL_ENV_KEYS = [
267
271
  MIDSCENE_PLANNING_MODEL_API_KEY,
268
272
  MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON,
269
273
  MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON,
274
+ MIDSCENE_PLANNING_MODEL_MAX_TOKENS,
270
275
  MIDSCENE_PLANNING_MODEL_TIMEOUT,
271
276
  MIDSCENE_PLANNING_MODEL_TEMPERATURE,
272
277
  MIDSCENE_PLANNING_MODEL_RETRY_COUNT,
@@ -335,6 +340,8 @@ export interface IModelConfigForInsight {
335
340
  [MIDSCENE_INSIGHT_MODEL_API_KEY]?: string;
336
341
  [MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON]?: string;
337
342
  [MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON]?: string;
343
+ // max tokens
344
+ [MIDSCENE_INSIGHT_MODEL_MAX_TOKENS]?: string;
338
345
  // timeout
339
346
  [MIDSCENE_INSIGHT_MODEL_TIMEOUT]?: string;
340
347
  // temperature
@@ -354,6 +361,8 @@ export interface IModelConfigForPlanning {
354
361
  [MIDSCENE_PLANNING_MODEL_API_KEY]?: string;
355
362
  [MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON]?: string;
356
363
  [MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON]?: string;
364
+ // max tokens
365
+ [MIDSCENE_PLANNING_MODEL_MAX_TOKENS]?: string;
357
366
  // timeout
358
367
  [MIDSCENE_PLANNING_MODEL_TIMEOUT]?: string;
359
368
  // temperature
@@ -381,6 +390,8 @@ export interface IModelConfigForDefault {
381
390
  [MIDSCENE_MODEL_API_KEY]?: string;
382
391
  [MIDSCENE_MODEL_INIT_CONFIG_JSON]?: string;
383
392
  [MIDSCENE_MODEL_EXTRA_BODY_JSON]?: string;
393
+ // max tokens
394
+ [MIDSCENE_MODEL_MAX_TOKENS]?: string;
384
395
  // extra
385
396
  [MIDSCENE_MODEL_FAMILY]?: TModelFamily;
386
397
  // temperature
@@ -403,6 +414,7 @@ export interface IModelConfigForDefaultLegacy {
403
414
  [OPENAI_BASE_URL]?: string;
404
415
  [OPENAI_API_KEY]?: string;
405
416
  [MIDSCENE_OPENAI_INIT_CONFIG_JSON]?: string;
417
+ [OPENAI_MAX_TOKENS]?: string;
406
418
  }
407
419
 
408
420
  /**
@@ -479,6 +491,10 @@ export interface IModelConfig {
479
491
  * Example: { "chat_template_kwargs": { "enable_thinking": true } }
480
492
  */
481
493
  extraBody?: Record<string, unknown>;
494
+ /**
495
+ * max_tokens for model responses.
496
+ */
497
+ maxTokens?: number;
482
498
  /**
483
499
  * Timeout for API calls in milliseconds.
484
500
  * If not set, uses OpenAI SDK default (10 minutes).
@@ -4,8 +4,8 @@ import { readFileSync } from 'node:fs';
4
4
  import { writeFile } from 'node:fs/promises';
5
5
  import path from 'node:path';
6
6
  import type { PhotonImage as PhotonImageType } from '@silvia-odwyer/photon-node';
7
- import type { Rect } from 'src/types';
8
7
  import { getDebug } from '../logger';
8
+ import type { Rect } from '../types';
9
9
  import { ifInNode } from '../utils';
10
10
  import getPhoton from './get-photon';
11
11
  import getSharp from './get-sharp';
@@ -15,6 +15,9 @@ import type {
15
15
  ToolResult,
16
16
  ToolSchema,
17
17
  } from './types';
18
+ import { composeUserPrompt, promptInputExtraSchema } from './user-prompt';
19
+
20
+ export { composeUserPrompt };
18
21
 
19
22
  /**
20
23
  * Generate MCP tool description from ActionSpaceItem
@@ -568,6 +571,9 @@ export function generateCommonTools(
568
571
  if (!screenshot) {
569
572
  return createErrorResult('Screenshot not available');
570
573
  }
574
+ await agent.recordToReport?.('take_screenshot', {
575
+ screenshotBase64: screenshot,
576
+ });
571
577
  const { mimeType, body } = parseBase64(screenshot);
572
578
  return {
573
579
  content: [{ type: 'image', data: body, mimeType }],
@@ -604,16 +610,7 @@ export function generateCommonTools(
604
610
  return createErrorResult('act is not supported by this agent');
605
611
  }
606
612
  const result = await agent.aiAction(prompt, { deepThink: false });
607
- const screenshotResult = await captureScreenshotResult(agent, 'act');
608
- if (result) {
609
- const message =
610
- typeof result === 'string' ? result : JSON.stringify(result);
611
- screenshotResult.content.unshift({
612
- type: 'text',
613
- text: `Task finished, message: ${message}`,
614
- });
615
- }
616
- return screenshotResult;
613
+ return await captureScreenshotResult(agent, 'act', result);
617
614
  } catch (error: unknown) {
618
615
  const errorMessage = getErrorMessage(error);
619
616
  console.error('Error executing act:', errorMessage);
@@ -631,6 +628,7 @@ export function generateCommonTools(
631
628
  .describe(
632
629
  'Natural language assertion to verify, e.g. "there is a login button visible"',
633
630
  ),
631
+ ...promptInputExtraSchema,
634
632
  ...initArgSchema,
635
633
  },
636
634
  cli: mergeToolCliMetadata(undefined, initArgCliMetadata),
@@ -643,7 +641,13 @@ export function generateCommonTools(
643
641
  if (!agent.aiAssert) {
644
642
  return createErrorResult('assert is not supported by this agent');
645
643
  }
646
- await agent.aiAssert(prompt);
644
+ const userPrompt = composeUserPrompt({
645
+ prompt,
646
+ image: args.image,
647
+ imageName: args.imageName,
648
+ convertHttpImage2Base64: args.convertHttpImage2Base64,
649
+ });
650
+ await agent.aiAssert(userPrompt);
647
651
  return {
648
652
  content: [{ type: 'text', text: 'Assertion passed.' }],
649
653
  };
package/src/mcp/types.ts CHANGED
@@ -83,6 +83,22 @@ export interface ActionSpaceItem {
83
83
  paramSchema?: z.ZodTypeAny;
84
84
  }
85
85
 
86
/**
 * Prompt value accepted by agent methods that can take reference images.
 *
 * Structurally compatible with `TUserPrompt` from `@midscene/core`. The shape
 * is redeclared here so that `@midscene/shared` does not have to depend on
 * `@midscene/core`, which would introduce a circular dependency.
 *
 * Right now the only consumer is the `assert` tool in `generateCommonTools`.
 * `aiAction` and `aiWaitFor` are still string-only at the CLI surface because
 * the tools generator does not yet expose multimodal entry points for them.
 */
export type UserPromptLike =
  | string
  | {
      /** Natural-language prompt text. */
      prompt: string;
      /** Optional reference images, each identified by a name and a URL. */
      images?: { name: string; url: string }[];
      /** If true, http(s) image URLs are converted to base64 before being sent to the model. */
      convertHttpImage2Base64?: boolean;
    };
101
+
86
102
  /**
87
103
  * Base agent interface
88
104
  * Represents a platform-specific agent (Android, iOS, Web)
@@ -94,6 +110,10 @@ export interface BaseAgent {
94
110
  page?: {
95
111
  screenshotBase64(): Promise<string>;
96
112
  };
113
+ recordToReport?: (
114
+ title?: string,
115
+ opt?: { content?: string; screenshotBase64?: string },
116
+ ) => Promise<void>;
97
117
  callActionInActionSpace?: (
98
118
  actionName: string,
99
119
  params?: unknown,
@@ -107,7 +127,7 @@ export interface BaseAgent {
107
127
  options: Record<string, unknown>,
108
128
  ) => Promise<unknown>;
109
129
  aiAssert?: (
110
- assertion: string,
130
+ assertion: UserPromptLike,
111
131
  msg?: string,
112
132
  options?: Record<string, unknown>,
113
133
  ) => Promise<unknown>;
@@ -0,0 +1,102 @@
1
+ import { z } from 'zod';
2
+ import type { UserPromptLike } from './types';
3
+
4
/** A single reference image attached to a prompt: a name paired with a URL. */
type PromptReferenceImage = {
  name: string;
  url: string;
};
5
+
6
/**
 * Normalizes an argument that may be absent, a single string, or an array of
 * strings into a list of trimmed, non-empty strings.
 *
 * - `undefined` / `null` yields `[]`.
 * - A string yields `[trimmed]`, or `[]` when it is blank.
 * - An array yields every entry trimmed, in the original order.
 *
 * @param raw - The unvalidated argument value.
 * @param fieldName - Field name used as the prefix of error messages.
 * @returns The trimmed entries, in input order.
 * @throws Error if `raw` is neither a string nor an array, or if an array
 *   entry is not a string or is blank after trimming.
 */
function normalizeStringList(raw: unknown, fieldName: string): string[] {
  if (raw === undefined || raw === null) return [];

  if (typeof raw === 'string') {
    const trimmed = raw.trim();
    return trimmed ? [trimmed] : [];
  }

  if (!Array.isArray(raw)) {
    throw new Error(
      `${fieldName}: expected a string or string array, got ${typeof raw}.`,
    );
  }

  return raw.map((item: unknown, index) => {
    if (typeof item !== 'string') {
      throw new Error(`${fieldName}[${index}]: expected a string.`);
    }
    const trimmed = item.trim();
    // Blank entries are rejected rather than dropped: composeImages pairs the
    // `image` and `imageName` lists by index, so silently removing an entry
    // would shift the remaining ones, while keeping it would yield an empty
    // name or URL.
    if (!trimmed) {
      throw new Error(`${fieldName}[${index}]: expected a non-empty string.`);
    }
    return trimmed;
  });
}
24
+
25
/**
 * Builds the list of reference images from the `image` and `imageName`
 * arguments, pairing the two lists by position.
 *
 * @param input - Raw `image` / `imageName` values; each may be absent, a
 *   string, or an array of strings.
 * @returns One `{ name, url }` entry per image, in input order.
 * @throws Error when the two lists have different lengths, or when either
 *   value fails normalization.
 */
function composeImages(input: {
  image?: unknown;
  imageName?: unknown;
}): PromptReferenceImage[] {
  const imageUrls = normalizeStringList(input.image, 'image');
  const imageNames = normalizeStringList(input.imageName, 'imageName');

  if (imageUrls.length === imageNames.length) {
    return imageNames.map((name, position) => ({
      name,
      url: imageUrls[position],
    }));
  }

  throw new Error(
    `image/imageName: expected the same number of --image and --image-name values, got ${imageUrls.length} image(s) and ${imageNames.length} image name(s).`,
  );
}
40
+
41
/**
 * Interprets the `convertHttpImage2Base64` argument as an optional boolean.
 *
 * Accepts real booleans as well as the strings "true"/"false"/"1"/"0"
 * (case-insensitive, surrounding whitespace ignored).
 *
 * @param value - The unvalidated argument value.
 * @returns The boolean, or `undefined` when the value is `undefined`, `null`,
 *   or a blank string.
 * @throws Error when the value is an unrecognized string or is neither a
 *   boolean nor a string.
 */
function coerceBoolean(value: unknown): boolean | undefined {
  // `== null` matches both null and undefined.
  if (value == null) {
    return undefined;
  }
  if (typeof value === 'boolean') {
    return value;
  }
  if (typeof value !== 'string') {
    throw new Error(
      `convertHttpImage2Base64: expected a boolean, got ${typeof value}.`,
    );
  }

  const token = value.trim().toLowerCase();
  switch (token) {
    case '':
      return undefined;
    case 'true':
    case '1':
      return true;
    case 'false':
    case '0':
      return false;
    default:
      // Report the value exactly as received, not the normalized token.
      throw new Error(
        `convertHttpImage2Base64: expected "true", "false", "1", or "0"; got ${JSON.stringify(value)}.`,
      );
  }
}
58
+
59
/**
 * Combines a prompt string with optional reference-image arguments into a
 * `UserPromptLike` value.
 *
 * When no images are supplied and no `convertHttpImage2Base64` flag is set,
 * the prompt string is returned unchanged. Otherwise an object is returned
 * that carries the prompt plus only those optional fields that were provided.
 *
 * @param input - The prompt and the raw `image`, `imageName` and
 *   `convertHttpImage2Base64` arguments.
 * @returns The bare prompt string, or an object with `prompt` and the
 *   provided optional fields.
 * @throws Error when the image arguments or the flag fail validation.
 */
export function composeUserPrompt(input: {
  prompt: string;
  image?: unknown;
  imageName?: unknown;
  convertHttpImage2Base64?: unknown;
}): UserPromptLike {
  const referenceImages = composeImages({
    image: input.image,
    imageName: input.imageName,
  });
  const shouldConvert = coerceBoolean(input.convertHttpImage2Base64);

  const hasImages = referenceImages.length > 0;
  const hasConvertFlag = shouldConvert !== undefined;

  // Nothing beyond the text was provided: keep the plain-string form.
  if (!hasImages && !hasConvertFlag) {
    return input.prompt;
  }

  const result: Exclude<UserPromptLike, string> = { prompt: input.prompt };
  if (hasImages) {
    result.images = referenceImages;
  }
  if (hasConvertFlag) {
    result.convertHttpImage2Base64 = shouldConvert;
  }
  return result;
}
84
+
85
/**
 * Creates a fresh optional schema accepting a string or an array of strings.
 * A factory is used (rather than one shared instance) so every field keeps
 * its own schema objects.
 */
const optionalStringOrStringList = () =>
  z.union([z.string(), z.array(z.string())]).optional();

/**
 * Extra zod fields for tools whose prompt can carry reference images:
 * `image`, `imageName` and `convertHttpImage2Base64`. Intended to be spread
 * into a tool's argument schema.
 */
export const promptInputExtraSchema = {
  image: optionalStringOrStringList().describe(
    'Reference image URL/path. Repeat --image for multiple images.',
  ),
  imageName: optionalStringOrStringList().describe(
    'Reference image name. Repeat --image-name; must align with --image order.',
  ),
  convertHttpImage2Base64: z
    .union([z.boolean(), z.string()])
    .optional()
    .describe(
      'If true, convert http(s) image URLs to base64 before sending to the model.',
    ),
};