@midscene/shared 1.8.5-beta-20260525033347.0 → 1.8.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/cli/cli-args.mjs +41 -2
- package/dist/es/cli/cli-runner.mjs +4 -3
- package/dist/es/constants/example-code.mjs +1 -1
- package/dist/es/env/constants.mjs +5 -1
- package/dist/es/env/parse-model-config.mjs +11 -2
- package/dist/es/env/types.mjs +7 -4
- package/dist/es/mcp/tool-generator.mjs +14 -11
- package/dist/es/mcp/user-prompt.mjs +66 -0
- package/dist/lib/cli/cli-args.js +44 -2
- package/dist/lib/cli/cli-runner.js +3 -2
- package/dist/lib/constants/example-code.js +1 -1
- package/dist/lib/env/constants.js +4 -0
- package/dist/lib/env/parse-model-config.js +10 -1
- package/dist/lib/env/types.js +12 -3
- package/dist/lib/mcp/tool-generator.js +17 -11
- package/dist/lib/mcp/user-prompt.js +103 -0
- package/dist/types/cli/cli-args.d.ts +8 -0
- package/dist/types/constants/example-code.d.ts +1 -1
- package/dist/types/env/constants.d.ts +1 -0
- package/dist/types/env/types.d.ts +15 -5
- package/dist/types/mcp/tool-generator.d.ts +2 -0
- package/dist/types/mcp/types.d.ts +21 -1
- package/dist/types/mcp/user-prompt.d.ts +13 -0
- package/package.json +1 -1
- package/src/cli/cli-args.ts +65 -1
- package/src/cli/cli-runner.ts +10 -2
- package/src/constants/example-code.ts +1 -1
- package/src/env/constants.ts +9 -0
- package/src/env/parse-model-config.ts +12 -0
- package/src/env/types.ts +19 -3
- package/src/img/transform.ts +1 -1
- package/src/mcp/tool-generator.ts +15 -11
- package/src/mcp/types.ts +21 -1
- package/src/mcp/user-prompt.ts +102 -0
package/src/constants/example-code.ts
CHANGED
|
@@ -5,7 +5,7 @@ export const PLAYWRIGHT_EXAMPLE_CODE = `
|
|
|
5
5
|
IMPORTANT: Follow these exact type signatures for AI functions:
|
|
6
6
|
|
|
7
7
|
// Type signatures for AI functions:
|
|
8
|
-
aiAct(prompt: string, options?: { cacheable?: boolean, deepThink?: 'unset' | true | false }): Promise<
|
|
8
|
+
aiAct(prompt: string, options?: { cacheable?: boolean, deepThink?: 'unset' | true | false }): Promise<string | undefined>
|
|
9
9
|
aiInput(text: string, locate: string, options?: { deepLocate?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
|
|
10
10
|
aiTap(locate: string, options?: { deepLocate?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
|
|
11
11
|
aiHover(locate: string, options?: { deepLocate?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
|
package/src/env/constants.ts
CHANGED
|
@@ -5,6 +5,7 @@ import {
|
|
|
5
5
|
MIDSCENE_INSIGHT_MODEL_FAMILY,
|
|
6
6
|
MIDSCENE_INSIGHT_MODEL_HTTP_PROXY,
|
|
7
7
|
MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON,
|
|
8
|
+
MIDSCENE_INSIGHT_MODEL_MAX_TOKENS,
|
|
8
9
|
MIDSCENE_INSIGHT_MODEL_NAME,
|
|
9
10
|
MIDSCENE_INSIGHT_MODEL_REASONING_BUDGET,
|
|
10
11
|
MIDSCENE_INSIGHT_MODEL_REASONING_EFFORT,
|
|
@@ -20,6 +21,7 @@ import {
|
|
|
20
21
|
MIDSCENE_MODEL_FAMILY,
|
|
21
22
|
MIDSCENE_MODEL_HTTP_PROXY,
|
|
22
23
|
MIDSCENE_MODEL_INIT_CONFIG_JSON,
|
|
24
|
+
MIDSCENE_MODEL_MAX_TOKENS,
|
|
23
25
|
MIDSCENE_MODEL_NAME,
|
|
24
26
|
MIDSCENE_MODEL_REASONING_BUDGET,
|
|
25
27
|
MIDSCENE_MODEL_REASONING_EFFORT,
|
|
@@ -38,6 +40,7 @@ import {
|
|
|
38
40
|
MIDSCENE_PLANNING_MODEL_FAMILY,
|
|
39
41
|
MIDSCENE_PLANNING_MODEL_HTTP_PROXY,
|
|
40
42
|
MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON,
|
|
43
|
+
MIDSCENE_PLANNING_MODEL_MAX_TOKENS,
|
|
41
44
|
MIDSCENE_PLANNING_MODEL_NAME,
|
|
42
45
|
MIDSCENE_PLANNING_MODEL_REASONING_BUDGET,
|
|
43
46
|
MIDSCENE_PLANNING_MODEL_REASONING_EFFORT,
|
|
@@ -49,6 +52,7 @@ import {
|
|
|
49
52
|
MIDSCENE_PLANNING_MODEL_TIMEOUT,
|
|
50
53
|
OPENAI_API_KEY,
|
|
51
54
|
OPENAI_BASE_URL,
|
|
55
|
+
OPENAI_MAX_TOKENS,
|
|
52
56
|
} from './types';
|
|
53
57
|
|
|
54
58
|
interface IModelConfigKeys {
|
|
@@ -65,6 +69,7 @@ interface IModelConfigKeys {
|
|
|
65
69
|
openaiApiKey: string;
|
|
66
70
|
openaiExtraConfig: string;
|
|
67
71
|
extraBody: string;
|
|
72
|
+
maxTokens: string;
|
|
68
73
|
/**
|
|
69
74
|
* Extra
|
|
70
75
|
*/
|
|
@@ -101,6 +106,7 @@ export const INSIGHT_MODEL_CONFIG_KEYS: IModelConfigKeys = {
|
|
|
101
106
|
openaiApiKey: MIDSCENE_INSIGHT_MODEL_API_KEY,
|
|
102
107
|
openaiExtraConfig: MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON,
|
|
103
108
|
extraBody: MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON,
|
|
109
|
+
maxTokens: MIDSCENE_INSIGHT_MODEL_MAX_TOKENS,
|
|
104
110
|
/**
|
|
105
111
|
* Extra
|
|
106
112
|
*/
|
|
@@ -137,6 +143,7 @@ export const PLANNING_MODEL_CONFIG_KEYS: IModelConfigKeys = {
|
|
|
137
143
|
openaiApiKey: MIDSCENE_PLANNING_MODEL_API_KEY,
|
|
138
144
|
openaiExtraConfig: MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON,
|
|
139
145
|
extraBody: MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON,
|
|
146
|
+
maxTokens: MIDSCENE_PLANNING_MODEL_MAX_TOKENS,
|
|
140
147
|
/**
|
|
141
148
|
* Extra
|
|
142
149
|
*/
|
|
@@ -174,6 +181,7 @@ export const DEFAULT_MODEL_CONFIG_KEYS: IModelConfigKeys = {
|
|
|
174
181
|
openaiApiKey: MIDSCENE_MODEL_API_KEY,
|
|
175
182
|
openaiExtraConfig: MIDSCENE_MODEL_INIT_CONFIG_JSON,
|
|
176
183
|
extraBody: MIDSCENE_MODEL_EXTRA_BODY_JSON,
|
|
184
|
+
maxTokens: MIDSCENE_MODEL_MAX_TOKENS,
|
|
177
185
|
/**
|
|
178
186
|
* Extra
|
|
179
187
|
*/
|
|
@@ -211,6 +219,7 @@ export const DEFAULT_MODEL_CONFIG_KEYS_LEGACY: IModelConfigKeys = {
|
|
|
211
219
|
openaiApiKey: OPENAI_API_KEY,
|
|
212
220
|
openaiExtraConfig: MIDSCENE_OPENAI_INIT_CONFIG_JSON,
|
|
213
221
|
extraBody: MIDSCENE_MODEL_EXTRA_BODY_JSON,
|
|
222
|
+
maxTokens: OPENAI_MAX_TOKENS,
|
|
214
223
|
/**
|
|
215
224
|
* Extra
|
|
216
225
|
*/
|
package/src/env/parse-model-config.ts
CHANGED
|
@@ -18,6 +18,7 @@ import {
|
|
|
18
18
|
MODEL_FAMILY_VALUES,
|
|
19
19
|
OPENAI_API_KEY,
|
|
20
20
|
OPENAI_BASE_URL,
|
|
21
|
+
OPENAI_MAX_TOKENS,
|
|
21
22
|
type TIntent,
|
|
22
23
|
type TModelFamily,
|
|
23
24
|
UITarsModelVersion,
|
|
@@ -200,6 +201,9 @@ export const parseOpenaiSdkConfig = ({
|
|
|
200
201
|
const legacyOpenaiExtraConfig = useLegacyLogic
|
|
201
202
|
? provider[MIDSCENE_OPENAI_INIT_CONFIG_JSON]
|
|
202
203
|
: undefined;
|
|
204
|
+
const legacyMaxTokens = useLegacyLogic
|
|
205
|
+
? provider[OPENAI_MAX_TOKENS]
|
|
206
|
+
: undefined;
|
|
203
207
|
const legacyModelFamily = useLegacyLogic
|
|
204
208
|
? legacyConfigToModelFamily(provider)
|
|
205
209
|
: undefined;
|
|
@@ -222,6 +226,13 @@ export const parseOpenaiSdkConfig = ({
|
|
|
222
226
|
);
|
|
223
227
|
const extraBodyStr: string | undefined = provider[keys.extraBody];
|
|
224
228
|
const extraBody = parseJson(keys.extraBody, extraBodyStr);
|
|
229
|
+
const maxTokensStr = provider[keys.maxTokens] || legacyMaxTokens;
|
|
230
|
+
const maxTokens = (() => {
|
|
231
|
+
const val = maxTokensStr?.trim();
|
|
232
|
+
if (!val) return undefined;
|
|
233
|
+
const num = Number(val);
|
|
234
|
+
return Number.isFinite(num) ? num : undefined;
|
|
235
|
+
})();
|
|
225
236
|
const temperature = provider[keys.temperature]
|
|
226
237
|
? Number(provider[keys.temperature])
|
|
227
238
|
: 0;
|
|
@@ -239,6 +250,7 @@ export const parseOpenaiSdkConfig = ({
|
|
|
239
250
|
openaiApiKey,
|
|
240
251
|
openaiExtraConfig: normalizeOpenaiExtraConfig(openaiExtraConfig),
|
|
241
252
|
extraBody,
|
|
253
|
+
maxTokens,
|
|
242
254
|
modelFamily,
|
|
243
255
|
uiTarsModelVersion,
|
|
244
256
|
modelName: modelName!,
|
package/src/env/types.ts
CHANGED
|
@@ -106,6 +106,8 @@ export const MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON =
|
|
|
106
106
|
'MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON';
|
|
107
107
|
export const MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON =
|
|
108
108
|
'MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON';
|
|
109
|
+
export const MIDSCENE_INSIGHT_MODEL_MAX_TOKENS =
|
|
110
|
+
'MIDSCENE_INSIGHT_MODEL_MAX_TOKENS';
|
|
109
111
|
export const MIDSCENE_INSIGHT_MODEL_TIMEOUT = 'MIDSCENE_INSIGHT_MODEL_TIMEOUT';
|
|
110
112
|
export const MIDSCENE_INSIGHT_MODEL_TEMPERATURE =
|
|
111
113
|
'MIDSCENE_INSIGHT_MODEL_TEMPERATURE';
|
|
@@ -135,6 +137,8 @@ export const MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON =
|
|
|
135
137
|
'MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON';
|
|
136
138
|
export const MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON =
|
|
137
139
|
'MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON';
|
|
140
|
+
export const MIDSCENE_PLANNING_MODEL_MAX_TOKENS =
|
|
141
|
+
'MIDSCENE_PLANNING_MODEL_MAX_TOKENS';
|
|
138
142
|
export const MIDSCENE_PLANNING_MODEL_TIMEOUT =
|
|
139
143
|
'MIDSCENE_PLANNING_MODEL_TIMEOUT';
|
|
140
144
|
export const MIDSCENE_PLANNING_MODEL_TEMPERATURE =
|
|
@@ -178,14 +182,11 @@ export const BOOLEAN_ENV_KEYS = [
|
|
|
178
182
|
] as const;
|
|
179
183
|
|
|
180
184
|
export const NUMBER_ENV_KEYS = [
|
|
181
|
-
MIDSCENE_MODEL_MAX_TOKENS,
|
|
182
185
|
MIDSCENE_CACHE_MAX_FILENAME_LENGTH,
|
|
183
186
|
MIDSCENE_REPLANNING_CYCLE_LIMIT,
|
|
184
187
|
] as const;
|
|
185
188
|
|
|
186
189
|
export const STRING_ENV_KEYS = [
|
|
187
|
-
MIDSCENE_MODEL_MAX_TOKENS,
|
|
188
|
-
OPENAI_MAX_TOKENS,
|
|
189
190
|
MIDSCENE_ADB_PATH,
|
|
190
191
|
MIDSCENE_ADB_REMOTE_HOST,
|
|
191
192
|
MIDSCENE_ADB_REMOTE_PORT,
|
|
@@ -224,6 +225,7 @@ export const MODEL_ENV_KEYS = [
|
|
|
224
225
|
MIDSCENE_MODEL_BASE_URL,
|
|
225
226
|
MIDSCENE_MODEL_SOCKS_PROXY,
|
|
226
227
|
MIDSCENE_MODEL_HTTP_PROXY,
|
|
228
|
+
MIDSCENE_MODEL_MAX_TOKENS,
|
|
227
229
|
MIDSCENE_MODEL_TIMEOUT,
|
|
228
230
|
MIDSCENE_MODEL_TEMPERATURE,
|
|
229
231
|
MIDSCENE_MODEL_RETRY_COUNT,
|
|
@@ -240,6 +242,7 @@ export const MODEL_ENV_KEYS = [
|
|
|
240
242
|
// model default legacy
|
|
241
243
|
OPENAI_API_KEY,
|
|
242
244
|
OPENAI_BASE_URL,
|
|
245
|
+
OPENAI_MAX_TOKENS,
|
|
243
246
|
MIDSCENE_OPENAI_INIT_CONFIG_JSON,
|
|
244
247
|
MIDSCENE_OPENAI_HTTP_PROXY,
|
|
245
248
|
MIDSCENE_OPENAI_SOCKS_PROXY,
|
|
@@ -251,6 +254,7 @@ export const MODEL_ENV_KEYS = [
|
|
|
251
254
|
MIDSCENE_INSIGHT_MODEL_API_KEY,
|
|
252
255
|
MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON,
|
|
253
256
|
MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON,
|
|
257
|
+
MIDSCENE_INSIGHT_MODEL_MAX_TOKENS,
|
|
254
258
|
MIDSCENE_INSIGHT_MODEL_TIMEOUT,
|
|
255
259
|
MIDSCENE_INSIGHT_MODEL_TEMPERATURE,
|
|
256
260
|
MIDSCENE_INSIGHT_MODEL_RETRY_COUNT,
|
|
@@ -267,6 +271,7 @@ export const MODEL_ENV_KEYS = [
|
|
|
267
271
|
MIDSCENE_PLANNING_MODEL_API_KEY,
|
|
268
272
|
MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON,
|
|
269
273
|
MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON,
|
|
274
|
+
MIDSCENE_PLANNING_MODEL_MAX_TOKENS,
|
|
270
275
|
MIDSCENE_PLANNING_MODEL_TIMEOUT,
|
|
271
276
|
MIDSCENE_PLANNING_MODEL_TEMPERATURE,
|
|
272
277
|
MIDSCENE_PLANNING_MODEL_RETRY_COUNT,
|
|
@@ -335,6 +340,8 @@ export interface IModelConfigForInsight {
|
|
|
335
340
|
[MIDSCENE_INSIGHT_MODEL_API_KEY]?: string;
|
|
336
341
|
[MIDSCENE_INSIGHT_MODEL_INIT_CONFIG_JSON]?: string;
|
|
337
342
|
[MIDSCENE_INSIGHT_MODEL_EXTRA_BODY_JSON]?: string;
|
|
343
|
+
// max tokens
|
|
344
|
+
[MIDSCENE_INSIGHT_MODEL_MAX_TOKENS]?: string;
|
|
338
345
|
// timeout
|
|
339
346
|
[MIDSCENE_INSIGHT_MODEL_TIMEOUT]?: string;
|
|
340
347
|
// temperature
|
|
@@ -354,6 +361,8 @@ export interface IModelConfigForPlanning {
|
|
|
354
361
|
[MIDSCENE_PLANNING_MODEL_API_KEY]?: string;
|
|
355
362
|
[MIDSCENE_PLANNING_MODEL_INIT_CONFIG_JSON]?: string;
|
|
356
363
|
[MIDSCENE_PLANNING_MODEL_EXTRA_BODY_JSON]?: string;
|
|
364
|
+
// max tokens
|
|
365
|
+
[MIDSCENE_PLANNING_MODEL_MAX_TOKENS]?: string;
|
|
357
366
|
// timeout
|
|
358
367
|
[MIDSCENE_PLANNING_MODEL_TIMEOUT]?: string;
|
|
359
368
|
// temperature
|
|
@@ -381,6 +390,8 @@ export interface IModelConfigForDefault {
|
|
|
381
390
|
[MIDSCENE_MODEL_API_KEY]?: string;
|
|
382
391
|
[MIDSCENE_MODEL_INIT_CONFIG_JSON]?: string;
|
|
383
392
|
[MIDSCENE_MODEL_EXTRA_BODY_JSON]?: string;
|
|
393
|
+
// max tokens
|
|
394
|
+
[MIDSCENE_MODEL_MAX_TOKENS]?: string;
|
|
384
395
|
// extra
|
|
385
396
|
[MIDSCENE_MODEL_FAMILY]?: TModelFamily;
|
|
386
397
|
// temperature
|
|
@@ -403,6 +414,7 @@ export interface IModelConfigForDefaultLegacy {
|
|
|
403
414
|
[OPENAI_BASE_URL]?: string;
|
|
404
415
|
[OPENAI_API_KEY]?: string;
|
|
405
416
|
[MIDSCENE_OPENAI_INIT_CONFIG_JSON]?: string;
|
|
417
|
+
[OPENAI_MAX_TOKENS]?: string;
|
|
406
418
|
}
|
|
407
419
|
|
|
408
420
|
/**
|
|
@@ -479,6 +491,10 @@ export interface IModelConfig {
|
|
|
479
491
|
* Example: { "chat_template_kwargs": { "enable_thinking": true } }
|
|
480
492
|
*/
|
|
481
493
|
extraBody?: Record<string, unknown>;
|
|
494
|
+
/**
|
|
495
|
+
* max_tokens for model responses.
|
|
496
|
+
*/
|
|
497
|
+
maxTokens?: number;
|
|
482
498
|
/**
|
|
483
499
|
* Timeout for API calls in milliseconds.
|
|
484
500
|
* If not set, uses OpenAI SDK default (10 minutes).
|
package/src/img/transform.ts
CHANGED
|
@@ -4,8 +4,8 @@ import { readFileSync } from 'node:fs';
|
|
|
4
4
|
import { writeFile } from 'node:fs/promises';
|
|
5
5
|
import path from 'node:path';
|
|
6
6
|
import type { PhotonImage as PhotonImageType } from '@silvia-odwyer/photon-node';
|
|
7
|
-
import type { Rect } from 'src/types';
|
|
8
7
|
import { getDebug } from '../logger';
|
|
8
|
+
import type { Rect } from '../types';
|
|
9
9
|
import { ifInNode } from '../utils';
|
|
10
10
|
import getPhoton from './get-photon';
|
|
11
11
|
import getSharp from './get-sharp';
|
package/src/mcp/tool-generator.ts
CHANGED
|
@@ -15,6 +15,9 @@ import type {
|
|
|
15
15
|
ToolResult,
|
|
16
16
|
ToolSchema,
|
|
17
17
|
} from './types';
|
|
18
|
+
import { composeUserPrompt, promptInputExtraSchema } from './user-prompt';
|
|
19
|
+
|
|
20
|
+
export { composeUserPrompt };
|
|
18
21
|
|
|
19
22
|
/**
|
|
20
23
|
* Generate MCP tool description from ActionSpaceItem
|
|
@@ -568,6 +571,9 @@ export function generateCommonTools(
|
|
|
568
571
|
if (!screenshot) {
|
|
569
572
|
return createErrorResult('Screenshot not available');
|
|
570
573
|
}
|
|
574
|
+
await agent.recordToReport?.('take_screenshot', {
|
|
575
|
+
screenshotBase64: screenshot,
|
|
576
|
+
});
|
|
571
577
|
const { mimeType, body } = parseBase64(screenshot);
|
|
572
578
|
return {
|
|
573
579
|
content: [{ type: 'image', data: body, mimeType }],
|
|
@@ -604,16 +610,7 @@ export function generateCommonTools(
|
|
|
604
610
|
return createErrorResult('act is not supported by this agent');
|
|
605
611
|
}
|
|
606
612
|
const result = await agent.aiAction(prompt, { deepThink: false });
|
|
607
|
-
|
|
608
|
-
if (result) {
|
|
609
|
-
const message =
|
|
610
|
-
typeof result === 'string' ? result : JSON.stringify(result);
|
|
611
|
-
screenshotResult.content.unshift({
|
|
612
|
-
type: 'text',
|
|
613
|
-
text: `Task finished, message: ${message}`,
|
|
614
|
-
});
|
|
615
|
-
}
|
|
616
|
-
return screenshotResult;
|
|
613
|
+
return await captureScreenshotResult(agent, 'act', result);
|
|
617
614
|
} catch (error: unknown) {
|
|
618
615
|
const errorMessage = getErrorMessage(error);
|
|
619
616
|
console.error('Error executing act:', errorMessage);
|
|
@@ -631,6 +628,7 @@ export function generateCommonTools(
|
|
|
631
628
|
.describe(
|
|
632
629
|
'Natural language assertion to verify, e.g. "there is a login button visible"',
|
|
633
630
|
),
|
|
631
|
+
...promptInputExtraSchema,
|
|
634
632
|
...initArgSchema,
|
|
635
633
|
},
|
|
636
634
|
cli: mergeToolCliMetadata(undefined, initArgCliMetadata),
|
|
@@ -643,7 +641,13 @@ export function generateCommonTools(
|
|
|
643
641
|
if (!agent.aiAssert) {
|
|
644
642
|
return createErrorResult('assert is not supported by this agent');
|
|
645
643
|
}
|
|
646
|
-
|
|
644
|
+
const userPrompt = composeUserPrompt({
|
|
645
|
+
prompt,
|
|
646
|
+
image: args.image,
|
|
647
|
+
imageName: args.imageName,
|
|
648
|
+
convertHttpImage2Base64: args.convertHttpImage2Base64,
|
|
649
|
+
});
|
|
650
|
+
await agent.aiAssert(userPrompt);
|
|
647
651
|
return {
|
|
648
652
|
content: [{ type: 'text', text: 'Assertion passed.' }],
|
|
649
653
|
};
|
package/src/mcp/types.ts
CHANGED
|
@@ -83,6 +83,22 @@ export interface ActionSpaceItem {
|
|
|
83
83
|
paramSchema?: z.ZodTypeAny;
|
|
84
84
|
}
|
|
85
85
|
|
|
86
|
+
/**
|
|
87
|
+
* Structural shape compatible with @midscene/core `TUserPrompt`.
|
|
88
|
+
* Declared locally to avoid a circular dep on `@midscene/core` from `@midscene/shared`.
|
|
89
|
+
*
|
|
90
|
+
* Currently consumed only by the `assert` tool in `generateCommonTools`.
|
|
91
|
+
* `aiAction` and `aiWaitFor` stay string-only at the CLI surface because the
|
|
92
|
+
* tools generator does not yet expose multimodal entry points for them.
|
|
93
|
+
*/
|
|
94
|
+
export type UserPromptLike =
|
|
95
|
+
| string
|
|
96
|
+
| {
|
|
97
|
+
prompt: string;
|
|
98
|
+
images?: Array<{ name: string; url: string }>;
|
|
99
|
+
convertHttpImage2Base64?: boolean;
|
|
100
|
+
};
|
|
101
|
+
|
|
86
102
|
/**
|
|
87
103
|
* Base agent interface
|
|
88
104
|
* Represents a platform-specific agent (Android, iOS, Web)
|
|
@@ -94,6 +110,10 @@ export interface BaseAgent {
|
|
|
94
110
|
page?: {
|
|
95
111
|
screenshotBase64(): Promise<string>;
|
|
96
112
|
};
|
|
113
|
+
recordToReport?: (
|
|
114
|
+
title?: string,
|
|
115
|
+
opt?: { content?: string; screenshotBase64?: string },
|
|
116
|
+
) => Promise<void>;
|
|
97
117
|
callActionInActionSpace?: (
|
|
98
118
|
actionName: string,
|
|
99
119
|
params?: unknown,
|
|
@@ -107,7 +127,7 @@ export interface BaseAgent {
|
|
|
107
127
|
options: Record<string, unknown>,
|
|
108
128
|
) => Promise<unknown>;
|
|
109
129
|
aiAssert?: (
|
|
110
|
-
assertion:
|
|
130
|
+
assertion: UserPromptLike,
|
|
111
131
|
msg?: string,
|
|
112
132
|
options?: Record<string, unknown>,
|
|
113
133
|
) => Promise<unknown>;
|
package/src/mcp/user-prompt.ts
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import type { UserPromptLike } from './types';
|
|
3
|
+
|
|
4
|
+
type PromptReferenceImage = { name: string; url: string };
|
|
5
|
+
|
|
6
|
+
function normalizeStringList(raw: unknown, fieldName: string): string[] {
|
|
7
|
+
if (raw === undefined || raw === null) return [];
|
|
8
|
+
if (typeof raw === 'string') {
|
|
9
|
+
const trimmed = raw.trim();
|
|
10
|
+
return trimmed ? [trimmed] : [];
|
|
11
|
+
}
|
|
12
|
+
if (Array.isArray(raw)) {
|
|
13
|
+
return raw.map((item, index) => {
|
|
14
|
+
if (typeof item !== 'string') {
|
|
15
|
+
throw new Error(`${fieldName}[${index}]: expected a string.`);
|
|
16
|
+
}
|
|
17
|
+
return item.trim();
|
|
18
|
+
});
|
|
19
|
+
}
|
|
20
|
+
throw new Error(
|
|
21
|
+
`${fieldName}: expected a string or string array, got ${typeof raw}.`,
|
|
22
|
+
);
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
function composeImages(input: {
|
|
26
|
+
image?: unknown;
|
|
27
|
+
imageName?: unknown;
|
|
28
|
+
}): PromptReferenceImage[] {
|
|
29
|
+
const urls = normalizeStringList(input.image, 'image');
|
|
30
|
+
const names = normalizeStringList(input.imageName, 'imageName');
|
|
31
|
+
|
|
32
|
+
if (urls.length !== names.length) {
|
|
33
|
+
throw new Error(
|
|
34
|
+
`image/imageName: expected the same number of --image and --image-name values, got ${urls.length} image(s) and ${names.length} image name(s).`,
|
|
35
|
+
);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
return urls.map((url, index) => ({ name: names[index], url }));
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
function coerceBoolean(value: unknown): boolean | undefined {
|
|
42
|
+
if (value === undefined || value === null) return undefined;
|
|
43
|
+
if (typeof value === 'boolean') return value;
|
|
44
|
+
if (typeof value === 'string') {
|
|
45
|
+
const trimmed = value.trim();
|
|
46
|
+
if (!trimmed) return undefined;
|
|
47
|
+
const v = trimmed.toLowerCase();
|
|
48
|
+
if (v === 'true' || v === '1') return true;
|
|
49
|
+
if (v === 'false' || v === '0') return false;
|
|
50
|
+
throw new Error(
|
|
51
|
+
`convertHttpImage2Base64: expected "true", "false", "1", or "0"; got ${JSON.stringify(value)}.`,
|
|
52
|
+
);
|
|
53
|
+
}
|
|
54
|
+
throw new Error(
|
|
55
|
+
`convertHttpImage2Base64: expected a boolean, got ${typeof value}.`,
|
|
56
|
+
);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
export function composeUserPrompt(input: {
|
|
60
|
+
prompt: string;
|
|
61
|
+
image?: unknown;
|
|
62
|
+
imageName?: unknown;
|
|
63
|
+
convertHttpImage2Base64?: unknown;
|
|
64
|
+
}): UserPromptLike {
|
|
65
|
+
const images = composeImages({
|
|
66
|
+
image: input.image,
|
|
67
|
+
imageName: input.imageName,
|
|
68
|
+
});
|
|
69
|
+
const convertFlag = coerceBoolean(input.convertHttpImage2Base64);
|
|
70
|
+
|
|
71
|
+
if (images.length === 0 && convertFlag === undefined) {
|
|
72
|
+
return input.prompt;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
const payload: Exclude<UserPromptLike, string> = { prompt: input.prompt };
|
|
76
|
+
if (images.length > 0) {
|
|
77
|
+
payload.images = images;
|
|
78
|
+
}
|
|
79
|
+
if (convertFlag !== undefined) {
|
|
80
|
+
payload.convertHttpImage2Base64 = convertFlag;
|
|
81
|
+
}
|
|
82
|
+
return payload;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
export const promptInputExtraSchema = {
|
|
86
|
+
image: z
|
|
87
|
+
.union([z.string(), z.array(z.string())])
|
|
88
|
+
.optional()
|
|
89
|
+
.describe('Reference image URL/path. Repeat --image for multiple images.'),
|
|
90
|
+
imageName: z
|
|
91
|
+
.union([z.string(), z.array(z.string())])
|
|
92
|
+
.optional()
|
|
93
|
+
.describe(
|
|
94
|
+
'Reference image name. Repeat --image-name; must align with --image order.',
|
|
95
|
+
),
|
|
96
|
+
convertHttpImage2Base64: z
|
|
97
|
+
.union([z.boolean(), z.string()])
|
|
98
|
+
.optional()
|
|
99
|
+
.describe(
|
|
100
|
+
'If true, convert http(s) image URLs to base64 before sending to the model.',
|
|
101
|
+
),
|
|
102
|
+
};
|