@midscene/core 1.9.6 → 1.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +40 -8
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/tasks.mjs +3 -3
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +18 -3
- package/dist/es/agent/utils.mjs.map +1 -1
- package/dist/es/ai-model/prompt/describe.mjs +10 -2
- package/dist/es/ai-model/prompt/describe.mjs.map +1 -1
- package/dist/es/ai-model/prompt/markdown-generator.mjs +150 -40
- package/dist/es/ai-model/prompt/markdown-generator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/recorder-generation-common.mjs +74 -14
- package/dist/es/ai-model/prompt/recorder-generation-common.mjs.map +1 -1
- package/dist/es/ai-model/prompt/recorder-metadata-generator.mjs +3 -5
- package/dist/es/ai-model/prompt/recorder-metadata-generator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/recorder-ui-describer.mjs +10 -6
- package/dist/es/ai-model/prompt/recorder-ui-describer.mjs.map +1 -1
- package/dist/es/ai-model/prompt/yaml-generator.mjs +2 -2
- package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/index.mjs +33 -3
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/device/index.mjs.map +1 -1
- package/dist/es/recorder-ui-describer.mjs +33 -84
- package/dist/es/recorder-ui-describer.mjs.map +1 -1
- package/dist/es/service/index.mjs +11 -3
- package/dist/es/service/index.mjs.map +1 -1
- package/dist/es/service/utils.mjs +50 -1
- package/dist/es/service/utils.mjs.map +1 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/lib/agent/agent.js +39 -7
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/tasks.js +3 -3
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/utils.js +20 -2
- package/dist/lib/agent/utils.js.map +1 -1
- package/dist/lib/ai-model/prompt/describe.js +10 -2
- package/dist/lib/ai-model/prompt/describe.js.map +1 -1
- package/dist/lib/ai-model/prompt/markdown-generator.js +150 -40
- package/dist/lib/ai-model/prompt/markdown-generator.js.map +1 -1
- package/dist/lib/ai-model/prompt/recorder-generation-common.js +75 -12
- package/dist/lib/ai-model/prompt/recorder-generation-common.js.map +1 -1
- package/dist/lib/ai-model/prompt/recorder-metadata-generator.js +2 -4
- package/dist/lib/ai-model/prompt/recorder-metadata-generator.js.map +1 -1
- package/dist/lib/ai-model/prompt/recorder-ui-describer.js +10 -6
- package/dist/lib/ai-model/prompt/recorder-ui-describer.js.map +1 -1
- package/dist/lib/ai-model/prompt/yaml-generator.js +2 -2
- package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
- package/dist/lib/ai-model/service-caller/index.js +33 -3
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/device/index.js.map +1 -1
- package/dist/lib/recorder-ui-describer.js +33 -84
- package/dist/lib/recorder-ui-describer.js.map +1 -1
- package/dist/lib/service/index.js +10 -2
- package/dist/lib/service/index.js.map +1 -1
- package/dist/lib/service/utils.js +53 -1
- package/dist/lib/service/utils.js.map +1 -1
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/types/agent/agent.d.ts +17 -6
- package/dist/types/agent/index.d.ts +1 -1
- package/dist/types/agent/tasks.d.ts +4 -2
- package/dist/types/agent/utils.d.ts +4 -1
- package/dist/types/ai-model/prompt/recorder-generation-common.d.ts +11 -7
- package/dist/types/ai-model/prompt/recorder-ui-describer.d.ts +1 -1
- package/dist/types/device/index.d.ts +6 -0
- package/dist/types/service/index.d.ts +1 -0
- package/dist/types/service/utils.d.ts +2 -0
- package/dist/types/types.d.ts +1 -0
- package/package.json +2 -2
|
@@ -1,7 +1,88 @@
|
|
|
1
|
+
import { imageInfoOfBase64, parseBase64, resizeImgBase64 } from "@midscene/shared/img";
|
|
2
|
+
import { getDebug } from "@midscene/shared/logger";
|
|
1
3
|
import { getMidsceneRecorderEventDescription, stringifyMidsceneRecorderTargetBlock } from "@midscene/shared/recorder";
|
|
2
4
|
import { callAIWithStringResponse } from "../index.mjs";
|
|
3
5
|
import { getModelRuntime } from "../models/index.mjs";
|
|
4
6
|
import { prepareRecorderGenerationContext, validateEvents } from "./recorder-generation-common.mjs";
|
|
7
|
+
// Ceiling on the summed `dataUrl.length` of the screenshots attached to one
// markdown replay prompt (measured in data-URL characters).
const MARKDOWN_REPLAY_SCREENSHOT_PAYLOAD_BUDGET = 600000;

// Screenshots whose longest edge (width/height as reported by
// imageInfoOfBase64) exceeds this value are resized down to it.
const MARKDOWN_REPLAY_SCREENSHOT_MAX_EDGE = 768;

// Debug logger for this module, created with the `console` option enabled.
const debugMarkdownReplay = getDebug('ai:recorder-markdown', { console: true });
|
|
12
|
+
/**
 * Picks the screenshot assets whose data URLs fit into a shared payload budget.
 *
 * Assets are visited in input order. An asset is kept only when adding its
 * `dataUrl.length` to the running total stays within the budget. An asset
 * that does not fit is dropped without ending the scan, so a later, smaller
 * asset can still be kept.
 *
 * @param {Array<{dataUrl: string}>} screenshotAssets - Candidate assets; the array is not mutated.
 * @param {number} [payloadBudget=MARKDOWN_REPLAY_SCREENSHOT_PAYLOAD_BUDGET] - Maximum
 *   combined `dataUrl.length` of the kept assets.
 * @returns {Array<{dataUrl: string}>} A new array holding the kept assets in their original order.
 */
function limitScreenshotAssetsForMarkdownReplay(screenshotAssets, payloadBudget = MARKDOWN_REPLAY_SCREENSHOT_PAYLOAD_BUDGET) {
    let usedPayload = 0;
    return screenshotAssets.filter(({ dataUrl }) => {
        const payloadSize = dataUrl.length;
        // usedPayload never goes negative, so this single comparison also
        // rejects an asset that is larger than the whole budget on its own.
        if (usedPayload + payloadSize > payloadBudget) return false;
        usedPayload += payloadSize;
        return true;
    });
}
|
|
21
|
+
/**
 * Downscales one screenshot asset when its longest edge exceeds
 * MARKDOWN_REPLAY_SCREENSHOT_MAX_EDGE.
 *
 * When the longest edge is already within the limit the same asset object is
 * returned. Otherwise the image is resized with both edges multiplied by the
 * same factor (each rounded, and never below 1), and a copy of the asset is
 * returned with `dataUrl`, `base64Data` and `mimeType` taken from the resized
 * image.
 *
 * @param {object} asset - Screenshot asset carrying a `dataUrl` string.
 * @returns {Promise<object>} The original asset, or a resized copy of it.
 */
async function compressScreenshotAssetForMarkdownReplay(asset) {
    const dimensions = await imageInfoOfBase64(asset.dataUrl);
    const longestEdge = Math.max(dimensions.width, dimensions.height);
    if (longestEdge <= MARKDOWN_REPLAY_SCREENSHOT_MAX_EDGE) {
        return asset;
    }

    const shrinkFactor = MARKDOWN_REPLAY_SCREENSHOT_MAX_EDGE / longestEdge;
    // Round to whole units but keep every edge at 1 or more.
    const shrinkEdge = (edge) => Math.max(1, Math.round(edge * shrinkFactor));

    const resizedDataUrl = await resizeImgBase64(asset.dataUrl, {
        width: shrinkEdge(dimensions.width),
        height: shrinkEdge(dimensions.height)
    });
    const parsed = parseBase64(resizedDataUrl);

    return {
        ...asset,
        dataUrl: resizedDataUrl,
        base64Data: parsed.body,
        mimeType: parsed.mimeType
    };
}
|
|
38
|
+
/**
 * Compresses every screenshot asset, then trims the list to the payload budget.
 *
 * Assets are processed one at a time, in order. Compression is best effort:
 * when it throws for an asset, the failure is reported through
 * `debugMarkdownReplay` and the unmodified asset is used instead, so one bad
 * image does not abort prompt generation.
 *
 * @param {Array<object>} screenshotAssets - Assets to prepare; the array is not mutated.
 * @returns {Promise<Array<object>>} The assets kept by limitScreenshotAssetsForMarkdownReplay.
 */
async function prepareScreenshotAssetsForMarkdownReplay(screenshotAssets) {
    const compressedAssets = [];
    for (const asset of screenshotAssets) {
        try {
            compressedAssets.push(await compressScreenshotAssetForMarkdownReplay(asset));
        } catch (error) {
            // Keep the fallback, but leave a trace of why the original image was used.
            debugMarkdownReplay('screenshot compression failed, using original asset %o', {
                eventIndex: asset?.eventIndex,
                error: String(error)
            });
            compressedAssets.push(asset);
        }
    }
    return limitScreenshotAssetsForMarkdownReplay(compressedAssets);
}
|
|
47
|
+
/**
 * Builds size statistics for a list of screenshot assets.
 *
 * Sizes are measured as `dataUrl.length`. The list is folded in a single pass
 * rather than spreading all sizes into `Math.max`, which can throw a
 * RangeError once the array exceeds the engine's argument limit.
 *
 * @param {Array<{dataUrl: string}>} screenshotAssets - Assets to summarize.
 * @returns {{count: number, totalPayloadChars: number, maxPayloadChars: number}}
 *   Number of assets, summed size, and largest single size (0 for an empty list).
 */
function summarizeScreenshotAssets(screenshotAssets) {
    let totalPayloadChars = 0;
    let maxPayloadChars = 0;
    for (const { dataUrl } of screenshotAssets) {
        const payloadSize = dataUrl.length;
        totalPayloadChars += payloadSize;
        // Lengths are never negative, so starting the maximum at 0 is safe.
        if (payloadSize > maxPayloadChars) maxPayloadChars = payloadSize;
    }
    return {
        count: screenshotAssets.length,
        totalPayloadChars,
        maxPayloadChars
    };
}
|
|
55
|
+
/**
 * Measures a chat prompt: total text length and number of attached images.
 *
 * A message whose `content` is a string adds that string's length. A message
 * whose `content` is an array is scanned part by part: object parts with
 * `type === 'text'` and a `text` property add `String(text).length`, and
 * parts with `type === 'image_url'` each count as one image. Any other
 * content or part is ignored.
 *
 * @param {Array<{content: unknown}>} prompt - Chat messages to measure.
 * @returns {{textChars: number, imageCount: number}} Accumulated totals.
 */
function getPromptShape(prompt) {
    const shape = {
        textChars: 0,
        imageCount: 0
    };
    for (const { content } of prompt) {
        if (typeof content === 'string') {
            shape.textChars += content.length;
            continue;
        }
        if (!Array.isArray(content)) {
            continue;
        }
        for (const part of content) {
            // Only non-null objects that carry a `type` key are considered.
            if (typeof part !== 'object' || !part || !('type' in part)) {
                continue;
            }
            switch (part.type) {
                case 'text':
                    if ('text' in part) {
                        shape.textChars += String(part.text).length;
                    }
                    break;
                case 'image_url':
                    shape.imageCount += 1;
                    break;
                default:
                    break;
            }
        }
    }
    return shape;
}
|
|
76
|
+
/**
 * Returns a copy of `summary` in which events pointing at a screenshot that
 * is not among `screenshotAssets` have their `screenshotPath` set to undefined.
 *
 * Matching is done on `asset.relativePath`. Events without a screenshot path,
 * or whose path is still included, are passed through as the same object.
 * Neither `summary` nor its events are mutated.
 *
 * @param {{events: Array<object>}} summary - Recorder summary with an `events` array.
 * @param {Array<{relativePath: string}>} screenshotAssets - Screenshots that remain included.
 * @returns {object} Shallow copy of `summary` with the adjusted `events` array.
 */
function removeOmittedScreenshotPaths(summary, screenshotAssets) {
    const keptPaths = new Set();
    for (const asset of screenshotAssets) {
        keptPaths.add(asset.relativePath);
    }

    const clearIfOmitted = (event) => {
        const path = event.screenshotPath;
        if (!path || keptPaths.has(path)) {
            return event;
        }
        return {
            ...event,
            screenshotPath: undefined
        };
    };

    return {
        ...summary,
        events: summary.events.map(clearIfOmitted)
    };
}
|
|
5
86
|
function getMarkdownLanguageInstruction(language) {
|
|
6
87
|
const normalizedLanguage = language?.trim();
|
|
7
88
|
if (!normalizedLanguage) return '';
|
|
@@ -21,17 +102,62 @@ function resolveModelRuntime(model) {
|
|
|
21
102
|
}
|
|
22
103
|
function createRecorderMarkdownReplayPrompt(input) {
|
|
23
104
|
validateEvents(input.events);
|
|
24
|
-
const { summary, screenshotAssets } = prepareRecorderGenerationContext(input);
|
|
25
|
-
const
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
105
|
+
const { summary: rawSummary, screenshotAssets: rawScreenshotAssets } = prepareRecorderGenerationContext(input);
|
|
106
|
+
const screenshotAssets = limitScreenshotAssetsForMarkdownReplay(rawScreenshotAssets);
|
|
107
|
+
const summary = removeOmittedScreenshotPaths(rawSummary, screenshotAssets);
|
|
108
|
+
return createRecorderMarkdownReplayPromptFromContext(input, summary, screenshotAssets);
|
|
109
|
+
}
|
|
110
|
+
/**
 * Builds the markdown replay prompt used for actual generation.
 *
 * Steps, in order: run `validateEvents` on `input.events`, obtain the summary
 * and screenshot assets from `prepareRecorderGenerationContext`, pass the
 * screenshots through `prepareScreenshotAssetsForMarkdownReplay`, clear the
 * screenshot paths of events whose screenshot was dropped, and assemble the
 * prompt. The prompt shape and before/after screenshot statistics are sent
 * to the debug logger before the prompt is returned.
 *
 * @param {object} input - Recorder generation input; `events` and `maxScreenshots` are read here.
 * @returns {Promise<Array<object>>} The prompt returned by createRecorderMarkdownReplayPromptFromContext.
 */
async function createRecorderMarkdownReplayPromptForGeneration(input) {
    validateEvents(input.events);

    const {
        summary: fullSummary,
        screenshotAssets: allScreenshots
    } = prepareRecorderGenerationContext(input);
    const includedScreenshots = await prepareScreenshotAssetsForMarkdownReplay(allScreenshots);
    const trimmedSummary = removeOmittedScreenshotPaths(fullSummary, includedScreenshots);
    const prompt = createRecorderMarkdownReplayPromptFromContext(input, trimmedSummary, includedScreenshots);

    const shapeReport = {
        eventCount: input.events.length,
        maxScreenshots: input.maxScreenshots,
        rawScreenshots: summarizeScreenshotAssets(allScreenshots),
        includedScreenshots: summarizeScreenshotAssets(includedScreenshots),
        prompt: getPromptShape(prompt)
    };
    debugMarkdownReplay('markdown replay prompt shape %o', shapeReport);

    return prompt;
}
|
|
125
|
+
function createRecorderMarkdownReplayPromptFromContext(input, summary, screenshotAssets) {
|
|
126
|
+
const screenshotIndexByEventHash = new Map(screenshotAssets.map((asset, index)=>[
|
|
127
|
+
asset.eventHashId,
|
|
128
|
+
`screenshot-${index + 1}`
|
|
129
|
+
]));
|
|
130
|
+
const events = summary.events.map((event)=>{
|
|
131
|
+
const screenshotRef = screenshotIndexByEventHash.get(event.hashId);
|
|
132
|
+
const { screenshotPath, ...eventWithoutScreenshotPath } = event;
|
|
133
|
+
return screenshotRef ? {
|
|
134
|
+
...eventWithoutScreenshotPath,
|
|
135
|
+
screenshotRef
|
|
136
|
+
} : eventWithoutScreenshotPath;
|
|
137
|
+
});
|
|
138
|
+
const promptPayload = {
|
|
139
|
+
testName: input.testName || summary.testName,
|
|
140
|
+
target: {
|
|
141
|
+
platformId: input.target.platformId,
|
|
142
|
+
label: input.target.label,
|
|
143
|
+
values: input.target.values
|
|
144
|
+
},
|
|
145
|
+
startUrl: summary.startUrl,
|
|
146
|
+
events,
|
|
147
|
+
screenshots: screenshotAssets.map((asset, index)=>({
|
|
148
|
+
screenshotRef: `screenshot-${index + 1}`,
|
|
149
|
+
eventIndex: asset.eventIndex,
|
|
150
|
+
eventHashId: asset.eventHashId,
|
|
151
|
+
eventType: asset.eventType,
|
|
152
|
+
description: getMidsceneRecorderEventDescription(input.events[asset.eventIndex])
|
|
153
|
+
}))
|
|
154
|
+
};
|
|
155
|
+
const promptText = `Generate a Markdown replay script for Midscene Agent. It will be executed with:
|
|
156
|
+
await agent.aiAct(markdownReplayPrompt)
|
|
29
157
|
|
|
30
|
-
|
|
158
|
+
Use only the recorder data and screenshots below.
|
|
31
159
|
|
|
32
|
-
Target
|
|
33
|
-
- Preserve this exact platform: ${input.target.platformId}
|
|
34
|
-
- Target block:
|
|
160
|
+
Target block:
|
|
35
161
|
${stringifyMidsceneRecorderTargetBlock(input.target)}
|
|
36
162
|
|
|
37
163
|
Replay goal:
|
|
@@ -41,13 +167,18 @@ Replay goal:
|
|
|
41
167
|
- Do not invent alternative navigation paths.
|
|
42
168
|
- Do not skip, merge, reorder, or add extra user actions.
|
|
43
169
|
- Prefer recorded UI text, element descriptions, URLs, input values, and scroll direction.
|
|
44
|
-
-
|
|
45
|
-
-
|
|
170
|
+
- For input events, enter event.typedText/event.value exactly; do not infer or correct the text from screenshots.
|
|
171
|
+
- Prefer event.semantic.replayInstruction and event.semantic.elementDescription when event.semantic.source is "aiDescribe" or "recorderAI" and event.semantic.status is "ready".
|
|
172
|
+
- For scroll events, preserve the recorded scroll region from event.semantic.elementDescription/replayInstruction. If the scroll happened in a specific panel, list, table, dialog body, menu, navigation area, or content pane, keep that region in the Markdown step instead of generalizing it to the whole page.
|
|
173
|
+
- If event.semantic.source is "heuristic" or event.semantic.status is "pending"/"failed", use the screenshot/context to write the best visual instruction.
|
|
46
174
|
- Coordinates are only fallback hints. Do not make coordinates the primary instruction when text or screenshots are available.
|
|
175
|
+
- For a click/tap that only focuses a field before an input event, describe the target as the field/control itself. Do not target a placeholder character, typed character, caret, or inner text fragment inside the field.
|
|
47
176
|
- If a target cannot be found, stop and report the missing step. Do not click similar-looking elements.
|
|
48
|
-
-
|
|
177
|
+
- Screenshots are only generation-time visual evidence for you. The generated Markdown will be passed directly to agent.aiAct(markdownReplayPrompt), which accepts text only and cannot receive attached images.
|
|
178
|
+
- Convert any useful screenshot evidence into textual replay instructions. Do not include screenshots, image syntax, image paths, or reference-image names in the generated Markdown.
|
|
179
|
+
- Never write Markdown image syntax such as , reference-style images, HTML <img> tags, ./screenshots/... paths, or screenshot-* names in the output.
|
|
49
180
|
|
|
50
|
-
Required
|
|
181
|
+
Required structure:
|
|
51
182
|
# ${input.testName || summary.testName}
|
|
52
183
|
|
|
53
184
|
## Goal
|
|
@@ -57,33 +188,11 @@ Reproduce the recorded user workflow exactly.
|
|
|
57
188
|
- Platform: ${input.target.platformId}
|
|
58
189
|
- Start target: ${summary.startUrl || input.target.label || input.target.deviceId || 'Recorded target'}
|
|
59
190
|
|
|
60
|
-
## Replay rules
|
|
61
|
-
- Follow the steps in order.
|
|
62
|
-
- Do not invent alternative navigation paths.
|
|
63
|
-
- If a referenced target cannot be found, stop and report the missing step.
|
|
64
|
-
|
|
65
191
|
## Steps
|
|
66
192
|
1. ...
|
|
67
193
|
|
|
68
|
-
|
|
69
|
-
${JSON.stringify({
|
|
70
|
-
...summary,
|
|
71
|
-
target: input.target,
|
|
72
|
-
events: summary.events,
|
|
73
|
-
screenshotAssets: screenshotAssets.map((asset)=>({
|
|
74
|
-
eventIndex: asset.eventIndex,
|
|
75
|
-
eventHashId: asset.eventHashId,
|
|
76
|
-
eventType: asset.eventType,
|
|
77
|
-
relativePath: asset.relativePath,
|
|
78
|
-
description: getMidsceneRecorderEventDescription(input.events[asset.eventIndex])
|
|
79
|
-
}))
|
|
80
|
-
}, null, 2)}
|
|
81
|
-
|
|
82
|
-
Screenshot rules:
|
|
83
|
-
- Insert a screenshot directly under the step that needs visual grounding.
|
|
84
|
-
- Use Markdown image syntax exactly like: 
|
|
85
|
-
- Only reference paths listed in screenshotAssets.
|
|
86
|
-
- Do not reference images that are not listed.${getMarkdownLanguageInstruction(input.language)}
|
|
194
|
+
Recorder data:
|
|
195
|
+
${JSON.stringify(promptPayload, null, 2)}${getMarkdownLanguageInstruction(input.language)}
|
|
87
196
|
|
|
88
197
|
Important: Return ONLY raw Markdown. Do NOT wrap the response in markdown code blocks.`;
|
|
89
198
|
const content = [
|
|
@@ -93,9 +202,10 @@ Important: Return ONLY raw Markdown. Do NOT wrap the response in markdown code b
|
|
|
93
202
|
}
|
|
94
203
|
];
|
|
95
204
|
for (const asset of screenshotAssets){
|
|
205
|
+
const screenshotRef = screenshotIndexByEventHash.get(asset.eventHashId);
|
|
96
206
|
content.push({
|
|
97
207
|
type: 'text',
|
|
98
|
-
text:
|
|
208
|
+
text: `${screenshotRef} for event #${asset.eventIndex + 1}`
|
|
99
209
|
});
|
|
100
210
|
content.push({
|
|
101
211
|
type: 'image_url',
|
|
@@ -107,7 +217,7 @@ Important: Return ONLY raw Markdown. Do NOT wrap the response in markdown code b
|
|
|
107
217
|
return [
|
|
108
218
|
{
|
|
109
219
|
role: 'system',
|
|
110
|
-
content: "You generate precise Markdown replay scripts for Midscene agent.
|
|
220
|
+
content: "You generate precise Markdown replay scripts for Midscene agent.aiAct. The final output is plain text that will be passed directly to agent.aiAct, so it must be deterministic, ordered, safe for AI execution, and must not contain image references, screenshot paths, or screenshot labels."
|
|
111
221
|
},
|
|
112
222
|
{
|
|
113
223
|
role: 'user',
|
|
@@ -117,7 +227,7 @@ Important: Return ONLY raw Markdown. Do NOT wrap the response in markdown code b
|
|
|
117
227
|
}
|
|
118
228
|
async function generateRecorderMarkdownReplay(input, model) {
|
|
119
229
|
try {
|
|
120
|
-
const prompt =
|
|
230
|
+
const prompt = await createRecorderMarkdownReplayPromptForGeneration(input);
|
|
121
231
|
const response = await callAIWithStringResponse(prompt, resolveModelRuntime(model));
|
|
122
232
|
if (response?.content && 'string' == typeof response.content) return normalizeGeneratedMarkdown(response.content);
|
|
123
233
|
throw new Error('Failed to generate recorder Markdown replay');
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/markdown-generator.mjs","sources":["../../../../src/ai-model/prompt/markdown-generator.ts"],"sourcesContent":["import type { IModelConfig } from '@midscene/shared/env';\nimport {\n getMidsceneRecorderEventDescription,\n stringifyMidsceneRecorderTargetBlock,\n} from '@midscene/shared/recorder';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { callAIWithStringResponse } from '../index';\nimport { type ModelRuntime, getModelRuntime } from '../models';\nimport {\n type RecorderGenerationInput,\n prepareRecorderGenerationContext,\n validateEvents,\n} from './recorder-generation-common';\n\nexport type RecorderMarkdownGenerationInput = RecorderGenerationInput;\n\nfunction getMarkdownLanguageInstruction(language?: string) {\n const normalizedLanguage = language?.trim();\n if (!normalizedLanguage) {\n return '';\n }\n\n return `\nLanguage requirement:\n- Write all human-readable Markdown instructions in ${normalizedLanguage}.\n- Keep file paths, URLs, platform ids, API names, and quoted UI text unchanged.`;\n}\n\nfunction normalizeGeneratedMarkdown(content: string) {\n const trimmed = content.trim();\n const fencedMatch = trimmed.match(\n /^```(?:md|markdown)?\\s*([\\s\\S]*?)\\s*```$/i,\n );\n return `${(fencedMatch?.[1] ?? 
trimmed).trim()}\\n`;\n}\n\nfunction resolveModelRuntime(model: IModelConfig | ModelRuntime): ModelRuntime {\n if ('config' in model && 'adapter' in model) {\n return model;\n }\n return getModelRuntime(model);\n}\n\nexport function createRecorderMarkdownReplayPrompt(\n input: RecorderMarkdownGenerationInput,\n): ChatCompletionMessageParam[] {\n validateEvents(input.events);\n\n const { summary, screenshotAssets } = prepareRecorderGenerationContext(input);\n const promptText = `Generate a Markdown replay script for Midscene Agent.\n\nThis Markdown will be executed with:\nawait agent.runMarkdown('./x.md')\n\nIt is an AI-executable replay script, not a human report.\n\nTarget platform:\n- Preserve this exact platform: ${input.target.platformId}\n- Target block:\n${stringifyMidsceneRecorderTargetBlock(input.target)}\n\nReplay goal:\n- Reproduce the recorded user workflow exactly.\n- Preserve event order.\n- Preserve the user's original intent.\n- Do not invent alternative navigation paths.\n- Do not skip, merge, reorder, or add extra user actions.\n- Prefer recorded UI text, element descriptions, URLs, input values, and scroll direction.\n- Prefer event.replayInstruction and event.elementDescription when descriptionSource is \"ai\".\n- If descriptionSource is \"fallback\", use the screenshot/context to write the best visual instruction.\n- Coordinates are only fallback hints. Do not make coordinates the primary instruction when text or screenshots are available.\n- If a target cannot be found, stop and report the missing step. Do not click similar-looking elements.\n- Use screenshots only when they are provided below. 
Reference them by their exact relative paths.\n\nRequired Markdown structure:\n# ${input.testName || summary.testName}\n\n## Goal\nReproduce the recorded user workflow exactly.\n\n## Target\n- Platform: ${input.target.platformId}\n- Start target: ${summary.startUrl || input.target.label || input.target.deviceId || 'Recorded target'}\n\n## Replay rules\n- Follow the steps in order.\n- Do not invent alternative navigation paths.\n- If a referenced target cannot be found, stop and report the missing step.\n\n## Steps\n1. ...\n\nEvent summary:\n${JSON.stringify(\n {\n ...summary,\n target: input.target,\n events: summary.events,\n screenshotAssets: screenshotAssets.map((asset) => ({\n eventIndex: asset.eventIndex,\n eventHashId: asset.eventHashId,\n eventType: asset.eventType,\n relativePath: asset.relativePath,\n description: getMidsceneRecorderEventDescription(\n input.events[asset.eventIndex],\n ),\n })),\n },\n null,\n 2,\n)}\n\nScreenshot rules:\n- Insert a screenshot directly under the step that needs visual grounding.\n- Use Markdown image syntax exactly like: \n- Only reference paths listed in screenshotAssets.\n- Do not reference images that are not listed.${getMarkdownLanguageInstruction(input.language)}\n\nImportant: Return ONLY raw Markdown. Do NOT wrap the response in markdown code blocks.`;\n\n const content: any[] = [\n {\n type: 'text',\n text: promptText,\n },\n ];\n\n for (const asset of screenshotAssets) {\n content.push({\n type: 'text',\n text: `Screenshot asset for event #${asset.eventIndex + 1}: ${asset.relativePath}`,\n });\n content.push({\n type: 'image_url',\n image_url: {\n url: asset.dataUrl,\n },\n });\n }\n\n return [\n {\n role: 'system',\n content:\n 'You generate precise Markdown replay scripts for Midscene agent.runMarkdown. 
The output must be deterministic, ordered, and safe for AI execution.',\n },\n {\n role: 'user',\n content,\n },\n ];\n}\n\nexport async function generateRecorderMarkdownReplay(\n input: RecorderMarkdownGenerationInput,\n model: IModelConfig | ModelRuntime,\n): Promise<string> {\n try {\n const prompt = createRecorderMarkdownReplayPrompt(input);\n const response = await callAIWithStringResponse(\n prompt,\n resolveModelRuntime(model),\n );\n\n if (response?.content && typeof response.content === 'string') {\n return normalizeGeneratedMarkdown(response.content);\n }\n\n throw new Error('Failed to generate recorder Markdown replay');\n } catch (error) {\n throw new Error(`Failed to generate recorder Markdown replay: ${error}`);\n }\n}\n\nexport async function convertRecordLogIntoMarkdown(\n log: RecorderMarkdownGenerationInput,\n modelConfig: IModelConfig,\n): Promise<string> {\n return generateRecorderMarkdownReplay(log, modelConfig);\n}\n"],"names":["getMarkdownLanguageInstruction","language","normalizedLanguage","normalizeGeneratedMarkdown","content","trimmed","fencedMatch","resolveModelRuntime","model","getModelRuntime","createRecorderMarkdownReplayPrompt","input","validateEvents","summary","screenshotAssets","prepareRecorderGenerationContext","promptText","stringifyMidsceneRecorderTargetBlock","JSON","asset","getMidsceneRecorderEventDescription","generateRecorderMarkdownReplay","prompt","response","callAIWithStringResponse","Error","error","convertRecordLogIntoMarkdown","log","modelConfig"],"mappings":";;;;AAgBA,SAASA,+BAA+BC,QAAiB;IACvD,MAAMC,qBAAqBD,UAAU;IACrC,IAAI,CAACC,oBACH,OAAO;IAGT,OAAO,CAAC;;oDAE0C,EAAEA,mBAAmB;+EACM,CAAC;AAChF;AAEA,SAASC,2BAA2BC,OAAe;IACjD,MAAMC,UAAUD,QAAQ,IAAI;IAC5B,MAAME,cAAcD,QAAQ,KAAK,CAC/B;IAEF,OAAO,GAAIC,AAAAA,CAAAA,aAAa,CAAC,EAAE,IAAID,OAAM,EAAG,IAAI,GAAG,EAAE,CAAC;AACpD;AAEA,SAASE,oBAAoBC,KAAkC;IAC7D,IAAI,YAAYA,SAAS,aAAaA,OACpC,OAAOA;IAET,OAAOC,gBAAgBD;AACzB;AAEO,SAASE,mCACdC,KAAsC;IAEtCC,eAAeD,MAAM,MAAM;IAE3B,MAAM,EAAEE,OAAO,EAA
EC,gBAAgB,EAAE,GAAGC,iCAAiCJ;IACvE,MAAMK,aAAa,CAAC;;;;;;;;gCAQU,EAAEL,MAAM,MAAM,CAAC,UAAU,CAAC;;AAE1D,EAAEM,qCAAqCN,MAAM,MAAM,EAAE;;;;;;;;;;;;;;;;EAgBnD,EAAEA,MAAM,QAAQ,IAAIE,QAAQ,QAAQ,CAAC;;;;;;YAM3B,EAAEF,MAAM,MAAM,CAAC,UAAU,CAAC;gBACtB,EAAEE,QAAQ,QAAQ,IAAIF,MAAM,MAAM,CAAC,KAAK,IAAIA,MAAM,MAAM,CAAC,QAAQ,IAAI,kBAAkB;;;;;;;;;;;AAWvG,EAAEO,KAAK,SAAS,CACd;QACE,GAAGL,OAAO;QACV,QAAQF,MAAM,MAAM;QACpB,QAAQE,QAAQ,MAAM;QACtB,kBAAkBC,iBAAiB,GAAG,CAAC,CAACK,QAAW;gBACjD,YAAYA,MAAM,UAAU;gBAC5B,aAAaA,MAAM,WAAW;gBAC9B,WAAWA,MAAM,SAAS;gBAC1B,cAAcA,MAAM,YAAY;gBAChC,aAAaC,oCACXT,MAAM,MAAM,CAACQ,MAAM,UAAU,CAAC;YAElC;IACF,GACA,MACA,GACA;;;;;;8CAM4C,EAAEnB,+BAA+BW,MAAM,QAAQ,EAAE;;sFAET,CAAC;IAErF,MAAMP,UAAiB;QACrB;YACE,MAAM;YACN,MAAMY;QACR;KACD;IAED,KAAK,MAAMG,SAASL,iBAAkB;QACpCV,QAAQ,IAAI,CAAC;YACX,MAAM;YACN,MAAM,CAAC,4BAA4B,EAAEe,MAAM,UAAU,GAAG,EAAE,EAAE,EAAEA,MAAM,YAAY,EAAE;QACpF;QACAf,QAAQ,IAAI,CAAC;YACX,MAAM;YACN,WAAW;gBACT,KAAKe,MAAM,OAAO;YACpB;QACF;IACF;IAEA,OAAO;QACL;YACE,MAAM;YACN,SACE;QACJ;QACA;YACE,MAAM;YACNf;QACF;KACD;AACH;AAEO,eAAeiB,+BACpBV,KAAsC,EACtCH,KAAkC;IAElC,IAAI;QACF,MAAMc,SAASZ,mCAAmCC;QAClD,MAAMY,WAAW,MAAMC,yBACrBF,QACAf,oBAAoBC;QAGtB,IAAIe,UAAU,WAAW,AAA4B,YAA5B,OAAOA,SAAS,OAAO,EAC9C,OAAOpB,2BAA2BoB,SAAS,OAAO;QAGpD,MAAM,IAAIE,MAAM;IAClB,EAAE,OAAOC,OAAO;QACd,MAAM,IAAID,MAAM,CAAC,6CAA6C,EAAEC,OAAO;IACzE;AACF;AAEO,eAAeC,6BACpBC,GAAoC,EACpCC,WAAyB;IAEzB,OAAOR,+BAA+BO,KAAKC;AAC7C"}
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/markdown-generator.mjs","sources":["../../../../src/ai-model/prompt/markdown-generator.ts"],"sourcesContent":["import type { IModelConfig } from '@midscene/shared/env';\nimport {\n imageInfoOfBase64,\n parseBase64,\n resizeImgBase64,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport {\n type MidsceneRecorderMarkdownScreenshotAsset,\n getMidsceneRecorderEventDescription,\n stringifyMidsceneRecorderTargetBlock,\n} from '@midscene/shared/recorder';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { callAIWithStringResponse } from '../index';\nimport { type ModelRuntime, getModelRuntime } from '../models';\nimport {\n type RecorderGenerationInput,\n prepareRecorderGenerationContext,\n validateEvents,\n} from './recorder-generation-common';\n\nexport type RecorderMarkdownGenerationInput = RecorderGenerationInput;\n\nconst MARKDOWN_REPLAY_SCREENSHOT_PAYLOAD_BUDGET = 600_000;\nconst MARKDOWN_REPLAY_SCREENSHOT_MAX_EDGE = 768;\nconst debugMarkdownReplay = getDebug('ai:recorder-markdown', {\n console: true,\n});\n\nfunction limitScreenshotAssetsForMarkdownReplay(\n screenshotAssets: MidsceneRecorderMarkdownScreenshotAsset[],\n) {\n let usedPayload = 0;\n return screenshotAssets.filter((asset) => {\n const payloadSize = asset.dataUrl.length;\n if (\n payloadSize > MARKDOWN_REPLAY_SCREENSHOT_PAYLOAD_BUDGET ||\n usedPayload + payloadSize > MARKDOWN_REPLAY_SCREENSHOT_PAYLOAD_BUDGET\n ) {\n return false;\n }\n usedPayload += payloadSize;\n return true;\n });\n}\n\nasync function compressScreenshotAssetForMarkdownReplay(\n asset: MidsceneRecorderMarkdownScreenshotAsset,\n): Promise<MidsceneRecorderMarkdownScreenshotAsset> {\n const { width, height } = await imageInfoOfBase64(asset.dataUrl);\n const longestEdge = Math.max(width, height);\n if (longestEdge <= MARKDOWN_REPLAY_SCREENSHOT_MAX_EDGE) {\n return asset;\n }\n\n const scale = MARKDOWN_REPLAY_SCREENSHOT_MAX_EDGE 
/ longestEdge;\n const dataUrl = await resizeImgBase64(asset.dataUrl, {\n width: Math.max(1, Math.round(width * scale)),\n height: Math.max(1, Math.round(height * scale)),\n });\n const { body, mimeType } = parseBase64(dataUrl);\n return {\n ...asset,\n dataUrl,\n base64Data: body,\n mimeType,\n };\n}\n\nasync function prepareScreenshotAssetsForMarkdownReplay(\n screenshotAssets: MidsceneRecorderMarkdownScreenshotAsset[],\n) {\n const compressedAssets: MidsceneRecorderMarkdownScreenshotAsset[] = [];\n for (const asset of screenshotAssets) {\n try {\n compressedAssets.push(\n await compressScreenshotAssetForMarkdownReplay(asset),\n );\n } catch {\n compressedAssets.push(asset);\n }\n }\n return limitScreenshotAssetsForMarkdownReplay(compressedAssets);\n}\n\nfunction summarizeScreenshotAssets(\n screenshotAssets: MidsceneRecorderMarkdownScreenshotAsset[],\n) {\n const payloadSizes = screenshotAssets.map((asset) => asset.dataUrl.length);\n return {\n count: screenshotAssets.length,\n totalPayloadChars: payloadSizes.reduce((sum, size) => sum + size, 0),\n maxPayloadChars: payloadSizes.length ? 
Math.max(...payloadSizes) : 0,\n };\n}\n\nfunction getPromptShape(prompt: ChatCompletionMessageParam[]) {\n let textChars = 0;\n let imageCount = 0;\n for (const message of prompt) {\n const content = message.content;\n if (typeof content === 'string') {\n textChars += content.length;\n continue;\n }\n if (!Array.isArray(content)) {\n continue;\n }\n for (const part of content) {\n if (typeof part === 'object' && part && 'type' in part) {\n if (part.type === 'text' && 'text' in part) {\n textChars += String(part.text).length;\n }\n if (part.type === 'image_url') {\n imageCount += 1;\n }\n }\n }\n }\n return { textChars, imageCount };\n}\n\nfunction removeOmittedScreenshotPaths(\n summary: ReturnType<typeof prepareRecorderGenerationContext>['summary'],\n screenshotAssets: MidsceneRecorderMarkdownScreenshotAsset[],\n) {\n const includedScreenshotPaths = new Set(\n screenshotAssets.map((asset) => asset.relativePath),\n );\n return {\n ...summary,\n events: summary.events.map((event) =>\n event.screenshotPath && !includedScreenshotPaths.has(event.screenshotPath)\n ? { ...event, screenshotPath: undefined }\n : event,\n ),\n };\n}\n\nfunction getMarkdownLanguageInstruction(language?: string) {\n const normalizedLanguage = language?.trim();\n if (!normalizedLanguage) {\n return '';\n }\n\n return `\nLanguage requirement:\n- Write all human-readable Markdown instructions in ${normalizedLanguage}.\n- Keep file paths, URLs, platform ids, API names, and quoted UI text unchanged.`;\n}\n\nfunction normalizeGeneratedMarkdown(content: string) {\n const trimmed = content.trim();\n const fencedMatch = trimmed.match(\n /^```(?:md|markdown)?\\s*([\\s\\S]*?)\\s*```$/i,\n );\n return `${(fencedMatch?.[1] ?? 
trimmed).trim()}\\n`;\n}\n\nfunction resolveModelRuntime(model: IModelConfig | ModelRuntime): ModelRuntime {\n if ('config' in model && 'adapter' in model) {\n return model;\n }\n return getModelRuntime(model);\n}\n\nexport function createRecorderMarkdownReplayPrompt(\n input: RecorderMarkdownGenerationInput,\n): ChatCompletionMessageParam[] {\n validateEvents(input.events);\n\n const { summary: rawSummary, screenshotAssets: rawScreenshotAssets } =\n prepareRecorderGenerationContext(input);\n const screenshotAssets =\n limitScreenshotAssetsForMarkdownReplay(rawScreenshotAssets);\n const summary = removeOmittedScreenshotPaths(rawSummary, screenshotAssets);\n return createRecorderMarkdownReplayPromptFromContext(\n input,\n summary,\n screenshotAssets,\n );\n}\n\nasync function createRecorderMarkdownReplayPromptForGeneration(\n input: RecorderMarkdownGenerationInput,\n): Promise<ChatCompletionMessageParam[]> {\n validateEvents(input.events);\n\n const { summary: rawSummary, screenshotAssets: rawScreenshotAssets } =\n prepareRecorderGenerationContext(input);\n const screenshotAssets =\n await prepareScreenshotAssetsForMarkdownReplay(rawScreenshotAssets);\n const summary = removeOmittedScreenshotPaths(rawSummary, screenshotAssets);\n const prompt = createRecorderMarkdownReplayPromptFromContext(\n input,\n summary,\n screenshotAssets,\n );\n debugMarkdownReplay('markdown replay prompt shape %o', {\n eventCount: input.events.length,\n maxScreenshots: input.maxScreenshots,\n rawScreenshots: summarizeScreenshotAssets(rawScreenshotAssets),\n includedScreenshots: summarizeScreenshotAssets(screenshotAssets),\n prompt: getPromptShape(prompt),\n });\n return prompt;\n}\n\nfunction createRecorderMarkdownReplayPromptFromContext(\n input: RecorderMarkdownGenerationInput,\n summary: ReturnType<typeof prepareRecorderGenerationContext>['summary'],\n screenshotAssets: MidsceneRecorderMarkdownScreenshotAsset[],\n): ChatCompletionMessageParam[] {\n const screenshotIndexByEventHash = new 
Map(\n screenshotAssets.map((asset, index) => [\n asset.eventHashId,\n `screenshot-${index + 1}`,\n ]),\n );\n const events = summary.events.map((event) => {\n const screenshotRef = screenshotIndexByEventHash.get(event.hashId);\n const { screenshotPath, ...eventWithoutScreenshotPath } = event;\n return screenshotRef\n ? { ...eventWithoutScreenshotPath, screenshotRef }\n : eventWithoutScreenshotPath;\n });\n const promptPayload = {\n testName: input.testName || summary.testName,\n target: {\n platformId: input.target.platformId,\n label: input.target.label,\n values: input.target.values,\n },\n startUrl: summary.startUrl,\n events,\n screenshots: screenshotAssets.map((asset, index) => ({\n screenshotRef: `screenshot-${index + 1}`,\n eventIndex: asset.eventIndex,\n eventHashId: asset.eventHashId,\n eventType: asset.eventType,\n description: getMidsceneRecorderEventDescription(\n input.events[asset.eventIndex],\n ),\n })),\n };\n const promptText = `Generate a Markdown replay script for Midscene Agent. It will be executed with:\nawait agent.aiAct(markdownReplayPrompt)\n\nUse only the recorder data and screenshots below.\n\nTarget block:\n${stringifyMidsceneRecorderTargetBlock(input.target)}\n\nReplay goal:\n- Reproduce the recorded user workflow exactly.\n- Preserve event order.\n- Preserve the user's original intent.\n- Do not invent alternative navigation paths.\n- Do not skip, merge, reorder, or add extra user actions.\n- Prefer recorded UI text, element descriptions, URLs, input values, and scroll direction.\n- For input events, enter event.typedText/event.value exactly; do not infer or correct the text from screenshots.\n- Prefer event.semantic.replayInstruction and event.semantic.elementDescription when event.semantic.source is \"aiDescribe\" or \"recorderAI\" and event.semantic.status is \"ready\".\n- For scroll events, preserve the recorded scroll region from event.semantic.elementDescription/replayInstruction. 
If the scroll happened in a specific panel, list, table, dialog body, menu, navigation area, or content pane, keep that region in the Markdown step instead of generalizing it to the whole page.\n- If event.semantic.source is \"heuristic\" or event.semantic.status is \"pending\"/\"failed\", use the screenshot/context to write the best visual instruction.\n- Coordinates are only fallback hints. Do not make coordinates the primary instruction when text or screenshots are available.\n- For a click/tap that only focuses a field before an input event, describe the target as the field/control itself. Do not target a placeholder character, typed character, caret, or inner text fragment inside the field.\n- If a target cannot be found, stop and report the missing step. Do not click similar-looking elements.\n- Screenshots are only generation-time visual evidence for you. The generated Markdown will be passed directly to agent.aiAct(markdownReplayPrompt), which accepts text only and cannot receive attached images.\n- Convert any useful screenshot evidence into textual replay instructions. Do not include screenshots, image syntax, image paths, or reference-image names in the generated Markdown.\n- Never write Markdown image syntax such as , reference-style images, HTML <img> tags, ./screenshots/... paths, or screenshot-* names in the output.\n\nRequired structure:\n# ${input.testName || summary.testName}\n\n## Goal\nReproduce the recorded user workflow exactly.\n\n## Target\n- Platform: ${input.target.platformId}\n- Start target: ${summary.startUrl || input.target.label || input.target.deviceId || 'Recorded target'}\n\n## Steps\n1. ...\n\nRecorder data:\n${JSON.stringify(promptPayload, null, 2)}${getMarkdownLanguageInstruction(input.language)}\n\nImportant: Return ONLY raw Markdown. 
Do NOT wrap the response in markdown code blocks.`;\n\n const content: any[] = [\n {\n type: 'text',\n text: promptText,\n },\n ];\n\n for (const asset of screenshotAssets) {\n const screenshotRef = screenshotIndexByEventHash.get(asset.eventHashId);\n content.push({\n type: 'text',\n text: `${screenshotRef} for event #${asset.eventIndex + 1}`,\n });\n content.push({\n type: 'image_url',\n image_url: {\n url: asset.dataUrl,\n },\n });\n }\n\n return [\n {\n role: 'system',\n content:\n 'You generate precise Markdown replay scripts for Midscene agent.aiAct. The final output is plain text that will be passed directly to agent.aiAct, so it must be deterministic, ordered, safe for AI execution, and must not contain image references, screenshot paths, or screenshot labels.',\n },\n {\n role: 'user',\n content,\n },\n ];\n}\n\nexport async function generateRecorderMarkdownReplay(\n input: RecorderMarkdownGenerationInput,\n model: IModelConfig | ModelRuntime,\n): Promise<string> {\n try {\n const prompt = await createRecorderMarkdownReplayPromptForGeneration(input);\n const response = await callAIWithStringResponse(\n prompt,\n resolveModelRuntime(model),\n );\n\n if (response?.content && typeof response.content === 'string') {\n return normalizeGeneratedMarkdown(response.content);\n }\n\n throw new Error('Failed to generate recorder Markdown replay');\n } catch (error) {\n throw new Error(`Failed to generate recorder Markdown replay: ${error}`);\n }\n}\n\nexport async function convertRecordLogIntoMarkdown(\n log: RecorderMarkdownGenerationInput,\n modelConfig: IModelConfig,\n): Promise<string> {\n return generateRecorderMarkdownReplay(log, 
modelConfig);\n}\n"],"names":["MARKDOWN_REPLAY_SCREENSHOT_PAYLOAD_BUDGET","MARKDOWN_REPLAY_SCREENSHOT_MAX_EDGE","debugMarkdownReplay","getDebug","limitScreenshotAssetsForMarkdownReplay","screenshotAssets","usedPayload","asset","payloadSize","compressScreenshotAssetForMarkdownReplay","width","height","imageInfoOfBase64","longestEdge","Math","scale","dataUrl","resizeImgBase64","body","mimeType","parseBase64","prepareScreenshotAssetsForMarkdownReplay","compressedAssets","summarizeScreenshotAssets","payloadSizes","sum","size","getPromptShape","prompt","textChars","imageCount","message","content","Array","part","String","removeOmittedScreenshotPaths","summary","includedScreenshotPaths","Set","event","undefined","getMarkdownLanguageInstruction","language","normalizedLanguage","normalizeGeneratedMarkdown","trimmed","fencedMatch","resolveModelRuntime","model","getModelRuntime","createRecorderMarkdownReplayPrompt","input","validateEvents","rawSummary","rawScreenshotAssets","prepareRecorderGenerationContext","createRecorderMarkdownReplayPromptFromContext","createRecorderMarkdownReplayPromptForGeneration","screenshotIndexByEventHash","Map","index","events","screenshotRef","screenshotPath","eventWithoutScreenshotPath","promptPayload","getMidsceneRecorderEventDescription","promptText","stringifyMidsceneRecorderTargetBlock","JSON","generateRecorderMarkdownReplay","response","callAIWithStringResponse","Error","error","convertRecordLogIntoMarkdown","log","modelConfig"],"mappings":";;;;;;AAuBA,MAAMA,4CAA4C;AAClD,MAAMC,sCAAsC;AAC5C,MAAMC,sBAAsBC,SAAS,wBAAwB;IAC3D,SAAS;AACX;AAEA,SAASC,uCACPC,gBAA2D;IAE3D,IAAIC,cAAc;IAClB,OAAOD,iBAAiB,MAAM,CAAC,CAACE;QAC9B,MAAMC,cAAcD,MAAM,OAAO,CAAC,MAAM;QACxC,IACEC,cAAcR,6CACdM,cAAcE,cAAcR,2CAE5B,OAAO;QAETM,eAAeE;QACf,OAAO;IACT;AACF;AAEA,eAAeC,yCACbF,KAA8C;IAE9C,MAAM,EAAEG,KAAK,EAAEC,MAAM,EAAE,GAAG,MAAMC,kBAAkBL,MAAM,OAAO;IAC/D,MAAMM,cAAcC,KAAK,GAAG,CAACJ,OAAOC;IACpC,IAAIE,eAAeZ,qCACjB,OAAOM;IAGT,MAAMQ,QAAQd,sCAAsCY;IACpD,MAAMG,UAAU,MAAMC,gBAAgBV,MAAM
,OAAO,EAAE;QACnD,OAAOO,KAAK,GAAG,CAAC,GAAGA,KAAK,KAAK,CAACJ,QAAQK;QACtC,QAAQD,KAAK,GAAG,CAAC,GAAGA,KAAK,KAAK,CAACH,SAASI;IAC1C;IACA,MAAM,EAAEG,IAAI,EAAEC,QAAQ,EAAE,GAAGC,YAAYJ;IACvC,OAAO;QACL,GAAGT,KAAK;QACRS;QACA,YAAYE;QACZC;IACF;AACF;AAEA,eAAeE,yCACbhB,gBAA2D;IAE3D,MAAMiB,mBAA8D,EAAE;IACtE,KAAK,MAAMf,SAASF,iBAClB,IAAI;QACFiB,iBAAiB,IAAI,CACnB,MAAMb,yCAAyCF;IAEnD,EAAE,OAAM;QACNe,iBAAiB,IAAI,CAACf;IACxB;IAEF,OAAOH,uCAAuCkB;AAChD;AAEA,SAASC,0BACPlB,gBAA2D;IAE3D,MAAMmB,eAAenB,iBAAiB,GAAG,CAAC,CAACE,QAAUA,MAAM,OAAO,CAAC,MAAM;IACzE,OAAO;QACL,OAAOF,iBAAiB,MAAM;QAC9B,mBAAmBmB,aAAa,MAAM,CAAC,CAACC,KAAKC,OAASD,MAAMC,MAAM;QAClE,iBAAiBF,aAAa,MAAM,GAAGV,KAAK,GAAG,IAAIU,gBAAgB;IACrE;AACF;AAEA,SAASG,eAAeC,MAAoC;IAC1D,IAAIC,YAAY;IAChB,IAAIC,aAAa;IACjB,KAAK,MAAMC,WAAWH,OAAQ;QAC5B,MAAMI,UAAUD,QAAQ,OAAO;QAC/B,IAAI,AAAmB,YAAnB,OAAOC,SAAsB;YAC/BH,aAAaG,QAAQ,MAAM;YAC3B;QACF;QACA,IAAKC,MAAM,OAAO,CAACD,UAGnB;YAAA,KAAK,MAAME,QAAQF,QACjB,IAAI,AAAgB,YAAhB,OAAOE,QAAqBA,QAAQ,UAAUA,MAAM;gBACtD,IAAIA,AAAc,WAAdA,KAAK,IAAI,IAAe,UAAUA,MACpCL,aAAaM,OAAOD,KAAK,IAAI,EAAE,MAAM;gBAEvC,IAAIA,AAAc,gBAAdA,KAAK,IAAI,EACXJ,cAAc;YAElB;QACF;IACF;IACA,OAAO;QAAED;QAAWC;IAAW;AACjC;AAEA,SAASM,6BACPC,OAAuE,EACvEhC,gBAA2D;IAE3D,MAAMiC,0BAA0B,IAAIC,IAClClC,iBAAiB,GAAG,CAAC,CAACE,QAAUA,MAAM,YAAY;IAEpD,OAAO;QACL,GAAG8B,OAAO;QACV,QAAQA,QAAQ,MAAM,CAAC,GAAG,CAAC,CAACG,QAC1BA,MAAM,cAAc,IAAI,CAACF,wBAAwB,GAAG,CAACE,MAAM,cAAc,IACrE;gBAAE,GAAGA,KAAK;gBAAE,gBAAgBC;YAAU,IACtCD;IAER;AACF;AAEA,SAASE,+BAA+BC,QAAiB;IACvD,MAAMC,qBAAqBD,UAAU;IACrC,IAAI,CAACC,oBACH,OAAO;IAGT,OAAO,CAAC;;oDAE0C,EAAEA,mBAAmB;+EACM,CAAC;AAChF;AAEA,SAASC,2BAA2Bb,OAAe;IACjD,MAAMc,UAAUd,QAAQ,IAAI;IAC5B,MAAMe,cAAcD,QAAQ,KAAK,CAC/B;IAEF,OAAO,GAAIC,AAAAA,CAAAA,aAAa,CAAC,EAAE,IAAID,OAAM,EAAG,IAAI,GAAG,EAAE,CAAC;AACpD;AAEA,SAASE,oBAAoBC,KAAkC;IAC7D,IAAI,YAAYA,SAAS,aAAaA,OACpC,OAAOA;IAET,OAAOC,gBAAgBD;AACzB;AAEO,SAASE,mCACdC,KAAsC;IAEtCC,eAAeD,MAAM,MAAM;IAE3B,MAAM,EAAE,SAASE,UAAU,EAAE,kBAAkBC,mBAAmB,EAAE,GAClEC,iCAAiCJ;IACnC,MAAM/C,mBACJD,uCAAuCmD;IACzC,MAAMlB,UA
AUD,6BAA6BkB,YAAYjD;IACzD,OAAOoD,8CACLL,OACAf,SACAhC;AAEJ;AAEA,eAAeqD,gDACbN,KAAsC;IAEtCC,eAAeD,MAAM,MAAM;IAE3B,MAAM,EAAE,SAASE,UAAU,EAAE,kBAAkBC,mBAAmB,EAAE,GAClEC,iCAAiCJ;IACnC,MAAM/C,mBACJ,MAAMgB,yCAAyCkC;IACjD,MAAMlB,UAAUD,6BAA6BkB,YAAYjD;IACzD,MAAMuB,SAAS6B,8CACbL,OACAf,SACAhC;IAEFH,oBAAoB,mCAAmC;QACrD,YAAYkD,MAAM,MAAM,CAAC,MAAM;QAC/B,gBAAgBA,MAAM,cAAc;QACpC,gBAAgB7B,0BAA0BgC;QAC1C,qBAAqBhC,0BAA0BlB;QAC/C,QAAQsB,eAAeC;IACzB;IACA,OAAOA;AACT;AAEA,SAAS6B,8CACPL,KAAsC,EACtCf,OAAuE,EACvEhC,gBAA2D;IAE3D,MAAMsD,6BAA6B,IAAIC,IACrCvD,iBAAiB,GAAG,CAAC,CAACE,OAAOsD,QAAU;YACrCtD,MAAM,WAAW;YACjB,CAAC,WAAW,EAAEsD,QAAQ,GAAG;SAC1B;IAEH,MAAMC,SAASzB,QAAQ,MAAM,CAAC,GAAG,CAAC,CAACG;QACjC,MAAMuB,gBAAgBJ,2BAA2B,GAAG,CAACnB,MAAM,MAAM;QACjE,MAAM,EAAEwB,cAAc,EAAE,GAAGC,4BAA4B,GAAGzB;QAC1D,OAAOuB,gBACH;YAAE,GAAGE,0BAA0B;YAAEF;QAAc,IAC/CE;IACN;IACA,MAAMC,gBAAgB;QACpB,UAAUd,MAAM,QAAQ,IAAIf,QAAQ,QAAQ;QAC5C,QAAQ;YACN,YAAYe,MAAM,MAAM,CAAC,UAAU;YACnC,OAAOA,MAAM,MAAM,CAAC,KAAK;YACzB,QAAQA,MAAM,MAAM,CAAC,MAAM;QAC7B;QACA,UAAUf,QAAQ,QAAQ;QAC1ByB;QACA,aAAazD,iBAAiB,GAAG,CAAC,CAACE,OAAOsD,QAAW;gBACnD,eAAe,CAAC,WAAW,EAAEA,QAAQ,GAAG;gBACxC,YAAYtD,MAAM,UAAU;gBAC5B,aAAaA,MAAM,WAAW;gBAC9B,WAAWA,MAAM,SAAS;gBAC1B,aAAa4D,oCACXf,MAAM,MAAM,CAAC7C,MAAM,UAAU,CAAC;YAElC;IACF;IACA,MAAM6D,aAAa,CAAC;;;;;;AAMtB,EAAEC,qCAAqCjB,MAAM,MAAM,EAAE;;;;;;;;;;;;;;;;;;;;;EAqBnD,EAAEA,MAAM,QAAQ,IAAIf,QAAQ,QAAQ,CAAC;;;;;;YAM3B,EAAEe,MAAM,MAAM,CAAC,UAAU,CAAC;gBACtB,EAAEf,QAAQ,QAAQ,IAAIe,MAAM,MAAM,CAAC,KAAK,IAAIA,MAAM,MAAM,CAAC,QAAQ,IAAI,kBAAkB;;;;;;AAMvG,EAAEkB,KAAK,SAAS,CAACJ,eAAe,MAAM,KAAKxB,+BAA+BU,MAAM,QAAQ,EAAE;;sFAEJ,CAAC;IAErF,MAAMpB,UAAiB;QACrB;YACE,MAAM;YACN,MAAMoC;QACR;KACD;IAED,KAAK,MAAM7D,SAASF,iBAAkB;QACpC,MAAM0D,gBAAgBJ,2BAA2B,GAAG,CAACpD,MAAM,WAAW;QACtEyB,QAAQ,IAAI,CAAC;YACX,MAAM;YACN,MAAM,GAAG+B,cAAc,YAAY,EAAExD,MAAM,UAAU,GAAG,GAAG;QAC7D;QACAyB,QAAQ,IAAI,CAAC;YACX,MAAM;YACN,WAAW;gBACT,KAAKzB,MAAM,OAAO;YACpB;QACF;IACF;IAEA,OAAO;QACL;YACE,MAAM;YACN,SACE;QACJ;QACA;YACE,MAAM;YACNyB;QACF;KACD;AACH;AAEO,eAAeuC,+BACpBn
B,KAAsC,EACtCH,KAAkC;IAElC,IAAI;QACF,MAAMrB,SAAS,MAAM8B,gDAAgDN;QACrE,MAAMoB,WAAW,MAAMC,yBACrB7C,QACAoB,oBAAoBC;QAGtB,IAAIuB,UAAU,WAAW,AAA4B,YAA5B,OAAOA,SAAS,OAAO,EAC9C,OAAO3B,2BAA2B2B,SAAS,OAAO;QAGpD,MAAM,IAAIE,MAAM;IAClB,EAAE,OAAOC,OAAO;QACd,MAAM,IAAID,MAAM,CAAC,6CAA6C,EAAEC,OAAO;IACzE;AACF;AAEO,eAAeC,6BACpBC,GAAoC,EACpCC,WAAyB;IAEzB,OAAOP,+BAA+BM,KAAKC;AAC7C"}
|
|
@@ -1,7 +1,42 @@
|
|
|
1
|
-
import { DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS, createMidsceneRecorderMarkdownScreenshotAssets, getMidsceneRecorderEventDescription } from "@midscene/shared/recorder";
|
|
1
|
+
import { DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS, createMidsceneRecorderMarkdownScreenshotAssets, getMidsceneRecorderEventDescription, getMidsceneRecorderSemantic } from "@midscene/shared/recorder";
|
|
2
|
+
// Length cap (measured with String.length) that compactRecorderSemanticText passes to
// truncateRecorderGenerationText for semantic text fields.
const MAX_RECORDER_GENERATION_SEMANTIC_TEXT_LENGTH = 1200;
|
|
3
|
+
// Length cap (measured with String.length) that compactRecorderSemanticError passes to
// truncateRecorderGenerationText for the semantic `error` field.
const MAX_RECORDER_GENERATION_SEMANTIC_ERROR_LENGTH = 400;
|
|
2
4
|
/**
 * Filters the "analysis still running" placeholder out of a semantic field.
 *
 * @param {string | null | undefined} value - Raw field value.
 * @returns {string | null | undefined} `undefined` when the trimmed value is exactly
 *   the placeholder text; otherwise the value exactly as received (not trimmed).
 */
function cleanRecorderSemanticField(value) {
    const isPlaceholder = value?.trim() === 'AI is analyzing element...';
    if (isPlaceholder) {
        return undefined;
    }
    return value;
}
|
|
7
|
+
/**
 * Shortens `value` to at most `maxLength` UTF-16 code units and appends a marker
 * stating how many code units were dropped.
 *
 * Values that already fit are returned unchanged. The cut never lands between the
 * two halves of a surrogate pair (e.g. an emoji): when it would, the cut moves one
 * unit earlier so the kept text contains no lone surrogate. A negative `maxLength`
 * is treated as 0 instead of being read by `slice` as an offset from the end.
 *
 * @param {string} value - Text to shorten.
 * @param {number} maxLength - Maximum number of UTF-16 code units to keep.
 * @returns {string} `value` itself, or the kept prefix followed by
 *   `... [truncated N chars]`.
 */
function truncateRecorderGenerationText(value, maxLength) {
    if (value.length <= maxLength) {
        return value;
    }
    let cut = Math.max(0, maxLength);
    // A high surrogate as the last kept unit followed by a low surrogate means the
    // cut would split one character in two; keep neither half.
    const lastKept = value.charCodeAt(cut - 1);
    const firstDropped = value.charCodeAt(cut);
    const endsOnHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff;
    const continuesWithLowSurrogate = firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
    if (endsOnHighSurrogate && continuesWithLowSurrogate) {
        cut -= 1;
    }
    return `${value.slice(0, cut)}... [truncated ${value.length - cut} chars]`;
}
|
|
11
|
+
/**
 * Prepares one semantic text field for the generation prompt: the field is first
 * passed through cleanRecorderSemanticField, and whatever remains is capped at
 * MAX_RECORDER_GENERATION_SEMANTIC_TEXT_LENGTH.
 *
 * @param {string | undefined} value - Raw semantic text.
 * @returns {string | undefined} The capped text, or `undefined` when the cleaned
 *   value is falsy (missing, empty, or the placeholder).
 */
function compactRecorderSemanticText(value) {
    const meaningfulText = cleanRecorderSemanticField(value);
    if (!meaningfulText) {
        return undefined;
    }
    return truncateRecorderGenerationText(meaningfulText, MAX_RECORDER_GENERATION_SEMANTIC_TEXT_LENGTH);
}
|
|
15
|
+
/**
 * Caps a semantic error message at MAX_RECORDER_GENERATION_SEMANTIC_ERROR_LENGTH.
 *
 * @param {string | undefined} value - Raw error text.
 * @returns {string | undefined} The capped text, or `undefined` when `value` is falsy.
 */
function compactRecorderSemanticError(value) {
    if (!value) {
        return undefined;
    }
    return truncateRecorderGenerationText(value, MAX_RECORDER_GENERATION_SEMANTIC_ERROR_LENGTH);
}
|
|
18
|
+
/**
 * Builds a reduced copy of a recorder semantic record for use in generation prompts.
 *
 * `source`, `status` and `confidence` are copied as-is; the text fields go through
 * compactRecorderSemanticText and `error` through compactRecorderSemanticError.
 * When `aiDescribe` is truthy, only its six verification fields are copied over;
 * otherwise the result has no `aiDescribe` key at all.
 *
 * @param {object | undefined} semantic - Semantic record of a recorder event.
 * @returns {object | undefined} The reduced record, or `undefined` when `semantic`
 *   is falsy.
 */
function compactRecorderSemanticForGeneration(semantic) {
    if (!semantic) {
        return undefined;
    }
    const {
        source,
        status,
        confidence,
        elementDescription,
        replayInstruction,
        actionSummary,
        error,
        aiDescribe
    } = semantic;
    const compacted = {
        source,
        status,
        confidence,
        elementDescription: compactRecorderSemanticText(elementDescription),
        replayInstruction: compactRecorderSemanticText(replayInstruction),
        actionSummary: compactRecorderSemanticText(actionSummary),
        error: compactRecorderSemanticError(error)
    };
    if (aiDescribe) {
        const {
            verifyPrompt,
            verifyPassed,
            deepLocate,
            centerDistance,
            expectedCenter,
            actualCenter
        } = aiDescribe;
        // Added last so the key order matches the other fields followed by aiDescribe.
        compacted.aiDescribe = {
            verifyPrompt,
            verifyPassed,
            deepLocate,
            centerDistance,
            expectedCenter,
            actualCenter
        };
    }
    return compacted;
}
|
|
5
40
|
/**
 * Guards generation entry points against an empty recording.
 *
 * @param {Array<object> | null | undefined} events - Recorded events.
 * @returns {void}
 * @throws {Error} 'No events provided for test generation' when `events` is empty
 *   or missing.
 */
const validateEvents = (events) => {
    // Optional chaining makes a null/undefined list fail with the same descriptive
    // error as an empty one, instead of a TypeError on `.length`.
    if (!events?.length) {
        throw new Error('No events provided for test generation');
    }
};
|
|
@@ -22,35 +57,60 @@ const createEventCounts = (filteredEvents, totalEvents)=>({
|
|
|
22
57
|
scroll: filteredEvents.scrollEvents.length,
|
|
23
58
|
total: totalEvents
|
|
24
59
|
});
|
|
25
|
-
const extractInputDescriptions = (inputEvents)=>inputEvents.map((event)=>
|
|
26
|
-
|
|
60
|
+
/**
 * Collects `{ description, value }` pairs for input events.
 *
 * The description is the event's semantic `elementDescription` after
 * cleanRecorderSemanticField; both fields fall back to `''`. Pairs in which either
 * field is falsy are dropped.
 *
 * @param {Array<object>} inputEvents - Events to describe.
 * @returns {Array<{description: string, value: string}>} Pairs with both fields set.
 */
const extractInputDescriptions = (inputEvents) => {
    const toDescriptionPair = (event) => {
        const elementDescription = getMidsceneRecorderSemantic(event)?.elementDescription;
        const description = cleanRecorderSemanticField(elementDescription) || '';
        const value = event.value || '';
        return { description, value };
    };
    const hasBothFields = (pair) => pair.description && pair.value;
    const candidatePairs = inputEvents.map((event) => toDescriptionPair(event));
    return candidatePairs.filter((pair) => hasBothFields(pair));
};
|
|
67
|
+
/**
 * Converts recorder events into the per-event records placed in generation prompts.
 *
 * Every record carries the event's identifying fields, its compacted semantic data,
 * its description, and the descriptions of the directly adjacent events. Records
 * for `input` events additionally carry typed text, a 1-based input counter, and
 * information about the nearest earlier/later input events.
 *
 * The nearest earlier and later input for each position are computed up front in
 * two linear passes, rather than by copying and scanning the array for every event.
 *
 * @param {Array<object>} events - Recorded events, in order.
 * @param {Map<string, string>} [screenshotPathByEventHash] - Screenshot path keyed
 *   by event `hashId`; events without an entry get `screenshotPath: undefined`.
 * @returns {Array<object>} One record per event, in the same order.
 */
const processEventsForLLM = (events, screenshotPathByEventHash = new Map()) => {
    const isInputEvent = (candidate) => 'input' === candidate.type;
    const eventCount = events.length;

    // previousInputByIndex[i]: closest input event strictly before position i.
    const previousInputByIndex = new Array(eventCount);
    let latestInput;
    for (let position = 0; position < eventCount; position += 1) {
        previousInputByIndex[position] = latestInput;
        if (isInputEvent(events[position])) {
            latestInput = events[position];
        }
    }

    // nextInputByIndex[i]: closest input event strictly after position i.
    const nextInputByIndex = new Array(eventCount);
    let upcomingInput;
    for (let position = eventCount - 1; position >= 0; position -= 1) {
        nextInputByIndex[position] = upcomingInput;
        if (isInputEvent(events[position])) {
            upcomingInput = events[position];
        }
    }

    let inputIndex = 0;
    return events.map((event, index) => {
        const previousEvent = events[index - 1];
        const nextEvent = events[index + 1];
        const previousInput = previousInputByIndex[index];
        const nextInput = nextInputByIndex[index];
        const isInput = isInputEvent(event);
        const inputSequenceIndex = isInput ? ++inputIndex : undefined;
        // Only non-empty neighbor values are kept.
        const neighborInputValues = isInput
            ? [previousInput?.value, nextInput?.value].filter((value) => Boolean(value))
            : undefined;
        const semantic = compactRecorderSemanticForGeneration(getMidsceneRecorderSemantic(event));
        return {
            hashId: event.hashId,
            type: event.type,
            timestamp: event.timestamp,
            source: event.source,
            actionType: event.actionType,
            url: event.url,
            title: event.title,
            semantic,
            description: getMidsceneRecorderEventDescription(event),
            value: event.value,
            previousActionDescription: previousEvent ? getMidsceneRecorderEventDescription(previousEvent) : undefined,
            nextActionDescription: nextEvent ? getMidsceneRecorderEventDescription(nextEvent) : undefined,
            // Input-only keys are absent (not undefined) on other event types.
            ...(isInput
                ? {
                    typedText: event.value || '',
                    inputIndex: inputSequenceIndex,
                    isSequentialInput: previousEvent?.type === 'input' || nextEvent?.type === 'input',
                    hasNeighborInput: Boolean(previousInput || nextInput),
                    previousInputDescription: previousInput ? getMidsceneRecorderEventDescription(previousInput) : undefined,
                    neighborInputValues: neighborInputValues.length > 0 ? neighborInputValues : undefined
                }
                : {}),
            pageInfo: event.pageInfo,
            elementRect: event.elementRect,
            screenshotPath: screenshotPathByEventHash.get(event.hashId)
        };
    });
};
|
|
49
109
|
const prepareEventSummary = (events, options = {})=>{
|
|
50
110
|
const filteredEvents = filterEventsByType(events);
|
|
51
111
|
const eventCounts = createEventCounts(filteredEvents, events.length);
|
|
52
112
|
const startUrl = filteredEvents.navigationEvents.length > 0 ? filteredEvents.navigationEvents[0].url || '' : '';
|
|
53
|
-
const clickDescriptions = filteredEvents.clickEvents.map((event)=>event
|
|
113
|
+
const clickDescriptions = filteredEvents.clickEvents.map((event)=>getMidsceneRecorderSemantic(event)?.elementDescription).filter((desc)=>Boolean(desc)).slice(0, 10);
|
|
54
114
|
const inputDescriptions = extractInputDescriptions(filteredEvents.inputEvents).slice(0, 10);
|
|
55
115
|
const urls = filteredEvents.navigationEvents.map((e)=>e.url).filter((url)=>Boolean(url)).slice(0, 5);
|
|
56
116
|
const processedEvents = processEventsForLLM(events, options.screenshotPathByEventHash);
|
|
@@ -106,6 +166,6 @@ const createMessageContent = (promptText, screenshots = [], includeScreenshots =
|
|
|
106
166
|
}
|
|
107
167
|
return messageContent;
|
|
108
168
|
};
|
|
109
|
-
export { createEventCounts, createMessageContent, extractInputDescriptions, filterEventsByType, getScreenshotsForLLM, prepareEventSummary, prepareRecorderGenerationContext, processEventsForLLM, validateEvents };
|
|
169
|
+
export { compactRecorderSemanticForGeneration, createEventCounts, createMessageContent, extractInputDescriptions, filterEventsByType, getScreenshotsForLLM, prepareEventSummary, prepareRecorderGenerationContext, processEventsForLLM, validateEvents };
|
|
110
170
|
|
|
111
171
|
//# sourceMappingURL=recorder-generation-common.mjs.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/recorder-generation-common.mjs","sources":["../../../../src/ai-model/prompt/recorder-generation-common.ts"],"sourcesContent":["import {\n DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS,\n type MidsceneRecorderEvent,\n type MidsceneRecorderMarkdownScreenshotAsset,\n type MidsceneRecorderTarget,\n createMidsceneRecorderMarkdownScreenshotAssets,\n getMidsceneRecorderEventDescription,\n} from '@midscene/shared/recorder';\n\nexport interface EventCounts {\n navigation: number;\n click: number;\n input: number;\n scroll: number;\n total: number;\n}\n\nexport interface InputDescription {\n description: string;\n value: string;\n}\n\nexport interface ProcessedEvent {\n hashId: string;\n type: string;\n timestamp: number;\n source?: string;\n actionType?: string;\n descriptionSource?: string;\n descriptionError?: string;\n url?: string;\n title?: string;\n elementDescription?: string;\n replayInstruction?: string;\n actionSummary?: string;\n semanticConfidence?: string;\n description?: string;\n value?: string;\n pageInfo?: any;\n elementRect?: any;\n screenshotPath?: string;\n}\n\nexport interface EventSummary {\n testName: string;\n startUrl: string;\n eventCounts: EventCounts;\n urls: string[];\n clickDescriptions: string[];\n inputDescriptions: InputDescription[];\n events: ProcessedEvent[];\n}\n\nexport interface RecorderGenerationContext {\n summary: EventSummary;\n screenshotAssets: MidsceneRecorderMarkdownScreenshotAsset[];\n}\n\nexport type ChromeRecordedEvent = MidsceneRecorderEvent;\n\nexport interface RecorderGenerationOptions {\n testName?: string;\n includeTimestamps?: boolean;\n maxScreenshots?: number;\n description?: string;\n /** Language for human-readable generated content (e.g. 'English', 'Chinese'). Keys and API names are kept as-is. 
*/\n language?: string;\n navigationInfo?: {\n urls?: string[];\n titles?: string[];\n initialViewport?: {\n width?: number;\n height?: number;\n };\n };\n}\n\nexport interface RecorderGenerationInput extends RecorderGenerationOptions {\n target: MidsceneRecorderTarget;\n events: MidsceneRecorderEvent[];\n}\n\nexport interface FilteredEvents {\n navigationEvents: ChromeRecordedEvent[];\n clickEvents: ChromeRecordedEvent[];\n inputEvents: ChromeRecordedEvent[];\n scrollEvents: ChromeRecordedEvent[];\n}\n\nfunction cleanRecorderSemanticField(value?: string) {\n return value?.trim() === 'AI is analyzing element...' ? undefined : value;\n}\n\nexport const validateEvents = (events: ChromeRecordedEvent[]): void => {\n if (!events.length) {\n throw new Error('No events provided for test generation');\n }\n};\n\nexport const getScreenshotsForLLM = (\n events: ChromeRecordedEvent[],\n maxScreenshots = 1,\n): string[] => {\n return createMidsceneRecorderMarkdownScreenshotAssets(events, {\n baseDir: './screenshots',\n maxScreenshots,\n }).map((asset) => asset.dataUrl);\n};\n\nexport const filterEventsByType = (\n events: ChromeRecordedEvent[],\n): FilteredEvents => {\n return {\n navigationEvents: events.filter((event) => event.type === 'navigation'),\n clickEvents: events.filter((event) => event.type === 'click'),\n inputEvents: events.filter((event) => event.type === 'input'),\n scrollEvents: events.filter((event) => event.type === 'scroll'),\n };\n};\n\nexport const createEventCounts = (\n filteredEvents: FilteredEvents,\n totalEvents: number,\n): EventCounts => {\n return {\n navigation: filteredEvents.navigationEvents.length,\n click: filteredEvents.clickEvents.length,\n input: filteredEvents.inputEvents.length,\n scroll: filteredEvents.scrollEvents.length,\n total: totalEvents,\n };\n};\n\nexport const extractInputDescriptions = (\n inputEvents: ChromeRecordedEvent[],\n): InputDescription[] => {\n return inputEvents\n .map((event) => ({\n description: 
cleanRecorderSemanticField(event.elementDescription) || '',\n value: event.value || '',\n }))\n .filter((item) => item.description && item.value);\n};\n\nexport const processEventsForLLM = (\n events: ChromeRecordedEvent[],\n screenshotPathByEventHash: Map<string, string> = new Map(),\n): ProcessedEvent[] => {\n return events.map((event) => ({\n hashId: event.hashId,\n type: event.type,\n timestamp: event.timestamp,\n source: event.source,\n actionType: event.actionType,\n descriptionSource: event.descriptionSource,\n descriptionError: event.descriptionError,\n url: event.url,\n title: event.title,\n elementDescription: cleanRecorderSemanticField(event.elementDescription),\n replayInstruction: cleanRecorderSemanticField(event.replayInstruction),\n actionSummary: cleanRecorderSemanticField(event.actionSummary),\n semanticConfidence: event.semanticConfidence,\n description: getMidsceneRecorderEventDescription(event),\n value: event.value,\n pageInfo: event.pageInfo,\n elementRect: event.elementRect,\n screenshotPath: screenshotPathByEventHash.get(event.hashId),\n }));\n};\n\nexport const prepareEventSummary = (\n events: ChromeRecordedEvent[],\n options: {\n testName?: string;\n maxScreenshots?: number;\n screenshotPathByEventHash?: Map<string, string>;\n } = {},\n): EventSummary => {\n const filteredEvents = filterEventsByType(events);\n const eventCounts = createEventCounts(filteredEvents, events.length);\n\n const startUrl =\n filteredEvents.navigationEvents.length > 0\n ? 
filteredEvents.navigationEvents[0].url || ''\n : '';\n\n const clickDescriptions = filteredEvents.clickEvents\n .map((event) => event.elementDescription)\n .filter((desc): desc is string => Boolean(desc))\n .slice(0, 10);\n\n const inputDescriptions = extractInputDescriptions(\n filteredEvents.inputEvents,\n ).slice(0, 10);\n\n const urls = filteredEvents.navigationEvents\n .map((e) => e.url)\n .filter((url): url is string => Boolean(url))\n .slice(0, 5);\n\n const processedEvents = processEventsForLLM(\n events,\n options.screenshotPathByEventHash,\n );\n\n return {\n testName: options.testName || 'Automated test from recorded events',\n startUrl,\n eventCounts,\n urls,\n clickDescriptions,\n inputDescriptions,\n events: processedEvents,\n };\n};\n\nexport function prepareRecorderGenerationContext(\n input: RecorderGenerationInput,\n): RecorderGenerationContext {\n validateEvents(input.events);\n\n const maxScreenshots =\n input.maxScreenshots ?? DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS;\n const screenshotAssets = createMidsceneRecorderMarkdownScreenshotAssets(\n input.events,\n {\n baseDir: './screenshots',\n maxScreenshots,\n },\n );\n const screenshotPathByEventHash = new Map(\n screenshotAssets.map((asset) => [asset.eventHashId, asset.relativePath]),\n );\n\n return {\n summary: prepareEventSummary(input.events, {\n testName: input.testName,\n screenshotPathByEventHash,\n }),\n screenshotAssets,\n };\n}\n\nexport const createMessageContent = (\n promptText: string,\n screenshots: string[] = [],\n includeScreenshots = true,\n) => {\n const messageContent: any[] = [\n {\n type: 'text',\n text: promptText,\n },\n ];\n\n if (includeScreenshots && screenshots.length > 0) {\n messageContent.unshift({\n type: 'text',\n text: 'Here are screenshots from the recording session to help you understand the context:',\n });\n\n screenshots.forEach((screenshot) => {\n messageContent.push({\n type: 'image_url',\n image_url: {\n url: screenshot,\n },\n });\n });\n 
}\n\n return messageContent;\n};\n"],"names":["cleanRecorderSemanticField","value","undefined","validateEvents","events","Error","getScreenshotsForLLM","maxScreenshots","createMidsceneRecorderMarkdownScreenshotAssets","asset","filterEventsByType","event","createEventCounts","filteredEvents","totalEvents","extractInputDescriptions","inputEvents","item","processEventsForLLM","screenshotPathByEventHash","Map","getMidsceneRecorderEventDescription","prepareEventSummary","options","eventCounts","startUrl","clickDescriptions","desc","Boolean","inputDescriptions","urls","e","url","processedEvents","prepareRecorderGenerationContext","input","DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS","screenshotAssets","createMessageContent","promptText","screenshots","includeScreenshots","messageContent","screenshot"],"mappings":";AAyFA,SAASA,2BAA2BC,KAAc;IAChD,OAAOA,OAAO,WAAW,+BAA+BC,SAAYD;AACtE;AAEO,MAAME,iBAAiB,CAACC;IAC7B,IAAI,CAACA,OAAO,MAAM,EAChB,MAAM,IAAIC,MAAM;AAEpB;AAEO,MAAMC,uBAAuB,CAClCF,QACAG,iBAAiB,CAAC,GAEXC,+CAA+CJ,QAAQ;QAC5D,SAAS;QACTG;IACF,GAAG,GAAG,CAAC,CAACE,QAAUA,MAAM,OAAO;AAG1B,MAAMC,qBAAqB,CAChCN,SAEO;QACL,kBAAkBA,OAAO,MAAM,CAAC,CAACO,QAAUA,AAAe,iBAAfA,MAAM,IAAI;QACrD,aAAaP,OAAO,MAAM,CAAC,CAACO,QAAUA,AAAe,YAAfA,MAAM,IAAI;QAChD,aAAaP,OAAO,MAAM,CAAC,CAACO,QAAUA,AAAe,YAAfA,MAAM,IAAI;QAChD,cAAcP,OAAO,MAAM,CAAC,CAACO,QAAUA,AAAe,aAAfA,MAAM,IAAI;IACnD;AAGK,MAAMC,oBAAoB,CAC/BC,gBACAC,cAEO;QACL,YAAYD,eAAe,gBAAgB,CAAC,MAAM;QAClD,OAAOA,eAAe,WAAW,CAAC,MAAM;QACxC,OAAOA,eAAe,WAAW,CAAC,MAAM;QACxC,QAAQA,eAAe,YAAY,CAAC,MAAM;QAC1C,OAAOC;IACT;AAGK,MAAMC,2BAA2B,CACtCC,cAEOA,YACJ,GAAG,CAAC,CAACL,QAAW;YACf,aAAaX,2BAA2BW,MAAM,kBAAkB,KAAK;YACrE,OAAOA,MAAM,KAAK,IAAI;QACxB,IACC,MAAM,CAAC,CAACM,OAASA,KAAK,WAAW,IAAIA,KAAK,KAAK;AAG7C,MAAMC,sBAAsB,CACjCd,QACAe,4BAAiD,IAAIC,KAAK,GAEnDhB,OAAO,GAAG,CAAC,CAACO,QAAW;YAC5B,QAAQA,MAAM,MAAM;YACpB,MAAMA,MAAM,IAAI;YAChB,WAAWA,MAAM,SAAS;YAC1B,QAAQA,MAAM,MAAM;YACpB,YAAYA,MAAM,UAAU;YAC5B,mBAAmBA,MAAM,iBAAiB;YAC1C,kBAAkBA,MAAM,gBAAgB;YACxC,KAAKA,MAAM,GA
AG;YACd,OAAOA,MAAM,KAAK;YAClB,oBAAoBX,2BAA2BW,MAAM,kBAAkB;YACvE,mBAAmBX,2BAA2BW,MAAM,iBAAiB;YACrE,eAAeX,2BAA2BW,MAAM,aAAa;YAC7D,oBAAoBA,MAAM,kBAAkB;YAC5C,aAAaU,oCAAoCV;YACjD,OAAOA,MAAM,KAAK;YAClB,UAAUA,MAAM,QAAQ;YACxB,aAAaA,MAAM,WAAW;YAC9B,gBAAgBQ,0BAA0B,GAAG,CAACR,MAAM,MAAM;QAC5D;AAGK,MAAMW,sBAAsB,CACjClB,QACAmB,UAII,CAAC,CAAC;IAEN,MAAMV,iBAAiBH,mBAAmBN;IAC1C,MAAMoB,cAAcZ,kBAAkBC,gBAAgBT,OAAO,MAAM;IAEnE,MAAMqB,WACJZ,eAAe,gBAAgB,CAAC,MAAM,GAAG,IACrCA,eAAe,gBAAgB,CAAC,EAAE,CAAC,GAAG,IAAI,KAC1C;IAEN,MAAMa,oBAAoBb,eAAe,WAAW,CACjD,GAAG,CAAC,CAACF,QAAUA,MAAM,kBAAkB,EACvC,MAAM,CAAC,CAACgB,OAAyBC,QAAQD,OACzC,KAAK,CAAC,GAAG;IAEZ,MAAME,oBAAoBd,yBACxBF,eAAe,WAAW,EAC1B,KAAK,CAAC,GAAG;IAEX,MAAMiB,OAAOjB,eAAe,gBAAgB,CACzC,GAAG,CAAC,CAACkB,IAAMA,EAAE,GAAG,EAChB,MAAM,CAAC,CAACC,MAAuBJ,QAAQI,MACvC,KAAK,CAAC,GAAG;IAEZ,MAAMC,kBAAkBf,oBACtBd,QACAmB,QAAQ,yBAAyB;IAGnC,OAAO;QACL,UAAUA,QAAQ,QAAQ,IAAI;QAC9BE;QACAD;QACAM;QACAJ;QACAG;QACA,QAAQI;IACV;AACF;AAEO,SAASC,iCACdC,KAA8B;IAE9BhC,eAAegC,MAAM,MAAM;IAE3B,MAAM5B,iBACJ4B,MAAM,cAAc,IAAIC;IAC1B,MAAMC,mBAAmB7B,+CACvB2B,MAAM,MAAM,EACZ;QACE,SAAS;QACT5B;IACF;IAEF,MAAMY,4BAA4B,IAAIC,IACpCiB,iBAAiB,GAAG,CAAC,CAAC5B,QAAU;YAACA,MAAM,WAAW;YAAEA,MAAM,YAAY;SAAC;IAGzE,OAAO;QACL,SAASa,oBAAoBa,MAAM,MAAM,EAAE;YACzC,UAAUA,MAAM,QAAQ;YACxBhB;QACF;QACAkB;IACF;AACF;AAEO,MAAMC,uBAAuB,CAClCC,YACAC,cAAwB,EAAE,EAC1BC,qBAAqB,IAAI;IAEzB,MAAMC,iBAAwB;QAC5B;YACE,MAAM;YACN,MAAMH;QACR;KACD;IAED,IAAIE,sBAAsBD,YAAY,MAAM,GAAG,GAAG;QAChDE,eAAe,OAAO,CAAC;YACrB,MAAM;YACN,MAAM;QACR;QAEAF,YAAY,OAAO,CAAC,CAACG;YACnBD,eAAe,IAAI,CAAC;gBAClB,MAAM;gBACN,WAAW;oBACT,KAAKC;gBACP;YACF;QACF;IACF;IAEA,OAAOD;AACT"}
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/recorder-generation-common.mjs","sources":["../../../../src/ai-model/prompt/recorder-generation-common.ts"],"sourcesContent":["import {\n DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS,\n type MidsceneRecorderEvent,\n type MidsceneRecorderMarkdownScreenshotAsset,\n type MidsceneRecorderSemantic,\n type MidsceneRecorderTarget,\n createMidsceneRecorderMarkdownScreenshotAssets,\n getMidsceneRecorderEventDescription,\n getMidsceneRecorderSemantic,\n} from '@midscene/shared/recorder';\n\nexport interface EventCounts {\n navigation: number;\n click: number;\n input: number;\n scroll: number;\n total: number;\n}\n\nexport interface InputDescription {\n description: string;\n value: string;\n}\n\nexport interface ProcessedEvent {\n hashId: string;\n type: string;\n timestamp: number;\n source?: string;\n actionType?: string;\n url?: string;\n title?: string;\n semantic?: MidsceneRecorderSemantic;\n description?: string;\n value?: string;\n typedText?: string;\n inputIndex?: number;\n isSequentialInput?: boolean;\n hasNeighborInput?: boolean;\n previousInputDescription?: string;\n previousActionDescription?: string;\n nextActionDescription?: string;\n neighborInputValues?: string[];\n pageInfo?: any;\n elementRect?: any;\n screenshotPath?: string;\n}\n\nexport interface EventSummary {\n testName: string;\n startUrl: string;\n eventCounts: EventCounts;\n urls: string[];\n clickDescriptions: string[];\n inputDescriptions: InputDescription[];\n events: ProcessedEvent[];\n}\n\nexport interface RecorderGenerationContext {\n summary: EventSummary;\n screenshotAssets: MidsceneRecorderMarkdownScreenshotAsset[];\n}\n\nexport type ChromeRecordedEvent = MidsceneRecorderEvent;\n\nconst MAX_RECORDER_GENERATION_SEMANTIC_TEXT_LENGTH = 1200;\nconst MAX_RECORDER_GENERATION_SEMANTIC_ERROR_LENGTH = 400;\n\nexport interface RecorderGenerationOptions {\n testName?: string;\n includeTimestamps?: boolean;\n maxScreenshots?: number;\n description?: 
string;\n /** Language for human-readable generated content (e.g. 'English', 'Chinese'). Keys and API names are kept as-is. */\n language?: string;\n navigationInfo?: {\n urls?: string[];\n titles?: string[];\n initialViewport?: {\n width?: number;\n height?: number;\n };\n };\n}\n\nexport interface RecorderGenerationInput extends RecorderGenerationOptions {\n target: MidsceneRecorderTarget;\n events: MidsceneRecorderEvent[];\n}\n\nexport interface FilteredEvents {\n navigationEvents: ChromeRecordedEvent[];\n clickEvents: ChromeRecordedEvent[];\n inputEvents: ChromeRecordedEvent[];\n scrollEvents: ChromeRecordedEvent[];\n}\n\nfunction cleanRecorderSemanticField(value?: string) {\n return value?.trim() === 'AI is analyzing element...' ? undefined : value;\n}\n\nfunction truncateRecorderGenerationText(value: string, maxLength: number) {\n if (value.length <= maxLength) {\n return value;\n }\n return `${value.slice(0, maxLength)}... [truncated ${value.length - maxLength} chars]`;\n}\n\nfunction compactRecorderSemanticText(value?: string) {\n const cleaned = cleanRecorderSemanticField(value);\n return cleaned\n ? truncateRecorderGenerationText(\n cleaned,\n MAX_RECORDER_GENERATION_SEMANTIC_TEXT_LENGTH,\n )\n : undefined;\n}\n\nfunction compactRecorderSemanticError(value?: string) {\n return value\n ? 
truncateRecorderGenerationText(\n value,\n MAX_RECORDER_GENERATION_SEMANTIC_ERROR_LENGTH,\n )\n : undefined;\n}\n\nexport function compactRecorderSemanticForGeneration(\n semantic?: MidsceneRecorderSemantic,\n): MidsceneRecorderSemantic | undefined {\n if (!semantic) {\n return undefined;\n }\n\n return {\n source: semantic.source,\n status: semantic.status,\n confidence: semantic.confidence,\n elementDescription: compactRecorderSemanticText(\n semantic.elementDescription,\n ),\n replayInstruction: compactRecorderSemanticText(semantic.replayInstruction),\n actionSummary: compactRecorderSemanticText(semantic.actionSummary),\n error: compactRecorderSemanticError(semantic.error),\n ...(semantic.aiDescribe\n ? {\n aiDescribe: {\n verifyPrompt: semantic.aiDescribe.verifyPrompt,\n verifyPassed: semantic.aiDescribe.verifyPassed,\n deepLocate: semantic.aiDescribe.deepLocate,\n centerDistance: semantic.aiDescribe.centerDistance,\n expectedCenter: semantic.aiDescribe.expectedCenter,\n actualCenter: semantic.aiDescribe.actualCenter,\n },\n }\n : {}),\n };\n}\n\nexport const validateEvents = (events: ChromeRecordedEvent[]): void => {\n if (!events.length) {\n throw new Error('No events provided for test generation');\n }\n};\n\nexport const getScreenshotsForLLM = (\n events: ChromeRecordedEvent[],\n maxScreenshots = 1,\n): string[] => {\n return createMidsceneRecorderMarkdownScreenshotAssets(events, {\n baseDir: './screenshots',\n maxScreenshots,\n }).map((asset) => asset.dataUrl);\n};\n\nexport const filterEventsByType = (\n events: ChromeRecordedEvent[],\n): FilteredEvents => {\n return {\n navigationEvents: events.filter((event) => event.type === 'navigation'),\n clickEvents: events.filter((event) => event.type === 'click'),\n inputEvents: events.filter((event) => event.type === 'input'),\n scrollEvents: events.filter((event) => event.type === 'scroll'),\n };\n};\n\nexport const createEventCounts = (\n filteredEvents: FilteredEvents,\n totalEvents: number,\n): EventCounts 
=> {\n return {\n navigation: filteredEvents.navigationEvents.length,\n click: filteredEvents.clickEvents.length,\n input: filteredEvents.inputEvents.length,\n scroll: filteredEvents.scrollEvents.length,\n total: totalEvents,\n };\n};\n\nexport const extractInputDescriptions = (\n inputEvents: ChromeRecordedEvent[],\n): InputDescription[] => {\n return inputEvents\n .map((event) => {\n const semantic = getMidsceneRecorderSemantic(event);\n return {\n description:\n cleanRecorderSemanticField(semantic?.elementDescription) || '',\n value: event.value || '',\n };\n })\n .filter((item) => item.description && item.value);\n};\n\nexport const processEventsForLLM = (\n events: ChromeRecordedEvent[],\n screenshotPathByEventHash: Map<string, string> = new Map(),\n): ProcessedEvent[] => {\n let inputIndex = 0;\n return events.map((event, index) => {\n const previousEvent = events[index - 1];\n const nextEvent = events[index + 1];\n const previousInput = events\n .slice(0, index)\n .reverse()\n .find((candidate) => candidate.type === 'input');\n const nextInput = events\n .slice(index + 1)\n .find((candidate) => candidate.type === 'input');\n const isInput = event.type === 'input';\n const inputSequenceIndex = isInput ? ++inputIndex : undefined;\n const hasNeighborInput = Boolean(previousInput || nextInput);\n const neighborInputValues = isInput\n ? [previousInput?.value, nextInput?.value].filter(\n (value): value is string => Boolean(value),\n )\n : undefined;\n const semantic = compactRecorderSemanticForGeneration(\n getMidsceneRecorderSemantic(event),\n );\n\n return {\n hashId: event.hashId,\n type: event.type,\n timestamp: event.timestamp,\n source: event.source,\n actionType: event.actionType,\n url: event.url,\n title: event.title,\n semantic,\n description: getMidsceneRecorderEventDescription(event),\n value: event.value,\n previousActionDescription: previousEvent\n ? 
getMidsceneRecorderEventDescription(previousEvent)\n : undefined,\n nextActionDescription: nextEvent\n ? getMidsceneRecorderEventDescription(nextEvent)\n : undefined,\n ...(isInput\n ? {\n typedText: event.value || '',\n inputIndex: inputSequenceIndex,\n isSequentialInput:\n previousEvent?.type === 'input' || nextEvent?.type === 'input',\n hasNeighborInput,\n previousInputDescription: previousInput\n ? getMidsceneRecorderEventDescription(previousInput)\n : undefined,\n neighborInputValues:\n neighborInputValues && neighborInputValues.length > 0\n ? neighborInputValues\n : undefined,\n }\n : {}),\n pageInfo: event.pageInfo,\n elementRect: event.elementRect,\n screenshotPath: screenshotPathByEventHash.get(event.hashId),\n };\n });\n};\n\nexport const prepareEventSummary = (\n events: ChromeRecordedEvent[],\n options: {\n testName?: string;\n maxScreenshots?: number;\n screenshotPathByEventHash?: Map<string, string>;\n } = {},\n): EventSummary => {\n const filteredEvents = filterEventsByType(events);\n const eventCounts = createEventCounts(filteredEvents, events.length);\n\n const startUrl =\n filteredEvents.navigationEvents.length > 0\n ? 
filteredEvents.navigationEvents[0].url || ''\n : '';\n\n const clickDescriptions = filteredEvents.clickEvents\n .map((event) => getMidsceneRecorderSemantic(event)?.elementDescription)\n .filter((desc): desc is string => Boolean(desc))\n .slice(0, 10);\n\n const inputDescriptions = extractInputDescriptions(\n filteredEvents.inputEvents,\n ).slice(0, 10);\n\n const urls = filteredEvents.navigationEvents\n .map((e) => e.url)\n .filter((url): url is string => Boolean(url))\n .slice(0, 5);\n\n const processedEvents = processEventsForLLM(\n events,\n options.screenshotPathByEventHash,\n );\n\n return {\n testName: options.testName || 'Automated test from recorded events',\n startUrl,\n eventCounts,\n urls,\n clickDescriptions,\n inputDescriptions,\n events: processedEvents,\n };\n};\n\nexport function prepareRecorderGenerationContext(\n input: RecorderGenerationInput,\n): RecorderGenerationContext {\n validateEvents(input.events);\n\n const maxScreenshots =\n input.maxScreenshots ?? DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS;\n const screenshotAssets = createMidsceneRecorderMarkdownScreenshotAssets(\n input.events,\n {\n baseDir: './screenshots',\n maxScreenshots,\n },\n );\n const screenshotPathByEventHash = new Map(\n screenshotAssets.map((asset) => [asset.eventHashId, asset.relativePath]),\n );\n\n return {\n summary: prepareEventSummary(input.events, {\n testName: input.testName,\n screenshotPathByEventHash,\n }),\n screenshotAssets,\n };\n}\n\nexport const createMessageContent = (\n promptText: string,\n screenshots: string[] = [],\n includeScreenshots = true,\n) => {\n const messageContent: any[] = [\n {\n type: 'text',\n text: promptText,\n },\n ];\n\n if (includeScreenshots && screenshots.length > 0) {\n messageContent.unshift({\n type: 'text',\n text: 'Here are screenshots from the recording session to help you understand the context:',\n });\n\n screenshots.forEach((screenshot) => {\n messageContent.push({\n type: 'image_url',\n image_url: {\n url: 
screenshot,\n },\n });\n });\n }\n\n return messageContent;\n};\n"],"names":["MAX_RECORDER_GENERATION_SEMANTIC_TEXT_LENGTH","MAX_RECORDER_GENERATION_SEMANTIC_ERROR_LENGTH","cleanRecorderSemanticField","value","undefined","truncateRecorderGenerationText","maxLength","compactRecorderSemanticText","cleaned","compactRecorderSemanticError","compactRecorderSemanticForGeneration","semantic","validateEvents","events","Error","getScreenshotsForLLM","maxScreenshots","createMidsceneRecorderMarkdownScreenshotAssets","asset","filterEventsByType","event","createEventCounts","filteredEvents","totalEvents","extractInputDescriptions","inputEvents","getMidsceneRecorderSemantic","item","processEventsForLLM","screenshotPathByEventHash","Map","inputIndex","index","previousEvent","nextEvent","previousInput","candidate","nextInput","isInput","inputSequenceIndex","hasNeighborInput","Boolean","neighborInputValues","getMidsceneRecorderEventDescription","prepareEventSummary","options","eventCounts","startUrl","clickDescriptions","desc","inputDescriptions","urls","e","url","processedEvents","prepareRecorderGenerationContext","input","DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS","screenshotAssets","createMessageContent","promptText","screenshots","includeScreenshots","messageContent","screenshot"],"mappings":";AAiEA,MAAMA,+CAA+C;AACrD,MAAMC,gDAAgD;AA+BtD,SAASC,2BAA2BC,KAAc;IAChD,OAAOA,OAAO,WAAW,+BAA+BC,SAAYD;AACtE;AAEA,SAASE,+BAA+BF,KAAa,EAAEG,SAAiB;IACtE,IAAIH,MAAM,MAAM,IAAIG,WAClB,OAAOH;IAET,OAAO,GAAGA,MAAM,KAAK,CAAC,GAAGG,WAAW,eAAe,EAAEH,MAAM,MAAM,GAAGG,UAAU,OAAO,CAAC;AACxF;AAEA,SAASC,4BAA4BJ,KAAc;IACjD,MAAMK,UAAUN,2BAA2BC;IAC3C,OAAOK,UACHH,+BACEG,SACAR,gDAEFI;AACN;AAEA,SAASK,6BAA6BN,KAAc;IAClD,OAAOA,QACHE,+BACEF,OACAF,iDAEFG;AACN;AAEO,SAASM,qCACdC,QAAmC;IAEnC,IAAI,CAACA,UACH;IAGF,OAAO;QACL,QAAQA,SAAS,MAAM;QACvB,QAAQA,SAAS,MAAM;QACvB,YAAYA,SAAS,UAAU;QAC/B,oBAAoBJ,4BAClBI,SAAS,kBAAkB;QAE7B,mBAAmBJ,4BAA4BI,SAAS,iBAAiB;QACzE,eAAeJ,4BAA4BI,SAAS,aAAa;QACjE,OAAOF,6BAA6BE,SAAS,KAAK;QAClD,GAA
IA,SAAS,UAAU,GACnB;YACE,YAAY;gBACV,cAAcA,SAAS,UAAU,CAAC,YAAY;gBAC9C,cAAcA,SAAS,UAAU,CAAC,YAAY;gBAC9C,YAAYA,SAAS,UAAU,CAAC,UAAU;gBAC1C,gBAAgBA,SAAS,UAAU,CAAC,cAAc;gBAClD,gBAAgBA,SAAS,UAAU,CAAC,cAAc;gBAClD,cAAcA,SAAS,UAAU,CAAC,YAAY;YAChD;QACF,IACA,CAAC,CAAC;IACR;AACF;AAEO,MAAMC,iBAAiB,CAACC;IAC7B,IAAI,CAACA,OAAO,MAAM,EAChB,MAAM,IAAIC,MAAM;AAEpB;AAEO,MAAMC,uBAAuB,CAClCF,QACAG,iBAAiB,CAAC,GAEXC,+CAA+CJ,QAAQ;QAC5D,SAAS;QACTG;IACF,GAAG,GAAG,CAAC,CAACE,QAAUA,MAAM,OAAO;AAG1B,MAAMC,qBAAqB,CAChCN,SAEO;QACL,kBAAkBA,OAAO,MAAM,CAAC,CAACO,QAAUA,AAAe,iBAAfA,MAAM,IAAI;QACrD,aAAaP,OAAO,MAAM,CAAC,CAACO,QAAUA,AAAe,YAAfA,MAAM,IAAI;QAChD,aAAaP,OAAO,MAAM,CAAC,CAACO,QAAUA,AAAe,YAAfA,MAAM,IAAI;QAChD,cAAcP,OAAO,MAAM,CAAC,CAACO,QAAUA,AAAe,aAAfA,MAAM,IAAI;IACnD;AAGK,MAAMC,oBAAoB,CAC/BC,gBACAC,cAEO;QACL,YAAYD,eAAe,gBAAgB,CAAC,MAAM;QAClD,OAAOA,eAAe,WAAW,CAAC,MAAM;QACxC,OAAOA,eAAe,WAAW,CAAC,MAAM;QACxC,QAAQA,eAAe,YAAY,CAAC,MAAM;QAC1C,OAAOC;IACT;AAGK,MAAMC,2BAA2B,CACtCC,cAEOA,YACJ,GAAG,CAAC,CAACL;QACJ,MAAMT,WAAWe,4BAA4BN;QAC7C,OAAO;YACL,aACElB,2BAA2BS,UAAU,uBAAuB;YAC9D,OAAOS,MAAM,KAAK,IAAI;QACxB;IACF,GACC,MAAM,CAAC,CAACO,OAASA,KAAK,WAAW,IAAIA,KAAK,KAAK;AAG7C,MAAMC,sBAAsB,CACjCf,QACAgB,4BAAiD,IAAIC,KAAK;IAE1D,IAAIC,aAAa;IACjB,OAAOlB,OAAO,GAAG,CAAC,CAACO,OAAOY;QACxB,MAAMC,gBAAgBpB,MAAM,CAACmB,QAAQ,EAAE;QACvC,MAAME,YAAYrB,MAAM,CAACmB,QAAQ,EAAE;QACnC,MAAMG,gBAAgBtB,OACnB,KAAK,CAAC,GAAGmB,OACT,OAAO,GACP,IAAI,CAAC,CAACI,YAAcA,AAAmB,YAAnBA,UAAU,IAAI;QACrC,MAAMC,YAAYxB,OACf,KAAK,CAACmB,QAAQ,GACd,IAAI,CAAC,CAACI,YAAcA,AAAmB,YAAnBA,UAAU,IAAI;QACrC,MAAME,UAAUlB,AAAe,YAAfA,MAAM,IAAI;QAC1B,MAAMmB,qBAAqBD,UAAU,EAAEP,aAAa3B;QACpD,MAAMoC,mBAAmBC,QAAQN,iBAAiBE;QAClD,MAAMK,sBAAsBJ,UACxB;YAACH,eAAe;YAAOE,WAAW;SAAM,CAAC,MAAM,CAC7C,CAAClC,QAA2BsC,QAAQtC,UAEtCC;QACJ,MAAMO,WAAWD,qCACfgB,4BAA4BN;QAG9B,OAAO;YACL,QAAQA,MAAM,MAAM;YACpB,MAAMA,MAAM,IAAI;YAChB,WAAWA,MAAM,SAAS;YAC1B,QAAQA,MAAM,MAAM;YACpB,YAAYA,MAAM,UAAU;YAC5B,KAAKA,MAAM,GAAG;YACd,OAAOA,MAAM,KAAK;YAClBT;YACA,aAAagC,oCAAoCvB;YACjD,OAAOA,MAAM,KAAK;YAClB,2BAA2Ba,gB
ACvBU,oCAAoCV,iBACpC7B;YACJ,uBAAuB8B,YACnBS,oCAAoCT,aACpC9B;YACJ,GAAIkC,UACA;gBACE,WAAWlB,MAAM,KAAK,IAAI;gBAC1B,YAAYmB;gBACZ,mBACEN,eAAe,SAAS,WAAWC,WAAW,SAAS;gBACzDM;gBACA,0BAA0BL,gBACtBQ,oCAAoCR,iBACpC/B;gBACJ,qBACEsC,uBAAuBA,oBAAoB,MAAM,GAAG,IAChDA,sBACAtC;YACR,IACA,CAAC,CAAC;YACN,UAAUgB,MAAM,QAAQ;YACxB,aAAaA,MAAM,WAAW;YAC9B,gBAAgBS,0BAA0B,GAAG,CAACT,MAAM,MAAM;QAC5D;IACF;AACF;AAEO,MAAMwB,sBAAsB,CACjC/B,QACAgC,UAII,CAAC,CAAC;IAEN,MAAMvB,iBAAiBH,mBAAmBN;IAC1C,MAAMiC,cAAczB,kBAAkBC,gBAAgBT,OAAO,MAAM;IAEnE,MAAMkC,WACJzB,eAAe,gBAAgB,CAAC,MAAM,GAAG,IACrCA,eAAe,gBAAgB,CAAC,EAAE,CAAC,GAAG,IAAI,KAC1C;IAEN,MAAM0B,oBAAoB1B,eAAe,WAAW,CACjD,GAAG,CAAC,CAACF,QAAUM,4BAA4BN,QAAQ,oBACnD,MAAM,CAAC,CAAC6B,OAAyBR,QAAQQ,OACzC,KAAK,CAAC,GAAG;IAEZ,MAAMC,oBAAoB1B,yBACxBF,eAAe,WAAW,EAC1B,KAAK,CAAC,GAAG;IAEX,MAAM6B,OAAO7B,eAAe,gBAAgB,CACzC,GAAG,CAAC,CAAC8B,IAAMA,EAAE,GAAG,EAChB,MAAM,CAAC,CAACC,MAAuBZ,QAAQY,MACvC,KAAK,CAAC,GAAG;IAEZ,MAAMC,kBAAkB1B,oBACtBf,QACAgC,QAAQ,yBAAyB;IAGnC,OAAO;QACL,UAAUA,QAAQ,QAAQ,IAAI;QAC9BE;QACAD;QACAK;QACAH;QACAE;QACA,QAAQI;IACV;AACF;AAEO,SAASC,iCACdC,KAA8B;IAE9B5C,eAAe4C,MAAM,MAAM;IAE3B,MAAMxC,iBACJwC,MAAM,cAAc,IAAIC;IAC1B,MAAMC,mBAAmBzC,+CACvBuC,MAAM,MAAM,EACZ;QACE,SAAS;QACTxC;IACF;IAEF,MAAMa,4BAA4B,IAAIC,IACpC4B,iBAAiB,GAAG,CAAC,CAACxC,QAAU;YAACA,MAAM,WAAW;YAAEA,MAAM,YAAY;SAAC;IAGzE,OAAO;QACL,SAAS0B,oBAAoBY,MAAM,MAAM,EAAE;YACzC,UAAUA,MAAM,QAAQ;YACxB3B;QACF;QACA6B;IACF;AACF;AAEO,MAAMC,uBAAuB,CAClCC,YACAC,cAAwB,EAAE,EAC1BC,qBAAqB,IAAI;IAEzB,MAAMC,iBAAwB;QAC5B;YACE,MAAM;YACN,MAAMH;QACR;KACD;IAED,IAAIE,sBAAsBD,YAAY,MAAM,GAAG,GAAG;QAChDE,eAAe,OAAO,CAAC;YACrB,MAAM;YACN,MAAM;QACR;QAEAF,YAAY,OAAO,CAAC,CAACG;YACnBD,eAAe,IAAI,CAAC;gBAClB,MAAM;gBACN,WAAW;oBACT,KAAKC;gBACP;YACF;QACF;IACF;IAEA,OAAOD;AACT"}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import { getMidsceneRecorderEventDescription, getMidsceneRecorderScreenshotsForLLM } from "@midscene/shared/recorder";
|
|
1
|
+
import { getMidsceneRecorderEventDescription, getMidsceneRecorderScreenshotsForLLM, getMidsceneRecorderSemantic } from "@midscene/shared/recorder";
|
|
2
2
|
import { callAIWithObjectResponse } from "../service-caller/index.mjs";
|
|
3
|
+
import { compactRecorderSemanticForGeneration } from "./recorder-generation-common.mjs";
|
|
3
4
|
function summarizeRecorderEvents(input) {
|
|
4
5
|
const events = input.events;
|
|
5
6
|
const navigationEvents = events.filter((event)=>'navigation' === event.type);
|
|
@@ -28,10 +29,7 @@ function summarizeRecorderEvents(input) {
|
|
|
28
29
|
title: event.title,
|
|
29
30
|
value: event.value,
|
|
30
31
|
description: getMidsceneRecorderEventDescription(event),
|
|
31
|
-
|
|
32
|
-
replayInstruction: event.replayInstruction,
|
|
33
|
-
actionSummary: event.actionSummary,
|
|
34
|
-
semanticConfidence: event.semanticConfidence
|
|
32
|
+
semantic: compactRecorderSemanticForGeneration(getMidsceneRecorderSemantic(event))
|
|
35
33
|
}))
|
|
36
34
|
};
|
|
37
35
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/recorder-metadata-generator.mjs","sources":["../../../../src/ai-model/prompt/recorder-metadata-generator.ts"],"sourcesContent":["import type { IModelConfig } from '@midscene/shared/env';\nimport {\n type MidsceneRecorderEvent,\n type MidsceneRecorderTarget,\n getMidsceneRecorderEventDescription,\n getMidsceneRecorderScreenshotsForLLM,\n} from '@midscene/shared/recorder';\nimport { callAIWithObjectResponse } from '../service-caller/index';\n\nexport interface RecorderMetadataGenerationInput {\n target: MidsceneRecorderTarget;\n events: MidsceneRecorderEvent[];\n fallbackName?: string;\n maxScreenshots?: number;\n}\n\nexport interface RecorderGeneratedMetadata {\n title?: string;\n description?: string;\n}\n\nfunction summarizeRecorderEvents(input: RecorderMetadataGenerationInput) {\n const events = input.events;\n const navigationEvents = events.filter(\n (event) => event.type === 'navigation',\n );\n const clickEvents = events.filter((event) => event.type === 'click');\n const inputEvents = events.filter((event) => event.type === 'input');\n const scrollEvents = events.filter((event) => event.type === 'scroll');\n const urls = navigationEvents\n .map((event) => event.url)\n .filter((url): url is string => Boolean(url));\n const titles = navigationEvents\n .map((event) => event.title)\n .filter((title): title is string => Boolean(title));\n\n return {\n platform: input.target.platformId,\n target: input.target,\n fallbackName: input.fallbackName,\n pageCount: navigationEvents.length,\n pageTitles: titles.slice(0, 5),\n urls: urls.slice(0, 5),\n clickCount: clickEvents.length,\n inputCount: inputEvents.length,\n scrollCount: scrollEvents.length,\n totalActions: events.length,\n firstUrl: urls[0] || input.target.values.url || '',\n lastUrl: urls[urls.length - 1] || '',\n events: events.slice(0, 20).map((event) => ({\n type: event.type,\n actionType: event.actionType,\n url: event.url,\n title: event.title,\n value: event.value,\n 
description: getMidsceneRecorderEventDescription(event),\n
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/recorder-metadata-generator.mjs","sources":["../../../../src/ai-model/prompt/recorder-metadata-generator.ts"],"sourcesContent":["import type { IModelConfig } from '@midscene/shared/env';\nimport {\n type MidsceneRecorderEvent,\n type MidsceneRecorderTarget,\n getMidsceneRecorderEventDescription,\n getMidsceneRecorderScreenshotsForLLM,\n getMidsceneRecorderSemantic,\n} from '@midscene/shared/recorder';\nimport { callAIWithObjectResponse } from '../service-caller/index';\nimport { compactRecorderSemanticForGeneration } from './recorder-generation-common';\n\nexport interface RecorderMetadataGenerationInput {\n target: MidsceneRecorderTarget;\n events: MidsceneRecorderEvent[];\n fallbackName?: string;\n maxScreenshots?: number;\n}\n\nexport interface RecorderGeneratedMetadata {\n title?: string;\n description?: string;\n}\n\nfunction summarizeRecorderEvents(input: RecorderMetadataGenerationInput) {\n const events = input.events;\n const navigationEvents = events.filter(\n (event) => event.type === 'navigation',\n );\n const clickEvents = events.filter((event) => event.type === 'click');\n const inputEvents = events.filter((event) => event.type === 'input');\n const scrollEvents = events.filter((event) => event.type === 'scroll');\n const urls = navigationEvents\n .map((event) => event.url)\n .filter((url): url is string => Boolean(url));\n const titles = navigationEvents\n .map((event) => event.title)\n .filter((title): title is string => Boolean(title));\n\n return {\n platform: input.target.platformId,\n target: input.target,\n fallbackName: input.fallbackName,\n pageCount: navigationEvents.length,\n pageTitles: titles.slice(0, 5),\n urls: urls.slice(0, 5),\n clickCount: clickEvents.length,\n inputCount: inputEvents.length,\n scrollCount: scrollEvents.length,\n totalActions: events.length,\n firstUrl: urls[0] || input.target.values.url || '',\n lastUrl: urls[urls.length - 1] || '',\n events: events.slice(0, 20).map((event) => 
({\n type: event.type,\n actionType: event.actionType,\n url: event.url,\n title: event.title,\n value: event.value,\n description: getMidsceneRecorderEventDescription(event),\n semantic: compactRecorderSemanticForGeneration(\n getMidsceneRecorderSemantic(event),\n ),\n })),\n };\n}\n\nfunction normalizeMetadataValue(value: unknown) {\n return typeof value === 'string' ? value.trim() : '';\n}\n\nexport async function generateRecorderSessionMetadata(\n input: RecorderMetadataGenerationInput,\n modelConfig: IModelConfig,\n): Promise<RecorderGeneratedMetadata> {\n if (!input?.events?.length) {\n throw new Error('generateRecorderSessionMetadata: events are required.');\n }\n if (!modelConfig?.modelName) {\n throw new Error(\n 'generateRecorderSessionMetadata: modelConfig.modelName is required.',\n );\n }\n\n const summary = summarizeRecorderEvents(input);\n const screenshots = getMidsceneRecorderScreenshotsForLLM(\n input.events,\n input.maxScreenshots ?? 1,\n );\n const messageContent: any[] = [\n {\n type: 'text',\n text: `Generate a concise title (5-7 words) and brief description (1-2 sentences) for a Studio recording of user actions.\n\nThe recording can target Web, Android, iOS, HarmonyOS, or Computer. Do not assume it is a browser session unless the platform is web.\nDescribe what the user did or accomplished. The description should use the user as the subject, preferably starting with \"The user ...\". 
Do not start the description with \"The session ...\".\nThe title should be action-oriented and highlight the main task accomplished.\n\nSummary:\n${JSON.stringify(summary, null, 2)}\n\nRespond with a JSON object containing exactly \"title\" and \"description\".`,\n },\n ];\n\n for (const screenshot of screenshots) {\n messageContent.push({\n type: 'image_url',\n image_url: { url: screenshot },\n });\n }\n\n const response = await callAIWithObjectResponse<{\n title?: string;\n description?: string;\n }>(\n [\n {\n role: 'system',\n content:\n 'You generate clear, task-oriented titles and descriptions for recorded automation sessions.',\n },\n {\n role: 'user',\n content: messageContent,\n },\n ],\n modelConfig,\n );\n\n return {\n title: normalizeMetadataValue(response.content.title),\n description: normalizeMetadataValue(response.content.description),\n };\n}\n"],"names":["summarizeRecorderEvents","input","events","navigationEvents","event","clickEvents","inputEvents","scrollEvents","urls","url","Boolean","titles","title","getMidsceneRecorderEventDescription","compactRecorderSemanticForGeneration","getMidsceneRecorderSemantic","normalizeMetadataValue","value","generateRecorderSessionMetadata","modelConfig","Error","summary","screenshots","getMidsceneRecorderScreenshotsForLLM","messageContent","JSON","screenshot","response","callAIWithObjectResponse"],"mappings":";;;AAuBA,SAASA,wBAAwBC,KAAsC;IACrE,MAAMC,SAASD,MAAM,MAAM;IAC3B,MAAME,mBAAmBD,OAAO,MAAM,CACpC,CAACE,QAAUA,AAAe,iBAAfA,MAAM,IAAI;IAEvB,MAAMC,cAAcH,OAAO,MAAM,CAAC,CAACE,QAAUA,AAAe,YAAfA,MAAM,IAAI;IACvD,MAAME,cAAcJ,OAAO,MAAM,CAAC,CAACE,QAAUA,AAAe,YAAfA,MAAM,IAAI;IACvD,MAAMG,eAAeL,OAAO,MAAM,CAAC,CAACE,QAAUA,AAAe,aAAfA,MAAM,IAAI;IACxD,MAAMI,OAAOL,iBACV,GAAG,CAAC,CAACC,QAAUA,MAAM,GAAG,EACxB,MAAM,CAAC,CAACK,MAAuBC,QAAQD;IAC1C,MAAME,SAASR,iBACZ,GAAG,CAAC,CAACC,QAAUA,MAAM,KAAK,EAC1B,MAAM,CAAC,CAACQ,QAA2BF,QAAQE;IAE9C,OAAO;QACL,UAAUX,MAAM,MAAM,CAAC,UAAU;QACjC,QAAQA,MAAM,MAAM;QACpB,cAAcA,MAAM,YAAY;QAChC,WAAWE,iBAAiB,MA
AM;QAClC,YAAYQ,OAAO,KAAK,CAAC,GAAG;QAC5B,MAAMH,KAAK,KAAK,CAAC,GAAG;QACpB,YAAYH,YAAY,MAAM;QAC9B,YAAYC,YAAY,MAAM;QAC9B,aAAaC,aAAa,MAAM;QAChC,cAAcL,OAAO,MAAM;QAC3B,UAAUM,IAAI,CAAC,EAAE,IAAIP,MAAM,MAAM,CAAC,MAAM,CAAC,GAAG,IAAI;QAChD,SAASO,IAAI,CAACA,KAAK,MAAM,GAAG,EAAE,IAAI;QAClC,QAAQN,OAAO,KAAK,CAAC,GAAG,IAAI,GAAG,CAAC,CAACE,QAAW;gBAC1C,MAAMA,MAAM,IAAI;gBAChB,YAAYA,MAAM,UAAU;gBAC5B,KAAKA,MAAM,GAAG;gBACd,OAAOA,MAAM,KAAK;gBAClB,OAAOA,MAAM,KAAK;gBAClB,aAAaS,oCAAoCT;gBACjD,UAAUU,qCACRC,4BAA4BX;YAEhC;IACF;AACF;AAEA,SAASY,uBAAuBC,KAAc;IAC5C,OAAO,AAAiB,YAAjB,OAAOA,QAAqBA,MAAM,IAAI,KAAK;AACpD;AAEO,eAAeC,gCACpBjB,KAAsC,EACtCkB,WAAyB;IAEzB,IAAI,CAAClB,OAAO,QAAQ,QAClB,MAAM,IAAImB,MAAM;IAElB,IAAI,CAACD,aAAa,WAChB,MAAM,IAAIC,MACR;IAIJ,MAAMC,UAAUrB,wBAAwBC;IACxC,MAAMqB,cAAcC,qCAClBtB,MAAM,MAAM,EACZA,MAAM,cAAc,IAAI;IAE1B,MAAMuB,iBAAwB;QAC5B;YACE,MAAM;YACN,MAAM,CAAC;;;;;;;AAOb,EAAEC,KAAK,SAAS,CAACJ,SAAS,MAAM,GAAG;;wEAEqC,CAAC;QACrE;KACD;IAED,KAAK,MAAMK,cAAcJ,YACvBE,eAAe,IAAI,CAAC;QAClB,MAAM;QACN,WAAW;YAAE,KAAKE;QAAW;IAC/B;IAGF,MAAMC,WAAW,MAAMC,yBAIrB;QACE;YACE,MAAM;YACN,SACE;QACJ;QACA;YACE,MAAM;YACN,SAASJ;QACX;KACD,EACDL;IAGF,OAAO;QACL,OAAOH,uBAAuBW,SAAS,OAAO,CAAC,KAAK;QACpD,aAAaX,uBAAuBW,SAAS,OAAO,CAAC,WAAW;IAClE;AACF"}
|
|
@@ -15,8 +15,10 @@ Output JSON only:
|
|
|
15
15
|
Rules:
|
|
16
16
|
- Do NOT output coordinates as the main description.
|
|
17
17
|
- Do NOT mention "near coordinates", "nearby element", "near point", "red marker", highlighted box, highlighted element, or screenshot.
|
|
18
|
-
- Prefer stable target descriptions in this order: exact
|
|
19
|
-
-
|
|
18
|
+
- Prefer stable target descriptions in this order: exact stable control text > stable label > role + stable section/context > icon purpose > visual position.
|
|
19
|
+
- Treat placeholder or hint text that can change by user, time, data, or context as dynamic. Do not use dynamic hint values as the primary target description; prefer role + stable region + intent.
|
|
20
|
+
- For repeated collections, treat item identity text as dynamic unless the user is clearly verifying that exact item. This includes any list/grid/table/feed/menu or repeated record surface. Do not output descriptions like "<role> titled/named '<content>'"; prefer stable role + region + selection intent.
|
|
21
|
+
- Keep quoted UI text in the original UI language when it is a stable control label.
|
|
20
22
|
- Apply the platform guidance from the user event:
|
|
21
23
|
- Web: button, input, link, menu item, tab, dialog, aria-label, placeholder, form section.
|
|
22
24
|
- Mobile: tab, list item, text field, icon button, navigation bar, bottom bar, sheet, card, screen section.
|
|
@@ -31,13 +33,15 @@ Rules:
|
|
|
31
33
|
- Input-specific rules:
|
|
32
34
|
- The highlighted BEFORE screenshot marks the field that receives the text.
|
|
33
35
|
- The screenshot after the action may show the typed value; use it only to confirm the field, never as the field description.
|
|
34
|
-
- elementDescription must identify the field itself
|
|
35
|
-
- Never use "AI is analyzing element", the typed value, or a generic "input field" as elementDescription.
|
|
36
|
+
- elementDescription must identify the field itself by stable field role, nearby label, region, section, or sequence intent.
|
|
37
|
+
- Never use "AI is analyzing element", the typed value, the page title alone, or a generic "input field" as elementDescription.
|
|
38
|
+
- For consecutive input events, distinguish fields by stable role, section, order, current focus, filled/empty state, and neighboring actions instead of reusing the same generic field description.
|
|
36
39
|
- Input replayInstruction format: Input "<value>" into the element described as "<elementDescription>".
|
|
37
40
|
- Scroll target quality bar:
|
|
38
|
-
- elementDescription describes the scrollable page, panel, list, table, or section.
|
|
41
|
+
- elementDescription describes the scrollable page, panel, list, table, or section at the highlighted scroll point.
|
|
42
|
+
- When multiple scrollable regions are visible, preserve the specific region where the scroll happened, such as left/right/top/bottom panel, navigation area, content pane, dialog body, table, list, or menu. Do not generalize a panel/list scroll into the whole page.
|
|
39
43
|
- scrollDestinationDescription is required and describes what the scroll is trying to reveal or reach, using newly visible headings, section titles, list items, or stable content from the AFTER screenshot.
|
|
40
|
-
- Prefer descriptions like "
|
|
44
|
+
- Prefer descriptions like "Playwright integration documentation page, scrolling toward the API reference section" or "Android API documentation page, scrolling to the installation steps section".
|
|
41
45
|
- Do NOT write generic phrases like "more content", "the page", "current screen", or "main scrollable area".
|
|
42
46
|
- Scroll replayInstruction format: Scroll the page/region with description "<elementDescription>" by value "<recorded value>" until "<scrollDestinationDescription>" is visible.
|
|
43
47
|
- Scroll actionSummary format: Scroll <elementDescription> toward <scrollDestinationDescription>.
|