@midscene/shared 1.9.7-beta-20260616025249.0 → 1.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/env/parse-model-config.mjs +1 -1
- package/dist/es/img/index.mjs +2 -2
- package/dist/es/img/transform.mjs +29 -2
- package/dist/es/recorder.mjs +88 -8
- package/dist/lib/env/parse-model-config.js +1 -1
- package/dist/lib/img/index.js +8 -2
- package/dist/lib/img/transform.js +34 -1
- package/dist/lib/recorder.js +96 -7
- package/dist/types/img/index.d.ts +1 -1
- package/dist/types/img/transform.d.ts +2 -0
- package/dist/types/recorder.d.ts +35 -5
- package/package.json +1 -1
- package/src/img/index.ts +2 -0
- package/src/img/transform.ts +69 -1
- package/src/recorder.ts +175 -20
package/dist/es/env/parse-model-config.mjs
CHANGED
|
@@ -5,7 +5,7 @@ import { assert } from "../utils.mjs";
|
|
|
5
5
|
import { maskConfig, parseJson } from "./helper.mjs";
|
|
6
6
|
import { initDebugConfig } from "./init-debug.mjs";
|
|
7
7
|
const MODEL_CONFIG_DOC_URL = 'https://midscenejs.com/model-common-config.html';
|
|
8
|
-
const getCurrentVersion = ()=>"1.9.7-beta-20260616025249.0";
|
|
8
|
+
const getCurrentVersion = ()=>"1.9.7";
|
|
9
9
|
const getInvalidModelFamilyMessage = (modelFamily)=>`Invalid MIDSCENE_MODEL_FAMILY value: ${modelFamily}. Current version v${getCurrentVersion()} accepts the following model families: ${MODEL_FAMILY_VALUES.join(', ')}. You can also visit ${MODEL_CONFIG_DOC_URL} for the latest configuration information.`;
|
|
10
10
|
const KEYS_MAP = {
|
|
11
11
|
insight: INSIGHT_MODEL_CONFIG_KEYS,
|
package/dist/es/img/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
1
|
import { imageInfoOfBase64, isValidImageBuffer, isValidJPEGImageBuffer, isValidPNGImageBuffer, validateScreenshotBuffer } from "./info.mjs";
|
|
2
|
-
import { createImgBase64ByFormat, cropByRect, httpImg2Base64, localImg2Base64, paddingToMatchBlockByBase64, parseBase64, preProcessImageUrl, resizeAndConvertImgBuffer, resizeImgBase64, saveBase64Image, scaleImage, zoomForGPT4o } from "./transform.mjs";
|
|
2
|
+
import { createImgBase64ByFormat, cropByRect, httpImg2Base64, inferBase64ImageFormat, localImg2Base64, normalizeBase64Image, paddingToMatchBlockByBase64, parseBase64, preProcessImageUrl, resizeAndConvertImgBuffer, resizeImgBase64, saveBase64Image, scaleImage, zoomForGPT4o } from "./transform.mjs";
|
|
3
3
|
import { annotateRects, compositeElementInfoImg, compositePointMarkerImg, processImageElementInfo } from "./box-select.mjs";
|
|
4
|
-
export { annotateRects, compositeElementInfoImg, compositePointMarkerImg, createImgBase64ByFormat, cropByRect, httpImg2Base64, imageInfoOfBase64, isValidImageBuffer, isValidJPEGImageBuffer, isValidPNGImageBuffer, localImg2Base64, paddingToMatchBlockByBase64, parseBase64, preProcessImageUrl, processImageElementInfo, resizeAndConvertImgBuffer, resizeImgBase64, saveBase64Image, scaleImage, validateScreenshotBuffer, zoomForGPT4o };
|
|
4
|
+
export { annotateRects, compositeElementInfoImg, compositePointMarkerImg, createImgBase64ByFormat, cropByRect, httpImg2Base64, imageInfoOfBase64, inferBase64ImageFormat, isValidImageBuffer, isValidJPEGImageBuffer, isValidPNGImageBuffer, localImg2Base64, normalizeBase64Image, paddingToMatchBlockByBase64, parseBase64, preProcessImageUrl, processImageElementInfo, resizeAndConvertImgBuffer, resizeImgBase64, saveBase64Image, scaleImage, validateScreenshotBuffer, zoomForGPT4o };
|
package/dist/es/img/transform.mjs
CHANGED
|
@@ -70,7 +70,26 @@ async function resizeAndConvertImgBuffer(inputFormat, inputData, newSize) {
|
|
|
70
70
|
};
|
|
71
71
|
}
|
|
72
72
|
const normalizeBase64Body = (body)=>body.replace(/\s/g, '');
|
|
73
|
+
const base64ImageDataUrlPattern = /^data:image\/[a-zA-Z0-9.+-]+;base64,/i;
|
|
74
|
+
const inferBase64ImageFormat = (base64Body)=>{
|
|
75
|
+
if (base64Body.startsWith('iVBORw0KGgo')) return 'png';
|
|
76
|
+
return 'jpeg';
|
|
77
|
+
};
|
|
78
|
+
function detectImageMimeTypeFromBuffer(buffer) {
|
|
79
|
+
if (buffer.length >= 8 && 0x89 === buffer[0] && 0x50 === buffer[1] && 0x4e === buffer[2] && 0x47 === buffer[3] && 0x0d === buffer[4] && 0x0a === buffer[5] && 0x1a === buffer[6] && 0x0a === buffer[7]) return 'image/png';
|
|
80
|
+
if (buffer.length >= 3 && 0xff === buffer[0] && 0xd8 === buffer[1] && 0xff === buffer[2]) return 'image/jpeg';
|
|
81
|
+
if (buffer.length >= 6 && 'GIF' === buffer.subarray(0, 3).toString('ascii')) return 'image/gif';
|
|
82
|
+
if (buffer.length >= 12 && 'RIFF' === buffer.subarray(0, 4).toString('ascii') && 'WEBP' === buffer.subarray(8, 12).toString('ascii')) return 'image/webp';
|
|
83
|
+
if (buffer.length >= 2 && 0x42 === buffer[0] && 0x4d === buffer[1]) return 'image/bmp';
|
|
84
|
+
}
|
|
73
85
|
const createImgBase64ByFormat = (format, body)=>`data:image/${format};base64,${normalizeBase64Body(body)}`;
|
|
86
|
+
const normalizeBase64Image = (base64)=>{
|
|
87
|
+
const trimmedBase64 = base64.trim();
|
|
88
|
+
if (base64ImageDataUrlPattern.test(trimmedBase64)) return trimmedBase64;
|
|
89
|
+
const base64Body = normalizeBase64Body(trimmedBase64);
|
|
90
|
+
node_assert(base64Body, 'base64 image must include image data');
|
|
91
|
+
return createImgBase64ByFormat(inferBase64ImageFormat(base64Body), base64Body);
|
|
92
|
+
};
|
|
74
93
|
async function resizeImgBase64(inputBase64, newSize) {
|
|
75
94
|
const { body, mimeType } = parseBase64(inputBase64);
|
|
76
95
|
const imageBuffer = Buffer.from(body, 'base64');
|
|
@@ -198,7 +217,15 @@ const parseBase64 = (fullBase64String)=>{
|
|
|
198
217
|
try {
|
|
199
218
|
const separator = ';base64,';
|
|
200
219
|
const index = fullBase64String.indexOf(separator);
|
|
201
|
-
if (-1 === index) throw new Error('Invalid base64 string');
|
|
220
|
+
if (-1 === index) {
|
|
221
|
+
const body = normalizeBase64Body(fullBase64String);
|
|
222
|
+
const mimeType = detectImageMimeTypeFromBuffer(Buffer.from(body, 'base64'));
|
|
223
|
+
if (!mimeType) throw new Error('Invalid base64 string');
|
|
224
|
+
return {
|
|
225
|
+
mimeType,
|
|
226
|
+
body
|
|
227
|
+
};
|
|
228
|
+
}
|
|
202
229
|
return {
|
|
203
230
|
mimeType: fullBase64String.slice(5, index),
|
|
204
231
|
body: normalizeBase64Body(fullBase64String.slice(index + separator.length))
|
|
@@ -266,4 +293,4 @@ async function scaleImage(imageBase64, scale) {
|
|
|
266
293
|
imageBase64: base64
|
|
267
294
|
};
|
|
268
295
|
}
|
|
269
|
-
export { createImgBase64ByFormat, cropByRect, httpImg2Base64, localImg2Base64, normalizeBase64Body, paddingToMatchBlock, paddingToMatchBlockByBase64, parseBase64, photonFromBase64, photonToBase64, preProcessImageUrl, resizeAndConvertImgBuffer, resizeImgBase64, saveBase64Image, scaleImage, zoomForGPT4o };
|
|
296
|
+
export { createImgBase64ByFormat, cropByRect, httpImg2Base64, inferBase64ImageFormat, localImg2Base64, normalizeBase64Body, normalizeBase64Image, paddingToMatchBlock, paddingToMatchBlockByBase64, parseBase64, photonFromBase64, photonToBase64, preProcessImageUrl, resizeAndConvertImgBuffer, resizeImgBase64, saveBase64Image, scaleImage, zoomForGPT4o };
|
package/dist/es/recorder.mjs
CHANGED
|
@@ -2,10 +2,87 @@ const DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS = 20;
|
|
|
2
2
|
function isMidsceneRecorderPendingDescription(value) {
|
|
3
3
|
return value?.trim() === 'AI is analyzing element...';
|
|
4
4
|
}
|
|
5
|
+
function getMidsceneRecorderSemantic(event) {
|
|
6
|
+
return event.semantic;
|
|
7
|
+
}
|
|
8
|
+
function getRecorderPointerActionVerb(actionType) {
|
|
9
|
+
switch(actionType){
|
|
10
|
+
case 'Tap':
|
|
11
|
+
return 'Tap';
|
|
12
|
+
case 'DoubleClick':
|
|
13
|
+
return 'Double click';
|
|
14
|
+
case 'LongPress':
|
|
15
|
+
return 'Long press';
|
|
16
|
+
case 'RightClick':
|
|
17
|
+
return 'Right click';
|
|
18
|
+
default:
|
|
19
|
+
return 'Click';
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
function getRecorderDragActionVerb(actionType) {
|
|
23
|
+
switch(actionType){
|
|
24
|
+
case 'Swipe':
|
|
25
|
+
return 'Swipe';
|
|
26
|
+
case 'DragAndDrop':
|
|
27
|
+
return 'Drag';
|
|
28
|
+
default:
|
|
29
|
+
return 'Drag';
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
function buildMidsceneRecorderReplayInstruction(event, elementDescription) {
|
|
33
|
+
switch(event.type){
|
|
34
|
+
case 'navigation':
|
|
35
|
+
if ('Stop' === event.actionType) return 'Stop loading the current page.';
|
|
36
|
+
if ('GoBack' === event.actionType) return 'Go back in the browser.';
|
|
37
|
+
if ('GoForward' === event.actionType) return 'Go forward in the browser.';
|
|
38
|
+
if ('Reload' === event.actionType) return 'Reload the current page.';
|
|
39
|
+
if ('NavigationChanged' === event.actionType && event.url) return `Wait for navigation to complete at \`${event.url}\`.`;
|
|
40
|
+
return event.url ? `Navigate to \`${event.url}\`.` : `Navigate using ${elementDescription}.`;
|
|
41
|
+
case 'scroll':
|
|
42
|
+
return event.scrollDestinationDescription ? `Scroll the page/region with description "${elementDescription}" by value "${event.value || 'down'}" until "${event.scrollDestinationDescription}" is visible.` : `Scroll the page/region with description "${elementDescription}" by value "${event.value || 'down'}".`;
|
|
43
|
+
case 'drag':
|
|
44
|
+
{
|
|
45
|
+
const verb = getRecorderDragActionVerb(event.actionType);
|
|
46
|
+
return `${verb} through the area described as "${elementDescription}".`;
|
|
47
|
+
}
|
|
48
|
+
case 'input':
|
|
49
|
+
return `Input "${event.value || ''}" into the element described as "${elementDescription}".`;
|
|
50
|
+
case 'keydown':
|
|
51
|
+
return `Press "${event.value || 'the recorded key'}" on the element described as "${elementDescription}".`;
|
|
52
|
+
default:
|
|
53
|
+
{
|
|
54
|
+
const verb = getRecorderPointerActionVerb(event.actionType);
|
|
55
|
+
if ('Long press' === verb) return `${verb} the element described as "${elementDescription}".`;
|
|
56
|
+
return `${verb} on the element described as "${elementDescription}".`;
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
function buildMidsceneRecorderActionSummary(event, elementDescription) {
|
|
61
|
+
switch(event.type){
|
|
62
|
+
case 'navigation':
|
|
63
|
+
if ('Stop' === event.actionType) return 'Stop page loading';
|
|
64
|
+
if ('GoBack' === event.actionType) return 'Go back';
|
|
65
|
+
if ('GoForward' === event.actionType) return 'Go forward';
|
|
66
|
+
if ('Reload' === event.actionType) return 'Reload page';
|
|
67
|
+
if ('NavigationChanged' === event.actionType && event.url) return `Wait for navigation to complete at ${event.url}`;
|
|
68
|
+
return event.url ? `Navigate to ${event.url}` : 'Navigate';
|
|
69
|
+
case 'scroll':
|
|
70
|
+
return event.scrollDestinationDescription ? `Scroll ${elementDescription} toward ${event.scrollDestinationDescription}` : `Scroll ${elementDescription}`;
|
|
71
|
+
case 'drag':
|
|
72
|
+
return `${getRecorderDragActionVerb(event.actionType)} ${elementDescription}`;
|
|
73
|
+
case 'input':
|
|
74
|
+
return `Input into ${elementDescription}`;
|
|
75
|
+
case 'keydown':
|
|
76
|
+
return `Press ${event.value || 'key'} on ${elementDescription}`;
|
|
77
|
+
default:
|
|
78
|
+
return `${getRecorderPointerActionVerb(event.actionType)} ${elementDescription}`;
|
|
79
|
+
}
|
|
80
|
+
}
|
|
5
81
|
function getMidsceneRecorderEventDescription(event) {
|
|
6
|
-
|
|
7
|
-
if (
|
|
8
|
-
if (
|
|
82
|
+
const semantic = getMidsceneRecorderSemantic(event);
|
|
83
|
+
if (semantic?.actionSummary && !isMidsceneRecorderPendingDescription(semantic.actionSummary)) return semantic.actionSummary;
|
|
84
|
+
if (semantic?.elementDescription && !isMidsceneRecorderPendingDescription(semantic.elementDescription)) return semantic.elementDescription;
|
|
85
|
+
if (semantic?.replayInstruction && !isMidsceneRecorderPendingDescription(semantic.replayInstruction)) return semantic.replayInstruction;
|
|
9
86
|
if ('navigation' === event.type && event.url) return `Navigate to ${event.url}`;
|
|
10
87
|
if (event.value) return event.actionType ? `${event.actionType} ${event.value}` : event.value;
|
|
11
88
|
if (event.elementRect?.x !== void 0 && event.elementRect?.y !== void 0) {
|
|
@@ -54,10 +131,12 @@ function getRecorderEventScreenshot(event) {
|
|
|
54
131
|
return event.screenshotWithBox || event.screenshotAfter || event.screenshotBefore;
|
|
55
132
|
}
|
|
56
133
|
function hasCoordinateFallback(event) {
|
|
57
|
-
|
|
134
|
+
const semantic = getMidsceneRecorderSemantic(event);
|
|
135
|
+
return !semantic?.elementDescription && event.elementRect?.x !== void 0 && event.elementRect?.y !== void 0;
|
|
58
136
|
}
|
|
59
137
|
function shouldIncludeMarkdownScreenshot(event, eventIndex, lastEventIndex) {
|
|
60
|
-
|
|
138
|
+
const semantic = getMidsceneRecorderSemantic(event);
|
|
139
|
+
return 0 === eventIndex || eventIndex === lastEventIndex || 'navigation' === event.type || 'scroll' === event.type || 'input' === event.type || Boolean(event.screenshotWithBox) || !semantic?.elementDescription || hasCoordinateFallback(event);
|
|
61
140
|
}
|
|
62
141
|
function getRecorderScreenshotCandidatePriority(candidate, firstEventIndex, lastEventIndex) {
|
|
63
142
|
const event = candidate.event;
|
|
@@ -66,9 +145,10 @@ function getRecorderScreenshotCandidatePriority(candidate, firstEventIndex, last
|
|
|
66
145
|
if (candidate.eventIndex === lastEventIndex) priority += 95;
|
|
67
146
|
if ('navigation' === event.type) priority += 80;
|
|
68
147
|
if (event.screenshotWithBox) priority += 70;
|
|
69
|
-
|
|
148
|
+
const semantic = getMidsceneRecorderSemantic(event);
|
|
149
|
+
if (semantic?.source === 'heuristic' || semantic?.confidence === 'low' || semantic?.error) priority += 60;
|
|
70
150
|
if ('input' === event.type || 'scroll' === event.type) priority += 40;
|
|
71
|
-
if (!
|
|
151
|
+
if (!semantic?.elementDescription || hasCoordinateFallback(event)) priority += 30;
|
|
72
152
|
return priority;
|
|
73
153
|
}
|
|
74
154
|
function selectEvenlyDistributedCandidates(candidates, count) {
|
|
@@ -166,4 +246,4 @@ function stringifyMidsceneRecorderTargetBlock(target) {
|
|
|
166
246
|
for (const [key, value] of values)lines.push(` ${key}: ${scalarToYaml(value)}`);
|
|
167
247
|
return lines.join('\n');
|
|
168
248
|
}
|
|
169
|
-
export { DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS, createMidsceneRecorderMarkdownScreenshotAssets, getMidsceneRecorderEventDescription, getMidsceneRecorderScreenshotsForLLM, sanitizeMidsceneRecorderFileName, stringifyMidsceneRecorderTargetBlock };
|
|
249
|
+
export { DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS, buildMidsceneRecorderActionSummary, buildMidsceneRecorderReplayInstruction, createMidsceneRecorderMarkdownScreenshotAssets, getMidsceneRecorderEventDescription, getMidsceneRecorderScreenshotsForLLM, getMidsceneRecorderSemantic, sanitizeMidsceneRecorderFileName, stringifyMidsceneRecorderTargetBlock };
|
package/dist/lib/env/parse-model-config.js
CHANGED
|
@@ -37,7 +37,7 @@ const external_utils_js_namespaceObject = require("../utils.js");
|
|
|
37
37
|
const external_helper_js_namespaceObject = require("./helper.js");
|
|
38
38
|
const external_init_debug_js_namespaceObject = require("./init-debug.js");
|
|
39
39
|
const MODEL_CONFIG_DOC_URL = 'https://midscenejs.com/model-common-config.html';
|
|
40
|
-
const getCurrentVersion = ()=>"1.9.7-beta-20260616025249.0";
|
|
40
|
+
const getCurrentVersion = ()=>"1.9.7";
|
|
41
41
|
const getInvalidModelFamilyMessage = (modelFamily)=>`Invalid MIDSCENE_MODEL_FAMILY value: ${modelFamily}. Current version v${getCurrentVersion()} accepts the following model families: ${external_types_js_namespaceObject.MODEL_FAMILY_VALUES.join(', ')}. You can also visit ${MODEL_CONFIG_DOC_URL} for the latest configuration information.`;
|
|
42
42
|
const KEYS_MAP = {
|
|
43
43
|
insight: external_constants_js_namespaceObject.INSIGHT_MODEL_CONFIG_KEYS,
|
package/dist/lib/img/index.js
CHANGED
|
@@ -25,6 +25,7 @@ var __webpack_exports__ = {};
|
|
|
25
25
|
__webpack_require__.r(__webpack_exports__);
|
|
26
26
|
__webpack_require__.d(__webpack_exports__, {
|
|
27
27
|
paddingToMatchBlockByBase64: ()=>external_transform_js_namespaceObject.paddingToMatchBlockByBase64,
|
|
28
|
+
inferBase64ImageFormat: ()=>external_transform_js_namespaceObject.inferBase64ImageFormat,
|
|
28
29
|
localImg2Base64: ()=>external_transform_js_namespaceObject.localImg2Base64,
|
|
29
30
|
parseBase64: ()=>external_transform_js_namespaceObject.parseBase64,
|
|
30
31
|
resizeAndConvertImgBuffer: ()=>external_transform_js_namespaceObject.resizeAndConvertImgBuffer,
|
|
@@ -36,10 +37,11 @@ __webpack_require__.d(__webpack_exports__, {
|
|
|
36
37
|
annotateRects: ()=>external_box_select_js_namespaceObject.annotateRects,
|
|
37
38
|
compositeElementInfoImg: ()=>external_box_select_js_namespaceObject.compositeElementInfoImg,
|
|
38
39
|
isValidPNGImageBuffer: ()=>external_info_js_namespaceObject.isValidPNGImageBuffer,
|
|
39
|
-
|
|
40
|
+
normalizeBase64Image: ()=>external_transform_js_namespaceObject.normalizeBase64Image,
|
|
40
41
|
compositePointMarkerImg: ()=>external_box_select_js_namespaceObject.compositePointMarkerImg,
|
|
41
|
-
|
|
42
|
+
cropByRect: ()=>external_transform_js_namespaceObject.cropByRect,
|
|
42
43
|
isValidJPEGImageBuffer: ()=>external_info_js_namespaceObject.isValidJPEGImageBuffer,
|
|
44
|
+
processImageElementInfo: ()=>external_box_select_js_namespaceObject.processImageElementInfo,
|
|
43
45
|
scaleImage: ()=>external_transform_js_namespaceObject.scaleImage,
|
|
44
46
|
zoomForGPT4o: ()=>external_transform_js_namespaceObject.zoomForGPT4o,
|
|
45
47
|
createImgBase64ByFormat: ()=>external_transform_js_namespaceObject.createImgBase64ByFormat,
|
|
@@ -56,10 +58,12 @@ exports.createImgBase64ByFormat = __webpack_exports__.createImgBase64ByFormat;
|
|
|
56
58
|
exports.cropByRect = __webpack_exports__.cropByRect;
|
|
57
59
|
exports.httpImg2Base64 = __webpack_exports__.httpImg2Base64;
|
|
58
60
|
exports.imageInfoOfBase64 = __webpack_exports__.imageInfoOfBase64;
|
|
61
|
+
exports.inferBase64ImageFormat = __webpack_exports__.inferBase64ImageFormat;
|
|
59
62
|
exports.isValidImageBuffer = __webpack_exports__.isValidImageBuffer;
|
|
60
63
|
exports.isValidJPEGImageBuffer = __webpack_exports__.isValidJPEGImageBuffer;
|
|
61
64
|
exports.isValidPNGImageBuffer = __webpack_exports__.isValidPNGImageBuffer;
|
|
62
65
|
exports.localImg2Base64 = __webpack_exports__.localImg2Base64;
|
|
66
|
+
exports.normalizeBase64Image = __webpack_exports__.normalizeBase64Image;
|
|
63
67
|
exports.paddingToMatchBlockByBase64 = __webpack_exports__.paddingToMatchBlockByBase64;
|
|
64
68
|
exports.parseBase64 = __webpack_exports__.parseBase64;
|
|
65
69
|
exports.preProcessImageUrl = __webpack_exports__.preProcessImageUrl;
|
|
@@ -78,10 +82,12 @@ for(var __rspack_i in __webpack_exports__)if (-1 === [
|
|
|
78
82
|
"cropByRect",
|
|
79
83
|
"httpImg2Base64",
|
|
80
84
|
"imageInfoOfBase64",
|
|
85
|
+
"inferBase64ImageFormat",
|
|
81
86
|
"isValidImageBuffer",
|
|
82
87
|
"isValidJPEGImageBuffer",
|
|
83
88
|
"isValidPNGImageBuffer",
|
|
84
89
|
"localImg2Base64",
|
|
90
|
+
"normalizeBase64Image",
|
|
85
91
|
"paddingToMatchBlockByBase64",
|
|
86
92
|
"parseBase64",
|
|
87
93
|
"preProcessImageUrl",
|
package/dist/lib/img/transform.js
CHANGED
|
@@ -34,6 +34,7 @@ var __webpack_exports__ = {};
|
|
|
34
34
|
__webpack_require__.r(__webpack_exports__);
|
|
35
35
|
__webpack_require__.d(__webpack_exports__, {
|
|
36
36
|
paddingToMatchBlockByBase64: ()=>paddingToMatchBlockByBase64,
|
|
37
|
+
inferBase64ImageFormat: ()=>inferBase64ImageFormat,
|
|
37
38
|
localImg2Base64: ()=>localImg2Base64,
|
|
38
39
|
photonFromBase64: ()=>photonFromBase64,
|
|
39
40
|
photonToBase64: ()=>photonToBase64,
|
|
@@ -44,6 +45,7 @@ __webpack_require__.d(__webpack_exports__, {
|
|
|
44
45
|
preProcessImageUrl: ()=>preProcessImageUrl,
|
|
45
46
|
cropByRect: ()=>cropByRect,
|
|
46
47
|
scaleImage: ()=>scaleImage,
|
|
48
|
+
normalizeBase64Image: ()=>normalizeBase64Image,
|
|
47
49
|
normalizeBase64Body: ()=>normalizeBase64Body,
|
|
48
50
|
zoomForGPT4o: ()=>zoomForGPT4o,
|
|
49
51
|
createImgBase64ByFormat: ()=>createImgBase64ByFormat,
|
|
@@ -126,7 +128,26 @@ async function resizeAndConvertImgBuffer(inputFormat, inputData, newSize) {
|
|
|
126
128
|
};
|
|
127
129
|
}
|
|
128
130
|
const normalizeBase64Body = (body)=>body.replace(/\s/g, '');
|
|
131
|
+
const base64ImageDataUrlPattern = /^data:image\/[a-zA-Z0-9.+-]+;base64,/i;
|
|
132
|
+
const inferBase64ImageFormat = (base64Body)=>{
|
|
133
|
+
if (base64Body.startsWith('iVBORw0KGgo')) return 'png';
|
|
134
|
+
return 'jpeg';
|
|
135
|
+
};
|
|
136
|
+
function detectImageMimeTypeFromBuffer(buffer) {
|
|
137
|
+
if (buffer.length >= 8 && 0x89 === buffer[0] && 0x50 === buffer[1] && 0x4e === buffer[2] && 0x47 === buffer[3] && 0x0d === buffer[4] && 0x0a === buffer[5] && 0x1a === buffer[6] && 0x0a === buffer[7]) return 'image/png';
|
|
138
|
+
if (buffer.length >= 3 && 0xff === buffer[0] && 0xd8 === buffer[1] && 0xff === buffer[2]) return 'image/jpeg';
|
|
139
|
+
if (buffer.length >= 6 && 'GIF' === buffer.subarray(0, 3).toString('ascii')) return 'image/gif';
|
|
140
|
+
if (buffer.length >= 12 && 'RIFF' === buffer.subarray(0, 4).toString('ascii') && 'WEBP' === buffer.subarray(8, 12).toString('ascii')) return 'image/webp';
|
|
141
|
+
if (buffer.length >= 2 && 0x42 === buffer[0] && 0x4d === buffer[1]) return 'image/bmp';
|
|
142
|
+
}
|
|
129
143
|
const createImgBase64ByFormat = (format, body)=>`data:image/${format};base64,${normalizeBase64Body(body)}`;
|
|
144
|
+
const normalizeBase64Image = (base64)=>{
|
|
145
|
+
const trimmedBase64 = base64.trim();
|
|
146
|
+
if (base64ImageDataUrlPattern.test(trimmedBase64)) return trimmedBase64;
|
|
147
|
+
const base64Body = normalizeBase64Body(trimmedBase64);
|
|
148
|
+
external_node_assert_default()(base64Body, 'base64 image must include image data');
|
|
149
|
+
return createImgBase64ByFormat(inferBase64ImageFormat(base64Body), base64Body);
|
|
150
|
+
};
|
|
130
151
|
async function resizeImgBase64(inputBase64, newSize) {
|
|
131
152
|
const { body, mimeType } = parseBase64(inputBase64);
|
|
132
153
|
const imageBuffer = external_node_buffer_namespaceObject.Buffer.from(body, 'base64');
|
|
@@ -254,7 +275,15 @@ const parseBase64 = (fullBase64String)=>{
|
|
|
254
275
|
try {
|
|
255
276
|
const separator = ';base64,';
|
|
256
277
|
const index = fullBase64String.indexOf(separator);
|
|
257
|
-
if (-1 === index) throw new Error('Invalid base64 string');
|
|
278
|
+
if (-1 === index) {
|
|
279
|
+
const body = normalizeBase64Body(fullBase64String);
|
|
280
|
+
const mimeType = detectImageMimeTypeFromBuffer(external_node_buffer_namespaceObject.Buffer.from(body, 'base64'));
|
|
281
|
+
if (!mimeType) throw new Error('Invalid base64 string');
|
|
282
|
+
return {
|
|
283
|
+
mimeType,
|
|
284
|
+
body
|
|
285
|
+
};
|
|
286
|
+
}
|
|
258
287
|
return {
|
|
259
288
|
mimeType: fullBase64String.slice(5, index),
|
|
260
289
|
body: normalizeBase64Body(fullBase64String.slice(index + separator.length))
|
|
@@ -325,8 +354,10 @@ async function scaleImage(imageBase64, scale) {
|
|
|
325
354
|
exports.createImgBase64ByFormat = __webpack_exports__.createImgBase64ByFormat;
|
|
326
355
|
exports.cropByRect = __webpack_exports__.cropByRect;
|
|
327
356
|
exports.httpImg2Base64 = __webpack_exports__.httpImg2Base64;
|
|
357
|
+
exports.inferBase64ImageFormat = __webpack_exports__.inferBase64ImageFormat;
|
|
328
358
|
exports.localImg2Base64 = __webpack_exports__.localImg2Base64;
|
|
329
359
|
exports.normalizeBase64Body = __webpack_exports__.normalizeBase64Body;
|
|
360
|
+
exports.normalizeBase64Image = __webpack_exports__.normalizeBase64Image;
|
|
330
361
|
exports.paddingToMatchBlock = __webpack_exports__.paddingToMatchBlock;
|
|
331
362
|
exports.paddingToMatchBlockByBase64 = __webpack_exports__.paddingToMatchBlockByBase64;
|
|
332
363
|
exports.parseBase64 = __webpack_exports__.parseBase64;
|
|
@@ -342,8 +373,10 @@ for(var __rspack_i in __webpack_exports__)if (-1 === [
|
|
|
342
373
|
"createImgBase64ByFormat",
|
|
343
374
|
"cropByRect",
|
|
344
375
|
"httpImg2Base64",
|
|
376
|
+
"inferBase64ImageFormat",
|
|
345
377
|
"localImg2Base64",
|
|
346
378
|
"normalizeBase64Body",
|
|
379
|
+
"normalizeBase64Image",
|
|
347
380
|
"paddingToMatchBlock",
|
|
348
381
|
"paddingToMatchBlockByBase64",
|
|
349
382
|
"parseBase64",
|
package/dist/lib/recorder.js
CHANGED
|
@@ -25,9 +25,12 @@ var __webpack_exports__ = {};
|
|
|
25
25
|
__webpack_require__.r(__webpack_exports__);
|
|
26
26
|
__webpack_require__.d(__webpack_exports__, {
|
|
27
27
|
DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS: ()=>DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS,
|
|
28
|
+
buildMidsceneRecorderActionSummary: ()=>buildMidsceneRecorderActionSummary,
|
|
29
|
+
buildMidsceneRecorderReplayInstruction: ()=>buildMidsceneRecorderReplayInstruction,
|
|
28
30
|
createMidsceneRecorderMarkdownScreenshotAssets: ()=>createMidsceneRecorderMarkdownScreenshotAssets,
|
|
29
31
|
getMidsceneRecorderEventDescription: ()=>getMidsceneRecorderEventDescription,
|
|
30
32
|
getMidsceneRecorderScreenshotsForLLM: ()=>getMidsceneRecorderScreenshotsForLLM,
|
|
33
|
+
getMidsceneRecorderSemantic: ()=>getMidsceneRecorderSemantic,
|
|
31
34
|
sanitizeMidsceneRecorderFileName: ()=>sanitizeMidsceneRecorderFileName,
|
|
32
35
|
stringifyMidsceneRecorderTargetBlock: ()=>stringifyMidsceneRecorderTargetBlock
|
|
33
36
|
});
|
|
@@ -35,10 +38,87 @@ const DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS = 20;
|
|
|
35
38
|
function isMidsceneRecorderPendingDescription(value) {
|
|
36
39
|
return value?.trim() === 'AI is analyzing element...';
|
|
37
40
|
}
|
|
41
|
+
function getMidsceneRecorderSemantic(event) {
|
|
42
|
+
return event.semantic;
|
|
43
|
+
}
|
|
44
|
+
function getRecorderPointerActionVerb(actionType) {
|
|
45
|
+
switch(actionType){
|
|
46
|
+
case 'Tap':
|
|
47
|
+
return 'Tap';
|
|
48
|
+
case 'DoubleClick':
|
|
49
|
+
return 'Double click';
|
|
50
|
+
case 'LongPress':
|
|
51
|
+
return 'Long press';
|
|
52
|
+
case 'RightClick':
|
|
53
|
+
return 'Right click';
|
|
54
|
+
default:
|
|
55
|
+
return 'Click';
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
function getRecorderDragActionVerb(actionType) {
|
|
59
|
+
switch(actionType){
|
|
60
|
+
case 'Swipe':
|
|
61
|
+
return 'Swipe';
|
|
62
|
+
case 'DragAndDrop':
|
|
63
|
+
return 'Drag';
|
|
64
|
+
default:
|
|
65
|
+
return 'Drag';
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
function buildMidsceneRecorderReplayInstruction(event, elementDescription) {
|
|
69
|
+
switch(event.type){
|
|
70
|
+
case 'navigation':
|
|
71
|
+
if ('Stop' === event.actionType) return 'Stop loading the current page.';
|
|
72
|
+
if ('GoBack' === event.actionType) return 'Go back in the browser.';
|
|
73
|
+
if ('GoForward' === event.actionType) return 'Go forward in the browser.';
|
|
74
|
+
if ('Reload' === event.actionType) return 'Reload the current page.';
|
|
75
|
+
if ('NavigationChanged' === event.actionType && event.url) return `Wait for navigation to complete at \`${event.url}\`.`;
|
|
76
|
+
return event.url ? `Navigate to \`${event.url}\`.` : `Navigate using ${elementDescription}.`;
|
|
77
|
+
case 'scroll':
|
|
78
|
+
return event.scrollDestinationDescription ? `Scroll the page/region with description "${elementDescription}" by value "${event.value || 'down'}" until "${event.scrollDestinationDescription}" is visible.` : `Scroll the page/region with description "${elementDescription}" by value "${event.value || 'down'}".`;
|
|
79
|
+
case 'drag':
|
|
80
|
+
{
|
|
81
|
+
const verb = getRecorderDragActionVerb(event.actionType);
|
|
82
|
+
return `${verb} through the area described as "${elementDescription}".`;
|
|
83
|
+
}
|
|
84
|
+
case 'input':
|
|
85
|
+
return `Input "${event.value || ''}" into the element described as "${elementDescription}".`;
|
|
86
|
+
case 'keydown':
|
|
87
|
+
return `Press "${event.value || 'the recorded key'}" on the element described as "${elementDescription}".`;
|
|
88
|
+
default:
|
|
89
|
+
{
|
|
90
|
+
const verb = getRecorderPointerActionVerb(event.actionType);
|
|
91
|
+
if ('Long press' === verb) return `${verb} the element described as "${elementDescription}".`;
|
|
92
|
+
return `${verb} on the element described as "${elementDescription}".`;
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
function buildMidsceneRecorderActionSummary(event, elementDescription) {
|
|
97
|
+
switch(event.type){
|
|
98
|
+
case 'navigation':
|
|
99
|
+
if ('Stop' === event.actionType) return 'Stop page loading';
|
|
100
|
+
if ('GoBack' === event.actionType) return 'Go back';
|
|
101
|
+
if ('GoForward' === event.actionType) return 'Go forward';
|
|
102
|
+
if ('Reload' === event.actionType) return 'Reload page';
|
|
103
|
+
if ('NavigationChanged' === event.actionType && event.url) return `Wait for navigation to complete at ${event.url}`;
|
|
104
|
+
return event.url ? `Navigate to ${event.url}` : 'Navigate';
|
|
105
|
+
case 'scroll':
|
|
106
|
+
return event.scrollDestinationDescription ? `Scroll ${elementDescription} toward ${event.scrollDestinationDescription}` : `Scroll ${elementDescription}`;
|
|
107
|
+
case 'drag':
|
|
108
|
+
return `${getRecorderDragActionVerb(event.actionType)} ${elementDescription}`;
|
|
109
|
+
case 'input':
|
|
110
|
+
return `Input into ${elementDescription}`;
|
|
111
|
+
case 'keydown':
|
|
112
|
+
return `Press ${event.value || 'key'} on ${elementDescription}`;
|
|
113
|
+
default:
|
|
114
|
+
return `${getRecorderPointerActionVerb(event.actionType)} ${elementDescription}`;
|
|
115
|
+
}
|
|
116
|
+
}
|
|
38
117
|
function getMidsceneRecorderEventDescription(event) {
|
|
39
|
-
|
|
40
|
-
if (
|
|
41
|
-
if (
|
|
118
|
+
const semantic = getMidsceneRecorderSemantic(event);
|
|
119
|
+
if (semantic?.actionSummary && !isMidsceneRecorderPendingDescription(semantic.actionSummary)) return semantic.actionSummary;
|
|
120
|
+
if (semantic?.elementDescription && !isMidsceneRecorderPendingDescription(semantic.elementDescription)) return semantic.elementDescription;
|
|
121
|
+
if (semantic?.replayInstruction && !isMidsceneRecorderPendingDescription(semantic.replayInstruction)) return semantic.replayInstruction;
|
|
42
122
|
if ('navigation' === event.type && event.url) return `Navigate to ${event.url}`;
|
|
43
123
|
if (event.value) return event.actionType ? `${event.actionType} ${event.value}` : event.value;
|
|
44
124
|
if (event.elementRect?.x !== void 0 && event.elementRect?.y !== void 0) {
|
|
@@ -87,10 +167,12 @@ function getRecorderEventScreenshot(event) {
|
|
|
87
167
|
return event.screenshotWithBox || event.screenshotAfter || event.screenshotBefore;
|
|
88
168
|
}
|
|
89
169
|
function hasCoordinateFallback(event) {
|
|
90
|
-
|
|
170
|
+
const semantic = getMidsceneRecorderSemantic(event);
|
|
171
|
+
return !semantic?.elementDescription && event.elementRect?.x !== void 0 && event.elementRect?.y !== void 0;
|
|
91
172
|
}
|
|
92
173
|
function shouldIncludeMarkdownScreenshot(event, eventIndex, lastEventIndex) {
|
|
93
|
-
|
|
174
|
+
const semantic = getMidsceneRecorderSemantic(event);
|
|
175
|
+
return 0 === eventIndex || eventIndex === lastEventIndex || 'navigation' === event.type || 'scroll' === event.type || 'input' === event.type || Boolean(event.screenshotWithBox) || !semantic?.elementDescription || hasCoordinateFallback(event);
|
|
94
176
|
}
|
|
95
177
|
function getRecorderScreenshotCandidatePriority(candidate, firstEventIndex, lastEventIndex) {
|
|
96
178
|
const event = candidate.event;
|
|
@@ -99,9 +181,10 @@ function getRecorderScreenshotCandidatePriority(candidate, firstEventIndex, last
|
|
|
99
181
|
if (candidate.eventIndex === lastEventIndex) priority += 95;
|
|
100
182
|
if ('navigation' === event.type) priority += 80;
|
|
101
183
|
if (event.screenshotWithBox) priority += 70;
|
|
102
|
-
|
|
184
|
+
const semantic = getMidsceneRecorderSemantic(event);
|
|
185
|
+
if (semantic?.source === 'heuristic' || semantic?.confidence === 'low' || semantic?.error) priority += 60;
|
|
103
186
|
if ('input' === event.type || 'scroll' === event.type) priority += 40;
|
|
104
|
-
if (!
|
|
187
|
+
if (!semantic?.elementDescription || hasCoordinateFallback(event)) priority += 30;
|
|
105
188
|
return priority;
|
|
106
189
|
}
|
|
107
190
|
function selectEvenlyDistributedCandidates(candidates, count) {
|
|
@@ -200,16 +283,22 @@ function stringifyMidsceneRecorderTargetBlock(target) {
|
|
|
200
283
|
return lines.join('\n');
|
|
201
284
|
}
|
|
202
285
|
exports.DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS = __webpack_exports__.DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS;
|
|
286
|
+
exports.buildMidsceneRecorderActionSummary = __webpack_exports__.buildMidsceneRecorderActionSummary;
|
|
287
|
+
exports.buildMidsceneRecorderReplayInstruction = __webpack_exports__.buildMidsceneRecorderReplayInstruction;
|
|
203
288
|
exports.createMidsceneRecorderMarkdownScreenshotAssets = __webpack_exports__.createMidsceneRecorderMarkdownScreenshotAssets;
|
|
204
289
|
exports.getMidsceneRecorderEventDescription = __webpack_exports__.getMidsceneRecorderEventDescription;
|
|
205
290
|
exports.getMidsceneRecorderScreenshotsForLLM = __webpack_exports__.getMidsceneRecorderScreenshotsForLLM;
|
|
291
|
+
exports.getMidsceneRecorderSemantic = __webpack_exports__.getMidsceneRecorderSemantic;
|
|
206
292
|
exports.sanitizeMidsceneRecorderFileName = __webpack_exports__.sanitizeMidsceneRecorderFileName;
|
|
207
293
|
exports.stringifyMidsceneRecorderTargetBlock = __webpack_exports__.stringifyMidsceneRecorderTargetBlock;
|
|
208
294
|
for(var __rspack_i in __webpack_exports__)if (-1 === [
|
|
209
295
|
"DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS",
|
|
296
|
+
"buildMidsceneRecorderActionSummary",
|
|
297
|
+
"buildMidsceneRecorderReplayInstruction",
|
|
210
298
|
"createMidsceneRecorderMarkdownScreenshotAssets",
|
|
211
299
|
"getMidsceneRecorderEventDescription",
|
|
212
300
|
"getMidsceneRecorderScreenshotsForLLM",
|
|
301
|
+
"getMidsceneRecorderSemantic",
|
|
213
302
|
"sanitizeMidsceneRecorderFileName",
|
|
214
303
|
"stringifyMidsceneRecorderTargetBlock"
|
|
215
304
|
].indexOf(__rspack_i)) exports[__rspack_i] = __webpack_exports__[__rspack_i];
|
|
@@ -1,3 +1,3 @@
|
|
|
1
1
|
export { imageInfoOfBase64, isValidPNGImageBuffer, isValidJPEGImageBuffer, isValidImageBuffer, validateScreenshotBuffer, type ValidateScreenshotBufferOptions, } from './info';
|
|
2
|
-
export { resizeAndConvertImgBuffer, resizeImgBase64, zoomForGPT4o, saveBase64Image, paddingToMatchBlockByBase64, cropByRect, scaleImage, localImg2Base64, httpImg2Base64, preProcessImageUrl, parseBase64, createImgBase64ByFormat, } from './transform';
|
|
2
|
+
export { resizeAndConvertImgBuffer, resizeImgBase64, zoomForGPT4o, saveBase64Image, paddingToMatchBlockByBase64, cropByRect, scaleImage, localImg2Base64, httpImg2Base64, preProcessImageUrl, parseBase64, createImgBase64ByFormat, inferBase64ImageFormat, normalizeBase64Image, } from './transform';
|
|
3
3
|
export { processImageElementInfo, compositeElementInfoImg, compositePointMarkerImg, annotateRects, } from './box-select';
|
|
@@ -27,7 +27,9 @@ export declare function resizeAndConvertImgBuffer(inputFormat: string, inputData
|
|
|
27
27
|
format: string;
|
|
28
28
|
}>;
|
|
29
29
|
export declare const normalizeBase64Body: (body: string) => string;
|
|
30
|
+
export declare const inferBase64ImageFormat: (base64Body: string) => "jpeg" | "png";
|
|
30
31
|
export declare const createImgBase64ByFormat: (format: string, body: string) => string;
|
|
32
|
+
export declare const normalizeBase64Image: (base64: string) => string;
|
|
31
33
|
export declare function resizeImgBase64(inputBase64: string, newSize: {
|
|
32
34
|
width: number;
|
|
33
35
|
height: number;
|
package/dist/types/recorder.d.ts
CHANGED
|
@@ -25,16 +25,43 @@ export interface MidsceneRecorderEvent {
|
|
|
25
25
|
pageInfo: MidsceneRecorderPageInfo;
|
|
26
26
|
screenshotBefore?: string;
|
|
27
27
|
screenshotAfter?: string;
|
|
28
|
+
semantic?: MidsceneRecorderSemantic;
|
|
28
29
|
elementDescription?: string;
|
|
29
|
-
replayInstruction?: string;
|
|
30
|
-
actionSummary?: string;
|
|
31
|
-
semanticConfidence?: 'high' | 'medium' | 'low';
|
|
32
30
|
descriptionLoading?: boolean;
|
|
33
|
-
descriptionSource?: 'ai' | 'fallback';
|
|
34
|
-
descriptionError?: string;
|
|
35
31
|
screenshotWithBox?: string;
|
|
36
32
|
timestamp: number;
|
|
37
33
|
hashId: string;
|
|
34
|
+
mergedHashIds?: string[];
|
|
35
|
+
}
|
|
36
|
+
export type MidsceneRecorderSemanticSource = 'aiDescribe' | 'recorderAI' | 'heuristic';
|
|
37
|
+
export type MidsceneRecorderSemanticStatus = 'pending' | 'ready' | 'failed';
|
|
38
|
+
export type MidsceneRecorderSemanticConfidence = 'high' | 'medium' | 'low';
|
|
39
|
+
export interface MidsceneRecorderSemanticAiDescribe {
|
|
40
|
+
verifyPrompt: boolean;
|
|
41
|
+
verifyPassed?: boolean;
|
|
42
|
+
deepLocate?: boolean;
|
|
43
|
+
centerDistance?: number;
|
|
44
|
+
expectedCenter?: [number, number];
|
|
45
|
+
actualCenter?: [number, number];
|
|
46
|
+
annotatedScreenshotPath?: string;
|
|
47
|
+
}
|
|
48
|
+
export interface MidsceneRecorderSemantic {
|
|
49
|
+
source: MidsceneRecorderSemanticSource;
|
|
50
|
+
status: MidsceneRecorderSemanticStatus;
|
|
51
|
+
elementDescription?: string;
|
|
52
|
+
replayInstruction?: string;
|
|
53
|
+
actionSummary?: string;
|
|
54
|
+
confidence?: MidsceneRecorderSemanticConfidence;
|
|
55
|
+
error?: string;
|
|
56
|
+
aiDescribe?: MidsceneRecorderSemanticAiDescribe;
|
|
57
|
+
fallbackFrom?: MidsceneRecorderSemantic;
|
|
58
|
+
}
|
|
59
|
+
export interface MidsceneRecorderSemanticAction {
|
|
60
|
+
type: MidsceneRecorderEventType;
|
|
61
|
+
actionType?: string;
|
|
62
|
+
value?: string;
|
|
63
|
+
url?: string;
|
|
64
|
+
scrollDestinationDescription?: string;
|
|
38
65
|
}
|
|
39
66
|
export interface MidsceneRecorderTarget {
|
|
40
67
|
platformId: MidsceneRecorderPlatformId;
|
|
@@ -62,6 +89,9 @@ export interface MidsceneRecorderMarkdownScreenshotOptions {
|
|
|
62
89
|
maxScreenshots?: number;
|
|
63
90
|
}
|
|
64
91
|
export declare const DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS = 20;
|
|
92
|
+
export declare function getMidsceneRecorderSemantic(event: Pick<MidsceneRecorderEvent, 'semantic'>): MidsceneRecorderSemantic | undefined;
|
|
93
|
+
export declare function buildMidsceneRecorderReplayInstruction(event: MidsceneRecorderSemanticAction, elementDescription: string): string;
|
|
94
|
+
export declare function buildMidsceneRecorderActionSummary(event: MidsceneRecorderSemanticAction, elementDescription: string): string;
|
|
65
95
|
export declare function getMidsceneRecorderEventDescription(event: MidsceneRecorderEvent): string;
|
|
66
96
|
export declare function getMidsceneRecorderScreenshotsForLLM(events: MidsceneRecorderEvent[], maxScreenshots?: number): string[];
|
|
67
97
|
export declare function sanitizeMidsceneRecorderFileName(value: string): string;
|
package/package.json
CHANGED
package/src/img/index.ts
CHANGED
package/src/img/transform.ts
CHANGED
|
@@ -156,10 +156,71 @@ export async function resizeAndConvertImgBuffer(
|
|
|
156
156
|
|
|
157
157
|
export const normalizeBase64Body = (body: string) => body.replace(/\s/g, '');
|
|
158
158
|
|
|
159
|
+
const base64ImageDataUrlPattern = /^data:image\/[a-zA-Z0-9.+-]+;base64,/i;
|
|
160
|
+
|
|
161
|
+
export const inferBase64ImageFormat = (base64Body: string) => {
|
|
162
|
+
if (base64Body.startsWith('iVBORw0KGgo')) {
|
|
163
|
+
return 'png';
|
|
164
|
+
}
|
|
165
|
+
return 'jpeg';
|
|
166
|
+
};
|
|
167
|
+
|
|
168
|
+
function detectImageMimeTypeFromBuffer(buffer: Buffer): string | undefined {
|
|
169
|
+
if (
|
|
170
|
+
buffer.length >= 8 &&
|
|
171
|
+
buffer[0] === 0x89 &&
|
|
172
|
+
buffer[1] === 0x50 &&
|
|
173
|
+
buffer[2] === 0x4e &&
|
|
174
|
+
buffer[3] === 0x47 &&
|
|
175
|
+
buffer[4] === 0x0d &&
|
|
176
|
+
buffer[5] === 0x0a &&
|
|
177
|
+
buffer[6] === 0x1a &&
|
|
178
|
+
buffer[7] === 0x0a
|
|
179
|
+
) {
|
|
180
|
+
return 'image/png';
|
|
181
|
+
}
|
|
182
|
+
if (
|
|
183
|
+
buffer.length >= 3 &&
|
|
184
|
+
buffer[0] === 0xff &&
|
|
185
|
+
buffer[1] === 0xd8 &&
|
|
186
|
+
buffer[2] === 0xff
|
|
187
|
+
) {
|
|
188
|
+
return 'image/jpeg';
|
|
189
|
+
}
|
|
190
|
+
if (buffer.length >= 6 && buffer.subarray(0, 3).toString('ascii') === 'GIF') {
|
|
191
|
+
return 'image/gif';
|
|
192
|
+
}
|
|
193
|
+
if (
|
|
194
|
+
buffer.length >= 12 &&
|
|
195
|
+
buffer.subarray(0, 4).toString('ascii') === 'RIFF' &&
|
|
196
|
+
buffer.subarray(8, 12).toString('ascii') === 'WEBP'
|
|
197
|
+
) {
|
|
198
|
+
return 'image/webp';
|
|
199
|
+
}
|
|
200
|
+
if (buffer.length >= 2 && buffer[0] === 0x42 && buffer[1] === 0x4d) {
|
|
201
|
+
return 'image/bmp';
|
|
202
|
+
}
|
|
203
|
+
return undefined;
|
|
204
|
+
}
|
|
205
|
+
|
|
159
206
|
export const createImgBase64ByFormat = (format: string, body: string) => {
|
|
160
207
|
return `data:image/${format};base64,${normalizeBase64Body(body)}`;
|
|
161
208
|
};
|
|
162
209
|
|
|
210
|
+
export const normalizeBase64Image = (base64: string) => {
|
|
211
|
+
const trimmedBase64 = base64.trim();
|
|
212
|
+
if (base64ImageDataUrlPattern.test(trimmedBase64)) {
|
|
213
|
+
return trimmedBase64;
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
const base64Body = normalizeBase64Body(trimmedBase64);
|
|
217
|
+
assert(base64Body, 'base64 image must include image data');
|
|
218
|
+
return createImgBase64ByFormat(
|
|
219
|
+
inferBase64ImageFormat(base64Body),
|
|
220
|
+
base64Body,
|
|
221
|
+
);
|
|
222
|
+
};
|
|
223
|
+
|
|
163
224
|
export async function resizeImgBase64(
|
|
164
225
|
inputBase64: string,
|
|
165
226
|
newSize: {
|
|
@@ -411,7 +472,14 @@ export const parseBase64 = (
|
|
|
411
472
|
const separator = ';base64,';
|
|
412
473
|
const index = fullBase64String.indexOf(separator);
|
|
413
474
|
if (index === -1) {
|
|
414
|
-
|
|
475
|
+
const body = normalizeBase64Body(fullBase64String);
|
|
476
|
+
const mimeType = detectImageMimeTypeFromBuffer(
|
|
477
|
+
Buffer.from(body, 'base64'),
|
|
478
|
+
);
|
|
479
|
+
if (!mimeType) {
|
|
480
|
+
throw new Error('Invalid base64 string');
|
|
481
|
+
}
|
|
482
|
+
return { mimeType, body };
|
|
415
483
|
}
|
|
416
484
|
return {
|
|
417
485
|
// 5 means 'data:'
|
package/src/recorder.ts
CHANGED
|
@@ -46,16 +46,52 @@ export interface MidsceneRecorderEvent {
|
|
|
46
46
|
pageInfo: MidsceneRecorderPageInfo;
|
|
47
47
|
screenshotBefore?: string;
|
|
48
48
|
screenshotAfter?: string;
|
|
49
|
+
semantic?: MidsceneRecorderSemantic;
|
|
49
50
|
elementDescription?: string;
|
|
50
|
-
replayInstruction?: string;
|
|
51
|
-
actionSummary?: string;
|
|
52
|
-
semanticConfidence?: 'high' | 'medium' | 'low';
|
|
53
51
|
descriptionLoading?: boolean;
|
|
54
|
-
descriptionSource?: 'ai' | 'fallback';
|
|
55
|
-
descriptionError?: string;
|
|
56
52
|
screenshotWithBox?: string;
|
|
57
53
|
timestamp: number;
|
|
58
54
|
hashId: string;
|
|
55
|
+
mergedHashIds?: string[];
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
export type MidsceneRecorderSemanticSource =
|
|
59
|
+
| 'aiDescribe'
|
|
60
|
+
| 'recorderAI'
|
|
61
|
+
| 'heuristic';
|
|
62
|
+
|
|
63
|
+
export type MidsceneRecorderSemanticStatus = 'pending' | 'ready' | 'failed';
|
|
64
|
+
|
|
65
|
+
export type MidsceneRecorderSemanticConfidence = 'high' | 'medium' | 'low';
|
|
66
|
+
|
|
67
|
+
export interface MidsceneRecorderSemanticAiDescribe {
|
|
68
|
+
verifyPrompt: boolean;
|
|
69
|
+
verifyPassed?: boolean;
|
|
70
|
+
deepLocate?: boolean;
|
|
71
|
+
centerDistance?: number;
|
|
72
|
+
expectedCenter?: [number, number];
|
|
73
|
+
actualCenter?: [number, number];
|
|
74
|
+
annotatedScreenshotPath?: string;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
export interface MidsceneRecorderSemantic {
|
|
78
|
+
source: MidsceneRecorderSemanticSource;
|
|
79
|
+
status: MidsceneRecorderSemanticStatus;
|
|
80
|
+
elementDescription?: string;
|
|
81
|
+
replayInstruction?: string;
|
|
82
|
+
actionSummary?: string;
|
|
83
|
+
confidence?: MidsceneRecorderSemanticConfidence;
|
|
84
|
+
error?: string;
|
|
85
|
+
aiDescribe?: MidsceneRecorderSemanticAiDescribe;
|
|
86
|
+
fallbackFrom?: MidsceneRecorderSemantic;
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
export interface MidsceneRecorderSemanticAction {
|
|
90
|
+
type: MidsceneRecorderEventType;
|
|
91
|
+
actionType?: string;
|
|
92
|
+
value?: string;
|
|
93
|
+
url?: string;
|
|
94
|
+
scrollDestinationDescription?: string;
|
|
59
95
|
}
|
|
60
96
|
|
|
61
97
|
export interface MidsceneRecorderTarget {
|
|
@@ -93,26 +129,142 @@ function isMidsceneRecorderPendingDescription(value?: string) {
|
|
|
93
129
|
return value?.trim() === 'AI is analyzing element...';
|
|
94
130
|
}
|
|
95
131
|
|
|
132
|
+
export function getMidsceneRecorderSemantic(
|
|
133
|
+
event: Pick<MidsceneRecorderEvent, 'semantic'>,
|
|
134
|
+
) {
|
|
135
|
+
return event.semantic;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
function getRecorderPointerActionVerb(actionType?: string) {
|
|
139
|
+
switch (actionType) {
|
|
140
|
+
case 'Tap':
|
|
141
|
+
return 'Tap';
|
|
142
|
+
case 'DoubleClick':
|
|
143
|
+
return 'Double click';
|
|
144
|
+
case 'LongPress':
|
|
145
|
+
return 'Long press';
|
|
146
|
+
case 'RightClick':
|
|
147
|
+
return 'Right click';
|
|
148
|
+
default:
|
|
149
|
+
return 'Click';
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
function getRecorderDragActionVerb(actionType?: string) {
|
|
154
|
+
switch (actionType) {
|
|
155
|
+
case 'Swipe':
|
|
156
|
+
return 'Swipe';
|
|
157
|
+
case 'DragAndDrop':
|
|
158
|
+
return 'Drag';
|
|
159
|
+
default:
|
|
160
|
+
return 'Drag';
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
export function buildMidsceneRecorderReplayInstruction(
|
|
165
|
+
event: MidsceneRecorderSemanticAction,
|
|
166
|
+
elementDescription: string,
|
|
167
|
+
) {
|
|
168
|
+
switch (event.type) {
|
|
169
|
+
case 'navigation':
|
|
170
|
+
if (event.actionType === 'Stop') {
|
|
171
|
+
return 'Stop loading the current page.';
|
|
172
|
+
}
|
|
173
|
+
if (event.actionType === 'GoBack') {
|
|
174
|
+
return 'Go back in the browser.';
|
|
175
|
+
}
|
|
176
|
+
if (event.actionType === 'GoForward') {
|
|
177
|
+
return 'Go forward in the browser.';
|
|
178
|
+
}
|
|
179
|
+
if (event.actionType === 'Reload') {
|
|
180
|
+
return 'Reload the current page.';
|
|
181
|
+
}
|
|
182
|
+
if (event.actionType === 'NavigationChanged' && event.url) {
|
|
183
|
+
return `Wait for navigation to complete at \`${event.url}\`.`;
|
|
184
|
+
}
|
|
185
|
+
return event.url
|
|
186
|
+
? `Navigate to \`${event.url}\`.`
|
|
187
|
+
: `Navigate using ${elementDescription}.`;
|
|
188
|
+
case 'scroll':
|
|
189
|
+
return event.scrollDestinationDescription
|
|
190
|
+
? `Scroll the page/region with description "${elementDescription}" by value "${event.value || 'down'}" until "${event.scrollDestinationDescription}" is visible.`
|
|
191
|
+
: `Scroll the page/region with description "${elementDescription}" by value "${event.value || 'down'}".`;
|
|
192
|
+
case 'drag': {
|
|
193
|
+
const verb = getRecorderDragActionVerb(event.actionType);
|
|
194
|
+
return `${verb} through the area described as "${elementDescription}".`;
|
|
195
|
+
}
|
|
196
|
+
case 'input':
|
|
197
|
+
return `Input "${event.value || ''}" into the element described as "${elementDescription}".`;
|
|
198
|
+
case 'keydown':
|
|
199
|
+
return `Press "${event.value || 'the recorded key'}" on the element described as "${elementDescription}".`;
|
|
200
|
+
default: {
|
|
201
|
+
const verb = getRecorderPointerActionVerb(event.actionType);
|
|
202
|
+
if (verb === 'Long press') {
|
|
203
|
+
return `${verb} the element described as "${elementDescription}".`;
|
|
204
|
+
}
|
|
205
|
+
return `${verb} on the element described as "${elementDescription}".`;
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
export function buildMidsceneRecorderActionSummary(
|
|
211
|
+
event: MidsceneRecorderSemanticAction,
|
|
212
|
+
elementDescription: string,
|
|
213
|
+
) {
|
|
214
|
+
switch (event.type) {
|
|
215
|
+
case 'navigation':
|
|
216
|
+
if (event.actionType === 'Stop') {
|
|
217
|
+
return 'Stop page loading';
|
|
218
|
+
}
|
|
219
|
+
if (event.actionType === 'GoBack') {
|
|
220
|
+
return 'Go back';
|
|
221
|
+
}
|
|
222
|
+
if (event.actionType === 'GoForward') {
|
|
223
|
+
return 'Go forward';
|
|
224
|
+
}
|
|
225
|
+
if (event.actionType === 'Reload') {
|
|
226
|
+
return 'Reload page';
|
|
227
|
+
}
|
|
228
|
+
if (event.actionType === 'NavigationChanged' && event.url) {
|
|
229
|
+
return `Wait for navigation to complete at ${event.url}`;
|
|
230
|
+
}
|
|
231
|
+
return event.url ? `Navigate to ${event.url}` : 'Navigate';
|
|
232
|
+
case 'scroll':
|
|
233
|
+
return event.scrollDestinationDescription
|
|
234
|
+
? `Scroll ${elementDescription} toward ${event.scrollDestinationDescription}`
|
|
235
|
+
: `Scroll ${elementDescription}`;
|
|
236
|
+
case 'drag':
|
|
237
|
+
return `${getRecorderDragActionVerb(event.actionType)} ${elementDescription}`;
|
|
238
|
+
case 'input':
|
|
239
|
+
return `Input into ${elementDescription}`;
|
|
240
|
+
case 'keydown':
|
|
241
|
+
return `Press ${event.value || 'key'} on ${elementDescription}`;
|
|
242
|
+
default:
|
|
243
|
+
return `${getRecorderPointerActionVerb(event.actionType)} ${elementDescription}`;
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
|
|
96
247
|
export function getMidsceneRecorderEventDescription(
|
|
97
248
|
event: MidsceneRecorderEvent,
|
|
98
249
|
) {
|
|
250
|
+
const semantic = getMidsceneRecorderSemantic(event);
|
|
99
251
|
if (
|
|
100
|
-
|
|
101
|
-
!isMidsceneRecorderPendingDescription(
|
|
252
|
+
semantic?.actionSummary &&
|
|
253
|
+
!isMidsceneRecorderPendingDescription(semantic.actionSummary)
|
|
102
254
|
) {
|
|
103
|
-
return
|
|
255
|
+
return semantic.actionSummary;
|
|
104
256
|
}
|
|
105
257
|
if (
|
|
106
|
-
|
|
107
|
-
!isMidsceneRecorderPendingDescription(
|
|
258
|
+
semantic?.elementDescription &&
|
|
259
|
+
!isMidsceneRecorderPendingDescription(semantic.elementDescription)
|
|
108
260
|
) {
|
|
109
|
-
return
|
|
261
|
+
return semantic.elementDescription;
|
|
110
262
|
}
|
|
111
263
|
if (
|
|
112
|
-
|
|
113
|
-
!isMidsceneRecorderPendingDescription(
|
|
264
|
+
semantic?.replayInstruction &&
|
|
265
|
+
!isMidsceneRecorderPendingDescription(semantic.replayInstruction)
|
|
114
266
|
) {
|
|
115
|
-
return
|
|
267
|
+
return semantic.replayInstruction;
|
|
116
268
|
}
|
|
117
269
|
if (event.type === 'navigation' && event.url) {
|
|
118
270
|
return `Navigate to ${event.url}`;
|
|
@@ -212,8 +364,9 @@ function getRecorderEventScreenshot(event: MidsceneRecorderEvent) {
|
|
|
212
364
|
}
|
|
213
365
|
|
|
214
366
|
function hasCoordinateFallback(event: MidsceneRecorderEvent) {
|
|
367
|
+
const semantic = getMidsceneRecorderSemantic(event);
|
|
215
368
|
return (
|
|
216
|
-
!
|
|
369
|
+
!semantic?.elementDescription &&
|
|
217
370
|
event.elementRect?.x !== undefined &&
|
|
218
371
|
event.elementRect?.y !== undefined
|
|
219
372
|
);
|
|
@@ -224,6 +377,7 @@ function shouldIncludeMarkdownScreenshot(
|
|
|
224
377
|
eventIndex: number,
|
|
225
378
|
lastEventIndex: number,
|
|
226
379
|
) {
|
|
380
|
+
const semantic = getMidsceneRecorderSemantic(event);
|
|
227
381
|
return (
|
|
228
382
|
eventIndex === 0 ||
|
|
229
383
|
eventIndex === lastEventIndex ||
|
|
@@ -231,7 +385,7 @@ function shouldIncludeMarkdownScreenshot(
|
|
|
231
385
|
event.type === 'scroll' ||
|
|
232
386
|
event.type === 'input' ||
|
|
233
387
|
Boolean(event.screenshotWithBox) ||
|
|
234
|
-
!
|
|
388
|
+
!semantic?.elementDescription ||
|
|
235
389
|
hasCoordinateFallback(event)
|
|
236
390
|
);
|
|
237
391
|
}
|
|
@@ -262,17 +416,18 @@ function getRecorderScreenshotCandidatePriority(
|
|
|
262
416
|
if (event.screenshotWithBox) {
|
|
263
417
|
priority += 70;
|
|
264
418
|
}
|
|
419
|
+
const semantic = getMidsceneRecorderSemantic(event);
|
|
265
420
|
if (
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
421
|
+
semantic?.source === 'heuristic' ||
|
|
422
|
+
semantic?.confidence === 'low' ||
|
|
423
|
+
semantic?.error
|
|
269
424
|
) {
|
|
270
425
|
priority += 60;
|
|
271
426
|
}
|
|
272
427
|
if (event.type === 'input' || event.type === 'scroll') {
|
|
273
428
|
priority += 40;
|
|
274
429
|
}
|
|
275
|
-
if (!
|
|
430
|
+
if (!semantic?.elementDescription || hasCoordinateFallback(event)) {
|
|
276
431
|
priority += 30;
|
|
277
432
|
}
|
|
278
433
|
|