@midscene/shared 1.9.7-beta-20260616025249.0 → 1.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,7 @@ import { assert } from "../utils.mjs";
5
5
  import { maskConfig, parseJson } from "./helper.mjs";
6
6
  import { initDebugConfig } from "./init-debug.mjs";
7
7
  const MODEL_CONFIG_DOC_URL = 'https://midscenejs.com/model-common-config.html';
8
- const getCurrentVersion = ()=>"1.9.7-beta-20260616025249.0";
8
+ const getCurrentVersion = ()=>"1.9.7";
9
9
  const getInvalidModelFamilyMessage = (modelFamily)=>`Invalid MIDSCENE_MODEL_FAMILY value: ${modelFamily}. Current version v${getCurrentVersion()} accepts the following model families: ${MODEL_FAMILY_VALUES.join(', ')}. You can also visit ${MODEL_CONFIG_DOC_URL} for the latest configuration information.`;
10
10
  const KEYS_MAP = {
11
11
  insight: INSIGHT_MODEL_CONFIG_KEYS,
@@ -1,4 +1,4 @@
1
1
  import { imageInfoOfBase64, isValidImageBuffer, isValidJPEGImageBuffer, isValidPNGImageBuffer, validateScreenshotBuffer } from "./info.mjs";
2
- import { createImgBase64ByFormat, cropByRect, httpImg2Base64, localImg2Base64, paddingToMatchBlockByBase64, parseBase64, preProcessImageUrl, resizeAndConvertImgBuffer, resizeImgBase64, saveBase64Image, scaleImage, zoomForGPT4o } from "./transform.mjs";
2
+ import { createImgBase64ByFormat, cropByRect, httpImg2Base64, inferBase64ImageFormat, localImg2Base64, normalizeBase64Image, paddingToMatchBlockByBase64, parseBase64, preProcessImageUrl, resizeAndConvertImgBuffer, resizeImgBase64, saveBase64Image, scaleImage, zoomForGPT4o } from "./transform.mjs";
3
3
  import { annotateRects, compositeElementInfoImg, compositePointMarkerImg, processImageElementInfo } from "./box-select.mjs";
4
- export { annotateRects, compositeElementInfoImg, compositePointMarkerImg, createImgBase64ByFormat, cropByRect, httpImg2Base64, imageInfoOfBase64, isValidImageBuffer, isValidJPEGImageBuffer, isValidPNGImageBuffer, localImg2Base64, paddingToMatchBlockByBase64, parseBase64, preProcessImageUrl, processImageElementInfo, resizeAndConvertImgBuffer, resizeImgBase64, saveBase64Image, scaleImage, validateScreenshotBuffer, zoomForGPT4o };
4
+ export { annotateRects, compositeElementInfoImg, compositePointMarkerImg, createImgBase64ByFormat, cropByRect, httpImg2Base64, imageInfoOfBase64, inferBase64ImageFormat, isValidImageBuffer, isValidJPEGImageBuffer, isValidPNGImageBuffer, localImg2Base64, normalizeBase64Image, paddingToMatchBlockByBase64, parseBase64, preProcessImageUrl, processImageElementInfo, resizeAndConvertImgBuffer, resizeImgBase64, saveBase64Image, scaleImage, validateScreenshotBuffer, zoomForGPT4o };
@@ -70,7 +70,26 @@ async function resizeAndConvertImgBuffer(inputFormat, inputData, newSize) {
70
70
  };
71
71
  }
72
72
  const normalizeBase64Body = (body)=>body.replace(/\s/g, '');
73
+ const base64ImageDataUrlPattern = /^data:image\/[a-zA-Z0-9.+-]+;base64,/i;
74
+ const inferBase64ImageFormat = (base64Body)=>{
75
+ if (base64Body.startsWith('iVBORw0KGgo')) return 'png';
76
+ return 'jpeg';
77
+ };
78
+ function detectImageMimeTypeFromBuffer(buffer) {
79
+ if (buffer.length >= 8 && 0x89 === buffer[0] && 0x50 === buffer[1] && 0x4e === buffer[2] && 0x47 === buffer[3] && 0x0d === buffer[4] && 0x0a === buffer[5] && 0x1a === buffer[6] && 0x0a === buffer[7]) return 'image/png';
80
+ if (buffer.length >= 3 && 0xff === buffer[0] && 0xd8 === buffer[1] && 0xff === buffer[2]) return 'image/jpeg';
81
+ if (buffer.length >= 6 && 'GIF' === buffer.subarray(0, 3).toString('ascii')) return 'image/gif';
82
+ if (buffer.length >= 12 && 'RIFF' === buffer.subarray(0, 4).toString('ascii') && 'WEBP' === buffer.subarray(8, 12).toString('ascii')) return 'image/webp';
83
+ if (buffer.length >= 2 && 0x42 === buffer[0] && 0x4d === buffer[1]) return 'image/bmp';
84
+ }
73
85
  const createImgBase64ByFormat = (format, body)=>`data:image/${format};base64,${normalizeBase64Body(body)}`;
86
+ const normalizeBase64Image = (base64)=>{
87
+ const trimmedBase64 = base64.trim();
88
+ if (base64ImageDataUrlPattern.test(trimmedBase64)) return trimmedBase64;
89
+ const base64Body = normalizeBase64Body(trimmedBase64);
90
+ node_assert(base64Body, 'base64 image must include image data');
91
+ return createImgBase64ByFormat(inferBase64ImageFormat(base64Body), base64Body);
92
+ };
74
93
  async function resizeImgBase64(inputBase64, newSize) {
75
94
  const { body, mimeType } = parseBase64(inputBase64);
76
95
  const imageBuffer = Buffer.from(body, 'base64');
@@ -198,7 +217,15 @@ const parseBase64 = (fullBase64String)=>{
198
217
  try {
199
218
  const separator = ';base64,';
200
219
  const index = fullBase64String.indexOf(separator);
201
- if (-1 === index) throw new Error('Invalid base64 string');
220
+ if (-1 === index) {
221
+ const body = normalizeBase64Body(fullBase64String);
222
+ const mimeType = detectImageMimeTypeFromBuffer(Buffer.from(body, 'base64'));
223
+ if (!mimeType) throw new Error('Invalid base64 string');
224
+ return {
225
+ mimeType,
226
+ body
227
+ };
228
+ }
202
229
  return {
203
230
  mimeType: fullBase64String.slice(5, index),
204
231
  body: normalizeBase64Body(fullBase64String.slice(index + separator.length))
@@ -266,4 +293,4 @@ async function scaleImage(imageBase64, scale) {
266
293
  imageBase64: base64
267
294
  };
268
295
  }
269
- export { createImgBase64ByFormat, cropByRect, httpImg2Base64, localImg2Base64, normalizeBase64Body, paddingToMatchBlock, paddingToMatchBlockByBase64, parseBase64, photonFromBase64, photonToBase64, preProcessImageUrl, resizeAndConvertImgBuffer, resizeImgBase64, saveBase64Image, scaleImage, zoomForGPT4o };
296
+ export { createImgBase64ByFormat, cropByRect, httpImg2Base64, inferBase64ImageFormat, localImg2Base64, normalizeBase64Body, normalizeBase64Image, paddingToMatchBlock, paddingToMatchBlockByBase64, parseBase64, photonFromBase64, photonToBase64, preProcessImageUrl, resizeAndConvertImgBuffer, resizeImgBase64, saveBase64Image, scaleImage, zoomForGPT4o };
@@ -2,10 +2,87 @@ const DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS = 20;
2
2
  function isMidsceneRecorderPendingDescription(value) {
3
3
  return value?.trim() === 'AI is analyzing element...';
4
4
  }
5
+ function getMidsceneRecorderSemantic(event) {
6
+ return event.semantic;
7
+ }
8
+ function getRecorderPointerActionVerb(actionType) {
9
+ switch(actionType){
10
+ case 'Tap':
11
+ return 'Tap';
12
+ case 'DoubleClick':
13
+ return 'Double click';
14
+ case 'LongPress':
15
+ return 'Long press';
16
+ case 'RightClick':
17
+ return 'Right click';
18
+ default:
19
+ return 'Click';
20
+ }
21
+ }
22
+ function getRecorderDragActionVerb(actionType) {
23
+ switch(actionType){
24
+ case 'Swipe':
25
+ return 'Swipe';
26
+ case 'DragAndDrop':
27
+ return 'Drag';
28
+ default:
29
+ return 'Drag';
30
+ }
31
+ }
32
+ function buildMidsceneRecorderReplayInstruction(event, elementDescription) {
33
+ switch(event.type){
34
+ case 'navigation':
35
+ if ('Stop' === event.actionType) return 'Stop loading the current page.';
36
+ if ('GoBack' === event.actionType) return 'Go back in the browser.';
37
+ if ('GoForward' === event.actionType) return 'Go forward in the browser.';
38
+ if ('Reload' === event.actionType) return 'Reload the current page.';
39
+ if ('NavigationChanged' === event.actionType && event.url) return `Wait for navigation to complete at \`${event.url}\`.`;
40
+ return event.url ? `Navigate to \`${event.url}\`.` : `Navigate using ${elementDescription}.`;
41
+ case 'scroll':
42
+ return event.scrollDestinationDescription ? `Scroll the page/region with description "${elementDescription}" by value "${event.value || 'down'}" until "${event.scrollDestinationDescription}" is visible.` : `Scroll the page/region with description "${elementDescription}" by value "${event.value || 'down'}".`;
43
+ case 'drag':
44
+ {
45
+ const verb = getRecorderDragActionVerb(event.actionType);
46
+ return `${verb} through the area described as "${elementDescription}".`;
47
+ }
48
+ case 'input':
49
+ return `Input "${event.value || ''}" into the element described as "${elementDescription}".`;
50
+ case 'keydown':
51
+ return `Press "${event.value || 'the recorded key'}" on the element described as "${elementDescription}".`;
52
+ default:
53
+ {
54
+ const verb = getRecorderPointerActionVerb(event.actionType);
55
+ if ('Long press' === verb) return `${verb} the element described as "${elementDescription}".`;
56
+ return `${verb} on the element described as "${elementDescription}".`;
57
+ }
58
+ }
59
+ }
60
+ function buildMidsceneRecorderActionSummary(event, elementDescription) {
61
+ switch(event.type){
62
+ case 'navigation':
63
+ if ('Stop' === event.actionType) return 'Stop page loading';
64
+ if ('GoBack' === event.actionType) return 'Go back';
65
+ if ('GoForward' === event.actionType) return 'Go forward';
66
+ if ('Reload' === event.actionType) return 'Reload page';
67
+ if ('NavigationChanged' === event.actionType && event.url) return `Wait for navigation to complete at ${event.url}`;
68
+ return event.url ? `Navigate to ${event.url}` : 'Navigate';
69
+ case 'scroll':
70
+ return event.scrollDestinationDescription ? `Scroll ${elementDescription} toward ${event.scrollDestinationDescription}` : `Scroll ${elementDescription}`;
71
+ case 'drag':
72
+ return `${getRecorderDragActionVerb(event.actionType)} ${elementDescription}`;
73
+ case 'input':
74
+ return `Input into ${elementDescription}`;
75
+ case 'keydown':
76
+ return `Press ${event.value || 'key'} on ${elementDescription}`;
77
+ default:
78
+ return `${getRecorderPointerActionVerb(event.actionType)} ${elementDescription}`;
79
+ }
80
+ }
5
81
  function getMidsceneRecorderEventDescription(event) {
6
- if (event.actionSummary && !isMidsceneRecorderPendingDescription(event.actionSummary)) return event.actionSummary;
7
- if (event.elementDescription && !isMidsceneRecorderPendingDescription(event.elementDescription)) return event.elementDescription;
8
- if (event.replayInstruction && !isMidsceneRecorderPendingDescription(event.replayInstruction)) return event.replayInstruction;
82
+ const semantic = getMidsceneRecorderSemantic(event);
83
+ if (semantic?.actionSummary && !isMidsceneRecorderPendingDescription(semantic.actionSummary)) return semantic.actionSummary;
84
+ if (semantic?.elementDescription && !isMidsceneRecorderPendingDescription(semantic.elementDescription)) return semantic.elementDescription;
85
+ if (semantic?.replayInstruction && !isMidsceneRecorderPendingDescription(semantic.replayInstruction)) return semantic.replayInstruction;
9
86
  if ('navigation' === event.type && event.url) return `Navigate to ${event.url}`;
10
87
  if (event.value) return event.actionType ? `${event.actionType} ${event.value}` : event.value;
11
88
  if (event.elementRect?.x !== void 0 && event.elementRect?.y !== void 0) {
@@ -54,10 +131,12 @@ function getRecorderEventScreenshot(event) {
54
131
  return event.screenshotWithBox || event.screenshotAfter || event.screenshotBefore;
55
132
  }
56
133
  function hasCoordinateFallback(event) {
57
- return !event.elementDescription && event.elementRect?.x !== void 0 && event.elementRect?.y !== void 0;
134
+ const semantic = getMidsceneRecorderSemantic(event);
135
+ return !semantic?.elementDescription && event.elementRect?.x !== void 0 && event.elementRect?.y !== void 0;
58
136
  }
59
137
  function shouldIncludeMarkdownScreenshot(event, eventIndex, lastEventIndex) {
60
- return 0 === eventIndex || eventIndex === lastEventIndex || 'navigation' === event.type || 'scroll' === event.type || 'input' === event.type || Boolean(event.screenshotWithBox) || !event.elementDescription || hasCoordinateFallback(event);
138
+ const semantic = getMidsceneRecorderSemantic(event);
139
+ return 0 === eventIndex || eventIndex === lastEventIndex || 'navigation' === event.type || 'scroll' === event.type || 'input' === event.type || Boolean(event.screenshotWithBox) || !semantic?.elementDescription || hasCoordinateFallback(event);
61
140
  }
62
141
  function getRecorderScreenshotCandidatePriority(candidate, firstEventIndex, lastEventIndex) {
63
142
  const event = candidate.event;
@@ -66,9 +145,10 @@ function getRecorderScreenshotCandidatePriority(candidate, firstEventIndex, last
66
145
  if (candidate.eventIndex === lastEventIndex) priority += 95;
67
146
  if ('navigation' === event.type) priority += 80;
68
147
  if (event.screenshotWithBox) priority += 70;
69
- if ('fallback' === event.descriptionSource || 'low' === event.semanticConfidence || event.descriptionError) priority += 60;
148
+ const semantic = getMidsceneRecorderSemantic(event);
149
+ if (semantic?.source === 'heuristic' || semantic?.confidence === 'low' || semantic?.error) priority += 60;
70
150
  if ('input' === event.type || 'scroll' === event.type) priority += 40;
71
- if (!event.elementDescription || hasCoordinateFallback(event)) priority += 30;
151
+ if (!semantic?.elementDescription || hasCoordinateFallback(event)) priority += 30;
72
152
  return priority;
73
153
  }
74
154
  function selectEvenlyDistributedCandidates(candidates, count) {
@@ -166,4 +246,4 @@ function stringifyMidsceneRecorderTargetBlock(target) {
166
246
  for (const [key, value] of values)lines.push(` ${key}: ${scalarToYaml(value)}`);
167
247
  return lines.join('\n');
168
248
  }
169
- export { DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS, createMidsceneRecorderMarkdownScreenshotAssets, getMidsceneRecorderEventDescription, getMidsceneRecorderScreenshotsForLLM, sanitizeMidsceneRecorderFileName, stringifyMidsceneRecorderTargetBlock };
249
+ export { DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS, buildMidsceneRecorderActionSummary, buildMidsceneRecorderReplayInstruction, createMidsceneRecorderMarkdownScreenshotAssets, getMidsceneRecorderEventDescription, getMidsceneRecorderScreenshotsForLLM, getMidsceneRecorderSemantic, sanitizeMidsceneRecorderFileName, stringifyMidsceneRecorderTargetBlock };
@@ -37,7 +37,7 @@ const external_utils_js_namespaceObject = require("../utils.js");
37
37
  const external_helper_js_namespaceObject = require("./helper.js");
38
38
  const external_init_debug_js_namespaceObject = require("./init-debug.js");
39
39
  const MODEL_CONFIG_DOC_URL = 'https://midscenejs.com/model-common-config.html';
40
- const getCurrentVersion = ()=>"1.9.7-beta-20260616025249.0";
40
+ const getCurrentVersion = ()=>"1.9.7";
41
41
  const getInvalidModelFamilyMessage = (modelFamily)=>`Invalid MIDSCENE_MODEL_FAMILY value: ${modelFamily}. Current version v${getCurrentVersion()} accepts the following model families: ${external_types_js_namespaceObject.MODEL_FAMILY_VALUES.join(', ')}. You can also visit ${MODEL_CONFIG_DOC_URL} for the latest configuration information.`;
42
42
  const KEYS_MAP = {
43
43
  insight: external_constants_js_namespaceObject.INSIGHT_MODEL_CONFIG_KEYS,
@@ -25,6 +25,7 @@ var __webpack_exports__ = {};
25
25
  __webpack_require__.r(__webpack_exports__);
26
26
  __webpack_require__.d(__webpack_exports__, {
27
27
  paddingToMatchBlockByBase64: ()=>external_transform_js_namespaceObject.paddingToMatchBlockByBase64,
28
+ inferBase64ImageFormat: ()=>external_transform_js_namespaceObject.inferBase64ImageFormat,
28
29
  localImg2Base64: ()=>external_transform_js_namespaceObject.localImg2Base64,
29
30
  parseBase64: ()=>external_transform_js_namespaceObject.parseBase64,
30
31
  resizeAndConvertImgBuffer: ()=>external_transform_js_namespaceObject.resizeAndConvertImgBuffer,
@@ -36,10 +37,11 @@ __webpack_require__.d(__webpack_exports__, {
36
37
  annotateRects: ()=>external_box_select_js_namespaceObject.annotateRects,
37
38
  compositeElementInfoImg: ()=>external_box_select_js_namespaceObject.compositeElementInfoImg,
38
39
  isValidPNGImageBuffer: ()=>external_info_js_namespaceObject.isValidPNGImageBuffer,
39
- cropByRect: ()=>external_transform_js_namespaceObject.cropByRect,
40
+ normalizeBase64Image: ()=>external_transform_js_namespaceObject.normalizeBase64Image,
40
41
  compositePointMarkerImg: ()=>external_box_select_js_namespaceObject.compositePointMarkerImg,
41
- processImageElementInfo: ()=>external_box_select_js_namespaceObject.processImageElementInfo,
42
+ cropByRect: ()=>external_transform_js_namespaceObject.cropByRect,
42
43
  isValidJPEGImageBuffer: ()=>external_info_js_namespaceObject.isValidJPEGImageBuffer,
44
+ processImageElementInfo: ()=>external_box_select_js_namespaceObject.processImageElementInfo,
43
45
  scaleImage: ()=>external_transform_js_namespaceObject.scaleImage,
44
46
  zoomForGPT4o: ()=>external_transform_js_namespaceObject.zoomForGPT4o,
45
47
  createImgBase64ByFormat: ()=>external_transform_js_namespaceObject.createImgBase64ByFormat,
@@ -56,10 +58,12 @@ exports.createImgBase64ByFormat = __webpack_exports__.createImgBase64ByFormat;
56
58
  exports.cropByRect = __webpack_exports__.cropByRect;
57
59
  exports.httpImg2Base64 = __webpack_exports__.httpImg2Base64;
58
60
  exports.imageInfoOfBase64 = __webpack_exports__.imageInfoOfBase64;
61
+ exports.inferBase64ImageFormat = __webpack_exports__.inferBase64ImageFormat;
59
62
  exports.isValidImageBuffer = __webpack_exports__.isValidImageBuffer;
60
63
  exports.isValidJPEGImageBuffer = __webpack_exports__.isValidJPEGImageBuffer;
61
64
  exports.isValidPNGImageBuffer = __webpack_exports__.isValidPNGImageBuffer;
62
65
  exports.localImg2Base64 = __webpack_exports__.localImg2Base64;
66
+ exports.normalizeBase64Image = __webpack_exports__.normalizeBase64Image;
63
67
  exports.paddingToMatchBlockByBase64 = __webpack_exports__.paddingToMatchBlockByBase64;
64
68
  exports.parseBase64 = __webpack_exports__.parseBase64;
65
69
  exports.preProcessImageUrl = __webpack_exports__.preProcessImageUrl;
@@ -78,10 +82,12 @@ for(var __rspack_i in __webpack_exports__)if (-1 === [
78
82
  "cropByRect",
79
83
  "httpImg2Base64",
80
84
  "imageInfoOfBase64",
85
+ "inferBase64ImageFormat",
81
86
  "isValidImageBuffer",
82
87
  "isValidJPEGImageBuffer",
83
88
  "isValidPNGImageBuffer",
84
89
  "localImg2Base64",
90
+ "normalizeBase64Image",
85
91
  "paddingToMatchBlockByBase64",
86
92
  "parseBase64",
87
93
  "preProcessImageUrl",
@@ -34,6 +34,7 @@ var __webpack_exports__ = {};
34
34
  __webpack_require__.r(__webpack_exports__);
35
35
  __webpack_require__.d(__webpack_exports__, {
36
36
  paddingToMatchBlockByBase64: ()=>paddingToMatchBlockByBase64,
37
+ inferBase64ImageFormat: ()=>inferBase64ImageFormat,
37
38
  localImg2Base64: ()=>localImg2Base64,
38
39
  photonFromBase64: ()=>photonFromBase64,
39
40
  photonToBase64: ()=>photonToBase64,
@@ -44,6 +45,7 @@ __webpack_require__.d(__webpack_exports__, {
44
45
  preProcessImageUrl: ()=>preProcessImageUrl,
45
46
  cropByRect: ()=>cropByRect,
46
47
  scaleImage: ()=>scaleImage,
48
+ normalizeBase64Image: ()=>normalizeBase64Image,
47
49
  normalizeBase64Body: ()=>normalizeBase64Body,
48
50
  zoomForGPT4o: ()=>zoomForGPT4o,
49
51
  createImgBase64ByFormat: ()=>createImgBase64ByFormat,
@@ -126,7 +128,26 @@ async function resizeAndConvertImgBuffer(inputFormat, inputData, newSize) {
126
128
  };
127
129
  }
128
130
  const normalizeBase64Body = (body)=>body.replace(/\s/g, '');
131
+ const base64ImageDataUrlPattern = /^data:image\/[a-zA-Z0-9.+-]+;base64,/i;
132
+ const inferBase64ImageFormat = (base64Body)=>{
133
+ if (base64Body.startsWith('iVBORw0KGgo')) return 'png';
134
+ return 'jpeg';
135
+ };
136
+ function detectImageMimeTypeFromBuffer(buffer) {
137
+ if (buffer.length >= 8 && 0x89 === buffer[0] && 0x50 === buffer[1] && 0x4e === buffer[2] && 0x47 === buffer[3] && 0x0d === buffer[4] && 0x0a === buffer[5] && 0x1a === buffer[6] && 0x0a === buffer[7]) return 'image/png';
138
+ if (buffer.length >= 3 && 0xff === buffer[0] && 0xd8 === buffer[1] && 0xff === buffer[2]) return 'image/jpeg';
139
+ if (buffer.length >= 6 && 'GIF' === buffer.subarray(0, 3).toString('ascii')) return 'image/gif';
140
+ if (buffer.length >= 12 && 'RIFF' === buffer.subarray(0, 4).toString('ascii') && 'WEBP' === buffer.subarray(8, 12).toString('ascii')) return 'image/webp';
141
+ if (buffer.length >= 2 && 0x42 === buffer[0] && 0x4d === buffer[1]) return 'image/bmp';
142
+ }
129
143
  const createImgBase64ByFormat = (format, body)=>`data:image/${format};base64,${normalizeBase64Body(body)}`;
144
+ const normalizeBase64Image = (base64)=>{
145
+ const trimmedBase64 = base64.trim();
146
+ if (base64ImageDataUrlPattern.test(trimmedBase64)) return trimmedBase64;
147
+ const base64Body = normalizeBase64Body(trimmedBase64);
148
+ external_node_assert_default()(base64Body, 'base64 image must include image data');
149
+ return createImgBase64ByFormat(inferBase64ImageFormat(base64Body), base64Body);
150
+ };
130
151
  async function resizeImgBase64(inputBase64, newSize) {
131
152
  const { body, mimeType } = parseBase64(inputBase64);
132
153
  const imageBuffer = external_node_buffer_namespaceObject.Buffer.from(body, 'base64');
@@ -254,7 +275,15 @@ const parseBase64 = (fullBase64String)=>{
254
275
  try {
255
276
  const separator = ';base64,';
256
277
  const index = fullBase64String.indexOf(separator);
257
- if (-1 === index) throw new Error('Invalid base64 string');
278
+ if (-1 === index) {
279
+ const body = normalizeBase64Body(fullBase64String);
280
+ const mimeType = detectImageMimeTypeFromBuffer(external_node_buffer_namespaceObject.Buffer.from(body, 'base64'));
281
+ if (!mimeType) throw new Error('Invalid base64 string');
282
+ return {
283
+ mimeType,
284
+ body
285
+ };
286
+ }
258
287
  return {
259
288
  mimeType: fullBase64String.slice(5, index),
260
289
  body: normalizeBase64Body(fullBase64String.slice(index + separator.length))
@@ -325,8 +354,10 @@ async function scaleImage(imageBase64, scale) {
325
354
  exports.createImgBase64ByFormat = __webpack_exports__.createImgBase64ByFormat;
326
355
  exports.cropByRect = __webpack_exports__.cropByRect;
327
356
  exports.httpImg2Base64 = __webpack_exports__.httpImg2Base64;
357
+ exports.inferBase64ImageFormat = __webpack_exports__.inferBase64ImageFormat;
328
358
  exports.localImg2Base64 = __webpack_exports__.localImg2Base64;
329
359
  exports.normalizeBase64Body = __webpack_exports__.normalizeBase64Body;
360
+ exports.normalizeBase64Image = __webpack_exports__.normalizeBase64Image;
330
361
  exports.paddingToMatchBlock = __webpack_exports__.paddingToMatchBlock;
331
362
  exports.paddingToMatchBlockByBase64 = __webpack_exports__.paddingToMatchBlockByBase64;
332
363
  exports.parseBase64 = __webpack_exports__.parseBase64;
@@ -342,8 +373,10 @@ for(var __rspack_i in __webpack_exports__)if (-1 === [
342
373
  "createImgBase64ByFormat",
343
374
  "cropByRect",
344
375
  "httpImg2Base64",
376
+ "inferBase64ImageFormat",
345
377
  "localImg2Base64",
346
378
  "normalizeBase64Body",
379
+ "normalizeBase64Image",
347
380
  "paddingToMatchBlock",
348
381
  "paddingToMatchBlockByBase64",
349
382
  "parseBase64",
@@ -25,9 +25,12 @@ var __webpack_exports__ = {};
25
25
  __webpack_require__.r(__webpack_exports__);
26
26
  __webpack_require__.d(__webpack_exports__, {
27
27
  DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS: ()=>DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS,
28
+ buildMidsceneRecorderActionSummary: ()=>buildMidsceneRecorderActionSummary,
29
+ buildMidsceneRecorderReplayInstruction: ()=>buildMidsceneRecorderReplayInstruction,
28
30
  createMidsceneRecorderMarkdownScreenshotAssets: ()=>createMidsceneRecorderMarkdownScreenshotAssets,
29
31
  getMidsceneRecorderEventDescription: ()=>getMidsceneRecorderEventDescription,
30
32
  getMidsceneRecorderScreenshotsForLLM: ()=>getMidsceneRecorderScreenshotsForLLM,
33
+ getMidsceneRecorderSemantic: ()=>getMidsceneRecorderSemantic,
31
34
  sanitizeMidsceneRecorderFileName: ()=>sanitizeMidsceneRecorderFileName,
32
35
  stringifyMidsceneRecorderTargetBlock: ()=>stringifyMidsceneRecorderTargetBlock
33
36
  });
@@ -35,10 +38,87 @@ const DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS = 20;
35
38
  function isMidsceneRecorderPendingDescription(value) {
36
39
  return value?.trim() === 'AI is analyzing element...';
37
40
  }
41
+ function getMidsceneRecorderSemantic(event) {
42
+ return event.semantic;
43
+ }
44
+ function getRecorderPointerActionVerb(actionType) {
45
+ switch(actionType){
46
+ case 'Tap':
47
+ return 'Tap';
48
+ case 'DoubleClick':
49
+ return 'Double click';
50
+ case 'LongPress':
51
+ return 'Long press';
52
+ case 'RightClick':
53
+ return 'Right click';
54
+ default:
55
+ return 'Click';
56
+ }
57
+ }
58
+ function getRecorderDragActionVerb(actionType) {
59
+ switch(actionType){
60
+ case 'Swipe':
61
+ return 'Swipe';
62
+ case 'DragAndDrop':
63
+ return 'Drag';
64
+ default:
65
+ return 'Drag';
66
+ }
67
+ }
68
+ function buildMidsceneRecorderReplayInstruction(event, elementDescription) {
69
+ switch(event.type){
70
+ case 'navigation':
71
+ if ('Stop' === event.actionType) return 'Stop loading the current page.';
72
+ if ('GoBack' === event.actionType) return 'Go back in the browser.';
73
+ if ('GoForward' === event.actionType) return 'Go forward in the browser.';
74
+ if ('Reload' === event.actionType) return 'Reload the current page.';
75
+ if ('NavigationChanged' === event.actionType && event.url) return `Wait for navigation to complete at \`${event.url}\`.`;
76
+ return event.url ? `Navigate to \`${event.url}\`.` : `Navigate using ${elementDescription}.`;
77
+ case 'scroll':
78
+ return event.scrollDestinationDescription ? `Scroll the page/region with description "${elementDescription}" by value "${event.value || 'down'}" until "${event.scrollDestinationDescription}" is visible.` : `Scroll the page/region with description "${elementDescription}" by value "${event.value || 'down'}".`;
79
+ case 'drag':
80
+ {
81
+ const verb = getRecorderDragActionVerb(event.actionType);
82
+ return `${verb} through the area described as "${elementDescription}".`;
83
+ }
84
+ case 'input':
85
+ return `Input "${event.value || ''}" into the element described as "${elementDescription}".`;
86
+ case 'keydown':
87
+ return `Press "${event.value || 'the recorded key'}" on the element described as "${elementDescription}".`;
88
+ default:
89
+ {
90
+ const verb = getRecorderPointerActionVerb(event.actionType);
91
+ if ('Long press' === verb) return `${verb} the element described as "${elementDescription}".`;
92
+ return `${verb} on the element described as "${elementDescription}".`;
93
+ }
94
+ }
95
+ }
96
+ function buildMidsceneRecorderActionSummary(event, elementDescription) {
97
+ switch(event.type){
98
+ case 'navigation':
99
+ if ('Stop' === event.actionType) return 'Stop page loading';
100
+ if ('GoBack' === event.actionType) return 'Go back';
101
+ if ('GoForward' === event.actionType) return 'Go forward';
102
+ if ('Reload' === event.actionType) return 'Reload page';
103
+ if ('NavigationChanged' === event.actionType && event.url) return `Wait for navigation to complete at ${event.url}`;
104
+ return event.url ? `Navigate to ${event.url}` : 'Navigate';
105
+ case 'scroll':
106
+ return event.scrollDestinationDescription ? `Scroll ${elementDescription} toward ${event.scrollDestinationDescription}` : `Scroll ${elementDescription}`;
107
+ case 'drag':
108
+ return `${getRecorderDragActionVerb(event.actionType)} ${elementDescription}`;
109
+ case 'input':
110
+ return `Input into ${elementDescription}`;
111
+ case 'keydown':
112
+ return `Press ${event.value || 'key'} on ${elementDescription}`;
113
+ default:
114
+ return `${getRecorderPointerActionVerb(event.actionType)} ${elementDescription}`;
115
+ }
116
+ }
38
117
  function getMidsceneRecorderEventDescription(event) {
39
- if (event.actionSummary && !isMidsceneRecorderPendingDescription(event.actionSummary)) return event.actionSummary;
40
- if (event.elementDescription && !isMidsceneRecorderPendingDescription(event.elementDescription)) return event.elementDescription;
41
- if (event.replayInstruction && !isMidsceneRecorderPendingDescription(event.replayInstruction)) return event.replayInstruction;
118
+ const semantic = getMidsceneRecorderSemantic(event);
119
+ if (semantic?.actionSummary && !isMidsceneRecorderPendingDescription(semantic.actionSummary)) return semantic.actionSummary;
120
+ if (semantic?.elementDescription && !isMidsceneRecorderPendingDescription(semantic.elementDescription)) return semantic.elementDescription;
121
+ if (semantic?.replayInstruction && !isMidsceneRecorderPendingDescription(semantic.replayInstruction)) return semantic.replayInstruction;
42
122
  if ('navigation' === event.type && event.url) return `Navigate to ${event.url}`;
43
123
  if (event.value) return event.actionType ? `${event.actionType} ${event.value}` : event.value;
44
124
  if (event.elementRect?.x !== void 0 && event.elementRect?.y !== void 0) {
@@ -87,10 +167,12 @@ function getRecorderEventScreenshot(event) {
87
167
  return event.screenshotWithBox || event.screenshotAfter || event.screenshotBefore;
88
168
  }
89
169
  function hasCoordinateFallback(event) {
90
- return !event.elementDescription && event.elementRect?.x !== void 0 && event.elementRect?.y !== void 0;
170
+ const semantic = getMidsceneRecorderSemantic(event);
171
+ return !semantic?.elementDescription && event.elementRect?.x !== void 0 && event.elementRect?.y !== void 0;
91
172
  }
92
173
  function shouldIncludeMarkdownScreenshot(event, eventIndex, lastEventIndex) {
93
- return 0 === eventIndex || eventIndex === lastEventIndex || 'navigation' === event.type || 'scroll' === event.type || 'input' === event.type || Boolean(event.screenshotWithBox) || !event.elementDescription || hasCoordinateFallback(event);
174
+ const semantic = getMidsceneRecorderSemantic(event);
175
+ return 0 === eventIndex || eventIndex === lastEventIndex || 'navigation' === event.type || 'scroll' === event.type || 'input' === event.type || Boolean(event.screenshotWithBox) || !semantic?.elementDescription || hasCoordinateFallback(event);
94
176
  }
95
177
  function getRecorderScreenshotCandidatePriority(candidate, firstEventIndex, lastEventIndex) {
96
178
  const event = candidate.event;
@@ -99,9 +181,10 @@ function getRecorderScreenshotCandidatePriority(candidate, firstEventIndex, last
99
181
  if (candidate.eventIndex === lastEventIndex) priority += 95;
100
182
  if ('navigation' === event.type) priority += 80;
101
183
  if (event.screenshotWithBox) priority += 70;
102
- if ('fallback' === event.descriptionSource || 'low' === event.semanticConfidence || event.descriptionError) priority += 60;
184
+ const semantic = getMidsceneRecorderSemantic(event);
185
+ if (semantic?.source === 'heuristic' || semantic?.confidence === 'low' || semantic?.error) priority += 60;
103
186
  if ('input' === event.type || 'scroll' === event.type) priority += 40;
104
- if (!event.elementDescription || hasCoordinateFallback(event)) priority += 30;
187
+ if (!semantic?.elementDescription || hasCoordinateFallback(event)) priority += 30;
105
188
  return priority;
106
189
  }
107
190
  function selectEvenlyDistributedCandidates(candidates, count) {
@@ -200,16 +283,22 @@ function stringifyMidsceneRecorderTargetBlock(target) {
200
283
  return lines.join('\n');
201
284
  }
202
285
  exports.DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS = __webpack_exports__.DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS;
286
+ exports.buildMidsceneRecorderActionSummary = __webpack_exports__.buildMidsceneRecorderActionSummary;
287
+ exports.buildMidsceneRecorderReplayInstruction = __webpack_exports__.buildMidsceneRecorderReplayInstruction;
203
288
  exports.createMidsceneRecorderMarkdownScreenshotAssets = __webpack_exports__.createMidsceneRecorderMarkdownScreenshotAssets;
204
289
  exports.getMidsceneRecorderEventDescription = __webpack_exports__.getMidsceneRecorderEventDescription;
205
290
  exports.getMidsceneRecorderScreenshotsForLLM = __webpack_exports__.getMidsceneRecorderScreenshotsForLLM;
291
+ exports.getMidsceneRecorderSemantic = __webpack_exports__.getMidsceneRecorderSemantic;
206
292
  exports.sanitizeMidsceneRecorderFileName = __webpack_exports__.sanitizeMidsceneRecorderFileName;
207
293
  exports.stringifyMidsceneRecorderTargetBlock = __webpack_exports__.stringifyMidsceneRecorderTargetBlock;
208
294
  for(var __rspack_i in __webpack_exports__)if (-1 === [
209
295
  "DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS",
296
+ "buildMidsceneRecorderActionSummary",
297
+ "buildMidsceneRecorderReplayInstruction",
210
298
  "createMidsceneRecorderMarkdownScreenshotAssets",
211
299
  "getMidsceneRecorderEventDescription",
212
300
  "getMidsceneRecorderScreenshotsForLLM",
301
+ "getMidsceneRecorderSemantic",
213
302
  "sanitizeMidsceneRecorderFileName",
214
303
  "stringifyMidsceneRecorderTargetBlock"
215
304
  ].indexOf(__rspack_i)) exports[__rspack_i] = __webpack_exports__[__rspack_i];
@@ -1,3 +1,3 @@
1
1
  export { imageInfoOfBase64, isValidPNGImageBuffer, isValidJPEGImageBuffer, isValidImageBuffer, validateScreenshotBuffer, type ValidateScreenshotBufferOptions, } from './info';
2
- export { resizeAndConvertImgBuffer, resizeImgBase64, zoomForGPT4o, saveBase64Image, paddingToMatchBlockByBase64, cropByRect, scaleImage, localImg2Base64, httpImg2Base64, preProcessImageUrl, parseBase64, createImgBase64ByFormat, } from './transform';
2
+ export { resizeAndConvertImgBuffer, resizeImgBase64, zoomForGPT4o, saveBase64Image, paddingToMatchBlockByBase64, cropByRect, scaleImage, localImg2Base64, httpImg2Base64, preProcessImageUrl, parseBase64, createImgBase64ByFormat, inferBase64ImageFormat, normalizeBase64Image, } from './transform';
3
3
  export { processImageElementInfo, compositeElementInfoImg, compositePointMarkerImg, annotateRects, } from './box-select';
@@ -27,7 +27,9 @@ export declare function resizeAndConvertImgBuffer(inputFormat: string, inputData
27
27
  format: string;
28
28
  }>;
29
29
  export declare const normalizeBase64Body: (body: string) => string;
30
+ export declare const inferBase64ImageFormat: (base64Body: string) => "jpeg" | "png";
30
31
  export declare const createImgBase64ByFormat: (format: string, body: string) => string;
32
+ export declare const normalizeBase64Image: (base64: string) => string;
31
33
  export declare function resizeImgBase64(inputBase64: string, newSize: {
32
34
  width: number;
33
35
  height: number;
@@ -25,16 +25,43 @@ export interface MidsceneRecorderEvent {
25
25
  pageInfo: MidsceneRecorderPageInfo;
26
26
  screenshotBefore?: string;
27
27
  screenshotAfter?: string;
28
+ semantic?: MidsceneRecorderSemantic;
28
29
  elementDescription?: string;
29
- replayInstruction?: string;
30
- actionSummary?: string;
31
- semanticConfidence?: 'high' | 'medium' | 'low';
32
30
  descriptionLoading?: boolean;
33
- descriptionSource?: 'ai' | 'fallback';
34
- descriptionError?: string;
35
31
  screenshotWithBox?: string;
36
32
  timestamp: number;
37
33
  hashId: string;
34
+ mergedHashIds?: string[];
35
+ }
36
+ export type MidsceneRecorderSemanticSource = 'aiDescribe' | 'recorderAI' | 'heuristic';
37
+ export type MidsceneRecorderSemanticStatus = 'pending' | 'ready' | 'failed';
38
+ export type MidsceneRecorderSemanticConfidence = 'high' | 'medium' | 'low';
39
+ export interface MidsceneRecorderSemanticAiDescribe {
40
+ verifyPrompt: boolean;
41
+ verifyPassed?: boolean;
42
+ deepLocate?: boolean;
43
+ centerDistance?: number;
44
+ expectedCenter?: [number, number];
45
+ actualCenter?: [number, number];
46
+ annotatedScreenshotPath?: string;
47
+ }
48
+ export interface MidsceneRecorderSemantic {
49
+ source: MidsceneRecorderSemanticSource;
50
+ status: MidsceneRecorderSemanticStatus;
51
+ elementDescription?: string;
52
+ replayInstruction?: string;
53
+ actionSummary?: string;
54
+ confidence?: MidsceneRecorderSemanticConfidence;
55
+ error?: string;
56
+ aiDescribe?: MidsceneRecorderSemanticAiDescribe;
57
+ fallbackFrom?: MidsceneRecorderSemantic;
58
+ }
59
+ export interface MidsceneRecorderSemanticAction {
60
+ type: MidsceneRecorderEventType;
61
+ actionType?: string;
62
+ value?: string;
63
+ url?: string;
64
+ scrollDestinationDescription?: string;
38
65
  }
39
66
  export interface MidsceneRecorderTarget {
40
67
  platformId: MidsceneRecorderPlatformId;
@@ -62,6 +89,9 @@ export interface MidsceneRecorderMarkdownScreenshotOptions {
62
89
  maxScreenshots?: number;
63
90
  }
64
91
  export declare const DEFAULT_MIDSCENE_RECORDER_MARKDOWN_MAX_SCREENSHOTS = 20;
92
+ export declare function getMidsceneRecorderSemantic(event: Pick<MidsceneRecorderEvent, 'semantic'>): MidsceneRecorderSemantic | undefined;
93
+ export declare function buildMidsceneRecorderReplayInstruction(event: MidsceneRecorderSemanticAction, elementDescription: string): string;
94
+ export declare function buildMidsceneRecorderActionSummary(event: MidsceneRecorderSemanticAction, elementDescription: string): string;
65
95
  export declare function getMidsceneRecorderEventDescription(event: MidsceneRecorderEvent): string;
66
96
  export declare function getMidsceneRecorderScreenshotsForLLM(events: MidsceneRecorderEvent[], maxScreenshots?: number): string[];
67
97
  export declare function sanitizeMidsceneRecorderFileName(value: string): string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@midscene/shared",
3
- "version": "1.9.7-beta-20260616025249.0",
3
+ "version": "1.9.7",
4
4
  "repository": "https://github.com/web-infra-dev/midscene",
5
5
  "homepage": "https://midscenejs.com/",
6
6
  "types": "./dist/types/index.d.ts",
package/src/img/index.ts CHANGED
@@ -19,6 +19,8 @@ export {
19
19
  preProcessImageUrl,
20
20
  parseBase64,
21
21
  createImgBase64ByFormat,
22
+ inferBase64ImageFormat,
23
+ normalizeBase64Image,
22
24
  } from './transform';
23
25
  export {
24
26
  processImageElementInfo,
@@ -156,10 +156,71 @@ export async function resizeAndConvertImgBuffer(
156
156
 
157
157
  export const normalizeBase64Body = (body: string) => body.replace(/\s/g, '');
158
158
 
159
+ const base64ImageDataUrlPattern = /^data:image\/[a-zA-Z0-9.+-]+;base64,/i;
160
+
161
+ export const inferBase64ImageFormat = (base64Body: string) => {
162
+ if (base64Body.startsWith('iVBORw0KGgo')) {
163
+ return 'png';
164
+ }
165
+ return 'jpeg';
166
+ };
167
+
168
+ function detectImageMimeTypeFromBuffer(buffer: Buffer): string | undefined {
169
+ if (
170
+ buffer.length >= 8 &&
171
+ buffer[0] === 0x89 &&
172
+ buffer[1] === 0x50 &&
173
+ buffer[2] === 0x4e &&
174
+ buffer[3] === 0x47 &&
175
+ buffer[4] === 0x0d &&
176
+ buffer[5] === 0x0a &&
177
+ buffer[6] === 0x1a &&
178
+ buffer[7] === 0x0a
179
+ ) {
180
+ return 'image/png';
181
+ }
182
+ if (
183
+ buffer.length >= 3 &&
184
+ buffer[0] === 0xff &&
185
+ buffer[1] === 0xd8 &&
186
+ buffer[2] === 0xff
187
+ ) {
188
+ return 'image/jpeg';
189
+ }
190
+ if (buffer.length >= 6 && buffer.subarray(0, 3).toString('ascii') === 'GIF') {
191
+ return 'image/gif';
192
+ }
193
+ if (
194
+ buffer.length >= 12 &&
195
+ buffer.subarray(0, 4).toString('ascii') === 'RIFF' &&
196
+ buffer.subarray(8, 12).toString('ascii') === 'WEBP'
197
+ ) {
198
+ return 'image/webp';
199
+ }
200
+ if (buffer.length >= 2 && buffer[0] === 0x42 && buffer[1] === 0x4d) {
201
+ return 'image/bmp';
202
+ }
203
+ return undefined;
204
+ }
205
+
159
206
  export const createImgBase64ByFormat = (format: string, body: string) => {
160
207
  return `data:image/${format};base64,${normalizeBase64Body(body)}`;
161
208
  };
162
209
 
210
+ export const normalizeBase64Image = (base64: string) => {
211
+ const trimmedBase64 = base64.trim();
212
+ if (base64ImageDataUrlPattern.test(trimmedBase64)) {
213
+ return trimmedBase64;
214
+ }
215
+
216
+ const base64Body = normalizeBase64Body(trimmedBase64);
217
+ assert(base64Body, 'base64 image must include image data');
218
+ return createImgBase64ByFormat(
219
+ inferBase64ImageFormat(base64Body),
220
+ base64Body,
221
+ );
222
+ };
223
+
163
224
  export async function resizeImgBase64(
164
225
  inputBase64: string,
165
226
  newSize: {
@@ -411,7 +472,14 @@ export const parseBase64 = (
411
472
  const separator = ';base64,';
412
473
  const index = fullBase64String.indexOf(separator);
413
474
  if (index === -1) {
414
- throw new Error('Invalid base64 string');
475
+ const body = normalizeBase64Body(fullBase64String);
476
+ const mimeType = detectImageMimeTypeFromBuffer(
477
+ Buffer.from(body, 'base64'),
478
+ );
479
+ if (!mimeType) {
480
+ throw new Error('Invalid base64 string');
481
+ }
482
+ return { mimeType, body };
415
483
  }
416
484
  return {
417
485
  // 5 means 'data:'
package/src/recorder.ts CHANGED
@@ -46,16 +46,52 @@ export interface MidsceneRecorderEvent {
46
46
  pageInfo: MidsceneRecorderPageInfo;
47
47
  screenshotBefore?: string;
48
48
  screenshotAfter?: string;
49
+ semantic?: MidsceneRecorderSemantic;
49
50
  elementDescription?: string;
50
- replayInstruction?: string;
51
- actionSummary?: string;
52
- semanticConfidence?: 'high' | 'medium' | 'low';
53
51
  descriptionLoading?: boolean;
54
- descriptionSource?: 'ai' | 'fallback';
55
- descriptionError?: string;
56
52
  screenshotWithBox?: string;
57
53
  timestamp: number;
58
54
  hashId: string;
55
+ mergedHashIds?: string[];
56
+ }
57
+
58
+ export type MidsceneRecorderSemanticSource =
59
+ | 'aiDescribe'
60
+ | 'recorderAI'
61
+ | 'heuristic';
62
+
63
+ export type MidsceneRecorderSemanticStatus = 'pending' | 'ready' | 'failed';
64
+
65
+ export type MidsceneRecorderSemanticConfidence = 'high' | 'medium' | 'low';
66
+
67
+ export interface MidsceneRecorderSemanticAiDescribe {
68
+ verifyPrompt: boolean;
69
+ verifyPassed?: boolean;
70
+ deepLocate?: boolean;
71
+ centerDistance?: number;
72
+ expectedCenter?: [number, number];
73
+ actualCenter?: [number, number];
74
+ annotatedScreenshotPath?: string;
75
+ }
76
+
77
+ export interface MidsceneRecorderSemantic {
78
+ source: MidsceneRecorderSemanticSource;
79
+ status: MidsceneRecorderSemanticStatus;
80
+ elementDescription?: string;
81
+ replayInstruction?: string;
82
+ actionSummary?: string;
83
+ confidence?: MidsceneRecorderSemanticConfidence;
84
+ error?: string;
85
+ aiDescribe?: MidsceneRecorderSemanticAiDescribe;
86
+ fallbackFrom?: MidsceneRecorderSemantic;
87
+ }
88
+
89
+ export interface MidsceneRecorderSemanticAction {
90
+ type: MidsceneRecorderEventType;
91
+ actionType?: string;
92
+ value?: string;
93
+ url?: string;
94
+ scrollDestinationDescription?: string;
59
95
  }
60
96
 
61
97
  export interface MidsceneRecorderTarget {
@@ -93,26 +129,142 @@ function isMidsceneRecorderPendingDescription(value?: string) {
93
129
  return value?.trim() === 'AI is analyzing element...';
94
130
  }
95
131
 
132
+ export function getMidsceneRecorderSemantic(
133
+ event: Pick<MidsceneRecorderEvent, 'semantic'>,
134
+ ) {
135
+ return event.semantic;
136
+ }
137
+
138
+ function getRecorderPointerActionVerb(actionType?: string) {
139
+ switch (actionType) {
140
+ case 'Tap':
141
+ return 'Tap';
142
+ case 'DoubleClick':
143
+ return 'Double click';
144
+ case 'LongPress':
145
+ return 'Long press';
146
+ case 'RightClick':
147
+ return 'Right click';
148
+ default:
149
+ return 'Click';
150
+ }
151
+ }
152
+
153
+ function getRecorderDragActionVerb(actionType?: string) {
154
+ switch (actionType) {
155
+ case 'Swipe':
156
+ return 'Swipe';
157
+ case 'DragAndDrop':
158
+ return 'Drag';
159
+ default:
160
+ return 'Drag';
161
+ }
162
+ }
163
+
164
+ export function buildMidsceneRecorderReplayInstruction(
165
+ event: MidsceneRecorderSemanticAction,
166
+ elementDescription: string,
167
+ ) {
168
+ switch (event.type) {
169
+ case 'navigation':
170
+ if (event.actionType === 'Stop') {
171
+ return 'Stop loading the current page.';
172
+ }
173
+ if (event.actionType === 'GoBack') {
174
+ return 'Go back in the browser.';
175
+ }
176
+ if (event.actionType === 'GoForward') {
177
+ return 'Go forward in the browser.';
178
+ }
179
+ if (event.actionType === 'Reload') {
180
+ return 'Reload the current page.';
181
+ }
182
+ if (event.actionType === 'NavigationChanged' && event.url) {
183
+ return `Wait for navigation to complete at \`${event.url}\`.`;
184
+ }
185
+ return event.url
186
+ ? `Navigate to \`${event.url}\`.`
187
+ : `Navigate using ${elementDescription}.`;
188
+ case 'scroll':
189
+ return event.scrollDestinationDescription
190
+ ? `Scroll the page/region with description "${elementDescription}" by value "${event.value || 'down'}" until "${event.scrollDestinationDescription}" is visible.`
191
+ : `Scroll the page/region with description "${elementDescription}" by value "${event.value || 'down'}".`;
192
+ case 'drag': {
193
+ const verb = getRecorderDragActionVerb(event.actionType);
194
+ return `${verb} through the area described as "${elementDescription}".`;
195
+ }
196
+ case 'input':
197
+ return `Input "${event.value || ''}" into the element described as "${elementDescription}".`;
198
+ case 'keydown':
199
+ return `Press "${event.value || 'the recorded key'}" on the element described as "${elementDescription}".`;
200
+ default: {
201
+ const verb = getRecorderPointerActionVerb(event.actionType);
202
+ if (verb === 'Long press') {
203
+ return `${verb} the element described as "${elementDescription}".`;
204
+ }
205
+ return `${verb} on the element described as "${elementDescription}".`;
206
+ }
207
+ }
208
+ }
209
+
210
+ export function buildMidsceneRecorderActionSummary(
211
+ event: MidsceneRecorderSemanticAction,
212
+ elementDescription: string,
213
+ ) {
214
+ switch (event.type) {
215
+ case 'navigation':
216
+ if (event.actionType === 'Stop') {
217
+ return 'Stop page loading';
218
+ }
219
+ if (event.actionType === 'GoBack') {
220
+ return 'Go back';
221
+ }
222
+ if (event.actionType === 'GoForward') {
223
+ return 'Go forward';
224
+ }
225
+ if (event.actionType === 'Reload') {
226
+ return 'Reload page';
227
+ }
228
+ if (event.actionType === 'NavigationChanged' && event.url) {
229
+ return `Wait for navigation to complete at ${event.url}`;
230
+ }
231
+ return event.url ? `Navigate to ${event.url}` : 'Navigate';
232
+ case 'scroll':
233
+ return event.scrollDestinationDescription
234
+ ? `Scroll ${elementDescription} toward ${event.scrollDestinationDescription}`
235
+ : `Scroll ${elementDescription}`;
236
+ case 'drag':
237
+ return `${getRecorderDragActionVerb(event.actionType)} ${elementDescription}`;
238
+ case 'input':
239
+ return `Input into ${elementDescription}`;
240
+ case 'keydown':
241
+ return `Press ${event.value || 'key'} on ${elementDescription}`;
242
+ default:
243
+ return `${getRecorderPointerActionVerb(event.actionType)} ${elementDescription}`;
244
+ }
245
+ }
246
+
96
247
  export function getMidsceneRecorderEventDescription(
97
248
  event: MidsceneRecorderEvent,
98
249
  ) {
250
+ const semantic = getMidsceneRecorderSemantic(event);
99
251
  if (
100
- event.actionSummary &&
101
- !isMidsceneRecorderPendingDescription(event.actionSummary)
252
+ semantic?.actionSummary &&
253
+ !isMidsceneRecorderPendingDescription(semantic.actionSummary)
102
254
  ) {
103
- return event.actionSummary;
255
+ return semantic.actionSummary;
104
256
  }
105
257
  if (
106
- event.elementDescription &&
107
- !isMidsceneRecorderPendingDescription(event.elementDescription)
258
+ semantic?.elementDescription &&
259
+ !isMidsceneRecorderPendingDescription(semantic.elementDescription)
108
260
  ) {
109
- return event.elementDescription;
261
+ return semantic.elementDescription;
110
262
  }
111
263
  if (
112
- event.replayInstruction &&
113
- !isMidsceneRecorderPendingDescription(event.replayInstruction)
264
+ semantic?.replayInstruction &&
265
+ !isMidsceneRecorderPendingDescription(semantic.replayInstruction)
114
266
  ) {
115
- return event.replayInstruction;
267
+ return semantic.replayInstruction;
116
268
  }
117
269
  if (event.type === 'navigation' && event.url) {
118
270
  return `Navigate to ${event.url}`;
@@ -212,8 +364,9 @@ function getRecorderEventScreenshot(event: MidsceneRecorderEvent) {
212
364
  }
213
365
 
214
366
  function hasCoordinateFallback(event: MidsceneRecorderEvent) {
367
+ const semantic = getMidsceneRecorderSemantic(event);
215
368
  return (
216
- !event.elementDescription &&
369
+ !semantic?.elementDescription &&
217
370
  event.elementRect?.x !== undefined &&
218
371
  event.elementRect?.y !== undefined
219
372
  );
@@ -224,6 +377,7 @@ function shouldIncludeMarkdownScreenshot(
224
377
  eventIndex: number,
225
378
  lastEventIndex: number,
226
379
  ) {
380
+ const semantic = getMidsceneRecorderSemantic(event);
227
381
  return (
228
382
  eventIndex === 0 ||
229
383
  eventIndex === lastEventIndex ||
@@ -231,7 +385,7 @@ function shouldIncludeMarkdownScreenshot(
231
385
  event.type === 'scroll' ||
232
386
  event.type === 'input' ||
233
387
  Boolean(event.screenshotWithBox) ||
234
- !event.elementDescription ||
388
+ !semantic?.elementDescription ||
235
389
  hasCoordinateFallback(event)
236
390
  );
237
391
  }
@@ -262,17 +416,18 @@ function getRecorderScreenshotCandidatePriority(
262
416
  if (event.screenshotWithBox) {
263
417
  priority += 70;
264
418
  }
419
+ const semantic = getMidsceneRecorderSemantic(event);
265
420
  if (
266
- event.descriptionSource === 'fallback' ||
267
- event.semanticConfidence === 'low' ||
268
- event.descriptionError
421
+ semantic?.source === 'heuristic' ||
422
+ semantic?.confidence === 'low' ||
423
+ semantic?.error
269
424
  ) {
270
425
  priority += 60;
271
426
  }
272
427
  if (event.type === 'input' || event.type === 'scroll') {
273
428
  priority += 40;
274
429
  }
275
- if (!event.elementDescription || hasCoordinateFallback(event)) {
430
+ if (!semantic?.elementDescription || hasCoordinateFallback(event)) {
276
431
  priority += 30;
277
432
  }
278
433