@midscene/core 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintignore +3 -0
- package/.eslintrc.js +9 -0
- package/CONTRIBUTING.md +5 -0
- package/LICENSE +21 -0
- package/demo_data/demo.actions.json +160 -0
- package/demo_data/demo.insight.json +3571 -0
- package/demo_data/index.d.ts +1 -0
- package/demo_data/index.js +6 -0
- package/dist/es/ai-model.js +429 -0
- package/dist/es/image.js +261 -0
- package/dist/es/index.js +1083 -0
- package/dist/es/utils.js +96 -0
- package/dist/lib/ai-model.js +467 -0
- package/dist/lib/image.js +307 -0
- package/dist/lib/index.js +1124 -0
- package/dist/lib/utils.js +141 -0
- package/dist/types/ai-model.d.ts +32 -0
- package/dist/types/image.d.ts +119 -0
- package/dist/types/index.d.ts +43 -0
- package/dist/types/types-1f7912d5.d.ts +219 -0
- package/dist/types/util-3a13ce3d.d.ts +21 -0
- package/dist/types/utils.d.ts +20 -0
- package/modern.config.ts +18 -0
- package/package.json +85 -0
- package/third-party-licenses.txt +415 -0
- package/tsconfig.json +22 -0
- package/vitest.config.ts +20 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
declare module 'midscene/demo_data';
|
|
@@ -0,0 +1,429 @@
|
|
|
1
|
+
// Bundler-emitted helpers backing object-spread (`{ ...a, ...b }`) in the original sources.
var __defProp = Object.defineProperty;
var __getOwnPropSymbols = Object.getOwnPropertySymbols;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __propIsEnum = Object.prototype.propertyIsEnumerable;

/**
 * Writes `value` under `key` on `obj`.
 * A key already reachable on `obj` is (re)defined as an enumerable, configurable,
 * writable data property; otherwise a plain assignment is used.
 */
var __defNormalProp = (obj, key, value) => {
  if (key in obj) {
    return __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value });
  }
  return obj[key] = value;
};

/**
 * Copies the own enumerable string-keyed and symbol-keyed properties of `b` onto `a`.
 * A falsy `b` is treated as an empty object.
 * @returns the mutated target `a`
 */
var __spreadValues = (a, b) => {
  if (!b) {
    b = {};
  }
  Object.keys(b).forEach((key) => {
    __defNormalProp(a, key, b[key]);
  });
  if (__getOwnPropSymbols) {
    __getOwnPropSymbols(b)
      .filter((symbol) => __propIsEnum.call(b, symbol))
      .forEach((symbol) => {
        __defNormalProp(a, symbol, b[symbol]);
      });
  }
  return a;
};
|
|
17
|
+
|
|
18
|
+
// src/ai-model/openai.ts
|
|
19
|
+
import assert from "assert";
|
|
20
|
+
import OpenAI from "openai";
|
|
21
|
+
import wrapper from "langsmith/wrappers";
|
|
22
|
+
// Names of the environment variables that configure this module.
var envConfigKey = "MIDSCENE_OPENAI_INIT_CONFIG_JSON";
var envModelKey = "MIDSCENE_MODEL_NAME";
var envSmithDebug = "MIDSCENE_LANGSMITH_DEBUG";

/**
 * Parses the JSON text taken from the `MIDSCENE_OPENAI_INIT_CONFIG_JSON` variable.
 * @param {string} raw - JSON text expected to describe an options object
 * @returns {object} the parsed options object
 * @throws {Error} naming the environment variable when the text is not valid JSON
 *   or does not decode to a plain (non-array, non-null) object
 */
function parseOpenAIInitConfig(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    // A bare SyntaxError at import time gives no hint about which setting is broken.
    throw new Error(`invalid JSON in environment variable ${envConfigKey}: ${err.message}`);
  }
  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    throw new Error(`environment variable ${envConfigKey} must contain a JSON object`);
  }
  return parsed;
}

// Extra constructor options for the OpenAI client; stays empty unless the env variable is set.
var extraConfig = {};
if (typeof process.env[envConfigKey] === "string") {
  console.log("will use env config for openai");
  extraConfig = parseOpenAIInitConfig(process.env[envConfigKey]);
}

// Model name sent with every completion request; overridable through the environment.
var model = "gpt-4o";
if (typeof process.env[envModelKey] === "string") {
  console.log(`will use model: ${process.env[envModelKey]}`);
  model = process.env[envModelKey];
}
|
|
35
|
+
/**
 * Creates the OpenAI client used for model calls.
 * The client is always constructed with `extraConfig`. When the
 * `MIDSCENE_LANGSMITH_DEBUG` environment variable is set to a non-empty value,
 * that same client is returned wrapped by `wrapper.wrapOpenAI`.
 * @returns {Promise<object>} the (possibly wrapped) client
 */
async function createOpenAI() {
  const openai = new OpenAI(extraConfig);
  if (process.env[envSmithDebug]) {
    console.log("DEBUGGING MODE: using langsmith wrapper");
    // Wrap the configured client: building a second `new OpenAI()` here would
    // silently drop the options supplied through the environment.
    return wrapper.wrapOpenAI(openai);
  }
  return openai;
}
|
|
44
|
+
/**
 * Sends `messages` to the chat-completions endpoint using the module-level `model`
 * and returns the text content of the first choice.
 * @param {Array} messages - chat messages forwarded unchanged to the API
 * @param {string} [responseFormat] - value for `response_format.type`; the field is
 *   omitted from the request when this argument is undefined
 * @returns {Promise<string>} the non-empty message content
 * @throws AssertionError when the response has no usable choice or empty content
 */
async function call(messages, responseFormat) {
  const openai = await createOpenAI();
  const request = { model, messages };
  if (typeof responseFormat !== "undefined") {
    request.response_format = { type: responseFormat };
  }
  const completion = await openai.chat.completions.create(request);
  // Guard explicitly so an empty `choices` array yields a readable failure
  // instead of "Cannot read properties of undefined".
  const firstChoice = completion.choices && completion.choices[0];
  assert(firstChoice && firstChoice.message, "empty choices in completion response");
  const { content } = firstChoice.message;
  assert(content, "empty content");
  return content;
}
|
|
55
|
+
/**
 * Requests a `json_object` formatted completion for `messages` and returns it parsed.
 * @throws AssertionError when the model returns nothing; SyntaxError when the text is not JSON
 */
async function callToGetJSONObject(messages) {
  const rawJSON = await call(messages, "json_object" /* JSON */);
  assert(rawJSON, "empty response");
  const parsed = JSON.parse(rawJSON);
  return parsed;
}
|
|
60
|
+
|
|
61
|
+
// src/ai-model/prompt/element_inspector.ts
|
|
62
|
+
/**
 * Builds the system prompt that asks the model to locate page elements.
 * @param description - the user's element description, interpolated verbatim into step 2
 * @param multi - truthy to ask for several matching elements, falsy for the single closest one
 * @returns {string} the prompt text (role, workflow, constraints, output format and one worked example)
 */
function systemPromptToFindElement(description, multi) {
  // Wording of workflow step 3 depends on how many matches the caller wants.
  const quantityRule = multi
    ? "multiple elements matching the description (two or more)"
    : "The element closest to the description (only one)";
  const prompt = `
## Role:
You are an expert in software page image (2D) and page element text analysis.

## Objective:
- Identify elements in screenshots and text that match the user's description.
- Return JSON data containing the selection reason and element ID.

## Skills:
- Image analysis and recognition
- Multilingual text understanding
- Software UI design and testing

## Workflow:
1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
2. Based on the description (${description}), locate the target element ID in the list of element descriptions and the screenshot.
3. Return the number of elements: ${quantityRule}.
4. Return JSON data containing the selection reason and element ID.

## Constraints:
- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
- If no elements are found, the "elements" array should be empty.
- The returned data must conform to the specified JSON format.

## Output Format:
\`\`\`json
{
"elements": [
// If no matching elements are found, return an empty array []
{
"reason": "xxx", // The thought process for finding the element, replace xxx with your thought process
"text": "xxx", // Replace xxx with the text of elementInfo, if none, leave empty
"id": "xxx" // Replace xxx with the ID of elementInfo
}
// More elements...
],
"errors": [] // Array of strings containing any error messages
}
\`\`\`

## Example:
Example 1:
Input Example:
\`\`\`json
// Description: "Shopping cart icon in the upper right corner"
{
"screenshot": "path/screenshot.png",
"text": '{
"pageSize": {
"width": 400, // Width of the page
"height": 905 // Height of the page
},
"elementInfos": [
{
"id": "3", // ID of the element
"attributes": { // Attributes of the element
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
"src": "https://ap-southeast-3.m",
"class": ".img"
},
"content": "", // Text content of the element
"rect": {
"left": 280, // Distance from the left side of the page
"top": 8, // Distance from the top of the page
"width": 44, // Width of the element
"height": 44 // Height of the element
}
},
{
"id": "4", // ID of the element
"attributes": { // Attributes of the element
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
"src": "data:image/png;base64,iVBORw0KGgoAAAANSU...",
"class": ".icon"
},
"content": "", // Text content of the element
"rect": {
"left": 350, // Distance from the left side of the page
"top": 16, // Distance from the top of the page
"width": 25, // Width of the element
"height": 25 // Height of the element
}
},
...
{
"id": "27",
"attributes": {
"nodeType": "TEXT Node",
"class": ".product-name"
},
"center": [
288,
834
],
"content": "Mango Drink",
"rect": {
"left": 188,
"top": 827,
"width": 199,
"height": 13
}
},
...
]
}
'
}
\`\`\`
Output Example:
\`\`\`json
{
"elements": [
{
// Describe the reason for finding this element, replace with actual value in practice
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
"text": "",
// ID of this element, replace with actual value in practice
"id": "4"
}
],
"errors": []
}
\`\`\`

`;
  return prompt;
}
|
|
191
|
+
|
|
192
|
+
// src/ai-model/prompt/util.ts
|
|
193
|
+
import assert4 from "assert";
|
|
194
|
+
|
|
195
|
+
// src/image/info.ts
|
|
196
|
+
import assert2 from "assert";
|
|
197
|
+
import { Buffer as Buffer2 } from "buffer";
|
|
198
|
+
import { readFileSync } from "fs";
|
|
199
|
+
import Sharp from "sharp";
|
|
200
|
+
/**
 * Reads the pixel dimensions of an image through Sharp's `metadata()`.
 * @param image - whatever `Sharp()` accepts as input (the callers in this file pass a Buffer)
 * @returns {Promise<{width: number, height: number}>}
 * @throws AssertionError when the metadata has no (or zero) width or height
 */
async function imageInfo(image) {
  const { width, height } = await Sharp(image).metadata();
  if (!(width && height)) {
    // Build the message only on failure: interpolating `image` up front would
    // stringify the entire buffer on every call, including successful ones.
    const label = typeof image === "string" ? image : "<binary image data>";
    assert2.fail(`invalid image: ${label}`);
  }
  return { width, height };
}
|
|
205
|
+
/**
 * Measures an image given as base64 text.
 * A leading `data:image/<type>;base64,` header is removed when present; the rest is
 * decoded into a Buffer and handed to `imageInfo`.
 */
async function imageInfoOfBase64(imageBase64) {
  const dataUrlHeader = /^data:image\/\w+;base64,/;
  const payload = imageBase64.replace(dataUrlHeader, "");
  const bytes = Buffer2.from(payload, "base64");
  return imageInfo(bytes);
}
|
|
209
|
+
|
|
210
|
+
// src/image/transform.ts
|
|
211
|
+
import { Buffer as Buffer3 } from "buffer";
|
|
212
|
+
import Sharp2 from "sharp";
|
|
213
|
+
|
|
214
|
+
// src/image/visualization.ts
|
|
215
|
+
import { Buffer as Buffer4 } from "buffer";
|
|
216
|
+
import Sharp3 from "sharp";
|
|
217
|
+
|
|
218
|
+
// src/utils.ts
|
|
219
|
+
import { tmpdir } from "os";
|
|
220
|
+
import { basename, join } from "path";
|
|
221
|
+
import { copyFileSync, existsSync, mkdirSync, readFileSync as readFileSync2, writeFileSync } from "fs";
|
|
222
|
+
import { randomUUID } from "crypto";
|
|
223
|
+
import assert3 from "assert";
|
|
224
|
+
var logDir = join(process.cwd(), "./midscene_run/");
|
|
225
|
+
|
|
226
|
+
// src/ai-model/prompt/util.ts
|
|
227
|
+
// Persona sentence placed at the top of the extraction system prompt.
var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";

// Explains to the model what the user message will contain.
var contextFormatIntro = "\nThe user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.";

// Keywords a data demand may use to ask for element ids instead of plain data.
var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";

// Description of the page-segmentation skill; one array entry per prompt line.
// The final empty entry keeps the trailing newline of the prompt text.
var skillSegment = [
  "skill name: segment_a_web_page",
  "skill content:",
  "Based on the functions and content of various elements on the page, segment the screenshot into different sections like navigation bar, product list, news area, etc.",
  "Some general rules for segmentation:",
  "* Each section should NOT overlap with each other.",
  "* Each text should only belong to one section.",
  "* [IMPORTANT] Whether the content visually appears to belong to different sections is a significant factor in segmenting the page.",
  "* Analyze the page in a top-to-bottom and left-to-right order.",
  "* The evidence indicates a separate section, for example",
  "- The background color of certain parts of the page changes.",
  "- A section of a page includes a title.",
  "* Provide the following data for each of the UI section you found.",
  "{",
  '"name": "name of the section",',
  '"description": "briefly summarize the key content or usage of this section.",',
  '"sectionCharacteristics": "In view of the need to distinguish this section from the surrounding sections, explain the characteristics and how to define boundaries and what precautions to take.",',
  '"textIds": ["5", "6", "7"], // ids of all text elements in this section',
  "}",
  "",
].join("\n");

// Description of the data-extraction skill, including how the locator keywords are answered.
var skillExtractData = `skill name: extract_data_from_UI
related input: DATA_DEMAND
skill content:
* User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
* There may be some special commands in DATA_DEMAND, please pay extra attention
- ${ONE_ELEMENT_LOCATOR_PREFIX} and ${ELEMENTS_LOCATOR_PREFIX}: if you see a description that mentions the keyword ${ONE_ELEMENT_LOCATOR_PREFIX} or ${ELEMENTS_LOCATOR_PREFIX}(e.g. follow ${ONE_ELEMENT_LOCATOR_PREFIX} : i want to find ...), it means user wants to locate a specific element meets the description. Return in this way: prefix + the id / comma-separated ids, for example: ${ONE_ELEMENT_LOCATOR_PREFIX}/1 , ${ELEMENTS_LOCATOR_PREFIX}/1,2,3 . If not found, keep the prefix and leave the suffix empty, like ${ONE_ELEMENT_LOCATOR_PREFIX}/ .`;
|
|
257
|
+
/**
 * Renders the "find these sections" instruction for the extraction prompt.
 * @param constraints - section constraints, each carrying a `name` and/or a `description`
 * @returns {string} an empty string when there are no constraints, otherwise the
 *   instruction line followed by one bullet per constraint
 * @throws AssertionError when a constraint has neither `name` nor `description`
 */
function promptsOfSectionQuery(constraints) {
  if (!constraints.length) {
    return "";
  }
  const bullets = constraints.map((constraint) => {
    assert4(
      constraint.name || constraint.description,
      "either `name` or `description` is required to define a section constraint"
    );
    const namePart = constraint.name ? `named \`${constraint.name}\`` : "";
    const usagePart = constraint.description ? `, usage or criteria : ${constraint.description}` : "";
    return `* One section ${namePart}${usagePart}`;
  });
  const header = "Use your segment_a_web_page skill to find the following section(s)";
  return [header, bullets.join("\n")].join("\n");
}
|
|
276
|
+
/**
 * Builds the system prompt for data extraction.
 * @param dataQuery - a string demand, or an object whose keys name the wanted fields
 * @param sections - optional section constraints; the segmentation skill and the
 *   `sections` return field are included only when at least one constraint has a `name`
 * @returns {string} the complete prompt text
 */
function systemPromptToExtract(dataQuery, sections) {
  const namedSectionCount = sections == null ? 0 : sections.filter((c) => c.name).length;
  const sectionFindingPrompt = promptsOfSectionQuery(sections || []);
  const segmentSkill = namedSectionCount ? skillSegment : "";
  const sectionReturnFormat = namedSectionCount
    ? " sections: [], // detailed information of each section from segment_a_web_page skill"
    : "";
  // Object demands get a hint listing the expected keys, then their pretty-printed JSON.
  const keyHint = typeof dataQuery === "object"
    ? `return in key-value style object, keys are ${Object.keys(dataQuery).join(",")}`
    : "";
  const demandBody = typeof dataQuery === "string" ? dataQuery : JSON.stringify(dataQuery, null, 2);
  return `
${characteristic}
${contextFormatIntro}

You have the following skills:
${segmentSkill}
${skillExtractData}

Now, do the following jobs:
${sectionFindingPrompt}
Use your extract_data_from_UI skill to find the following data, placing it in the \`data\` field
DATA_DEMAND start:
${keyHint};
${demandBody}
DATA_DEMAND ends.

Return in the following JSON format:
{
language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
${sectionReturnFormat}
data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
errors?: [], // string[], error message if any
}
`;
}
|
|
305
|
+
/** Formats a `{ width, height }` pair as the text `"<width> x <height>"`. */
function describeSize(size) {
  const { width, height } = size;
  return `${width} x ${height}`;
}
|
|
308
|
+
/**
 * Shortens text that is longer than 50 characters to its first 50 characters plus "...".
 * Falsy values and shorter text are returned unchanged.
 */
function truncateText(text) {
  const limit = 50;
  const isTooLong = Boolean(text) && text.length > limit;
  return isTooLong ? `${text.slice(0, limit)}...` : text;
}
|
|
315
|
+
/**
 * Produces the textual page description sent to the model, together with a
 * lookup from element id back to the original element.
 * @param context - object carrying `screenshotBase64` (measured for the page size)
 *   and `content` (the list of page elements, each with an `id`)
 * @returns {Promise<{description: string, elementById: Function}>}
 *   `description` lists the page size and the trimmed element infos;
 *   `elementById(id)` returns the element registered under that id, or undefined
 */
async function describeUserPage(context) {
  const { screenshotBase64 } = context;
  const { width, height } = await imageInfoOfBase64(screenshotBase64);
  const elementsInfo = context.content;
  // A Map keyed by the stringified id: unknown ids such as "constructor" must not
  // resolve to members inherited from Object.prototype.
  const idElementMap = new Map();
  elementsInfo.forEach((item) => {
    idElementMap.set(`${item.id}`, item);
  });
  // cropfieldInformation already returns JSON text, so it is embedded as-is;
  // stringifying it again would wrap the array in quotes and escape every quote inside.
  const elementInfosDescription = cropfieldInformation(elementsInfo);
  return {
    description: `
{
// The size of the page
"pageSize": ${describeSize({ width, height })},


// json description of the element
"elementInfos": ${elementInfosDescription}
}`,
    elementById(id) {
      assert4(typeof id !== "undefined", "id is required for query");
      return idElementMap.get(`${id}`);
    }
  };
}
|
|
342
|
+
/**
 * Serializes the element list for the prompt, keeping only `id`, `attributes`,
 * `rect` and `content`, with the content and every attribute value passed through `truncateText`.
 * @returns {string} JSON text of the reduced element list
 */
function cropfieldInformation(elementsInfo) {
  const reduced = elementsInfo.map((element) => {
    const { id, rect, content } = element;
    const attributes = element.attributes === undefined ? {} : element.attributes;
    const shortAttributes = {};
    for (const key of Object.keys(attributes)) {
      shortAttributes[key] = truncateText(attributes[key]);
    }
    return {
      id,
      attributes: shortAttributes,
      rect,
      content: truncateText(content)
    };
  });
  return JSON.stringify(reduced);
}
|
|
360
|
+
|
|
361
|
+
// src/ai-model/inspect.ts
|
|
362
|
+
/**
 * Asks the model to locate element(s) matching a description on the given page.
 * @param options.context - page context; must carry `screenshotBase64`
 * @param options.multi - forwarded to `systemPromptToFindElement`
 * @param options.findElementDescription - the user's description of the wanted element
 * @param [options.callAI] - model caller; defaults to `callToGetJSONObject`
 * @returns {Promise<{parseResult, elementById, systemPrompt}>} the caller's result,
 *   the id lookup from `describeUserPage`, and the system prompt that was sent
 */
async function AiInspectElement(options) {
  const {
    context,
    multi,
    findElementDescription,
    callAI: askModel = callToGetJSONObject
  } = options;
  const screenshot = context.screenshotBase64;
  const page = await describeUserPage(context);
  const systemPrompt = systemPromptToFindElement(findElementDescription, multi);

  // The user turn carries the screenshot (high detail) followed by the page description.
  const screenshotPart = {
    type: "image_url",
    image_url: { url: screenshot, detail: "high" }
  };
  const descriptionPart = { type: "text", text: page.description };
  const parseResult = await askModel([
    { role: "system", content: systemPrompt },
    { role: "user", content: [screenshotPart, descriptionPart] }
  ]);

  return { parseResult, elementById: page.elementById, systemPrompt };
}
|
|
393
|
+
/**
 * Asks the model to extract data (and optionally sections) from the given page.
 * @param options.dataQuery - the data demand forwarded to `systemPromptToExtract`
 * @param options.sectionConstraints - optional section constraints for the prompt
 * @param options.context - page context; must carry `screenshotBase64`
 * @param [options.callAI] - model caller; defaults to `callToGetJSONObject`
 * @returns {Promise<{parseResult, elementById, systemPrompt}>} the caller's result,
 *   the id lookup from `describeUserPage`, and the system prompt that was sent
 */
async function AiExtractElementInfo(options) {
  const {
    dataQuery,
    sectionConstraints,
    context,
    callAI: askModel = callToGetJSONObject
  } = options;
  const systemPrompt = systemPromptToExtract(dataQuery, sectionConstraints);
  const screenshot = context.screenshotBase64;
  const page = await describeUserPage(context);

  // The user turn carries the screenshot followed by the page description.
  const screenshotPart = { type: "image_url", image_url: { url: screenshot } };
  const descriptionPart = { type: "text", text: page.description };
  const parseResult = await askModel([
    { role: "system", content: systemPrompt },
    { role: "user", content: [screenshotPart, descriptionPart] }
  ]);

  return { parseResult, elementById: page.elementById, systemPrompt };
}
|
|
423
|
+
export {
|
|
424
|
+
AiExtractElementInfo,
|
|
425
|
+
AiInspectElement,
|
|
426
|
+
callToGetJSONObject,
|
|
427
|
+
describeUserPage,
|
|
428
|
+
systemPromptToFindElement
|
|
429
|
+
};
|
package/dist/es/image.js
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
// src/image/info.ts
|
|
2
|
+
import assert from "assert";
|
|
3
|
+
import { Buffer } from "buffer";
|
|
4
|
+
import { readFileSync } from "fs";
|
|
5
|
+
import Sharp from "sharp";
|
|
6
|
+
/**
 * Reads the pixel dimensions of an image through Sharp's `metadata()`.
 * @param image - whatever `Sharp()` accepts as input
 * @returns {Promise<{width: number, height: number}>}
 * @throws AssertionError when the metadata has no (or zero) width or height
 */
async function imageInfo(image) {
  const { width, height } = await Sharp(image).metadata();
  if (!(width && height)) {
    // Build the message only on failure: interpolating `image` up front would
    // stringify the entire buffer on every call, including successful ones.
    const label = typeof image === "string" ? image : "<binary image data>";
    assert.fail(`invalid image: ${label}`);
  }
  return { width, height };
}
|
|
11
|
+
/**
 * Measures an image given as base64 text.
 * A leading `data:image/<type>;base64,` header is removed when present; the rest is
 * decoded into a Buffer and handed to `imageInfo`.
 */
async function imageInfoOfBase64(imageBase64) {
  const dataUrlHeader = /^data:image\/\w+;base64,/;
  const payload = imageBase64.replace(dataUrlHeader, "");
  const bytes = Buffer.from(payload, "base64");
  return imageInfo(bytes);
}
|
|
15
|
+
/**
 * Reads an image file and returns its content base64-encoded.
 * @param {string} image - path of the file to read
 * @param {boolean} [withHeader=true] - when true, prefix the result with a
 *   `data:image/...;base64,` header chosen from the file name's ending
 * @returns {string} the base64 text, with or without the data-URL header
 * @throws {Error} "unsupported image type" when a header is requested and the name
 *   does not end in png, jpg or jpeg (compared case-insensitively)
 */
function base64Encoded(image, withHeader = true) {
  const payload = readFileSync(image).toString("base64");
  if (!withHeader) {
    return payload;
  }
  // Compare on the lower-cased name so "shot.PNG" and "photo.JPG" are accepted too.
  const lowerName = image.toLowerCase();
  if (lowerName.endsWith("png")) {
    return `data:image/png;base64,${payload}`;
  }
  if (lowerName.endsWith("jpg") || lowerName.endsWith("jpeg")) {
    return `data:image/jpeg;base64,${payload}`;
  }
  throw new Error("unsupported image type");
}
|
|
27
|
+
|
|
28
|
+
// src/image/transform.ts
|
|
29
|
+
import { Buffer as Buffer2 } from "buffer";
|
|
30
|
+
import Sharp2 from "sharp";
|
|
31
|
+
/**
 * Decodes base64 image data and writes it to `outputPath` through Sharp.
 * @param options.base64Data - base64 text, optionally preceded by a `...;base64,` header
 * @param options.outputPath - destination handed to Sharp's `toFile`
 */
async function saveBase64Image(options) {
  const { base64Data, outputPath } = options;
  // Keep only what follows the last ";base64," marker; fall back to the raw input
  // when that remainder is empty.
  const segments = base64Data.split(";base64,");
  const payload = segments[segments.length - 1] || base64Data;
  const bytes = Buffer2.from(payload, "base64");
  await Sharp2(bytes).toFile(outputPath);
  console.log("Image successfully written to file.");
}
|
|
38
|
+
/**
 * Loads the image at `inputPath` with Sharp and returns its buffer base64-encoded
 * (no data-URL header).
 */
async function transformImgPathToBase64(inputPath) {
  const imageBytes = await Sharp2(inputPath).toBuffer();
  return imageBytes.toString("base64");
}
|
|
44
|
+
/**
 * Resizes a base64 image to the dimensions chosen by `calculateNewDimensions`.
 * @param base64Data - base64 text, optionally preceded by a `...;base64,` header
 * @returns {Promise<string>} the resized image as base64 text (no header)
 * @throws {Error} when Sharp reports no width or height for the input
 */
async function resizeImg(base64Data) {
  const segments = base64Data.split(";base64,");
  const payload = segments[segments.length - 1] || base64Data;
  const sourceBytes = Buffer2.from(payload, "base64");

  const { width, height } = await Sharp2(sourceBytes).metadata();
  if (!width || !height) {
    throw Error("undefined width or height with url");
  }

  const target = calculateNewDimensions(width, height);
  const resizedBytes = await Sharp2(sourceBytes).resize(target.width, target.height).toBuffer();
  return resizedBytes.toString("base64");
}
|
|
58
|
+
/**
 * Scales dimensions down, preserving aspect ratio, so that the width does not
 * exceed 768 and the height does not exceed 2048. Sizes already within both
 * limits are returned as-is (rounded to integers).
 * @returns {{width: number, height: number}} rounded target dimensions
 */
function calculateNewDimensions(originalWidth, originalHeight) {
  const WIDTH_CAP = 768;
  const HEIGHT_CAP = 2048;
  const aspectRatio = originalWidth / originalHeight;

  let width = originalWidth;
  let height = originalHeight;
  // First clamp the width, then re-check the resulting height.
  if (width > WIDTH_CAP) {
    width = WIDTH_CAP;
    height = width / aspectRatio;
  }
  if (height > HEIGHT_CAP) {
    height = HEIGHT_CAP;
    width = height * aspectRatio;
  }

  return { width: Math.round(width), height: Math.round(height) };
}
|
|
77
|
+
/**
 * Runs Sharp's `trim()` on the image and reports the trim offsets and resulting size.
 * @returns {Promise<null | {trimOffsetLeft, trimOffsetTop, width, height}>}
 *   null when Sharp's output info lacks either trim offset
 */
async function trimImage(image) {
  const trimmed = await Sharp2(image).trim().toBuffer({ resolveWithObject: true });
  const { trimOffsetLeft, trimOffsetTop, width, height } = trimmed.info;
  const hasOffsets = typeof trimOffsetLeft !== "undefined" && typeof trimOffsetTop !== "undefined";
  if (!hasOffsets) {
    return null;
  }
  return { trimOffsetLeft, trimOffsetTop, width, height };
}
|
|
91
|
+
/**
 * Extracts the `center` rectangle from the image, trims it, and returns the
 * rectangle adjusted by the trim result.
 * @param image - input accepted by Sharp
 * @param center - `{ left, top, width, height }` region passed to Sharp's `extract`
 * @returns the same `center` object when `trimImage` yields null; otherwise a new
 *   rectangle whose left/top are `center`'s minus the trim offsets and whose
 *   width/height are the trimmed size
 */
async function alignCoordByTrim(image, center) {
  const cropped = await Sharp2(image).extract(center).toBuffer();
  const trimResult = await trimImage(cropped);
  if (trimResult) {
    const { trimOffsetLeft, trimOffsetTop, width, height } = trimResult;
    return {
      left: center.left - trimOffsetLeft,
      top: center.top - trimOffsetTop,
      width,
      height
    };
  }
  return center;
}
|
|
104
|
+
|
|
105
|
+
// src/image/visualization.ts
|
|
106
|
+
import { Buffer as Buffer3 } from "buffer";
|
|
107
|
+
import Sharp3 from "sharp";
|
|
108
|
+
|
|
109
|
+
// src/utils.ts
|
|
110
|
+
import { tmpdir } from "os";
|
|
111
|
+
import { basename, join } from "path";
|
|
112
|
+
import { copyFileSync, existsSync, mkdirSync, readFileSync as readFileSync2, writeFileSync } from "fs";
|
|
113
|
+
import { randomUUID } from "crypto";
|
|
114
|
+
import assert2 from "assert";
|
|
115
|
+
// Cache for the package info once it has been read from disk.
var pkg;

/**
 * Returns the `name` and `version` found in a nearby package.json.
 * Two locations relative to this module's directory are probed in order:
 * `../package.json`, then `../../../package.json`. The first hit is parsed and cached.
 * @returns {{name: string, version: string}} the package info, or the placeholder
 *   `midscene-unknown-page-name` / `0.0.0` when no file is found
 */
function getPkgInfo() {
  if (pkg) {
    return pkg;
  }
  // `__dirname` is a CommonJS binding and is not defined when this file is loaded
  // as an ES module; probing it with `typeof` avoids a ReferenceError there.
  // NOTE(review): in that case the placeholder is returned — confirm whether an
  // `import.meta.url`-based lookup is wanted instead.
  const baseDir = typeof __dirname === "string" ? __dirname : "";
  const candidates = baseDir
    ? [join(baseDir, "../package.json"), join(baseDir, "../../../package.json")]
    : [];
  const pkgJsonFile = candidates.find((file) => existsSync(file));
  if (!pkgJsonFile) {
    return {
      name: "midscene-unknown-page-name",
      version: "0.0.0"
    };
  }
  const { name, version } = JSON.parse(readFileSync2(pkgJsonFile, "utf-8"));
  pkg = { name, version };
  return pkg;
}
|
|
137
|
+
var logDir = join(process.cwd(), "./midscene_run/");
|
|
138
|
+
/**
 * Returns a directory named after the package inside the OS temp directory,
 * creating it (recursively) on each call.
 */
function getTmpDir() {
  const { name: packageName } = getPkgInfo();
  const dir = join(tmpdir(), packageName);
  mkdirSync(dir, { recursive: true });
  return dir;
}
|
|
143
|
+
/**
 * Builds a fresh file path inside the package temp directory: a random UUID as
 * the base name plus the given extension. The file itself is not created.
 * @param fileExt - extension without the leading dot
 */
function getTmpFile(fileExt) {
  const uniqueName = [randomUUID(), fileExt].map(String).join(".");
  return join(getTmpDir(), uniqueName);
}
|
|
147
|
+
|
|
148
|
+
// src/image/visualization.ts
|
|
149
|
+
// Palette cycled through when drawing sections; every entry must have a distinct
// hex value so neighbouring palette slots never render identically.
var colors = [
  { name: "Red", hex: "#FF0000" },
  { name: "Green", hex: "#00FF00" },
  { name: "Blue", hex: "#0000FF" },
  { name: "Yellow", hex: "#FFFF00" },
  { name: "Cyan", hex: "#00FFFF" },
  { name: "Magenta", hex: "#FF00FF" },
  { name: "Orange", hex: "#FFA500" },
  { name: "Purple", hex: "#800080" },
  { name: "Brown", hex: "#A52A2A" },
  { name: "Pink", hex: "#FFC0CB" },
  { name: "Light Blue", hex: "#ADD8E6" },
  // Previously "#00FF00", which duplicated "Green" above.
  { name: "Lime", hex: "#BFFF00" },
  { name: "Violet", hex: "#EE82EE" },
  { name: "Gold", hex: "#FFD700" },
  { name: "Teal", hex: "#008080" }
];
|
|
211
|
+
// Longest allowed canvas edge, and the font size used for section labels.
var sizeLimit = 512;
var textFontSize = 12;

// Escapes the characters that would otherwise break SVG/XML text content.
function escapeSvgText(value) {
  return String(value)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}

/**
 * Draws one filled, labelled rectangle per section on a white canvas and writes
 * the result to a temporary PNG file.
 * The canvas is the screenshot size scaled down (never up) so that neither edge
 * exceeds `sizeLimit`; section rects are scaled by the same ratio.
 * @param sections - items with a `name` and a `rect` of `{ left, top, width, height }`
 * @param context - object carrying the base64 screenshot in `screenshotBase64`
 * @returns {Promise<{file: string, sectionNameColorMap: object}>} the PNG path and
 *   the palette entry assigned to each section name
 */
async function composeSectionDiagram(sections, context) {
  // The screenshot is base64 text, so it has to be decoded before measuring;
  // handing the string to `imageInfo` would make Sharp treat it as a file path.
  const { width, height } = await imageInfoOfBase64(context.screenshotBase64);
  const ratio = Math.min(sizeLimit / width, sizeLimit / height, 1);
  // Sharp's `create` option needs whole, positive pixel counts.
  const canvasWidth = Math.max(1, Math.round(width * ratio));
  const canvasHeight = Math.max(1, Math.round(height * ratio));
  const sectionNameColorMap = {};
  const rects = sections.map((section, index) => {
    const { left, top, width: width2, height: height2 } = section.rect;
    const color = colors[index % colors.length];
    sectionNameColorMap[section.name] = color;
    return `
<rect x="${left * ratio}" y="${top * ratio}" width="${width2 * ratio}" height="${height2 * ratio}" fill="${color.hex}" />
<text x="${left * ratio}" y="${top * ratio + textFontSize}" font-family="Arial" font-size="${textFontSize}" fill="black">
${escapeSvgText(section.name)}
</text>
`;
  });
  const rectangles = `
<svg width="${canvasWidth}" height="${canvasHeight}">
${rects.join("\n")}
</svg>
`;
  const svgBuffer = Buffer3.from(rectangles);
  const file = getTmpFile("png");
  await Sharp3({
    create: {
      width: canvasWidth,
      height: canvasHeight,
      channels: 4,
      background: { r: 255, g: 255, b: 255, alpha: 1 }
    }
  }).composite([{ input: svgBuffer }]).png().toFile(file);
  return {
    file,
    sectionNameColorMap
  };
}
|
|
250
|
+
export {
|
|
251
|
+
alignCoordByTrim,
|
|
252
|
+
base64Encoded,
|
|
253
|
+
calculateNewDimensions,
|
|
254
|
+
composeSectionDiagram,
|
|
255
|
+
imageInfo,
|
|
256
|
+
imageInfoOfBase64,
|
|
257
|
+
resizeImg,
|
|
258
|
+
saveBase64Image,
|
|
259
|
+
transformImgPathToBase64,
|
|
260
|
+
trimImage
|
|
261
|
+
};
|