@rpascene/core 0.30.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +9 -0
- package/dist/es/agent/agent.mjs +636 -0
- package/dist/es/agent/agent.mjs.map +1 -0
- package/dist/es/agent/common.mjs +0 -0
- package/dist/es/agent/index.mjs +6 -0
- package/dist/es/agent/task-cache.mjs +184 -0
- package/dist/es/agent/task-cache.mjs.map +1 -0
- package/dist/es/agent/tasks.mjs +666 -0
- package/dist/es/agent/tasks.mjs.map +1 -0
- package/dist/es/agent/ui-utils.mjs +72 -0
- package/dist/es/agent/ui-utils.mjs.map +1 -0
- package/dist/es/agent/utils.mjs +162 -0
- package/dist/es/agent/utils.mjs.map +1 -0
- package/dist/es/ai-model/action-executor.mjs +129 -0
- package/dist/es/ai-model/action-executor.mjs.map +1 -0
- package/dist/es/ai-model/common.mjs +355 -0
- package/dist/es/ai-model/common.mjs.map +1 -0
- package/dist/es/ai-model/conversation-history.mjs +58 -0
- package/dist/es/ai-model/conversation-history.mjs.map +1 -0
- package/dist/es/ai-model/index.mjs +11 -0
- package/dist/es/ai-model/inspect.mjs +286 -0
- package/dist/es/ai-model/inspect.mjs.map +1 -0
- package/dist/es/ai-model/llm-planning.mjs +140 -0
- package/dist/es/ai-model/llm-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/assertion.mjs +31 -0
- package/dist/es/ai-model/prompt/assertion.mjs.map +1 -0
- package/dist/es/ai-model/prompt/common.mjs +7 -0
- package/dist/es/ai-model/prompt/common.mjs.map +1 -0
- package/dist/es/ai-model/prompt/describe.mjs +44 -0
- package/dist/es/ai-model/prompt/describe.mjs.map +1 -0
- package/dist/es/ai-model/prompt/extraction.mjs +140 -0
- package/dist/es/ai-model/prompt/extraction.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-locator.mjs +275 -0
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-planning.mjs +367 -0
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-section-locator.mjs +47 -0
- package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs +117 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/ui-tars-locator.mjs +34 -0
- package/dist/es/ai-model/prompt/ui-tars-locator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/ui-tars-planning.mjs +36 -0
- package/dist/es/ai-model/prompt/ui-tars-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/util.mjs +124 -0
- package/dist/es/ai-model/prompt/util.mjs.map +1 -0
- package/dist/es/ai-model/prompt/yaml-generator.mjs +219 -0
- package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -0
- package/dist/es/ai-model/service-caller/index.mjs +537 -0
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -0
- package/dist/es/ai-model/ui-tars-planning.mjs +201 -0
- package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -0
- package/dist/es/device/index.mjs +152 -0
- package/dist/es/device/index.mjs.map +1 -0
- package/dist/es/image/index.mjs +2 -0
- package/dist/es/index.mjs +11 -0
- package/dist/es/index.mjs.map +1 -0
- package/dist/es/insight/index.mjs +233 -0
- package/dist/es/insight/index.mjs.map +1 -0
- package/dist/es/insight/utils.mjs +15 -0
- package/dist/es/insight/utils.mjs.map +1 -0
- package/dist/es/report.mjs +88 -0
- package/dist/es/report.mjs.map +1 -0
- package/dist/es/tree.mjs +2 -0
- package/dist/es/types.mjs +11 -0
- package/dist/es/types.mjs.map +1 -0
- package/dist/es/utils.mjs +204 -0
- package/dist/es/utils.mjs.map +1 -0
- package/dist/es/yaml/builder.mjs +13 -0
- package/dist/es/yaml/builder.mjs.map +1 -0
- package/dist/es/yaml/index.mjs +3 -0
- package/dist/es/yaml/player.mjs +372 -0
- package/dist/es/yaml/player.mjs.map +1 -0
- package/dist/es/yaml/utils.mjs +73 -0
- package/dist/es/yaml/utils.mjs.map +1 -0
- package/dist/es/yaml.mjs +0 -0
- package/dist/lib/agent/agent.js +683 -0
- package/dist/lib/agent/agent.js.map +1 -0
- package/dist/lib/agent/common.js +5 -0
- package/dist/lib/agent/index.js +81 -0
- package/dist/lib/agent/index.js.map +1 -0
- package/dist/lib/agent/task-cache.js +236 -0
- package/dist/lib/agent/task-cache.js.map +1 -0
- package/dist/lib/agent/tasks.js +703 -0
- package/dist/lib/agent/tasks.js.map +1 -0
- package/dist/lib/agent/ui-utils.js +121 -0
- package/dist/lib/agent/ui-utils.js.map +1 -0
- package/dist/lib/agent/utils.js +233 -0
- package/dist/lib/agent/utils.js.map +1 -0
- package/dist/lib/ai-model/action-executor.js +163 -0
- package/dist/lib/ai-model/action-executor.js.map +1 -0
- package/dist/lib/ai-model/common.js +461 -0
- package/dist/lib/ai-model/common.js.map +1 -0
- package/dist/lib/ai-model/conversation-history.js +92 -0
- package/dist/lib/ai-model/conversation-history.js.map +1 -0
- package/dist/lib/ai-model/index.js +131 -0
- package/dist/lib/ai-model/index.js.map +1 -0
- package/dist/lib/ai-model/inspect.js +326 -0
- package/dist/lib/ai-model/inspect.js.map +1 -0
- package/dist/lib/ai-model/llm-planning.js +174 -0
- package/dist/lib/ai-model/llm-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/assertion.js +65 -0
- package/dist/lib/ai-model/prompt/assertion.js.map +1 -0
- package/dist/lib/ai-model/prompt/common.js +41 -0
- package/dist/lib/ai-model/prompt/common.js.map +1 -0
- package/dist/lib/ai-model/prompt/describe.js +78 -0
- package/dist/lib/ai-model/prompt/describe.js.map +1 -0
- package/dist/lib/ai-model/prompt/extraction.js +180 -0
- package/dist/lib/ai-model/prompt/extraction.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-locator.js +315 -0
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-planning.js +407 -0
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-section-locator.js +84 -0
- package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js +178 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -0
- package/dist/lib/ai-model/prompt/ui-tars-locator.js +68 -0
- package/dist/lib/ai-model/prompt/ui-tars-locator.js.map +1 -0
- package/dist/lib/ai-model/prompt/ui-tars-planning.js +73 -0
- package/dist/lib/ai-model/prompt/ui-tars-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/util.js +176 -0
- package/dist/lib/ai-model/prompt/util.js.map +1 -0
- package/dist/lib/ai-model/prompt/yaml-generator.js +280 -0
- package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -0
- package/dist/lib/ai-model/service-caller/index.js +623 -0
- package/dist/lib/ai-model/service-caller/index.js.map +1 -0
- package/dist/lib/ai-model/ui-tars-planning.js +238 -0
- package/dist/lib/ai-model/ui-tars-planning.js.map +1 -0
- package/dist/lib/device/index.js +255 -0
- package/dist/lib/device/index.js.map +1 -0
- package/dist/lib/image/index.js +56 -0
- package/dist/lib/image/index.js.map +1 -0
- package/dist/lib/index.js +103 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/insight/index.js +267 -0
- package/dist/lib/insight/index.js.map +1 -0
- package/dist/lib/insight/utils.js +49 -0
- package/dist/lib/insight/utils.js.map +1 -0
- package/dist/lib/report.js +122 -0
- package/dist/lib/report.js.map +1 -0
- package/dist/lib/tree.js +44 -0
- package/dist/lib/tree.js.map +1 -0
- package/dist/lib/types.js +82 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/utils.js +283 -0
- package/dist/lib/utils.js.map +1 -0
- package/dist/lib/yaml/builder.js +57 -0
- package/dist/lib/yaml/builder.js.map +1 -0
- package/dist/lib/yaml/index.js +80 -0
- package/dist/lib/yaml/index.js.map +1 -0
- package/dist/lib/yaml/player.js +406 -0
- package/dist/lib/yaml/player.js.map +1 -0
- package/dist/lib/yaml/utils.js +126 -0
- package/dist/lib/yaml/utils.js.map +1 -0
- package/dist/lib/yaml.js +20 -0
- package/dist/lib/yaml.js.map +1 -0
- package/dist/types/agent/agent.d.ts +156 -0
- package/dist/types/agent/common.d.ts +0 -0
- package/dist/types/agent/index.d.ts +9 -0
- package/dist/types/agent/task-cache.d.ts +48 -0
- package/dist/types/agent/tasks.d.ts +48 -0
- package/dist/types/agent/ui-utils.d.ts +7 -0
- package/dist/types/agent/utils.d.ts +52 -0
- package/dist/types/ai-model/action-executor.d.ts +19 -0
- package/dist/types/ai-model/common.d.ts +569 -0
- package/dist/types/ai-model/conversation-history.d.ts +18 -0
- package/dist/types/ai-model/index.d.ts +13 -0
- package/dist/types/ai-model/inspect.d.ts +46 -0
- package/dist/types/ai-model/llm-planning.d.ts +11 -0
- package/dist/types/ai-model/prompt/assertion.d.ts +2 -0
- package/dist/types/ai-model/prompt/common.d.ts +2 -0
- package/dist/types/ai-model/prompt/describe.d.ts +1 -0
- package/dist/types/ai-model/prompt/extraction.d.ts +4 -0
- package/dist/types/ai-model/prompt/llm-locator.d.ts +9 -0
- package/dist/types/ai-model/prompt/llm-planning.d.ts +9 -0
- package/dist/types/ai-model/prompt/llm-section-locator.d.ts +6 -0
- package/dist/types/ai-model/prompt/playwright-generator.d.ts +26 -0
- package/dist/types/ai-model/prompt/ui-tars-locator.d.ts +1 -0
- package/dist/types/ai-model/prompt/ui-tars-planning.d.ts +2 -0
- package/dist/types/ai-model/prompt/util.d.ts +47 -0
- package/dist/types/ai-model/prompt/yaml-generator.d.ts +100 -0
- package/dist/types/ai-model/service-caller/index.d.ts +48 -0
- package/dist/types/ai-model/ui-tars-planning.d.ts +59 -0
- package/dist/types/device/index.d.ts +2158 -0
- package/dist/types/image/index.d.ts +1 -0
- package/dist/types/index.d.ts +12 -0
- package/dist/types/insight/index.d.ts +31 -0
- package/dist/types/insight/utils.d.ts +2 -0
- package/dist/types/report.d.ts +12 -0
- package/dist/types/tree.d.ts +1 -0
- package/dist/types/types.d.ts +414 -0
- package/dist/types/utils.d.ts +40 -0
- package/dist/types/yaml/builder.d.ts +2 -0
- package/dist/types/yaml/index.d.ts +3 -0
- package/dist/types/yaml/player.d.ts +34 -0
- package/dist/types/yaml/utils.d.ts +9 -0
- package/dist/types/yaml.d.ts +178 -0
- package/package.json +108 -0
package/dist/es/ai-model/inspect.mjs
@@ -0,0 +1,286 @@
+import { cropByRect, paddingToMatchBlockByBase64, preProcessImageUrl } from "@rpascene/shared/img";
+import { getDebug } from "@rpascene/shared/logger";
+import { assert } from "@rpascene/shared/utils";
+import { AIActionType, adaptBboxToRect, expandSearchArea, markupImageForLLM, mergeRects } from "./common.mjs";
+import { extractDataQueryPrompt, systemPromptToExtract } from "./prompt/extraction.mjs";
+import { findElementPrompt, systemPromptToLocateElement } from "./prompt/llm-locator.mjs";
+import { sectionLocatorInstruction, systemPromptToLocateSection } from "./prompt/llm-section-locator.mjs";
+import { describeUserPage, distance, distanceThreshold, elementByPositionWithElementInfo } from "./prompt/util.mjs";
+import { callAIWithObjectResponse } from "./service-caller/index.mjs";
+const debugInspect = getDebug('ai:inspect');
+const debugSection = getDebug('ai:section');
+const extraTextFromUserPrompt = (prompt)=>{
+    if ('string' == typeof prompt) return prompt;
+    return prompt.prompt;
+};
+const promptsToChatParam = async (multimodalPrompt)=>{
+    var _multimodalPrompt_images;
+    const msgs = [];
+    if (null == multimodalPrompt ? void 0 : null == (_multimodalPrompt_images = multimodalPrompt.images) ? void 0 : _multimodalPrompt_images.length) {
+        msgs.push({
+            role: 'user',
+            content: [
+                {
+                    type: 'text',
+                    text: 'Next, I will provide all the reference images.'
+                }
+            ]
+        });
+        for (const item of multimodalPrompt.images){
+            const base64 = await preProcessImageUrl(item.url, !!multimodalPrompt.convertHttpImage2Base64);
+            msgs.push({
+                role: 'user',
+                content: [
+                    {
+                        type: 'text',
+                        text: `reference image ${item.name}:`
+                    }
+                ]
+            });
+            msgs.push({
+                role: 'user',
+                content: [
+                    {
+                        type: 'image_url',
+                        image_url: {
+                            url: base64,
+                            detail: 'high'
+                        }
+                    }
+                ]
+            });
+        }
+    }
+    return msgs;
+};
+async function AiLocateElement(options) {
+    const { context, targetElementDescription, callAIFn, modelConfig } = options;
+    const { vlMode } = modelConfig;
+    const { screenshotBase64 } = context;
+    const { description, elementById, insertElementByPosition } = await describeUserPage(context, {
+        vlMode
+    });
+    assert(targetElementDescription, "cannot find the target element description");
+    const userInstructionPrompt = await findElementPrompt.format({
+        pageDescription: description,
+        targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
+    });
+    const systemPrompt = systemPromptToLocateElement(vlMode);
+    let imagePayload = screenshotBase64;
+    let imageWidth = context.size.width;
+    let imageHeight = context.size.height;
+    let originalImageWidth = imageWidth;
+    let originalImageHeight = imageHeight;
+    if (options.searchConfig) {
+        var _options_searchConfig_rect, _options_searchConfig_rect1;
+        assert(options.searchConfig.rect, 'searchArea is provided but its rect cannot be found. Failed to locate element');
+        assert(options.searchConfig.imageBase64, 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element');
+        imagePayload = options.searchConfig.imageBase64;
+        imageWidth = null == (_options_searchConfig_rect = options.searchConfig.rect) ? void 0 : _options_searchConfig_rect.width;
+        imageHeight = null == (_options_searchConfig_rect1 = options.searchConfig.rect) ? void 0 : _options_searchConfig_rect1.height;
+        originalImageWidth = imageWidth;
+        originalImageHeight = imageHeight;
+    } else if ('qwen-vl' === vlMode) {
+        const paddedResult = await paddingToMatchBlockByBase64(imagePayload);
+        imageWidth = paddedResult.width;
+        imageHeight = paddedResult.height;
+        imagePayload = paddedResult.imageBase64;
+    } else if ('qwen3-vl' === vlMode) ;
+    else if (!vlMode) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
+    const msgs = [
+        {
+            role: 'system',
+            content: systemPrompt
+        },
+        {
+            role: 'user',
+            content: [
+                {
+                    type: 'image_url',
+                    image_url: {
+                        url: imagePayload,
+                        detail: 'high'
+                    }
+                },
+                {
+                    type: 'text',
+                    text: userInstructionPrompt
+                }
+            ]
+        }
+    ];
+    if ('string' != typeof targetElementDescription) {
+        const addOns = await promptsToChatParam({
+            images: targetElementDescription.images,
+            convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64
+        });
+        msgs.push(...addOns);
+    }
+    const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT, modelConfig);
+    const rawResponse = JSON.stringify(res.content);
+    let resRect;
+    let matchedElements = 'elements' in res.content ? res.content.elements : [];
+    let errors = 'errors' in res.content ? res.content.errors : [];
+    try {
+        if ('bbox' in res.content && Array.isArray(res.content.bbox)) {
+            var _options_searchConfig_rect2, _options_searchConfig, _options_searchConfig_rect3, _options_searchConfig1;
+            resRect = adaptBboxToRect(res.content.bbox, imageWidth, imageHeight, null == (_options_searchConfig = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect2 = _options_searchConfig.rect) ? void 0 : _options_searchConfig_rect2.left, null == (_options_searchConfig1 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect3 = _options_searchConfig1.rect) ? void 0 : _options_searchConfig_rect3.top, originalImageWidth, originalImageHeight, vlMode);
+            debugInspect('resRect', resRect);
+            const rectCenter = {
+                x: resRect.left + resRect.width / 2,
+                y: resRect.top + resRect.height / 2
+            };
+            let element = elementByPositionWithElementInfo(context.tree, rectCenter);
+            const distanceToCenter = element ? distance({
+                x: element.center[0],
+                y: element.center[1]
+            }, rectCenter) : 0;
+            if (!element || distanceToCenter > distanceThreshold) element = insertElementByPosition(rectCenter);
+            if (element) {
+                matchedElements = [
+                    element
+                ];
+                errors = [];
+            }
+        }
+    } catch (e) {
+        const msg = e instanceof Error ? `Failed to parse bbox: ${e.message}` : 'unknown error in locate';
+        if (errors && (null == errors ? void 0 : errors.length) !== 0) errors.push(`(${msg})`);
+        else errors = [
+            msg
+        ];
+    }
+    return {
+        rect: resRect,
+        parseResult: {
+            elements: matchedElements,
+            errors
+        },
+        rawResponse,
+        elementById,
+        usage: res.usage,
+        isOrderSensitive: 'object' == typeof res.content && null !== res.content && 'isOrderSensitive' in res.content ? res.content.isOrderSensitive : void 0
+    };
+}
+async function AiLocateSection(options) {
+    const { context, sectionDescription, modelConfig } = options;
+    const { vlMode } = modelConfig;
+    const { screenshotBase64 } = context;
+    const systemPrompt = systemPromptToLocateSection(vlMode);
+    const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
+        sectionDescription: extraTextFromUserPrompt(sectionDescription)
+    });
+    const msgs = [
+        {
+            role: 'system',
+            content: systemPrompt
+        },
+        {
+            role: 'user',
+            content: [
+                {
+                    type: 'image_url',
+                    image_url: {
+                        url: screenshotBase64,
+                        detail: 'high'
+                    }
+                },
+                {
+                    type: 'text',
+                    text: sectionLocatorInstructionText
+                }
+            ]
+        }
+    ];
+    if ('string' != typeof sectionDescription) {
+        const addOns = await promptsToChatParam({
+            images: sectionDescription.images,
+            convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64
+        });
+        msgs.push(...addOns);
+    }
+    const result = await callAIWithObjectResponse(msgs, AIActionType.EXTRACT_DATA, modelConfig);
+    let sectionRect;
+    const sectionBbox = result.content.bbox;
+    if (sectionBbox) {
+        const targetRect = adaptBboxToRect(sectionBbox, context.size.width, context.size.height, 0, 0, context.size.width, context.size.height, vlMode);
+        debugSection('original targetRect %j', targetRect);
+        const referenceBboxList = result.content.references_bbox || [];
+        debugSection('referenceBboxList %j', referenceBboxList);
+        const referenceRects = referenceBboxList.filter((bbox)=>Array.isArray(bbox)).map((bbox)=>adaptBboxToRect(bbox, context.size.width, context.size.height, 0, 0, context.size.width, context.size.height, vlMode));
+        debugSection('referenceRects %j', referenceRects);
+        const mergedRect = mergeRects([
+            targetRect,
+            ...referenceRects
+        ]);
+        debugSection('mergedRect %j', mergedRect);
+        sectionRect = expandSearchArea(mergedRect, context.size, vlMode);
+        debugSection('expanded sectionRect %j', sectionRect);
+    }
+    let imageBase64 = screenshotBase64;
+    if (sectionRect) {
+        const croppedResult = await cropByRect(screenshotBase64, sectionRect, 'qwen-vl' === vlMode);
+        imageBase64 = croppedResult.imageBase64;
+        sectionRect.width = croppedResult.width;
+        sectionRect.height = croppedResult.height;
+    }
+    return {
+        rect: sectionRect,
+        imageBase64,
+        error: result.content.error,
+        rawResponse: JSON.stringify(result.content),
+        usage: result.usage
+    };
+}
+async function AiExtractElementInfo(options) {
+    const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } = options;
+    const { vlMode } = modelConfig;
+    const systemPrompt = systemPromptToExtract();
+    const { screenshotBase64 } = context;
+    const { description, elementById } = await describeUserPage(context, {
+        truncateTextLength: 200,
+        filterNonTextContent: false,
+        visibleOnly: false,
+        domIncluded: null == extractOption ? void 0 : extractOption.domIncluded,
+        vlMode
+    });
+    const extractDataPromptText = await extractDataQueryPrompt(description, dataQuery);
+    const userContent = [];
+    if ((null == extractOption ? void 0 : extractOption.screenshotIncluded) !== false) userContent.push({
+        type: 'image_url',
+        image_url: {
+            url: screenshotBase64,
+            detail: 'high'
+        }
+    });
+    userContent.push({
+        type: 'text',
+        text: extractDataPromptText
+    });
+    const msgs = [
+        {
+            role: 'system',
+            content: systemPrompt
+        },
+        {
+            role: 'user',
+            content: userContent
+        }
+    ];
+    if (multimodalPrompt) {
+        const addOns = await promptsToChatParam({
+            images: multimodalPrompt.images,
+            convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64
+        });
+        msgs.push(...addOns);
+    }
+    const result = await callAIWithObjectResponse(msgs, AIActionType.EXTRACT_DATA, modelConfig);
+    return {
+        parseResult: result.content,
+        elementById,
+        usage: result.usage
+    };
+}
+export { AiExtractElementInfo, AiLocateElement, AiLocateSection };
+
+//# sourceMappingURL=inspect.mjs.map
package/dist/es/ai-model/inspect.mjs.map
@@ -0,0 +1 @@
+{"version":3,"file":"ai-model\\inspect.mjs","sources":["webpack://@rpascene/core/./src/ai-model/inspect.ts"],"sourcesContent":["…"],"names":["…"],"mappings":"…"}
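Usage note: the hunk above adds the three inspection entry points. The sketch below shows how AiLocateSection's return value can seed AiLocateElement's searchConfig (the sourcemap types searchConfig as Awaited<ReturnType<typeof AiLocateSection>>). This is a minimal sketch, not documented API: the import subpath and the construction of context and modelConfig are assumptions.

    // Sketch only — entry-point path and context/modelConfig shapes are assumed.
    import { AiLocateElement, AiLocateSection, callAIWithObjectResponse } from '@rpascene/core/ai-model'; // assumed subpath export

    async function locateSignInButton(context: any, modelConfig: any) {
      // First narrow the search to a section of the screenshot...
      const section = await AiLocateSection({
        context,
        sectionDescription: 'the login form',
        modelConfig,
      });
      // ...then locate the element inside the cropped section image.
      const located = await AiLocateElement({
        context,
        targetElementDescription: 'the "Sign in" button',
        callAIFn: callAIWithObjectResponse,
        searchConfig: section, // AiLocateSection's result is accepted as-is
        modelConfig,
      });
      console.log(located.rect, located.parseResult.elements, located.parseResult.errors);
    }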
package/dist/es/ai-model/llm-planning.mjs
@@ -0,0 +1,140 @@
+import { paddingToMatchBlockByBase64 } from "@rpascene/shared/img";
+import { getDebug } from "@rpascene/shared/logger";
+import { assert } from "@rpascene/shared/utils";
+import { AIActionType, buildYamlFlowFromPlans, fillBboxParam, findAllRpasceneLocatorField, markupImageForLLM, warnGPT4oSizeLimit } from "./common.mjs";
+import { systemPromptToTaskPlanning } from "./prompt/llm-planning.mjs";
+import { describeUserPage } from "./prompt/util.mjs";
+import { callAIWithObjectResponse } from "./service-caller/index.mjs";
+const debug = getDebug('planning');
+async function plan(userInstruction, opts) {
+    var _opts_conversationHistory, _planFromAI_action;
+    const { context, modelConfig, conversationHistory } = opts;
+    const { screenshotBase64, size } = context;
+    const { modelName, vlMode } = modelConfig;
+    const { description: pageDescription, elementById } = await describeUserPage(context, {
+        vlMode
+    });
+    const systemPrompt = await systemPromptToTaskPlanning({
+        actionSpace: opts.actionSpace,
+        vlMode: vlMode
+    });
+    let imagePayload = screenshotBase64;
+    let imageWidth = size.width;
+    let imageHeight = size.height;
+    const rightLimit = imageWidth;
+    const bottomLimit = imageHeight;
+    if ('qwen-vl' === vlMode) {
+        const paddedResult = await paddingToMatchBlockByBase64(imagePayload);
+        imageWidth = paddedResult.width;
+        imageHeight = paddedResult.height;
+        imagePayload = paddedResult.imageBase64;
+    } else if ('qwen3-vl' === vlMode) ;
+    else if (!vlMode) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, {
+        width: imageWidth,
+        height: imageHeight
+    });
+    warnGPT4oSizeLimit(size, modelName);
+    const historyLog = (null == (_opts_conversationHistory = opts.conversationHistory) ? void 0 : _opts_conversationHistory.snapshot()) || [];
+    const knowledgeContext = opts.actionContext ? [
+        {
+            role: 'user',
+            content: [
+                {
+                    type: 'text',
+                    text: `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>`
+                }
+            ]
+        }
+    ] : [];
+    const instruction = [
+        {
+            role: 'user',
+            content: [
+                {
+                    type: 'text',
+                    text: `<user_instruction>${userInstruction}</user_instruction>`
+                }
+            ]
+        }
+    ];
+    const msgs = [
+        {
+            role: 'system',
+            content: systemPrompt
+        },
+        ...knowledgeContext,
+        ...instruction,
+        ...historyLog,
+        {
+            role: 'user',
+            content: [
+                {
+                    type: 'image_url',
+                    image_url: {
+                        url: imagePayload,
+                        detail: 'high'
+                    }
+                },
+                ...vlMode ? [] : [
+                    {
+                        type: 'text',
+                        text: pageDescription
+                    }
+                ]
+            ]
+        }
+    ];
+    const { content, usage } = await callAIWithObjectResponse(msgs, AIActionType.PLAN, modelConfig);
+    const rawResponse = JSON.stringify(content, void 0, 2);
+    const planFromAI = content;
+    const actions = ((null == (_planFromAI_action = planFromAI.action) ? void 0 : _planFromAI_action.type) ? [
+        planFromAI.action
+    ] : planFromAI.actions) || [];
+    const returnValue = {
+        ...planFromAI,
+        actions,
+        rawResponse,
+        usage,
+        yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace, planFromAI.sleep)
+    };
+    assert(planFromAI, "can't get plans from AI");
+    actions.forEach((action)=>{
+        const type = action.type;
+        const actionInActionSpace = opts.actionSpace.find((action)=>action.name === type);
+        debug('actionInActionSpace matched', actionInActionSpace);
+        const locateFields = actionInActionSpace ? findAllRpasceneLocatorField(actionInActionSpace.paramSchema) : [];
+        debug('locateFields', locateFields);
+        locateFields.forEach((field)=>{
+            const locateResult = action.param[field];
+            if (locateResult) if (vlMode) action.param[field] = fillBboxParam(locateResult, imageWidth, imageHeight, rightLimit, bottomLimit, vlMode);
+            else {
+                const element = elementById(locateResult);
+                if (element) action.param[field].id = element.id;
+            }
+        });
+    });
+    assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
+    if (0 === actions.length && returnValue.more_actions_needed_by_instruction && !returnValue.sleep) console.warn('No actions planned for the prompt, but model said more actions are needed:', userInstruction);
+    null == conversationHistory || conversationHistory.append({
+        role: 'assistant',
+        content: [
+            {
+                type: 'text',
+                text: rawResponse
+            }
+        ]
+    });
+    null == conversationHistory || conversationHistory.append({
+        role: 'user',
+        content: [
+            {
+                type: 'text',
+                text: 'I have finished the action previously planned'
+            }
+        ]
+    });
+    return returnValue;
+}
+export { plan };
+
+//# sourceMappingURL=llm-planning.mjs.map
package/dist/es/ai-model/llm-planning.mjs.map
@@ -0,0 +1 @@
+{"version":3,"file":"ai-model\\llm-planning.mjs","sources":["webpack://@rpascene/core/./src/ai-model/llm-planning.ts"],"sourcesContent":["…"],"names":["…"],"mappings":"…"}
@@ -0,0 +1,31 @@
const assertSchema = {
    type: 'json_schema',
    json_schema: {
        name: 'assert',
        strict: true,
        schema: {
            type: 'object',
            properties: {
                pass: {
                    type: 'boolean',
                    description: 'Whether the assertion passed or failed'
                },
                thought: {
                    type: [
                        'string',
                        'null'
                    ],
                    description: 'The thought process behind the assertion'
                }
            },
            required: [
                'pass',
                'thought'
            ],
            additionalProperties: false
        }
    }
};
export { assertSchema };

//# sourceMappingURL=assertion.mjs.map
@@ -0,0 +1 @@
{"version":3,"file":"ai-model\\prompt\\assertion.mjs","sources":["webpack://@rpascene/core/./src/ai-model/prompt/assertion.ts"],"sourcesContent":["import type { ResponseFormatJSONSchema } from 'openai/resources/index';\n\nexport const assertSchema: ResponseFormatJSONSchema = {\n type: 'json_schema',\n json_schema: {\n name: 'assert',\n strict: true,\n schema: {\n type: 'object',\n properties: {\n pass: {\n type: 'boolean',\n description: 'Whether the assertion passed or failed',\n },\n thought: {\n type: ['string', 'null'],\n description: 'The thought process behind the assertion',\n },\n },\n required: ['pass', 'thought'],\n additionalProperties: false,\n },\n },\n};\n"],"names":["assertSchema"],"mappings":"AAEO,MAAMA,eAAyC;IACpD,MAAM;IACN,aAAa;QACX,MAAM;QACN,QAAQ;QACR,QAAQ;YACN,MAAM;YACN,YAAY;gBACV,MAAM;oBACJ,MAAM;oBACN,aAAa;gBACf;gBACA,SAAS;oBACP,MAAM;wBAAC;wBAAU;qBAAO;oBACxB,aAAa;gBACf;YACF;YACA,UAAU;gBAAC;gBAAQ;aAAU;YAC7B,sBAAsB;QACxB;IACF;AACF"}
@@ -0,0 +1,7 @@
function bboxDescription(vlMode) {
    if ('gemini' === vlMode) return 'box_2d bounding box for the target element, should be [ymin, xmin, ymax, xmax] normalized to 0-1000.';
    return '2d bounding box as [xmin, ymin, xmax, ymax]';
}
export { bboxDescription };

//# sourceMappingURL=common.mjs.map
@@ -0,0 +1 @@
{"version":3,"file":"ai-model\\prompt\\common.mjs","sources":["webpack://@rpascene/core/./src/ai-model/prompt/common.ts"],"sourcesContent":["import type { TVlModeTypes } from '@rpascene/shared/env';\nexport function bboxDescription(vlMode: TVlModeTypes | undefined) {\n if (vlMode === 'gemini') {\n return 'box_2d bounding box for the target element, should be [ymin, xmin, ymax, xmax] normalized to 0-1000.';\n }\n return '2d bounding box as [xmin, ymin, xmax, ymax]';\n}\n"],"names":["bboxDescription","vlMode"],"mappings":"AACO,SAASA,gBAAgBC,MAAgC;IAC9D,IAAIA,AAAW,aAAXA,QACF,OAAO;IAET,OAAO;AACT"}
@@ -0,0 +1,44 @@
import { getPreferredLanguage } from "@rpascene/shared/env";
const elementDescriberInstruction = ()=>`
Describe the element in the red rectangle for precise identification. Use ${getPreferredLanguage()}.

CRITICAL REQUIREMENTS:
1. UNIQUENESS: The description must uniquely identify this element on the current page
2. UNIVERSALITY: Use generic, reusable selectors that work across different contexts
3. PRECISION: Be specific enough to distinguish from similar elements

DESCRIPTION STRUCTURE:
1. Element type (button, input, link, div, etc.)
2. Primary identifier (in order of preference):
  - Unique text content: "with text 'Login'"
  - Unique attribute: "with aria-label 'Search'"
  - Unique class/ID: "with class 'primary-button'"
  - Unique position: "in header navigation"
3. Secondary identifiers (if needed for uniqueness):
  - Visual features: "blue background", "with icon"
  - Relative position: "below search bar", "in sidebar"
  - Parent context: "in login form", "in main menu"

GUIDELINES:
- Keep description under 25 words
- Prioritize semantic identifiers over visual ones
- Use consistent terminology across similar elements
- Avoid page-specific or temporary content
- Don't mention the red rectangle or selection box
- Focus on stable, reusable characteristics

EXAMPLES:
- "Login button with text 'Sign In'"
- "Search input with placeholder 'Enter keywords'"
- "Navigation link with text 'Home' in header"
- "Submit button in contact form"
- "Menu icon with aria-label 'Open menu'"

Return JSON:
{
  "description": "unique element identifier",
  "error"?: "error message if any"
}`;
export { elementDescriberInstruction };

//# sourceMappingURL=describe.mjs.map
@@ -0,0 +1 @@
{"version":3,"file":"ai-model\\prompt\\describe.mjs","sources":["webpack://@rpascene/core/./src/ai-model/prompt/describe.ts"],"sourcesContent":["import { getPreferredLanguage } from '@rpascene/shared/env';\n\nexport const elementDescriberInstruction = () => {\n return `\nDescribe the element in the red rectangle for precise identification. Use ${getPreferredLanguage()}.\n\nCRITICAL REQUIREMENTS:\n1. UNIQUENESS: The description must uniquely identify this element on the current page\n2. UNIVERSALITY: Use generic, reusable selectors that work across different contexts\n3. PRECISION: Be specific enough to distinguish from similar elements\n\nDESCRIPTION STRUCTURE:\n1. Element type (button, input, link, div, etc.)\n2. Primary identifier (in order of preference):\n - Unique text content: \"with text 'Login'\"\n - Unique attribute: \"with aria-label 'Search'\"\n - Unique class/ID: \"with class 'primary-button'\"\n - Unique position: \"in header navigation\"\n3. Secondary identifiers (if needed for uniqueness):\n - Visual features: \"blue background\", \"with icon\"\n - Relative position: \"below search bar\", \"in sidebar\"\n - Parent context: \"in login form\", \"in main menu\"\n\nGUIDELINES:\n- Keep description under 25 words\n- Prioritize semantic identifiers over visual ones\n- Use consistent terminology across similar elements\n- Avoid page-specific or temporary content\n- Don't mention the red rectangle or selection box\n- Focus on stable, reusable characteristics\n\nEXAMPLES:\n- \"Login button with text 'Sign In'\"\n- \"Search input with placeholder 'Enter keywords'\"\n- \"Navigation link with text 'Home' in header\"\n- \"Submit button in contact form\"\n- \"Menu icon with aria-label 'Open menu'\"\n\nReturn JSON:\n{\n \"description\": \"unique element identifier\",\n \"error\"?: \"error message if any\"\n}`;\n};\n"],"names":["elementDescriberInstruction","getPreferredLanguage"],"mappings":";AAEO,MAAMA,8BAA8B,IAClC,CAAC;0EACgE,EAAEC,uBAAuB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAsClG,CAAC"}