@godscene/core 1.7.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +9 -0
- package/dist/es/agent/agent.mjs +767 -0
- package/dist/es/agent/common.mjs +0 -0
- package/dist/es/agent/execution-session.mjs +39 -0
- package/dist/es/agent/index.mjs +6 -0
- package/dist/es/agent/task-builder.mjs +343 -0
- package/dist/es/agent/task-cache.mjs +212 -0
- package/dist/es/agent/tasks.mjs +428 -0
- package/dist/es/agent/ui-utils.mjs +101 -0
- package/dist/es/agent/utils.mjs +167 -0
- package/dist/es/ai-model/auto-glm/actions.mjs +237 -0
- package/dist/es/ai-model/auto-glm/index.mjs +6 -0
- package/dist/es/ai-model/auto-glm/parser.mjs +237 -0
- package/dist/es/ai-model/auto-glm/planning.mjs +69 -0
- package/dist/es/ai-model/auto-glm/prompt.mjs +220 -0
- package/dist/es/ai-model/auto-glm/util.mjs +7 -0
- package/dist/es/ai-model/connectivity.mjs +136 -0
- package/dist/es/ai-model/conversation-history.mjs +193 -0
- package/dist/es/ai-model/index.mjs +12 -0
- package/dist/es/ai-model/inspect.mjs +395 -0
- package/dist/es/ai-model/llm-planning.mjs +231 -0
- package/dist/es/ai-model/prompt/common.mjs +5 -0
- package/dist/es/ai-model/prompt/describe.mjs +64 -0
- package/dist/es/ai-model/prompt/extraction.mjs +129 -0
- package/dist/es/ai-model/prompt/llm-locator.mjs +49 -0
- package/dist/es/ai-model/prompt/llm-planning.mjs +584 -0
- package/dist/es/ai-model/prompt/llm-section-locator.mjs +42 -0
- package/dist/es/ai-model/prompt/order-sensitive-judge.mjs +33 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs +115 -0
- package/dist/es/ai-model/prompt/ui-tars-planning.mjs +34 -0
- package/dist/es/ai-model/prompt/util.mjs +57 -0
- package/dist/es/ai-model/prompt/yaml-generator.mjs +201 -0
- package/dist/es/ai-model/service-caller/codex-app-server.mjs +573 -0
- package/dist/es/ai-model/service-caller/image-detail.mjs +4 -0
- package/dist/es/ai-model/service-caller/index.mjs +648 -0
- package/dist/es/ai-model/service-caller/request-timeout.mjs +47 -0
- package/dist/es/ai-model/ui-tars-planning.mjs +247 -0
- package/dist/es/common.mjs +382 -0
- package/dist/es/device/device-options.mjs +0 -0
- package/dist/es/device/index.mjs +340 -0
- package/dist/es/dump/html-utils.mjs +290 -0
- package/dist/es/dump/index.mjs +3 -0
- package/dist/es/dump/screenshot-restoration.mjs +30 -0
- package/dist/es/dump/screenshot-store.mjs +125 -0
- package/dist/es/index.mjs +17 -0
- package/dist/es/report-cli.mjs +149 -0
- package/dist/es/report-generator.mjs +203 -0
- package/dist/es/report-markdown.mjs +216 -0
- package/dist/es/report.mjs +287 -0
- package/dist/es/screenshot-item.mjs +120 -0
- package/dist/es/service/index.mjs +272 -0
- package/dist/es/service/utils.mjs +13 -0
- package/dist/es/skill/index.mjs +35 -0
- package/dist/es/task-runner.mjs +261 -0
- package/dist/es/task-timing.mjs +10 -0
- package/dist/es/tree.mjs +11 -0
- package/dist/es/types.mjs +202 -0
- package/dist/es/utils.mjs +232 -0
- package/dist/es/yaml/builder.mjs +11 -0
- package/dist/es/yaml/index.mjs +4 -0
- package/dist/es/yaml/player.mjs +425 -0
- package/dist/es/yaml/utils.mjs +100 -0
- package/dist/es/yaml.mjs +0 -0
- package/dist/lib/agent/agent.js +815 -0
- package/dist/lib/agent/common.js +5 -0
- package/dist/lib/agent/execution-session.js +73 -0
- package/dist/lib/agent/index.js +76 -0
- package/dist/lib/agent/task-builder.js +380 -0
- package/dist/lib/agent/task-cache.js +264 -0
- package/dist/lib/agent/tasks.js +471 -0
- package/dist/lib/agent/ui-utils.js +153 -0
- package/dist/lib/agent/utils.js +238 -0
- package/dist/lib/ai-model/auto-glm/actions.js +271 -0
- package/dist/lib/ai-model/auto-glm/index.js +64 -0
- package/dist/lib/ai-model/auto-glm/parser.js +280 -0
- package/dist/lib/ai-model/auto-glm/planning.js +103 -0
- package/dist/lib/ai-model/auto-glm/prompt.js +257 -0
- package/dist/lib/ai-model/auto-glm/util.js +44 -0
- package/dist/lib/ai-model/connectivity.js +180 -0
- package/dist/lib/ai-model/conversation-history.js +227 -0
- package/dist/lib/ai-model/index.js +127 -0
- package/dist/lib/ai-model/inspect.js +441 -0
- package/dist/lib/ai-model/llm-planning.js +268 -0
- package/dist/lib/ai-model/prompt/common.js +39 -0
- package/dist/lib/ai-model/prompt/describe.js +98 -0
- package/dist/lib/ai-model/prompt/extraction.js +169 -0
- package/dist/lib/ai-model/prompt/llm-locator.js +86 -0
- package/dist/lib/ai-model/prompt/llm-planning.js +621 -0
- package/dist/lib/ai-model/prompt/llm-section-locator.js +79 -0
- package/dist/lib/ai-model/prompt/order-sensitive-judge.js +70 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js +176 -0
- package/dist/lib/ai-model/prompt/ui-tars-planning.js +71 -0
- package/dist/lib/ai-model/prompt/util.js +103 -0
- package/dist/lib/ai-model/prompt/yaml-generator.js +262 -0
- package/dist/lib/ai-model/service-caller/codex-app-server.js +622 -0
- package/dist/lib/ai-model/service-caller/image-detail.js +38 -0
- package/dist/lib/ai-model/service-caller/index.js +716 -0
- package/dist/lib/ai-model/service-caller/request-timeout.js +93 -0
- package/dist/lib/ai-model/ui-tars-planning.js +281 -0
- package/dist/lib/common.js +491 -0
- package/dist/lib/device/device-options.js +18 -0
- package/dist/lib/device/index.js +467 -0
- package/dist/lib/dump/html-utils.js +366 -0
- package/dist/lib/dump/index.js +58 -0
- package/dist/lib/dump/screenshot-restoration.js +64 -0
- package/dist/lib/dump/screenshot-store.js +165 -0
- package/dist/lib/index.js +184 -0
- package/dist/lib/report-cli.js +189 -0
- package/dist/lib/report-generator.js +244 -0
- package/dist/lib/report-markdown.js +253 -0
- package/dist/lib/report.js +333 -0
- package/dist/lib/screenshot-item.js +154 -0
- package/dist/lib/service/index.js +306 -0
- package/dist/lib/service/utils.js +47 -0
- package/dist/lib/skill/index.js +69 -0
- package/dist/lib/task-runner.js +298 -0
- package/dist/lib/task-timing.js +44 -0
- package/dist/lib/tree.js +51 -0
- package/dist/lib/types.js +298 -0
- package/dist/lib/utils.js +314 -0
- package/dist/lib/yaml/builder.js +55 -0
- package/dist/lib/yaml/index.js +79 -0
- package/dist/lib/yaml/player.js +459 -0
- package/dist/lib/yaml/utils.js +153 -0
- package/dist/lib/yaml.js +18 -0
- package/dist/types/agent/agent.d.ts +220 -0
- package/dist/types/agent/common.d.ts +0 -0
- package/dist/types/agent/execution-session.d.ts +36 -0
- package/dist/types/agent/index.d.ts +9 -0
- package/dist/types/agent/task-builder.d.ts +34 -0
- package/dist/types/agent/task-cache.d.ts +49 -0
- package/dist/types/agent/tasks.d.ts +70 -0
- package/dist/types/agent/ui-utils.d.ts +14 -0
- package/dist/types/agent/utils.d.ts +25 -0
- package/dist/types/ai-model/auto-glm/actions.d.ts +78 -0
- package/dist/types/ai-model/auto-glm/index.d.ts +6 -0
- package/dist/types/ai-model/auto-glm/parser.d.ts +18 -0
- package/dist/types/ai-model/auto-glm/planning.d.ts +12 -0
- package/dist/types/ai-model/auto-glm/prompt.d.ts +27 -0
- package/dist/types/ai-model/auto-glm/util.d.ts +13 -0
- package/dist/types/ai-model/connectivity.d.ts +20 -0
- package/dist/types/ai-model/conversation-history.d.ts +105 -0
- package/dist/types/ai-model/index.d.ts +16 -0
- package/dist/types/ai-model/inspect.d.ts +67 -0
- package/dist/types/ai-model/llm-planning.d.ts +19 -0
- package/dist/types/ai-model/prompt/common.d.ts +2 -0
- package/dist/types/ai-model/prompt/describe.d.ts +1 -0
- package/dist/types/ai-model/prompt/extraction.d.ts +7 -0
- package/dist/types/ai-model/prompt/llm-locator.d.ts +3 -0
- package/dist/types/ai-model/prompt/llm-planning.d.ts +10 -0
- package/dist/types/ai-model/prompt/llm-section-locator.d.ts +3 -0
- package/dist/types/ai-model/prompt/order-sensitive-judge.d.ts +2 -0
- package/dist/types/ai-model/prompt/playwright-generator.d.ts +26 -0
- package/dist/types/ai-model/prompt/ui-tars-planning.d.ts +2 -0
- package/dist/types/ai-model/prompt/util.d.ts +33 -0
- package/dist/types/ai-model/prompt/yaml-generator.d.ts +102 -0
- package/dist/types/ai-model/service-caller/codex-app-server.d.ts +42 -0
- package/dist/types/ai-model/service-caller/image-detail.d.ts +2 -0
- package/dist/types/ai-model/service-caller/index.d.ts +60 -0
- package/dist/types/ai-model/service-caller/request-timeout.d.ts +32 -0
- package/dist/types/ai-model/ui-tars-planning.d.ts +72 -0
- package/dist/types/common.d.ts +288 -0
- package/dist/types/device/device-options.d.ts +155 -0
- package/dist/types/device/index.d.ts +2565 -0
- package/dist/types/dump/html-utils.d.ts +75 -0
- package/dist/types/dump/index.d.ts +5 -0
- package/dist/types/dump/screenshot-restoration.d.ts +8 -0
- package/dist/types/dump/screenshot-store.d.ts +49 -0
- package/dist/types/index.d.ts +21 -0
- package/dist/types/report-cli.d.ts +36 -0
- package/dist/types/report-generator.d.ts +88 -0
- package/dist/types/report-markdown.d.ts +24 -0
- package/dist/types/report.d.ts +52 -0
- package/dist/types/screenshot-item.d.ts +67 -0
- package/dist/types/service/index.d.ts +24 -0
- package/dist/types/service/utils.d.ts +2 -0
- package/dist/types/skill/index.d.ts +25 -0
- package/dist/types/task-runner.d.ts +50 -0
- package/dist/types/task-timing.d.ts +8 -0
- package/dist/types/tree.d.ts +4 -0
- package/dist/types/types.d.ts +684 -0
- package/dist/types/utils.d.ts +45 -0
- package/dist/types/yaml/builder.d.ts +2 -0
- package/dist/types/yaml/index.d.ts +4 -0
- package/dist/types/yaml/player.d.ts +34 -0
- package/dist/types/yaml/utils.d.ts +9 -0
- package/dist/types/yaml.d.ts +215 -0
- package/package.json +130 -0
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/** Timeout applied to an AI call when the model config does not supply a numeric one (180000 ms = 3 minutes). */
const DEFAULT_AI_CALL_TIMEOUT_MS = 180000;
/** Value of the `code` property stamped on the error used to abort a request that ran out of time. */
const AI_CALL_HARD_TIMEOUT_CODE = 'AI_CALL_HARD_TIMEOUT';
/**
 * Resolve the hard timeout to use for one AI call from `modelConfig.timeout`.
 *
 * @param {{ timeout?: unknown }} modelConfig - Model configuration; only `timeout` is read.
 * @returns {number | null} Milliseconds to wait, or `null` when the timeout is disabled.
 *   - not a number, or `NaN`  -> `DEFAULT_AI_CALL_TIMEOUT_MS`
 *   - `<= 0` or `Infinity`    -> `null` (no timeout)
 *   - any other number        -> returned unchanged
 */
function resolveEffectiveTimeoutMs(modelConfig) {
    const { timeout } = modelConfig;
    // NaN is `typeof 'number'`; handing it to setTimeout would be treated as a ~1ms delay
    // and abort the request immediately, so fall back to the default instead.
    if (typeof timeout !== 'number' || Number.isNaN(timeout)) return DEFAULT_AI_CALL_TIMEOUT_MS;
    // Infinity overflows the timer delay range (and would also fire immediately);
    // it can only sensibly mean "never time out".
    if (timeout <= 0 || timeout === Number.POSITIVE_INFINITY) return null;
    return timeout;
}
|
|
9
|
+
/**
 * Report whether `err` is a hard-timeout error.
 *
 * An error qualifies when either it, or its direct `cause`, is an object whose
 * `code` equals `AI_CALL_HARD_TIMEOUT_CODE`. Only one level of `cause` is inspected.
 *
 * @param {unknown} err - Value caught from a failed call.
 * @returns {boolean} `true` for a hard timeout, `false` for anything else (including non-objects).
 */
function isHardTimeoutError(err) {
    if (err === null || typeof err !== 'object') {
        return false;
    }
    const carriesTimeoutCode = (candidate) =>
        Boolean(candidate) && typeof candidate === 'object' && candidate.code === AI_CALL_HARD_TIMEOUT_CODE;
    // Check the error itself first, then the error it wraps.
    return carriesTimeoutCode(err) || carriesTimeoutCode(err.cause);
}
|
|
17
|
+
/**
 * Build one AbortSignal that fires on whichever happens first: the hard timeout
 * elapsing or the caller's own signal aborting.
 *
 * @param {number | null | undefined} timeoutMs - Hard timeout in milliseconds.
 *   `null`, `undefined` or `NaN` disable the timer. Delays beyond the maximum timer
 *   delay (2147483647 ms) are clamped to that maximum.
 * @param {AbortSignal} [userSignal] - Optional caller-supplied signal; its `reason` is forwarded.
 * @returns {{ signal: AbortSignal, cleanup: () => void }} The combined signal, plus a
 *   `cleanup` function that clears the timer and detaches the listener. Call it once
 *   the request has settled.
 */
function buildRequestAbortSignal(timeoutMs, userSignal) {
    // Timer delays above this value (or NaN) are coerced to 1ms by the runtime,
    // which would turn a "very long" timeout into an immediate abort.
    const MAX_TIMER_DELAY_MS = 2147483647;
    const controller = new AbortController();
    if (userSignal?.aborted) {
        // Already cancelled: mirror the state, nothing to schedule or listen to.
        controller.abort(userSignal.reason);
        return {
            signal: controller.signal,
            cleanup: () => {}
        };
    }
    let timer;
    const delayMs = timeoutMs == null ? Number.NaN : Number(timeoutMs);
    if (!Number.isNaN(delayMs)) {
        timer = setTimeout(() => {
            const err = new Error(`AI call hard timeout after ${timeoutMs}ms (full request time exceeded)`);
            err.code = AI_CALL_HARD_TIMEOUT_CODE;
            controller.abort(err);
        }, Math.min(delayMs, MAX_TIMER_DELAY_MS));
        // Node timers expose unref(); use it so a pending timeout never keeps the process alive.
        if (typeof timer.unref === 'function') timer.unref();
    }
    const clearTimer = () => {
        if (timer !== undefined) {
            clearTimeout(timer);
            timer = undefined;
        }
    };
    const onUserAbort = userSignal
        ? () => {
            // The request is over as soon as the caller aborts; the timer has no further use.
            clearTimer();
            controller.abort(userSignal.reason);
        }
        : undefined;
    if (userSignal && onUserAbort) {
        userSignal.addEventListener('abort', onUserAbort, { once: true });
    }
    return {
        signal: controller.signal,
        cleanup: () => {
            clearTimer();
            if (userSignal && onUserAbort) userSignal.removeEventListener('abort', onUserAbort);
        }
    };
}
|
|
47
|
+
export { AI_CALL_HARD_TIMEOUT_CODE, DEFAULT_AI_CALL_TIMEOUT_MS, buildRequestAbortSignal, isHardTimeoutError, resolveEffectiveTimeoutMs };
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import { getDebug } from "@godscene/shared/logger";
|
|
2
|
+
import { transformHotkeyInput } from "@godscene/shared/us-keyboard-layout";
|
|
3
|
+
import { assert } from "@godscene/shared/utils";
|
|
4
|
+
import { actionParser } from "@ui-tars/action-parser";
|
|
5
|
+
import { getSummary, getUiTarsPlanningPrompt } from "./prompt/ui-tars-planning.mjs";
|
|
6
|
+
import { AIResponseParseError, callAIWithStringResponse } from "./service-caller/index.mjs";
|
|
7
|
+
const debug = getDebug('ui-tars-planning');
|
|
8
|
+
const warnLog = getDebug('ui-tars-planning', {
|
|
9
|
+
console: true
|
|
10
|
+
});
|
|
11
|
+
// Side length of the square box built around a single point.
const bboxSize = 10;
/**
 * Turn a point into a `[left, top, right, bottom]` box of side `bboxSize` centred on it.
 * The box is clamped to `0` on the low side and to `width` / `height` on the high side,
 * and every edge is rounded to an integer.
 *
 * @param {{ x: number, y: number }} point - Centre of the box.
 * @param {number} width - Upper bound for the right edge.
 * @param {number} height - Upper bound for the bottom edge.
 * @returns {number[]} `[left, top, right, bottom]`.
 */
const pointToBbox = (point, width, height) => {
    const half = bboxSize / 2;
    const left = Math.max(point.x - half, 0);
    const top = Math.max(point.y - half, 0);
    const right = Math.min(point.x + half, width);
    const bottom = Math.min(point.y + half, height);
    return [left, top, right, bottom].map((edge) => Math.round(edge));
};
|
|
18
|
+
/**
 * Run one planning round against a UI-TARS model and translate its reply into
 * planning actions.
 *
 * Steps, in order:
 *  1. append the current screenshot to `conversationHistory` as a user message;
 *  2. call the model with the planning prompt + instruction followed by the history snapshot;
 *  3. parse the reply with `actionParser` (after `<bbox>` tags are rewritten to points);
 *  4. map every parsed action onto a `{ type, param, thought }` plan entry;
 *  5. append the summarised reply to the history as the assistant message.
 *
 * @param {string} userInstruction - What the user wants done.
 * @param {object} options - Reads `conversationHistory`, `context` (`screenshot.base64`,
 *   `shotSize`), `modelConfig` (`uiTarsModelVersion`), optional `actionContext` and
 *   optional `abortSignal`.
 * @returns {Promise<{actions: object[], log: *, usage: *, rawResponse: string, shouldContinuePlanning: boolean}>}
 *   `shouldContinuePlanning` is `false` once the model emitted a `finished` action.
 * @throws {AIResponseParseError} When the reply cannot be parsed, or yields no supported action.
 */
async function uiTarsPlanning(userInstruction, options) {
    const { conversationHistory, context, modelConfig, actionContext } = options;
    const { uiTarsModelVersion } = modelConfig;
    const instruction = actionContext
        ? `<high_priority_knowledge>${actionContext}</high_priority_knowledge>\n<user_instruction>${userInstruction}</user_instruction>`
        : userInstruction;
    const systemPrompt = getUiTarsPlanningPrompt() + instruction;
    const screenshotBase64 = context.screenshot.base64;
    conversationHistory.append({
        role: 'user',
        content: [
            {
                type: 'image_url',
                image_url: { url: screenshotBase64 }
            }
        ]
    });
    const messages = [
        { role: 'user', content: systemPrompt },
        ...conversationHistory.snapshot()
    ];
    const res = await callAIWithStringResponse(messages, modelConfig, {
        abortSignal: options.abortSignal
    });

    const { shotSize } = context;
    let parsed;
    try {
        const prediction = convertBboxToCoordinates(res.content);
        ({ parsed } = actionParser({
            prediction,
            factor: [1000, 1000],
            screenContext: {
                width: shotSize.width,
                height: shotSize.height
            },
            modelVer: uiTarsModelVersion
        }));
    } catch (parseError) {
        const reason = parseError instanceof Error ? parseError.message : String(parseError);
        throw new AIResponseParseError(`Parse error: ${reason}`, JSON.stringify(res.content, void 0, 2), res.usage);
    }
    debug('ui-tars modelVer', uiTarsModelVersion, ', parsed', JSON.stringify(parsed));

    // Builds the `{ prompt, bbox }` locate descriptor for a box string returned by the parser.
    const locateAt = (box, thought) => {
        const [x, y] = getPoint(box, shotSize);
        return {
            prompt: thought,
            bbox: pointToBbox({ x, y }, shotSize.width, shotSize.height)
        };
    };

    const transformActions = [];
    const unhandledActions = [];
    let shouldContinue = true;
    for (const action of parsed) {
        const actionType = (action.action_type || '').toLowerCase();
        const thought = action.thought || '';
        switch (actionType) {
            case 'click':
                assert(action.action_inputs.start_box, 'start_box is required');
                // NOTE(review): Tap is the only entry emitted without `thought` — confirm whether intentional.
                transformActions.push({
                    type: 'Tap',
                    param: { locate: locateAt(action.action_inputs.start_box, thought) }
                });
                break;
            case 'left_double':
                assert(action.action_inputs.start_box, 'start_box is required');
                transformActions.push({
                    type: 'DoubleClick',
                    param: { locate: locateAt(action.action_inputs.start_box, thought) },
                    thought
                });
                break;
            case 'right_single':
                assert(action.action_inputs.start_box, 'start_box is required');
                transformActions.push({
                    type: 'RightClick',
                    param: { locate: locateAt(action.action_inputs.start_box, thought) },
                    thought
                });
                break;
            case 'drag': {
                assert(action.action_inputs.start_box, 'start_box is required');
                assert(action.action_inputs.end_box, 'end_box is required');
                const from = locateAt(action.action_inputs.start_box, thought);
                const to = locateAt(action.action_inputs.end_box, thought);
                transformActions.push({
                    type: 'DragAndDrop',
                    param: { from, to },
                    thought
                });
                break;
            }
            case 'type':
                transformActions.push({
                    type: 'Input',
                    param: { value: action.action_inputs.content },
                    thought
                });
                break;
            case 'scroll':
                transformActions.push({
                    type: 'Scroll',
                    param: { direction: action.action_inputs.direction },
                    thought
                });
                break;
            case 'finished':
                shouldContinue = false;
                transformActions.push({
                    type: 'Finished',
                    param: {},
                    thought
                });
                break;
            case 'hotkey':
                if (action.action_inputs.key) {
                    const keys = transformHotkeyInput(action.action_inputs.key);
                    transformActions.push({
                        type: 'KeyboardPress',
                        param: { keyName: keys.join('+') },
                        thought
                    });
                } else {
                    warnLog('No key found in action: hotkey. Will not perform action.');
                }
                break;
            case 'wait':
                // The model gives no duration; a fixed one-second sleep is used.
                transformActions.push({
                    type: 'Sleep',
                    param: { timeMs: 1000 },
                    thought
                });
                break;
            default:
                // An empty action type is ignored silently; anything else is recorded as unhandled.
                if (actionType) {
                    unhandledActions.push({ type: actionType, thought });
                    debug('Unhandled action type:', actionType, 'thought:', action.thought);
                }
        }
    }

    if (0 === transformActions.length) {
        const errorDetails = [];
        if (0 === parsed.length) {
            errorDetails.push('Action parser returned no actions');
            const thoughtWithoutAction = res.content.includes('Thought:') && !res.content.includes('Action:');
            errorDetails.push(thoughtWithoutAction
                ? 'Response contains "Thought:" but missing "Action:" line'
                : 'Response may be malformed or empty');
        }
        if (unhandledActions.length > 0) {
            const types = unhandledActions.map((entry) => entry.type).join(', ');
            errorDetails.push(`Unhandled action types: ${types}`);
        }
        const errorMessage = ['No actions found in UI-TARS response.', ...errorDetails].join('\n');
        throw new AIResponseParseError(errorMessage, JSON.stringify(res.content, void 0, 2), res.usage);
    }

    debug('transformActions', JSON.stringify(transformActions, null, 2));
    const log = getSummary(res.content);
    conversationHistory.append({
        role: 'assistant',
        content: log
    });
    return {
        actions: transformActions,
        log,
        usage: res.usage,
        rawResponse: JSON.stringify(res.content, void 0, 2),
        shouldContinuePlanning: shouldContinue
    };
}
|
|
226
|
+
/**
 * Normalise a raw model reply before it is handed to the action parser:
 * every `[EOS]` marker is removed, each `<bbox>x1 y1 x2 y2</bbox>` tag is replaced by
 * the `(x,y)` centre of that box (integer, rounded down), and the result is trimmed.
 *
 * @param {string} text - Raw model output.
 * @returns {string} The cleaned text.
 */
function convertBboxToCoordinates(text) {
    const midpoint = (a, b) => Math.floor((Number.parseInt(a, 10) + Number.parseInt(b, 10)) / 2);
    const toCentre = (_tag, x1, y1, x2, y2) => `(${midpoint(x1, x2)},${midpoint(y1, y2)})`;
    return text
        .replace(/\[EOS\]/g, '')
        .replace(/<bbox>(\d+)\s+(\d+)\s+(\d+)\s+(\d+)<\/bbox>/g, toCentre)
        .trim();
}
|
|
240
|
+
/**
 * Parse a JSON-encoded box string and scale its first two entries by the given size.
 *
 * @param {string} startBox - JSON array text; only elements 0 and 1 are used.
 *   Presumably these are 0–1 fractions of the screenshot — verify against the parser output.
 * @param {{ width: number, height: number }} size - Factors applied to x and y respectively.
 * @returns {number[]} `[x * size.width, y * size.height]`.
 * @throws {SyntaxError} When `startBox` is not valid JSON.
 */
function getPoint(startBox, size) {
    const [fractionX, fractionY] = JSON.parse(startBox);
    const scaledX = fractionX * size.width;
    const scaledY = fractionY * size.height;
    return [scaledX, scaledY];
}
|
|
247
|
+
export { uiTarsPlanning };
|
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
import { assert, isPlainObject } from "@godscene/shared/utils";
|
|
2
|
+
import { isUITars } from "./ai-model/auto-glm/util.mjs";
|
|
3
|
+
import { NodeType } from "@godscene/shared/constants";
|
|
4
|
+
import { treeToList } from "@godscene/shared/extractor";
|
|
5
|
+
import { compositeElementInfoImg } from "@godscene/shared/img";
|
|
6
|
+
import { getDebug } from "@godscene/shared/logger";
|
|
7
|
+
import { z } from "zod";
|
|
8
|
+
const defaultBboxSize = 20;
|
|
9
|
+
const debugInspectUtils = getDebug('ai:common');
|
|
10
|
+
/**
 * Build a `[x1, y1, x2, y2]` box of side `bboxSize` centred on `(x, y)`.
 * The low edges are clamped to `0` and the high edges to `1000`; no rounding is applied.
 *
 * @param {number} x - Centre x.
 * @param {number} y - Centre y.
 * @param {number} [bboxSize=defaultBboxSize] - Side length of the box.
 * @returns {number[]} `[x1, y1, x2, y2]`.
 */
function pointToBbox(x, y, bboxSize = defaultBboxSize) {
    const radius = bboxSize / 2;
    const lowEdge = (centre) => Math.max(centre - radius, 0);
    const highEdge = (centre) => Math.min(centre + radius, 1000);
    return [lowEdge(x), lowEdge(y), highEdge(x), highEdge(y)];
}
|
|
23
|
+
/**
 * Normalise the bounding box carried by a locate descriptor, in place.
 *
 * If the descriptor has `bbox_2d` but no `bbox`, the value is moved to `bbox`
 * (and `bbox_2d` is deleted). Any resulting `bbox` is then converted with `adaptBbox`.
 *
 * @param {object | null | undefined} locate - Locate descriptor; mutated and returned.
 *   A nullish value is returned untouched instead of throwing.
 * @param {number} width - Forwarded to `adaptBbox`.
 * @param {number} height - Forwarded to `adaptBbox`.
 * @param {string} [modelFamily] - Forwarded to `adaptBbox` to select the bbox format.
 * @returns {object | null | undefined} The same `locate` reference.
 */
function fillBboxParam(locate, width, height, modelFamily) {
    // The original guarded `locate?.bbox` but dereferenced `locate.bbox_2d` unguarded,
    // so a missing descriptor crashed on the first check; bail out consistently instead.
    if (!locate) return locate;
    if (locate.bbox_2d && !locate.bbox) {
        locate.bbox = locate.bbox_2d;
        delete locate.bbox_2d;
    }
    if (locate.bbox) locate.bbox = adaptBbox(locate.bbox, width, height, modelFamily);
    return locate;
}
|
|
31
|
+
/**
 * Convert a qwen2.5-vl bbox into integer `[left, top, right, bottom]`.
 * Values are only rounded, never rescaled. When the third or fourth entry is not a
 * number, that edge is derived from the matching start edge plus `defaultBboxSize`.
 *
 * @param {Array} bbox - At least two entries (x, y), optionally followed by right and bottom.
 * @returns {number[]} `[left, top, right, bottom]`, each rounded.
 * @throws {Error} When `bbox` has fewer than two entries.
 */
function adaptQwen2_5Bbox(bbox) {
    if (bbox.length < 2) {
        throw new Error(`invalid bbox data for qwen-vl mode: ${JSON.stringify(bbox)} `);
    }
    // Far edge at `index`, falling back to the start edge at `startIndex` plus the default size.
    const farEdge = (index, startIndex) => {
        const raw = typeof bbox[index] === 'number' ? bbox[index] : bbox[startIndex] + defaultBboxSize;
        return Math.round(raw);
    };
    return [
        Math.round(bbox[0]),
        Math.round(bbox[1]),
        farEdge(2, 0),
        farEdge(3, 1)
    ];
}
|
|
44
|
+
/**
 * Validate a gpt-5 bbox and return a fresh copy of it.
 * The four values are passed through untouched (no scaling, no rounding).
 *
 * @param {*} bbox - Expected to be an array of exactly four finite numbers.
 * @returns {number[]} A new `[a, b, c, d]` array holding the same four values.
 * @throws {Error} When `bbox` is not an array of four finite numbers.
 */
function adaptGpt5Bbox(bbox) {
    const isFiniteNumber = (value) => typeof value === 'number' && Number.isFinite(value);
    const wellFormed = Array.isArray(bbox) && bbox.length === 4 && bbox.every(isFiniteNumber);
    if (!wellFormed) {
        throw new Error(`invalid bbox data for gpt-5 mode: ${JSON.stringify(bbox)} `);
    }
    const first = bbox[0];
    const second = bbox[1];
    const third = bbox[2];
    const fourth = bbox[3];
    return [first, second, third, fourth];
}
|
|
57
|
+
/**
 * Convert a doubao-style bbox (coordinates on a 0–1000 grid) into pixel
 * `[left, top, right, bottom]` for a `width` x `height` image.
 *
 * Accepted shapes:
 *  - a string `"x1 y1 x2 y2"` (integers, single whitespace separators, outer whitespace ignored);
 *  - an array of strings, each `"x,y"`, `"x y"` or a single number, flattened into a number list;
 *  - an array of numbers.
 * The flattened list is then interpreted by length:
 *  - 4 or 5 values: `[x1, y1, x2, y2]`;
 *  - 2, 3, 6 or 7 values: a point `(x, y)`, expanded to a `defaultBboxSize` square clamped to the image;
 *  - 8 values: entries 0/1 and 4/5 are used as the two corners.
 *
 * @param {string | Array} bbox - Raw bbox from the model.
 * @param {number} width - Image width, must be > 0.
 * @param {number} height - Image height, must be > 0.
 * @returns {number[]} `[left, top, right, bottom]`.
 * @throws {Error} On non-positive dimensions or an unrecognised bbox shape.
 */
function adaptDoubaoBbox(bbox, width, height) {
    assert(width > 0 && height > 0, 'width and height must be greater than 0 in doubao mode');
    const toPxX = (value) => Math.round(value * width / 1000);
    const toPxY = (value) => Math.round(value * height / 1000);
    if ('string' == typeof bbox) {
        // Parse the same trimmed text that is validated. The original validated the trimmed
        // string but split the untrimmed one on ' ', so padded or tab-separated input passed
        // validation and then failed the length check.
        const match = /^(\d+)\s(\d+)\s(\d+)\s(\d+)$/.exec(bbox.trim());
        assert(match, `invalid bbox data string for doubao-vision mode: ${bbox}`);
        const [, x1, y1, x2, y2] = match;
        return [
            toPxX(Number(x1)),
            toPxY(Number(y1)),
            toPxX(Number(x2)),
            toPxY(Number(y2))
        ];
    }
    let bboxList = [];
    if (Array.isArray(bbox) && 'string' == typeof bbox[0]) {
        for (const item of bbox) {
            if ('string' == typeof item && item.includes(',')) {
                const [x, y] = item.split(',');
                bboxList.push(Number(x.trim()), Number(y.trim()));
            } else if ('string' == typeof item && item.includes(' ')) {
                const [x, y] = item.split(' ');
                bboxList.push(Number(x.trim()), Number(y.trim()));
            } else {
                bboxList.push(Number(item));
            }
        }
    } else {
        bboxList = bbox;
    }
    const count = bboxList.length;
    if (4 === count || 5 === count) {
        return [
            toPxX(bboxList[0]),
            toPxY(bboxList[1]),
            toPxX(bboxList[2]),
            toPxY(bboxList[3])
        ];
    }
    if (6 === count || 2 === count || 3 === count || 7 === count) {
        const half = defaultBboxSize / 2;
        const centerX = toPxX(bboxList[0]);
        const centerY = toPxY(bboxList[1]);
        return [
            Math.max(0, centerX - half),
            Math.max(0, centerY - half),
            Math.min(width, centerX + half),
            Math.min(height, centerY + half)
        ];
    }
    // Test the flattened list, not the raw input: four "x,y" strings flatten to eight
    // numbers, which the original `bbox.length` check could never match.
    if (8 === count) {
        return [
            toPxX(bboxList[0]),
            toPxY(bboxList[1]),
            toPxX(bboxList[4]),
            toPxY(bboxList[5])
        ];
    }
    const msg = `invalid bbox data for doubao-vision mode: ${JSON.stringify(bbox)} `;
    throw new Error(msg);
}
|
|
102
|
+
/**
 * Unwrap a bbox that arrived nested one level deep.
 *
 * @param {*} bbox - Raw bbox value.
 * @returns {*} `bbox[0]` when `bbox` is an array whose first element is itself an
 *   array; otherwise `bbox` unchanged.
 */
function normalizeBboxInput(bbox) {
    const isNested = Array.isArray(bbox) && Array.isArray(bbox[0]);
    return isNested ? bbox[0] : bbox;
}
|
|
108
|
+
/**
 * Convert a model-specific bbox into `[left, top, right, bottom]`, dispatching on the
 * model family after unwrapping a nested bbox with `normalizeBboxInput`.
 *
 *  - `doubao-vision`, `doubao-seed`, or any family accepted by `isUITars` -> `adaptDoubaoBbox`
 *  - `gemini`      -> `adaptGeminiBbox`
 *  - `qwen2.5-vl`  -> `adaptQwen2_5Bbox`
 *  - `gpt-5`       -> `adaptGpt5Bbox`
 *  - anything else -> `normalized01000`
 *
 * @param {*} bbox - Raw bbox from the model.
 * @param {number} width - Image width, passed to the adapters that rescale.
 * @param {number} height - Image height, passed to the adapters that rescale.
 * @param {string} [modelFamily] - Model family identifier.
 * @returns {number[]} `[left, top, right, bottom]`.
 */
function adaptBbox(bbox, width, height, modelFamily) {
    const flatBbox = normalizeBboxInput(bbox);
    if ('doubao-vision' === modelFamily || 'doubao-seed' === modelFamily || isUITars(modelFamily)) {
        return adaptDoubaoBbox(flatBbox, width, height);
    }
    switch (modelFamily) {
        case 'gemini':
            return adaptGeminiBbox(flatBbox, width, height);
        case 'qwen2.5-vl':
            return adaptQwen2_5Bbox(flatBbox);
        case 'gpt-5':
            return adaptGpt5Bbox(flatBbox);
        default:
            return normalized01000(flatBbox, width, height);
    }
}
|
|
119
|
+
/**
 * Rescale a bbox expressed on a 0–1000 grid to pixel coordinates.
 * Even-indexed entries (x values) scale by `width`, odd-indexed entries (y values)
 * by `height`; every result is rounded.
 *
 * @param {number[]} bbox - `[x1, y1, x2, y2]` on the 0–1000 grid.
 * @param {number} width - Image width.
 * @param {number} height - Image height.
 * @returns {number[]} `[left, top, right, bottom]`.
 */
function normalized01000(bbox, width, height) {
    return [0, 1, 2, 3].map((index) => {
        const extent = index % 2 === 0 ? width : height;
        return Math.round(bbox[index] * extent / 1000);
    });
}
|
|
127
|
+
/**
 * Rescale a gemini bbox to pixel `[left, top, right, bottom]`.
 * The input is read y-first — `[top, left, bottom, right]` on a 0–1000 grid — so the
 * axes are swapped while scaling; every result is rounded.
 *
 * @param {number[]} bbox - `[y1, x1, y2, x2]` on the 0–1000 grid.
 * @param {number} width - Image width.
 * @param {number} height - Image height.
 * @returns {number[]} `[left, top, right, bottom]`.
 */
function adaptGeminiBbox(bbox, width, height) {
    const toPixels = (value, extent) => Math.round(value * extent / 1000);
    return [
        toPixels(bbox[1], width),
        toPixels(bbox[0], height),
        toPixels(bbox[3], width),
        toPixels(bbox[2], height)
    ];
}
|
|
139
|
+
/**
 * Convert a model bbox into a `{ left, top, width, height }` rect.
 *
 * The bbox is adapted with `adaptBbox`, its left/top edges are clamped to `0`, its
 * right/bottom edges to `rightLimit` / `bottomLimit`, and width/height are computed
 * inclusively (`far - near + 1`). When `scale` is not `1`, all four values are divided
 * by it and rounded. The offsets are added last, to `left` and `top` only.
 *
 * @param {*} bbox - Raw bbox from the model.
 * @param {number} width - Image width used for adaptation.
 * @param {number} height - Image height used for adaptation.
 * @param {number} [offsetX=0] - Added to the final `left`.
 * @param {number} [offsetY=0] - Added to the final `top`.
 * @param {number} [rightLimit=width] - Upper bound for the right edge.
 * @param {number} [bottomLimit=height] - Upper bound for the bottom edge.
 * @param {string} [modelFamily] - Forwarded to `adaptBbox`.
 * @param {number} [scale=1] - Divisor applied to the rect before offsetting.
 * @returns {{left: number, top: number, width: number, height: number}} The resulting rect.
 */
function adaptBboxToRect(bbox, width, height, offsetX = 0, offsetY = 0, rightLimit = width, bottomLimit = height, modelFamily, scale = 1) {
    debugInspectUtils('adaptBboxToRect', bbox, width, height, 'offset', offsetX, offsetY, 'limit', rightLimit, bottomLimit, 'modelFamily', modelFamily, 'scale', scale);
    const [left, top, right, bottom] = adaptBbox(bbox, width, height, modelFamily);
    const clampedLeft = Math.max(0, left);
    const clampedTop = Math.max(0, top);
    const inclusiveWidth = Math.min(right, rightLimit) - clampedLeft + 1;
    const inclusiveHeight = Math.min(bottom, bottomLimit) - clampedTop + 1;
    // Leave values untouched at scale 1 so no rounding is introduced.
    const unscale = (value) => (1 !== scale ? Math.round(value / scale) : value);
    const rect = {
        left: unscale(clampedLeft) + offsetX,
        top: unscale(clampedTop) + offsetY,
        width: unscale(inclusiveWidth),
        height: unscale(inclusiveHeight)
    };
    debugInspectUtils('adaptBboxToRect, result=', rect);
    return rect;
}
|
|
161
|
+
/**
 * Compute the smallest rect that encloses every rect in the list.
 *
 * @param {Array<{left: number, top: number, width: number, height: number}>} rects - Rects to merge.
 * @returns {{left: number, top: number, width: number, height: number}} The bounding rect.
 *   For an empty list the extremes stay at their +/-Infinity starting values.
 */
function mergeRects(rects) {
    let minLeft = Number.POSITIVE_INFINITY;
    let minTop = Number.POSITIVE_INFINITY;
    let maxRight = Number.NEGATIVE_INFINITY;
    let maxBottom = Number.NEGATIVE_INFINITY;
    for (const rect of rects) {
        minLeft = Math.min(minLeft, rect.left);
        minTop = Math.min(minTop, rect.top);
        maxRight = Math.max(maxRight, rect.left + rect.width);
        maxBottom = Math.max(maxBottom, rect.top + rect.height);
    }
    return {
        left: minLeft,
        top: minTop,
        width: maxRight - minLeft,
        height: maxBottom - minTop
    };
}
|
|
173
|
+
/**
 * Grow a rect into a search area.
 *
 * First the rect is padded by 100 on every side, clipped to the screen. If the padded
 * area is still below 160000, it is scaled uniformly around its centre until it
 * reaches that area, then clipped again (left/top to `0`, width/height to what remains
 * of the screen).
 *
 * @param {{left: number, top: number, width: number, height: number}} rect - Rect to expand.
 * @param {{width: number, height: number}} screenSize - Screen bounds.
 * @returns {{left: number, top: number, width: number, height: number}} The expanded rect.
 */
function expandSearchArea(rect, screenSize) {
    const MIN_AREA = 160000;
    const PADDING = 100;
    const paddedLeft = Math.max(rect.left - PADDING, 0);
    const paddedTop = Math.max(rect.top - PADDING, 0);
    const paddedWidth = Math.min(rect.left - paddedLeft + rect.width + PADDING, screenSize.width - paddedLeft);
    const paddedHeight = Math.min(rect.top - paddedTop + rect.height + PADDING, screenSize.height - paddedTop);
    const paddedArea = paddedWidth * paddedHeight;
    if (paddedArea >= MIN_AREA) {
        return {
            left: paddedLeft,
            top: paddedTop,
            width: paddedWidth,
            height: paddedHeight
        };
    }
    // Too small: scale both sides by the same factor so the area reaches MIN_AREA,
    // keeping the centre where it was.
    const factor = Math.sqrt(MIN_AREA / paddedArea);
    const grownWidth = Math.round(paddedWidth * factor);
    const grownHeight = Math.round(paddedHeight * factor);
    const middleX = paddedLeft + paddedWidth / 2;
    const middleY = paddedTop + paddedHeight / 2;
    const left = Math.max(Math.round(middleX - grownWidth / 2), 0);
    const top = Math.max(Math.round(middleY - grownHeight / 2), 0);
    return {
        left,
        top,
        width: Math.min(grownWidth, screenSize.width - left),
        height: Math.min(grownHeight, screenSize.height - top)
    };
}
|
|
202
|
+
/**
 * Produce an annotated screenshot for the model: the element tree is flattened with
 * `treeToList`, text nodes are dropped, and the remaining elements are composited onto
 * the screenshot by `compositeElementInfoImg`.
 *
 * @param {string} screenshotBase64 - Screenshot passed through as `inputImgBase64`.
 * @param {object} tree - Element tree to flatten.
 * @param {object} size - Passed through to `compositeElementInfoImg`.
 * @returns {Promise<*>} Whatever `compositeElementInfoImg` resolves to.
 */
async function markupImageForLLM(screenshotBase64, tree, size) {
    const nonTextElements = treeToList(tree).filter(
        (element) => element.attributes.nodeType !== NodeType.TEXT
    );
    return await compositeElementInfoImg({
        inputImgBase64: screenshotBase64,
        elementsPositionInfo: nonTextElements,
        size
    });
}
|
|
215
|
+
/**
 * Convert a list of plans into YAML-flow items.
 *
 * Each plan is matched by `plan.type` against `action.name` in `actionSpace`.
 * Plans with no matching action are skipped with a console warning. The flow
 * key is the action's `interfaceAlias` when set, otherwise the plan type.
 *
 * Launch/Terminate (`uri`) and RunAdbShell (`command`) are emitted in a short
 * form `{ [key]: value }` when that field is the only dumped parameter and is
 * a string; everything else becomes `{ [key]: '', ...params }`.
 *
 * @param {Iterable<{type: string, param?: object}>} plans
 * @param {Array<{name: string, interfaceAlias?: string, paramSchema?: object}>} actionSpace
 * @returns {object[]} The flow items, in plan order.
 */
function buildYamlFlowFromPlans(plans, actionSpace) {
  // Which single parameter (if any) may be inlined as the flow item's value.
  const resolveShortcutField = (candidate) => {
    const isAction = (name, alias) =>
      candidate.name === name || candidate.interfaceAlias === alias;
    if (isAction('Launch', 'launch') || isAction('Terminate', 'terminate')) {
      return 'uri';
    }
    if (isAction('RunAdbShell', 'runAdbShell')) {
      return 'command';
    }
    return undefined;
  };

  const flow = [];
  for (const plan of plans) {
    const verb = plan.type;
    const matched = actionSpace.find((candidate) => candidate.name === verb);
    if (!matched) {
      console.warn(`Cannot convert action ${verb} to yaml flow. Will ignore it.`);
      continue;
    }

    const flowKey = matched.interfaceAlias || verb;
    let flowParam = {};
    if (matched.paramSchema) {
      flowParam = dumpActionParam(plan.param || {}, matched.paramSchema);
    }

    const shortcutField = resolveShortcutField(matched);
    const paramKeys = shortcutField ? Object.keys(flowParam) : [];
    const canInlineShortcut =
      shortcutField &&
      paramKeys.length === 1 &&
      paramKeys[0] === shortcutField &&
      typeof flowParam[shortcutField] === 'string';

    if (canInlineShortcut) {
      flow.push({ [flowKey]: flowParam[shortcutField] });
    } else {
      flow.push({ [flowKey]: '', ...flowParam });
    }
  }
  return flow;
}
|
|
239
|
+
// A point: numeric `left` and `top`.
const PointSchema = z.object({ left: z.number(), top: z.number() });

// A size: numeric `width` and `height`.
const SizeSchema = z.object({ width: z.number(), height: z.number() });

// A rectangle: intersection of point and size, plus an optional numeric `zoom`.
const RectSchema = PointSchema
  .and(SizeSchema)
  .and(z.object({ zoom: z.number().optional() }));

// Optional multimodal extras for a prompt: a list of named image URLs and a
// boolean `convertHttpImage2Base64` flag.
const TMultimodalPromptSchema = z.object({
  images: z.array(z.object({ name: z.string(), url: z.string() })).optional(),
  convertHttpImage2Base64: z.boolean().optional(),
});

// A user prompt is either a plain string or an object carrying a string
// `prompt` together with any of the (all-optional) multimodal fields.
const TUserPromptSchema = z.union([
  z.string(),
  z.object({ prompt: z.string() }).and(TMultimodalPromptSchema.partial()),
]);
|
|
263
|
+
// Shape key that `ifMidsceneLocatorField` looks for to recognise a locator field.
const locateFieldFlagName = 'midscene_location_field_flag';

// Input schema for a locator: a required `prompt` plus optional flags.
// `.passthrough()` keeps keys that are not listed here.
const MidsceneLocationInput = z
  .object({
    prompt: TUserPromptSchema,
    deepLocate: z.boolean().optional(),
    deepThink: z.boolean().optional().describe('@deprecated Use `deepLocate` instead.'),
    cacheable: z.boolean().optional(),
    xpath: z.union([z.string(), z.boolean()]).optional(),
  })
  .passthrough();

/**
 * Accessor for the locator input schema.
 * @returns The shared `MidsceneLocationInput` schema instance.
 */
const getMidsceneLocationSchema = () => {
  return MidsceneLocationInput;
};
|
|
275
|
+
/**
 * Decide whether a zod field describes a locator.
 *
 * One level of `ZodOptional` is unwrapped. The (unwrapped) field counts as a
 * locator when it is a `ZodObject` whose shape either contains the
 * `locateFieldFlagName` key or has a truthy `prompt` entry.
 *
 * @param {object} field - A zod schema (inspected via `_def.typeName`).
 * @returns {boolean} `true` for locator fields, `false` otherwise.
 */
const ifMidsceneLocatorField = (field) => {
  const isOptional = field._def?.typeName === 'ZodOptional';
  const unwrapped = isOptional ? field._def.innerType : field;

  if (unwrapped._def?.typeName !== 'ZodObject') {
    return false;
  }

  const shape = unwrapped._def.shape();
  if (locateFieldFlagName in shape) {
    return true;
  }
  return 'prompt' in shape && Boolean(shape.prompt);
};
|
|
285
|
+
/**
 * Render a prompt object as text, noting how many images accompany it.
 *
 * @param {{prompt: string, images?: Array}} promptObj
 * @returns {string} The prompt, suffixed with ` (with N image[s])` when
 *   `images` is a non-empty array; the bare prompt otherwise.
 */
const formatPromptWithImages = (promptObj) => {
  const { prompt, images } = promptObj;
  if (!Array.isArray(images) || images.length === 0) {
    return prompt;
  }
  const plural = images.length > 1 ? 's' : '';
  return prompt + ` (with ${images.length} image${plural})`;
};
|
|
293
|
+
/**
 * Turn a locator field into a display string.
 *
 * After the `ifMidsceneLocatorField` guard passes, the result is:
 *  - the field itself when it is a string;
 *  - `field.prompt` when that is a non-empty string;
 *  - `formatPromptWithImages(field.prompt)` when `field.prompt` is an object
 *    with a truthy `prompt`;
 *  - `String(field)` in every other case.
 *
 * NOTE(review): the guard inspects zod schema metadata (`_def.typeName`),
 * while the branches below treat `field` as a runtime value. A plain string
 * has no `_def`, so the string branch looks unreachable if `assert` throws on
 * a falsy condition - confirm the intended input type with callers.
 *
 * @param {*} field
 * @returns {string}
 */
const dumpMidsceneLocatorField = (field) => {
  assert(ifMidsceneLocatorField(field), 'field is not a midscene locator field');

  if (typeof field === 'string') {
    return field;
  }

  const nested = field && typeof field === 'object' ? field.prompt : undefined;
  if (nested) {
    if (typeof nested === 'string') {
      return nested;
    }
    if (typeof nested === 'object' && nested.prompt) {
      return formatPromptWithImages(nested);
    }
  }

  return String(field);
};
|
|
302
|
+
/**
 * List the keys of a zod object schema whose fields are locator fields.
 *
 * @param {object} [zodType] - Schema to inspect. Anything that is not a
 *   `ZodObject` with a truthy `shape` yields an empty list.
 * @param {boolean} [requiredOnly] - When truthy, fields whose outer type is
 *   `ZodOptional` are left out.
 * @returns {string[]} Matching keys, in `Object.keys(shape)` order.
 */
const findAllMidsceneLocatorField = (zodType, requiredOnly) => {
  if (!zodType) {
    return [];
  }
  const isObjectSchema = zodType._def?.typeName === 'ZodObject' && zodType.shape;
  if (!isObjectSchema) {
    return [];
  }

  const locatorKeys = [];
  for (const key of Object.keys(zodType.shape)) {
    const field = zodType.shape[key];
    if (!ifMidsceneLocatorField(field)) {
      continue;
    }
    if (requiredOnly && field._def?.typeName === 'ZodOptional') {
      continue;
    }
    locatorKeys.push(key);
  }
  return locatorKeys;
};
|
|
316
|
+
/**
 * Produce a copy of an action's parameters with locator fields flattened to
 * strings.
 *
 * For every locator key reported by `findAllMidsceneLocatorField(zodSchema)`:
 *  - an object value with a string `prompt` is replaced by that string;
 *  - an object value whose `prompt` is itself an object with a truthy
 *    `prompt` is replaced by `formatPromptWithImages(prompt)`;
 *  - anything else (including string values) is left as it is.
 *
 * @param {*} jsonObject - Parameters to dump; non-plain-objects yield `{}`.
 * @param {object} zodSchema - Schema used to discover locator fields.
 * @returns {object} A shallow copy of `jsonObject` with the replacements applied.
 */
const dumpActionParam = (jsonObject, zodSchema) => {
  if (!isPlainObject(jsonObject)) {
    return {};
  }

  const locatorKeys = findAllMidsceneLocatorField(zodSchema);
  const dumped = { ...jsonObject };

  for (const key of locatorKeys) {
    const value = dumped[key];
    // Only object values carrying a truthy `prompt` need rewriting.
    if (!value || typeof value !== 'object' || !value.prompt) {
      continue;
    }
    const inner = value.prompt;
    if (typeof inner === 'string') {
      dumped[key] = inner;
    } else if (typeof inner === 'object' && inner.prompt) {
      dumped[key] = formatPromptWithImages(inner);
    }
  }

  return dumped;
};
|
|
336
|
+
/**
 * Validate an action's raw parameters against its zod schema while leaving
 * locator fields untouched by validation.
 *
 * Locator fields are swapped for a `{ prompt: '_dummy_' }` placeholder before
 * `zodSchema.parse` runs, then the original values are written back onto the
 * parsed result. When `options.shrunkShotToLogicalRatio` is defined and not
 * `1`, locator values that carry both `center` and `rect` have those
 * coordinates divided by the ratio and rounded.
 *
 * @param {object} [rawParam] - Raw parameters; `null`/`undefined` become `{}`.
 * @param {object} [zodSchema] - Schema to validate with. Without it the
 *   function returns `undefined`.
 * @param {{shrunkShotToLogicalRatio?: number}} [options]
 * @returns {object|undefined} The parsed parameters with locator values restored.
 */
const parseActionParam = (rawParam, zodSchema, options) => {
  if (!zodSchema) {
    return;
  }

  const input = rawParam ?? {};
  const locatorKeys = findAllMidsceneLocatorField(zodSchema);
  if (locatorKeys.length === 0) {
    return zodSchema.parse(input);
  }

  // Remember the real locator values that were actually supplied.
  const suppliedLocators = {};
  for (const key of locatorKeys) {
    if (key in input) {
      suppliedLocators[key] = input[key];
    }
  }

  // Validate with placeholders standing in for the locator values.
  const sanitized = {};
  for (const key in input) {
    sanitized[key] = locatorKeys.includes(key) ? { prompt: '_dummy_' } : input[key];
  }
  const validated = zodSchema.parse(sanitized);

  const ratio = options?.shrunkShotToLogicalRatio;
  const shouldRescale = ratio !== undefined && ratio !== 1;
  const rescale = (length) => Math.round(length / ratio);

  for (const key in suppliedLocators) {
    const supplied = suppliedLocators[key];
    const needsRescale =
      shouldRescale &&
      supplied &&
      typeof supplied === 'object' &&
      supplied.center &&
      supplied.rect;

    validated[key] = needsRescale
      ? {
          ...supplied,
          center: [rescale(supplied.center[0]), rescale(supplied.center[1])],
          rect: {
            ...supplied.rect,
            left: rescale(supplied.rect.left),
            top: rescale(supplied.rect.top),
            width: rescale(supplied.rect.width),
            height: rescale(supplied.rect.height),
          },
        }
      : supplied;
  }

  return validated;
};
|
|
370
|
+
// Action name literal 'Finalize'; exported from this module for use elsewhere.
const finalizeActionName = 'Finalize';
|
|
371
|
+
/**
 * Format a point in time (local time zone) using a simple token pattern and
 * append the pattern itself in parentheses.
 *
 * Supported tokens: `YYYY` (year), `MM` (month, 01-12), `DD` (day of month),
 * `HH` (hours, 00-23), `mm` (minutes), `ss` (seconds). Every occurrence of a
 * token is substituted, so formats that repeat a token (for example
 * `YYYY-MM-DD_YYYYMMDD`) are rendered completely.
 *
 * @param {string} [format='YYYY-MM-DD HH:mm:ss'] - Pattern to fill in.
 * @param {number|string|Date} [timestamp] - Value passed to `new Date(...)`;
 *   the current time is used when it is `undefined`.
 * @returns {string} `"<formatted time> (<format>)"`.
 */
const getReadableTimeString = (format = 'YYYY-MM-DD HH:mm:ss', timestamp) => {
  const now = timestamp !== undefined ? new Date(timestamp) : new Date();
  const pad2 = (value) => String(value).padStart(2, '0');

  const tokenValues = {
    YYYY: String(now.getFullYear()),
    MM: pad2(now.getMonth() + 1), // getMonth() is zero-based
    DD: pad2(now.getDate()),
    HH: pad2(now.getHours()),
    mm: pad2(now.getMinutes()),
    ss: pad2(now.getSeconds()),
  };

  // A single global pass replaces every token occurrence; a string pattern
  // passed to String.prototype.replace would only replace the first one.
  const timeString = format.replace(
    /YYYY|MM|DD|HH|mm|ss/g,
    (token) => tokenValues[token],
  );

  return `${timeString} (${format})`;
};
|
|
382
|
+
export { PointSchema, RectSchema, SizeSchema, TMultimodalPromptSchema, TUserPromptSchema, adaptBbox, adaptBboxToRect, adaptDoubaoBbox, adaptGeminiBbox, adaptGpt5Bbox, adaptQwen2_5Bbox, buildYamlFlowFromPlans, dumpActionParam, dumpMidsceneLocatorField, expandSearchArea, fillBboxParam, finalizeActionName, findAllMidsceneLocatorField, getMidsceneLocationSchema, getReadableTimeString, ifMidsceneLocatorField, markupImageForLLM, mergeRects, normalized01000, parseActionParam, pointToBbox };
|
|
File without changes
|