@donggui/core 1.5.4-donggui.5 → 1.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +1 -1
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/cache-adapter.mjs +0 -0
- package/dist/es/agent/task-builder.mjs +2 -2
- package/dist/es/agent/task-builder.mjs.map +1 -1
- package/dist/es/agent/task-cache.mjs +93 -44
- package/dist/es/agent/task-cache.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs +357 -153
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/codex-app-server.mjs +584 -0
- package/dist/es/ai-model/service-caller/codex-app-server.mjs.map +1 -0
- package/dist/es/ai-model/service-caller/index.mjs +2 -0
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/device/index.mjs +1 -1
- package/dist/es/device/index.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/lib/agent/agent.js +1 -1
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/cache-adapter.js +20 -0
- package/dist/lib/agent/cache-adapter.js.map +1 -0
- package/dist/lib/agent/task-builder.js +2 -2
- package/dist/lib/agent/task-builder.js.map +1 -1
- package/dist/lib/agent/task-cache.js +93 -44
- package/dist/lib/agent/task-cache.js.map +1 -1
- package/dist/lib/agent/utils.js +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js +357 -153
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/service-caller/codex-app-server.js +633 -0
- package/dist/lib/ai-model/service-caller/codex-app-server.js.map +1 -0
- package/dist/lib/ai-model/service-caller/index.js +2 -0
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/device/index.js +1 -1
- package/dist/lib/device/index.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/types/agent/cache-adapter.d.ts +32 -0
- package/dist/types/agent/index.d.ts +2 -0
- package/dist/types/agent/task-cache.d.ts +16 -7
- package/dist/types/ai-model/service-caller/codex-app-server.d.ts +46 -0
- package/package.json +3 -3
|
@@ -140,92 +140,6 @@ const descriptionForAction = (action, locatorSchemaTypeDescription, includeBbox
|
|
|
140
140
|
${tab}${fields.join(`\n${tab}`)}
|
|
141
141
|
`.trim();
|
|
142
142
|
};
|
|
143
|
-
const EXPLICIT_INSTRUCTION_RULES = `
|
|
144
|
-
**Explicit instructions vs. High-level goals:**
|
|
145
|
-
- **Explicit steps** (e.g., "click X", "type Y", "fill form"): Execute EXACTLY those steps - nothing more
|
|
146
|
-
- **High-level goals** (e.g., "log in", "complete purchase"): Determine necessary steps
|
|
147
|
-
|
|
148
|
-
**Examples:**
|
|
149
|
-
- "fill out the form" → Fill fields only, do NOT submit
|
|
150
|
-
- "click the button" → Click only, do NOT wait for page load
|
|
151
|
-
- "type 'hello'" → Type only, do NOT press Enter
|
|
152
|
-
- "select the first item" → Select only, do NOT proceed to checkout`;
|
|
153
|
-
const ASSERTION_RULES = `
|
|
154
|
-
**Assertion instructions:**
|
|
155
|
-
- If assertion condition is NOT satisfied and cannot be satisfied, mark as failed (success="false")
|
|
156
|
-
- If page is loading (spinner, skeleton, progress bar), wait before asserting`;
|
|
157
|
-
const PAGE_NAVIGATION_RULES = `
|
|
158
|
-
**Page navigation restriction:**
|
|
159
|
-
- Complete task on current page unless explicitly instructed to navigate
|
|
160
|
-
- Do NOT click links, use browser navigation, or open new URLs
|
|
161
|
-
- If task cannot be accomplished, report failure instead of navigating`;
|
|
162
|
-
const generateSubGoalSection = ()=>`
|
|
163
|
-
* <update-plan-content> tag
|
|
164
|
-
|
|
165
|
-
<update-plan-content>
|
|
166
|
-
<sub-goal index="1" status="finished|pending">description</sub-goal>
|
|
167
|
-
<sub-goal index="2" status="finished|pending">description</sub-goal>
|
|
168
|
-
</update-plan-content>
|
|
169
|
-
|
|
170
|
-
* <mark-sub-goal-done> tag
|
|
171
|
-
|
|
172
|
-
<mark-sub-goal-done>
|
|
173
|
-
<sub-goal index="1" status="finished" />
|
|
174
|
-
</mark-sub-goal-done>
|
|
175
|
-
|
|
176
|
-
IMPORTANT: Mark sub-goal as "finished" ONLY AFTER confirming completion in screenshot.`;
|
|
177
|
-
const generateSimplifiedExample = (includeSubGoals, locateNameField, locateEmailField)=>{
|
|
178
|
-
if (includeSubGoals) return `
|
|
179
|
-
## Example: Fill registration form
|
|
180
|
-
|
|
181
|
-
**User:** "fill out the form with name 'John' and email 'xxxx(真实的email地址)', then return the email"
|
|
182
|
-
|
|
183
|
-
**Response:**
|
|
184
|
-
<thought>Form has Name and Email fields. Both empty. Break into sub-goals.</thought>
|
|
185
|
-
<update-plan-content>
|
|
186
|
-
<sub-goal index="1" status="pending">Fill Name with 'John'</sub-goal>
|
|
187
|
-
<sub-goal index="2" status="pending">Fill Email with 'xxxx(真实的email地址)'</sub-goal>
|
|
188
|
-
<sub-goal index="3" status="pending">Return email address</sub-goal>
|
|
189
|
-
</update-plan-content>
|
|
190
|
-
<log>Click Name field</log>
|
|
191
|
-
<action-type>Tap</action-type>
|
|
192
|
-
<action-param-json>{"locate": ${locateNameField}}</action-param-json>
|
|
193
|
-
|
|
194
|
-
(After typing name)
|
|
195
|
-
<thought>Name='John'. Sub-goal 1 complete. Fill Email next.</thought>
|
|
196
|
-
<mark-sub-goal-done><sub-goal index="1" status="finished" /></mark-sub-goal-done>
|
|
197
|
-
<log>Click Email field</log>
|
|
198
|
-
<action-type>Tap</action-type>
|
|
199
|
-
<action-param-json>{"locate": ${locateEmailField}}</action-param-json>
|
|
200
|
-
|
|
201
|
-
(After typing email)
|
|
202
|
-
<thought>Both fields filled. All sub-goals complete.</thought>
|
|
203
|
-
<mark-sub-goal-done>
|
|
204
|
-
<sub-goal index="2" status="finished" />
|
|
205
|
-
<sub-goal index="3" status="finished" />
|
|
206
|
-
</mark-sub-goal-done>
|
|
207
|
-
<complete success="true">xxxx(真实的email地址)</complete>`;
|
|
208
|
-
return `
|
|
209
|
-
## Example: Fill registration form
|
|
210
|
-
|
|
211
|
-
**User:** "fill out the form with name 'John' and email 'xxxx(真实的email地址)', then return the email"
|
|
212
|
-
|
|
213
|
-
**Response:**
|
|
214
|
-
<thought>Form has Name and Email fields. Both empty. Start with Name.</thought>
|
|
215
|
-
<log>Click Name field</log>
|
|
216
|
-
<action-type>Tap</action-type>
|
|
217
|
-
<action-param-json>{"locate": ${locateNameField}}</action-param-json>
|
|
218
|
-
|
|
219
|
-
(After typing name)
|
|
220
|
-
<thought>Name='John'. Fill Email next.</thought>
|
|
221
|
-
<log>Click Email field</log>
|
|
222
|
-
<action-type>Tap</action-type>
|
|
223
|
-
<action-param-json>{"locate": ${locateEmailField}}</action-param-json>
|
|
224
|
-
|
|
225
|
-
(After typing email)
|
|
226
|
-
<thought>Both fields filled. Task complete.</thought>
|
|
227
|
-
<complete success="true">xxxx(真实的email地址)</complete>`;
|
|
228
|
-
};
|
|
229
143
|
async function systemPromptToTaskPlanning({ actionSpace, modelFamily, includeBbox, includeThought, includeSubGoals }) {
|
|
230
144
|
const preferredLanguage = (0, env_namespaceObject.getPreferredLanguage)();
|
|
231
145
|
if (includeBbox && !modelFamily) throw new Error('modelFamily cannot be undefined when includeBbox is true. A valid modelFamily is required for bbox-based location.');
|
|
@@ -250,27 +164,79 @@ async function systemPromptToTaskPlanning({ actionSpace, modelFamily, includeBbo
|
|
|
250
164
|
}` : `{
|
|
251
165
|
"prompt": "Email input field in the registration form"
|
|
252
166
|
}`;
|
|
253
|
-
const step1Title = shouldIncludeSubGoals ? '## Step 1: Observe and Plan' : '## Step 1: Observe';
|
|
254
|
-
const step1Description = shouldIncludeSubGoals ? "
|
|
255
|
-
const
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
${
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
167
|
+
const step1Title = shouldIncludeSubGoals ? '## Step 1: Observe and Plan (related tags: <thought>, <update-plan-content>, <mark-sub-goal-done>)' : '## Step 1: Observe (related tags: <thought>)';
|
|
168
|
+
const step1Description = shouldIncludeSubGoals ? "First, observe the current screenshot and previous logs, then break down the user's instruction into multiple high-level sub-goals. Update the status of sub-goals based on what you see in the current screenshot." : 'First, observe the current screenshot and previous logs to understand the current state.';
|
|
169
|
+
const explicitInstructionRule = 'CRITICAL - Following Explicit Instructions: When the user gives you specific operation steps (not high-level goals), you MUST execute ONLY those exact steps - nothing more, nothing less. Do NOT add extra actions even if they seem logical. For example: "fill out the form" means only fill fields, do NOT submit; "click the button" means only click, do NOT wait for page load or verify results; "type \'hello\'" means only type, do NOT press Enter.';
|
|
170
|
+
const thoughtTagDescription = shouldIncludeSubGoals ? `REQUIRED: You MUST always output the <thought> tag. Never skip it.
|
|
171
|
+
|
|
172
|
+
Include your thought process in the <thought> tag. It should answer: What is the user's requirement? What is the current state based on the screenshot? Are all sub-goals completed? If not, what should be the next action? Write your thoughts naturally without numbering or section headers.
|
|
173
|
+
|
|
174
|
+
${explicitInstructionRule}` : `REQUIRED: You MUST always output the <thought> tag. Never skip it.
|
|
175
|
+
|
|
176
|
+
Include your thought process in the <thought> tag. It should answer: What is the current state based on the screenshot? What should be the next action? Write your thoughts naturally without numbering or section headers.
|
|
177
|
+
|
|
178
|
+
${explicitInstructionRule}`;
|
|
179
|
+
const subGoalTags = shouldIncludeSubGoals ? `
|
|
180
|
+
|
|
181
|
+
* <update-plan-content> tag
|
|
182
|
+
|
|
183
|
+
Use this structure to give or update your plan:
|
|
184
|
+
|
|
185
|
+
<update-plan-content>
|
|
186
|
+
<sub-goal index="1" status="finished|pending">sub goal description</sub-goal>
|
|
187
|
+
<sub-goal index="2" status="finished|pending">sub goal description</sub-goal>
|
|
188
|
+
...
|
|
189
|
+
</update-plan-content>
|
|
190
|
+
|
|
191
|
+
* <mark-sub-goal-done> tag
|
|
192
|
+
|
|
193
|
+
Use this structure to mark a sub-goal as done:
|
|
194
|
+
|
|
195
|
+
<mark-sub-goal-done>
|
|
196
|
+
<sub-goal index="1" status="finished" />
|
|
197
|
+
</mark-sub-goal-done>
|
|
198
|
+
|
|
199
|
+
IMPORTANT: You MUST only mark a sub-goal as "finished" AFTER you have confirmed the task is actually completed by observing the result in the screenshot. Do NOT mark a sub-goal as done just because you expect the next action will complete it. Wait until you see visual confirmation in the screenshot that the sub-goal has been achieved.
|
|
200
|
+
|
|
201
|
+
* Note
|
|
202
|
+
|
|
203
|
+
During execution, you can call <update-plan-content> at any time to update the plan based on the latest screenshot and completed sub-goals.
|
|
204
|
+
|
|
205
|
+
### Example
|
|
206
|
+
|
|
207
|
+
If the user wants to "log in to a system using username and password, complete all to-do items, and submit a registration form", you can break it down into the following sub-goals:
|
|
208
|
+
|
|
209
|
+
<thought>...</thought>
|
|
210
|
+
<update-plan-content>
|
|
211
|
+
<sub-goal index="1" status="pending">Log in to the system</sub-goal>
|
|
212
|
+
<sub-goal index="2" status="pending">Complete all to-do items</sub-goal>
|
|
213
|
+
<sub-goal index="3" status="pending">Submit the registration form</sub-goal>
|
|
214
|
+
</update-plan-content>
|
|
215
|
+
|
|
216
|
+
After logging in and seeing the to-do items, you can mark the sub-goal as done:
|
|
217
|
+
|
|
218
|
+
<mark-sub-goal-done>
|
|
219
|
+
<sub-goal index="1" status="finished" />
|
|
220
|
+
</mark-sub-goal-done>
|
|
221
|
+
|
|
222
|
+
At this point, the status of all sub-goals is:
|
|
223
|
+
|
|
224
|
+
<update-plan-content>
|
|
225
|
+
<sub-goal index="1" status="finished" />
|
|
226
|
+
<sub-goal index="2" status="pending" />
|
|
227
|
+
<sub-goal index="3" status="pending" />
|
|
228
|
+
</update-plan-content>
|
|
229
|
+
|
|
230
|
+
After some time, when the last sub-goal is also completed, you can mark it as done as well:
|
|
231
|
+
|
|
232
|
+
<mark-sub-goal-done>
|
|
233
|
+
<sub-goal index="3" status="finished" />
|
|
234
|
+
</mark-sub-goal-done>` : '';
|
|
265
235
|
const memoryStepNumber = 2;
|
|
266
236
|
const checkGoalStepNumber = shouldIncludeSubGoals ? 3 : 2;
|
|
267
237
|
const actionStepNumber = shouldIncludeSubGoals ? 4 : 3;
|
|
268
|
-
const goalAccomplished = shouldIncludeSubGoals ? 'goal is accomplished' : 'instruction is fulfilled';
|
|
269
238
|
return `
|
|
270
|
-
|
|
271
|
-
You are an expert UI automation agent. Accomplish user instructions by observing screenshots and executing actions.
|
|
272
|
-
|
|
273
|
-
# Steps
|
|
239
|
+
Target: You are an expert to manipulate the UI to accomplish the user's instruction. User will give you an instruction, some screenshots, background knowledge and previous logs indicating what have been done. Your task is to accomplish the instruction by thinking through the path to complete the task and give the next action to execute.
|
|
274
240
|
|
|
275
241
|
${step1Title}
|
|
276
242
|
|
|
@@ -279,75 +245,88 @@ ${step1Description}
|
|
|
279
245
|
* <thought> tag (REQUIRED)
|
|
280
246
|
|
|
281
247
|
${thoughtTagDescription}
|
|
282
|
-
${
|
|
248
|
+
${subGoalTags}
|
|
283
249
|
${shouldIncludeSubGoals ? `
|
|
284
|
-
## Step ${memoryStepNumber}: Memory Data
|
|
250
|
+
## Step ${memoryStepNumber}: Memory Data from Current Screenshot (related tags: <memory>)
|
|
285
251
|
|
|
286
|
-
|
|
252
|
+
While observing the current screenshot, if you notice any information that might be needed in follow-up actions, record it here. The current screenshot will NOT be available in subsequent steps, so this memory is your only way to preserve essential information. Examples: extracted data, element states, content that needs to be referenced.
|
|
287
253
|
|
|
288
|
-
|
|
254
|
+
Don't use this tag if no information needs to be preserved.
|
|
289
255
|
` : ''}
|
|
290
|
-
## Step ${checkGoalStepNumber}:
|
|
291
|
-
|
|
292
|
-
${shouldIncludeSubGoals ? 'Based on screenshot and sub-goals status, determine' : 'Determine'} if task is completed.
|
|
256
|
+
## Step ${checkGoalStepNumber}: ${shouldIncludeSubGoals ? 'Check if Goal is Accomplished' : 'Check if the Instruction is Fulfilled'} (related tags: <complete>)
|
|
293
257
|
|
|
294
|
-
|
|
258
|
+
${shouldIncludeSubGoals ? 'Based on the current screenshot and the status of all sub-goals, determine' : 'Determine'} if the entire task is completed.
|
|
295
259
|
|
|
296
|
-
The
|
|
260
|
+
### CRITICAL: The User's Instruction is the Supreme Authority
|
|
297
261
|
|
|
298
|
-
|
|
262
|
+
The user's instruction defines the EXACT scope of what you must accomplish. You MUST follow it precisely - nothing more, nothing less. Violating this rule may cause severe consequences such as data loss, unintended operations, or system failures.
|
|
299
263
|
|
|
300
|
-
**
|
|
301
|
-
-
|
|
302
|
-
-
|
|
303
|
-
|
|
304
|
-
${
|
|
264
|
+
**Explicit instructions vs. High-level goals:**
|
|
265
|
+
- If the user gives you **explicit operation steps** (e.g., "click X", "type Y", "fill out the form"), treat them as exact commands. Execute ONLY those steps, nothing more.
|
|
266
|
+
- If the user gives you a **high-level goal** (e.g., "log in to the system", "complete the purchase"), you may determine the necessary steps to achieve it.
|
|
267
|
+
|
|
268
|
+
**What "${shouldIncludeSubGoals ? 'goal accomplished' : 'instruction fulfilled'}" means:**
|
|
269
|
+
- The ${shouldIncludeSubGoals ? 'goal is accomplished' : 'instruction is fulfilled'} when you have done EXACTLY what the user asked - no extra steps, no assumptions.
|
|
270
|
+
- Do NOT perform any action beyond the explicit instruction, even if it seems logical or helpful.
|
|
271
|
+
|
|
272
|
+
**Examples - Explicit instructions (execute exactly, no extra steps):**
|
|
273
|
+
- "fill out the form" → ${shouldIncludeSubGoals ? 'Goal accomplished' : 'Instruction fulfilled'} when all fields are filled. Do NOT submit the form.
|
|
274
|
+
- "click the login button" → ${shouldIncludeSubGoals ? 'Goal accomplished' : 'Instruction fulfilled'} once clicked. Do NOT wait for page load or verify login success.
|
|
275
|
+
- "type 'hello' in the search box" → ${shouldIncludeSubGoals ? 'Goal accomplished' : 'Instruction fulfilled'} when 'hello' is typed. Do NOT press Enter or trigger search.
|
|
276
|
+
- "select the first item" → ${shouldIncludeSubGoals ? 'Goal accomplished' : 'Instruction fulfilled'} when selected. Do NOT proceed to checkout.
|
|
277
|
+
|
|
278
|
+
**Special case - Assertion instructions:**
|
|
279
|
+
- If the user's instruction includes an assertion (e.g., "verify that...", "check that...", "assert..."), and you observe from the screenshot that the assertion condition is NOT satisfied and cannot be satisfied, mark ${shouldIncludeSubGoals ? 'the goal' : 'it'} as failed (success="false").
|
|
280
|
+
- If the page is still loading (e.g., you see a loading spinner, skeleton screen, or progress bar), do NOT assert yet. Wait for the page to finish loading before evaluating the assertion.
|
|
305
281
|
${!shouldIncludeSubGoals ? `
|
|
306
|
-
|
|
282
|
+
**Page navigation restriction:**
|
|
283
|
+
- Unless the user's instruction explicitly asks you to click a link, jump to another page, or navigate to a URL, you MUST complete the task on the current page only.
|
|
284
|
+
- Do NOT navigate away from the current page on your own initiative (e.g., do not click links that lead to other pages, do not use browser back/forward, do not open new URLs).
|
|
285
|
+
- If the task cannot be accomplished on the current page and the user has not instructed you to navigate, report it as a failure (success="false") instead of attempting to navigate to other pages.
|
|
307
286
|
` : ''}
|
|
308
287
|
### Output Rules
|
|
309
288
|
|
|
310
|
-
- If NOT complete, continue to Step ${actionStepNumber}
|
|
311
|
-
- Use <complete success="true|false">message</complete> to output result
|
|
312
|
-
- success="true"
|
|
313
|
-
-
|
|
314
|
-
|
|
315
|
-
- If outputting <complete>, do NOT output <action-type> or <action-param-json>
|
|
289
|
+
- If the task is NOT complete, skip this section and continue to Step ${actionStepNumber}.
|
|
290
|
+
- Use the <complete success="true|false">message</complete> tag to output the result if the goal is accomplished or failed.
|
|
291
|
+
- the 'success' attribute is required. ${shouldIncludeSubGoals ? 'It means whether the expected goal is accomplished based on what you observe in the current screenshot. ' : ''}No matter what actions were executed or what errors occurred during execution, if the ${shouldIncludeSubGoals ? 'expected goal is accomplished' : 'instruction is fulfilled'}, set success="true". If the ${shouldIncludeSubGoals ? 'expected goal is not accomplished and cannot be accomplished' : 'instruction is not fulfilled and cannot be fulfilled'}, set success="false".
|
|
292
|
+
- the 'message' is the information that will be provided to the user. If the user asks for a specific format, strictly follow that.
|
|
293
|
+
- If you output <complete>, do NOT output <action-type> or <action-param-json>. The task ends here.
|
|
316
294
|
|
|
317
|
-
## Step ${actionStepNumber}: Determine Next Action
|
|
295
|
+
## Step ${actionStepNumber}: Determine Next Action (related tags: <log>, <action-type>, <action-param-json>, <error>)
|
|
318
296
|
|
|
319
|
-
ONLY if task is not complete:
|
|
297
|
+
ONLY if the task is not complete: Think what the next action is according to the current screenshot${shouldIncludeSubGoals ? ' and the plan' : ''}.
|
|
320
298
|
|
|
321
|
-
- Don't give extra actions beyond instruction
|
|
322
|
-
- Consider current screenshot
|
|
323
|
-
-
|
|
324
|
-
- Give just the next ONE action
|
|
325
|
-
- If
|
|
299
|
+
- Don't give extra actions or plans beyond the instruction or the plan. For example, don't try to submit the form if the instruction is only to fill something.
|
|
300
|
+
- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.
|
|
301
|
+
- Make sure the previous actions are completed successfully. Otherwise, retry or do something else to recover.
|
|
302
|
+
- Give just the next ONE action you should do (if any)
|
|
303
|
+
- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 3 times, you should think this is an error and set the "error" field to the error message.
|
|
326
304
|
|
|
327
305
|
### Supporting actions list
|
|
328
306
|
|
|
329
307
|
${actionList}
|
|
330
308
|
|
|
331
|
-
### Log
|
|
309
|
+
### Log to give user feedback (preamble message)
|
|
310
|
+
|
|
311
|
+
The <log> tag is a brief preamble message to the user explaining what you're about to do. It should follow these principles and examples:
|
|
332
312
|
|
|
333
|
-
Brief message to user explaining next action:
|
|
334
313
|
- **Use ${preferredLanguage}**
|
|
335
|
-
- **Keep concise**: 1-2 sentences, 8
|
|
336
|
-
- **Build on context**: connect with
|
|
337
|
-
- **
|
|
314
|
+
- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).
|
|
315
|
+
- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what's been done so far and create a sense of momentum and clarity for the user to understand your next actions.
|
|
316
|
+
- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.
|
|
338
317
|
|
|
339
318
|
**Examples:**
|
|
340
319
|
- <log>Click the login button</log>
|
|
341
|
-
- <log>Scroll to find the 'Yes' button</log>
|
|
342
|
-
|
|
343
|
-
|
|
320
|
+
- <log>Scroll to find the 'Yes' button in popup</log>
|
|
321
|
+
- <log>Previous actions failed to find the 'Yes' button, i will try again</log>
|
|
322
|
+
- <log>Go back to find the login button</log>
|
|
344
323
|
|
|
345
|
-
|
|
346
|
-
- <action-type> MUST be from supporting actions list
|
|
347
|
-
- Use EXACT field names from action description
|
|
348
|
-
- 'complete' is NOT a valid action-type
|
|
324
|
+
### If there is some action to do ...
|
|
349
325
|
|
|
350
|
-
|
|
326
|
+
- Use the <action-type> and <action-param-json> tags to output the action to be executed.
|
|
327
|
+
- The <action-type> MUST be one of the supporting actions. 'complete' is NOT a valid action-type.
|
|
328
|
+
- Parameter names are strict. Use EXACTLY the field names listed for the selected action. Do NOT invent alias fields. If an action has a "sample" in its description, follow that structure.
|
|
329
|
+
For example:
|
|
351
330
|
<action-type>Tap</action-type>
|
|
352
331
|
<action-param-json>
|
|
353
332
|
{
|
|
@@ -355,38 +334,263 @@ Example:
|
|
|
355
334
|
}
|
|
356
335
|
</action-param-json>
|
|
357
336
|
|
|
358
|
-
###
|
|
337
|
+
### If you think there is an error ...
|
|
359
338
|
|
|
360
|
-
Use <error> tag
|
|
339
|
+
- Use the <error> tag to output the error message.
|
|
340
|
+
|
|
341
|
+
For example:
|
|
361
342
|
<error>Unable to find the required element on the page</error>
|
|
362
343
|
|
|
363
|
-
|
|
344
|
+
### If there is no action to do ...
|
|
345
|
+
|
|
346
|
+
- Don't output <action-type> or <action-param-json> if there is no action to do.
|
|
347
|
+
|
|
348
|
+
## Return Format
|
|
349
|
+
|
|
350
|
+
Return in XML format following this decision flow:
|
|
364
351
|
|
|
365
352
|
**Always include (REQUIRED):**
|
|
353
|
+
<!-- Step 1: Observe${shouldIncludeSubGoals ? ' and Plan' : ''} -->
|
|
366
354
|
<thought>Your thought process here. NEVER skip this tag.</thought>
|
|
367
355
|
${shouldIncludeSubGoals ? `
|
|
356
|
+
<!-- required when no update-plan-content is provided in the previous response -->
|
|
368
357
|
<update-plan-content>...</update-plan-content>
|
|
358
|
+
|
|
359
|
+
<!-- required when any sub-goal is completed -->
|
|
369
360
|
<mark-sub-goal-done>
|
|
370
361
|
<sub-goal index="1" status="finished" />
|
|
371
362
|
</mark-sub-goal-done>
|
|
372
363
|
` : ''}${shouldIncludeSubGoals ? `
|
|
364
|
+
<!-- Step ${memoryStepNumber}: Memory data from current screenshot if needed -->
|
|
373
365
|
<memory>...</memory>
|
|
374
366
|
` : ''}
|
|
375
|
-
**Then choose ONE
|
|
367
|
+
**Then choose ONE of the following paths:**
|
|
376
368
|
|
|
377
|
-
**Path A: If ${
|
|
369
|
+
**Path A: If the ${shouldIncludeSubGoals ? 'goal is accomplished' : 'instruction is fulfilled'} or failed (Step ${checkGoalStepNumber})**
|
|
378
370
|
<complete success="true|false">...</complete>
|
|
379
371
|
|
|
380
|
-
**Path B: If ${
|
|
372
|
+
**Path B: If the ${shouldIncludeSubGoals ? 'goal is NOT complete' : 'instruction is NOT fulfilled'} yet (Step ${actionStepNumber})**
|
|
373
|
+
<!-- Determine next action -->
|
|
381
374
|
<log>...</log>
|
|
382
375
|
<action-type>...</action-type>
|
|
383
376
|
<action-param-json>...</action-param-json>
|
|
384
377
|
|
|
385
378
|
<!-- OR if there's an error -->
|
|
386
379
|
<error>...</error>
|
|
380
|
+
${shouldIncludeSubGoals ? `
|
|
381
|
+
## Multi-turn Conversation Example
|
|
382
|
+
|
|
383
|
+
Below is an example of a multi-turn conversation for "fill out the registration form with name 'John' and email 'john@example.com', then return the filled email address":
|
|
384
|
+
|
|
385
|
+
### Turn 1 - Initial instruction
|
|
386
|
+
|
|
387
|
+
**User message:**
|
|
388
|
+
<user_instruction>fill out the registration form with name 'John' and email 'john@example.com', then return the filled email address</user_instruction>
|
|
389
|
+
|
|
390
|
+
**Screenshot:** [Shows a registration form with empty Name and Email fields]
|
|
391
|
+
|
|
392
|
+
**Your response:**
|
|
393
|
+
<thought>The user wants me to fill out the registration form with specific values and return the email address. I can see the form has two fields: Name and Email. Both are currently empty. I'll break this down into sub-goals and start with the Name field. Note: The instruction is to fill the form only (not submit), and return the email at the end.</thought>
|
|
394
|
+
<update-plan-content>
|
|
395
|
+
<sub-goal index="1" status="pending">Fill in the Name field with 'John'</sub-goal>
|
|
396
|
+
<sub-goal index="2" status="pending">Fill in the Email field with 'john@example.com'</sub-goal>
|
|
397
|
+
<sub-goal index="3" status="pending">Return the filled email address</sub-goal>
|
|
398
|
+
</update-plan-content>
|
|
399
|
+
<log>Click on the Name field to start filling the form</log>
|
|
400
|
+
<action-type>Tap</action-type>
|
|
401
|
+
<action-param-json>
|
|
402
|
+
{
|
|
403
|
+
"locate": ${locateNameField}
|
|
404
|
+
}
|
|
405
|
+
</action-param-json>
|
|
406
|
+
|
|
407
|
+
### Turn 2 - After clicking Name field
|
|
408
|
+
|
|
409
|
+
**User message:**
|
|
410
|
+
The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.
|
|
411
|
+
|
|
412
|
+
Sub-goals:
|
|
413
|
+
1. Fill in the Name field with 'John' (running)
|
|
414
|
+
2. Fill in the Email field with 'john@example.com' (pending)
|
|
415
|
+
3. Return the filled email address (pending)
|
|
416
|
+
Current sub-goal is: Fill in the Name field with 'John'
|
|
417
|
+
Actions performed for current sub-goal:
|
|
418
|
+
- Click on the Name field to start filling the form
|
|
419
|
+
|
|
420
|
+
**Screenshot:** [Shows the form with Name field now focused/active]
|
|
421
|
+
|
|
422
|
+
**Your response:**
|
|
423
|
+
<thought>The Name field is now focused. I need to type 'John' into this field. Current sub-goal is running, will be completed after input.</thought>
|
|
424
|
+
<log>Typing 'John' into the Name field</log>
|
|
425
|
+
<action-type>Input</action-type>
|
|
426
|
+
<action-param-json>
|
|
427
|
+
{
|
|
428
|
+
"value": "John"
|
|
429
|
+
}
|
|
430
|
+
</action-param-json>
|
|
431
|
+
|
|
432
|
+
### Turn 3 - After entering name
|
|
433
|
+
|
|
434
|
+
**User message:**
|
|
435
|
+
The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.
|
|
436
|
+
|
|
437
|
+
Sub-goals:
|
|
438
|
+
1. Fill in the Name field with 'John' (running)
|
|
439
|
+
2. Fill in the Email field with 'john@example.com' (pending)
|
|
440
|
+
3. Return the filled email address (pending)
|
|
441
|
+
Current sub-goal is: Fill in the Name field with 'John'
|
|
442
|
+
Actions performed for current sub-goal:
|
|
443
|
+
- Click on the Name field to start filling the form
|
|
444
|
+
- Typing 'John' into the Name field
|
|
445
|
+
|
|
446
|
+
**Screenshot:** [Shows the form with Name field containing 'John']
|
|
447
|
+
|
|
448
|
+
**Your response:**
|
|
449
|
+
<thought>The Name field now shows 'John'. Sub-goal 1 is complete. Next, I need to fill the Email field for sub-goal 2.</thought>
|
|
450
|
+
<mark-sub-goal-done>
|
|
451
|
+
<sub-goal index="1" status="finished" />
|
|
452
|
+
</mark-sub-goal-done>
|
|
453
|
+
<memory>Name field has been filled with 'John'</memory>
|
|
454
|
+
<log>Moving to the Email field</log>
|
|
455
|
+
<action-type>Tap</action-type>
|
|
456
|
+
<action-param-json>
|
|
457
|
+
{
|
|
458
|
+
"locate": ${locateEmailField}
|
|
459
|
+
}
|
|
460
|
+
</action-param-json>
|
|
461
|
+
|
|
462
|
+
### Turn 4 - After clicking Email field
|
|
463
|
+
|
|
464
|
+
**User message:**
|
|
465
|
+
The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.
|
|
466
|
+
|
|
467
|
+
Sub-goals:
|
|
468
|
+
1. Fill in the Name field with 'John' (finished)
|
|
469
|
+
2. Fill in the Email field with 'john@example.com' (running)
|
|
470
|
+
3. Return the filled email address (pending)
|
|
471
|
+
Current sub-goal is: Fill in the Email field with 'john@example.com'
|
|
472
|
+
Actions performed for current sub-goal:
|
|
473
|
+
- Moving to the Email field
|
|
474
|
+
|
|
475
|
+
**Screenshot:** [Shows the form with Name='John' and Email field focused]
|
|
476
|
+
|
|
477
|
+
**Your response:**
|
|
478
|
+
<thought>The Email field is now focused. I'll enter 'john@example.com'. After this, sub-goal 2 will be complete and my task will be done.</thought>
|
|
479
|
+
<log>Typing email address into the Email field</log>
|
|
480
|
+
<action-type>Input</action-type>
|
|
481
|
+
<action-param-json>
|
|
482
|
+
{
|
|
483
|
+
"value": "john@example.com"
|
|
484
|
+
}
|
|
485
|
+
</action-param-json>
|
|
486
|
+
|
|
487
|
+
### Turn 5 - After entering email (Goal accomplished)
|
|
488
|
+
|
|
489
|
+
**User message:**
|
|
490
|
+
The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.
|
|
491
|
+
|
|
492
|
+
Sub-goals:
|
|
493
|
+
1. Fill in the Name field with 'John' (finished)
|
|
494
|
+
2. Fill in the Email field with 'john@example.com' (running)
|
|
495
|
+
3. Return the filled email address (pending)
|
|
496
|
+
Current sub-goal is: Fill in the Email field with 'john@example.com'
|
|
497
|
+
Actions performed for current sub-goal:
|
|
498
|
+
- Moving to the Email field
|
|
499
|
+
- Typing email address into the Email field
|
|
500
|
+
|
|
501
|
+
**Screenshot:** [Shows the form with Name='John' and Email='john@example.com']
|
|
502
|
+
|
|
503
|
+
**Your response:**
|
|
504
|
+
<thought>Both fields are now filled: Name shows 'John' and Email shows 'john@example.com'. Sub-goal 2 is complete. The user asked me to return the filled email address, so I need to include 'john@example.com' in my response. All sub-goals are now finished.</thought>
|
|
505
|
+
<mark-sub-goal-done>
|
|
506
|
+
<sub-goal index="2" status="finished" />
|
|
507
|
+
<sub-goal index="3" status="finished" />
|
|
508
|
+
</mark-sub-goal-done>
|
|
509
|
+
<complete success="true">john@example.com</complete>
|
|
510
|
+
` : `
|
|
511
|
+
## Multi-turn Conversation Example
|
|
512
|
+
|
|
513
|
+
Below is an example of a multi-turn conversation for "fill out the registration form with name 'John' and email 'john@example.com', then return the filled email address":
|
|
514
|
+
|
|
515
|
+
### Turn 1 - Initial instruction
|
|
516
|
+
|
|
517
|
+
**User message:**
|
|
518
|
+
<user_instruction>fill out the registration form with name 'John' and email 'john@example.com', then return the filled email address</user_instruction>
|
|
519
|
+
|
|
520
|
+
**Screenshot:** [Shows a registration form with empty Name and Email fields]
|
|
521
|
+
|
|
522
|
+
**Your response:**
|
|
523
|
+
<thought>The user wants me to fill out the registration form with specific values and return the email address. I can see the form has two fields: Name and Email. Both are currently empty. I should start by clicking on the Name field. Note: The instruction is to fill the form only (not submit), and return the email at the end.</thought>
|
|
524
|
+
<log>Click on the Name field to start filling the form</log>
|
|
525
|
+
<action-type>Tap</action-type>
|
|
526
|
+
<action-param-json>
|
|
527
|
+
{
|
|
528
|
+
"locate": ${locateNameField}
|
|
529
|
+
}
|
|
530
|
+
</action-param-json>
|
|
531
|
+
|
|
532
|
+
### Turn 2 - After clicking Name field
|
|
533
|
+
|
|
534
|
+
**User message:**
|
|
535
|
+
The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.
|
|
536
|
+
|
|
537
|
+
**Screenshot:** [Shows the form with Name field now focused/active]
|
|
538
|
+
|
|
539
|
+
**Your response:**
|
|
540
|
+
<thought>The Name field is now focused. I need to type 'John' into this field. After this, I'll need to fill the Email field as well.</thought>
|
|
541
|
+
<log>Typing 'John' into the Name field</log>
|
|
542
|
+
<action-type>Input</action-type>
|
|
543
|
+
<action-param-json>
|
|
544
|
+
{
|
|
545
|
+
"value": "John"
|
|
546
|
+
}
|
|
547
|
+
</action-param-json>
|
|
548
|
+
|
|
549
|
+
### Turn 3 - After entering name
|
|
550
|
+
|
|
551
|
+
**User message:**
|
|
552
|
+
The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.
|
|
553
|
+
|
|
554
|
+
**Screenshot:** [Shows the form with Name field containing 'John']
|
|
555
|
+
|
|
556
|
+
**Your response:**
|
|
557
|
+
<thought>The Name field now shows 'John'. Next, I need to fill the Email field with 'john@example.com'. I'll click on the Email field first.</thought>
|
|
558
|
+
<log>Moving to the Email field</log>
|
|
559
|
+
<action-type>Tap</action-type>
|
|
560
|
+
<action-param-json>
|
|
561
|
+
{
|
|
562
|
+
"locate": ${locateEmailField}
|
|
563
|
+
}
|
|
564
|
+
</action-param-json>
|
|
565
|
+
|
|
566
|
+
### Turn 4 - After clicking Email field
|
|
567
|
+
|
|
568
|
+
**User message:**
|
|
569
|
+
The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.
|
|
570
|
+
|
|
571
|
+
**Screenshot:** [Shows the form with Name='John' and Email field focused]
|
|
572
|
+
|
|
573
|
+
**Your response:**
|
|
574
|
+
<thought>The Email field is now focused. I'll enter 'john@example.com'. After this, both required fields will be filled and my task will be complete.</thought>
|
|
575
|
+
<log>Typing email address into the Email field</log>
|
|
576
|
+
<action-type>Input</action-type>
|
|
577
|
+
<action-param-json>
|
|
578
|
+
{
|
|
579
|
+
"value": "john@example.com"
|
|
580
|
+
}
|
|
581
|
+
</action-param-json>
|
|
582
|
+
|
|
583
|
+
### Turn 5 - After entering email (Instruction fulfilled)
|
|
584
|
+
|
|
585
|
+
**User message:**
|
|
586
|
+
The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.
|
|
587
|
+
|
|
588
|
+
**Screenshot:** [Shows the form with Name='John' and Email='john@example.com']
|
|
387
589
|
|
|
388
|
-
|
|
389
|
-
|
|
590
|
+
**Your response:**
|
|
591
|
+
<thought>Both fields are now filled: Name shows 'John' and Email shows 'john@example.com'. The user asked me to return the filled email address, so I should include 'john@example.com' in my response. The instruction has been fulfilled.</thought>
|
|
592
|
+
<complete success="true">john@example.com</complete>
|
|
593
|
+
`}`;
|
|
390
594
|
}
|
|
391
595
|
// Expose `descriptionForAction` to consumers by copying it from the `__webpack_exports__` object onto `exports`.
exports.descriptionForAction = __webpack_exports__.descriptionForAction;
|
|
392
596
|
// Expose `systemPromptToTaskPlanning` to consumers by copying it from the `__webpack_exports__` object onto `exports`.
exports.systemPromptToTaskPlanning = __webpack_exports__.systemPromptToTaskPlanning;
|