@midscene/core 0.5.2-beta-20241010035503.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/ai-model.js +170 -173
- package/dist/es/index.js +171 -172
- package/dist/lib/ai-model.js +170 -173
- package/dist/lib/index.js +171 -172
- package/dist/types/ai-model.d.ts +1 -1
- package/package.json +5 -3
- package/report/index.html +1 -1
package/dist/es/ai-model.js
CHANGED
|
@@ -4222,155 +4222,6 @@ var wrapOpenAI = (openai, options) => {
|
|
|
4222
4222
|
// src/ai-model/openai/index.ts
|
|
4223
4223
|
import OpenAI, { AzureOpenAI } from "openai";
|
|
4224
4224
|
|
|
4225
|
-
// src/ai-model/automation/planning.ts
|
|
4226
|
-
function systemPromptToTaskPlanning() {
|
|
4227
|
-
return `
|
|
4228
|
-
## Role:
|
|
4229
|
-
|
|
4230
|
-
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
|
|
4231
|
-
|
|
4232
|
-
## Objective 1 (main objective): Decompose the task user asked into a series of actions:
|
|
4233
|
-
|
|
4234
|
-
- Based on the page context information (screenshot and description) you get, decompose the task user asked into a series of actions.
|
|
4235
|
-
- Actions are executed in the order listed in the list. After executing the actions, the task should be completed.
|
|
4236
|
-
|
|
4237
|
-
Each action has a type and corresponding param. To be detailed:
|
|
4238
|
-
* type: 'Locate', it means to locate one element
|
|
4239
|
-
* param: { prompt: string }, the prompt describes 'which element to focus on page'. Our AI engine will use this prompt to locate the element, so it should clearly describe the obvious features of the element, such as its content, color, size, shape, and position. For example, 'The biggest Download Button on the left side of the page.'
|
|
4240
|
-
* type: 'Tap', tap the previous element found
|
|
4241
|
-
* param: null
|
|
4242
|
-
* type: 'Hover', hover the previous element found
|
|
4243
|
-
* param: null
|
|
4244
|
-
* type: 'Input', replace the value in the input field
|
|
4245
|
-
* param: { value: string }, The input value must not be an empty string. Provide a meaningful final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value. After locating the input field, do not use 'Tap' action, proceed directly to 'Input' action.
|
|
4246
|
-
* type: 'KeyboardPress', press a key
|
|
4247
|
-
* param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
|
|
4248
|
-
* type: 'Scroll'
|
|
4249
|
-
* param: { scrollType: 'scrollDownOneScreen', 'scrollUpOneScreen', 'scrollUntilBottom', 'scrollUntilTop' }
|
|
4250
|
-
* type: 'Error'
|
|
4251
|
-
* param: { message: string }, the error message
|
|
4252
|
-
* type: 'Sleep'
|
|
4253
|
-
* param: { timeMs: number }, wait for timeMs milliseconds
|
|
4254
|
-
|
|
4255
|
-
Here is an example of how to decompose a task.
|
|
4256
|
-
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
|
|
4257
|
-
* Locate: 'The search bar'
|
|
4258
|
-
* Input: 'Weather in Shanghai'
|
|
4259
|
-
* Sleep: 1000
|
|
4260
|
-
* KeyboardPress: 'Enter'
|
|
4261
|
-
|
|
4262
|
-
Remember:
|
|
4263
|
-
1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
|
|
4264
|
-
2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
|
|
4265
|
-
|
|
4266
|
-
If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
|
|
4267
|
-
|
|
4268
|
-
## Objective 2 (sub objective): Give a quick answer to the action with type "Locate" you just planned
|
|
4269
|
-
|
|
4270
|
-
Review the action you just planned. If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
|
|
4271
|
-
{
|
|
4272
|
-
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
|
|
4273
|
-
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4274
|
-
"id": "4" // ID of this element, replace with actual value in practice
|
|
4275
|
-
}
|
|
4276
|
-
|
|
4277
|
-
If the action type is not 'Locate', or there is no element meets the description in the prompt (usually because it will show up after some interaction), the answer should be null.
|
|
4278
|
-
|
|
4279
|
-
## Output JSON Format:
|
|
4280
|
-
|
|
4281
|
-
Please return the result in JSON format as follows:
|
|
4282
|
-
{
|
|
4283
|
-
queryLanguage: '', // language of the description of the task
|
|
4284
|
-
actions: [ // always return in Array
|
|
4285
|
-
{
|
|
4286
|
-
"thought": "find out the search bar",
|
|
4287
|
-
"type": "Locate", // Type of action, like 'Tap' 'Hover' ...
|
|
4288
|
-
"param": {
|
|
4289
|
-
"prompt": "The search bar"
|
|
4290
|
-
},
|
|
4291
|
-
"quickAnswer": { // since the first action is Locate, so we need to give a quick answer
|
|
4292
|
-
"reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar",
|
|
4293
|
-
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4294
|
-
"id": "4" // ID of this element, replace with actual value in practice
|
|
4295
|
-
} | null,
|
|
4296
|
-
},
|
|
4297
|
-
{
|
|
4298
|
-
"thought": "Reasons for generating this task, and why this task is feasible on this page",
|
|
4299
|
-
"type": "Tap", // Type of action, like 'Tap' 'Hover' ...
|
|
4300
|
-
"param": any, // Parameter towards the task type
|
|
4301
|
-
},
|
|
4302
|
-
// ... more actions
|
|
4303
|
-
],
|
|
4304
|
-
error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
|
|
4305
|
-
}
|
|
4306
|
-
`;
|
|
4307
|
-
}
|
|
4308
|
-
var planSchema = {
|
|
4309
|
-
type: "json_schema",
|
|
4310
|
-
json_schema: {
|
|
4311
|
-
name: "action_items",
|
|
4312
|
-
strict: true,
|
|
4313
|
-
schema: {
|
|
4314
|
-
type: "object",
|
|
4315
|
-
properties: {
|
|
4316
|
-
queryLanguage: {
|
|
4317
|
-
type: "string",
|
|
4318
|
-
description: "Language of the description of the task"
|
|
4319
|
-
},
|
|
4320
|
-
actions: {
|
|
4321
|
-
type: "array",
|
|
4322
|
-
items: {
|
|
4323
|
-
type: "object",
|
|
4324
|
-
properties: {
|
|
4325
|
-
thought: {
|
|
4326
|
-
type: "string",
|
|
4327
|
-
description: "Reasons for generating this task, and why this task is feasible on this page"
|
|
4328
|
-
},
|
|
4329
|
-
type: {
|
|
4330
|
-
type: "string",
|
|
4331
|
-
description: 'Type of action, like "Tap", "Hover", etc.'
|
|
4332
|
-
},
|
|
4333
|
-
param: {
|
|
4334
|
-
type: ["object", "null"],
|
|
4335
|
-
description: "Parameter towards the task type, can be null"
|
|
4336
|
-
},
|
|
4337
|
-
quickAnswer: {
|
|
4338
|
-
type: ["object", "null"],
|
|
4339
|
-
nullable: true,
|
|
4340
|
-
properties: {
|
|
4341
|
-
reason: {
|
|
4342
|
-
type: "string",
|
|
4343
|
-
description: "Reason for finding element 4"
|
|
4344
|
-
},
|
|
4345
|
-
text: {
|
|
4346
|
-
type: "string",
|
|
4347
|
-
description: "Text of elementInfo, if none, leave empty"
|
|
4348
|
-
},
|
|
4349
|
-
id: {
|
|
4350
|
-
type: "string",
|
|
4351
|
-
description: "ID of this element"
|
|
4352
|
-
}
|
|
4353
|
-
},
|
|
4354
|
-
required: ["reason", "text", "id"],
|
|
4355
|
-
additionalProperties: false
|
|
4356
|
-
}
|
|
4357
|
-
},
|
|
4358
|
-
required: ["thought", "type", "param", "quickAnswer"],
|
|
4359
|
-
additionalProperties: false
|
|
4360
|
-
},
|
|
4361
|
-
description: "List of actions to be performed"
|
|
4362
|
-
},
|
|
4363
|
-
error: {
|
|
4364
|
-
type: ["string", "null"],
|
|
4365
|
-
description: "Overall error messages. If there is any error occurs during the task planning, conclude the errors again and put error messages here"
|
|
4366
|
-
}
|
|
4367
|
-
},
|
|
4368
|
-
required: ["queryLanguage", "actions", "error"],
|
|
4369
|
-
additionalProperties: false
|
|
4370
|
-
}
|
|
4371
|
-
}
|
|
4372
|
-
};
|
|
4373
|
-
|
|
4374
4225
|
// src/ai-model/coze/index.ts
|
|
4375
4226
|
import assert from "assert";
|
|
4376
4227
|
import fetch2 from "node-fetch";
|
|
@@ -4569,7 +4420,8 @@ Input Example:
|
|
|
4569
4420
|
},
|
|
4570
4421
|
"elementInfos": [
|
|
4571
4422
|
{
|
|
4572
|
-
"id": "
|
|
4423
|
+
"id": "we23xsfwe", // ID of the element
|
|
4424
|
+
"indexId": "0", // Index of the element,The image is labeled to the left of the element
|
|
4573
4425
|
"attributes": { // Attributes of the element
|
|
4574
4426
|
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
4575
4427
|
"src": "https://ap-southeast-3.m",
|
|
@@ -4584,7 +4436,8 @@ Input Example:
|
|
|
4584
4436
|
}
|
|
4585
4437
|
},
|
|
4586
4438
|
{
|
|
4587
|
-
"id": "
|
|
4439
|
+
"id": "wefew2222few2", // ID of the element
|
|
4440
|
+
"indexId": "1", // Index of the element,The image is labeled to the left of the element
|
|
4588
4441
|
"attributes": { // Attributes of the element
|
|
4589
4442
|
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
4590
4443
|
"src": "data:image/png;base64,iVBORw0KGgoAAAANSU...",
|
|
@@ -4600,7 +4453,8 @@ Input Example:
|
|
|
4600
4453
|
},
|
|
4601
4454
|
...
|
|
4602
4455
|
{
|
|
4603
|
-
"id": "
|
|
4456
|
+
"id": "kwekfj2323",
|
|
4457
|
+
"indexId": "2", // Index of the element,The image is labeled to the left of the element
|
|
4604
4458
|
"attributes": {
|
|
4605
4459
|
"nodeType": "TEXT Node",
|
|
4606
4460
|
"class": ".product-name"
|
|
@@ -4632,7 +4486,7 @@ Output Example:
|
|
|
4632
4486
|
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
|
|
4633
4487
|
"text": "",
|
|
4634
4488
|
// ID of this element, replace with actual value in practice
|
|
4635
|
-
"id": "
|
|
4489
|
+
"id": "wefew2222few2"
|
|
4636
4490
|
}
|
|
4637
4491
|
],
|
|
4638
4492
|
"errors": []
|
|
@@ -4689,6 +4543,155 @@ var findElementSchema = {
|
|
|
4689
4543
|
}
|
|
4690
4544
|
};
|
|
4691
4545
|
|
|
4546
|
+
// src/ai-model/prompt/planning.ts
|
|
4547
|
+
function systemPromptToTaskPlanning() {
|
|
4548
|
+
return `
|
|
4549
|
+
## Role:
|
|
4550
|
+
|
|
4551
|
+
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
|
|
4552
|
+
|
|
4553
|
+
## Objective 1 (main objective): Decompose the task user asked into a series of actions:
|
|
4554
|
+
|
|
4555
|
+
- Based on the page context information (screenshot and description) you get, decompose the task user asked into a series of actions.
|
|
4556
|
+
- Actions are executed in the order listed in the list. After executing the actions, the task should be completed.
|
|
4557
|
+
|
|
4558
|
+
Each action has a type and corresponding param. To be detailed:
|
|
4559
|
+
* type: 'Locate', it means to locate one element
|
|
4560
|
+
* param: { prompt: string }, the prompt describes 'which element to focus on page'. Our AI engine will use this prompt to locate the element, so it should clearly describe the obvious features of the element, such as its content, color, size, shape, and position. For example, 'The biggest Download Button on the left side of the page.'
|
|
4561
|
+
* type: 'Tap', tap the previous element found
|
|
4562
|
+
* param: null
|
|
4563
|
+
* type: 'Hover', hover the previous element found
|
|
4564
|
+
* param: null
|
|
4565
|
+
* type: 'Input', replace the value in the input field
|
|
4566
|
+
* param: { value: string }, The input value must not be an empty string. Provide a meaningful final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value. After locating the input field, do not use 'Tap' action, proceed directly to 'Input' action.
|
|
4567
|
+
* type: 'KeyboardPress', press a key
|
|
4568
|
+
* param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
|
|
4569
|
+
* type: 'Scroll'
|
|
4570
|
+
* param: { scrollType: 'scrollDownOneScreen', 'scrollUpOneScreen', 'scrollUntilBottom', 'scrollUntilTop' }
|
|
4571
|
+
* type: 'Error'
|
|
4572
|
+
* param: { message: string }, the error message
|
|
4573
|
+
* type: 'Sleep'
|
|
4574
|
+
* param: { timeMs: number }, wait for timeMs milliseconds
|
|
4575
|
+
|
|
4576
|
+
Here is an example of how to decompose a task.
|
|
4577
|
+
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
|
|
4578
|
+
* Locate: 'The search bar'
|
|
4579
|
+
* Input: 'Weather in Shanghai'
|
|
4580
|
+
* Sleep: 1000
|
|
4581
|
+
* KeyboardPress: 'Enter'
|
|
4582
|
+
|
|
4583
|
+
Remember:
|
|
4584
|
+
1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
|
|
4585
|
+
2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
|
|
4586
|
+
|
|
4587
|
+
If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
|
|
4588
|
+
|
|
4589
|
+
## Objective 2 (sub objective): Give a quick answer to the action with type "Locate" you just planned
|
|
4590
|
+
|
|
4591
|
+
Review the action you just planned. If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
|
|
4592
|
+
{
|
|
4593
|
+
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
|
|
4594
|
+
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4595
|
+
"id": "wefew2222few2" // id of this element, replace with actual value in practice
|
|
4596
|
+
}
|
|
4597
|
+
|
|
4598
|
+
If the action type is not 'Locate', or there is no element meets the description in the prompt (usually because it will show up after some interaction), the answer should be null.
|
|
4599
|
+
|
|
4600
|
+
## Output JSON Format:
|
|
4601
|
+
|
|
4602
|
+
Please return the result in JSON format as follows:
|
|
4603
|
+
{
|
|
4604
|
+
queryLanguage: '', // language of the description of the task
|
|
4605
|
+
actions: [ // always return in Array
|
|
4606
|
+
{
|
|
4607
|
+
"thought": "find out the search bar",
|
|
4608
|
+
"type": "Locate", // Type of action, like 'Tap' 'Hover' ...
|
|
4609
|
+
"param": {
|
|
4610
|
+
"prompt": "The search bar"
|
|
4611
|
+
},
|
|
4612
|
+
"quickAnswer": { // since the first action is Locate, so we need to give a quick answer
|
|
4613
|
+
"reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar",
|
|
4614
|
+
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4615
|
+
"id": "wefew2222few2" // ID of this element, replace with actual value in practice
|
|
4616
|
+
} | null,
|
|
4617
|
+
},
|
|
4618
|
+
{
|
|
4619
|
+
"thought": "Reasons for generating this task, and why this task is feasible on this page",
|
|
4620
|
+
"type": "Tap", // Type of action, like 'Tap' 'Hover' ...
|
|
4621
|
+
"param": any, // Parameter towards the task type
|
|
4622
|
+
},
|
|
4623
|
+
// ... more actions
|
|
4624
|
+
],
|
|
4625
|
+
error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
|
|
4626
|
+
}
|
|
4627
|
+
`;
|
|
4628
|
+
}
|
|
4629
|
+
var planSchema = {
|
|
4630
|
+
type: "json_schema",
|
|
4631
|
+
json_schema: {
|
|
4632
|
+
name: "action_items",
|
|
4633
|
+
strict: true,
|
|
4634
|
+
schema: {
|
|
4635
|
+
type: "object",
|
|
4636
|
+
properties: {
|
|
4637
|
+
queryLanguage: {
|
|
4638
|
+
type: "string",
|
|
4639
|
+
description: "Language of the description of the task"
|
|
4640
|
+
},
|
|
4641
|
+
actions: {
|
|
4642
|
+
type: "array",
|
|
4643
|
+
items: {
|
|
4644
|
+
type: "object",
|
|
4645
|
+
properties: {
|
|
4646
|
+
thought: {
|
|
4647
|
+
type: "string",
|
|
4648
|
+
description: "Reasons for generating this task, and why this task is feasible on this page"
|
|
4649
|
+
},
|
|
4650
|
+
type: {
|
|
4651
|
+
type: "string",
|
|
4652
|
+
description: 'Type of action, like "Tap", "Hover", etc.'
|
|
4653
|
+
},
|
|
4654
|
+
param: {
|
|
4655
|
+
type: ["object", "null"],
|
|
4656
|
+
description: "Parameter towards the task type, can be null"
|
|
4657
|
+
},
|
|
4658
|
+
quickAnswer: {
|
|
4659
|
+
type: ["object", "null"],
|
|
4660
|
+
nullable: true,
|
|
4661
|
+
properties: {
|
|
4662
|
+
reason: {
|
|
4663
|
+
type: "string",
|
|
4664
|
+
description: "Reason for finding element 4"
|
|
4665
|
+
},
|
|
4666
|
+
text: {
|
|
4667
|
+
type: "string",
|
|
4668
|
+
description: "Text of elementInfo, if none, leave empty"
|
|
4669
|
+
},
|
|
4670
|
+
id: {
|
|
4671
|
+
type: "string",
|
|
4672
|
+
description: "ID of this element"
|
|
4673
|
+
}
|
|
4674
|
+
},
|
|
4675
|
+
required: ["reason", "text", "id"],
|
|
4676
|
+
additionalProperties: false
|
|
4677
|
+
}
|
|
4678
|
+
},
|
|
4679
|
+
required: ["thought", "type", "param", "quickAnswer"],
|
|
4680
|
+
additionalProperties: false
|
|
4681
|
+
},
|
|
4682
|
+
description: "List of actions to be performed"
|
|
4683
|
+
},
|
|
4684
|
+
error: {
|
|
4685
|
+
type: ["string", "null"],
|
|
4686
|
+
description: "Overall error messages. If there is any error occurs during the task planning, conclude the errors again and put error messages here"
|
|
4687
|
+
}
|
|
4688
|
+
},
|
|
4689
|
+
required: ["queryLanguage", "actions", "error"],
|
|
4690
|
+
additionalProperties: false
|
|
4691
|
+
}
|
|
4692
|
+
}
|
|
4693
|
+
};
|
|
4694
|
+
|
|
4692
4695
|
// src/ai-model/prompt/util.ts
|
|
4693
4696
|
import assert2 from "assert";
|
|
4694
4697
|
|
|
@@ -4707,18 +4710,10 @@ import {
|
|
|
4707
4710
|
var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
|
|
4708
4711
|
var contextFormatIntro = `
|
|
4709
4712
|
The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.`;
|
|
4710
|
-
var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
|
|
4711
|
-
var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
|
|
4712
|
-
var skillExtractData = `skill name: extract_data_from_UI
|
|
4713
|
-
related input: DATA_DEMAND
|
|
4714
|
-
skill content:
|
|
4715
|
-
* User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
|
|
4716
|
-
* There may be some special commands in DATA_DEMAND, please pay extra attention
|
|
4717
|
-
- ${ONE_ELEMENT_LOCATOR_PREFIX} and ${ELEMENTS_LOCATOR_PREFIX}: if you see a description that mentions the keyword ${ONE_ELEMENT_LOCATOR_PREFIX} or ${ELEMENTS_LOCATOR_PREFIX}(e.g. follow ${ONE_ELEMENT_LOCATOR_PREFIX} : i want to find ...), it means user wants to locate a specific element meets the description. Return in this way: prefix + the id / comma-separated ids, for example: ${ONE_ELEMENT_LOCATOR_PREFIX}/1 , ${ELEMENTS_LOCATOR_PREFIX}/1,2,3 . If not found, keep the prefix and leave the suffix empty, like ${ONE_ELEMENT_LOCATOR_PREFIX}/ .`;
|
|
4718
4713
|
function systemPromptToExtract() {
|
|
4719
4714
|
return `
|
|
4720
4715
|
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
|
|
4721
|
-
The user will give you a screenshot and the
|
|
4716
|
+
The user will give you a screenshot and the contents of it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.
|
|
4722
4717
|
|
|
4723
4718
|
You have the following skills:
|
|
4724
4719
|
|
|
@@ -4807,14 +4802,15 @@ async function describeUserPage(context) {
|
|
|
4807
4802
|
const elementInfosDescription = cropFieldInformation(elementsInfo);
|
|
4808
4803
|
return {
|
|
4809
4804
|
description: `
|
|
4810
|
-
|
|
4811
|
-
|
|
4812
|
-
|
|
4805
|
+
{
|
|
4806
|
+
// The size of the page
|
|
4807
|
+
"pageSize": ${describeSize({ width, height })},
|
|
4813
4808
|
|
|
4809
|
+
// json description of the element
|
|
4810
|
+
"content": ${JSON.stringify(elementInfosDescription)}
|
|
4814
4811
|
|
|
4815
|
-
|
|
4816
|
-
|
|
4817
|
-
}`,
|
|
4812
|
+
}`,
|
|
4813
|
+
// // json description of the element
|
|
4818
4814
|
elementById(id) {
|
|
4819
4815
|
assert2(typeof id !== "undefined", "id is required for query");
|
|
4820
4816
|
const item = idElementMap[`${id}`];
|
|
@@ -4837,6 +4833,7 @@ function cropFieldInformation(elementsInfo) {
|
|
|
4837
4833
|
);
|
|
4838
4834
|
return {
|
|
4839
4835
|
id,
|
|
4836
|
+
markerId: item.indexId,
|
|
4840
4837
|
attributes: tailorAttributes,
|
|
4841
4838
|
rect,
|
|
4842
4839
|
content: tailorContent
|
|
@@ -4880,7 +4877,7 @@ async function createOpenAI() {
|
|
|
4880
4877
|
}
|
|
4881
4878
|
if (process.env[MIDSCENE_LANGSMITH_DEBUG]) {
|
|
4882
4879
|
console.log("DEBUGGING MODE: langsmith wrapper enabled");
|
|
4883
|
-
const openai2 = wrapOpenAI(new OpenAI());
|
|
4880
|
+
const openai2 = wrapOpenAI(new OpenAI(extraConfig));
|
|
4884
4881
|
return openai2;
|
|
4885
4882
|
}
|
|
4886
4883
|
return openai;
|
|
@@ -4893,7 +4890,7 @@ async function call(messages, responseFormat) {
|
|
|
4893
4890
|
model,
|
|
4894
4891
|
messages,
|
|
4895
4892
|
response_format: responseFormat,
|
|
4896
|
-
temperature: 0.
|
|
4893
|
+
temperature: 0.1,
|
|
4897
4894
|
stream: false
|
|
4898
4895
|
});
|
|
4899
4896
|
shouldPrintTiming && console.timeEnd("Midscene - AI call");
|
|
@@ -4949,7 +4946,7 @@ function extractJSONFromCodeBlock(response) {
|
|
|
4949
4946
|
import assert4 from "assert";
|
|
4950
4947
|
async function AiInspectElement(options) {
|
|
4951
4948
|
var _a;
|
|
4952
|
-
const { context, multi,
|
|
4949
|
+
const { context, multi, targetElementDescription, callAI, useModel } = options;
|
|
4953
4950
|
const { screenshotBase64 } = context;
|
|
4954
4951
|
const { description, elementById } = await describeUserPage(context);
|
|
4955
4952
|
if (((_a = options.quickAnswer) == null ? void 0 : _a.id) && elementById(options.quickAnswer.id)) {
|
|
@@ -4979,10 +4976,10 @@ async function AiInspectElement(options) {
|
|
|
4979
4976
|
|
|
4980
4977
|
${description}
|
|
4981
4978
|
|
|
4982
|
-
Here is the
|
|
4979
|
+
Here is the item user want to find. Just go ahead:
|
|
4983
4980
|
=====================================
|
|
4984
4981
|
${JSON.stringify({
|
|
4985
|
-
description:
|
|
4982
|
+
description: targetElementDescription,
|
|
4986
4983
|
multi: multiDescription(multi)
|
|
4987
4984
|
})}
|
|
4988
4985
|
=====================================
|