@midscene/core 0.5.2-beta-20241010035503.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4222,155 +4222,6 @@ var wrapOpenAI = (openai, options) => {
4222
4222
  // src/ai-model/openai/index.ts
4223
4223
  import OpenAI, { AzureOpenAI } from "openai";
4224
4224
 
4225
- // src/ai-model/automation/planning.ts
4226
- function systemPromptToTaskPlanning() {
4227
- return `
4228
- ## Role:
4229
-
4230
- You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
4231
-
4232
- ## Objective 1 (main objective): Decompose the task user asked into a series of actions:
4233
-
4234
- - Based on the page context information (screenshot and description) you get, decompose the task user asked into a series of actions.
4235
- - Actions are executed in the order listed in the list. After executing the actions, the task should be completed.
4236
-
4237
- Each action has a type and corresponding param. To be detailed:
4238
- * type: 'Locate', it means to locate one element
4239
- * param: { prompt: string }, the prompt describes 'which element to focus on page'. Our AI engine will use this prompt to locate the element, so it should clearly describe the obvious features of the element, such as its content, color, size, shape, and position. For example, 'The biggest Download Button on the left side of the page.'
4240
- * type: 'Tap', tap the previous element found
4241
- * param: null
4242
- * type: 'Hover', hover the previous element found
4243
- * param: null
4244
- * type: 'Input', replace the value in the input field
4245
- * param: { value: string }, The input value must not be an empty string. Provide a meaningful final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value. After locating the input field, do not use 'Tap' action, proceed directly to 'Input' action.
4246
- * type: 'KeyboardPress', press a key
4247
- * param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
4248
- * type: 'Scroll'
4249
- * param: { scrollType: 'scrollDownOneScreen', 'scrollUpOneScreen', 'scrollUntilBottom', 'scrollUntilTop' }
4250
- * type: 'Error'
4251
- * param: { message: string }, the error message
4252
- * type: 'Sleep'
4253
- * param: { timeMs: number }, wait for timeMs milliseconds
4254
-
4255
- Here is an example of how to decompose a task.
4256
- When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
4257
- * Locate: 'The search bar'
4258
- * Input: 'Weather in Shanghai'
4259
- * Sleep: 1000
4260
- * KeyboardPress: 'Enter'
4261
-
4262
- Remember:
4263
- 1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
4264
- 2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
4265
-
4266
- If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
4267
-
4268
- ## Objective 2 (sub objective): Give a quick answer to the action with type "Locate" you just planned
4269
-
4270
- Review the action you just planned. If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
4271
- {
4272
- "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
4273
- "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4274
- "id": "4" // ID of this element, replace with actual value in practice
4275
- }
4276
-
4277
- If the action type is not 'Locate', or there is no element meets the description in the prompt (usually because it will show up after some interaction), the answer should be null.
4278
-
4279
- ## Output JSON Format:
4280
-
4281
- Please return the result in JSON format as follows:
4282
- {
4283
- queryLanguage: '', // language of the description of the task
4284
- actions: [ // always return in Array
4285
- {
4286
- "thought": "find out the search bar",
4287
- "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
4288
- "param": {
4289
- "prompt": "The search bar"
4290
- },
4291
- "quickAnswer": { // since the first action is Locate, so we need to give a quick answer
4292
- "reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar",
4293
- "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4294
- "id": "4" // ID of this element, replace with actual value in practice
4295
- } | null,
4296
- },
4297
- {
4298
- "thought": "Reasons for generating this task, and why this task is feasible on this page",
4299
- "type": "Tap", // Type of action, like 'Tap' 'Hover' ...
4300
- "param": any, // Parameter towards the task type
4301
- },
4302
- // ... more actions
4303
- ],
4304
- error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
4305
- }
4306
- `;
4307
- }
4308
- var planSchema = {
4309
- type: "json_schema",
4310
- json_schema: {
4311
- name: "action_items",
4312
- strict: true,
4313
- schema: {
4314
- type: "object",
4315
- properties: {
4316
- queryLanguage: {
4317
- type: "string",
4318
- description: "Language of the description of the task"
4319
- },
4320
- actions: {
4321
- type: "array",
4322
- items: {
4323
- type: "object",
4324
- properties: {
4325
- thought: {
4326
- type: "string",
4327
- description: "Reasons for generating this task, and why this task is feasible on this page"
4328
- },
4329
- type: {
4330
- type: "string",
4331
- description: 'Type of action, like "Tap", "Hover", etc.'
4332
- },
4333
- param: {
4334
- type: ["object", "null"],
4335
- description: "Parameter towards the task type, can be null"
4336
- },
4337
- quickAnswer: {
4338
- type: ["object", "null"],
4339
- nullable: true,
4340
- properties: {
4341
- reason: {
4342
- type: "string",
4343
- description: "Reason for finding element 4"
4344
- },
4345
- text: {
4346
- type: "string",
4347
- description: "Text of elementInfo, if none, leave empty"
4348
- },
4349
- id: {
4350
- type: "string",
4351
- description: "ID of this element"
4352
- }
4353
- },
4354
- required: ["reason", "text", "id"],
4355
- additionalProperties: false
4356
- }
4357
- },
4358
- required: ["thought", "type", "param", "quickAnswer"],
4359
- additionalProperties: false
4360
- },
4361
- description: "List of actions to be performed"
4362
- },
4363
- error: {
4364
- type: ["string", "null"],
4365
- description: "Overall error messages. If there is any error occurs during the task planning, conclude the errors again and put error messages here"
4366
- }
4367
- },
4368
- required: ["queryLanguage", "actions", "error"],
4369
- additionalProperties: false
4370
- }
4371
- }
4372
- };
4373
-
4374
4225
  // src/ai-model/coze/index.ts
4375
4226
  import assert from "assert";
4376
4227
  import fetch2 from "node-fetch";
@@ -4569,7 +4420,8 @@ Input Example:
4569
4420
  },
4570
4421
  "elementInfos": [
4571
4422
  {
4572
- "id": "3", // ID of the element
4423
+ "id": "we23xsfwe", // ID of the element
4424
+ "indexId": "0", // Index of the element,The image is labeled to the left of the element
4573
4425
  "attributes": { // Attributes of the element
4574
4426
  "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
4575
4427
  "src": "https://ap-southeast-3.m",
@@ -4584,7 +4436,8 @@ Input Example:
4584
4436
  }
4585
4437
  },
4586
4438
  {
4587
- "id": "4", // ID of the element
4439
+ "id": "wefew2222few2", // ID of the element
4440
+ "indexId": "1", // Index of the element,The image is labeled to the left of the element
4588
4441
  "attributes": { // Attributes of the element
4589
4442
  "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
4590
4443
  "src": "data:image/png;base64,iVBORw0KGgoAAAANSU...",
@@ -4600,7 +4453,8 @@ Input Example:
4600
4453
  },
4601
4454
  ...
4602
4455
  {
4603
- "id": "27",
4456
+ "id": "kwekfj2323",
4457
+ "indexId": "2", // Index of the element,The image is labeled to the left of the element
4604
4458
  "attributes": {
4605
4459
  "nodeType": "TEXT Node",
4606
4460
  "class": ".product-name"
@@ -4632,7 +4486,7 @@ Output Example:
4632
4486
  "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
4633
4487
  "text": "",
4634
4488
  // ID of this element, replace with actual value in practice
4635
- "id": "4"
4489
+ "id": "wefew2222few2"
4636
4490
  }
4637
4491
  ],
4638
4492
  "errors": []
@@ -4689,6 +4543,155 @@ var findElementSchema = {
4689
4543
  }
4690
4544
  };
4691
4545
 
4546
+ // src/ai-model/prompt/planning.ts
4547
+ function systemPromptToTaskPlanning() {
4548
+ return `
4549
+ ## Role:
4550
+
4551
+ You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
4552
+
4553
+ ## Objective 1 (main objective): Decompose the task user asked into a series of actions:
4554
+
4555
+ - Based on the page context information (screenshot and description) you get, decompose the task user asked into a series of actions.
4556
+ - Actions are executed in the order listed in the list. After executing the actions, the task should be completed.
4557
+
4558
+ Each action has a type and corresponding param. To be detailed:
4559
+ * type: 'Locate', it means to locate one element
4560
+ * param: { prompt: string }, the prompt describes 'which element to focus on page'. Our AI engine will use this prompt to locate the element, so it should clearly describe the obvious features of the element, such as its content, color, size, shape, and position. For example, 'The biggest Download Button on the left side of the page.'
4561
+ * type: 'Tap', tap the previous element found
4562
+ * param: null
4563
+ * type: 'Hover', hover the previous element found
4564
+ * param: null
4565
+ * type: 'Input', replace the value in the input field
4566
+ * param: { value: string }, The input value must not be an empty string. Provide a meaningful final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value. After locating the input field, do not use 'Tap' action, proceed directly to 'Input' action.
4567
+ * type: 'KeyboardPress', press a key
4568
+ * param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
4569
+ * type: 'Scroll'
4570
+ * param: { scrollType: 'scrollDownOneScreen', 'scrollUpOneScreen', 'scrollUntilBottom', 'scrollUntilTop' }
4571
+ * type: 'Error'
4572
+ * param: { message: string }, the error message
4573
+ * type: 'Sleep'
4574
+ * param: { timeMs: number }, wait for timeMs milliseconds
4575
+
4576
+ Here is an example of how to decompose a task.
4577
+ When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
4578
+ * Locate: 'The search bar'
4579
+ * Input: 'Weather in Shanghai'
4580
+ * Sleep: 1000
4581
+ * KeyboardPress: 'Enter'
4582
+
4583
+ Remember:
4584
+ 1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
4585
+ 2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
4586
+
4587
+ If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
4588
+
4589
+ ## Objective 2 (sub objective): Give a quick answer to the action with type "Locate" you just planned
4590
+
4591
+ Review the action you just planned. If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
4592
+ {
4593
+ "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
4594
+ "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4595
+ "id": "wefew2222few2" // id of this element, replace with actual value in practice
4596
+ }
4597
+
4598
+ If the action type is not 'Locate', or there is no element meets the description in the prompt (usually because it will show up after some interaction), the answer should be null.
4599
+
4600
+ ## Output JSON Format:
4601
+
4602
+ Please return the result in JSON format as follows:
4603
+ {
4604
+ queryLanguage: '', // language of the description of the task
4605
+ actions: [ // always return in Array
4606
+ {
4607
+ "thought": "find out the search bar",
4608
+ "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
4609
+ "param": {
4610
+ "prompt": "The search bar"
4611
+ },
4612
+ "quickAnswer": { // since the first action is Locate, so we need to give a quick answer
4613
+ "reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar",
4614
+ "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4615
+ "id": "wefew2222few2" // ID of this element, replace with actual value in practice
4616
+ } | null,
4617
+ },
4618
+ {
4619
+ "thought": "Reasons for generating this task, and why this task is feasible on this page",
4620
+ "type": "Tap", // Type of action, like 'Tap' 'Hover' ...
4621
+ "param": any, // Parameter towards the task type
4622
+ },
4623
+ // ... more actions
4624
+ ],
4625
+ error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
4626
+ }
4627
+ `;
4628
+ }
4629
+ var planSchema = {
4630
+ type: "json_schema",
4631
+ json_schema: {
4632
+ name: "action_items",
4633
+ strict: true,
4634
+ schema: {
4635
+ type: "object",
4636
+ properties: {
4637
+ queryLanguage: {
4638
+ type: "string",
4639
+ description: "Language of the description of the task"
4640
+ },
4641
+ actions: {
4642
+ type: "array",
4643
+ items: {
4644
+ type: "object",
4645
+ properties: {
4646
+ thought: {
4647
+ type: "string",
4648
+ description: "Reasons for generating this task, and why this task is feasible on this page"
4649
+ },
4650
+ type: {
4651
+ type: "string",
4652
+ description: 'Type of action, like "Tap", "Hover", etc.'
4653
+ },
4654
+ param: {
4655
+ type: ["object", "null"],
4656
+ description: "Parameter towards the task type, can be null"
4657
+ },
4658
+ quickAnswer: {
4659
+ type: ["object", "null"],
4660
+ nullable: true,
4661
+ properties: {
4662
+ reason: {
4663
+ type: "string",
4664
+ description: "Reason for finding element 4"
4665
+ },
4666
+ text: {
4667
+ type: "string",
4668
+ description: "Text of elementInfo, if none, leave empty"
4669
+ },
4670
+ id: {
4671
+ type: "string",
4672
+ description: "ID of this element"
4673
+ }
4674
+ },
4675
+ required: ["reason", "text", "id"],
4676
+ additionalProperties: false
4677
+ }
4678
+ },
4679
+ required: ["thought", "type", "param", "quickAnswer"],
4680
+ additionalProperties: false
4681
+ },
4682
+ description: "List of actions to be performed"
4683
+ },
4684
+ error: {
4685
+ type: ["string", "null"],
4686
+ description: "Overall error messages. If there is any error occurs during the task planning, conclude the errors again and put error messages here"
4687
+ }
4688
+ },
4689
+ required: ["queryLanguage", "actions", "error"],
4690
+ additionalProperties: false
4691
+ }
4692
+ }
4693
+ };
4694
+
4692
4695
  // src/ai-model/prompt/util.ts
4693
4696
  import assert2 from "assert";
4694
4697
 
@@ -4707,18 +4710,10 @@ import {
4707
4710
  var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
4708
4711
  var contextFormatIntro = `
4709
4712
  The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.`;
4710
- var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
4711
- var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
4712
- var skillExtractData = `skill name: extract_data_from_UI
4713
- related input: DATA_DEMAND
4714
- skill content:
4715
- * User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
4716
- * There may be some special commands in DATA_DEMAND, please pay extra attention
4717
- - ${ONE_ELEMENT_LOCATOR_PREFIX} and ${ELEMENTS_LOCATOR_PREFIX}: if you see a description that mentions the keyword ${ONE_ELEMENT_LOCATOR_PREFIX} or ${ELEMENTS_LOCATOR_PREFIX}(e.g. follow ${ONE_ELEMENT_LOCATOR_PREFIX} : i want to find ...), it means user wants to locate a specific element meets the description. Return in this way: prefix + the id / comma-separated ids, for example: ${ONE_ELEMENT_LOCATOR_PREFIX}/1 , ${ELEMENTS_LOCATOR_PREFIX}/1,2,3 . If not found, keep the prefix and leave the suffix empty, like ${ONE_ELEMENT_LOCATOR_PREFIX}/ .`;
4718
4713
  function systemPromptToExtract() {
4719
4714
  return `
4720
4715
  You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
4721
- The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.
4716
+ The user will give you a screenshot and the contents of it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.
4722
4717
 
4723
4718
  You have the following skills:
4724
4719
 
@@ -4807,14 +4802,15 @@ async function describeUserPage(context) {
4807
4802
  const elementInfosDescription = cropFieldInformation(elementsInfo);
4808
4803
  return {
4809
4804
  description: `
4810
- {
4811
- // The size of the page
4812
- "pageSize": ${describeSize({ width, height })},
4805
+ {
4806
+ // The size of the page
4807
+ "pageSize": ${describeSize({ width, height })},
4813
4808
 
4809
+ // json description of the element
4810
+ "content": ${JSON.stringify(elementInfosDescription)}
4814
4811
 
4815
- // json description of the element
4816
- "elementInfos": ${JSON.stringify(elementInfosDescription)}
4817
- }`,
4812
+ }`,
4813
+ // // json description of the element
4818
4814
  elementById(id) {
4819
4815
  assert2(typeof id !== "undefined", "id is required for query");
4820
4816
  const item = idElementMap[`${id}`];
@@ -4837,6 +4833,7 @@ function cropFieldInformation(elementsInfo) {
4837
4833
  );
4838
4834
  return {
4839
4835
  id,
4836
+ markerId: item.indexId,
4840
4837
  attributes: tailorAttributes,
4841
4838
  rect,
4842
4839
  content: tailorContent
@@ -4880,7 +4877,7 @@ async function createOpenAI() {
4880
4877
  }
4881
4878
  if (process.env[MIDSCENE_LANGSMITH_DEBUG]) {
4882
4879
  console.log("DEBUGGING MODE: langsmith wrapper enabled");
4883
- const openai2 = wrapOpenAI(new OpenAI());
4880
+ const openai2 = wrapOpenAI(new OpenAI(extraConfig));
4884
4881
  return openai2;
4885
4882
  }
4886
4883
  return openai;
@@ -4893,7 +4890,7 @@ async function call(messages, responseFormat) {
4893
4890
  model,
4894
4891
  messages,
4895
4892
  response_format: responseFormat,
4896
- temperature: 0.2,
4893
+ temperature: 0.1,
4897
4894
  stream: false
4898
4895
  });
4899
4896
  shouldPrintTiming && console.timeEnd("Midscene - AI call");
@@ -4949,7 +4946,7 @@ function extractJSONFromCodeBlock(response) {
4949
4946
  import assert4 from "assert";
4950
4947
  async function AiInspectElement(options) {
4951
4948
  var _a;
4952
- const { context, multi, findElementDescription, callAI, useModel } = options;
4949
+ const { context, multi, targetElementDescription, callAI, useModel } = options;
4953
4950
  const { screenshotBase64 } = context;
4954
4951
  const { description, elementById } = await describeUserPage(context);
4955
4952
  if (((_a = options.quickAnswer) == null ? void 0 : _a.id) && elementById(options.quickAnswer.id)) {
@@ -4979,10 +4976,10 @@ async function AiInspectElement(options) {
4979
4976
 
4980
4977
  ${description}
4981
4978
 
4982
- Here is the description of the findElement. Just go ahead:
4979
+ Here is the item user want to find. Just go ahead:
4983
4980
  =====================================
4984
4981
  ${JSON.stringify({
4985
- description: findElementDescription,
4982
+ description: targetElementDescription,
4986
4983
  multi: multiDescription(multi)
4987
4984
  })}
4988
4985
  =====================================