@midscene/core 1.0.1-beta-20251024063839.0 → 1.0.1-beta-20251027033034.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/dist/es/agent/agent.mjs +2 -3
  2. package/dist/es/agent/agent.mjs.map +1 -1
  3. package/dist/es/agent/index.mjs +2 -2
  4. package/dist/es/agent/task-builder.mjs +11 -7
  5. package/dist/es/agent/task-builder.mjs.map +1 -1
  6. package/dist/es/agent/tasks.mjs +8 -1
  7. package/dist/es/agent/tasks.mjs.map +1 -1
  8. package/dist/es/agent/ui-utils.mjs +14 -11
  9. package/dist/es/agent/ui-utils.mjs.map +1 -1
  10. package/dist/es/agent/utils.mjs +6 -50
  11. package/dist/es/agent/utils.mjs.map +1 -1
  12. package/dist/es/ai-model/common.mjs.map +1 -1
  13. package/dist/es/ai-model/index.mjs +2 -2
  14. package/dist/es/ai-model/inspect.mjs +12 -31
  15. package/dist/es/ai-model/inspect.mjs.map +1 -1
  16. package/dist/es/ai-model/prompt/util.mjs +3 -88
  17. package/dist/es/ai-model/prompt/util.mjs.map +1 -1
  18. package/dist/es/ai-model/service-caller/index.mjs +23 -31
  19. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  20. package/dist/es/insight/index.mjs +6 -15
  21. package/dist/es/insight/index.mjs.map +1 -1
  22. package/dist/es/tree.mjs +13 -2
  23. package/dist/es/tree.mjs.map +1 -0
  24. package/dist/es/types.mjs.map +1 -1
  25. package/dist/es/utils.mjs +2 -2
  26. package/dist/lib/agent/agent.js +1 -2
  27. package/dist/lib/agent/agent.js.map +1 -1
  28. package/dist/lib/agent/index.js +0 -3
  29. package/dist/lib/agent/task-builder.js +11 -7
  30. package/dist/lib/agent/task-builder.js.map +1 -1
  31. package/dist/lib/agent/tasks.js +8 -1
  32. package/dist/lib/agent/tasks.js.map +1 -1
  33. package/dist/lib/agent/ui-utils.js +14 -11
  34. package/dist/lib/agent/ui-utils.js.map +1 -1
  35. package/dist/lib/agent/utils.js +5 -52
  36. package/dist/lib/agent/utils.js.map +1 -1
  37. package/dist/lib/ai-model/common.js.map +1 -1
  38. package/dist/lib/ai-model/index.js +11 -14
  39. package/dist/lib/ai-model/inspect.js +11 -30
  40. package/dist/lib/ai-model/inspect.js.map +1 -1
  41. package/dist/lib/ai-model/prompt/util.js +5 -93
  42. package/dist/lib/ai-model/prompt/util.js.map +1 -1
  43. package/dist/lib/ai-model/service-caller/index.js +23 -31
  44. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  45. package/dist/lib/insight/index.js +6 -15
  46. package/dist/lib/insight/index.js.map +1 -1
  47. package/dist/lib/tree.js +10 -1
  48. package/dist/lib/tree.js.map +1 -1
  49. package/dist/lib/types.js.map +1 -1
  50. package/dist/lib/utils.js +2 -2
  51. package/dist/types/agent/index.d.ts +1 -1
  52. package/dist/types/agent/utils.d.ts +2 -33
  53. package/dist/types/ai-model/index.d.ts +1 -1
  54. package/dist/types/ai-model/inspect.d.ts +12 -10
  55. package/dist/types/ai-model/prompt/util.d.ts +2 -34
  56. package/dist/types/insight/index.d.ts +6 -6
  57. package/dist/types/tree.d.ts +4 -1
  58. package/dist/types/types.d.ts +8 -37
  59. package/dist/types/yaml.d.ts +2 -2
  60. package/package.json +3 -3
@@ -5,5 +5,5 @@ export { locateParamStr, paramStr, taskTitleStr, typeStr } from './ui-utils';
5
5
  export { type LocateCache, type PlanningCache, TaskCache } from './task-cache';
6
6
  export { cacheFileExt } from './task-cache';
7
7
  export { TaskExecutor } from './tasks';
8
- export { getCurrentExecutionFile, trimContextByViewport, } from './utils';
8
+ export { getCurrentExecutionFile } from './utils';
9
9
  export type { AgentOpt } from '../types';
@@ -1,6 +1,6 @@
1
1
  import type { TMultimodalPrompt, TUserPrompt } from '../ai-model/common';
2
2
  import type { AbstractInterface } from '../device';
3
- import type { BaseElement, ElementCacheFeature, ElementTreeNode, ExecutionDump, ExecutorContext, LocateResultElement, PlanningLocateParam, UIContext } from '../types';
3
+ import type { ElementCacheFeature, LocateResultElement, PlanningLocateParam, UIContext } from '../types';
4
4
  import type { TaskCache } from './task-cache';
5
5
  export declare function commonContextParser(interfaceInstance: AbstractInterface, _opt: {
6
6
  uploadServerUrl?: string;
@@ -13,42 +13,11 @@ export declare function printReportMsg(filepath: string): void;
13
13
  */
14
14
  export declare function getCurrentExecutionFile(trace?: string): string | false;
15
15
  export declare function generateCacheId(fileName?: string): string;
16
- export declare function matchElementFromPlan(planLocateParam: PlanningLocateParam, tree: ElementTreeNode<BaseElement>): any;
16
+ export declare function matchElementFromPlan(planLocateParam: PlanningLocateParam): LocateResultElement | undefined;
17
17
  export declare function matchElementFromCache(context: {
18
18
  taskCache?: TaskCache;
19
19
  interfaceInstance: AbstractInterface;
20
20
  }, cacheEntry: ElementCacheFeature | undefined, cachePrompt: TUserPrompt, cacheable: boolean | undefined): Promise<LocateResultElement | undefined>;
21
- export declare function trimContextByViewport(execution: ExecutionDump): {
22
- tasks: {
23
- type: any;
24
- subType?: string;
25
- subTask?: boolean;
26
- param?: any;
27
- thought?: string;
28
- locate?: PlanningLocateParam | null;
29
- uiContext?: UIContext;
30
- executor: (param: any, context: ExecutorContext) => void | Promise<void | import("../types").ExecutionTaskReturn<any, any> | undefined> | undefined;
31
- output?: any;
32
- log?: any;
33
- recorder?: import("../types").ExecutionRecorderItem[];
34
- hitBy?: import("../types").ExecutionTaskHitBy;
35
- status: "pending" | "running" | "finished" | "failed" | "cancelled";
36
- error?: Error;
37
- errorMessage?: string;
38
- errorStack?: string;
39
- timing?: {
40
- start: number;
41
- end?: number;
42
- cost?: number;
43
- };
44
- usage?: import("../types").AIUsageInfo;
45
- searchAreaUsage?: import("../types").AIUsageInfo;
46
- }[];
47
- name: string;
48
- description?: string;
49
- aiActionContext?: string;
50
- logTime: number;
51
- };
52
21
  export declare const getMidsceneVersion: () => string;
53
22
  export declare const parsePrompt: (prompt: TUserPrompt) => {
54
23
  textPrompt: string;
@@ -1,6 +1,6 @@
1
1
  export { callAIWithStringResponse, callAIWithObjectResponse, callAI, } from './service-caller/index';
2
2
  export { systemPromptToLocateElement } from './prompt/llm-locator';
3
- export { describeUserPage, elementByPositionWithElementInfo, } from './prompt/util';
3
+ export { describeUserPage } from './prompt/util';
4
4
  export { generatePlaywrightTest, generatePlaywrightTestStream, } from './prompt/playwright-generator';
5
5
  export { generateYamlTest, generateYamlTestStream, } from './prompt/yaml-generator';
6
6
  export type { ChatCompletionMessageParam } from 'openai/resources/index';
@@ -1,5 +1,6 @@
1
- import type { AIDataExtractionResponse, AIElementLocatorResponse, AIElementResponse, AIUsageInfo, BaseElement, ElementById, InsightExtractOption, Rect, ReferenceImage, UIContext } from '../types';
1
+ import type { AIDataExtractionResponse, AIElementResponse, AIUsageInfo, InsightExtractOption, Rect, ReferenceImage, UIContext } from '../types';
2
2
  import type { IModelConfig } from '@midscene/shared/env';
3
+ import type { LocateResultElement } from '@midscene/shared/types';
3
4
  import type { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam } from 'openai/resources/index';
4
5
  import type { TMultimodalPrompt, TUserPrompt } from './common';
5
6
  import { callAIWithObjectResponse } from './service-caller/index';
@@ -7,23 +8,24 @@ export type AIArgs = [
7
8
  ChatCompletionSystemMessageParam,
8
9
  ...ChatCompletionUserMessageParam[]
9
10
  ];
10
- export declare function AiLocateElement<ElementType extends BaseElement = BaseElement>(options: {
11
- context: UIContext<ElementType>;
11
+ export declare function AiLocateElement(options: {
12
+ context: UIContext;
12
13
  targetElementDescription: TUserPrompt;
13
14
  referenceImage?: ReferenceImage;
14
15
  callAIFn: typeof callAIWithObjectResponse<AIElementResponse | [number, number]>;
15
16
  searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;
16
17
  modelConfig: IModelConfig;
17
18
  }): Promise<{
18
- parseResult: AIElementLocatorResponse;
19
+ parseResult: {
20
+ elements: LocateResultElement[];
21
+ errors?: string[];
22
+ };
19
23
  rect?: Rect;
20
24
  rawResponse: string;
21
- elementById: ElementById;
22
25
  usage?: AIUsageInfo;
23
- isOrderSensitive?: boolean;
24
26
  }>;
25
27
  export declare function AiLocateSection(options: {
26
- context: UIContext<BaseElement>;
28
+ context: UIContext;
27
29
  sectionDescription: TUserPrompt;
28
30
  modelConfig: IModelConfig;
29
31
  }): Promise<{
@@ -33,14 +35,14 @@ export declare function AiLocateSection(options: {
33
35
  rawResponse: string;
34
36
  usage?: AIUsageInfo;
35
37
  }>;
36
- export declare function AiExtractElementInfo<T, ElementType extends BaseElement = BaseElement>(options: {
38
+ export declare function AiExtractElementInfo<T>(options: {
37
39
  dataQuery: string | Record<string, string>;
38
40
  multimodalPrompt?: TMultimodalPrompt;
39
- context: UIContext<ElementType>;
41
+ context: UIContext;
42
+ pageDescription?: string;
40
43
  extractOption?: InsightExtractOption;
41
44
  modelConfig: IModelConfig;
42
45
  }): Promise<{
43
46
  parseResult: AIDataExtractionResponse<T>;
44
- elementById: (idOrIndexId: string) => ElementType;
45
47
  usage: AIUsageInfo | undefined;
46
48
  }>;
@@ -1,17 +1,9 @@
1
- import type { BaseElement, ElementTreeNode, Size, UIContext } from '../../types';
2
- import type { TVlModeTypes } from '@midscene/shared/env';
1
+ import type { BaseElement, Size, UIContext } from '../../types';
3
2
  export declare function describeSize(size: Size): string;
4
3
  export declare function describeElement(elements: (Pick<BaseElement, 'rect' | 'content'> & {
5
4
  id: string;
6
5
  })[]): string;
7
6
  export declare const distanceThreshold = 16;
8
- export declare function elementByPositionWithElementInfo(treeRoot: ElementTreeNode<BaseElement>, position: {
9
- x: number;
10
- y: number;
11
- }, options?: {
12
- requireStrictDistance?: boolean;
13
- filterPositionElements?: boolean;
14
- }): BaseElement | undefined;
15
7
  export declare function distance(point1: {
16
8
  x: number;
17
9
  y: number;
@@ -20,28 +12,4 @@ export declare function distance(point1: {
20
12
  y: number;
21
13
  }): number;
22
14
  export declare const samplePageDescription = "\nAnd the page is described as follows:\n====================\nThe size of the page: 1280 x 720\nSome of the elements are marked with a rectangle in the screenshot corresponding to the markerId, some are not.\n\nDescription of all the elements in screenshot:\n<div id=\"969f1637\" markerId=\"1\" left=\"100\" top=\"100\" width=\"100\" height=\"100\"> // The markerId indicated by the rectangle label in the screenshot\n <h4 id=\"b211ecb2\" markerId=\"5\" left=\"150\" top=\"150\" width=\"90\" height=\"60\">\n The username is accepted\n </h4>\n ...many more\n</div>\n====================\n";
23
- export declare function describeUserPage<ElementType extends BaseElement = BaseElement>(context: Omit<UIContext<ElementType>, 'describer'>, opt: {
24
- truncateTextLength?: number;
25
- filterNonTextContent?: boolean;
26
- domIncluded?: boolean | 'visible-only';
27
- visibleOnly?: boolean;
28
- vlMode: TVlModeTypes | undefined;
29
- }): Promise<{
30
- description: string;
31
- elementById(idOrIndexId: string): ElementType;
32
- elementByPosition(position: {
33
- x: number;
34
- y: number;
35
- }, size: {
36
- width: number;
37
- height: number;
38
- }): BaseElement | undefined;
39
- insertElementByPosition(position: {
40
- x: number;
41
- y: number;
42
- }): ElementType;
43
- size: {
44
- width: number;
45
- height: number;
46
- };
47
- }>;
15
+ export declare function describeUserPage(context: UIContext): Promise<string>;
@@ -1,9 +1,9 @@
1
1
  import { callAIWithObjectResponse } from '../ai-model/index';
2
- import type { AIDescribeElementResponse, BaseElement, DetailedLocateParam, InsightExtractOption, InsightExtractParam, InsightExtractResult, InsightTaskInfo, LocateResultWithDump, Rect, UIContext } from '../types';
2
+ import type { AIDescribeElementResponse, DetailedLocateParam, InsightExtractOption, InsightExtractParam, InsightExtractResult, InsightTaskInfo, LocateResultWithDump, Rect, UIContext } from '../types';
3
3
  import { type IModelConfig } from '@midscene/shared/env';
4
4
  import type { TMultimodalPrompt } from '../ai-model/common';
5
5
  export interface LocateOpts {
6
- context?: UIContext<BaseElement>;
6
+ context?: UIContext;
7
7
  }
8
8
  export type AnyValue<T> = {
9
9
  [K in keyof T]: unknown extends T[K] ? any : T[K];
@@ -12,13 +12,13 @@ interface InsightOptions {
12
12
  taskInfo?: Omit<InsightTaskInfo, 'durationMs'>;
13
13
  aiVendorFn?: typeof callAIWithObjectResponse;
14
14
  }
15
- export default class Insight<ElementType extends BaseElement = BaseElement, ContextType extends UIContext<ElementType> = UIContext<ElementType>> {
16
- contextRetrieverFn: () => Promise<ContextType> | ContextType;
15
+ export default class Insight {
16
+ contextRetrieverFn: () => Promise<UIContext> | UIContext;
17
17
  aiVendorFn: Exclude<InsightOptions['aiVendorFn'], undefined>;
18
18
  taskInfo?: Omit<InsightTaskInfo, 'durationMs'>;
19
- constructor(context: ContextType | (() => Promise<ContextType> | ContextType), opt?: InsightOptions);
19
+ constructor(context: UIContext | (() => Promise<UIContext> | UIContext), opt?: InsightOptions);
20
20
  locate(query: DetailedLocateParam, opt: LocateOpts, modelConfig: IModelConfig): Promise<LocateResultWithDump>;
21
- extract<T>(dataDemand: InsightExtractParam, modelConfig: IModelConfig, opt?: InsightExtractOption, multimodalPrompt?: TMultimodalPrompt): Promise<InsightExtractResult<T>>;
21
+ extract<T>(dataDemand: InsightExtractParam, modelConfig: IModelConfig, opt?: InsightExtractOption, pageDescription?: string, multimodalPrompt?: TMultimodalPrompt): Promise<InsightExtractResult<T>>;
22
22
  describe(target: Rect | [number, number], modelConfig: IModelConfig, opt?: {
23
23
  deepThink?: boolean;
24
24
  }): Promise<Pick<AIDescribeElementResponse, 'description'>>;
@@ -1 +1,4 @@
1
- export { truncateText, trimAttributes, descriptionOfTree, } from '@midscene/shared/extractor';
1
+ import type { BaseElement, ElementTreeNode } from '@midscene/shared/types';
2
+ import { trimAttributes, truncateText } from '@midscene/shared/extractor';
3
+ export { trimAttributes, truncateText };
4
+ export declare function descriptionOfTree<ElementType extends BaseElement = BaseElement>(tree: ElementTreeNode<ElementType>, truncateTextLength?: number, filterNonTextContent?: boolean, visibleOnly?: boolean): string;
@@ -1,6 +1,6 @@
1
1
  import type { NodeType } from '@midscene/shared/constants';
2
2
  import type { CreateOpenAIClientFn, TModelConfigFn } from '@midscene/shared/env';
3
- import type { BaseElement, ElementTreeNode, Rect, Size } from '@midscene/shared/types';
3
+ import type { BaseElement, LocateResultElement, Rect, Size } from '@midscene/shared/types';
4
4
  import type { z } from 'zod';
5
5
  import type { TUserPrompt } from './ai-model/common';
6
6
  import type { DetailedLocateParam, MidsceneYamlFlowItem } from './yaml';
@@ -15,6 +15,7 @@ export type AIUsageInfo = Record<string, any> & {
15
15
  model_description: string | undefined;
16
16
  intent: string | undefined;
17
17
  };
18
+ export type { LocateResultElement };
18
19
  /**
19
20
  * openai
20
21
  *
@@ -39,23 +40,12 @@ export type AISingleElementResponseByPosition = {
39
40
  text: string;
40
41
  };
41
42
  export type AISingleElementResponse = AISingleElementResponseById;
42
- export interface AIElementLocatorResponse {
43
- elements: {
44
- id: string;
45
- reason?: string;
46
- text?: string;
47
- xpaths?: string[];
48
- }[];
49
- bbox?: [number, number, number, number];
50
- isOrderSensitive?: boolean;
51
- errors?: string[];
52
- }
53
43
  export interface AIElementCoordinatesResponse {
54
44
  bbox: [number, number, number, number];
55
45
  isOrderSensitive?: boolean;
56
46
  errors?: string[];
57
47
  }
58
- export type AIElementResponse = AIElementLocatorResponse | AIElementCoordinatesResponse;
48
+ export type AIElementResponse = AIElementCoordinatesResponse;
59
49
  export interface AIDataExtractionResponse<DataDemand> {
60
50
  data: DataDemand;
61
51
  errors?: string[];
@@ -91,9 +81,8 @@ export interface AgentDescribeElementAtPointResult {
91
81
  /**
92
82
  * context
93
83
  */
94
- export declare abstract class UIContext<ElementType extends BaseElement = BaseElement> {
84
+ export declare abstract class UIContext {
95
85
  abstract screenshotBase64: string;
96
- abstract tree: ElementTreeNode<ElementType>;
97
86
  abstract size: Size;
98
87
  abstract _isFrozen?: boolean;
99
88
  }
@@ -103,18 +92,6 @@ export type EnsureObject<T> = {
103
92
  export type InsightAction = 'locate' | 'extract' | 'assert' | 'describe';
104
93
  export type InsightExtractParam = string | Record<string, string>;
105
94
  export type ElementCacheFeature = Record<string, unknown>;
106
- export type LocateResultElement = {
107
- center: [number, number];
108
- rect: Rect;
109
- id: string;
110
- indexId?: number;
111
- xpaths: string[];
112
- attributes: {
113
- nodeType: NodeType;
114
- [key: string]: string;
115
- };
116
- isOrderSensitive?: boolean;
117
- };
118
95
  export interface LocateResult {
119
96
  element: LocateResultElement | null;
120
97
  rect?: Rect;
@@ -143,7 +120,7 @@ export interface InsightDump extends DumpMeta {
143
120
  dataDemand?: InsightExtractParam;
144
121
  assertion?: TUserPrompt;
145
122
  };
146
- matchedElement: BaseElement[];
123
+ matchedElement: LocateResultElement[];
147
124
  matchedRect?: Rect;
148
125
  deepThink?: boolean;
149
126
  data: any;
@@ -193,7 +170,6 @@ export interface AgentAssertOpt {
193
170
  *
194
171
  */
195
172
  export interface PlanningLocateParam extends DetailedLocateParam {
196
- id?: string;
197
173
  bbox?: [number, number, number, number];
198
174
  }
199
175
  export interface PlanningAction<ParamType = any> {
@@ -385,7 +361,7 @@ export interface WebElementInfo extends BaseElement {
385
361
  [key: string]: string;
386
362
  };
387
363
  }
388
- export type WebUIContext = UIContext<WebElementInfo>;
364
+ export type WebUIContext = UIContext;
389
365
  /**
390
366
  * Agent
391
367
  */
@@ -421,14 +397,9 @@ export interface AgentOpt {
421
397
  *
422
398
  * @example
423
399
  * ```typescript
424
- * createOpenAIClient: (config) => {
425
- * const openai = new OpenAI({
426
- * apiKey: config.openaiApiKey,
427
- * baseURL: config.openaiBaseURL,
428
- * });
429
- *
400
+ * createOpenAIClient: async (openai, opts) => {
430
401
  * // Wrap with langsmith for planning tasks
431
- * if (config.intent === 'planning') {
402
+ * if (opts.baseURL?.includes('planning')) {
432
403
  * return wrapOpenAI(openai, { metadata: { task: 'planning' } });
433
404
  * }
434
405
  *
@@ -1,13 +1,13 @@
1
1
  import type { TUserPrompt } from './ai-model/common';
2
2
  import type { AndroidDeviceOpt, IOSDeviceOpt } from './device';
3
3
  import type { AgentOpt, Rect } from './types';
4
- import type { BaseElement, UIContext } from './types';
4
+ import type { UIContext } from './types';
5
5
  export interface LocateOption {
6
6
  prompt?: TUserPrompt;
7
7
  deepThink?: boolean;
8
8
  cacheable?: boolean;
9
9
  xpath?: string;
10
- uiContext?: UIContext<BaseElement>;
10
+ uiContext?: UIContext;
11
11
  }
12
12
  export interface InsightExtractOption {
13
13
  domIncluded?: boolean | 'visible-only';
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@midscene/core",
3
3
  "description": "Automate browser actions, extract data, and perform assertions using AI. It offers JavaScript SDK, Chrome extension, and support for scripting in YAML. See https://midscenejs.com/ for details.",
4
- "version": "1.0.1-beta-20251024063839.0",
4
+ "version": "1.0.1-beta-20251027033034.0",
5
5
  "repository": "https://github.com/web-infra-dev/midscene",
6
6
  "homepage": "https://midscenejs.com/",
7
7
  "main": "./dist/lib/index.js",
@@ -89,8 +89,8 @@
89
89
  "zod": "3.24.3",
90
90
  "semver": "7.5.2",
91
91
  "js-yaml": "4.1.0",
92
- "@midscene/recorder": "1.0.1-beta-20251024063839.0",
93
- "@midscene/shared": "1.0.1-beta-20251024063839.0"
92
+ "@midscene/recorder": "1.0.1-beta-20251027033034.0",
93
+ "@midscene/shared": "1.0.1-beta-20251027033034.0"
94
94
  },
95
95
  "devDependencies": {
96
96
  "@rslib/core": "^0.11.2",