page-agent 0.0.0 → 0.0.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,21 +1,25 @@
1
- # PageAgent 🤖
1
+ # PageAgent 🤖🪄
2
2
 
3
- [![npm version](https://badge.fury.io/js/page-agent.svg)](https://badge.fury.io/js/page-agent) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![TypeScript](https://img.shields.io/badge/%3C%2F%3E-TypeScript-%230074c1.svg)](http://www.typescriptlang.org/) [![Downloads](https://img.shields.io/npm/dt/page-agent.svg)](https://www.npmjs.com/package/page-agent) [![Bundle Size](https://img.shields.io/bundlephobia/minzip/page-agent)](https://bundlephobia.com/package/page-agent) [![GitHub stars](https://img.shields.io/github/stars/gaomeng1900/page-agent.svg)](https://github.com/gaomeng1900/page-agent)
3
+ > Unfinished Project. See [**Roadmap**](./ROADMAP.md)
4
4
 
5
- **Transform any webpage into an AI-powered application with a single script tag.**
5
+ ![banner](https://img.alicdn.com/imgextra/i1/O1CN01RY0Wvh26ATVeDIX7v_!!6000000007621-0-tps-1672-512.jpg)
6
6
 
7
- PageAgent is an intelligent UI agent for web automation and DOM interaction. Built on browser-use architecture, it enables natural language control of web interfaces through LLM integration.
7
+ [![npm version](https://badge.fury.io/js/page-agent.svg)](https://badge.fury.io/js/page-agent) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![TypeScript](https://img.shields.io/badge/%3C%2F%3E-TypeScript-%230074c1.svg)](http://www.typescriptlang.org/) [![Downloads](https://img.shields.io/npm/dt/page-agent.svg)](https://www.npmjs.com/package/page-agent) [![Bundle Size](https://img.shields.io/bundlephobia/minzip/page-agent)](https://bundlephobia.com/package/page-agent) [![GitHub stars](https://img.shields.io/github/stars/alibaba/page-agent.svg)](https://github.com/alibaba/page-agent)
8
+
9
+ **Transform your webpage into an AI-powered application with a single script tag.**
10
+
11
+ An in-page UI agent in javascript. Control web interfaces with natural language.
8
12
 
9
13
  🌐 **English** | [中文](./README-zh.md)
10
14
 
11
- 👉 [📖 **Documentation**](#) | [🚀 **Try Demo**](#)
15
+ 👉 [🚀 **Demo**](https://alibaba.github.io/page-agent/) | [📖 **Documentation**](https://alibaba.github.io/page-agent/#/docs/introduction/overview)
12
16
 
13
17
  ---
14
18
 
15
19
  ## ✨ Features
16
20
 
17
- - **🎯 Easy Integration** - Add to any webpage via CDN or npm
18
- - **🔐 Client-Side Processing** - No data leaves the browser
21
+ - **🎯 Easy Integration**
22
+ - **🔐 Client-Side Processing**
19
23
  - **🧠 DOM Extraction**
20
24
  - **💬 Natural Language Interface**
21
25
  - **🎨 UI with Human in the loop**
@@ -45,7 +49,7 @@ npm install page-agent
45
49
  import { PageAgent } from 'page-agent'
46
50
 
47
51
  const agent = new PageAgent({
48
- modelName: 'gpt-4.1-mini'
52
+ modelName: 'gpt-4.1-mini',
49
53
  baseURL: 'xxxx',
50
54
  apiKey: 'xxxx'
51
55
  })
@@ -95,7 +99,23 @@ PageAgent is designed for **client-side web enhancement**, not server-side autom
95
99
 
96
100
  MIT License - see the [LICENSE](LICENSE) file for details.
97
101
 
98
- DOM processing components and prompt are derived from [browser-use](https://github.com/browser-use/browser-use) (MIT License). See [NOTICE](NOTICE) for full attribution.
102
+
103
+ ```
104
+ DOM processing components and prompt are derived from browser-use:
105
+
106
+ Browser Use
107
+ Copyright (c) 2024 Gregor Zunic
108
+ Licensed under the MIT License
109
+
110
+ Original browser-use project: <https://github.com/browser-use/browser-use>
111
+
112
+ We gratefully acknowledge the browser-use project and its contributors for their
113
+ excellent work on web automation and DOM interaction patterns that helped make
114
+ this project possible.
115
+
116
+ Third-party dependencies and their licenses can be found in the package.json
117
+ file and in the node_modules directory after installation.
118
+ ```
99
119
 
100
120
  ---
101
121
 
@@ -1,6 +1,5 @@
1
- import { LanguageModelUsage } from 'ai';
2
1
  import { Motion } from 'ai-motion';
3
- import { Tool } from 'ai';
2
+ import { z } from 'zod';
4
3
 
5
4
  export declare interface AgentBrain {
6
5
  evaluation_previous_goal: string;
@@ -13,11 +12,21 @@ export declare interface AgentHistory {
13
12
  action: {
14
13
  name: string;
15
14
  input: any;
16
- output: any;
15
+ output: string;
16
+ };
17
+ usage: {
18
+ promptTokens: number;
19
+ completionTokens: number;
20
+ totalTokens: number;
21
+ cachedTokens?: number;
22
+ reasoningTokens?: number;
17
23
  };
18
- usage: LanguageModelUsage;
19
24
  }
20
25
 
26
+ declare type DeepStringify<T> = {
27
+ [K in keyof T]: T[K] extends string ? string : T[K] extends object ? DeepStringify<T[K]> : T[K];
28
+ };
29
+
21
30
  declare interface DomConfig {
22
31
  interactiveBlacklist?: (Element | (() => Element))[];
23
32
  interactiveWhitelist?: (Element | (() => Element))[];
@@ -43,6 +52,55 @@ declare interface ElementDomNode {
43
52
  [key: string]: unknown;
44
53
  }
45
54
 
55
+ declare const enUS: {
56
+ readonly ui: {
57
+ readonly panel: {
58
+ readonly ready: "Ready";
59
+ readonly thinking: "Thinking...";
60
+ readonly paused: "Paused";
61
+ readonly taskInput: "Enter new task, describe steps in detail, press Enter to submit";
62
+ readonly userAnswerPrompt: "Please answer the question above, press Enter to submit";
63
+ readonly taskTerminated: "Task terminated";
64
+ readonly taskCompleted: "Task completed";
65
+ readonly continueExecution: "Continue execution";
66
+ readonly userAnswer: "User answer: {{input}}";
67
+ readonly question: "Question: {{question}}";
68
+ readonly waitingPlaceholder: "Waiting for task to start...";
69
+ readonly pause: "Pause";
70
+ readonly continue: "Continue";
71
+ readonly stop: "Stop";
72
+ readonly expand: "Expand history";
73
+ readonly collapse: "Collapse history";
74
+ readonly step: "Step {{number}} · {{time}}{{duration}}";
75
+ };
76
+ readonly tools: {
77
+ readonly clicking: "Clicking element [{{index}}]...";
78
+ readonly inputting: "Inputting text to element [{{index}}]...";
79
+ readonly selecting: "Selecting option \"{{text}}\"...";
80
+ readonly scrolling: "Scrolling page...";
81
+ readonly waiting: "Waiting {{seconds}} seconds...";
82
+ readonly done: "Task done";
83
+ readonly clicked: "🖱️ Clicked element [{{index}}]";
84
+ readonly inputted: "⌨️ Inputted text \"{{text}}\"";
85
+ readonly selected: "☑️ Selected option \"{{text}}\"";
86
+ readonly scrolled: "🛞 Page scrolled";
87
+ readonly waited: "⌛️ Wait completed";
88
+ readonly executing: "Executing {{toolName}}...";
89
+ readonly resultSuccess: "success";
90
+ readonly resultFailure: "failed";
91
+ readonly resultError: "error";
92
+ };
93
+ readonly errors: {
94
+ readonly elementNotFound: "No interactive element found at index {{index}}";
95
+ readonly taskRequired: "Task description is required";
96
+ readonly executionFailed: "Task execution failed";
97
+ readonly notInputElement: "Element is not an input or textarea";
98
+ readonly notSelectElement: "Element is not a select element";
99
+ readonly optionNotFound: "Option \"{{text}}\" not found";
100
+ };
101
+ };
102
+ };
103
+
46
104
  /**
47
105
  * Type-safe event bus
48
106
  * @note Mainly used to decouple logic and UI
@@ -111,15 +169,129 @@ declare interface InteractiveElementDomNode {
111
169
  declare interface LLMConfig {
112
170
  baseURL?: string;
113
171
  apiKey?: string;
114
- modelName?: string;
172
+ model?: string;
173
+ temperature?: number;
174
+ maxTokens?: number;
115
175
  maxRetries?: number;
116
176
  }
117
177
 
118
178
  declare const locales: {
119
- readonly 'zh-CN': TranslationSchema;
120
- readonly 'en-US': TranslationSchema;
179
+ readonly 'en-US': {
180
+ readonly ui: {
181
+ readonly panel: {
182
+ readonly ready: "Ready";
183
+ readonly thinking: "Thinking...";
184
+ readonly paused: "Paused";
185
+ readonly taskInput: "Enter new task, describe steps in detail, press Enter to submit";
186
+ readonly userAnswerPrompt: "Please answer the question above, press Enter to submit";
187
+ readonly taskTerminated: "Task terminated";
188
+ readonly taskCompleted: "Task completed";
189
+ readonly continueExecution: "Continue execution";
190
+ readonly userAnswer: "User answer: {{input}}";
191
+ readonly question: "Question: {{question}}";
192
+ readonly waitingPlaceholder: "Waiting for task to start...";
193
+ readonly pause: "Pause";
194
+ readonly continue: "Continue";
195
+ readonly stop: "Stop";
196
+ readonly expand: "Expand history";
197
+ readonly collapse: "Collapse history";
198
+ readonly step: "Step {{number}} · {{time}}{{duration}}";
199
+ };
200
+ readonly tools: {
201
+ readonly clicking: "Clicking element [{{index}}]...";
202
+ readonly inputting: "Inputting text to element [{{index}}]...";
203
+ readonly selecting: "Selecting option \"{{text}}\"...";
204
+ readonly scrolling: "Scrolling page...";
205
+ readonly waiting: "Waiting {{seconds}} seconds...";
206
+ readonly done: "Task done";
207
+ readonly clicked: "🖱️ Clicked element [{{index}}]";
208
+ readonly inputted: "⌨️ Inputted text \"{{text}}\"";
209
+ readonly selected: "☑️ Selected option \"{{text}}\"";
210
+ readonly scrolled: "🛞 Page scrolled";
211
+ readonly waited: "⌛️ Wait completed";
212
+ readonly executing: "Executing {{toolName}}...";
213
+ readonly resultSuccess: "success";
214
+ readonly resultFailure: "failed";
215
+ readonly resultError: "error";
216
+ };
217
+ readonly errors: {
218
+ readonly elementNotFound: "No interactive element found at index {{index}}";
219
+ readonly taskRequired: "Task description is required";
220
+ readonly executionFailed: "Task execution failed";
221
+ readonly notInputElement: "Element is not an input or textarea";
222
+ readonly notSelectElement: "Element is not a select element";
223
+ readonly optionNotFound: "Option \"{{text}}\" not found";
224
+ };
225
+ };
226
+ };
227
+ readonly 'zh-CN': {
228
+ readonly ui: {
229
+ readonly panel: {
230
+ readonly ready: "准备就绪";
231
+ readonly thinking: "正在思考...";
232
+ readonly paused: "暂停中,稍后";
233
+ readonly taskInput: "输入新任务,详细描述步骤,回车提交";
234
+ readonly userAnswerPrompt: "请回答上面问题,回车提交";
235
+ readonly taskTerminated: "任务已终止";
236
+ readonly taskCompleted: "任务结束";
237
+ readonly continueExecution: "继续执行";
238
+ readonly userAnswer: "用户回答: {{input}}";
239
+ readonly question: "询问: {{question}}";
240
+ readonly waitingPlaceholder: "等待任务开始...";
241
+ readonly pause: "暂停";
242
+ readonly continue: "继续";
243
+ readonly stop: "终止";
244
+ readonly expand: "展开历史";
245
+ readonly collapse: "收起历史";
246
+ readonly step: "步骤 {{number}} · {{time}}{{duration}}";
247
+ };
248
+ readonly tools: {
249
+ readonly clicking: "正在点击元素 [{{index}}]...";
250
+ readonly inputting: "正在输入文本到元素 [{{index}}]...";
251
+ readonly selecting: "正在选择选项 \"{{text}}\"...";
252
+ readonly scrolling: "正在滚动页面...";
253
+ readonly waiting: "等待 {{seconds}} 秒...";
254
+ readonly done: "结束任务";
255
+ readonly clicked: "🖱️ 已点击元素 [{{index}}]";
256
+ readonly inputted: "⌨️ 已输入文本 \"{{text}}\"";
257
+ readonly selected: "☑️ 已选择选项 \"{{text}}\"";
258
+ readonly scrolled: "🛞 页面滚动完成";
259
+ readonly waited: "⌛️ 等待完成";
260
+ readonly executing: "正在执行 {{toolName}}...";
261
+ readonly resultSuccess: "成功";
262
+ readonly resultFailure: "失败";
263
+ readonly resultError: "错误";
264
+ };
265
+ readonly errors: {
266
+ readonly elementNotFound: "未找到索引为 {{index}} 的交互元素";
267
+ readonly taskRequired: "任务描述不能为空";
268
+ readonly executionFailed: "任务执行失败";
269
+ readonly notInputElement: "元素不是输入框或文本域";
270
+ readonly notSelectElement: "元素不是选择框";
271
+ readonly optionNotFound: "未找到选项 \"{{text}}\"";
272
+ };
273
+ };
274
+ };
121
275
  };
122
276
 
277
+ /**
278
+ * MacroTool input structure
279
+ */
280
+ export declare interface MacroToolInput {
281
+ evaluation_previous_goal?: string;
282
+ memory?: string;
283
+ next_goal?: string;
284
+ action: Record<string, any>;
285
+ }
286
+
287
+ /**
288
+ * MacroTool output structure
289
+ */
290
+ export declare interface MacroToolResult {
291
+ input: MacroToolInput;
292
+ output: string;
293
+ }
294
+
123
295
  declare type NestedKeyOf<ObjectType extends object> = {
124
296
  [Key in keyof ObjectType & (string | number)]: ObjectType[Key] extends object ? `${Key}` | `${Key}.${NestedKeyOf<ObjectType[Key]>}` : `${Key}`;
125
297
  }[keyof ObjectType & (string | number)];
@@ -130,6 +302,7 @@ export declare class PageAgent extends EventTarget {
130
302
  id: string;
131
303
  bus: EventBus;
132
304
  i18n: I18n;
305
+ panel: Panel;
133
306
  paused: boolean;
134
307
  disposed: boolean;
135
308
  task: string;
@@ -147,11 +320,9 @@ export declare class PageAgent extends EventTarget {
147
320
  /** last time the tree was updated */
148
321
  lastTimeUpdate: number;
149
322
  /** Corresponds to actions in browser-use */
150
- tools: Map<string, Tool>;
323
+ tools: Map<string, PageAgentTool<any>>;
151
324
  /** Fullscreen mask */
152
325
  mask: SimulatorMask;
153
- /** Interactive panel */
154
- panel: Panel;
155
326
  /** History records */
156
327
  history: AgentHistory[];
157
328
  constructor(config?: PageAgentConfig);
@@ -189,6 +360,15 @@ declare interface PageAgentEventMap {
189
360
  };
190
361
  }
191
362
 
363
+ /**
364
+ * Internal tool definition that has access to PageAgent `this` context
365
+ */
366
+ declare interface PageAgentTool<TParams = any> {
367
+ description: string;
368
+ inputSchema: z.ZodType<TParams>;
369
+ execute: (this: PageAgent, args: TParams) => Promise<string>;
370
+ }
371
+
192
372
  /**
193
373
  * Agent control panel
194
374
  */
@@ -246,49 +426,7 @@ declare type TranslationKey = NestedKeyOf<TranslationSchema>;
246
426
 
247
427
  declare type TranslationParams = Record<string, string | number>;
248
428
 
249
- declare interface TranslationSchema {
250
- ui: {
251
- panel: {
252
- ready: string;
253
- thinking: string;
254
- paused: string;
255
- taskInput: string;
256
- userAnswerPrompt: string;
257
- taskTerminated: string;
258
- taskCompleted: string;
259
- continueExecution: string;
260
- userAnswer: string;
261
- pause: string;
262
- continue: string;
263
- stop: string;
264
- expand: string;
265
- collapse: string;
266
- step: string;
267
- };
268
- tools: {
269
- clicking: string;
270
- inputting: string;
271
- selecting: string;
272
- scrolling: string;
273
- waiting: string;
274
- done: string;
275
- clicked: string;
276
- inputted: string;
277
- selected: string;
278
- scrolled: string;
279
- waited: string;
280
- executing: string;
281
- };
282
- errors: {
283
- elementNotFound: string;
284
- taskRequired: string;
285
- executionFailed: string;
286
- notInputElement: string;
287
- notSelectElement: string;
288
- optionNotFound: string;
289
- };
290
- };
291
- }
429
+ declare type TranslationSchema = DeepStringify<typeof enUS>;
292
430
 
293
431
  declare interface UIConfig {
294
432
  language?: SupportedLanguage;