page-agent 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,7 +1,5 @@
1
1
  # PageAgent 🤖🪄
2
2
 
3
- > ⚠️ See [**Roadmap**](./ROADMAP.md)
4
-
5
3
  ![banner](https://img.alicdn.com/imgextra/i1/O1CN01RY0Wvh26ATVeDIX7v_!!6000000007621-0-tps-1672-512.jpg)
6
4
 
7
5
  [![npm version](https://badge.fury.io/js/page-agent.svg)](https://badge.fury.io/js/page-agent) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![TypeScript](https://img.shields.io/badge/%3C%2F%3E-TypeScript-%230074c1.svg)](http://www.typescriptlang.org/) [![Downloads](https://img.shields.io/npm/dt/page-agent.svg)](https://www.npmjs.com/package/page-agent) [![Bundle Size](https://img.shields.io/bundlephobia/minzip/page-agent)](https://bundlephobia.com/package/page-agent) [![GitHub stars](https://img.shields.io/github/stars/alibaba/page-agent.svg)](https://github.com/alibaba/page-agent)
@@ -34,7 +32,11 @@ An in-page UI agent in javascript. Control web interfaces with natural language.
34
32
 
35
33
  ```html
36
34
  <!-- temporary CDN URL. May change in the future -->
37
- <script src="https://hwcxiuzfylggtcktqgij.supabase.co/storage/v1/object/public/demo-public/v0.0.2/page-agent.js" crossorigin="true" type="text/javascript"></script>
35
+ <script
36
+ src="https://hwcxiuzfylggtcktqgij.supabase.co/storage/v1/object/public/demo-public/v0.0.2/page-agent.js"
37
+ crossorigin="true"
38
+ type="text/javascript"
39
+ ></script>
38
40
  ```
39
41
 
40
42
  ### NPM Installation
@@ -54,13 +56,13 @@ const DEMO_BASE_URL = 'https://hwcxiuzfylggtcktqgij.supabase.co/functions/v1/llm
54
56
  const DEMO_API_KEY = 'PAGE-AGENT-FREE-TESTING-RANDOM'
55
57
 
56
58
  const agent = new PageAgent({
57
- modelName: DEMO_MODEL,
58
- baseURL: DEMO_BASE_URL,
59
- apiKey: DEMO_API_KEY,
60
- language: 'en-US'
59
+ modelName: DEMO_MODEL,
60
+ baseURL: DEMO_BASE_URL,
61
+ apiKey: DEMO_API_KEY,
62
+ language: 'en-US',
61
63
  })
62
64
 
63
- await agent.execute("Click the login button")
65
+ await agent.execute('Click the login button')
64
66
  ```
65
67
 
66
68
  ## 🏗️ Structure
@@ -104,9 +106,8 @@ PageAgent is designed for **client-side web enhancement**, not server-side autom
104
106
 
105
107
  MIT License - see the [LICENSE](LICENSE) file for details.
106
108
 
107
-
108
109
  ```
109
- DOM processing components and prompt are derived from browser-use:
110
+ DOM processing components and prompt are derived from browser-use:
110
111
 
111
112
  Browser Use
112
113
  Copyright (c) 2024 Gregor Zunic
@@ -7,6 +7,61 @@ export declare interface AgentBrain {
7
7
  next_goal: string;
8
8
  }
9
9
 
10
+ declare interface AgentConfig {
11
+ language?: SupportedLanguage;
12
+ /**
13
+ * Custom tools to extend PageAgent capabilities
14
+ * @experimental
15
+ * @note You can also override or remove internal tools by using the same name.
16
+ * @see [tools](../tools/index.ts)
17
+ *
18
+ * @example
19
+ * // override internal tool
20
+ * import { tool } from 'page-agent'
21
+ * const customTools = {
22
+ * ask_user: tool({
23
+ * description:
24
+ * 'Ask the user or parent model a question and wait for their answer. Use this if you need more information or clarification.',
25
+ * inputSchema: zod.object({
26
+ * question: zod.string(),
27
+ * }),
28
+ * execute: async function (this: PageAgent, input) {
29
+ * const answer = await do_some_thing(input.question)
30
+ * return "✅ Received user answer: " + answer
31
+ * },
32
+ * })
33
+ * }
34
+ *
35
+ * @example
36
+ * // remove internal tool
37
+ * const customTools = {
38
+ * ask_user: null // never ask user questions
39
+ * }
40
+ */
41
+ customTools?: Record<string, PageAgentTool | null>;
42
+ onBeforeStep?: (this: PageAgent, stepCnt: number) => Promise<void> | void;
43
+ onAfterStep?: (this: PageAgent, stepCnt: number, history: AgentHistory[]) => Promise<void> | void;
44
+ onBeforeTask?: (this: PageAgent) => Promise<void> | void;
45
+ onAfterTask?: (this: PageAgent, result: ExecutionResult) => Promise<void> | void;
46
+ /**
47
+ * @note this hook can block the disposal process
48
+ * @note when disposal is caused by a page unload, reason will be 'PAGE_UNLOADING'. This hook CANNOT block unloading, and async operations may be cut short.
49
+ */
50
+ onDispose?: (this: PageAgent, reason?: string) => void;
51
+ /**
52
+ * TODO: @unimplemented
53
+ * hook when action causes a new page to be opened
54
+ * @note PageAgent will try to detect new pages and decide whether they were caused by an action, but this detection is not very reliable.
55
+ */
56
+ onNewPageOpen?: (this: PageAgent, url: string) => Promise<void> | void;
57
+ /**
58
+ * TODO: @unimplemented
59
+ * try to navigate to a new page instead of opening a new tab/window.
60
+ * @note will unload the current page when an action tries to open a new page, so that everything stays in the same tab/window.
61
+ */
62
+ experimentalPreventNewPage?: boolean;
63
+ }
64
+
10
65
  export declare interface AgentHistory {
11
66
  brain: AgentBrain;
12
67
  action: {
@@ -303,9 +358,11 @@ export declare class PageAgent extends EventTarget {
303
358
  bus: EventBus;
304
359
  i18n: I18n;
305
360
  panel: Panel;
361
+ tools: typeof tools;
306
362
  paused: boolean;
307
363
  disposed: boolean;
308
364
  task: string;
365
+ taskId: string;
309
366
  /** Corresponds to eval_page in browser-use */
310
367
  flatTree: FlatDomTree | null;
311
368
  /**
@@ -319,8 +376,6 @@ export declare class PageAgent extends EventTarget {
319
376
  simplifiedHTML: string;
320
377
  /** last time the tree was updated */
321
378
  lastTimeUpdate: number;
322
- /** Corresponds to actions in browser-use */
323
- tools: Map<string, PageAgentTool<any>>;
324
379
  /** Fullscreen mask */
325
380
  mask: SimulatorMask;
326
381
  /** History records */
@@ -330,10 +385,10 @@ export declare class PageAgent extends EventTarget {
330
385
  * @todo maybe return something?
331
386
  */
332
387
  execute(task: string): Promise<ExecutionResult>;
333
- dispose(): void;
388
+ dispose(reason?: string): void;
334
389
  }
335
390
 
336
- export declare type PageAgentConfig = LLMConfig & DomConfig & UIConfig;
391
+ export declare type PageAgentConfig = LLMConfig & AgentConfig & DomConfig;
337
392
 
338
393
  /**
339
394
  * Event mapping definitions
@@ -363,7 +418,7 @@ declare interface PageAgentEventMap {
363
418
  /**
364
419
  * Internal tool definition that has access to PageAgent `this` context
365
420
  */
366
- declare interface PageAgentTool<TParams = any> {
421
+ export declare interface PageAgentTool<TParams = any> {
367
422
  description: string;
368
423
  inputSchema: z.ZodType<TParams>;
369
424
  execute: (this: PageAgent, args: TParams) => Promise<string>;
@@ -422,14 +477,18 @@ declare interface TextDomNode {
422
477
  [key: string]: unknown;
423
478
  }
424
479
 
480
+ export declare function tool<TParams>(options: PageAgentTool<TParams>): PageAgentTool<TParams>;
481
+
482
+ /**
483
+ * Internal tools for PageAgent.
484
+ * Note: Using any to allow different parameter types for each tool
485
+ */
486
+ declare const tools: Map<string, PageAgentTool<any>>;
487
+
425
488
  declare type TranslationKey = NestedKeyOf<TranslationSchema>;
426
489
 
427
490
  declare type TranslationParams = Record<string, string | number>;
428
491
 
429
492
  declare type TranslationSchema = DeepStringify<typeof enUS>;
430
493
 
431
- declare interface UIConfig {
432
- language?: SupportedLanguage;
433
- }
434
-
435
494
  export { }
@@ -27,9 +27,9 @@ import chalk from "chalk";
27
27
  import zod, { z } from "zod";
28
28
  import { Motion } from "ai-motion";
29
29
  const VIEWPORT_EXPANSION = -1;
30
- const DEFAULT_MODEL_NAME = "gpt-41-mini-0414-global";
31
- const DEFAULT_API_KEY = "not-needed";
32
- const DEFAULT_BASE_URL = "http://localhost:3000/api/agent";
30
+ const DEFAULT_MODEL_NAME = "PAGE-AGENT-FREE-TESTING-RANDOM";
31
+ const DEFAULT_API_KEY = "PAGE-AGENT-FREE-TESTING-RANDOM";
32
+ const DEFAULT_BASE_URL = "https://hwcxiuzfylggtcktqgij.supabase.co/functions/v1/llm-testing-proxy";
33
33
  const LLM_MAX_RETRIES = 2;
34
34
  const MAX_STEPS = 20;
35
35
  const DEFAULT_TEMPERATURE = 0.7;
@@ -3333,9 +3333,11 @@ const _PageAgent = class _PageAgent extends EventTarget {
3333
3333
  __publicField(this, "bus", getEventBus(this.id));
3334
3334
  __publicField(this, "i18n");
3335
3335
  __publicField(this, "panel");
3336
+ __publicField(this, "tools");
3336
3337
  __publicField(this, "paused", false);
3337
3338
  __publicField(this, "disposed", false);
3338
3339
  __publicField(this, "task", "");
3340
+ __publicField(this, "taskId", "");
3339
3341
  __privateAdd(this, _llm);
3340
3342
  __privateAdd(this, _totalWaitTime, 0);
3341
3343
  __privateAdd(this, _abortController, new AbortController());
@@ -3352,8 +3354,6 @@ const _PageAgent = class _PageAgent extends EventTarget {
3352
3354
  __publicField(this, "simplifiedHTML", "<EMPTY>");
3353
3355
  /** last time the tree was updated */
3354
3356
  __publicField(this, "lastTimeUpdate", 0);
3355
- /** Corresponds to actions in browser-use */
3356
- __publicField(this, "tools", new Map(tools));
3357
3357
  /** Fullscreen mask */
3358
3358
  __publicField(this, "mask", new SimulatorMask());
3359
3359
  /** History records */
@@ -3362,7 +3362,20 @@ const _PageAgent = class _PageAgent extends EventTarget {
3362
3362
  __privateSet(this, _llm, new LLM(this.config, this.id));
3363
3363
  this.i18n = new I18n(this.config.language);
3364
3364
  this.panel = new Panel(this);
3365
+ this.tools = new Map(tools);
3366
+ if (this.config.customTools) {
3367
+ for (const [name, tool2] of Object.entries(this.config.customTools)) {
3368
+ if (tool2 === null) {
3369
+ this.tools.delete(name);
3370
+ continue;
3371
+ }
3372
+ this.tools.set(name, tool2);
3373
+ }
3374
+ }
3365
3375
  patchReact();
3376
+ window.addEventListener("beforeunload", (e) => {
3377
+ if (!this.disposed) this.dispose("PAGE_UNLOADING");
3378
+ });
3366
3379
  }
3367
3380
  /**
3368
3381
  * @todo maybe return something?
@@ -3370,12 +3383,18 @@ const _PageAgent = class _PageAgent extends EventTarget {
3370
3383
  async execute(task) {
3371
3384
  if (!task) throw new Error("Task is required");
3372
3385
  this.task = task;
3386
+ this.taskId = uid();
3387
+ const onBeforeStep = this.config.onBeforeStep || (() => void 0);
3388
+ const onAfterStep = this.config.onAfterStep || (() => void 0);
3389
+ const onBeforeTask = this.config.onBeforeTask || (() => void 0);
3390
+ const onAfterTask = this.config.onAfterTask || (() => void 0);
3391
+ await onBeforeTask.call(this);
3373
3392
  this.mask.show();
3374
3393
  this.bus.emit("panel:show");
3375
3394
  this.bus.emit("panel:reset");
3376
3395
  this.bus.emit("panel:update", {
3377
3396
  type: "input",
3378
- displayText: task
3397
+ displayText: this.task
3379
3398
  });
3380
3399
  if (__privateGet(this, _abortController)) {
3381
3400
  __privateGet(this, _abortController).abort();
@@ -3385,6 +3404,7 @@ const _PageAgent = class _PageAgent extends EventTarget {
3385
3404
  try {
3386
3405
  let step = 0;
3387
3406
  while (true) {
3407
+ await onBeforeStep.call(this, step);
3388
3408
  console.group(`step: ${step + 1}`);
3389
3409
  if (__privateGet(this, _abortController).signal.aborted) throw new Error("AbortError");
3390
3410
  await waitUntil(() => !this.paused);
@@ -3428,38 +3448,45 @@ const _PageAgent = class _PageAgent extends EventTarget {
3428
3448
  });
3429
3449
  console.log(chalk.green("Step finished:"), actionName);
3430
3450
  console.groupEnd();
3451
+ await onAfterStep.call(this, step, this.history);
3431
3452
  step++;
3432
3453
  if (step > MAX_STEPS) {
3433
3454
  __privateMethod(this, _PageAgent_instances, onDone_fn).call(this, "Step count exceeded maximum limit", false);
3434
- return {
3455
+ const result2 = {
3435
3456
  success: false,
3436
3457
  data: "Step count exceeded maximum limit",
3437
3458
  history: this.history
3438
3459
  };
3460
+ await onAfterTask.call(this, result2);
3461
+ return result2;
3439
3462
  }
3440
3463
  if (actionName === "done") {
3441
3464
  const success = action.input?.success ?? false;
3442
3465
  const text = action.input?.text || "no text provided";
3443
3466
  console.log(chalk.green.bold("Task completed"), success, text);
3444
3467
  __privateMethod(this, _PageAgent_instances, onDone_fn).call(this, text, success);
3445
- return {
3468
+ const result2 = {
3446
3469
  success,
3447
3470
  data: text,
3448
3471
  history: this.history
3449
3472
  };
3473
+ await onAfterTask.call(this, result2);
3474
+ return result2;
3450
3475
  }
3451
3476
  }
3452
3477
  } catch (error2) {
3453
3478
  console.error("Task failed", error2);
3454
3479
  __privateMethod(this, _PageAgent_instances, onDone_fn).call(this, String(error2), false);
3455
- return {
3480
+ const result = {
3456
3481
  success: false,
3457
3482
  data: String(error2),
3458
3483
  history: this.history
3459
3484
  };
3485
+ await onAfterTask.call(this, result);
3486
+ return result;
3460
3487
  }
3461
3488
  }
3462
- dispose() {
3489
+ dispose(reason) {
3463
3490
  console.log("Disposing PageAgent...");
3464
3491
  this.disposed = true;
3465
3492
  cleanUpHighlights();
@@ -3469,7 +3496,8 @@ const _PageAgent = class _PageAgent extends EventTarget {
3469
3496
  this.panel.dispose();
3470
3497
  this.mask.dispose();
3471
3498
  this.history = [];
3472
- __privateGet(this, _abortController).abort("PageAgent disposed");
3499
+ __privateGet(this, _abortController).abort(reason ?? "PageAgent disposed");
3500
+ this.config.onDispose?.call(this, reason);
3473
3501
  }
3474
3502
  };
3475
3503
  _llm = new WeakMap();
@@ -3503,8 +3531,6 @@ packMacroTool_fn = /* @__PURE__ */ __name(function() {
3503
3531
  action: actionSchema
3504
3532
  });
3505
3533
  return {
3506
- // name: MACRO_TOOL_NAME,
3507
- // description: 'Execute agent action', // @todo remote
3508
3534
  inputSchema: macroToolSchema,
3509
3535
  execute: /* @__PURE__ */ __name(async (input2) => {
3510
3536
  if (__privateGet(this, _abortController).signal.aborted) throw new Error("AbortError");
@@ -3668,6 +3694,7 @@ updateTree_fn = /* @__PURE__ */ __name(function() {
3668
3694
  __name(_PageAgent, "PageAgent");
3669
3695
  let PageAgent = _PageAgent;
3670
3696
  export {
3671
- PageAgent
3697
+ PageAgent,
3698
+ tool
3672
3699
  };
3673
3700
  //# sourceMappingURL=page-agent.js.map