page-agent 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -4,19 +4,19 @@
4
4
 
5
5
  [![npm version](https://badge.fury.io/js/page-agent.svg)](https://badge.fury.io/js/page-agent) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![TypeScript](https://img.shields.io/badge/%3C%2F%3E-TypeScript-%230074c1.svg)](http://www.typescriptlang.org/) [![Downloads](https://img.shields.io/npm/dt/page-agent.svg)](https://www.npmjs.com/package/page-agent) [![Bundle Size](https://img.shields.io/bundlephobia/minzip/page-agent)](https://bundlephobia.com/package/page-agent) [![GitHub stars](https://img.shields.io/github/stars/alibaba/page-agent.svg)](https://github.com/alibaba/page-agent)
6
6
 
7
- **Transform your webpage into an AI-powered application with a single script tag.**
8
-
9
- An in-page UI agent in javascript. Control web interfaces with natural language.
7
+ The GUI Agent Living in Your Webpage. Control web interfaces with natural language.
10
8
 
11
9
  🌐 **English** | [中文](./README-zh.md)
12
10
 
13
- 👉 [🚀 **Demo**](https://alibaba.github.io/page-agent/) | [📖 **Documentation**](https://alibaba.github.io/page-agent/#/docs/introduction/overview)
11
+ 👉 <a href="https://alibaba.github.io/page-agent/" target="_blank"><b>🚀 Demo</b></a> | <a href="https://alibaba.github.io/page-agent/#/docs/introduction/overview" target="_blank"><b>📖 Documentation</b></a>
12
+
13
+ <video id="demo-video" src="https://github.com/user-attachments/assets/de8d1964-8bde-494f-a52f-2975469557a5" width="640" crossorigin muted autoplay loop></video>
14
14
 
15
15
  ---
16
16
 
17
17
  ## ✨ Features
18
18
 
19
- - **🎯 Easy Integration**
19
+ - **🎯 Easy Integration** - Transform your webpage into an agent with a single script tag.
20
20
  - **🔐 Client-Side Processing**
21
21
  - **🧠 DOM Extraction**
22
22
  - **💬 Natural Language Interface**
@@ -33,7 +33,7 @@ An in-page UI agent in javascript. Control web interfaces with natural language.
33
33
  ```html
34
34
  <!-- temporary CDN URL. May change in the future -->
35
35
  <script
36
- src="https://hwcxiuzfylggtcktqgij.supabase.co/storage/v1/object/public/demo-public/v0.0.2/page-agent.js"
36
+ src="https://hwcxiuzfylggtcktqgij.supabase.co/storage/v1/object/public/demo-public/v0.0.4/page-agent.js"
37
37
  crossorigin="true"
38
38
  type="text/javascript"
39
39
  ></script>
@@ -48,6 +48,13 @@ declare interface AgentConfig {
48
48
  * @note when dispose caused by page unload, reason will be 'PAGE_UNLOADING'. this method CANNOT block unloading. async operations may be cut.
49
49
  */
50
50
  onDispose?: (this: PageAgent, reason?: string) => void;
51
+ /**
52
+ * @experimental
53
+ * Enable the experimental script execution tool that allows executing generated JavaScript code on the page.
54
+ * @note Can cause unpredictable side effects.
55
+ * @note May bypass some safe guards and data-masking mechanisms.
56
+ */
57
+ experimentalScriptExecutionTool?: boolean;
51
58
  /**
52
59
  * TODO: @unimplemented
53
60
  * hook when action causes a new page to be opened
@@ -167,11 +174,11 @@ declare class EventBus extends EventTarget {
167
174
  /**
168
175
  * Listen to built-in events
169
176
  */
170
- on<T extends keyof PageAgentEventMap>(event: T, handler: EventHandler<T & keyof PageAgentEventMap>): void;
177
+ on<T extends keyof PageAgentEventMap>(event: T, handler: EventHandler<T>): void;
171
178
  /**
172
179
  * Listen to built-in events (one-time)
173
180
  */
174
- once<T extends keyof PageAgentEventMap>(event: T, handler: EventHandler<T & keyof PageAgentEventMap>): void;
181
+ once<T extends keyof PageAgentEventMap>(event: T, handler: EventHandler<T>): void;
175
182
  /**
176
183
  * Emit built-in events
177
184
  */
@@ -492,15 +499,3 @@ declare type TranslationParams = Record<string, string | number>;
492
499
  declare type TranslationSchema = DeepStringify<typeof enUS>;
493
500
 
494
501
  export { }
495
-
496
-
497
- declare module 'react-i18next' {
498
- interface CustomTypeOptions {
499
- defaultNS: 'common';
500
- resources: {
501
- common: typeof commonZh;
502
- home: typeof homeZh;
503
- docs: typeof docsZh;
504
- };
505
- }
506
- }
@@ -1189,7 +1189,7 @@ function flatTreeToString(flatTree, include_attributes) {
1189
1189
  }
1190
1190
  return false;
1191
1191
  }, "hasParentWithHighlightIndex");
1192
- const processNode = /* @__PURE__ */ __name((node, depth, result2) => {
1192
+ const processNode = /* @__PURE__ */ __name((node, depth, result22) => {
1193
1193
  let nextDepth = depth;
1194
1194
  const depthStr = " ".repeat(depth);
1195
1195
  if (node.type === "element") {
@@ -1264,23 +1264,23 @@ function flatTreeToString(flatTree, include_attributes) {
1264
1264
  line += " ";
1265
1265
  }
1266
1266
  line += " />";
1267
- result2.push(line);
1267
+ result22.push(line);
1268
1268
  }
1269
1269
  for (const child of node.children) {
1270
- processNode(child, nextDepth, result2);
1270
+ processNode(child, nextDepth, result22);
1271
1271
  }
1272
1272
  } else if (node.type === "text") {
1273
1273
  if (hasParentWithHighlightIndex(node)) {
1274
1274
  return;
1275
1275
  }
1276
1276
  if (node.parent && node.parent.type === "element" && node.parent.isVisible && node.parent.isTopElement) {
1277
- result2.push(`${depthStr}${node.text ?? ""}`);
1277
+ result22.push(`${depthStr}${node.text ?? ""}`);
1278
1278
  }
1279
1279
  }
1280
1280
  }, "processNode");
1281
- const result = [];
1282
- processNode(rootNode, 0, result);
1283
- return result.join("\n");
1281
+ const result2 = [];
1282
+ processNode(rootNode, 0, result2);
1283
+ return result2.join("\n");
1284
1284
  }
1285
1285
  __name(flatTreeToString, "flatTreeToString");
1286
1286
  const getAllTextTillNextClickableElement = /* @__PURE__ */ __name((node, maxDepth = -1) => {
@@ -1651,6 +1651,8 @@ function lenientParseMacroToolCall(responseData, inputSchema) {
1651
1651
  }
1652
1652
  switch (choice.finish_reason) {
1653
1653
  case "tool_calls":
1654
+ case "function_call":
1655
+ // gemini
1654
1656
  case "stop":
1655
1657
  break;
1656
1658
  case "length":
@@ -1902,8 +1904,8 @@ const _LLM = class _LLM {
1902
1904
  async invoke(messages, tools2, abortSignal) {
1903
1905
  return await withRetry(
1904
1906
  async () => {
1905
- const result = await this.client.invoke(messages, tools2, abortSignal);
1906
- return result;
1907
+ const result2 = await this.client.invoke(messages, tools2, abortSignal);
1908
+ return result2;
1907
1909
  },
1908
1910
  // retry settings
1909
1911
  {
@@ -2413,6 +2415,24 @@ tools.set(
2413
2415
  }, "execute")
2414
2416
  })
2415
2417
  );
2418
+ tools.set(
2419
+ "execute_javascript",
2420
+ tool({
2421
+ description: "Execute JavaScript code on the current page. Supports async/await syntax. Use with caution!",
2422
+ inputSchema: zod.object({
2423
+ script: zod.string()
2424
+ }),
2425
+ execute: /* @__PURE__ */ __name(async function(input) {
2426
+ try {
2427
+ const asyncFunction = eval(`(async () => { ${input.script} })`);
2428
+ const result = await asyncFunction();
2429
+ return `βœ… Executed JavaScript. Result: ${result}` + await getSystemInfo();
2430
+ } catch (error2) {
2431
+ return `❌ Error executing JavaScript: ${error2}` + await getSystemInfo();
2432
+ }
2433
+ }, "execute")
2434
+ })
2435
+ );
2416
2436
  async function waitUntil(check, timeout = 60 * 601e3) {
2417
2437
  if (check()) return true;
2418
2438
  return new Promise((resolve, reject) => {
@@ -3372,6 +3392,9 @@ const _PageAgent = class _PageAgent extends EventTarget {
3372
3392
  this.tools.set(name, tool2);
3373
3393
  }
3374
3394
  }
3395
+ if (!this.config.experimentalScriptExecutionTool) {
3396
+ this.tools.delete("execute_javascript");
3397
+ }
3375
3398
  patchReact();
3376
3399
  window.addEventListener("beforeunload", (e) => {
3377
3400
  if (!this.disposed) this.dispose("PAGE_UNLOADING");
@@ -3413,7 +3436,7 @@ const _PageAgent = class _PageAgent extends EventTarget {
3413
3436
  type: "thinking",
3414
3437
  displayText: this.i18n.t("ui.panel.thinking")
3415
3438
  });
3416
- const result = await __privateGet(this, _llm).invoke(
3439
+ const result2 = await __privateGet(this, _llm).invoke(
3417
3440
  [
3418
3441
  {
3419
3442
  role: "system",
@@ -3427,7 +3450,7 @@ const _PageAgent = class _PageAgent extends EventTarget {
3427
3450
  { AgentOutput: __privateMethod(this, _PageAgent_instances, packMacroTool_fn).call(this) },
3428
3451
  __privateGet(this, _abortController).signal
3429
3452
  );
3430
- const macroResult = result.toolResult;
3453
+ const macroResult = result2.toolResult;
3431
3454
  const input2 = macroResult.input;
3432
3455
  const output2 = macroResult.output;
3433
3456
  const brain = {
@@ -3444,7 +3467,7 @@ const _PageAgent = class _PageAgent extends EventTarget {
3444
3467
  this.history.push({
3445
3468
  brain,
3446
3469
  action,
3447
- usage: result.usage
3470
+ usage: result2.usage
3448
3471
  });
3449
3472
  console.log(chalk.green("Step finished:"), actionName);
3450
3473
  console.groupEnd();
@@ -3452,38 +3475,38 @@ const _PageAgent = class _PageAgent extends EventTarget {
3452
3475
  step++;
3453
3476
  if (step > MAX_STEPS) {
3454
3477
  __privateMethod(this, _PageAgent_instances, onDone_fn).call(this, "Step count exceeded maximum limit", false);
3455
- const result2 = {
3478
+ const result22 = {
3456
3479
  success: false,
3457
3480
  data: "Step count exceeded maximum limit",
3458
3481
  history: this.history
3459
3482
  };
3460
- await onAfterTask.call(this, result2);
3461
- return result2;
3483
+ await onAfterTask.call(this, result22);
3484
+ return result22;
3462
3485
  }
3463
3486
  if (actionName === "done") {
3464
3487
  const success = action.input?.success ?? false;
3465
3488
  const text = action.input?.text || "no text provided";
3466
3489
  console.log(chalk.green.bold("Task completed"), success, text);
3467
3490
  __privateMethod(this, _PageAgent_instances, onDone_fn).call(this, text, success);
3468
- const result2 = {
3491
+ const result22 = {
3469
3492
  success,
3470
3493
  data: text,
3471
3494
  history: this.history
3472
3495
  };
3473
- await onAfterTask.call(this, result2);
3474
- return result2;
3496
+ await onAfterTask.call(this, result22);
3497
+ return result22;
3475
3498
  }
3476
3499
  }
3477
3500
  } catch (error2) {
3478
3501
  console.error("Task failed", error2);
3479
3502
  __privateMethod(this, _PageAgent_instances, onDone_fn).call(this, String(error2), false);
3480
- const result = {
3503
+ const result2 = {
3481
3504
  success: false,
3482
3505
  data: String(error2),
3483
3506
  history: this.history
3484
3507
  };
3485
- await onAfterTask.call(this, result);
3486
- return result;
3508
+ await onAfterTask.call(this, result2);
3509
+ return result2;
3487
3510
  }
3488
3511
  }
3489
3512
  dispose(reason) {
@@ -3558,16 +3581,16 @@ packMacroTool_fn = /* @__PURE__ */ __name(function() {
3558
3581
  displayText: getToolExecutingText(toolName, toolInput, this.i18n)
3559
3582
  });
3560
3583
  const startTime = Date.now();
3561
- let result = await tool2.execute.bind(this)(toolInput);
3584
+ let result2 = await tool2.execute.bind(this)(toolInput);
3562
3585
  const duration = Date.now() - startTime;
3563
- console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result);
3586
+ console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result2);
3564
3587
  if (toolName === "wait") {
3565
3588
  __privateSet(this, _totalWaitTime, __privateGet(this, _totalWaitTime) + Math.round(toolInput.seconds + duration / 1e3));
3566
- result += `
3589
+ result2 += `
3567
3590
  <sys> You have waited ${__privateGet(this, _totalWaitTime)} seconds accumulatively.`;
3568
3591
  if (__privateGet(this, _totalWaitTime) >= 3)
3569
- result += "\nDo NOT wait any longer unless you have a good reason.\n";
3570
- result += "</sys>";
3592
+ result2 += "\nDo NOT wait any longer unless you have a good reason.\n";
3593
+ result2 += "</sys>";
3571
3594
  } else {
3572
3595
  __privateSet(this, _totalWaitTime, 0);
3573
3596
  }
@@ -3577,14 +3600,14 @@ packMacroTool_fn = /* @__PURE__ */ __name(function() {
3577
3600
  type: "tool_executing",
3578
3601
  toolName,
3579
3602
  toolArgs: toolInput,
3580
- toolResult: result,
3603
+ toolResult: result2,
3581
3604
  displayText: displayResult,
3582
3605
  duration
3583
3606
  });
3584
3607
  await new Promise((resolve) => setTimeout(resolve, 100));
3585
3608
  return {
3586
3609
  input: input2,
3587
- output: result
3610
+ output: result2
3588
3611
  };
3589
3612
  }, "execute")
3590
3613
  };