@mcp-browser-kit/server 2.0.0 → 3.0.0

This diff shows the changes between publicly available versions of the package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
package/dist/main.js CHANGED
@@ -9474,9 +9474,12 @@ server.tool(
9474
9474
  "getTabs",
9475
9475
  [
9476
9476
  combinationDescription,
9477
- "# Description",
9478
- "- This tool returns the overview of the current tabs on user's Browser.",
9479
- "- Usually being called before other tools to get a specific tabId to read/interact with."
9477
+ "\u26A0\uFE0F CRITICAL FIRST STEP - ALWAYS START HERE BEFORE ANY OTHER TOOLS!",
9478
+ "* This tool MUST be called first to obtain the list of open browser tabs.",
9479
+ "* Each tab includes a unique ID that is required for all subsequent tool operations.",
9480
+ "* Note which tab is active (marked with 'active: true') as this is essential information.",
9481
+ "* The tabId from this list is required for captureActiveTab and all other interactions.",
9482
+ "* Workflow: 1) getTabs \u2192 2) captureActiveTab \u2192 3) interact with elements"
9480
9483
  ].join("\n"),
9481
9484
  {},
9482
9485
  async () => {
@@ -9491,15 +9494,55 @@ server.tool(
9491
9494
  };
9492
9495
  }
9493
9496
  );
9497
+ server.tool(
9498
+ "captureActiveTab",
9499
+ [
9500
+ combinationDescription,
9501
+ "\u26A0\uFE0F SECOND REQUIRED STEP - AFTER getTabs AND BEFORE ANY INTERACTION!",
9502
+ "* SEQUENCE: First call getTabs to get tabId \u2192 Then use captureActiveTab with that tabId",
9503
+ "* IMPORTANT: This tool ONLY works for the ACTIVE tab (marked with 'active: true' in getTabs results).",
9504
+ "* If you need to work with an INACTIVE tab, use the Readable tools instead (getReadableElements + clickOnReadableElement).",
9505
+ "* ALWAYS capture a screenshot before attempting interaction with the active tab.",
9506
+ "* FOR ACTIVE TAB, VISIBLE ELEMENTS: Use coordinate-based Viewable tools (clickOnViewableElement, fillTextToViewableElement).",
9507
+ "* FOR INACTIVE TABS or HIDDEN ELEMENTS: Use Readable tools (getReadableElements + clickOnReadableElement/fillTextToReadableElement).",
9508
+ "* DECISION RULE: Active tab visible elements \u2192 Viewable tools. Inactive tabs or hidden elements \u2192 Readable tools.",
9509
+ "* Returns visual context showing exactly where form fields, buttons, and other UI elements are located.",
9510
+ "* After any page change or navigation, YOU MUST capture a new screenshot before further interactions."
9511
+ ].join("\n"),
9512
+ {},
9513
+ async () => {
9514
+ const screenshot = await rpcClient.defer("captureActiveTab");
9515
+ return {
9516
+ content: [
9517
+ {
9518
+ type: "text",
9519
+ text: `Screenshot size [${screenshot.width}x${screenshot.height}] - Use these dimensions to calculate exact pixel coordinates for clicking and text entry`
9520
+ },
9521
+ {
9522
+ type: "image",
9523
+ mimeType: screenshot.mimeType,
9524
+ data: screenshot.data
9525
+ }
9526
+ ]
9527
+ };
9528
+ }
9529
+ );
9494
9530
  server.tool(
9495
9531
  "getInnerText",
9496
9532
  [
9497
9533
  combinationDescription,
9498
- "- Use this tool to get the innerText of the current tab.",
9499
- "- Usually being called after `getTabs` to read or identify element to interact with."
9534
+ "\u26A0\uFE0F FASTEST & MOST EFFICIENT TEXT EXTRACTION TOOL",
9535
+ "* PREFERRED FIRST CHOICE for any task that only needs to read text (no interaction required).",
9536
+ "* Much faster and more efficient than capturing screenshots for text-only operations.",
9537
+ "* Extracts all readable text content from the specified tab in a single call.",
9538
+ "* Ideal for: content analysis, information extraction, summarization, and search tasks.",
9539
+ "* Perfect for generating suggestions, answering questions, or analyzing page content.",
9540
+ "* Use this BEFORE screenshot capture when you only need to understand text context.",
9541
+ "* Works on any tab using tabId from getTabs, not just active tabs.",
9542
+ "* WARNING: This text extraction cannot be used for direct element interaction."
9500
9543
  ].join("\n"),
9501
9544
  {
9502
- tabId: z.string().describe("Tab ID to evaluate the code in")
9545
+ tabId: z.string().describe("Tab ID to extract text from")
9503
9546
  },
9504
9547
  async ({ tabId }) => {
9505
9548
  const innerText = await rpcClient.defer("getInnerText", tabId);
@@ -9517,11 +9560,14 @@ server.tool(
9517
9560
  "getReadableElements",
9518
9561
  [
9519
9562
  combinationDescription,
9520
- "- Use this tool to get a list of [index,tag,accessible-text] of all readable elements in the current tab.",
9521
- "- Usually being called after `getTabs` to read or identify element to interact with."
9563
+ "* Returns an indexed list of all interactive elements in the format: [index, HTML tag, accessible text].",
9564
+ "* This creates a map of elements you can interact with programmatically.",
9565
+ "* The element indexes can be used with clickOnReadableElement and fillTextToReadableElement.",
9566
+ "* Ideal for forms, navigation menus, and interactive page components.",
9567
+ "* Use with tabId from getTabs to target specific tabs."
9522
9568
  ].join("\n"),
9523
9569
  {
9524
- tabId: z.string().describe("Tab ID to evaluate the code in")
9570
+ tabId: z.string().describe("Tab ID to extract elements from")
9525
9571
  },
9526
9572
  async ({ tabId }) => {
9527
9573
  const elements = await rpcClient.defer("getReadableElements", tabId);
@@ -9536,19 +9582,88 @@ server.tool(
9536
9582
  }
9537
9583
  );
9538
9584
  server.tool(
9539
- "fillTextToIndex",
9585
+ "clickOnViewableElement",
9586
+ [
9587
+ combinationDescription,
9588
+ "\u26A0\uFE0F PREFERRED TOOL - Use this FIRST for ANY element visible in the screenshot from captureActiveTab!",
9589
+ "* Works on the ACTIVE tab for any element you can SEE in the viewport screenshot.",
9590
+ "* FIRST CHOICE: Always prefer this tool over clickOnReadableElement when the target is visible.",
9591
+ "* Simulates a mouse click at the exact (x,y) coordinates specified.",
9592
+ "* Use captureActiveTab \u2192 identify target element \u2192 determine its CENTER coordinates \u2192 use this tool.",
9593
+ "* Calculate the center by finding the midpoint of the element's width and height.",
9594
+ "* For buttons and links, always aim for the center to ensure proper click registration.",
9595
+ "* If this tool fails or element is outside viewport, THEN try clickOnReadableElement as a fallback.",
9596
+ "* After clicking, capture another screenshot to verify the action succeeded."
9597
+ ].join("\n"),
9598
+ {
9599
+ tabId: z.string().describe("Tab ID of the active tab"),
9600
+ x: z.number().describe("X coordinate (pixels) of the element to click"),
9601
+ y: z.number().describe("Y coordinate (pixels) of the element to click")
9602
+ },
9603
+ async ({ tabId, x, y }) => {
9604
+ await rpcClient.defer("clickOnViewableElement", tabId, x, y);
9605
+ return {
9606
+ content: [
9607
+ {
9608
+ type: "text",
9609
+ text: "Done"
9610
+ }
9611
+ ]
9612
+ };
9613
+ }
9614
+ );
9615
+ server.tool(
9616
+ "fillTextToViewableElement",
9617
+ [
9618
+ combinationDescription,
9619
+ "\u26A0\uFE0F PREFERRED TOOL - Use this FIRST for ANY input field visible in the screenshot from captureActiveTab!",
9620
+ "* Works on the ACTIVE tab for any input field you can SEE in the viewport screenshot.",
9621
+ "* FIRST CHOICE: Always prefer this tool over fillTextToReadableElement when the input field is visible.",
9622
+ "* Sets text value for an input element at the specified (x,y) coordinates.",
9623
+ "* Use captureActiveTab \u2192 identify input field \u2192 determine its CENTER coordinates \u2192 use this tool.",
9624
+ "* Calculate the center by finding the midpoint of the input field's width and height.",
9625
+ "* Clicking on the center ensures the field is properly selected before text entry.",
9626
+ "* If this tool fails or input field is outside viewport, THEN try fillTextToReadableElement as a fallback.",
9627
+ "* For multi-step forms, fill all inputs before submitting the form."
9628
+ ].join("\n"),
9629
+ {
9630
+ tabId: z.string().describe("Tab ID of the active tab"),
9631
+ x: z.number().describe("X coordinate (pixels) of the input element"),
9632
+ y: z.number().describe("Y coordinate (pixels) of the input element"),
9633
+ value: z.string().describe("Text to enter into the input field")
9634
+ },
9635
+ async ({ tabId, x, y, value }) => {
9636
+ await rpcClient.defer("fillTextToViewableElement", tabId, x, y, value);
9637
+ return {
9638
+ content: [
9639
+ {
9640
+ type: "text",
9641
+ text: "Done"
9642
+ }
9643
+ ]
9644
+ };
9645
+ }
9646
+ );
9647
+ server.tool(
9648
+ "fillTextToReadableElement",
9540
9649
  [
9541
9650
  combinationDescription,
9542
- "- Use this tool to set value to the ReadableElement at the specified index.",
9543
- "- For tasks requiring user input (text, numbers, passwords, etc.), such as fill form before submit, fill text before search, fill username & password before login"
9651
+ "\u26A0\uFE0F FALLBACK TOOL - Only use when fillTextToViewableElement cannot help!",
9652
+ "* Use this tool ONLY if fillTextToViewableElement failed or the input field is not visible.",
9653
+ "* Acts as a direct fallback when coordinate-based interaction with visible elements doesn't work.",
9654
+ "* Sets text value for an input element identified by its index from getReadableElements.",
9655
+ "* Works on any tab, not just the active one.",
9656
+ "* Run getReadableElements first to obtain the correct element index.",
9657
+ "* Use when form fields are not visible without scrolling or are in iframes/embedded content.",
9658
+ "* Also effective for cases where coordinate-based interaction failed or is unreliable."
9544
9659
  ].join("\n"),
9545
9660
  {
9546
- tabId: z.string().describe("Tab ID to evaluate the code in"),
9547
- index: z.number().describe("Index of the element to set value"),
9548
- value: z.string().describe("Value to set")
9661
+ tabId: z.string().describe("Tab ID to target"),
9662
+ index: z.number().describe("Element index from getReadableElements"),
9663
+ value: z.string().describe("Text to enter into the input field")
9549
9664
  },
9550
9665
  async ({ tabId, index, value }) => {
9551
- await rpcClient.defer("fillTextToIndex", tabId, index, value);
9666
+ await rpcClient.defer("fillTextToReadableElement", tabId, index, value);
9552
9667
  return {
9553
9668
  content: [
9554
9669
  {
@@ -9560,20 +9675,24 @@ server.tool(
9560
9675
  }
9561
9676
  );
9562
9677
  server.tool(
9563
- "clickOnIndex",
9678
+ "clickOnReadableElement",
9564
9679
  [
9565
9680
  combinationDescription,
9566
- "- Use this tool to click on ReadableElement at the specified index.",
9567
- "- For tasks that involve clicking, like buttons, links, etc.",
9568
- "- Usually being called after `getReadableElements` to interact with the element.",
9569
- "- If the task potentially requires filling inputs, use `fillTextToIndex` first."
9681
+ "\u26A0\uFE0F FALLBACK TOOL - Only use when clickOnViewableElement cannot help!",
9682
+ "* Use this tool ONLY if clickOnViewableElement failed or the target element is not visible.",
9683
+ "* Acts as a direct fallback when coordinate-based clicking on visible elements doesn't work.",
9684
+ "* Clicks on an element identified by its index from getReadableElements.",
9685
+ "* Works on any tab, not just the active one.",
9686
+ "* Run getReadableElements first to obtain the correct element index.",
9687
+ "* Use when buttons/links are not visible without scrolling or are in iframes/embedded content.",
9688
+ "* Also effective for cases where coordinate-based clicking failed or is unreliable."
9570
9689
  ].join("\n"),
9571
9690
  {
9572
- tabId: z.string().describe("Tab ID to evaluate the code in"),
9573
- index: z.number().describe("Index of the element to click")
9691
+ tabId: z.string().describe("Tab ID to target"),
9692
+ index: z.number().describe("Element index from getReadableElements")
9574
9693
  },
9575
9694
  async ({ tabId, index }) => {
9576
- await rpcClient.defer("clickOnIndex", tabId, index);
9695
+ await rpcClient.defer("clickOnReadableElement", tabId, index);
9577
9696
  return {
9578
9697
  content: [
9579
9698
  {
@@ -9588,14 +9707,16 @@ server.tool(
9588
9707
  "invokeJsFn",
9589
9708
  [
9590
9709
  combinationDescription,
9591
- "# Description",
9592
- "- Use this tool only if action cannot be performed via series of clicks and types.",
9593
- "- Use this tool to evaluate a JavaScript function body in the context of the page.",
9594
- "- Usually being called after `getReadableElements` to interact with the element."
9710
+ "\u26A0\uFE0F USE THIS TOOL AS A LAST RESORT ONLY.",
9711
+ "* Executes custom JavaScript code directly in the page context.",
9712
+ "* Only use when standard tools (clicking, text input) cannot accomplish the task.",
9713
+ "* The JavaScript function body must be self-contained and return a serializable value.",
9714
+ "* Useful for complex interactions, custom data extraction, or handling dynamic elements.",
9715
+ "* Example: scrolling, accessing hidden elements, or interacting with complex widgets."
9595
9716
  ].join("\n"),
9596
9717
  {
9597
- tabId: z.string().describe("Tab ID to evaluate the code in"),
9598
- fnBodyCode: z.string().describe("A JavaScript function body to evaluate")
9718
+ tabId: z.string().describe("Tab ID to run JavaScript in"),
9719
+ fnBodyCode: z.string().describe("JavaScript function body to execute in page context")
9599
9720
  },
9600
9721
  async ({ tabId, fnBodyCode }) => {
9601
9722
  const result = await rpcClient.defer("invokeJsFn", tabId, fnBodyCode);