@mcp-browser-kit/server 2.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/main.js CHANGED
@@ -6524,6 +6524,7 @@ var startTRpcServer = () => {
6524
6524
  };
6525
6525
  process.on("SIGTERM", shutdown);
6526
6526
  process.on("SIGINT", shutdown);
6527
+ process.stdin.on("close", shutdown);
6527
6528
  };
6528
6529
 
6529
6530
  // ../../node_modules/@modelcontextprotocol/sdk/dist/esm/types.js
@@ -9474,9 +9475,12 @@ server.tool(
9474
9475
  "getTabs",
9475
9476
  [
9476
9477
  combinationDescription,
9477
- "# Description",
9478
- "- This tool returns the overview of the current tabs on user's Browser.",
9479
- "- Usually being called before other tools to get a specific tabId to read/interact with."
9478
+ "\u26A0\uFE0F CRITICAL FIRST STEP - ALWAYS START HERE BEFORE ANY OTHER TOOLS!",
9479
+ "* This tool MUST be called first to obtain the list of open browser tabs.",
9480
+ "* Each tab includes a unique ID that is required for all subsequent tool operations.",
9481
+ "* Note which tab is active (marked with 'active: true') as this is essential information.",
9482
+ "* The tabId from this list is required for captureActiveTab and all other interactions.",
9483
+ "* Workflow: 1) getTabs \u2192 2) captureActiveTab \u2192 3) interact with elements"
9480
9484
  ].join("\n"),
9481
9485
  {},
9482
9486
  async () => {
@@ -9491,15 +9495,55 @@ server.tool(
9491
9495
  };
9492
9496
  }
9493
9497
  );
9498
+ server.tool(
9499
+ "captureActiveTab",
9500
+ [
9501
+ combinationDescription,
9502
+ "\u26A0\uFE0F SECOND REQUIRED STEP - AFTER getTabs AND BEFORE ANY INTERACTION!",
9503
+ "* SEQUENCE: First call getTabs to get tabId \u2192 Then use captureActiveTab with that tabId",
9504
+ "* IMPORTANT: This tool ONLY works for the ACTIVE tab (marked with 'active: true' in getTabs results).",
9505
+ "* If you need to work with an INACTIVE tab, use the Readable tools instead (getReadableElements + clickOnReadableElement).",
9506
+ "* ALWAYS capture a screenshot before attempting interaction with the active tab.",
9507
+ "* FOR ACTIVE TAB, VISIBLE ELEMENTS: Use coordinate-based Viewable tools (clickOnViewableElement, fillTextToViewableElement).",
9508
+ "* FOR INACTIVE TABS or HIDDEN ELEMENTS: Use Readable tools (getReadableElements + clickOnReadableElement/fillTextToReadableElement).",
9509
+ "* DECISION RULE: Active tab visible elements \u2192 Viewable tools. Inactive tabs or hidden elements \u2192 Readable tools.",
9510
+ "* Returns visual context showing exactly where form fields, buttons, and other UI elements are located.",
9511
+ "* After any page change or navigation, YOU MUST capture a new screenshot before further interactions."
9512
+ ].join("\n"),
9513
+ {},
9514
+ async () => {
9515
+ const screenshot = await rpcClient.defer("captureActiveTab");
9516
+ return {
9517
+ content: [
9518
+ {
9519
+ type: "text",
9520
+ text: `Screenshot size [${screenshot.width}x${screenshot.height}] - Use these dimensions to calculate exact pixel coordinates for clicking and text entry`
9521
+ },
9522
+ {
9523
+ type: "image",
9524
+ mimeType: screenshot.mimeType,
9525
+ data: screenshot.data
9526
+ }
9527
+ ]
9528
+ };
9529
+ }
9530
+ );
9494
9531
  server.tool(
9495
9532
  "getInnerText",
9496
9533
  [
9497
9534
  combinationDescription,
9498
- "- Use this tool to get the innerText of the current tab.",
9499
- "- Usually being called after `getTabs` to read or identify element to interact with."
9535
+ "\u26A0\uFE0F FASTEST & MOST EFFICIENT TEXT EXTRACTION TOOL",
9536
+ "* PREFERRED FIRST CHOICE for any task that only needs to read text (no interaction required).",
9537
+ "* Much faster and more efficient than capturing screenshots for text-only operations.",
9538
+ "* Extracts all readable text content from the specified tab in a single call.",
9539
+ "* Ideal for: content analysis, information extraction, summarization, and search tasks.",
9540
+ "* Perfect for generating suggestions, answering questions, or analyzing page content.",
9541
+ "* Use this BEFORE screenshot capture when you only need to understand text context.",
9542
+ "* Works on any tab using tabId from getTabs, not just active tabs.",
9543
+ "* WARNING: This text extraction cannot be used for direct element interaction."
9500
9544
  ].join("\n"),
9501
9545
  {
9502
- tabId: z.string().describe("Tab ID to evaluate the code in")
9546
+ tabId: z.string().describe("Tab ID to extract text from")
9503
9547
  },
9504
9548
  async ({ tabId }) => {
9505
9549
  const innerText = await rpcClient.defer("getInnerText", tabId);
@@ -9517,11 +9561,14 @@ server.tool(
9517
9561
  "getReadableElements",
9518
9562
  [
9519
9563
  combinationDescription,
9520
- "- Use this tool to get a list of [index,tag,accessible-text] of all readable elements in the current tab.",
9521
- "- Usually being called after `getTabs` to read or identify element to interact with."
9564
+ "* Returns an indexed list of all interactive elements in the format: [index, HTML tag, accessible text].",
9565
+ "* This creates a map of elements you can interact with programmatically.",
9566
+ "* The element indexes can be used with clickOnReadableElement and fillTextToReadableElement.",
9567
+ "* Ideal for forms, navigation menus, and interactive page components.",
9568
+ "* Use with tabId from getTabs to target specific tabs."
9522
9569
  ].join("\n"),
9523
9570
  {
9524
- tabId: z.string().describe("Tab ID to evaluate the code in")
9571
+ tabId: z.string().describe("Tab ID to extract elements from")
9525
9572
  },
9526
9573
  async ({ tabId }) => {
9527
9574
  const elements = await rpcClient.defer("getReadableElements", tabId);
@@ -9536,19 +9583,88 @@ server.tool(
9536
9583
  }
9537
9584
  );
9538
9585
  server.tool(
9539
- "fillTextToIndex",
9586
+ "clickOnViewableElement",
9587
+ [
9588
+ combinationDescription,
9589
+ "\u26A0\uFE0F PREFERRED TOOL - Use this FIRST for ANY element visible in the screenshot from captureActiveTab!",
9590
+ "* Works on the ACTIVE tab for any element you can SEE in the viewport screenshot.",
9591
+ "* FIRST CHOICE: Always prefer this tool over clickOnReadableElement when the target is visible.",
9592
+ "* Simulates a mouse click at the exact (x,y) coordinates specified.",
9593
+ "* Use captureActiveTab \u2192 identify target element \u2192 determine its CENTER coordinates \u2192 use this tool.",
9594
+ "* Calculate the center by finding the midpoint of the element's width and height.",
9595
+ "* For buttons and links, always aim for the center to ensure proper click registration.",
9596
+ "* If this tool fails or element is outside viewport, THEN try clickOnReadableElement as a fallback.",
9597
+ "* After clicking, capture another screenshot to verify the action succeeded."
9598
+ ].join("\n"),
9599
+ {
9600
+ tabId: z.string().describe("Tab ID of the active tab"),
9601
+ x: z.number().describe("X coordinate (pixels) of the element to click"),
9602
+ y: z.number().describe("Y coordinate (pixels) of the element to click")
9603
+ },
9604
+ async ({ tabId, x, y }) => {
9605
+ await rpcClient.defer("clickOnViewableElement", tabId, x, y);
9606
+ return {
9607
+ content: [
9608
+ {
9609
+ type: "text",
9610
+ text: "Done"
9611
+ }
9612
+ ]
9613
+ };
9614
+ }
9615
+ );
9616
+ server.tool(
9617
+ "fillTextToViewableElement",
9618
+ [
9619
+ combinationDescription,
9620
+ "\u26A0\uFE0F PREFERRED TOOL - Use this FIRST for ANY input field visible in the screenshot from captureActiveTab!",
9621
+ "* Works on the ACTIVE tab for any input field you can SEE in the viewport screenshot.",
9622
+ "* FIRST CHOICE: Always prefer this tool over fillTextToReadableElement when the input field is visible.",
9623
+ "* Sets text value for an input element at the specified (x,y) coordinates.",
9624
+ "* Use captureActiveTab \u2192 identify input field \u2192 determine its CENTER coordinates \u2192 use this tool.",
9625
+ "* Calculate the center by finding the midpoint of the input field's width and height.",
9626
+ "* Clicking on the center ensures the field is properly selected before text entry.",
9627
+ "* If this tool fails or input field is outside viewport, THEN try fillTextToReadableElement as a fallback.",
9628
+ "* For multi-step forms, fill all inputs before submitting the form."
9629
+ ].join("\n"),
9630
+ {
9631
+ tabId: z.string().describe("Tab ID of the active tab"),
9632
+ x: z.number().describe("X coordinate (pixels) of the input element"),
9633
+ y: z.number().describe("Y coordinate (pixels) of the input element"),
9634
+ value: z.string().describe("Text to enter into the input field")
9635
+ },
9636
+ async ({ tabId, x, y, value }) => {
9637
+ await rpcClient.defer("fillTextToViewableElement", tabId, x, y, value);
9638
+ return {
9639
+ content: [
9640
+ {
9641
+ type: "text",
9642
+ text: "Done"
9643
+ }
9644
+ ]
9645
+ };
9646
+ }
9647
+ );
9648
+ server.tool(
9649
+ "fillTextToReadableElement",
9540
9650
  [
9541
9651
  combinationDescription,
9542
- "- Use this tool to set value to the ReadableElement at the specified index.",
9543
- "- For tasks requiring user input (text, numbers, passwords, etc.), such as fill form before submit, fill text before search, fill username & password before login"
9652
+ "\u26A0\uFE0F FALLBACK TOOL - Only use when fillTextToViewableElement cannot help!",
9653
+ "* Use this tool ONLY if fillTextToViewableElement failed or the input field is not visible.",
9654
+ "* Acts as a direct fallback when coordinate-based interaction with visible elements doesn't work.",
9655
+ "* Sets text value for an input element identified by its index from getReadableElements.",
9656
+ "* Works on any tab, not just the active one.",
9657
+ "* Run getReadableElements first to obtain the correct element index.",
9658
+ "* Use when form fields are not visible without scrolling or are in iframes/embedded content.",
9659
+ "* Also effective for cases where coordinate-based interaction failed or is unreliable."
9544
9660
  ].join("\n"),
9545
9661
  {
9546
- tabId: z.string().describe("Tab ID to evaluate the code in"),
9547
- index: z.number().describe("Index of the element to set value"),
9548
- value: z.string().describe("Value to set")
9662
+ tabId: z.string().describe("Tab ID to target"),
9663
+ index: z.number().describe("Element index from getReadableElements"),
9664
+ value: z.string().describe("Text to enter into the input field")
9549
9665
  },
9550
9666
  async ({ tabId, index, value }) => {
9551
- await rpcClient.defer("fillTextToIndex", tabId, index, value);
9667
+ await rpcClient.defer("fillTextToReadableElement", tabId, index, value);
9552
9668
  return {
9553
9669
  content: [
9554
9670
  {
@@ -9560,20 +9676,24 @@ server.tool(
9560
9676
  }
9561
9677
  );
9562
9678
  server.tool(
9563
- "clickOnIndex",
9679
+ "clickOnReadableElement",
9564
9680
  [
9565
9681
  combinationDescription,
9566
- "- Use this tool to click on ReadableElement at the specified index.",
9567
- "- For tasks that involve clicking, like buttons, links, etc.",
9568
- "- Usually being called after `getReadableElements` to interact with the element.",
9569
- "- If the task potentially requires filling inputs, use `fillTextToIndex` first."
9682
+ "\u26A0\uFE0F FALLBACK TOOL - Only use when clickOnViewableElement cannot help!",
9683
+ "* Use this tool ONLY if clickOnViewableElement failed or the target element is not visible.",
9684
+ "* Acts as a direct fallback when coordinate-based clicking on visible elements doesn't work.",
9685
+ "* Clicks on an element identified by its index from getReadableElements.",
9686
+ "* Works on any tab, not just the active one.",
9687
+ "* Run getReadableElements first to obtain the correct element index.",
9688
+ "* Use when buttons/links are not visible without scrolling or are in iframes/embedded content.",
9689
+ "* Also effective for cases where coordinate-based clicking failed or is unreliable."
9570
9690
  ].join("\n"),
9571
9691
  {
9572
- tabId: z.string().describe("Tab ID to evaluate the code in"),
9573
- index: z.number().describe("Index of the element to click")
9692
+ tabId: z.string().describe("Tab ID to target"),
9693
+ index: z.number().describe("Element index from getReadableElements")
9574
9694
  },
9575
9695
  async ({ tabId, index }) => {
9576
- await rpcClient.defer("clickOnIndex", tabId, index);
9696
+ await rpcClient.defer("clickOnReadableElement", tabId, index);
9577
9697
  return {
9578
9698
  content: [
9579
9699
  {
@@ -9588,14 +9708,16 @@ server.tool(
9588
9708
  "invokeJsFn",
9589
9709
  [
9590
9710
  combinationDescription,
9591
- "# Description",
9592
- "- Use this tool only if action cannot be performed via series of clicks and types.",
9593
- "- Use this tool to evaluate a JavaScript function body in the context of the page.",
9594
- "- Usually being called after `getReadableElements` to interact with the element."
9711
+ "\u26A0\uFE0F USE THIS TOOL AS A LAST RESORT ONLY.",
9712
+ "* Executes custom JavaScript code directly in the page context.",
9713
+ "* Only use when standard tools (clicking, text input) cannot accomplish the task.",
9714
+ "* The JavaScript function body must be self-contained and return a serializable value.",
9715
+ "* Useful for complex interactions, custom data extraction, or handling dynamic elements.",
9716
+ "* Example: scrolling, accessing hidden elements, or interacting with complex widgets."
9595
9717
  ].join("\n"),
9596
9718
  {
9597
- tabId: z.string().describe("Tab ID to evaluate the code in"),
9598
- fnBodyCode: z.string().describe("A JavaScript function body to evaluate")
9719
+ tabId: z.string().describe("Tab ID to run JavaScript in"),
9720
+ fnBodyCode: z.string().describe("JavaScript function body to execute in page context")
9599
9721
  },
9600
9722
  async ({ tabId, fnBodyCode }) => {
9601
9723
  const result = await rpcClient.defer("invokeJsFn", tabId, fnBodyCode);