agent-vision-mcp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +117 -0
  3. package/dist/browser/cdp/browser-cdp-discovery-service.d.ts +10 -0
  4. package/dist/browser/cdp/browser-cdp-discovery-service.js +28 -0
  5. package/dist/browser/cdp/browser-live-tab-service.d.ts +16 -0
  6. package/dist/browser/cdp/browser-live-tab-service.js +42 -0
  7. package/dist/browser/cdp/browser-see-service.d.ts +33 -0
  8. package/dist/browser/cdp/browser-see-service.js +76 -0
  9. package/dist/browser/cdp/browser-tab-context-service.d.ts +23 -0
  10. package/dist/browser/cdp/browser-tab-context-service.js +90 -0
  11. package/dist/browser/cdp/browser-tab-resolution-service.d.ts +9 -0
  12. package/dist/browser/cdp/browser-tab-resolution-service.js +65 -0
  13. package/dist/browser/cdp/browser-tab-screenshot-service.d.ts +20 -0
  14. package/dist/browser/cdp/browser-tab-screenshot-service.js +59 -0
  15. package/dist/browser/cdp/cdp-websocket-session.d.ts +9 -0
  16. package/dist/browser/cdp/cdp-websocket-session.js +99 -0
  17. package/dist/browser/cdp/chrome-cdp-client.d.ts +12 -0
  18. package/dist/browser/cdp/chrome-cdp-client.js +141 -0
  19. package/dist/browser/cdp/live-browser-tab-registry.d.ts +12 -0
  20. package/dist/browser/cdp/live-browser-tab-registry.js +96 -0
  21. package/dist/browser/cdp/png-metadata.d.ts +5 -0
  22. package/dist/browser/cdp/png-metadata.js +16 -0
  23. package/dist/browser/cdp/tab-model.d.ts +33 -0
  24. package/dist/browser/cdp/tab-model.js +15 -0
  25. package/dist/browser/cdp/tab-resolution.d.ts +27 -0
  26. package/dist/browser/cdp/tab-resolution.js +48 -0
  27. package/dist/browser/cdp/types.d.ts +71 -0
  28. package/dist/browser/cdp/types.js +1 -0
  29. package/dist/capture/capture-pipeline.d.ts +5 -0
  30. package/dist/capture/capture-pipeline.js +1 -0
  31. package/dist/capture/create-screen-capture-provider.d.ts +3 -0
  32. package/dist/capture/create-screen-capture-provider.js +8 -0
  33. package/dist/capture/in-memory-capture-pipeline.d.ts +13 -0
  34. package/dist/capture/in-memory-capture-pipeline.js +52 -0
  35. package/dist/capture/in-memory-image-compositor.d.ts +5 -0
  36. package/dist/capture/in-memory-image-compositor.js +34 -0
  37. package/dist/capture/linux-portal-screenshot-provider.d.ts +8 -0
  38. package/dist/capture/linux-portal-screenshot-provider.js +181 -0
  39. package/dist/capture/mock-screen-capture-provider.d.ts +5 -0
  40. package/dist/capture/mock-screen-capture-provider.js +22 -0
  41. package/dist/capture/png-metadata.d.ts +5 -0
  42. package/dist/capture/png-metadata.js +18 -0
  43. package/dist/capture/screen-capture-provider.d.ts +4 -0
  44. package/dist/capture/screen-capture-provider.js +1 -0
  45. package/dist/capture/types.d.ts +38 -0
  46. package/dist/capture/types.js +1 -0
  47. package/dist/cdp-demo.d.ts +1 -0
  48. package/dist/cdp-demo.js +41 -0
  49. package/dist/demo.d.ts +1 -0
  50. package/dist/demo.js +54 -0
  51. package/dist/desktop/capture-now.d.ts +1 -0
  52. package/dist/desktop/capture-now.js +48 -0
  53. package/dist/desktop/controller.d.ts +25 -0
  54. package/dist/desktop/controller.js +77 -0
  55. package/dist/desktop/main.d.ts +1 -0
  56. package/dist/desktop/main.js +80 -0
  57. package/dist/desktop/preload.d.ts +1 -0
  58. package/dist/desktop/preload.js +26 -0
  59. package/dist/desktop/types.d.ts +31 -0
  60. package/dist/desktop/types.js +1 -0
  61. package/dist/errors/app-error.d.ts +7 -0
  62. package/dist/errors/app-error.js +11 -0
  63. package/dist/flow/types.d.ts +48 -0
  64. package/dist/flow/types.js +1 -0
  65. package/dist/flow/visual-capture-flow.d.ts +13 -0
  66. package/dist/flow/visual-capture-flow.js +196 -0
  67. package/dist/index.d.ts +1 -0
  68. package/dist/index.js +3 -0
  69. package/dist/logging/logger.d.ts +15 -0
  70. package/dist/logging/logger.js +28 -0
  71. package/dist/mcp/stdio-server.d.ts +19 -0
  72. package/dist/mcp/stdio-server.js +272 -0
  73. package/dist/mcp/tool-registry.d.ts +21 -0
  74. package/dist/mcp/tool-registry.js +33 -0
  75. package/dist/mcp-stdio.d.ts +2 -0
  76. package/dist/mcp-stdio.js +8 -0
  77. package/dist/overlay/local-overlay-agent.d.ts +46 -0
  78. package/dist/overlay/local-overlay-agent.js +551 -0
  79. package/dist/overlay/overlay-bundle-factory.d.ts +4 -0
  80. package/dist/overlay/overlay-bundle-factory.js +24 -0
  81. package/dist/overlay/types.d.ts +83 -0
  82. package/dist/overlay/types.js +1 -0
  83. package/dist/server.d.ts +19 -0
  84. package/dist/server.js +158 -0
  85. package/dist/session/capture-session-service.d.ts +21 -0
  86. package/dist/session/capture-session-service.js +50 -0
  87. package/dist/session/session-manager.d.ts +29 -0
  88. package/dist/session/session-manager.js +217 -0
  89. package/dist/session/session-store.d.ts +8 -0
  90. package/dist/session/session-store.js +15 -0
  91. package/dist/session/session-waiter.d.ts +14 -0
  92. package/dist/session/session-waiter.js +102 -0
  93. package/dist/types/annotation.d.ts +32 -0
  94. package/dist/types/annotation.js +1 -0
  95. package/dist/types/capture.d.ts +33 -0
  96. package/dist/types/capture.js +1 -0
  97. package/dist/types/session.d.ts +36 -0
  98. package/dist/types/session.js +1 -0
  99. package/package.json +38 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Kedar
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,117 @@
1
+ <div align="center">
2
+ <h1>Agent Vision</h1>
3
+ <p align="center">
4
+ <img src="./docs/banner.png" width="800" />
5
+ </p>
6
+ <p>
7
+ <img alt="TypeScript" src="https://img.shields.io/badge/TypeScript-ES2022-3178C6?style=flat-square" />
8
+ <img alt="Chrome CDP" src="https://img.shields.io/badge/Chrome-CDP-4285F4?style=flat-square" />
9
+ <img alt="MCP" src="https://img.shields.io/badge/MCP-stdio-7C3AED?style=flat-square" />
10
+ <img alt="License" src="https://img.shields.io/badge/License-MIT-22C55E?style=flat-square" />
11
+ </p>
12
+ </div>
13
+
14
+ Agent Vision is a browser-first MCP server that gives agents direct visual access to live Chrome or Brave tabs through the Chrome DevTools Protocol. Instead of asking users to manually capture screenshots and upload them into chat, Agent Vision lets a coding agent resolve a tab by title or URL, capture a real screenshot on demand, and pull structured browser context like visible text, page metadata, and viewport details in the same flow.
15
+
16
+ ## Problem
17
+
18
+ LLM workflows break down when visual browser context is trapped behind manual steps. A user sees an issue in a tab, but the agent cannot see the same thing unless the user pauses, takes a screenshot, uploads it, and adds extra explanation. That friction slows debugging, weakens iteration speed, and loses important structured context that browsers already expose natively. Agent Vision solves this by making browser state directly available as MCP tools.
19
+
20
+ ## Workflow
21
+
22
+ ```mermaid
23
+ flowchart TD
24
+ A[User asks agent to inspect a browser tab] --> B[Codex calls Agent Vision MCP tool]
25
+ B --> C[Agent Vision queries Chrome DevTools Protocol]
26
+ C --> D[Discover live tabs]
27
+ D --> E[Resolve best matching tab by title or URL]
28
+ E --> F[Capture screenshot through CDP]
29
+ E --> G[Evaluate page for visible text and metadata]
30
+ F --> H[Return MCP image content]
31
+ G --> I[Return MCP structured text content]
32
+ H --> J[Agent receives visual + structured browser context]
33
+ I --> J
34
+ ```
35
+
36
+ ## Tools
37
+
38
+ | Tool | What it does |
39
+ | --- | --- |
40
+ | `getBrowserCdpStatus` | Checks whether Agent Vision can reach the configured Chrome DevTools Protocol endpoint. |
41
+ | `discoverBrowserTabsViaCdp` | Returns the raw live tab list exposed by Chrome DevTools Protocol. |
42
+ | `refreshLiveBrowserTabs` | Refreshes the normalized in-memory live tab model from CDP. |
43
+ | `listLiveBrowserTabs` | Returns the cached normalized live tab model without re-querying the browser. |
44
+ | `pruneStaleLiveBrowserTabs` | Removes cached tabs that have become stale because they have not been refreshed recently. |
45
+ | `resolveLiveBrowserTab` | Resolves the active or best matching tab for a `/see`-style query using title and URL heuristics. |
46
+ | `captureResolvedBrowserTabScreenshot` | Captures a real PNG screenshot from the resolved browser tab via CDP. |
47
+ | `getResolvedBrowserTabContext` | Extracts structured page context such as visible text, page title, page URL, language, content type, and viewport metadata. |
48
+ | `seeBrowserTabViaCdp` | Runs the full high-level browser inspection flow: resolve tab, capture screenshot, and return structured browser context. |
49
+
50
+ ## Setup And Installation
51
+
52
+ ### Prerequisites
53
+
54
+ - Node.js 22+
55
+ - Brave, Chrome, or another Chromium-based browser
56
+ - Codex configured locally with MCP support
57
+
58
+ ### Install dependencies
59
+
60
+ ```bash
61
+ npm install
62
+ npm run build
63
+ ```
64
+
65
+ ### Start a CDP-enabled browser session
66
+
67
+ ```bash
68
+ brave-browser --remote-debugging-port=9222 --user-data-dir=/tmp/agent-vision-cdp
69
+ ```
70
+
71
+ ### Register Agent Vision in Codex
72
+
73
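+ Add this to `~/.codex/config.toml`, replacing the two absolute paths below with the path to your own Node.js binary and to this project's built `dist/mcp-stdio.js`: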
74
+
75
+ ```toml
76
+ [mcp_servers.agent-vision]
77
+ command = "/home/kedar/.nvm/versions/node/v22.22.0/bin/node"
78
+ args = ["/home/kedar/Desktop/Projects/llm_vision/dist/mcp-stdio.js"]
79
+ ```
80
+
81
+ Then restart Codex.
82
+
83
+ ### Verify the MCP server is available
84
+
85
+ ```bash
86
+ codex mcp list
87
+ ```
88
+
89
+ ## Example Usage
90
+
91
+ ### Manual local verification
92
+
93
+ ```bash
94
+ npm run demo-cdp
95
+ ```
96
+
97
+ ### Example Codex prompts
98
+
99
+ ```text
100
+ Use the agent-vision MCP server to list my live browser tabs.
101
+ Use the agent-vision MCP server to inspect the active browser tab.
102
+ Use the agent-vision MCP server to inspect the tab matching "docs".
103
+ ```
104
+
105
+ ### Future npm-distributed registration shape
106
+
107
+ After publishing, the Codex config can switch to:
108
+
109
+ ```toml
110
+ [mcp_servers.agent-vision]
111
+ command = "npx"
112
+ args = ["-y", "agent-vision-mcp"]
113
+ ```
114
+
115
+ ## License
116
+
117
+ MIT. See [LICENSE](./LICENSE).
@@ -0,0 +1,10 @@
1
+ import type { Logger } from "../../logging/logger.js";
2
+ import { ChromeCdpClient } from "./chrome-cdp-client.js";
3
+ import type { CdpConnectionStatus, CdpTabDiscoveryResult } from "./types.js";
/**
 * Logging wrapper around {@link ChromeCdpClient} for endpoint status checks and
 * raw tab discovery.
 */
export declare class BrowserCdpDiscoveryService {
    private readonly client;
    private readonly logger;
    /**
     * @param client CDP client that performs the actual status and discovery calls.
     * @param logger Receives an informational entry after successful calls.
     */
    constructor(client: ChromeCdpClient, logger: Logger);
    /**
     * Returns the client's connection status unchanged.
     *
     * @param endpointOverride Optional endpoint, forwarded to the client as-is.
     */
    getConnectionStatus(endpointOverride?: string): Promise<CdpConnectionStatus>;
    /**
     * Returns the client's tab discovery result unchanged.
     *
     * @param endpointOverride Optional endpoint, forwarded to the client as-is.
     */
    discoverTabs(endpointOverride?: string): Promise<CdpTabDiscoveryResult>;
}
@@ -0,0 +1,28 @@
1
+ export class BrowserCdpDiscoveryService {
2
+ client;
3
+ logger;
4
+ constructor(client, logger) {
5
+ this.client = client;
6
+ this.logger = logger;
7
+ }
8
+ async getConnectionStatus(endpointOverride) {
9
+ const status = await this.client.getConnectionStatus(endpointOverride);
10
+ if (status.connected) {
11
+ this.logger.info("CDP connection ready", {
12
+ endpoint: status.endpoint,
13
+ browser: status.browser,
14
+ protocolVersion: status.protocolVersion
15
+ });
16
+ }
17
+ return status;
18
+ }
19
+ async discoverTabs(endpointOverride) {
20
+ const result = await this.client.discoverTabs(endpointOverride);
21
+ this.logger.info("CDP tab discovery completed", {
22
+ endpoint: result.endpoint,
23
+ tabCount: result.tabs.length,
24
+ browser: result.browser
25
+ });
26
+ return result;
27
+ }
28
+ }
@@ -0,0 +1,16 @@
1
+ import type { Logger } from "../../logging/logger.js";
2
+ import { BrowserCdpDiscoveryService } from "./browser-cdp-discovery-service.js";
3
+ import { LiveBrowserTabRegistry } from "./live-browser-tab-registry.js";
4
+ import { type LiveBrowserTabDiscoveryResult, type LiveBrowserTab } from "./tab-model.js";
/**
 * Keeps the in-memory live tab registry in sync with CDP discovery and exposes
 * read/prune access to it.
 */
export declare class BrowserLiveTabService {
    private readonly discovery;
    private readonly registry;
    private readonly logger;
    /**
     * @param discovery Service used to query the browser for its current tabs.
     * @param registry In-memory store of normalized live tabs.
     * @param logger Receives an informational entry after each refresh.
     */
    constructor(discovery: BrowserCdpDiscoveryService, registry: LiveBrowserTabRegistry, logger: Logger);
    /**
     * Runs a fresh discovery, upserts the result into the registry and returns
     * the normalized tab list.
     *
     * @param endpointOverride Optional endpoint, forwarded to discovery as-is.
     */
    refresh(endpointOverride?: string): Promise<LiveBrowserTabDiscoveryResult>;
    /**
     * Asks the registry to drop stale tabs.
     *
     * @param maxAgeMs Optional age threshold, forwarded to the registry as-is.
     * @returns The tabs that were removed plus the listing that remains.
     */
    pruneStale(maxAgeMs?: number): {
        removed: LiveBrowserTab[];
        remaining: LiveBrowserTabDiscoveryResult;
    };
    /** Returns the registry's current tabs without querying the browser. */
    list(): LiveBrowserTabDiscoveryResult;
}
@@ -0,0 +1,42 @@
1
+ import { toDiscoveryResult } from "./tab-model.js";
/**
 * Bridges CDP discovery and the live tab registry: `refresh` pulls from the
 * browser, while `list` and `pruneStale` only touch the registry.
 */
export class BrowserLiveTabService {
    discovery;
    registry;
    logger;
    /**
     * @param discovery Object exposing `discoverTabs(endpointOverride)`.
     * @param registry Object exposing `upsertDiscovery`, `pruneStale` and `list`.
     * @param logger Object exposing `info(message, metadata)`.
     */
    constructor(discovery, registry, logger) {
        this.discovery = discovery;
        this.registry = registry;
        this.logger = logger;
    }
    /**
     * Discovers tabs, upserts them into the registry and logs the outcome.
     *
     * @param endpointOverride Optional endpoint, forwarded to discovery as-is.
     * @returns Whatever `toDiscoveryResult` builds from the discovery result and the upserted tabs.
     */
    async refresh(endpointOverride) {
        const discovered = await this.discovery.discoverTabs(endpointOverride);
        const { endpoint, discoveredAt, browser } = discovered;
        const liveTabs = this.registry.upsertDiscovery({
            endpoint,
            discoveredAt,
            tabs: discovered.tabs
        });
        this.logger.info("Refreshed live browser tab model", {
            endpoint,
            tabCount: liveTabs.length,
            browser
        });
        return toDiscoveryResult(discovered, liveTabs);
    }
    /**
     * Prunes stale tabs from the registry.
     *
     * @param maxAgeMs Optional age threshold, forwarded to the registry as-is.
     * @returns The removed tabs and the listing taken after pruning.
     */
    pruneStale(maxAgeMs) {
        const removed = this.registry.pruneStale(maxAgeMs);
        const remaining = this.list();
        return { removed, remaining };
    }
    /**
     * Lists the registry's tabs.
     * `endpoint` and `discoveredAt` are taken from the first tab; with an empty
     * registry they fall back to "unknown" and the current time.
     */
    list() {
        const tabs = this.registry.list();
        const first = tabs[0];
        return {
            endpoint: first?.lastDiscoveryEndpoint ?? "unknown",
            discoveredAt: first?.lastSeenAt ?? new Date().toISOString(),
            tabs
        };
    }
}
@@ -0,0 +1,33 @@
1
+ import type { Logger } from "../../logging/logger.js";
2
+ import { BrowserLiveTabService } from "./browser-live-tab-service.js";
3
+ import { BrowserTabContextService } from "./browser-tab-context-service.js";
4
+ import { BrowserTabScreenshotService } from "./browser-tab-screenshot-service.js";
5
+ import type { BrowserTabScreenshot, BrowserTabStructuredContext } from "./types.js";
6
+ import type { LiveBrowserTabCandidate } from "./tab-resolution.js";
/**
 * Outcome of a browser-first `/see` request.
 *
 * The first variant is a successful capture. The second is returned when tab
 * resolution did not yield exactly one tab.
 */
export type BrowserSeeResult =
    | {
        status: "completed";
        query?: string;
        stage: "completed";
        matchedCandidate: LiveBrowserTabCandidate;
        candidateCount: number;
        screenshot: BrowserTabScreenshot;
        context: BrowserTabStructuredContext;
        message: string;
        /** Present (and `true`) only when the result came from the retry after a refresh. */
        recoveredAfterRefresh?: boolean;
    }
    | {
        status: "ambiguous" | "not_found";
        query?: string;
        /** "needs_disambiguation" accompanies an "ambiguous" status; otherwise "not_found". */
        stage: "needs_disambiguation" | "not_found";
        candidates: LiveBrowserTabCandidate[];
        message: string;
        /** Present (and `true`) only when the result came from the retry after a refresh. */
        recoveredAfterRefresh?: boolean;
    };
/**
 * High-level `/see` flow: resolve a tab, capture its screenshot and collect its
 * structured page context.
 */
export declare class BrowserSeeService {
    private readonly liveTabs;
    private readonly screenshots;
    private readonly contexts;
    private readonly logger;
    /**
     * @param liveTabs Used to refresh the live tab model before a retry.
     * @param screenshots Resolves the tab for a query and captures its screenshot.
     * @param contexts Collects structured page context for the resolved tab.
     * @param logger Receives progress and retry log entries.
     */
    constructor(liveTabs: BrowserLiveTabService, screenshots: BrowserTabScreenshotService, contexts: BrowserTabContextService, logger: Logger);
    /**
     * Runs the flow for `query`, retrying after a live-tab refresh when the
     * first attempt fails or finds no tab.
     *
     * @param query Optional title/URL query, forwarded to tab resolution.
     */
    see(query?: string): Promise<BrowserSeeResult>;
    private seeOnce;
}
@@ -0,0 +1,76 @@
1
+ export class BrowserSeeService {
2
+ liveTabs;
3
+ screenshots;
4
+ contexts;
5
+ logger;
6
+ constructor(liveTabs, screenshots, contexts, logger) {
7
+ this.liveTabs = liveTabs;
8
+ this.screenshots = screenshots;
9
+ this.contexts = contexts;
10
+ this.logger = logger;
11
+ }
12
+ async see(query) {
13
+ try {
14
+ const firstAttempt = await this.seeOnce(query, false);
15
+ if (firstAttempt.status !== "not_found") {
16
+ return firstAttempt;
17
+ }
18
+ this.logger.warn("Browser-first /see returned not_found, retrying after refresh", {
19
+ query,
20
+ candidateCount: firstAttempt.candidates.length
21
+ });
22
+ await this.liveTabs.refresh();
23
+ return await this.seeOnce(query, true);
24
+ }
25
+ catch (error) {
26
+ const message = error instanceof Error ? error.message : String(error);
27
+ this.logger.warn("Browser-first /see failed before completion, retrying after refresh", {
28
+ query,
29
+ errorMessage: message
30
+ });
31
+ await this.liveTabs.refresh();
32
+ return await this.seeOnce(query, true);
33
+ }
34
+ }
35
+ async seeOnce(query, recoveredAfterRefresh) {
36
+ const screenshotResult = await this.screenshots.captureResolved(query);
37
+ if (screenshotResult.status !== "completed") {
38
+ this.logger.info("Browser-first /see did not complete immediately", {
39
+ query,
40
+ status: screenshotResult.status,
41
+ candidateCount: screenshotResult.candidates.length,
42
+ recoveredAfterRefresh
43
+ });
44
+ return {
45
+ status: screenshotResult.status,
46
+ query: screenshotResult.query,
47
+ stage: screenshotResult.status === "ambiguous" ? "needs_disambiguation" : "not_found",
48
+ candidates: screenshotResult.candidates,
49
+ message: screenshotResult.message,
50
+ ...(recoveredAfterRefresh ? { recoveredAfterRefresh: true } : {})
51
+ };
52
+ }
53
+ const context = await this.contexts.getForResolvedTab(screenshotResult.resolution, query);
54
+ const mapped = {
55
+ status: "completed",
56
+ query: screenshotResult.query,
57
+ stage: "completed",
58
+ matchedCandidate: screenshotResult.resolution.matchedCandidate,
59
+ candidateCount: screenshotResult.resolution.candidateCount,
60
+ screenshot: screenshotResult.screenshot,
61
+ context,
62
+ message: `Captured browser tab \"${screenshotResult.screenshot.title}\" through CDP with structured page context.`,
63
+ ...(recoveredAfterRefresh ? { recoveredAfterRefresh: true } : {})
64
+ };
65
+ this.logger.info("Completed browser-first /see flow", {
66
+ query,
67
+ title: mapped.screenshot.title,
68
+ targetId: mapped.screenshot.targetId,
69
+ width: mapped.screenshot.width,
70
+ height: mapped.screenshot.height,
71
+ visibleTextLength: mapped.context.visibleTextLength,
72
+ recoveredAfterRefresh
73
+ });
74
+ return mapped;
75
+ }
76
+ }
@@ -0,0 +1,23 @@
1
+ import type { Logger } from "../../logging/logger.js";
2
+ import { BrowserTabResolutionService } from "./browser-tab-resolution-service.js";
3
+ import type { LiveBrowserTabResolution } from "./tab-resolution.js";
4
+ import type { BrowserTabStructuredContext } from "./types.js";
/**
 * Result of collecting context for a query: either the resolved tab together
 * with its structured context, or the unresolved ("ambiguous" / "not_found")
 * resolution passed through as-is.
 */
export type GetResolvedBrowserTabContextResult =
    | {
        status: "completed";
        query?: string;
        resolution: Extract<LiveBrowserTabResolution, { status: "resolved" }>;
        context: BrowserTabStructuredContext;
    }
    | Extract<LiveBrowserTabResolution, { status: "ambiguous" | "not_found" }>;
/** Collects structured page context (text, metadata, viewport) for a browser tab. */
export declare class BrowserTabContextService {
    private readonly resolver;
    private readonly logger;
    /**
     * @param resolver Resolves a query to a live browser tab.
     * @param logger Receives an informational entry after each collection.
     */
    constructor(resolver: BrowserTabResolutionService, logger: Logger);
    /**
     * Collects context for a tab that has already been resolved.
     *
     * @param resolution A resolution whose status is "resolved".
     * @param query Optional original query; used for logging only.
     */
    getForResolvedTab(
        resolution: Extract<LiveBrowserTabResolution, { status: "resolved" }>,
        query?: string
    ): Promise<BrowserTabStructuredContext>;
    /**
     * Resolves `query` and, when exactly one tab is resolved, collects its
     * context. Unresolved outcomes are returned unchanged.
     */
    getResolvedContext(query?: string): Promise<GetResolvedBrowserTabContextResult>;
}
@@ -0,0 +1,90 @@
1
+ import { withCdpTabSession } from "./cdp-websocket-session.js";
/** Returns the current time as an ISO-8601 string (`Date.prototype.toISOString`). */
const nowIso = () => {
    const now = new Date();
    return now.toISOString();
};
/**
 * JavaScript source evaluated inside the inspected page via `Runtime.evaluate`.
 * It is an immediately-invoked arrow function returning a plain object with the
 * page title, URL, document language, content type, `document.body.innerText`
 * and the window's viewport metrics. Missing values fall back to "".
 *
 * The literal is a runtime string: its interior is left exactly as written.
 */
const CONTEXT_EVALUATION_EXPRESSION = `(() => {
const bodyText = document.body?.innerText ?? "";
return {
pageTitle: document.title ?? "",
pageUrl: window.location.href ?? "",
documentLanguage: document.documentElement?.lang ?? "",
contentType: document.contentType ?? "",
visibleText: bodyText,
viewport: {
width: window.innerWidth,
height: window.innerHeight,
devicePixelRatio: window.devicePixelRatio
}
};
})()`;
/**
 * Returns `value` unchanged when it is a string with at least one
 * non-whitespace character; otherwise `undefined`.
 */
const asOptionalString = (value) => {
    if (typeof value !== "string") {
        return undefined;
    }
    return value.trim() === "" ? undefined : value;
};
/** Returns `value` when it is a string (including ""), otherwise the empty string. */
const asVisibleText = (value) => {
    if (typeof value === "string") {
        return value;
    }
    return "";
};
20
+ const asViewport = (value) => {
21
+ if (!value || typeof value !== "object") {
22
+ return undefined;
23
+ }
24
+ const candidate = value;
25
+ return {
26
+ width: typeof candidate.width === "number" ? candidate.width : undefined,
27
+ height: typeof candidate.height === "number" ? candidate.height : undefined,
28
+ devicePixelRatio: typeof candidate.devicePixelRatio === "number" ? candidate.devicePixelRatio : undefined
29
+ };
30
+ };
/**
 * Evaluates CONTEXT_EVALUATION_EXPRESSION in the resolved tab and maps the
 * returned value into a structured context record.
 *
 * The evaluation payload is treated as untrusted: a missing or non-object
 * value is replaced by an empty object, so every page-derived field degrades
 * to `undefined` (or "" for `visibleText`) instead of throwing.
 *
 * @param resolution A resolution carrying the target `tab`.
 * @returns The context record; `backend` is always "cdp-runtime-evaluate".
 */
const collectTabContext = async (resolution) => {
    const { tab } = resolution;
    return await withCdpTabSession(tab, async (session) => {
        await session.sendCommand("Page.enable");
        await session.sendCommand("Runtime.enable");
        const evaluation = await session.sendCommand("Runtime.evaluate", {
            expression: CONTEXT_EVALUATION_EXPRESSION,
            returnByValue: true
        });
        const remoteObject = evaluation.result;
        const hasValue = Boolean(remoteObject) && typeof remoteObject === "object" && "value" in remoteObject;
        const returned = hasValue ? remoteObject.value : undefined;
        const payload = returned && typeof returned === "object" ? returned : {};
        const visibleText = asVisibleText(payload.visibleText);
        return {
            // Identity fields come from the tab model, not from the page.
            targetId: tab.targetId,
            title: tab.title,
            url: tab.url,
            browserName: tab.browserName,
            // Page-reported fields, each validated individually.
            pageTitle: asOptionalString(payload.pageTitle),
            pageUrl: asOptionalString(payload.pageUrl),
            documentLanguage: asOptionalString(payload.documentLanguage),
            contentType: asOptionalString(payload.contentType),
            visibleText,
            visibleTextLength: visibleText.length,
            viewport: asViewport(payload.viewport),
            collectedAt: nowIso(),
            backend: "cdp-runtime-evaluate"
        };
    });
};
/** Collects structured page context for a live browser tab and logs each collection. */
export class BrowserTabContextService {
    resolver;
    logger;
    /**
     * @param resolver Object exposing `resolve(query)`.
     * @param logger Object exposing `info(message, metadata)`.
     */
    constructor(resolver, logger) {
        this.resolver = resolver;
        this.logger = logger;
    }
    /**
     * Collects context for an already-resolved tab.
     *
     * @param resolution A resolution carrying the target `tab`.
     * @param query Optional original query; only written to the log entry.
     * @returns The collected context record.
     */
    async getForResolvedTab(resolution, query) {
        const context = await collectTabContext(resolution);
        const { targetId, title } = resolution.tab;
        this.logger.info("Collected structured browser tab context via CDP", {
            query,
            targetId,
            title,
            visibleTextLength: context.visibleTextLength
        });
        return context;
    }
    /**
     * Resolves `query` to a tab and collects its context.
     *
     * @param query Optional title/URL query.
     * @returns A "completed" record, or the resolver's result unchanged when
     *          its status is anything other than "resolved".
     */
    async getResolvedContext(query) {
        const resolution = this.resolver.resolve(query);
        if (resolution.status === "resolved") {
            const context = await this.getForResolvedTab(resolution, query);
            return { status: "completed", query, resolution, context };
        }
        return resolution;
    }
}
@@ -0,0 +1,9 @@
1
+ import type { Logger } from "../../logging/logger.js";
2
+ import { BrowserLiveTabService } from "./browser-live-tab-service.js";
3
+ import { type LiveBrowserTabResolution } from "./tab-resolution.js";
/** Picks the live browser tab that best matches a query. */
export declare class BrowserTabResolutionService {
    private readonly liveTabs;
    private readonly logger;
    /**
     * @param liveTabs Source of the current live tab listing.
     * @param logger Receives an informational entry when a tab is resolved.
     */
    constructor(liveTabs: BrowserLiveTabService, logger: Logger);
    /**
     * Scores the listed tabs against `query` and reports a single match, an
     * ambiguity, or that nothing was found. Synchronous: it reads the cached
     * listing only.
     *
     * @param query Optional title/URL query.
     */
    resolve(query?: string): LiveBrowserTabResolution;
}
@@ -0,0 +1,65 @@
1
+ import { scoreLiveTab, sortCandidates } from "./tab-resolution.js";
/** Picks the live browser tab that best matches a query. */
export class BrowserTabResolutionService {
    liveTabs;
    logger;
    /**
     * @param liveTabs Object exposing `list()`, which returns `{ tabs }`.
     * @param logger Object exposing `info(message, metadata)`.
     */
    constructor(liveTabs, logger) {
        this.liveTabs = liveTabs;
        this.logger = logger;
    }
    /**
     * Scores every listed tab against `query` and classifies the outcome.
     *
     * - No tabs listed, or no tab scored: "not_found".
     * - The two best candidates share the same score: "ambiguous".
     * - Otherwise: "resolved" with the best candidate's tab.
     *
     * Candidate lists in the returned object are capped at five entries. In
     * the no-match case the suggestions are built the same way as the main
     * candidate list: unscored tabs are dropped, all tabs are ranked, and only
     * then is the list truncated. The suggestions are therefore the five
     * best-ranked tabs and never contain `undefined`.
     *
     * @param query Optional title/URL query, forwarded to `scoreLiveTab`.
     * @returns The resolution record for the query.
     */
    resolve(query) {
        const maxReportedCandidates = 5;
        const isScored = (candidate) => candidate !== undefined;
        const { tabs } = this.liveTabs.list();
        if (tabs.length === 0) {
            return {
                status: "not_found",
                query,
                candidates: [],
                message: "No live browser tabs are available. Refresh CDP discovery first."
            };
        }
        const candidates = tabs
            .map((tab) => scoreLiveTab(tab, query))
            .filter(isScored)
            .sort(sortCandidates);
        if (candidates.length === 0) {
            const suggestions = tabs
                .map((tab) => scoreLiveTab(tab))
                .filter(isScored)
                .sort(sortCandidates)
                .slice(0, maxReportedCandidates);
            return {
                status: "not_found",
                query,
                candidates: suggestions,
                message: "No live browser tab matched the query."
            };
        }
        const [best, runnerUp] = candidates;
        if (runnerUp !== undefined && best.matchScore === runnerUp.matchScore) {
            return {
                status: "ambiguous",
                query,
                candidates: candidates.slice(0, maxReportedCandidates),
                message: "Multiple live browser tabs matched the query. Provide a more specific title or URL."
            };
        }
        const matched = tabs.find((tab) => tab.targetId === best.targetId);
        if (!matched) {
            // The winning candidate no longer maps onto a listed tab.
            return {
                status: "not_found",
                query,
                candidates: candidates.slice(0, maxReportedCandidates),
                message: "The best matching browser tab disappeared before it could be resolved."
            };
        }
        this.logger.info("Resolved live browser tab", {
            query,
            targetId: matched.targetId,
            title: matched.title,
            url: matched.url,
            matchReason: best.matchReason,
            matchScore: best.matchScore
        });
        return {
            status: "resolved",
            query,
            tab: matched,
            matchedCandidate: best,
            candidateCount: candidates.length
        };
    }
}
@@ -0,0 +1,20 @@
1
+ import type { Logger } from "../../logging/logger.js";
2
+ import { BrowserTabResolutionService } from "./browser-tab-resolution-service.js";
3
+ import type { LiveBrowserTabResolution } from "./tab-resolution.js";
4
+ import type { BrowserTabScreenshot } from "./types.js";
/**
 * Result of a resolve-and-capture request: either the resolved tab together
 * with its screenshot, or the unresolved ("ambiguous" / "not_found")
 * resolution passed through as-is.
 */
export type CaptureResolvedLiveBrowserTabScreenshotResult =
    | {
        status: "completed";
        query?: string;
        resolution: Extract<LiveBrowserTabResolution, { status: "resolved" }>;
        screenshot: BrowserTabScreenshot;
    }
    | Extract<LiveBrowserTabResolution, { status: "ambiguous" | "not_found" }>;
/** Captures PNG screenshots of resolved live browser tabs through CDP. */
export declare class BrowserTabScreenshotService {
    private readonly resolver;
    private readonly logger;
    /**
     * @param resolver Resolves a query to a live browser tab.
     * @param logger Receives an informational entry after each capture.
     */
    constructor(resolver: BrowserTabResolutionService, logger: Logger);
    /**
     * Resolves `query` and, when exactly one tab is resolved, captures its
     * screenshot. Unresolved outcomes are returned unchanged.
     *
     * @param query Optional title/URL query.
     */
    captureResolved(query?: string): Promise<CaptureResolvedLiveBrowserTabScreenshotResult>;
}
@@ -0,0 +1,59 @@
1
+ import { withCdpTabSession } from "./cdp-websocket-session.js";
2
+ import { readPngDimensions } from "./png-metadata.js";
/** Returns the current time as an ISO-8601 string (`Date.prototype.toISOString`). */
const nowIso = () => {
    const now = new Date();
    return now.toISOString();
};
/**
 * Captures a PNG of the resolved tab with `Page.captureScreenshot` and wraps it
 * in a screenshot record.
 *
 * @param resolution A resolution carrying the target `tab`.
 * @returns The screenshot record; `bytesBase64` is the string returned by CDP,
 *          `width`/`height` come from `readPngDimensions`, and `backend` is
 *          always "cdp-page-capture".
 * @throws Error when the CDP response has no non-blank string `data`.
 */
const captureTabScreenshot = async (resolution) => {
    const { tab } = resolution;
    return await withCdpTabSession(tab, async (session) => {
        await session.sendCommand("Page.enable");
        const response = await session.sendCommand("Page.captureScreenshot", {
            format: "png",
            fromSurface: true,
            captureBeyondViewport: true
        });
        const base64Png = response.data;
        const hasPngData = typeof base64Png === "string" && base64Png.trim() !== "";
        if (!hasPngData) {
            throw new Error("CDP screenshot response did not include PNG data");
        }
        const pngBytes = Buffer.from(base64Png, "base64");
        const dimensions = readPngDimensions(pngBytes);
        return {
            targetId: tab.targetId,
            title: tab.title,
            url: tab.url,
            browserName: tab.browserName,
            mimeType: "image/png",
            bytesBase64: base64Png,
            width: dimensions.width,
            height: dimensions.height,
            // Size of the decoded image, not of the base64 text.
            byteLength: pngBytes.byteLength,
            capturedAt: nowIso(),
            backend: "cdp-page-capture"
        };
    });
};
/** Resolves a query to a live browser tab and captures its screenshot through CDP. */
export class BrowserTabScreenshotService {
    resolver;
    logger;
    /**
     * @param resolver Object exposing `resolve(query)`.
     * @param logger Object exposing `info(message, metadata)`.
     */
    constructor(resolver, logger) {
        this.resolver = resolver;
        this.logger = logger;
    }
    /**
     * Resolves `query` and captures the matching tab.
     *
     * @param query Optional title/URL query.
     * @returns A "completed" record with the resolution and screenshot, or the
     *          resolver's result unchanged when its status is anything other
     *          than "resolved".
     */
    async captureResolved(query) {
        const resolution = this.resolver.resolve(query);
        if (resolution.status !== "resolved") {
            return resolution;
        }
        const screenshot = await captureTabScreenshot(resolution);
        const { targetId, title } = resolution.tab;
        const { width, height, byteLength } = screenshot;
        this.logger.info("Captured browser tab screenshot via CDP", {
            query,
            targetId,
            title,
            width,
            height,
            byteLength
        });
        return { status: "completed", query, resolution, screenshot };
    }
}
@@ -0,0 +1,9 @@
1
+ import WebSocket from "ws";
2
+ import type { LiveBrowserTab } from "./tab-model.js";
/**
 * Command channel to a single CDP target over an already-created WebSocket.
 * NOTE(review): the implementation is not visible from this declaration.
 * Timeout and error behaviour should be confirmed against the `.js` file.
 */
export declare class CdpWebSocketSession {
    private readonly socket;
    private nextCommandId;
    /** @param socket WebSocket the session sends its commands through. */
    constructor(socket: WebSocket);
    /**
     * Sends one CDP command and resolves with its result object.
     *
     * @param method CDP method name, e.g. "Page.captureScreenshot".
     * @param params Optional command parameters.
     * @param timeoutMs Optional timeout in milliseconds; the default is not
     *        visible here.
     */
    sendCommand(
        method: string,
        params?: Record<string, unknown>,
        timeoutMs?: number
    ): Promise<Record<string, unknown>>;
}
/**
 * Invokes `run` with a {@link CdpWebSocketSession} for `tab` and resolves with
 * the value `run` resolves to.
 * NOTE(review): presumably opens the socket before `run` and closes it
 * afterwards. The implementation is not visible here, so confirm.
 */
export declare const withCdpTabSession: <T>(
    tab: LiveBrowserTab,
    run: (session: CdpWebSocketSession) => Promise<T>
) => Promise<T>;